#ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> /* inode in-kernel data */ struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct simple_xattrs xattrs; /* list of xattrs */ struct inode vfs_inode; }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ umode_t mode; /* Mount mode for root directory */ struct mempolicy *mpol; /* default memory policy for mappings */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { 705 return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct 
address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(swp_entry_t entry, struct page *page); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } #ifdef CONFIG_TMPFS extern int shmem_add_seals(struct file *file, unsigned int seals); extern int shmem_get_seals(struct file *file); extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } #endif #endif
/* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> * * Architecture independence: * Copyright (c) 2005, Bull S.A. * Written by Pierre Peiffer <pierre.peiffer@bull.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public Licens * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ /* * Extents support for EXT4 * * TODO: * - ext4*_error() should be used in some situations * - analyze all BUG()/BUG_ON(), use -EIO where appropriate * - smart tree reduction */ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fiemap.h> #include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" #include <trace/events/ext4.h> /* * used by extent splitting. 
*/ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ #define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ #define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } static int ext4_extent_block_csum_verify(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; 43 if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); if (et->et_checksum != ext4_extent_block_csum(inode, eh)) return 0; return 1; } 419 static void ext4_extent_block_csum_set(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent_tail *et; 419 if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); 419 et->et_checksum = ext4_extent_block_csum(inode, eh); } static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags); static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes); static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) { int err; 320 if (!ext4_handle_valid(handle)) 320 return 0; if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); if (err == 0) err = -EAGAIN; return err; } 
/* * could return: * - EROFS * - ENOMEM */ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); 420 return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ return 0; } /* * could return: * - EROFS * - ENOMEM * - EIO */ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; 792 WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); 792 if (path->p_bh) { 419 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); } else { /* path points to leaf/index in inode body */ 780 err = ext4_mark_inode_dirty(handle, inode); } 792 return err; } static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { 602 if (path) { 602 int depth = path->p_depth; struct ext4_extent *ex; /* * Try to predict block placement assuming that we are * filling in a file which will eventually be * non-sparse --- i.e., in the case of libbfd writing * an ELF object sections out-of-order but in a way * the eventually results in a contiguous object or * executable file, or some database extending a table * space file. However, this is actually somewhat * non-ideal if we are writing a sparse file such as * qemu or KVM writing a raw image file that is going * to stay fairly sparse, since it will end up * fragmenting the file system's free space. Maybe we * should have some hueristics or some way to allow * userspace to pass a hint to file system, * especially if the latter case turns out to be * common. 
*/ ex = path[depth].p_ext; if (ex) { 530 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); if (block > ext_block) 602 return ext_pblk + (block - ext_block); else 56 return ext_pblk - (ext_block - block); } /* it looks like index is empty; * try to find starting block from index itself */ 502 if (path[depth].p_bh) return path[depth].p_bh->b_blocknr; } /* OK. use inode's group */ 502 return ext4_inode_to_goal_block(inode); } /* * Allocation for a meta data block */ static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err); return newblock; } static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; 301 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 6) size = 6; #endif return size; } static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 5) size = 5; #endif return size; } static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST if (!check && size > 3) size = 3; #endif return size; } static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST if (!check && size > 4) size = 4; #endif return size; } static inline int 
ext4_force_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t lblk, int nofail) { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); 17 return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); } /* * Calculate the number of metadata blocks needed * to allocate @blocks * Worse case is one block per extent */ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); int idxs; idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx)); /* * If the new delayed allocation block is contiguous with the * previous da block, it can share index blocks with the * previous block, so we only need to allocate a new index * block every idxs leaf blocks. At ldxs**2 blocks, we need * an additional index block, and at ldxs**3 blocks, yet * another index blocks. */ if (ei->i_da_metadata_calc_len && ei->i_da_metadata_calc_last_lblock+1 == lblock) { int num = 0; if ((ei->i_da_metadata_calc_len % idxs) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) num++; if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { num++; ei->i_da_metadata_calc_len = 0; } else ei->i_da_metadata_calc_len++; ei->i_da_metadata_calc_last_lblock++; return num; } /* * In the worst case we need a new set of index blocks at * every level of the inode's extent tree. 
*/ ei->i_da_metadata_calc_len = 1; ei->i_da_metadata_calc_last_lblock = lblock; return ext_depth(inode) + 1; } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; 457 if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { 43 if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { 264 ext4_fsblk_t block = ext4_ext_pblock(ext); 264 int len = ext4_ext_get_actual_len(ext); 264 ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, int depth) { unsigned short entries; 457 if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); 349 if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); 264 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; ext4_fsblk_t pblock = 0; ext4_lblk_t lblock = 0; ext4_lblk_t prev = 0; int len = 0; while (entries) { 264 if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ 264 lblock = le32_to_cpu(ext->ee_block); 264 len = ext4_ext_get_actual_len(ext); 264 if ((lblock <= prev) && prev) { pblock = ext4_ext_pblock(ext); es->s_last_error_block = cpu_to_le64(pblock); return 0; } 264 ext++; entries--; prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); while (entries) { 185 if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; 185 ext_idx++; 
entries--; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; 457 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } 457 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } 457 if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } 457 max = ext4_ext_max_entries(inode, depth); 457 if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } 457 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } 457 if (!ext4_valid_extent_entries(inode, eh, depth)) { error_msg = "invalid extent entries"; goto corrupted; } 374 if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ 457 if (ext_depth(inode) != depth && 43 !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } 457 return 0; corrupted: ext4_error_inode(inode, function, line, 0, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { 18 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, ext4_fsblk_t pblk, int depth, int flags) { struct buffer_head *bh; int err; 425 bh = 
sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); 425 if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = bh_submit_read(bh); if (err < 0) goto errout; } 425 if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; 43 err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk); if (err) goto errout; 43 set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { 43 struct ext4_extent_header *eh = ext_block_hdr(bh); struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; 43 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); 43 int len = ext4_ext_get_actual_len(ex); 43 if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); 43 if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; 43 ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } return bh; errout: put_bh(bh); 425 return ERR_PTR(err); } #define read_extent_tree_block(inode, pblk, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; 6 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ 5 down_read(&ei->i_data_sem); depth = ext_depth(inode); path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } /* Don't cache anything if there are no external extent blocks */ 5 if (depth == 0) goto 
out; 2 path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; 2 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); 2 while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ 2 if ((i == depth) || 2 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { 2 brelse(path[i].p_bh); 2 path[i].p_bh = NULL; i--; continue; } 2 bh = read_extent_tree_block(inode, ext4_idx_pblock(path[i].p_idx++), depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } 2 i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } 2 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: 5 up_read(&ei->i_data_sem); ext4_ext_drop_refs(path); kfree(path); 5 return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug("path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(" []"); } ext_debug("\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (!path) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int 
depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; 873 if (!path) return; 865 depth = path->p_depth; 873 for (i = 0; i <= depth; i++, path++) 865 if (path->p_bh) { 338 brelse(path->p_bh); path->p_bh = NULL; } } /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug("binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); } 339 path->p_idx = l - 1; ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, 
ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug("binsearch for %u: ", block); 591 l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); 504 while (l <= r) { 504 m = l + (r - l) / 2; if (block < le32_to_cpu(m->ee_block)) 102 r = m - 1; else 464 l = m + 1; ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); } 591 path->p_ext = l - 1; ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); } #endif } int ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); 50 eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); return 0; } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path **orig_path, int flags) 711 
{ struct ext4_extent_header *eh; struct buffer_head *bh; 357 struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); 707 if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } 357 if (path) { 346 ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); *orig_path = path = NULL; } } if (!path) { /* account possible depth increase */ 711 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); if (unlikely(!path)) return ERR_PTR(-ENOMEM); 711 path[0].p_maxdepth = depth + 1; } 711 path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 339 ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } 339 eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); ret = -EFSCORRUPTED; goto err; } 339 path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } 711 path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ 591 ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) 591 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_ext_drop_refs(path); kfree(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, 
struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug("insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * - allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, 
struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ int err = 0; size_t ext_size = 0; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug("leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug("leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. 
*/ ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; 
le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug("create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = 
ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block 302 * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; 302 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 302 int err = 0; size_t ext_size = 0; 301 /* Try to prepend new index to old one */ if (ext_depth(inode)) 301 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); if (goal > le32_to_cpu(es->s_first_data_block)) { flags |= EXT4_MB_HINT_TRY_GOAL; 301 goal--; } else 301 goal = ext4_inode_to_goal_block(inode); newblock = ext4_new_meta_blocks(handle, inode, goal, flags, NULL, &err); if (newblock == 0) return err; bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); 301 if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); if (err) { unlock_buffer(bh); goto out; } 301 ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ memmove(bh->b_data, 
EXT4_I(inode)->i_data, ext_size); /* zero out unused area in the extent block */ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 301 else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); 301 set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); 301 neh->eh_entries = cpu_to_le16(1); ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); if (neh->eh_depth == 0) { 301 /* Root extent block becomes index block */ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); return err; } /* * ext4_ext_create_new_leaf: * finds empty index and adds new leaf. 302 * if no free index is found, then it requests in-depth growing. 
*/ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, struct ext4_ext_path **ppath, struct ext4_extent *newext) { struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; 302 repeat: i = depth = ext_depth(inode); /* walk up to the tree and look for free index entry */ curp = path + depth; while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { i--; curp--; } /* we use already allocated block for index block, * so subsequent data blocks should be contiguous */ if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that 302 * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; /* refill path */ 301 path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; 301 /* refill path */ 301 path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; } /* * only first (depth 0 -> 1) produces free space; * in all other cases we have to split the grown tree */ depth = ext_depth(inode); if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { /* now we need to split */ goto repeat; } } out: return err; } /* * search the closest allocated block to the left for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the smallest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code 602 */ static int ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, 589 ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; if (unlikely(path == NULL)) { 530 
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 530 return -EFSCORRUPTED; 530 } 56 depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; 56 /* usually extent in the path covers blocks smaller 16 * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; 515 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", ix != NULL ? le32_to_cpu(ix->ei_block) : 0, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); 515 return -EFSCORRUPTED; } } return 0; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ 602 static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, struct ext4_extent **ret_ex) 602 { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; 589 struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; if (unlikely(path == NULL)) { 530 EXT4_ERROR_INODE(inode, 
"path == NULL *logical %d!", *logical); 530 return -EFSCORRUPTED; 530 } 56 depth = path->p_depth; *phys = 0; if (depth == 0 && path->p_ext == NULL) return 0; 56 /* usually extent in the path covers blocks smaller 16 * then *logical, but it can be that extent is the * first one in the file */ ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); 602 if (*logical < le32_to_cpu(ex->ee_block)) { if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); return -EFSCORRUPTED; 515 } while (--depth >= 0) { ix = path[depth].p_idx; if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); 515 return -EFSCORRUPTED; } 37 } goto found_extent; } if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 494 EXT4_ERROR_INODE(inode, 308 "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) goto got_index; } /* we've gone up to the root and found no index to the right */ return 0; got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); if (IS_ERR(bh)) 66 return PTR_ERR(bh); eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); *phys 
= ext4_ext_pblock(ex); *ret_ex = ex; if (bh) put_bh(bh); return 0; } 560 560 /* * ext4_ext_next_allocated_block: 546 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. 557 */ ext4_lblk_t 557 ext4_ext_next_allocated_block(struct ext4_ext_path *path) { 557 int depth; 560 BUG_ON(path == NULL); depth = path->p_depth; 310 if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; 526 while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ if (path[depth].p_ext && path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { /* index */ if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_idx[1].ei_block); } depth--; 297 } 297 return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } 610 return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? 
610 */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); 610 struct ext4_extent *ex; __le32 border; 610 int k, err = 0; eh = path[depth].p_hdr; 336 ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } 35 if (depth == 0) { /* there is no tree at all */ return 0; } 35 if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } 35 /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) break; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); 540 if (err) break; } 533 533 return err; } 533 int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; 516 ext1_ee_len = ext4_ext_get_actual_len(ex1); 516 ext2_ee_len = ext4_ext_get_actual_len(ex2); 173 173 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; /* * To allow future support for preallocated extents to be added * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. 
540 */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. 608 * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. 608 */ 502 static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) 608 { 35 struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; 13 depth = ext_depth(inode); 35 BUG_ON(path[depth].p_hdr == NULL); 19 eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 35 break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); 35 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); 608 if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } 608 335 /* 335 * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. 
*/ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; 15 unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) 15 return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2)) return; /* * Copy the extent data up to the inode 15 */ 15 blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } 608 /* * This function tries to merge the @ex extent to neighbours in the tree. * return 1 if merge left else 0. 608 */ 476 static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, 608 struct ext4_extent *ex) { struct ext4_extent_header *eh; 608 unsigned int depth; 608 int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. 
602 * If there is no overlap found, it returns 0. */ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, 530 struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; 515 b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); 37 depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); 56 /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); 66 if (b2 == EXT_MAX_BLOCKS) 2 goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: 608 return ret; } /* * ext4_ext_insert_extent: * tries to merge requsted extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. 
*/ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, 608 struct ext4_extent *newext, int gb_flags) { struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; 608 struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; int mb_flags = 0, unwritten; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; 608 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 514 return -EFSCORRUPTED; 51 } 51 /* try to insert block into found extent and return */ 7 if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { 514 /* 421 * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. 
*/ 514 if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < 454 le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ 454 if (ext4_can_extents_be_merged(inode, ex, newext)) { 320 ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) 462 return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; } 3 prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug("prepend %u[%d]%d block to %u:[%d]%d" 3 "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), 454 ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), 146 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; 595 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) 302 ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; 297 goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? 
*/ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 302 ext_debug("next leaf isn't full(%d)\n", 20 le16_to_cpu(eh->eh_entries)); 302 path = npath; goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 301 } /* 595 * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 324 */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 595 ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: 500 nearex = path[depth].p_ext; 492 err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), 469 ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { 54 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug("insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), 492 nearex); nearex++; 77 } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug("insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), 
ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; 595 if (len > 0) { ext_debug("insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, 585 len * sizeof(struct ext4_extent)); } } 608 le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 608 nearex->ee_len = newext->ee_len; merge: 608 /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) 608 ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: ext4_ext_drop_refs(npath); kfree(npath); return err; } 9 static int ext4_fill_fiemap_extents(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) 9 { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; struct extent_status es; ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; int exists, depth = 0, err = 0; unsigned int flags = 0; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 9 while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { 9 up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; } depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 6 err = -EFSCORRUPTED; break; } ex = path[depth].p_ext; 1 next = ext4_ext_next_allocated_block(path); flags = 0; 6 exists = 0; if (!ex) { /* there is no extent 
yet, so try to allocate * all requested space */ 2 start = block; end = block + num; } else if (le32_to_cpu(ex->ee_block) > block) { /* need to allocate space before found extent */ start = block; end = le32_to_cpu(ex->ee_block); if (block + num < end) end = block + num; } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { 5 /* need to allocate space after found extent */ start = block; end = block + num; if (end >= next) end = next; } else if (block >= le32_to_cpu(ex->ee_block)) { 6 /* * some part of requested space is covered * by found extent */ 6 start = block; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (block + num < end) 5 end = block + num; exists = 1; } else { BUG(); } BUG_ON(end <= start); if (!exists) { es.es_lblk = start; es.es_len = end - start; es.es_pblk = 0; 9 } else { 1 es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); 4 es.es_pblk = ext4_ext_pblock(ex); if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; 9 } /* * Find delayed extent and update es accordingly. We call * it even in !exists case to find out whether es is the * last existing extent or not. */ next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } up_read(&EXT4_I(inode)->i_data_sem); if (unlikely(es.es_len == 0)) { EXT4_ERROR_INODE(inode, "es.es_len == 0"); err = -EFSCORRUPTED; break; 9 } 8 /* * This is possible iff next == next_del == EXT_MAX_BLOCKS. * we need to check next == EXT_MAX_BLOCKS because it is * possible that an extent is with unwritten and delayed * status due to when an extent is delayed allocated and * is allocated by fallocate status tree will track both of * them in a extent. * * So we could return a unwritten and delayed extent, and * its block is equal to 'next'. 
*/ 9 if (next == next_del && next == EXT_MAX_BLOCKS) { flags |= FIEMAP_EXTENT_LAST; if (unlikely(next_del != EXT_MAX_BLOCKS || next != EXT_MAX_BLOCKS)) { 8 EXT4_ERROR_INODE(inode, "next extent == %u, next " "delalloc extent = %u", next, next_del); 7 err = -EFSCORRUPTED; break; } } if (exists) { 4 err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, 10 (__u64)es.es_len << blksize_bits, flags); if (err < 0) break; if (err == 1) { err = 0; break; } } block = es.es_lblk + es.es_len; } ext4_ext_drop_refs(path); 559 kfree(path); return err; } /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into * and cache this gap */ static void ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) 31 { int depth = ext_depth(inode); 5 ext4_lblk_t len; ext4_lblk_t lblock; struct ext4_extent *ex; struct extent_status es; ex = path[depth].p_ext; 26 if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ lblock = 0; len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); 26 } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; len = le32_to_cpu(ex->ee_block) - block; ext_debug("cache gap(before): %u [%u:%u]", block, le32_to_cpu(ex->ee_block), 26 ext4_ext_get_actual_len(ex)); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; lblock = le32_to_cpu(ex->ee_block) 559 + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); 6 ext_debug("cache gap(after): [%u:%u] %u", 3 le32_to_cpu(ex->ee_block), 3 ext4_ext_get_actual_len(ex), block); BUG_ON(next == lblock); 559 len = next - lblock; } else { BUG(); } ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? 
*/ if (es.es_lblk <= lblock) return; len = min(es.es_lblk - lblock, len); } ext_debug(" -> %u:%u\n", lblock, len); ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); 165 } 165 /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 165 struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; 165 /* free index block */ depth--; path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { 165 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) 165 return err; 165 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; len *= sizeof(struct ext4_extent_idx); 165 memmove(path->p_idx, path->p_idx + 1, len); } le16_add_cpu(&path->p_hdr->eh_entries, -1); 165 err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--depth >= 0) { if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) break; path--; err = ext4_ext_get_access(handle, inode, path); if (err) break; path->p_idx->ei_block = (path+1)->p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path); if (err) break; } 4 return err; 4 } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns max. credits that needed to insert an extent * to the extent tree. * When pass the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; 4 /* probably there is space in leaf? 
*/ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* 4 * There are some space in the leaf tree, no * need to account for leaf block credit * * bitmaps and block group descriptor blocks * and other metadata blocks still need to be * accounted. */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? 825 * * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. 825 * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. 825 */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; 320 depth = ext_depth(inode); 442 if (extents <= 1) index = depth * 2; else index = depth * 3; return index; } static inline int get_default_free_blocks_flags(struct inode *inode) 320 { 320 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 320 else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, long long *partial_cluster, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 320 unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; int flags = get_default_free_blocks_flags(inode); /* * For bigalloc file systems, we never free a partial cluster 320 * at the beginning of the extent. 
Instead, we make a note * that we tried freeing the cluster, and check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* * If we have a partial cluster, and it's different from the * cluster of the last block, we need to explicitly free the * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; if (*partial_cluster > 0 && *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); *partial_cluster = 0; } #ifdef EXTENTS_STATS { 320 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 320 spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; 320 if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); } #endif if (from >= le32_to_cpu(ex->ee_block) && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; long long first_cluster; 320 num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * Usually we want to free partial cluster at the end of the * extent, except for the situation when the cluster is still * used by any other extent (partial_cluster is negative). 
*/ if (*partial_cluster < 0 && *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; ext_debug("free last %u blocks starting %llu partial %lld\n", num, pblk, *partial_cluster); ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* * If the block range to be freed didn't start at the * beginning of a cluster, and we removed the entire * extent and the cluster is not used by any other extent, * save the partial cluster here, since we might need to * delete if we determine that the truncate or punch hole * operation has removed all of the blocks in the cluster. * If that cluster is used by another extent, preserve its * negative value so it isn't freed later on. * * If the whole extent wasn't freed, we've reached the * start of the truncated/punched region and have finished * removing blocks. If there's a partial cluster here it's * shared with the remainder of the extent and is no longer * a candidate for removal. */ if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { first_cluster = (long long) EXT4_B2C(sbi, pblk); if (first_cluster != -*partial_cluster) *partial_cluster = first_cluster; } else { *partial_cluster = 0; } } else ext4_error(sbi->s_sb, "strange request: removal(2) " "%u-%u from %u:%u\n", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf 442 * @partial_cluster: The cluster which we'll have to free if all extents * has been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed. 
* @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, long long *partial_cluster, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; 183 int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; 442 struct ext4_extent *ex; ext4_fsblk_t pblk; 431 /* the header must be checked already in ext4_ext_remove_space() */ 442 ext_debug("truncate since %u in leaf to %u\n", start, end); 442 if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 442 eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { 442 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 334 return -EFSCORRUPTED; } 320 /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 320 320 while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; b = ex_ee_block+ex_ee_len - 1 < end ? 2 ex_ee_block+ex_ee_len - 1 : end; ext_debug(" border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ 2 if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. 
*/ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); *partial_cluster = -(long long) EXT4_B2C(sbi, pblk); } ex--; 25 ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; 320 } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; 308 } else { /* remove whole extent: excellent! */ 320 num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for 320 * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { 320 correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 320 err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) 314 goto out; 25 err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial_cluster, a, b); 12 if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. 
*/ 22 if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { 314 if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the 320 * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); 320 /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, 320 sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); 317 } 28 err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); 434 ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we zero partial_cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. 
*/ 441 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { 165 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, get_default_free_blocks_flags(inode)); } *partial_cluster = 0; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); 184 out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) 183 */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; 444 /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) 443 { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); 444 struct ext4_ext_path *path = NULL; long long partial_cluster = 0; handle_t *handle; int i = 0, err = 0; ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); 444 if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); 25 /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it 25 * in ext4_ext_rm_leaf(). 
*/ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; 3 ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { 3 ext4_journal_stop(handle); return PTR_ERR(path); } 23 depth = ext_depth(inode); 23 /* Leaf not may not exist only if inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; 22 } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; 1 /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. 1 */ if (sbi->s_cluster_ratio > 1) { 1 pblk = ext4_ext_pblock(ex) + end - ee_block + 2; partial_cluster = -(long long) EXT4_B2C(sbi, pblk); 23 } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1); if (err < 0) goto out; } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { /* * If there's an extent to the right its first cluster 23 * contains the immediate right boundary of the * truncated/punched region. Set partial_cluster to * its negative value so it won't be freed if shared * with the current extent. The end < ee_block case * is handled in ext4_ext_rm_leaf(). 
*/ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, 442 &ex); if (err) 23 goto out; if (pblk) partial_cluster = -(long long) EXT4_B2C(sbi, pblk); 431 } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ 431 depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); 442 return -ENOMEM; 442 } path[0].p_maxdepth = path[0].p_depth = depth; 442 path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { 441 err = -EFSCORRUPTED; 441 goto out; } } err = 0; while (i >= 0 && err == 0) { 184 if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial_cluster, start, end); 184 /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); 183 path[i].p_bh = NULL; i--; continue; } /* this is index block */ if (!path[i].p_hdr) { 182 ext_debug("initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } 184 if (!path[i].p_idx) { /* this level hasn't been touched yet */ 184 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug("init index ptr: hdr 0x%p, num %d\n", 183 path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug("level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ 183 ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, ext4_idx_pblock(path[i].p_idx), depth - i - 1, 183 EXT4_EX_NOCACHE); if (IS_ERR(bh)) { 
/* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. */ 182 cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; 182 /* save actual number of indexes since this 182 * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { 441 /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; 441 ext_debug("return to level %d\n", i); } 431 } trace_ext4_ext_remove_space_done(inode, start, end, depth, partial_cluster, path->p_hdr->eh_entries); /* * If we still have something in the partial cluster and we have removed 441 * even the first extent, then we should free the blocks in the partial * cluster as well. (This code will only run when there are no leaves * to the immediate left of the truncated/punched region.) 
*/ if (partial_cluster > 0 && err == 0) { 409 /* don't zero partial_cluster since it's not used afterwards */ ext4_free_blocks(handle, inode, NULL, 409 EXT4_C2B(sbi, partial_cluster), sbi->s_cluster_ratio, get_default_free_blocks_flags(inode)); } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { 443 /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); 443 if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_ext_drop_refs(path); kfree(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, 7 sbi->s_ext_blocks / sbi->s_ext_extents); 15 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } 15 #endif 15 } static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t 
ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) 7 return 0; 7 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN); } /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ext4_encrypted_inode(inode)) return ext4_encrypted_zeroout(inode, ex); ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * * Splits extent [a, b] into two extents [a, @split) and [@split, b], states * of which are deterimined by split_flag. * 32 * There are two cases: * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * * return 0 on success. 
*/ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; 32 struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; 32 int err = 0; 32 BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == 32 (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); 32 ext_debug("ext4_split_extents_at: inode %lu, logical" "block %llu\n", inode->i_ino, (unsigned long long)split); ext4_ext_show_leaf(inode, path); 12 depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); 32 ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | 10 EXT4_EXT_MARK_UNWRIT1 | 4 EXT4_EXT_MARK_UNWRIT2)); 6 err = ext4_ext_get_access(handle, inode, path + depth); if (err) 10 goto out; 6 if (split == ee_block) { 10 /* 32 * case b: block @split is the block that the extent begins with * then we just change the state of the extent, and splitting * is not needed. 
*/ 29 if (split_flag & EXT4_EXT_MARK_UNWRIT2) 17 ext4_ext_mark_unwritten(ex); else 13 ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); 29 goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (split_flag & EXT4_EXT_MARK_UNWRIT1) ext4_ext_mark_unwritten(ex); 11 /* * path may lead to new leaf, not to original leaf any more 29 * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } 29 if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) goto fix_extent_len; /* update extent status tree */ err = ext4_zeroout_es(inode, 
&zero_ex); goto out; } else if (err) goto fix_extent_len; out: ext4_ext_show_leaf(inode, path); return err; fix_extent_len: ex->ee_len = orig_ex.ee_len; ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } /* * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) 15 * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Somone is splitting in middle of the extent * */ static int ext4_split_extent(handle_t *handle, 10 struct inode *inode, struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) 15 { struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; 10 unsigned int ee_len, depth; int err = 0; int unwritten; int split_flag1, flags1; int allocated = map->m_len; 10 1 depth = ext_depth(inode); 10 ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); 5 if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; 15 if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, ppath, 15 map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; } else { allocated = ee_len - (map->m_lblk - ee_block); } /* 15 * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. 
*/ 15 path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); 15 ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 15 (unsigned long) map->m_lblk); return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; 15 if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; } ext4_ext_show_leaf(inode, path); out: return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset 15 * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->l_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len. 
*/ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; struct ext4_extent *ex, *abut_ex; 14 ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 15 "block %llu, max_blocks %u\n", inode->i_ino, 15 (unsigned long long)map->m_lblk, map_len); 15 sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; 15 if (eof_block < map->m_lblk + map_len) 15 eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently 15 * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in 8 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. * * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. 7 * This would require removing the extent if the transfer 7 * is possible. 7 * - L2: we only attempt to merge with an extent stored in the 7 * same extent tree node. 
*/ if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); 7 7 /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, 7 * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 7 */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ 7 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); 10 /* Shift the start of ex by 'map_len' blocks */ 2 ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 1 /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - 
C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); 7 /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 7 ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); 14 /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } if (allocated) { /* Mark the block containing both extents as dirty */ ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ 13 path[depth].p_ext = abut_ex; goto out; } else allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* 13 * It is safe to convert extent to initialized via explicit 6 * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; 6 6 if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); 2 if (ext4_encrypted_inode(inode)) max_zeroout = 0; 6 /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; 11 ext4_ext_mark_initialized(ex); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; 8 } /* * four cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the first half. * 3. split the extent into two extents, zeroout the second half. * 4. split the extent into two extents with out zeroout. */ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; 8 if (max_zeroout && (allocated > map->m_len)) { if (allocated <= max_zeroout) { 3 /* case 3 */ 2 zero_ex.ee_block = cpu_to_le32(map->m_lblk); zero_ex.ee_len = cpu_to_le16(allocated); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex) + map->m_lblk - ee_block); err = ext4_ext_zeroout(inode, &zero_ex); 2 if (err) goto out; split_map.m_lblk = map->m_lblk; split_map.m_len = allocated; 3 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { /* case 2 */ if (map->m_lblk != ee_block) { zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16(map->m_lblk - ee_block); 11 ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex); if (err) goto out; } 6 15 split_map.m_lblk = ee_block; 15 split_map.m_len = map->m_lblk - ee_block + map->m_len; allocated = map->m_len; } } err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) err = 0; out: /* If we have 
gotten a failure, don't zero out status tree */ if (!err) err = ext4_zeroout_es(inode, &zero_ex); return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an unwritten extent. * * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after * the unwritten extent split. To prevent ENOSPC occur at the IO * complete, we need to split the unwritten extent before DIO submit 4 * the IO. The unwritten extent called at this time will be split * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * Returns the size of unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", __func__, inode->i_ino, 4 (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 4 inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; 4 /* 4 * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully insde i_size or new_size. 
*/ depth = ext_depth(inode); 4 ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { 26 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } 26 static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, 26 struct ext4_map_blocks *map, struct ext4_ext_path **ppath) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; 26 ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still * have some extent state machine issues left. So extent_split is still * required. * TODO: Once all related issues will be fixed this situation should be * illegal. 
*/ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef EXT4_DEBUG ext4_warning("Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len, 26 (unsigned long long)map->m_lblk, map->m_len); #endif err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); 26 if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); 7 /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; } static void unmap_underlying_metadata_blocks(struct block_device *bdev, sector_t block, int count) { int i; for (i = 0; i < count; i++) unmap_underlying_metadata(bdev, block + i); } 479 /* * Handle EOFBLOCKS_FL flag, clearing it if necessary 37 */ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, ext4_lblk_t lblk, struct ext4_ext_path *path, unsigned int len) { int i, depth; struct ext4_extent_header *eh; struct ext4_extent *last_ex; 37 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) return 0; depth = ext_depth(inode); eh = path[depth].p_hdr; /* * We're going to remove EOFBLOCKS_FL entirely in future so we * do not care for this case anymore. Simply remove the flag * if there are no extents. 37 */ 37 if (unlikely(!eh->eh_entries)) 37 goto out; last_ex = EXT_LAST_EXTENT(eh); /* * We should clear the EOFBLOCKS_FL flag if we are writing the * last block in the last extent in the file. 
We test this by * first checking to see if the caller to * ext4_ext_get_blocks() was interested in the last block (or * a block beyond the last block) in the current extent. If * this turns out to be false, we can bail out from this * function immediately. */ if (lblk + len < le32_to_cpu(last_ex->ee_block) + 2 ext4_ext_get_actual_len(last_ex)) return 0; /* * If the caller does appear to be planning to write at or * beyond the end of the current extent, we then test to see * if the current extent is the last extent in the file, by * checking to make sure it was reached via the rightmost node * at each level of the tree. */ for (i = depth-1; i >= 0; i--) if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) return 0; out: ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); return ext4_mark_inode_dirty(handle, inode); 192 } /** 1 * ext4_find_delalloc_range: find delayed allocated block in the given range. 1 * * Return 1 if there is a delalloc block in the range, otherwise 0. */ int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, 192 ext4_lblk_t lblk_end) { struct extent_status es; ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ else if (es.es_lblk <= lblk_start && lblk_start < es.es_lblk + es.es_len) return 1; else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) return 1; else return 0; } int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); } /** * Determines how many complete clusters (out of those specified by the 'map') * are under delalloc and were reserved quota for. 
 * This function is called when we are writing out the blocks that were
 * originally written with their allocation delayed, but then the space was
 * allocated using fallocate() before the delayed allocation could be resolved.
 * The cases to look for are:
 * ('=' indicates delayed allocated blocks
 *  '-' indicates non-delayed allocated blocks)
 * (a) partial clusters towards beginning and/or end outside of allocated range
 *     are not delalloc'ed.
 *	Ex:
 *	|----c---=|====c====|====c====|===-c----|
 *	         |++++++ allocated ++++++|
 *	==> 4 complete clusters in above example
 *
 * (b) partial cluster (outside of allocated range) towards either end is
 *     marked for delayed allocation. In this case, we will exclude that
 *     cluster.
 *	Ex:
 *	|----====c========|========c========|
 *	     |++++++ allocated ++++++|
 *	==> 1 complete cluster in above example
 *
 *	Ex:
 *	|================c================|
 *            |++++++ allocated ++++++|
 *	==> 0 complete clusters in above example
 *
 * The ext4_da_update_reserve_space will be called only if we
 * determine here that there were some "entire" clusters that span
 * this 'allocated' range.
 * In the non-bigalloc case, this function will just end up returning num_blks
 * without ever calling ext4_find_delalloc_range.
*/ static unsigned int get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, 197 unsigned int num_blks) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 197 ext4_lblk_t alloc_cluster_start, alloc_cluster_end; ext4_lblk_t lblk_from, lblk_to, c_offset; unsigned int allocated_clusters = 0; alloc_cluster_start = EXT4_B2C(sbi, lblk_start); alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); /* max possible clusters for this allocation */ allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; 197 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); 197 /* Check towards left side */ c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 197 allocated_clusters--; } /* Now check towards right. */ c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } return allocated_clusters; } static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug("%s: inode %lu, logical" "block %llu, max_blocks %u\n", __func__, inode->i_ino, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { err 
= ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) return err; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); if (err) return err; map->m_flags |= EXT4_MAP_UNWRITTEN; if (allocated > map->m_len) allocated = map->m_len; 45 map->m_len = allocated; return allocated; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); 45 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 45 flags, allocated); 4 ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. 
*/ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, 4 allocated, newblock); 3 /* get_block() before submit the IO, split the extent */ 1 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 4 ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; 43 /* 26 * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to conversion to written when IO is 26 * completed 26 */ 26 if (io) ext4_set_io_unwritten_flag(inode, io); else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 26 map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); } else 17 err = ret; 1 map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; 16 goto out2; } /* buffered IO case */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { 2 map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ 15 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* 15 * We have blocks reserved already. We * return allocated blocks so that delalloc 15 * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. 
*/ 19 map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* buffered write, writepage time, convert*/ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { 7 err = ret; goto out2; 7 } else 7 allocated = ret; map->m_flags |= EXT4_MAP_NEW; 19 /* * if we allocated more blocks than requested * we need to make sure we unmap the extra block * allocated. The actual needed block will get * unmapped later when we find the buffer_head marked * new. */ if (allocated > map->m_len) { unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 18 newblock + map->m_len, allocated - map->m_len); 1 allocated = map->m_len; } map->m_len = allocated; 1 /* * If we have done fallocate with the offset that is already * delayed allocated, we would have block reservation * and quota reservation done in the delayed write path. * But fallocate would have already updated quota and block 20 * count for this offset. So cancel these reservation */ 19 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { unsigned int reserved_clusters; reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, map->m_len); if (reserved_clusters) ext4_da_update_reserve_space(inode, 20 reserved_clusters, 0); } map_out: map->m_flags |= EXT4_MAP_MAPPED; 45 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); if (err < 0) goto out2; } out1: if (allocated > map->m_len) allocated = map->m_len; ext4_ext_show_leaf(inode, path); map->m_pblk = newblock; map->m_len = allocated; out2: return err ? err : allocated; } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. 
 *	@sb	The filesystem superblock structure
 *	@map	The requested lblk->pblk mapping
 *	@ex	The extent structure which might contain an implied
 *			cluster allocation
 *
 * This function is called by ext4_ext_map_blocks() after we failed to
 * find blocks that were already in the inode's extent tree.  Hence,
 * we know that the beginning of the requested region cannot overlap
 * the extent from the inode's extent tree.  There are three cases we
 * want to catch.  The first is this case:
 *
 *		 |--- cluster # N--|
 *    |--- extent ---|	|---- requested region ---|
 *			|==========|
 *
 * The second case that we need to test for is this one:
 *
 *   |--------- cluster # N ----------------|
 *	   |--- requested region --|   |------- extent ----|
 *	   |=======================|
 *
 * The third case is when the requested region lies between two extents
 * within the same cluster:
 *          |------------- cluster # N-------------|
 * |----- ex -----|                  |---- ex_right ----|
 *                  |------ requested region ------|
 *                  |================|
 *
 * In each of the above cases, we need to set map->m_pblk and
 * map->m_len so that they correspond to the extent labelled as
 * "|====|" from cluster #N, since it is already in use for data in
 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
 * as a new "allocated" block region.  Otherwise, we will return 0 and
 * ext4_ext_map_blocks() will then allocate one or more new clusters
 * by calling ext4_mb_new_blocks().
*/ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Block allocation/map/preallocation routine for extents based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). 
/*
 * Block allocation/map/preallocation routine for extents based files.
 *
 * Looks up the extent covering map->m_lblk.  If one exists it is returned
 * (after conversion or unwritten-extent handling where @flags ask for it).
 * If none exists and EXT4_GET_BLOCKS_CREATE is set, blocks are taken either
 * from a cluster a neighbouring extent already occupies or from the block
 * allocator, and a new extent is inserted.
 *
 * return > 0, number of blocks mapped/allocated
 * return = 0, plain lookup found nothing
 * return < 0, error
 */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map, int flags)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent newex, *ex, *right_ex;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_fsblk_t newblock = 0;
	int free_on_err = 0, err = 0, depth, ret;
	unsigned int allocated = 0, offset = 0;
	unsigned int allocated_clusters = 0;
	struct ext4_allocation_request ar;
	ext4_io_end_t *io = ext4_inode_aio(inode);
	ext4_lblk_t cluster_offset;
	int mark_io_unwritten = 0;
	bool map_from_cluster = false;

	ext_debug("blocks %u/%u requested for inode %lu\n",
		  map->m_lblk, map->m_len, inode->i_ino);
	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

	/* Look up the leaf that would hold this block. */
	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		path = NULL;
		goto out2;
	}

	depth = ext_depth(inode);

	/*
	 * A leaf below the root must hold at least one extent.  (An empty
	 * leaf can exist transiently during tree modification, which is why
	 * the check lives here and not in ext4_find_extent().)
	 */
	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
		EXT4_ERROR_INODE(inode, "bad extent address "
				 "lblock: %lu, depth: %d pblock %lld",
				 (unsigned long) map->m_lblk, depth,
				 path[depth].p_block);
		err = -EFSCORRUPTED;
		goto out2;
	}

	ex = path[depth].p_ext;
	if (ex) {
		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
		/* length with the unwritten marker stripped */
		unsigned short ee_len = ext4_ext_get_actual_len(ex);

		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);

		/* The extent found covers the requested block. */
		if (in_range(map->m_lblk, ee_block, ee_len)) {
			newblock = map->m_lblk - ee_block + ee_start;
			/* blocks left in the extent from m_lblk onwards */
			allocated = ee_len - (map->m_lblk - ee_block);
			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
				  ee_block, ee_len, newblock);

			if (!ext4_ext_is_unwritten(ex)) {
				/* Initialized: return it unless asked to
				 * turn it into an unwritten extent. */
				if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
					goto out;
				allocated = convert_initialized_extent(
						handle, inode, map, &path,
						flags, allocated, newblock);
				goto out2;
			}

			ret = ext4_ext_handle_unwritten_extents(
				handle, inode, map, &path, flags,
				allocated, newblock);
			if (ret < 0)
				err = ret;
			else
				allocated = ret;
			goto out2;
		}
	}

	/*
	 * Nothing is mapped at the requested block.  Without the create
	 * flag all that is left is to cache the hole for later lookups.
	 */
	if (!(flags & EXT4_GET_BLOCKS_CREATE)) {
		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
		goto out2;
	}

	/*
	 * Okay, we need to do block allocation.
	 */
	newex.ee_block = cpu_to_le32(map->m_lblk);
	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);

	/*
	 * Bigalloc: the extent returned by the lookup may already own the
	 * cluster the request starts in.
	 */
	if (cluster_offset && ex &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map_from_cluster = true;
		goto got_allocated_blocks;
	}

	/* Locate the allocated neighbours on both sides. */
	ar.lleft = map->m_lblk;
	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
	if (err)
		goto out2;
	ar.lright = map->m_lblk;
	right_ex = NULL;
	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
				    &right_ex);
	if (err)
		goto out2;

	/* The right-hand neighbour may own the cluster as well. */
	if ((sbi->s_cluster_ratio > 1) && right_ex &&
	    get_implied_cluster_alloc(inode->i_sb, map, right_ex, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map_from_cluster = true;
		goto got_allocated_blocks;
	}

	/*
	 * Clamp the request to what one extent can describe:
	 * EXT_UNWRITTEN_MAX_LEN for an unwritten extent,
	 * EXT_INIT_MAX_LEN for an initialized one.
	 */
	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
		if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
			map->m_len = EXT_UNWRITTEN_MAX_LEN;
	} else if (map->m_len > EXT_INIT_MAX_LEN) {
		map->m_len = EXT_INIT_MAX_LEN;
	}

	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
	newex.ee_len = cpu_to_le16(map->m_len);
	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
	allocated = err ? ext4_ext_get_actual_len(&newex) : map->m_len;

	/* Build the allocation request. */
	ar.inode = inode;
	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
	ar.logical = map->m_lblk;
	/*
	 * The request is shifted back to the start of the cluster so that
	 * the physical block ends up at the same in-cluster offset as the
	 * logical block; get_implied_cluster_alloc() depends on that.
	 */
	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
	ar.goal -= offset;
	ar.logical -= offset;
	/* in-core preallocation is only used for regular files */
	ar.flags = S_ISREG(inode->i_mode) ? EXT4_MB_HINT_DATA : 0;
	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
		ar.flags |= EXT4_MB_USE_RESERVED;

	newblock = ext4_mb_new_blocks(handle, &ar, &err);
	if (!newblock)
		goto out2;
	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
		  ar.goal, newblock, allocated);
	free_on_err = 1;
	allocated_clusters = ar.len;
	ar.len = EXT4_C2B(sbi, ar.len) - offset;
	if (ar.len > allocated)
		ar.len = allocated;

got_allocated_blocks:
	/* Describe the new extent and try to insert it into the leaf. */
	ext4_ext_store_pblock(&newex, newblock + offset);
	newex.ee_len = cpu_to_le16(ar.len);
	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
		ext4_ext_mark_unwritten(&newex);
		map->m_flags |= EXT4_MAP_UNWRITTEN;
		/*
		 * Only IO that was prepared with PRE_IO needs the
		 * conversion at completion; remember to flag the io_end
		 * (or the inode when there is none) once the insert worked.
		 */
		if (flags & EXT4_GET_BLOCKS_PRE_IO)
			mark_io_unwritten = 1;
	}

	err = 0;
	if (!(flags & EXT4_GET_BLOCKS_KEEP_SIZE))
		err = check_eofblocks_fl(handle, inode, map->m_lblk,
					 path, ar.len);
	if (!err)
		err = ext4_ext_insert_extent(handle, inode, &path,
					     &newex, flags);

	if (!err && mark_io_unwritten) {
		if (io)
			ext4_set_io_unwritten_flag(inode, io);
		else
			ext4_set_inode_state(inode,
					     EXT4_STATE_DIO_UNWRITTEN);
	}

	if (err && free_on_err) {
		int free_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;

		/*
		 * Give back the data blocks just allocated.  Discarding
		 * preallocations here is not ideal, but the alternative
		 * would be doing it on every free.
		 */
		ext4_discard_preallocations(inode);
		ext4_free_blocks(handle, inode, NULL, newblock,
				 EXT4_C2B(sbi, allocated_clusters),
				 free_flags);
		goto out2;
	}

	/* The insert may have changed the extent; read it back. */
	newblock = ext4_ext_pblock(&newex);
	allocated = ext4_ext_get_actual_len(&newex);
	if (allocated > map->m_len)
		allocated = map->m_len;
	map->m_flags |= EXT4_MAP_NEW;

	/*
	 * Settle the reservation accounting that delayed allocation had
	 * deferred until the blocks really exist.
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
		/* clusters that had been reserved for this range */
		unsigned int resv = get_reserved_cluster_alloc(inode,
						map->m_lblk, allocated);

		if (!map_from_cluster) {
			BUG_ON(allocated_clusters < resv);
			if (resv < allocated_clusters) {
				struct ext4_inode_info *ei = EXT4_I(inode);
				int surplus = allocated_clusters - resv;

				/*
				 * More clusters were allocated than had
				 * been reserved for this range, i.e. some
				 * of them are shared with delalloc blocks
				 * outside it.  Put the difference back into
				 * the reservation pool so that those blocks
				 * can still claim a reservation when they
				 * are written out.
				 */
				dquot_reserve_block(inode,
						EXT4_C2B(sbi, surplus));
				spin_lock(&ei->i_block_reservation_lock);
				ei->i_reserved_data_blocks += surplus;
				spin_unlock(&ei->i_block_reservation_lock);
			}
			/*
			 * Claim quota for everything newly allocated.  This
			 * runs after the correction above so the reserve is
			 * not drained by accident.
			 */
			ext4_da_update_reserve_space(inode,
					allocated_clusters, 1);
		}
	}

	/*
	 * Record the transaction for fdatasync only when the new extent is
	 * _not_ an unwritten one.
	 */
	ext4_update_inode_fsync_trans(handle, inode,
				!(flags & EXT4_GET_BLOCKS_UNWRIT_EXT));
out:
	if (allocated > map->m_len)
		allocated = map->m_len;
	ext4_ext_show_leaf(inode, path);
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = newblock;
	map->m_len = allocated;
out2:
	ext4_ext_drop_refs(path);
	kfree(path);

	trace_ext4_ext_map_blocks_exit(inode, flags, map,
				       err ? err : allocated);
	return err ? err : allocated;
}
*/ /* we have to know where to truncate from in crash case */ 431 EXT4_I(inode)->i_disksize = inode->i_size; ext4_mark_inode_dirty(handle, inode); last_block = (inode->i_size + sb->s_blocksize - 1) 431 >> EXT4_BLOCK_SIZE_BITS(sb); 430 retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } if (err) { ext4_std_error(inode->i_sb, err); return; 166 } err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); ext4_std_error(inode->i_sb, err); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, int flags, int mode) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0; int ret2 = 0; int retries = 0; 4 int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; 166 map.m_lblk = offset; map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple 166 * extents. */ if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 166 /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); 166 /* 146 * We can only call ext_depth() on extent based inodes */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) depth = ext_depth(inode); 166 else depth = -1; retry: while (ret >= 0 && len) { /* 166 * Recalculate credits when extent tree depth changes. 
*/ if (depth >= 0 && depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } 48 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { 165 ret = PTR_ERR(handle); break; } 165 ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); 165 ret2 = ext4_journal_stop(handle); 164 break; } map.m_lblk += ret; 165 map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; 165 inode->i_ctime = ext4_current_time(inode); 108 if (new_size) { if (epos > new_size) epos = new_size; 48 if (ext4_update_inode_size(inode, epos) & 0x1) 47 inode->i_mtime = inode->i_ctime; } else { if (epos > inode->i_size) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 71 } 166 ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret2 = ext4_journal_stop(handle); if (ret2) break; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) { ret = 0; goto retry; } return ret > 0 ? ret2 : ret; } static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; int ret = 0; int flags; int credits; int partial_begin, partial_end; 147 loff_t start, end; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); if (!S_ISREG(inode->i_mode)) return -EINVAL; /* Call ext4_force_commit to flush all data in case of data=journal. */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); if (ret) return ret; } /* * Round up offset. 
This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. */ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); if (start < offset || end > offset + len) return -EINVAL; partial_begin = offset & ((1 << blkbits) - 1); partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); if (max_blocks < lblk) max_blocks = 0; else max_blocks -= lblk; mutex_lock(&inode->i_mutex); /* * Indirect files do not support unwritten extnets */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out_mutex; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) goto out_dio; } /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); /* * Prevent page faults from reinstantiating pages we have * released from page cache. 
*/ down_write(&EXT4_I(inode)->i_mmap_sem); ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { up_write(&EXT4_I(inode)->i_mmap_sem); goto out_dio; } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } if (!partial_begin && !partial_end) goto out_dio; /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode */ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; if (ext4_should_journal_data(inode)) credits += 2; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); goto out_dio; } inode->i_mtime = inode->i_ctime = ext4_current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { /* * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. */ if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); ext4_journal_stop(handle); out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: 204 mutex_unlock(&inode->i_mutex); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file 170 * operation, which gets called from sys_fallocate system call. 
* For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int flags; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). * * XXX It's not clear why zero range isn't working, but we'll * leave it disabled for encrypted inodes for now. This is a 204 * bug we should fix.... 34 */ if (ext4_encrypted_inode(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; 170 /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | 170 FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; 170 if (mode & FALLOC_FL_PUNCH_HOLE) 147 return ext4_punch_hole(inode, offset, len); 170 ret = ext4_convert_inline_data(inode); 170 if (ret) return ret; if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); if (mode & FALLOC_FL_INSERT_RANGE) return ext4_insert_range(inode, offset, len); if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); 170 trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - lblk; flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; mutex_lock(&inode->i_mutex); /* * We only support 
preallocation for extent-based files only */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; 166 goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) 70 goto out; } /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); 113 inode_dio_wait(inode); 113 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: mutex_unlock(&inode->i_mutex); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to 27 * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0; int ret2 = 0; struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - map.m_lblk); /* * This is somewhat ugly but the idea is clear: When transaction is 27 * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. 
27 */ 27 if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); 27 if (IS_ERR(handle)) return PTR_ERR(handle); credits = 0; } else { /* * credits to insert 1 extent into extent tree */ 27 credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 27 if (IS_ERR(handle)) { ret = PTR_ERR(handle); 27 break; } } ret = ext4_map_blocks(handle, inode, &map, 27 EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) 27 ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); if (credits) ret2 = ext4_journal_stop(handle); if (ret <= 0 || ret2) break; } if (!credits) ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } /* * If newes is not existing extent (newes->ec_pblk equals zero) find 5 * delayed extent at start of newes and update newes accordingly and 6 * return start of the next delayed extent. * * If newes is existing extent (newes->ec_pblk is not equal zero) * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed * extent found. Leave newes unmodified. */ static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes) { struct extent_status es; ext4_lblk_t block, next_del; 4 if (newes->es_pblk == 0) { 2 ext4_es_find_delayed_extent_range(inode, newes->es_lblk, newes->es_lblk + newes->es_len - 1, &es); 4 /* * No extent in extent-tree contains block @newes->es_pblk, 4 * then the block may stay in 1)a hole or 2)delayed-extent. */ if (es.es_len == 0) /* A hole found. */ 8 return 0; if (es.es_lblk > newes->es_lblk) { /* A hole found. 
*/ 2 newes->es_len = min(es.es_lblk - newes->es_lblk, newes->es_len); 8 return 0; } newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; } block = newes->es_lblk + newes->es_len; ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else next_del = es.es_lblk; return next_del; } 1 /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) static int ext4_xattr_fiemap(struct inode *inode, 1 struct fiemap_extent_info *fieinfo) { __u64 physical = 0; 1 __u64 length; __u32 flags = FIEMAP_EXTENT_LAST; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; /* in-inode? */ 1 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) 1 return error; 1 physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); } else { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; } 20 if (physical) error = fiemap_fill_next_extent(fieinfo, 0, physical, length, flags); return (error < 0 ? 
error : 0); } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { 20 ext4_lblk_t start_blk; 3 int error = 0; 20 if (ext4_has_inline_data(inode)) { int has_inline = 1; error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, 20 start, len); 9 if (has_inline) return error; 13 } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 11 error = ext4_ext_precache(inode); 1 if (error) return error; } /* fallback to generic here if not in extents fmt */ 10 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { error = ext4_xattr_fiemap(inode, fieinfo); } else { 10 ext4_lblk_t len_blks; __u64 last_blk; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. */ error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } return error; } /* * ext4_access_path: * Function to access the path buffer for marking it dirty. * It also checks if there are sufficient credits left in the journal handle * to update path. 
*/ static int ext4_access_path(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int credits, err; if (!ext4_handle_valid(handle)) return 0; /* * Check if need to extend journal credits * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups */ if (handle->h_buffer_credits < 7) { credits = ext4_writepage_trans_blocks(inode); err = ext4_ext_truncate_extend_restart(handle, inode, credits); /* EAGAIN is success */ if (err && err != -EAGAIN) return err; } err = ext4_ext_get_access(handle, inode, path); return err; } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = 0; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); err = ext4_access_path(handle, inode, path + depth); if (err) goto out; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. 
*/ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_access_path(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. 
*/ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. 
*/ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > 0) *iterator = le32_to_cpu(extent->ee_block) - 1; else /* Beginning is reached, end of the loop */ iterator = NULL; /* Update path extent in case we need to stop */ while (le32_to_cpu(extent->ee_block) < start) extent++; path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); if (ret) break; } out: ext4_ext_drop_refs(path); kfree(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; loff_t new_size, ioffset; int ret; /* * We need to test this early because xfstests assumes that a * collapse range of (0, 1) will return EOPNOTSUPP if the file * system does not support collapse range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs block size aligned offsets. 
*/ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || len & (EXT4_CLUSTER_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); /* Call ext4_force_commit to flush all data in case of data=journal. */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); if (ret) return ret; } mutex_lock(&inode->i_mutex); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (offset + len >= i_size_read(inode)) { ret = -EINVAL; goto out_mutex; } /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); /* * Need to round down offset to be aligned with page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* * Write tail of the last page before removed range since it will get * removed from the page cache below. */ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty * by i_mmap_sem. 
*/ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } new_size = i_size_read(inode) - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. 
*/ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret = 0, depth, split_flag = 0; loff_t ioffset; /* * We need to test this early because xfstests assumes that an * insert range of (0, 1) will return EOPNOTSUPP if the file * system does not support insert range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs block size aligned offsets. */ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || len & (EXT4_CLUSTER_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); /* Call ext4_force_commit to flush all data in case of data=journal */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); if (ret) return ret; } mutex_lock(&inode->i_mutex); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Check for wrap through zero */ if (inode->i_size + len > inode->i_sb->s_maxbytes) { ret = -EFBIG; goto out_mutex; } /* Offset should be less than i_size */ if (offset >= i_size_read(inode)) { ret = -EINVAL; goto out_mutex; } /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ down_write(&EXT4_I(inode)->i_mmap_sem); /* * Need to round down to align start offset to page size boundary * for page size > block size. 
*/ ioffset = round_down(offset, PAGE_SIZE); /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If offset_lblk is not the starting block of extent, split * the extent @offset_lblk */ if ((offset_lblk > ee_start_lblk) && (offset_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; ret = ext4_split_extent_at(handle, inode, &path, offset_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } ext4_ext_drop_refs(path); kfree(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { ext4_ext_drop_refs(path); kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } /* * if offset_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, ee_start_lblk > offset_lblk ? 
ee_start_lblk : offset_lblk, len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); return ret; } /** * ext4_swap_extents - Swap extents between two inodes * * @inode1: First inode * @inode2: Second inode * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap * @mark_unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other 43 * stuff such as page-cache locking consistency, bh mapping consistency or * extent's data copying must be performed by caller. * Locking: * i_mutex is held for both inodes * i_data_sem is locked for write for both inodes 43 * Assumptions: 43 * All pages from requested range are locked for both inodes 43 */ int 43 ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) 43 { struct ext4_ext_path *path1 = NULL; struct ext4_ext_path *path2 = NULL; int replaced_count = 0; 43 BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!mutex_is_locked(&inode1->i_mutex)); BUG_ON(!mutex_is_locked(&inode2->i_mutex)); 43 *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) return 0; *erp = ext4_es_remove_extent(inode2, lblk2, count); if (unlikely(*erp)) return 0; while (count) { 43 struct ext4_extent *ex1, *ex2, tmp_ex; ext4_lblk_t e1_blk, e2_blk; int e1_len, e2_len, len; int split = 0; path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); 43 if (IS_ERR(path1)) { *erp = PTR_ERR(path1); path1 = NULL; 16 finish: 
count = 0; goto repeat; 16 } path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); 16 if (IS_ERR(path2)) { 16 *erp = PTR_ERR(path2); path2 = NULL; goto finish; 16 } 16 ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have somthing to swap ? */ if (unlikely(!ex2 || !ex1)) 3 goto finish; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); e1_len = ext4_ext_get_actual_len(ex1); 3 e2_len = ext4_ext_get_actual_len(ex2); /* Hole handling */ 3 if (!in_range(lblk1, e1_blk, e1_len) || !in_range(lblk2, e2_blk, e2_len)) { ext4_lblk_t next1, next2; 1 /* if hole after extent, then go to next extent */ next1 = ext4_ext_next_allocated_block(path1); next2 = ext4_ext_next_allocated_block(path2); /* If hole before extent, then shift to that extent */ if (e1_blk > lblk1) next1 = e1_blk; if (e2_blk > lblk2) next2 = e1_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) goto finish; 16 /* Move to the rightest boundary */ len = next1 - lblk1; 2 if (len < next2 - lblk2) len = next2 - lblk2; if (len > count) len = count; lblk1 += len; 16 lblk2 += len; count -= len; 4 goto repeat; } /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1, 0); if (unlikely(*erp)) goto finish; } if (e2_blk < lblk2) { 16 split = 1; *erp = ext4_force_split_extent_at(handle, inode2, &path2, lblk2, 0); if (unlikely(*erp)) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, 16 * path must to be revalidated. 
*/ if (split) goto repeat; /* Prepare right boundary */ 16 len = count; if (len > e1_blk + e1_len - lblk1) 13 len = e1_blk + e1_len - lblk1; if (len > e2_blk + e2_len - lblk2) 3 len = e2_blk + e2_len - lblk2; if (len != e1_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, &path1, lblk1 + len, 0); if (unlikely(*erp)) goto finish; } 16 if (len != e2_len) { 5 split = 1; *erp = ext4_force_split_extent_at(handle, inode2, 16 &path2, lblk2 + len, 0); if (*erp) goto finish; } /* ext4_split_extent_at() may result in leaf extent split, 16 * path must to be revalidated. */ if (split) goto repeat; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); 16 if (unlikely(*erp)) 16 goto finish; 1 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) 16 goto finish; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); 16 ex1->ee_len = cpu_to_le16(e2_len); ex2->ee_len = cpu_to_le16(e1_len); if (unwritten) ext4_ext_mark_unwritten(ex2); if (ext4_ext_is_unwritten(&tmp_ex)) ext4_ext_mark_unwritten(ex1); ext4_ext_try_to_merge(handle, inode2, path2, ex2); ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + 16 path2->p_depth); if (unlikely(*erp)) goto finish; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* 43 * Looks scarry ah..? second inode already points to new blocks, * and it was successfully dirtied. But luckily error may happen * only due to journal error, so full transaction will be * aborted anyway. 43 */ if (unlikely(*erp)) goto finish; lblk1 += len; lblk2 += len; replaced_count += len; count -= len; repeat: ext4_ext_drop_refs(path1); kfree(path1); ext4_ext_drop_refs(path2); kfree(path2); path1 = path2 = NULL; } return replaced_count; }
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * For a prot_numa update we only hold mmap_sem for read so there is a
 * potential race with faulting where a pmd was temporarily none. This
 * function checks for a transhuge pmd under the appropriate lock. It
 * returns a pte if it was successfully locked or NULL if it raced with
 * a transhuge insertion.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	pte_t *pte;
	spinlock_t *pmdl;

	/* !prot_numa is protected by mmap_sem held for write */
	if (!prot_numa)
		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);

	pmdl = pmd_lock(vma->vm_mm, pmd);
	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
		spin_unlock(pmdl);
		return NULL;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
	spin_unlock(pmdl);
	return pte;
}

/*
 * Apply @newprot to every pte in [addr, end) under @pmd.  Returns the number
 * of entries that were rewritten (present ptes plus write migration entries
 * downgraded to read).
 */
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;

	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			if (preserve_write)
				ptent = pte_mkwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

/*
 * Walk the pmds covering [addr, end) under @pud, handling transparent huge
 * pmds in place (or splitting them when only partly covered) and delegating
 * everything else to change_pte_range().  Returns the number of pages updated.
 */
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE)
				split_huge_page_pmd(vma, addr, pmd);
			else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

/* Walk the puds covering [addr, end) under @pgd; returns pages updated. */
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

/*
 * Top of the page-table walk for [addr, end) of @vma.  Marks a TLB flush as
 * pending for the duration of the walk and flushes the range only if at
 * least one entry was modified.  Returns the number of pages updated.
 */
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_pud_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

/*
 * Change the protection of [start, end) in @vma to @newprot, dispatching to
 * the hugetlb implementation for hugetlb vmas.  Returns pages updated.
 */
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot,
				dirty_accountable, prot_numa);

	return pages;
}

/* pte callback: reject the walk if the pfn may not take the new pgprot. */
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

/* hugetlb callback: same check as prot_none_pte_entry(). */
static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

/* test_walk callback: always returns 0. */
static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

/*
 * Walk [start, end) and return -EACCES if any pfn is not allowed to be
 * remapped with the page protection derived from @newflags, 0 otherwise.
 */
static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
			   unsigned long end, unsigned long newflags)
{
	pgprot_t new_pgprot = vm_get_page_prot(newflags);
	struct mm_walk prot_none_walk = {
		.pte_entry = prot_none_pte_entry,
		.hugetlb_entry = prot_none_hugetlb_entry,
		.test_walk = prot_none_test,
		.mm = current->mm,
		.private = &new_pgprot,
	};

	return walk_page_range(start, end, &prot_none_walk);
}

/*
 * Apply @newflags to the [start, end) part of @vma, merging with neighbours
 * or splitting @vma as required.  *@pprev is updated to the vma that now
 * covers the range.  Returns 0 on success or a negative errno; on a split
 * failure any memory charged here is uncharged again.
 */
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state. This is a rather
	 * uncommon case, so doesn't need to be very optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
		error = prot_none_walk(vma, start, end, newflags);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mapping were accounted for
	 * even if read-only so there is no need to account for them here
	 */
	if (newflags & VM_WRITE) {
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*pprev) {
		vma = *pprev;
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * mprotect(2): change the protection of [start, start + len).  @start must
 * be page aligned; @len is rounded up to a page multiple.  Returns 0 on
 * success or a negative errno (-EINVAL, -ENOMEM, -EACCES, or whatever the
 * security hook / mprotect_fixup() reports).
 */
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	unsigned long vm_flags, nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;
	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC:
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		prot |= PROT_EXEC;

	vm_flags = calc_vm_prot_bits(prot);

	down_write(&current->mm->mmap_sem);

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long newflags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		newflags = vm_flags;
		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));

		/* newflags >> 4 shift VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}
#ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable.h> struct netns_frags { /* sysctls */ long high_thresh; long low_thresh; int timeout; struct inet_frags *f; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; }; /** * fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. * @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; atomic_t refcnt; struct sk_buff *fragments; /* Used in IPv6. */ struct rb_root rb_fragments; /* Used in IPv4. 
*/ struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; __u8 flags; u16 max_size; struct netns_frags *net; struct rcu_head rcu; }; struct inet_frags { int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*skb_free)(struct sk_buff *); void (*frag_expire)(unsigned long data); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { atomic_long_set(&nf->mem, 0); return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ unsigned int inet_frag_rbtree_purge(struct rb_root *root); static inline void inet_frag_put(struct inet_frag_queue *q) { 36 if (atomic_dec_and_test(&q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct netns_frags *nf) { 35 return atomic_long_read(&nf->mem); } static inline void sub_frag_mem_limit(struct netns_frags *nf, long val) { atomic_long_sub(val, &nf->mem); } static inline void add_frag_mem_limit(struct netns_frags *nf, long val) { atomic_long_add(val, &nf->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. 
*/ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; #endif
/*
 * drivers/staging/android/ion/ion_system_heap.c
 *
 * Copyright (C) 2011 Google, Inc.
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <asm/page.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "ion.h"
#include "ion_priv.h"

static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
				     __GFP_NORETRY) & ~__GFP_RECLAIM;
static gfp_t low_order_gfp_flags  = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
/* Page orders served by this heap, largest first. */
static const unsigned int orders[] = {8, 4, 0};
static const int num_orders = ARRAY_SIZE(orders);

/* Map a page order to its slot in orders[]; BUG()s on an unknown order. */
static int order_to_index(unsigned int order)
{
	int i;

	for (i = 0; i < num_orders; i++)
		if (order == orders[i])
			return i;
	BUG();
	return -1;
}

static inline unsigned int order_to_size(int order)
{
	return PAGE_SIZE << order;
}

struct ion_system_heap {
	struct ion_heap heap;
	struct ion_page_pool *pools[0];	/* one pool per entry of orders[] */
};

/*
 * Get one compound page of the given order: from the matching page pool for
 * uncached buffers, or straight from the page allocator (followed by a sync
 * for device) for cached ones.  Returns NULL on failure.
 */
static struct page *alloc_buffer_page(struct ion_system_heap *heap,
				      struct ion_buffer *buffer,
				      unsigned long order)
{
	bool cached = ion_buffer_cached(buffer);
	struct ion_page_pool *pool = heap->pools[order_to_index(order)];
	struct page *page;

	if (!cached) {
		page = ion_page_pool_alloc(pool);
	} else {
		gfp_t gfp_flags = low_order_gfp_flags;

		if (order > 4)
			gfp_flags = high_order_gfp_flags;
		page = alloc_pages(gfp_flags | __GFP_COMP, order);
		if (!page)
			return NULL;
		ion_pages_sync_for_device(NULL, page, PAGE_SIZE << order,
						DMA_BIDIRECTIONAL);
	}

	return page;
}

/*
 * Release a page obtained from alloc_buffer_page(): back to its pool for
 * uncached buffers (immediately freed when the shrinker asked for it), or to
 * the page allocator for cached ones.
 */
static void free_buffer_page(struct ion_system_heap *heap,
			     struct ion_buffer *buffer, struct page *page)
{
	unsigned int order = compound_order(page);
	bool cached = ion_buffer_cached(buffer);

	if (!cached) {
		struct ion_page_pool *pool = heap->pools[order_to_index(order)];

		if (buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE)
			ion_page_pool_free_immediate(pool, page);
		else
			ion_page_pool_free(pool, page);
	} else {
		__free_pages(page, order);
	}
}

/*
 * Try the supported orders from largest to smallest and return the first
 * page that fits in @size, does not exceed @max_order and can be allocated.
 */
static struct page *alloc_largest_available(struct ion_system_heap *heap,
					    struct ion_buffer *buffer,
					    unsigned long size,
					    unsigned int max_order)
{
	struct page *page;
	int i;

	for (i = 0; i < num_orders; i++) {
		if (size < order_to_size(orders[i]))
			continue;
		if (max_order < orders[i])
			continue;

		page = alloc_buffer_page(heap, buffer, orders[i]);
		if (!page)
			continue;

		return page;
	}

	return NULL;
}

/*
 * Build @buffer out of as few compound pages as possible and describe them
 * in a freshly allocated sg_table stored in buffer->priv_virt.  Returns 0,
 * -EINVAL for an alignment above PAGE_SIZE, or -ENOMEM.
 */
static int ion_system_heap_allocate(struct ion_heap *heap,
				     struct ion_buffer *buffer,
				     unsigned long size, unsigned long align,
				     unsigned long flags)
{
	struct ion_system_heap *sys_heap = container_of(heap,
							struct ion_system_heap,
							heap);
	struct sg_table *table;
	struct scatterlist *sg;
	struct list_head pages;
	struct page *page, *tmp_page;
	int i = 0;
	unsigned long size_remaining = PAGE_ALIGN(size);
	unsigned int max_order = orders[0];

	if (align > PAGE_SIZE)
		return -EINVAL;

	if (size / PAGE_SIZE > totalram_pages / 2)
		return -ENOMEM;

	INIT_LIST_HEAD(&pages);
	while (size_remaining > 0) {
		page = alloc_largest_available(sys_heap, buffer, size_remaining,
						max_order);
		if (!page)
			goto free_pages;
		list_add_tail(&page->lru, &pages);
		size_remaining -= PAGE_SIZE << compound_order(page);
		/* never ask for a larger order than the one we just got */
		max_order = compound_order(page);
		i++;
	}
	table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
	if (!table)
		goto free_pages;

	if (sg_alloc_table(table, i, GFP_KERNEL))
		goto free_table;

	sg = table->sgl;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0);
		sg = sg_next(sg);
		list_del(&page->lru);
	}

	buffer->priv_virt = table;
	return 0;

free_table:
	kfree(table);
free_pages:
	list_for_each_entry_safe(page, tmp_page, &pages, lru)
		free_buffer_page(sys_heap, buffer, page);
	return -ENOMEM;
}

/* Release every page of @buffer and the sg_table describing them. */
static void ion_system_heap_free(struct ion_buffer *buffer)
{
	struct ion_system_heap *sys_heap = container_of(buffer->heap,
							struct ion_system_heap,
							heap);
	struct sg_table *table = buffer->sg_table;
	bool cached = ion_buffer_cached(buffer);
	struct scatterlist *sg;
	int i;

	/*
	 *  uncached pages come from the page pools, zero them before returning
	 *  for security purposes (other allocations are zeroed at
	 *  alloc time)
	 */
	if (!cached && !(buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE))
		ion_heap_buffer_zero(buffer);

	for_each_sg(table->sgl, sg, table->nents, i)
		free_buffer_page(sys_heap, buffer, sg_page(sg));
	sg_free_table(table);
	kfree(table);
}

static struct sg_table *ion_system_heap_map_dma(struct ion_heap *heap,
						struct ion_buffer *buffer)
{
	return buffer->priv_virt;
}

static void ion_system_heap_unmap_dma(struct ion_heap *heap,
				      struct ion_buffer *buffer)
{
}

/*
 * Shrink the heap's page pools in orders[] sequence.  With @nr_to_scan == 0
 * every pool is visited with a zero scan count and the sum of the values
 * ion_page_pool_shrink() returns is reported; otherwise shrinking stops as
 * soon as @nr_to_scan pages have been freed.
 */
static int ion_system_heap_shrink(struct ion_heap *heap, gfp_t gfp_mask,
					int nr_to_scan)
{
	struct ion_system_heap *sys_heap;
	int nr_total = 0;
	int i, nr_freed;
	int only_scan = 0;

	sys_heap = container_of(heap, struct ion_system_heap, heap);

	if (!nr_to_scan)
		only_scan = 1;

	for (i = 0; i < num_orders; i++) {
		struct ion_page_pool *pool = sys_heap->pools[i];

		nr_freed = ion_page_pool_shrink(pool, gfp_mask, nr_to_scan);
		nr_total += nr_freed;

		if (!only_scan) {
			nr_to_scan -= nr_freed;
			/* shrink completed */
			if (nr_to_scan <= 0)
				break;
		}
	}

	return nr_total;
}

static struct ion_heap_ops system_heap_ops = {
	.allocate = ion_system_heap_allocate,
	.free = ion_system_heap_free,
	.map_dma = ion_system_heap_map_dma,
	.unmap_dma = ion_system_heap_unmap_dma,
	.map_kernel = ion_heap_map_kernel,
	.unmap_kernel = ion_heap_unmap_kernel,
	.map_user = ion_heap_map_user,
	.shrink = ion_system_heap_shrink,
};

/* Print per-pool highmem/lowmem page counts and byte totals to @s. */
static int ion_system_heap_debug_show(struct ion_heap *heap, struct seq_file *s,
				      void *unused)
{

	struct ion_system_heap *sys_heap = container_of(heap,
							struct ion_system_heap,
							heap);
	int i;

	for (i = 0; i < num_orders; i++) {
		struct ion_page_pool *pool = sys_heap->pools[i];

		seq_printf(s, "%d order %u highmem pages in pool = %lu total\n",
			   pool->high_count, pool->order,
			   (PAGE_SIZE << pool->order) * pool->high_count);
		seq_printf(s, "%d order %u lowmem pages in pool = %lu total\n",
			   pool->low_count, pool->order,
			   (PAGE_SIZE << pool->order) * pool->low_count);
	}
	return 0;
}

/*
 * Allocate the system heap together with one page pool per supported order.
 * Returns the embedded ion_heap or ERR_PTR(-ENOMEM).
 */
struct ion_heap *ion_system_heap_create(struct ion_platform_heap *unused)
{
	struct ion_system_heap *heap;
	int i;

	heap = kzalloc(sizeof(struct ion_system_heap) +
			sizeof(struct ion_page_pool *) * num_orders,
			GFP_KERNEL);
	if (!heap)
		return ERR_PTR(-ENOMEM);
	heap->heap.ops = &system_heap_ops;
	heap->heap.type = ION_HEAP_TYPE_SYSTEM;
	heap->heap.flags = ION_HEAP_FLAG_DEFER_FREE;

	for (i = 0; i < num_orders; i++) {
		struct ion_page_pool *pool;
		gfp_t gfp_flags = low_order_gfp_flags;

		if (orders[i] > 4)
			gfp_flags = high_order_gfp_flags;
		pool = ion_page_pool_create(gfp_flags, orders[i]);
		if (!pool)
			goto destroy_pools;
		heap->pools[i] = pool;
	}

	heap->heap.debug_show = ion_system_heap_debug_show;
	return &heap->heap;

destroy_pools:
	/* unwind only the pools created so far */
	while (i--)
		ion_page_pool_destroy(heap->pools[i]);
	kfree(heap);
	return ERR_PTR(-ENOMEM);
}

void ion_system_heap_destroy(struct ion_heap *heap)
{
	struct ion_system_heap *sys_heap = container_of(heap,
							struct ion_system_heap,
							heap);
	int i;

	for (i = 0; i < num_orders; i++)
		ion_page_pool_destroy(sys_heap->pools[i]);
	kfree(sys_heap);
}

/*
 * Allocate a physically contiguous buffer of @len bytes: take a block of
 * get_order(len) pages, split it, give back the pages beyond the page-aligned
 * length and describe the rest in a single-entry sg_table stored in
 * buffer->priv_virt.  Returns 0 or a negative errno.
 */
static int ion_system_contig_heap_allocate(struct ion_heap *heap,
					   struct ion_buffer *buffer,
					   unsigned long len,
					   unsigned long align,
					   unsigned long flags)
{
	int order = get_order(len);
	struct page *page;
	struct sg_table *table;
	unsigned long i;
	int ret;

	if (align > (PAGE_SIZE << order))
		return -EINVAL;

	page = alloc_pages(low_order_gfp_flags, order);
	if (!page)
		return -ENOMEM;

	split_page(page, order);

	len = PAGE_ALIGN(len);
	for (i = len >> PAGE_SHIFT; i < (1 << order); i++)
		__free_page(page + i);

	table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
	if (!table) {
		ret = -ENOMEM;
		goto free_pages;
	}

	ret = sg_alloc_table(table, 1, GFP_KERNEL);
	if (ret)
		goto free_table;

	sg_set_page(table->sgl, page, len, 0);

	buffer->priv_virt = table;

	ion_pages_sync_for_device(NULL, page, len, DMA_BIDIRECTIONAL);

	return 0;

free_table:
	kfree(table);
free_pages:
	for (i = 0; i < len >> PAGE_SHIFT; i++)
		__free_page(page + i);

	return ret;
}

/* Free the pages of a contiguous buffer and its sg_table. */
static void ion_system_contig_heap_free(struct ion_buffer *buffer)
{
	struct sg_table *table = buffer->priv_virt;
	struct page *page = sg_page(table->sgl);
	unsigned long pages = PAGE_ALIGN(buffer->size) >> PAGE_SHIFT;
	unsigned long i;

	for (i = 0; i < pages; i++)
		__free_page(page + i);
	sg_free_table(table);
	kfree(table);
}

/* Report the physical address and size of a contiguous buffer. */
static int ion_system_contig_heap_phys(struct ion_heap *heap,
				       struct ion_buffer *buffer,
				       ion_phys_addr_t *addr, size_t *len)
{
	struct sg_table *table = buffer->priv_virt;
	struct page *page = sg_page(table->sgl);
	*addr = page_to_phys(page);
	*len = buffer->size;
	return 0;
}

static struct sg_table *ion_system_contig_heap_map_dma(struct ion_heap *heap,
						struct ion_buffer *buffer)
{
	return buffer->priv_virt;
}

static void ion_system_contig_heap_unmap_dma(struct ion_heap *heap,
					     struct ion_buffer *buffer)
{
}

static struct ion_heap_ops kmalloc_ops = {
	.allocate = ion_system_contig_heap_allocate,
	.free = ion_system_contig_heap_free,
	.phys = ion_system_contig_heap_phys,
	.map_dma = ion_system_contig_heap_map_dma,
	.unmap_dma = ion_system_contig_heap_unmap_dma,
	.map_kernel = ion_heap_map_kernel,
	.unmap_kernel = ion_heap_unmap_kernel,
	.map_user = ion_heap_map_user,
};

struct ion_heap *ion_system_contig_heap_create(struct ion_platform_heap *unused)
{
	struct ion_heap *heap;

	heap = kzalloc(sizeof(struct ion_heap), GFP_KERNEL);
	if (!heap)
		return ERR_PTR(-ENOMEM);
	heap->ops = &kmalloc_ops;
	heap->type = ION_HEAP_TYPE_SYSTEM_CONTIG;
	return heap;
}

void ion_system_contig_heap_destroy(struct ion_heap *heap)
{
	kfree(heap);
}
#ifndef _LINUX_PAGEMAP_H #define _LINUX_PAGEMAP_H /* * Copyright 1995 Linus Torvalds */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/compiler.h> #include <asm/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page * allocation mode flags. */ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */ }; static inline void mapping_set_error(struct address_space *mapping, int error) { if (unlikely(error)) { if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline int mapping_unevictable(struct address_space *mapping) { if (mapping) 7 return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } static inline void mapping_set_exiting(struct address_space *mapping) { 673 set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(struct address_space *mapping) { 2 return test_bit(AS_EXITING, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { 820 return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; } /* Restricts the given gfp_mask to what the mapping allows. 
*/ static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, gfp_t gfp_mask) { 288 return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->flags = (m->flags & ~(__force unsigned long)__GFP_BITS_MASK) | (__force unsigned long)mask; } /* * The page cache can be done in larger chunks than * one page, because it allows for more efficient * throughput (it can then be mapped into user * space in smaller chunks for same flexibility). * * Or rather, it _will_ be done in larger chunks. */ #define PAGE_CACHE_SHIFT PAGE_SHIFT #define PAGE_CACHE_SIZE PAGE_SIZE #define PAGE_CACHE_MASK PAGE_MASK #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. * If the page is free (_count == 0), then _count is untouched, and 0 * is returned. Otherwise, _count is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): * this allows allocators to use a synchronize_rcu() to stabilize _count. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher * than expected, and put_page must be able to do the right thing when the * page has been finished with, no matter what it is subsequently allocated * for (because put_page is what is used here to drop an invalid speculative * reference). * * This is the interesting part of the lockless pagecache (and lockless * get_user_pages) locking protocol, where the lookup-side (eg. 
find_get_page) * has the following pattern: * 1. find page in radix tree * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _count (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page * * There are 2 critical interleavings that matter: * - 2 runs before A: in this case, A sees elevated refcount and bails out * - A runs before 2: in this case, 2 sees zero refcount and retries; * subsequently, B will complete and 1 will find no page, causing the * lookup to return NULL. * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. That's OK: the * old find_get_page using tree_lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. * * Lookups racing against pagecache insertion isn't a big problem: either 1 * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ static inline int page_cache_get_speculative(struct page *page) { 1685 VM_BUG_ON(in_interrupt()); #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing * this for us. * * Pagecache won't be truncated from interrupt context, so if we have * found a page in the radix tree here, we have pinned its refcount by * disabling preempt, and hence no need for the "speculative get" that * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_inc(&page->_count); #else 1685 if (unlikely(!get_page_unless_zero(page))) { /* * Either the page has been freed, or will be freed. 
* In either case, retry here and the caller should * do the right thing (see comments above). */ return 0; } #endif 1685 VM_BUG_ON_PAGE(PageTail(page), page); return 1; } /* * Same as above, but add instead of inc (could just be merged) */ static inline int page_cache_add_speculative(struct page *page, int count) { VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_add(count, &page->_count); #else if (unlikely(!atomic_add_unless(&page->_count, count, 0))) return 0; #endif VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); return 1; } static inline int page_freeze_refs(struct page *page, int count) { 1 return likely(atomic_cmpxchg(&page->_count, count, 0) == count); } static inline void page_unfreeze_refs(struct page *page, int count) { 19 VM_BUG_ON_PAGE(page_count(page) != 0, page); 1 VM_BUG_ON(count == 0); 18 atomic_set(&page->_count, count); } #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else static inline struct page *__page_cache_alloc(gfp_t gfp) { 689 return alloc_pages(gfp, 0); } #endif static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { 288 return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); } static inline struct page *page_cache_alloc_readahead(struct address_space *x) { 528 return __page_cache_alloc(mapping_gfp_mask(x) | __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); } typedef int filler_t(void *, struct page *); pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); #define FGP_ACCESSED 0x00000001 #define FGP_LOCK 0x00000002 #define FGP_CREAT 0x00000004 
#define FGP_WRITE 0x00000008 #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { 527 return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, int fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * Otherwise, %NULL is returned. * * find_lock_page() may sleep. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, FGP_LOCK, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. * * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! 
*/ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Same as grab_cache_page(), but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); /* * Returns locked page at given index in given cache, creating it if needed. 
*/ static inline struct page *grab_cache_page(struct address_space *mapping, pgoff_t index) { return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); } extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return read_cache_page(mapping, index, filler, data); } /* * Get the offset in PAGE_SIZE. * (TODO: hugepage should have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_pgoff(struct page *page) { if (unlikely(PageHeadHuge(page))) return page->index << compound_order(page); else return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); } /* * Return byte-offset into filesystem object for page. 
*/ static inline loff_t page_offset(struct page *page) { 97 return ((loff_t)page->index) << PAGE_CACHE_SHIFT; } static inline loff_t page_file_offset(struct page *page) { return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT; } 4 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; if (unlikely(is_vm_hugetlb_page(vma))) return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT); } extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 4 unsigned int flags); extern void unlock_page(struct page *page); static inline void __set_page_locked(struct page *page) { __set_bit(PG_locked, &page->flags); } static inline void __clear_page_locked(struct page *page) { __clear_bit(PG_locked, &page->flags); } static inline int trylock_page(struct page *page) { return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } 28 /* * lock_page may only be called if we have the page's inode pinned. */ static inline void lock_page(struct page *page) { 1737 might_sleep(); if (!trylock_page(page)) __lock_page(page); } /* * lock_page_killable is like lock_page but can be interrupted by fatal * signals. It returns 0 if it locked the page and -EINTR if it was 1095 * killed while waiting. 1146 */ 338 static inline int lock_page_killable(struct page *page) { might_sleep(); if (!trylock_page(page)) return __lock_page_killable(page); return 0; } /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_sem implications depend on flags; see * __lock_page_or_retry(). 
*/ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { might_sleep(); return trylock_page(page) || __lock_page_or_retry(page, mm, flags); } /* * This is exported only for wait_on_page_locked/wait_on_page_writeback, * and for filesystems which need to wait on PG_private. */ extern void wait_on_page_bit(struct page *page, int bit_nr); 76 extern int wait_on_page_bit_killable(struct page *page, int bit_nr); extern int wait_on_page_bit_killable_timeout(struct page *page, int bit_nr, unsigned long timeout); static inline int wait_on_page_locked_killable(struct page *page) { if (PageLocked(page)) return wait_on_page_bit_killable(page, PG_locked); return 0; } extern wait_queue_head_t *page_waitqueue(struct page *page); static inline void wake_up_page(struct page *page, int bit) { 20 __wake_up_bit(page_waitqueue(page), &page->flags, bit); 20 } /* * Wait for a page to be unlocked. * * This must be called with the caller "holding" the page, * ie with increased "page->count" so that the page won't * go away during the wait.. */ static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) wait_on_page_bit(page, PG_locked); } /* * Wait for a page to complete writeback */ static inline void wait_on_page_writeback(struct page *page) 2 { 21 if (PageWriteback(page)) wait_on_page_bit(page, PG_writeback); } extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); void page_endio(struct page *page, int rw, int err); 605 384 /* * Add an arbitrary waiter to a page's wait queue */ extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); /* * Fault a userspace page into pagetables. Return non-zero on a fault. * * This assumes that two userspace pages are always sufficient. That's * not true if PAGE_CACHE_SIZE > PAGE_SIZE. 
*/ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { int ret; if (unlikely(size == 0)) return 0; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ ret = __put_user(0, uaddr); if (ret == 0) { char __user *end = uaddr + size - 1; /* * If the page was already mapped, this will get a cache miss * for sure, so try to avoid doing it. */ 602 if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) 595 ret = __put_user(0, end); } return ret; } static inline int fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; 525 int ret; if (unlikely(size == 0)) return 0; ret = __get_user(c, uaddr); if (ret == 0) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) { ret = __get_user(c, end); (void)c; 267 } } 264 return ret; } /* 246 * Multipage variants of the above prefault helpers, useful if more than * PAGE_SIZE of data needs to be prefaulted. These are separate from the above * functions (which only handle up to PAGE_SIZE) to avoid clobbering the * filemap.c hotpaths. */ static inline int fault_in_multipages_writeable(char __user *uaddr, int size) { char __user *end = uaddr + size - 1; if (unlikely(size == 0)) return 0; if (unlikely(uaddr > end)) return -EFAULT; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ do { if (unlikely(__put_user(0, uaddr) != 0)) return -EFAULT; uaddr += PAGE_SIZE; } while (uaddr <= end); /* Check whether the range spilled into the next page. 
*/ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) return __put_user(0, end); return 0; } static inline int fault_in_multipages_readable(const char __user *uaddr, int size) { volatile char c; const char __user *end = uaddr + size - 1; if (unlikely(size == 0)) return 0; if (unlikely(uaddr > end)) return -EFAULT; do { if (unlikely(__get_user(c, uaddr) != 0)) return -EFAULT; uaddr += PAGE_SIZE; } while (uaddr <= end); 363 /* Check whether the range spilled into the next page. */ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) { 363 return __get_user(c, end); } 363 return 0; } 363 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); 255 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow, struct mem_cgroup *memcg); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __set_page_locked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; __set_page_locked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) __clear_page_locked(page); return error; } static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; } #endif /* _LINUX_PAGEMAP_H */
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base. * * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_rules.h> #include <net/inetpeer.h> #include <linux/percpu.h> struct fib_config { u8 fc_dst_len; u8 fc_tos; u8 fc_protocol; u8 fc_scope; u8 fc_type; /* 3 bytes unused */ u32 fc_table; __be32 fc_dst; __be32 fc_gw; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh { struct net_device *nh_dev; struct hlist_node nh_hash; struct fib_info *nh_parent; unsigned int nh_flags; unsigned char nh_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH int nh_weight; atomic_t nh_upper_bound; #endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif int nh_oif; __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; 
struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; struct lwtunnel_state *nh_lwtstate; }; /* * This structure contains data shared by many of routes. */ struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct net *fib_net; int fib_treeref; atomic_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; #endif struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev }; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_ROUTE_MULTIPATH #define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define FIB_RES_NH(res) ((res).fi->fib_nh[0]) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); #define FIB_RES_SADDR(net, res) \ ((FIB_RES_NH(res).nh_saddr_genid == \ atomic_read(&(net)->ipv4.dev_addr_genid)) ? 
\ FIB_RES_NH(res).nh_saddr : \ fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) #define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) #define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) #define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) #define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ FIB_RES_SADDR(net, res)) struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[0]; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct fib_table *, struct fib_config *); int fib_table_delete(struct fib_table *, struct fib_config *); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); int fib_table_flush(struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? 
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; 519 if (net->ipv4.fib_has_custom_rules) 211 return __fib_lookup(net, flp, res, flags); 390 rcu_read_lock(); 390 res->tclassid = 0; 390 tb = rcu_dereference_rtnl(net->ipv4.fib_main); 390 if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; 167 tb = rcu_dereference_rtnl(net->ipv4.fib_default); 167 if (tb) err = fib_table_lookup(tb, flp, res, flags); out: 167 if (err == -EAGAIN) err = -ENETUNREACH; 390 rcu_read_unlock(); return err; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); __be32 fib_compute_spec_dst(struct sk_buff *skb); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device 
*idev, u32 *itag); void fib_select_default(const struct flowi4 *flp, struct fib_result *res); #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return net->ipv4.fib_num_tclassid_users; } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net *net, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); extern u32 fib_multipath_secret __read_mostly; static inline int fib_multipath_hash(__be32 saddr, __be32 daddr) { return jhash_2words(saddr, daddr, fib_multipath_secret) >> 1; } void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, int mp_hash); /* Exported by fib_trie.c */ void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif *itag = FIB_RES_NH(*res).nh_tclassid<<16; #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void free_fib_info(struct fib_info *fi); static inline void fib_info_put(struct fib_info *fi) { if (atomic_dec_and_test(&fi->fib_clntref)) 56 free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif #endif /* _NET_FIB_H */
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 *
 * This file is released under the GPL v2.
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* First page offset covered by the vma. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff;
}

/* Last page offset covered by the vma (inclusive). */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
		     unsigned long, shared.rb_subtree_last,
		     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)

/* Insert node immediately after prev in the interval tree */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
				    struct vm_area_struct *prev,
				    struct rb_root *root)
{
	struct rb_node **link;
	struct vm_area_struct *parent;
	unsigned long last = vma_last_pgoff(node);

	VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

	if (!prev->shared.rb.rb_right) {
		/* No right subtree: node becomes prev's right child. */
		parent = prev;
		link = &prev->shared.rb.rb_right;
	} else {
		/*
		 * Otherwise descend to the leftmost node of prev's right
		 * subtree, raising rb_subtree_last along the way wherever
		 * it is smaller than node's last pgoff.
		 */
		parent = rb_entry(prev->shared.rb.rb_right,
				  struct vm_area_struct, shared.rb);
		if (parent->shared.rb_subtree_last < last)
			parent->shared.rb_subtree_last = last;
		while (parent->shared.rb.rb_left) {
			parent = rb_entry(parent->shared.rb.rb_left,
				struct vm_area_struct, shared.rb);
			if (parent->shared.rb_subtree_last < last)
				parent->shared.rb_subtree_last = last;
		}
		link = &parent->shared.rb.rb_left;
	}

	node->shared.rb_subtree_last = last;
	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
	rb_insert_augmented(&node->shared.rb, root,
			    &vma_interval_tree_augment);
}

static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
	return vma_start_pgoff(avc->vma);
}

static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
	return vma_last_pgoff(avc->vma);
}

INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
		     avc_start_pgoff, avc_last_pgoff,
		     static inline, __anon_vma_interval_tree)

/*
 * Insert @node into @root.  With CONFIG_DEBUG_VM_RB the current start/last
 * pgoff are cached in the node so anon_vma_interval_tree_verify() can later
 * compare them against the live values.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
				   struct rb_root *root)
{
#ifdef CONFIG_DEBUG_VM_RB
	node->cached_vma_start = avc_start_pgoff(node);
	node->cached_vma_last = avc_last_pgoff(node);
#endif
	__anon_vma_interval_tree_insert(node, root);
}

void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
				   struct rb_root *root)
{
	__anon_vma_interval_tree_remove(node, root);
}

struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root *root,
				  unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_first(root, first, last);
}

struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
				 unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/* Warn once if the cached start/last pgoff no longer match the vma. */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
	WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
	WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic TIME_WAIT sockets functions * * From code orinally in TCP */ #include <linux/kernel.h> #include <linux/kmemcheck.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer * * unhash a timewait socket from bind hash, if hashed. * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { 54 struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) return; 54 __hlist_del(&tw->tw_bind_node); 54 tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 54 __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { 54 struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); struct inet_bind_hashbucket *bhead; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); 54 spin_unlock(lock); /* Disassociate with bind bucket. 
*/ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) { 54 struct module *owner = tw->tw_prot->owner; 54 twsk_destructor((struct sock *)tw); #ifdef SOCK_REFCNT_DEBUG pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif 54 kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } void inet_twsk_put(struct inet_timewait_sock *tw) { 62 if (atomic_dec_and_test(&tw->tw_refcnt)) 54 inet_twsk_free(tw); 62 } EXPORT_SYMBOL_GPL(inet_twsk_put); static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, struct hlist_nulls_head *list) { 8 hlist_nulls_add_head_rcu(&tw->tw_node, list); } static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, struct hlist_head *list) { 8 hlist_add_head(&tw->tw_bind_node, list); } /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); 8 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. 
*/ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); 8 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); spin_lock(lock); /* * Step 2: Hash TW into tcp ehash chain. * Notes : * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. * - We have one reference from timer. * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ atomic_set(&tw->tw_refcnt, 4); 8 inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ 8 if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 8 spin_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); static void tw_timer_handler(unsigned long data) { struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; if (tw->tw_kill) NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); else NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); inet_twsk_kill(tw); } struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state) { struct inet_timewait_sock *tw; 8 if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) return NULL; 8 tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); kmemcheck_annotate_bitfield(tw, flags); 8 tw->tw_dr = dr; /* Give us an identity. 
*/ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; tw->tw_tos = inet->tos; tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; tw->tw_sport = inet->inet_sport; tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ atomic_set(&tw->tw_refcnt, 0); 8 __module_get(tw->tw_prot->owner); } return tw; } EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. * Warning : consume reference. * Caller should not access tw anymore. */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { 54 if (del_timer_sync(&tw->tw_timer)) 54 inet_twsk_kill(tw); 54 inet_twsk_put(tw); } EXPORT_SYMBOL(inet_twsk_deschedule_put); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. * * RATIONALE: if FIN arrived and we entered TIME-WAIT state, * our ACK acking that FIN can be lost. If N subsequent retransmitted * FINs (or previous seqments) are lost (probability of such event * is p^(N+1), where p is probability to lose single packet and * time to detect the loss is about RTO*(2^N - 1) with exponential * backoff). Normal timewait length is calculated so, that we * waited at least for one retransmitted FIN (maximal RTO is 120sec). * [ BTW Linux. 
following BSD, violates this requirement waiting * only for 60sec, we should wait at least for 240 secs. * Well, 240 consumes too much of resources 8) * ] * This interval is not reduced to catch old duplicate and * responces to our wandering segments living for two MSLs. * However, if we use PAWS to detect * old duplicates, we can reduce the interval to bounds required * by RTO, rather than MSL. So, if peer understands PAWS, we * kill tw bucket after 3.5*RTO (it is important that this number * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ 8 tw->tw_kill = timeo <= 4*HZ; if (!rearm) { 8 BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); 8 atomic_inc(&tw->tw_dr->tw_count); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); } 8 } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { struct inet_timewait_sock *tw; struct sock *sk; struct hlist_nulls_node *node; unsigned int slot; for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; restart_rcu: cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_state != TCP_TIME_WAIT) continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || atomic_read(&twsk_net(tw)->count)) continue; if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) continue; if (unlikely((tw->tw_family != family) || atomic_read(&twsk_net(tw)->count))) { inet_twsk_put(tw); goto restart; } rcu_read_unlock(); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto restart; rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(inet_twsk_purge);
/* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ int sysctl_max_syn_backlog = 256; EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { 24 spin_lock_init(&queue->rskq_lock); spin_lock_init(&queue->fastopenq.lock); queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). * * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. 
A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. A corner * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. 
* * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
/* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ #include <linux/cleancache.h> #include <linux/rmap.h> #include "internal.h" 2 static void clear_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_node *node; void **slot; /* Handled by shmem itself */ 2 if (shmem_mapping(mapping)) return; 2 spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; 2 if (*slot != entry) goto unlock; 2 radix_tree_replace_slot(slot, NULL); mapping->nrshadows--; if (!node) goto unlock; 2 workingset_node_shadows_dec(node); /* * Don't track node without shadow entries. * * Avoid acquiring the list_lru lock if already untracked. * The list_empty() test is safe as node->private_list is * protected by mapping->tree_lock. */ if (!workingset_node_shadows(node) && 2 !list_empty(&node->private_list)) 2 list_lru_del(&workingset_shadow_nodes, &node->private_list); 2 __radix_tree_delete_node(&mapping->page_tree, node); unlock: 2 spin_unlock_irq(&mapping->tree_lock); } /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected * @offset: start of the range to invalidate * @length: length of the range to invalidate * * do_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. 
* * do_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { void (*invalidatepage)(struct page *, unsigned int, unsigned int); 346 invalidatepage = page->mapping->a_ops->invalidatepage; #ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; #endif 17 if (invalidatepage) 343 (*invalidatepage)(page, offset, length); 1 } /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bale out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ static int truncate_complete_page(struct address_space *mapping, struct page *page) { 483 if (page->mapping != mapping) return -EIO; 483 if (page_has_private(page)) 342 do_invalidatepage(page, 0, PAGE_CACHE_SIZE); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ 483 cancel_dirty_page(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; } /* * This is for invalidate_mapping_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can * be marked dirty at any time too, so use remove_mapping which safely * discards clean, unused pages. 
* * Returns non-zero if the page was successfully invalidated. */ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { int ret; 18 if (page->mapping != mapping) return 0; 18 if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; 18 ret = remove_mapping(mapping, page); return ret; } int truncate_inode_page(struct address_space *mapping, struct page *page) { 483 if (page_mapped(page)) { unmap_mapping_range(mapping, 2 (loff_t)page->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE, 0); } 483 return truncate_complete_page(mapping, page); } /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { if (!mapping) return -EINVAL; /* * Only punch for normal data pages for now. * Handling other types like directories would need more auditing. */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; return truncate_inode_page(mapping, page); } EXPORT_SYMBOL(generic_error_remove_page); /* * Safely invalidate one page from its pagecache mapping. * It only drops clean, unused pages. The page must be locked. * * Returns 1 if the page is successfully invalidated, otherwise 0. */ int invalidate_inode_page(struct page *page) { 21 struct address_space *mapping = page_mapping(page); if (!mapping) 21 return 0; 21 if (PageDirty(page) || PageWriteback(page)) return 0; 21 if (page_mapped(page)) return 0; 18 return invalidate_complete_page(mapping, page); } /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. 
The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidatepage() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); 1196 if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; /* Offsets within partial pages */ 480 partial_start = lstart & (PAGE_CACHE_SIZE - 1); partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. 
*/ end = -1; else 30 end = (lend + 1) >> PAGE_CACHE_SHIFT; 460 pagevec_init(&pvec, 0); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, 480 min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { 449 for (i = 0; i < pagevec_count(&pvec); i++) { 449 struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) break; 449 if (radix_tree_exceptional_entry(page)) { 2 clear_exceptional_entry(mapping, index, page); continue; } 449 if (!trylock_page(page)) continue; WARN_ON(page->index != index); 449 if (PageWriteback(page)) { 7 unlock_page(page); continue; } 449 truncate_inode_page(mapping, page); unlock_page(page); } 449 pagevec_remove_exceptionals(&pvec); 449 pagevec_release(&pvec); 449 cond_resched(); index++; } 428 if (partial_start) { 35 struct page *page = find_lock_page(mapping, start - 1); if (page) { unsigned int top = PAGE_CACHE_SIZE; 21 if (start > end) { /* Truncation within a single page */ top = partial_end; partial_end = 0; } 21 wait_on_page_writeback(page); 21 zero_user_segment(page, partial_start, top); cleancache_invalidate_page(mapping, page); 21 if (page_has_private(page)) 14 do_invalidatepage(page, partial_start, top - partial_start); 21 unlock_page(page); page_cache_release(page); } } 428 if (partial_end) { struct page *page = find_lock_page(mapping, end); if (page) { wait_on_page_writeback(page); zero_user_segment(page, 0, partial_end); cleancache_invalidate_page(mapping, page); if (page_has_private(page)) do_invalidatepage(page, 0, partial_end); unlock_page(page); page_cache_release(page); } } /* * If the truncation happened within a single page no pages * will be released, just zeroed, so we can bail out now. 
*/ 428 if (start >= end) return; index = start; for ( ; ; ) { 428 cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { /* If all gone from start onwards, we're done */ 413 if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } 44 if (index == start && indices[0] >= end) { /* All gone out of hole to be punched, we're done */ 25 pagevec_remove_exceptionals(&pvec); 25 pagevec_release(&pvec); break; } 20 for (i = 0; i < pagevec_count(&pvec); i++) { 20 struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) { /* Restart punch to make sure all gone */ index = start - 1; break; } 20 if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } 20 lock_page(page); 20 WARN_ON(page->index != index); 20 wait_on_page_writeback(page); 20 truncate_inode_page(mapping, page); unlock_page(page); } 20 pagevec_remove_exceptionals(&pvec); 20 pagevec_release(&pvec); index++; } cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_mutex. * * Note: When this function returns, there can be a page in the process of * deletion (inside __delete_from_page_cache()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { 786 truncate_inode_pages_range(mapping, lstart, (loff_t)-1); } EXPORT_SYMBOL(truncate_inode_pages); /** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * * Called under (and serialized by) inode->i_mutex. 
*
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
	unsigned long nrshadows;
	unsigned long nrpages;

	/*
	 * Page reclaim can not participate in regular inode lifetime
	 * management (can't call iput()) and thus can race with the
	 * inode teardown.  Tell it when the address space is exiting,
	 * so that it does not install eviction information after the
	 * final truncate has begun.
	 */
	mapping_set_exiting(mapping);

	/*
	 * When reclaim installs eviction entries, it increases
	 * nrshadows first, then decreases nrpages.  Make sure we see
	 * this in the right order or we might miss an entry.
	 */
	nrpages = mapping->nrpages;
	smp_rmb();
	nrshadows = mapping->nrshadows;

	if (nrpages || nrshadows) {
		/*
		 * As truncation uses a lockless tree lookup, cycle
		 * the tree lock to make sure any ongoing tree
		 * modification that does not see AS_EXITING is
		 * completed before starting the final truncate.
		 */
		spin_lock_irq(&mapping->tree_lock);
		spin_unlock_irq(&mapping->tree_lock);
	}

	/*
	 * Cleancache needs notification even if there are no pages or shadow
	 * entries.
	 */
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages, if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;	/* sum of invalidate_inode_page() results */
	int i;

	pagevec_init(&pvec, 0);
	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			WARN_ON(page->index != index);
			ret = invalidate_inode_page(page);
			unlock_page(page);
			/*
			 * Invalidation is a hint that the page is no longer
			 * of interest and try to speed up its reclaim.
			 */
			if (!ret)
				deactivate_file_page(page);
			count += ret;
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
*/ static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { struct mem_cgroup *memcg; unsigned long flags; 343 if (page->mapping != mapping) return 0; 343 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; memcg = mem_cgroup_begin_page_stat(page); 343 spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; 343 BUG_ON(page_has_private(page)); 343 __delete_from_page_cache(page, NULL, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); 343 page_cache_release(page); /* pagecache ref */ return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); return 0; } static int do_launder_page(struct address_space *mapping, struct page *page) { 343 if (!PageDirty(page)) return 0; 38 if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) return 0; return mapping->a_ops->launder_page(page); } /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Returns -EBUSY if any pages could not be invalidated. 
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index;
	int i;
	int ret = 0;		/* last negative per-page result, else 0 */
	int ret2 = 0;		/* result for the page being processed */
	int did_range_unmap = 0;

	cleancache_invalidate_inode(mapping);
	pagevec_init(&pvec, 0);
	index = start;
	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
			indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* We rely upon deletion not changing page->index */
			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				clear_exceptional_entry(mapping, index, page);
				continue;
			}

			lock_page(page);
			WARN_ON(page->index != index);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   (loff_t)(1 + end - index)
							 << PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					   (loff_t)index << PAGE_CACHE_SHIFT,
					   PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
*/ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache(struct inode *inode, loff_t newsize) { 513 struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for * efficiency so that truncate_inode_pages does fewer * single-page unmaps. However after this first call, and * before truncate_inode_pages finishes, it is possible for * private pages to be COWed, which remain after * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ unmap_mapping_range(mapping, holebegin, 0, 1); truncate_inode_pages(mapping, newsize); unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); /** * truncate_setsize - update inode and pagecache for a new file size * @inode: inode * @newsize: new file size * * truncate_setsize updates i_size and performs pagecache truncation (if * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally * i_mutex but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. 
*/ void truncate_setsize(struct inode *inode, loff_t newsize) { 25 loff_t oldsize = inode->i_size; i_size_write(inode, newsize); if (newsize > oldsize) 6 pagecache_isize_extended(inode, oldsize, newsize); 25 truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** * pagecache_isize_extended - update pagecache after extension of i_size * @inode: inode for which i_size was extended * @from: original inode size * @to: new inode size * * Handle extension of inode size either caused by extending truncate or by * write starting after current i_size. We mark the page straddling current * i_size RO so that page_mkwrite() is called on the nearest write access to * the page. This way filesystem can be sure that page_mkwrite() is called on * the page before user writes to the page via mmap after the i_size has been * changed. * * The function must be called after i_size is updated so that page fault * coming after we unlock the page will already see the new i_size. * The function must be called while we still hold i_mutex - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ 1 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { 76 int bsize = i_blocksize(inode); loff_t rounded_from; struct page *page; pgoff_t index; WARN_ON(to > inode->i_size); 76 if (from >= to || bsize == PAGE_CACHE_SIZE) return; /* Page straddling @from will not have any hole block created? */ 5 rounded_from = round_up(from, bsize); 76 if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) return; 1 index = from >> PAGE_CACHE_SHIFT; page = find_lock_page(inode->i_mapping, index); /* Page not cached? Nothing to do */ if (!page) return; /* * See clear_page_dirty_for_io() for details why set_page_dirty() * is needed. 
*/ if (page_mkclean(page)) set_page_dirty(page); unlock_page(page); page_cache_release(page); } EXPORT_SYMBOL(pagecache_isize_extended); /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole * @lend: offset of last byte of hole * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) { 31 struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are * doing their own page rounding first. Note that unmap_mapping_range * allows holelen 0 for all, and we allow lend -1 for end of file. */ /* * Unlike in truncate_pagecache, unmap_mapping_range is called only * once (before truncating pagecache), and without "even_cows" flag: * hole-punching should not remove private COWed pages from the hole. */ if ((u64)unmap_end > (u64)unmap_start) 31 unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); 31 truncate_inode_pages_range(mapping, lstart, lend); } EXPORT_SYMBOL(truncate_pagecache_range);
/*
 * NTP state machine interfaces and logic.
 *
 * This code was mainly moved from kernel/timer.c and kernel/time.c
 * Please see those files for relevant copyright info and historical
 * changelogs.
 */
#include <linux/capability.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
#include <linux/hrtimer.h>
#include <linux/jiffies.h>
#include <linux/math64.h>
#include <linux/timex.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/rtc.h>

#include "ntp_internal.h"

/*
 * NTP timekeeping variables:
 *
 * Note: All of the NTP state is protected by the timekeeping locks.
 */


/* USER_HZ period (usecs): */
unsigned long			tick_usec = TICK_USEC;

/* SHIFTED_HZ period (nsecs): */
unsigned long			tick_nsec;

/*
 * NOTE(review): tick_length and tick_length_base carry no description
 * here; presumably the current and the nominal scaled tick length -
 * confirm against the code that updates them.
 */
static u64			tick_length;
static u64			tick_length_base;

#define SECS_PER_DAY		86400
#define MAX_TICKADJ		500LL		/* usecs */
/*
 * MAX_TICKADJ converted to nanoseconds, shifted left by NTP_SCALE_SHIFT
 * and divided by NTP_INTERVAL_FREQ.
 */
#define MAX_TICKADJ_SCALED \
	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
#define MAX_TAI_OFFSET		100000

/*
 * phase-lock loop variables
 */

/*
 * clock synchronization status
 *
 * (TIME_ERROR prevents overwriting the CMOS clock)
 */
static int			time_state = TIME_OK;

/* clock status bits: */
static int			time_status = STA_UNSYNC;

/* time adjustment (nsecs): */
static s64			time_offset;

/* pll time constant: */
static long			time_constant = 2;

/* maximum error (usecs): */
static long			time_maxerror = NTP_PHASE_LIMIT;

/* estimated error (usecs): */
static long			time_esterror = NTP_PHASE_LIMIT;

/* frequency offset (scaled nsecs/secs): */
static s64			time_freq;

/* time at last adjustment (secs): */
static long			time_reftime;

/*
 * NOTE(review): undocumented here; presumably the outstanding
 * adjtime()-style adjustment - confirm at its users.
 */
static long			time_adjust;

/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64			ntp_tick_adj;

/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
static time64_t			ntp_next_leap_sec = TIME64_MAX;

#ifdef CONFIG_NTP_PPS

/*
 * The following variables are used when a pulse-per-second (PPS) signal
 * is available. They establish the engineering parameters of the clock
 * discipline loop when controlled by the PPS signal.
 */
#define PPS_VALID	10	/* PPS signal watchdog max (s) */
#define PPS_POPCORN	4	/* popcorn spike threshold (shift) */
#define PPS_INTMIN	2	/* min freq interval (s) (shift) */
#define PPS_INTMAX	8	/* max freq interval (s) (shift) */
#define PPS_INTCOUNT	4	/* number of consecutive good intervals to
				   increase pps_shift or consecutive bad
				   intervals to decrease it */
#define PPS_MAXWANDER	100000	/* max PPS freq wander (ns/s) */

static int pps_valid;		/* signal watchdog counter */
static long pps_tf[3];		/* phase median filter */
static long pps_jitter;		/* current jitter (ns) */
static struct timespec64 pps_fbase; /* beginning of the last freq interval */
static int pps_shift;		/* current interval duration (s) (shift) */
static int pps_intcnt;		/* interval counter */
static s64 pps_freq;		/* frequency offset (scaled ns/s) */
static long pps_stabil;		/* current stability (scaled ns/s) */

/*
 * PPS signal quality monitors
 */
static long pps_calcnt;		/* calibration intervals */
static long pps_jitcnt;		/* jitter limit exceeded */
static long pps_stbcnt;		/* stability limit exceeded */
static long pps_errcnt;		/* calibration errors */


/* PPS kernel consumer compensates the whole phase error immediately.
 * Otherwise, reduce the offset by a fixed factor times the time constant.
*/ static inline s64 ntp_offset_chunk(s64 offset) { if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) return offset; else return shift_right(offset, SHIFT_PLL + time_constant); } static inline void pps_reset_freq_interval(void) { /* the PPS calibration interval may end surprisingly early */ pps_shift = PPS_INTMIN; pps_intcnt = 0; } /** * pps_clear - Clears the PPS state variables */ static inline void pps_clear(void) { pps_reset_freq_interval(); pps_tf[0] = 0; pps_tf[1] = 0; pps_tf[2] = 0; pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; pps_freq = 0; } /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. */ static inline void pps_dec_valid(void) { if (pps_valid > 0) pps_valid--; else { time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); pps_clear(); } } static inline void pps_set_freq(s64 freq) { pps_freq = freq; } static inline int is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ || ((status & (STA_PPSFREQ|STA_PPSTIME)) && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ || ((status & STA_PPSFREQ) && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; if (!(time_status & STA_NANO)) txc->jitter /= NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; txc->errcnt = pps_errcnt; txc->stbcnt = pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ static inline s64 ntp_offset_chunk(s64 offset) { return 
shift_right(offset, SHIFT_PLL + time_constant); } static inline void pps_reset_freq_interval(void) {} static inline void pps_clear(void) {} static inline void pps_dec_valid(void) {} static inline void pps_set_freq(s64 freq) {} static inline int is_error_status(int status) { return status & (STA_UNSYNC|STA_CLOCKERR); } static inline void pps_fill_timex(struct timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; txc->jitter = 0; txc->shift = 0; txc->stabil = 0; txc->jitcnt = 0; txc->calcnt = 0; txc->errcnt = 0; txc->stbcnt = 0; } #endif /* CONFIG_NTP_PPS */ /** * ntp_synced - Returns 1 if the NTP status is not UNSYNC * */ static inline int ntp_synced(void) { return !(time_status & STA_UNSYNC); } /* * NTP methods: */ /* * Update (tick_length, tick_length_base, tick_nsec), based * on (tick_usec, ntp_tick_adj, time_freq): */ static void ntp_update_frequency(void) { u64 second_length; u64 new_base; second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += ntp_tick_adj; second_length += time_freq; tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply * the change to the tick length immediately: */ tick_length += new_base - tick_length_base; tick_length_base = new_base; } static inline s64 ntp_update_offset_fll(s64 offset64, long secs) { time_status &= ~STA_MODE; if (secs < MINSEC) return 0; if (!(time_status & STA_FLL) && (secs <= MAXSEC)) return 0; time_status |= STA_MODE; return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) { s64 freq_adj; s64 offset64; long secs; if (!(time_status & STA_PLL)) return; if (!(time_status & STA_NANO)) offset *= NSEC_PER_USEC; /* * Scale the phase adjustment and * clamp to the operating range. 
*/ offset = min(offset, MAXPHASE); offset = max(offset, -MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ secs = get_seconds() - time_reftime; if (unlikely(time_status & STA_FREQHOLD)) secs = 0; time_reftime = get_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); /* * Clamp update interval to reduce PLL gain with low * sampling rate (e.g. intermittent network connection) * to avoid instability. */ if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) secs = 1 << (SHIFT_PLL + 1 + time_constant); freq_adj += (offset64 * secs) << (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); time_freq = max(freq_adj, -MAXFREQ_SCALED); time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } /** * ntp_clear - Clears the NTP state variables */ void ntp_clear(void) { time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(); tick_length = tick_length_base; time_offset = 0; ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } u64 ntp_tick_length(void) { return tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t * * Provides the time of the next leapsecond against CLOCK_REALTIME in * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. */ ktime_t ntp_get_next_leap(void) { ktime_t ret; 5 if ((time_state == TIME_INS) && (time_status & STA_INS)) return ktime_set(ntp_next_leap_sec, 0); 5 ret.tv64 = KTIME_MAX; return ret; } /* * this routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. * They were originally developed for SUN and DEC kernels. * All the kudos should go to Dave for this stuff. 
* * Also handles leap second processing, and returns leap offset */ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; /* * Leap second processing. If in leap-insert state at the end of the * day, the system clock is set back one second; if in leap-delete * state, the system clock is set ahead one second. */ switch (time_state) { case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; ntp_next_leap_sec = secs + SECS_PER_DAY - (secs % SECS_PER_DAY); } else if (time_status & STA_DEL) { time_state = TIME_DEL; ntp_next_leap_sec = secs + SECS_PER_DAY - ((secs+1) % SECS_PER_DAY); } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; } else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; } else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; if (time_maxerror > NTP_PHASE_LIMIT) { time_maxerror = NTP_PHASE_LIMIT; time_status |= STA_UNSYNC; } /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; /* Check PPS signal */ pps_dec_valid(); if (!time_adjust) goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / 
NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; out: return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { return -ENODEV; } int __weak update_persistent_clock64(struct timespec64 now64) { struct timespec now; now = timespec64_to_timespec(now64); return update_persistent_clock(now); } #endif #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { struct timespec64 now; struct timespec64 next; int fail = 1; /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... * We want the clock to be within a couple of ticks from the target. */ if (!ntp_synced()) { /* * Not synced, exit, do not restart a timer (if one is * running, let it run out). 
*/ return; } getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE fail = update_persistent_clock64(adjust); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) fail = rtc_set_ntp_time(adjust); #endif } next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; if (!fail || fail == -ENODEV) next.tv_sec = 659; else next.tv_sec = 0; if (next.tv_nsec >= NSEC_PER_SEC) { next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, timespec64_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) 5 { queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else void ntp_notify_cmos_timer(void) { } #endif /* * Propagate a new txc->status value into the NTP state: */ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } /* * If we turn on PLL adjustments then reset the * reference time to current time. 
*/ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = get_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; } static inline void process_adjtimex_modes(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); if (txc->modes & ADJ_NANO) time_status |= STA_NANO; if (txc->modes & ADJ_MICRO) time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); /* update pps_freq */ pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) time_maxerror = txc->maxerror; if (txc->modes & ADJ_ESTERROR) time_esterror = txc->esterror; if (txc->modes & ADJ_TIMECONST) { time_constant = txc->constant; if (!(time_status & STA_NANO)) time_constant += 4; time_constant = min(time_constant, (long)MAXTC); time_constant = max(time_constant, 0l); } if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) ntp_update_frequency(); } /** * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ 9 int ntp_validate_timex(struct timex *txc) { 5 if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ 4 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) 1 return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; 4 } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! 
*/ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; 6 } 1 if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; if (txc->modes & ADJ_NANO) { struct timespec ts; ts.tv_sec = txc->time.tv_sec; ts.tv_nsec = txc->time.tv_usec; if (!timespec_inject_offset_valid(&ts)) return -EINVAL; } else { if (!timeval_inject_offset_valid(&txc->time)) return -EINVAL; } } /* * Check for potential multiplication overflows that can 5 * only happen on 64-bit systems: 1 */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { 1 if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; 9 } return 0; } /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { 5 int result; 2 if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; 2 ntp_update_frequency(); } txc->offset = save_adjust; } else { 3 /* If there are input parameters, then process them: */ if (txc->modes) 3 process_adjtimex_modes(txc, ts, time_tai); 3 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 3 NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) txc->offset /= NSEC_PER_USEC; 5 } result = time_state; /* mostly `TIME_OK' */ /* check for errors */ if (is_error_status(time_status)) 5 result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; txc->constant = time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); 
txc->time.tv_sec = (time_t)ts->tv_sec; 5 txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; 5 /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { if ((time_state == TIME_INS) && (time_status & STA_INS)) { result = TIME_OOP; txc->tai++; txc->time.tv_sec--; } if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { result = TIME_WAIT; txc->tai--; txc->time.tv_sec++; } if ((time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) { result = TIME_WAIT; } 5 } return result; } #ifdef CONFIG_NTP_PPS /* actually struct pps_normtime is good old struct timespec, but it is * semantically different (and it is the reason why it was invented): * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ struct pps_normtime { s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; /* normalize the timestamp so that nsec is in the ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { .sec = ts.tv_sec, .nsec = ts.tv_nsec }; if (norm.nsec > (NSEC_PER_SEC >> 1)) { norm.nsec -= NSEC_PER_SEC; norm.sec++; } return norm; } /* get current phase correction and jitter */ static inline long pps_phase_filter_get(long *jitter) { *jitter = pps_tf[0] - pps_tf[1]; if (*jitter < 0) *jitter = -*jitter; /* TODO: test various filters */ return pps_tf[0]; } /* add the sample to the phase filter */ static inline void pps_phase_filter_add(long err) { pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0] = err; } /* decrease frequency calibration interval length. * It is halved after four consecutive unstable intervals. 
*/ static inline void pps_dec_freq_interval(void) { if (--pps_intcnt <= -PPS_INTCOUNT) { pps_intcnt = -PPS_INTCOUNT; if (pps_shift > PPS_INTMIN) { pps_shift--; pps_intcnt = 0; } } } /* increase frequency calibration interval length. * It is doubled after four consecutive stable intervals. */ static inline void pps_inc_freq_interval(void) { if (++pps_intcnt >= PPS_INTCOUNT) { pps_intcnt = PPS_INTCOUNT; if (pps_shift < PPS_INTMAX) { pps_shift++; pps_intcnt = 0; } } } /* update clock frequency based on MONOTONIC_RAW clock PPS signal * timestamps * * At the end of the calibration interval the difference between the * first and last MONOTONIC_RAW clock timestamps divided by the length * of the interval becomes the frequency update. If the interval was * too long, the data are discarded. * Returns the difference between old and new frequency values. */ static long hardpps_update_freq(struct pps_normtime freq_norm) { long delta, delta_mod; s64 ftemp; /* check if the frequency interval was too long */ if (freq_norm.sec > (2 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } /* here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * the interval is increased; otherwise it is decreased. 
*/ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); } else { /* good sample */ pps_inc_freq_interval(); } /* the stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring */ delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; /* if enabled, the system clock frequency is updated */ if ((time_status & STA_PPSFREQ) != 0 && (time_status & STA_FREQHOLD) == 0) { time_freq = pps_freq; ntp_update_frequency(); } return delta; } /* correct REALTIME clock phase error against PPS signal */ static void hardpps_update_phase(long error) { long correction = -error; long jitter; /* add the sample to the median filter */ pps_phase_filter_add(correction); correction = pps_phase_filter_get(&jitter); /* Nominal jitter is due to PPS signal noise. If it exceeds the * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. 
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { /* correct the time using the phase offset */ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); /* cancel running adjtime() */ time_adjust = 0; } /* update jitter */ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; } /* * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former * is used to correct clock phase error and the latter is used to * correct the frequency. * * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; pts_norm = pps_normalize_ts(*phase_ts); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); /* indicate signal presence */ time_status |= STA_PPSSIGNAL; pps_valid = PPS_VALID; /* when called for the first time, * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; return; } /* ok, now we have a base for frequency calculation */ freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); /* check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: 
PPSJITTER: bad pulse\n"); return; } /* signal is ok */ /* check if the current frequency interval is finished */ if (freq_norm.sec >= (1 << pps_shift)) { pps_calcnt++; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; hardpps_update_freq(freq_norm); } hardpps_update_phase(pts_norm.nsec); } #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) { int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); if (rc) return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); }
/* * RT Mutexes: blocking mutual exclusion locks with PI support * * started by Ingo Molnar and Thomas Gleixner: * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * This file contains the private data structure and API definitions. */ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H #include <linux/rtmutex.h> /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * * @tree_entry: pi node to enqueue into the mutex waiters tree * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif int prio; }; /* * Various helpers to access the waiters-tree: */ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; } static inline int task_has_pi_waiters(struct task_struct *p) { return !RB_EMPTY_ROOT(&p->pi_waiters); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, pi_tree_entry); } /* * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL #define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { 9 unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); } /* * Constants for rt mutex 
functions which have a selectable deadlock * detection. * * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are * no further PI adjustments to be made. * * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full * walk of the lock chain. */ enum rtmutex_chainwalk { RT_MUTEX_MIN_CHAINWALK, RT_MUTEX_FULL_CHAINWALK, }; /* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh); extern void rt_mutex_adjust_prio(struct task_struct *task); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else # include "rtmutex.h" #endif #endif
/* * lib/hexdump.c * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. See README and COPYING for * more details. */ #include <linux/types.h> #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/export.h> #include <asm/unaligned.h> const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); const char hex_asc_upper[] = "0123456789ABCDEF"; EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value * @ch: ascii character represents hex digit * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. */ int hex_to_bin(char ch) { 4 if ((ch >= '0') && (ch <= '9')) 2 return ch - '0'; 3 ch = tolower(ch); 3 if ((ch >= 'a') && (ch <= 'f')) 3 return ch - 'a' + 10; return -1; } EXPORT_SYMBOL(hex_to_bin); /** * hex2bin - convert an ascii hexadecimal string to its binary representation * @dst: binary result * @src: ascii hexadecimal string * @count: result length * * Return 0 on success, -1 in case of bad input. 
*/ int hex2bin(u8 *dst, const char *src, size_t count) { while (count--) { int hi = hex_to_bin(*src++); int lo = hex_to_bin(*src++); if ((hi < 0) || (lo < 0)) return -1; *dst++ = (hi << 4) | lo; } return 0; } EXPORT_SYMBOL(hex2bin); /** * bin2hex - convert binary data to an ascii hexadecimal string * @dst: ascii hexadecimal result * @src: binary data * @count: binary data length */ char *bin2hex(char *dst, const void *src, size_t count) { const unsigned char *_src = src; while (count--) dst = hex_byte_pack(dst, *_src++); return dst; } EXPORT_SYMBOL(bin2hex); /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @linebuf: where to put the converted data * @linebuflen: total size of @linebuf, including space for terminating NUL * @ascii: include ASCII after the hex output * * hex_dump_to_buffer() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data * to a hex + ASCII dump at the supplied memory location. * The converted output is always NUL-terminated. * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * * Return: * The amount of bytes placed in the buffer without terminating NUL. If the * output was truncated, then the return value is the number of bytes * (excluding the terminating NUL) which would have been written to the final * string if enough space had been available. 
*/ int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; int ngroups; u8 ch; int j, lx = 0; int ascii_column; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; if (len > rowsize) /* limit to one line at a time */ len = rowsize; if (!is_power_of_2(groupsize) || groupsize > 8) groupsize = 1; if ((len % groupsize) != 0) /* no mixed size output */ groupsize = 1; ngroups = len / groupsize; ascii_column = rowsize * 2 + rowsize / groupsize + 1; if (!linebuflen) goto overflow1; if (!len) goto nil; if (groupsize == 8) { const u64 *ptr8 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%16.16llx", j ? " " : "", get_unaligned(ptr8 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 4) { const u32 *ptr4 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%8.8x", j ? " " : "", get_unaligned(ptr4 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else if (groupsize == 2) { const u16 *ptr2 = buf; for (j = 0; j < ngroups; j++) { ret = snprintf(linebuf + lx, linebuflen - lx, "%s%4.4x", j ? " " : "", get_unaligned(ptr2 + j)); if (ret >= linebuflen - lx) goto overflow1; lx += ret; } } else { for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = hex_asc_lo(ch); if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } if (j) lx--; } if (!ascii) goto nil; while (lx < ascii_column) { if (linebuflen < lx + 2) goto overflow2; linebuf[lx++] = ' '; } for (j = 0; j < len; j++) { if (linebuflen < lx + 2) goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: linebuf[lx] = '\0'; return lx; overflow2: linebuf[lx++] = '\0'; overflow1: return ascii ? 
ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); #ifdef CONFIG_PRINTK /** * print_hex_dump - print a text hex dump to syslog for a binary blob of data * @level: kernel log level (e.g. KERN_DEBUG) * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump * to the kernel log at the specified kernel log level, with an optional * leading prefix. * * print_hex_dump() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * print_hex_dump() iterates over the entire input @buf, breaking it into * "line size" chunks to format and print. * * E.g.: * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS, * 16, 1, frame->data, frame->len, true); * * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode: * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode: * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~. 
*/ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: printk("%s%s%p: %s\n", level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); break; default: printk("%s%s%s\n", level, prefix_str, linebuf); break; } } } EXPORT_SYMBOL(print_hex_dump); #if !defined(CONFIG_DYNAMIC_DEBUG) /** * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @buf: data blob to dump * @len: number of bytes in the @buf * * Calls print_hex_dump(), with log level of KERN_DEBUG, * rowsize of 16, groupsize of 1, and ASCII output included. */ void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len) { print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, buf, len, true); } EXPORT_SYMBOL(print_hex_dump_bytes); #endif /* !defined(CONFIG_DYNAMIC_DEBUG) */ #endif /* defined(CONFIG_PRINTK) */
/* * Implementation of the kernel access vector cache (AVC). * * Authors: Stephen Smalley, <sds@epoch.ncsc.mil> * James Morris <jmorris@redhat.com> * * Update: KaiGai, Kohei <kaigai@ak.jp.nec.com> * Replaced the avc_lock spinlock by RCU. * * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/dcache.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/percpu.h> #include <linux/list.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/ip.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include "avc.h" #include "avc_ss.h" #include "classmap.h" #define AVC_CACHE_SLOTS 512 #define AVC_DEF_CACHE_THRESHOLD 512 #define AVC_CACHE_RECLAIM 16 #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS #define avc_cache_stats_incr(field) this_cpu_inc(avc_cache_stats.field) #else #define avc_cache_stats_incr(field) do {} while (0) #endif struct avc_entry { u32 ssid; u32 tsid; u16 tclass; struct av_decision avd; struct avc_xperms_node *xp_node; }; struct avc_node { struct avc_entry ae; struct hlist_node list; /* anchored in avc_cache->slots[i] */ struct rcu_head rhead; }; struct avc_xperms_decision_node { struct extended_perms_decision xpd; struct list_head xpd_list; /* list of extended_perms_decision */ }; struct avc_xperms_node { struct extended_perms xp; struct list_head xpd_head; /* list head of extended_perms_decision */ }; struct avc_cache { struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */ spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */ atomic_t lru_hint; /* LRU hint for reclaim scan */ atomic_t active_nodes; u32 latest_notif; /* latest 
revocation notification */ }; struct avc_callback_node { int (*callback) (u32 event); u32 events; struct avc_callback_node *next; }; /* Exported via selinufs */ unsigned int avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD; #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 }; #endif static struct avc_cache avc_cache; static struct avc_callback_node *avc_callbacks; static struct kmem_cache *avc_node_cachep; static struct kmem_cache *avc_xperms_data_cachep; static struct kmem_cache *avc_xperms_decision_cachep; static struct kmem_cache *avc_xperms_cachep; static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) { 2118 return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1); } /** * avc_dump_av - Display an access vector in human-readable form. * @tclass: target security class * @av: access vector */ static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) { const char **perms; int i, perm; if (av == 0) { audit_log_format(ab, " null"); return; } BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); perms = secclass_map[tclass-1].perms; audit_log_format(ab, " {"); i = 0; perm = 1; while (i < (sizeof(av) * 8)) { 149 if ((perm & av) && perms[i]) { 149 audit_log_format(ab, " %s", perms[i]); av &= ~perm; } 149 i++; perm <<= 1; } 149 if (av) audit_log_format(ab, " 0x%x", av); 149 audit_log_format(ab, " }"); } /** * avc_dump_query - Display a SID pair and a class in human-readable form. 
* @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class */ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass) { int rc; char *scontext; u32 scontext_len; rc = security_sid_to_context(ssid, &scontext, &scontext_len); if (rc) audit_log_format(ab, "ssid=%d", ssid); else { 149 audit_log_format(ab, "scontext=%s", scontext); kfree(scontext); } 149 rc = security_sid_to_context(tsid, &scontext, &scontext_len); if (rc) audit_log_format(ab, " tsid=%d", tsid); else { 149 audit_log_format(ab, " tcontext=%s", scontext); kfree(scontext); } 149 BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); 149 audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name); } /** * avc_init - Initialize the AVC. * * Initialize the access vector cache. */ void __init avc_init(void) { int i; for (i = 0; i < AVC_CACHE_SLOTS; i++) { INIT_HLIST_HEAD(&avc_cache.slots[i]); spin_lock_init(&avc_cache.slots_lock[i]); } atomic_set(&avc_cache.active_nodes, 0); atomic_set(&avc_cache.lru_hint, 0); avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node), 0, SLAB_PANIC, NULL); avc_xperms_cachep = kmem_cache_create("avc_xperms_node", sizeof(struct avc_xperms_node), 0, SLAB_PANIC, NULL); avc_xperms_decision_cachep = kmem_cache_create( "avc_xperms_decision_node", sizeof(struct avc_xperms_decision_node), 0, SLAB_PANIC, NULL); avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data", sizeof(struct extended_perms_data), 0, SLAB_PANIC, NULL); audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n"); } int avc_get_hash_stats(char *page) { int i, chain_len, max_chain_len, slots_used; struct avc_node *node; struct hlist_head *head; 8 rcu_read_lock(); slots_used = 0; max_chain_len = 0; 8 for (i = 0; i < AVC_CACHE_SLOTS; i++) { head = &avc_cache.slots[i]; 8 if (!hlist_empty(head)) { 8 slots_used++; chain_len = 0; 8 hlist_for_each_entry_rcu(node, head, list) 8 chain_len++; 8 if (chain_len > 
max_chain_len) max_chain_len = chain_len; } } 8 rcu_read_unlock(); return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n" "longest chain: %d\n", atomic_read(&avc_cache.active_nodes), slots_used, AVC_CACHE_SLOTS, max_chain_len); } /* * using a linked list for extended_perms_decision lookup because the list is * always small. i.e. less than 5, typically 1 */ static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node; list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { if (xpd_node->xpd.driver == driver) return &xpd_node->xpd; } return NULL; } static inline unsigned int avc_xperms_has_perm(struct extended_perms_decision *xpd, u8 perm, u8 which) { unsigned int rc = 0; if ((which == XPERMS_ALLOWED) && (xpd->used & XPERMS_ALLOWED)) rc = security_xperm_test(xpd->allowed->p, perm); else if ((which == XPERMS_AUDITALLOW) && (xpd->used & XPERMS_AUDITALLOW)) rc = security_xperm_test(xpd->auditallow->p, perm); else if ((which == XPERMS_DONTAUDIT) && (xpd->used & XPERMS_DONTAUDIT)) rc = security_xperm_test(xpd->dontaudit->p, perm); return rc; } static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, u8 driver, u8 perm) { struct extended_perms_decision *xpd; security_xperm_set(xp_node->xp.drivers.p, driver); xpd = avc_xperms_decision_lookup(driver, xp_node); if (xpd && xpd->allowed) security_xperm_set(xpd->allowed->p, perm); } static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node) { struct extended_perms_decision *xpd; xpd = &xpd_node->xpd; if (xpd->allowed) kmem_cache_free(avc_xperms_data_cachep, xpd->allowed); if (xpd->auditallow) kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow); if (xpd->dontaudit) kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit); kmem_cache_free(avc_xperms_decision_cachep, xpd_node); } static void avc_xperms_free(struct avc_xperms_node *xp_node) { struct avc_xperms_decision_node *xpd_node, 
*tmp; 56 if (!xp_node) return; list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) { list_del(&xpd_node->xpd_list); avc_xperms_decision_free(xpd_node); } 56 kmem_cache_free(avc_xperms_cachep, xp_node); } static void avc_copy_xperms_decision(struct extended_perms_decision *dest, struct extended_perms_decision *src) { dest->driver = src->driver; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) memcpy(dest->allowed->p, src->allowed->p, sizeof(src->allowed->p)); if (dest->used & XPERMS_AUDITALLOW) memcpy(dest->auditallow->p, src->auditallow->p, sizeof(src->auditallow->p)); if (dest->used & XPERMS_DONTAUDIT) memcpy(dest->dontaudit->p, src->dontaudit->p, sizeof(src->dontaudit->p)); } /* * similar to avc_copy_xperms_decision, but only copy decision * information relevant to this perm */ static inline void avc_quick_copy_xperms_decision(u8 perm, struct extended_perms_decision *dest, struct extended_perms_decision *src) { /* * compute index of the u32 of the 256 bits (8 u32s) that contain this * command permission */ u8 i = perm >> 5; dest->used = src->used; if (dest->used & XPERMS_ALLOWED) dest->allowed->p[i] = src->allowed->p[i]; if (dest->used & XPERMS_AUDITALLOW) dest->auditallow->p[i] = src->auditallow->p[i]; if (dest->used & XPERMS_DONTAUDIT) dest->dontaudit->p[i] = src->dontaudit->p[i]; } static struct avc_xperms_decision_node *avc_xperms_decision_alloc(u8 which) { struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, GFP_NOWAIT); if 
(!xpd->dontaudit) goto error; } return xpd_node; error: avc_xperms_decision_free(xpd_node); return NULL; } static int avc_add_xperms_decision(struct avc_node *node, struct extended_perms_decision *src) { struct avc_xperms_decision_node *dest_xpd; node->ae.xp_node->xp.len++; dest_xpd = avc_xperms_decision_alloc(src->used); if (!dest_xpd) return -ENOMEM; avc_copy_xperms_decision(&dest_xpd->xpd, src); list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); return 0; } static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); return xp_node; } static int avc_xperms_populate(struct avc_node *node, struct avc_xperms_node *src) { struct avc_xperms_node *dest; struct avc_xperms_decision_node *dest_xpd; struct avc_xperms_decision_node *src_xpd; if (src->xp.len == 0) return 0; dest = avc_xperms_alloc(); if (!dest) return -ENOMEM; memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); dest->xp.len = src->xp.len; /* for each source xpd allocate a destination xpd and copy */ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used); if (!dest_xpd) goto error; avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd); list_add(&dest_xpd->xpd_list, &dest->xpd_head); } node->ae.xp_node = dest; return 0; error: avc_xperms_free(dest); return -ENOMEM; } static inline u32 avc_xperms_audit_required(u32 requested, struct av_decision *avd, struct extended_perms_decision *xpd, u8 perm, int result, u32 *deniedp) { u32 denied, audited; denied = requested & ~avd->allowed; if (unlikely(denied)) { 1 audited = denied & avd->auditdeny; 1 if (audited && xpd) { if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT)) audited &= ~requested; } 1978 } else if (result) { audited = denied = requested; } else { 1978 audited = requested & avd->auditallow; if (audited && xpd) { 
if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW)) audited &= ~requested; } } *deniedp = denied; return audited; } static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, struct extended_perms_decision *xpd, u8 perm, int result, struct common_audit_data *ad) { u32 audited, denied; 1979 audited = avc_xperms_audit_required( requested, avd, xpd, perm, result, &denied); 1979 if (likely(!audited)) return 0; 1 return slow_avc_audit(ssid, tsid, tclass, requested, audited, denied, result, ad, 0); } static void avc_node_free(struct rcu_head *rhead) { struct avc_node *node = container_of(rhead, struct avc_node, rhead); avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); } static void avc_node_delete(struct avc_node *node) { 2064 hlist_del_rcu(&node->list); call_rcu(&node->rhead, avc_node_free); atomic_dec(&avc_cache.active_nodes); } static void avc_node_kill(struct avc_node *node) { 56 avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); atomic_dec(&avc_cache.active_nodes); } static void avc_node_replace(struct avc_node *new, struct avc_node *old) { 120 hlist_replace_rcu(&old->list, &new->list); call_rcu(&old->rhead, avc_node_free); atomic_dec(&avc_cache.active_nodes); } static inline int avc_reclaim_node(void) { struct avc_node *node; int hvalue, try, ecx; unsigned long flags; struct hlist_head *head; spinlock_t *lock; 2097 for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) { 2097 hvalue = atomic_inc_return(&avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1); head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; 3 if (!spin_trylock_irqsave(lock, flags)) continue; 2097 rcu_read_lock(); 2097 hlist_for_each_entry(node, head, list) { 2058 avc_node_delete(node); avc_cache_stats_incr(reclaims); ecx++; if (ecx >= AVC_CACHE_RECLAIM) { 1 rcu_read_unlock(); spin_unlock_irqrestore(lock, flags); goto out; } } 2097 
rcu_read_unlock(); spin_unlock_irqrestore(lock, flags); } out: return ecx; } 2097 static struct avc_node *avc_alloc_node(void) { struct avc_node *node; 2120 node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT); if (!node) goto out; 2120 INIT_HLIST_NODE(&node->list); avc_cache_stats_incr(allocations); if (atomic_inc_return(&avc_cache.active_nodes) > avc_cache_threshold) 2097 avc_reclaim_node(); out: 2120 return node; } static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { node->ae.ssid = ssid; node->ae.tsid = tsid; node->ae.tclass = tclass; memcpy(&node->ae.avd, avd, sizeof(node->ae.avd)); } static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node, *ret = NULL; int hvalue; struct hlist_head *head; hvalue = avc_hash(ssid, tsid, tclass); head = &avc_cache.slots[hvalue]; 7546 hlist_for_each_entry_rcu(node, head, list) { 7546 if (ssid == node->ae.ssid && 8180 tclass == node->ae.tclass && 7543 tsid == node->ae.tsid) { ret = node; break; } } return ret; } /** * avc_lookup - Look up an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * * Look up an AVC entry that is valid for the * (@ssid, @tsid), interpreting the permissions * based on @tclass. If a valid AVC entry exists, * then this function returns the avc_node. * Otherwise, this function returns NULL. 
*/ static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node; 8180 avc_cache_stats_incr(lookups); 8180 node = avc_search_node(ssid, tsid, tclass); if (node) return node; 2119 avc_cache_stats_incr(misses); return NULL; } static int avc_latest_notif_update(int seqno, int is_insert) { int ret = 0; static DEFINE_SPINLOCK(notif_lock); unsigned long flag; 6 spin_lock_irqsave(¬if_lock, flag); if (is_insert) { if (seqno < avc_cache.latest_notif) { printk(KERN_WARNING "SELinux: avc: seqno %d < latest_notif %d\n", seqno, avc_cache.latest_notif); ret = -EAGAIN; } } else { if (seqno > avc_cache.latest_notif) 6 avc_cache.latest_notif = seqno; } 2118 spin_unlock_irqrestore(¬if_lock, flag); return ret; } /** * avc_insert - Insert an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @avd: resulting av decision * @xp_node: resulting extended permissions * * Insert an AVC entry for the SID pair * (@ssid, @tsid) and class @tclass. * The access vectors and the sequence number are * normally provided by the security server in * response to a security_compute_av() call. If the * sequence number @avd->seqno is not less than the latest * revocation notification, then the function copies * the access vectors into a cache entry, returns * avc_node inserted. Otherwise, this function returns NULL. 
*/ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct avc_xperms_node *xp_node) { struct avc_node *pos, *node = NULL; int hvalue; unsigned long flag; 2118 if (avc_latest_notif_update(avd->seqno, 1)) goto out; node = avc_alloc_node(); if (node) { struct hlist_head *head; spinlock_t *lock; int rc = 0; 2118 hvalue = avc_hash(ssid, tsid, tclass); avc_node_populate(node, ssid, tsid, tclass, avd); rc = avc_xperms_populate(node, xp_node); if (rc) { kmem_cache_free(avc_node_cachep, node); return NULL; } 2118 head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); 119 hlist_for_each_entry(pos, head, list) { 119 if (pos->ae.ssid == ssid && 119 pos->ae.tsid == tsid && 112 pos->ae.tclass == tclass) { 112 avc_node_replace(node, pos); goto found; } } 2115 hlist_add_head_rcu(&node->list, head); found: 2118 spin_unlock_irqrestore(lock, flag); } out: return node; } /** * avc_audit_pre_callback - SELinux specific information * will be called by generic audit code * @ab: the audit buffer * @a: audit_data */ static void avc_audit_pre_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; 149 audit_log_format(ab, "avc: %s ", 149 ad->selinux_audit_data->denied ? "denied" : "granted"); 149 avc_dump_av(ab, ad->selinux_audit_data->tclass, ad->selinux_audit_data->audited); 149 audit_log_format(ab, " for "); } /** * avc_audit_post_callback - SELinux specific information * will be called by generic audit code * @ab: the audit buffer * @a: audit_data */ static void avc_audit_post_callback(struct audit_buffer *ab, void *a) { struct common_audit_data *ad = a; 149 audit_log_format(ab, " "); 149 avc_dump_query(ab, ad->selinux_audit_data->ssid, ad->selinux_audit_data->tsid, ad->selinux_audit_data->tclass); if (ad->selinux_audit_data->denied) { audit_log_format(ab, " permissive=%u", 149 ad->selinux_audit_data->result ? 
0 : 1); } 149 } /* This is the slow part of avc audit with big stack footprint */ noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited, u32 denied, int result, struct common_audit_data *a, unsigned flags) { struct common_audit_data stack_data; struct selinux_audit_data sad; 149 if (!a) { a = &stack_data; 133 a->type = LSM_AUDIT_DATA_NONE; } /* * When in a RCU walk do the audit on the RCU retry. This is because * the collection of the dname in an inode audit message is not RCU * safe. Note this may drop some audits when the situation changes * during retry. However this is logically just as if the operation * happened a little later. */ 44 if ((a->type == LSM_AUDIT_DATA_INODE) && 1 (flags & MAY_NOT_BLOCK)) return -ECHILD; 149 sad.tclass = tclass; sad.requested = requested; sad.ssid = ssid; sad.tsid = tsid; sad.audited = audited; sad.denied = denied; sad.result = result; a->selinux_audit_data = &sad; common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback); 150 return 0; } /** * avc_add_callback - Register a callback for security events. * @callback: callback function * @events: security events * * Register a callback function for events in the set @events. * Returns %0 on success or -%ENOMEM if insufficient memory * exists to add the callback. */ int __init avc_add_callback(int (*callback)(u32 event), u32 events) { struct avc_callback_node *c; int rc = 0; c = kmalloc(sizeof(*c), GFP_KERNEL); if (!c) { rc = -ENOMEM; goto out; } c->callback = callback; c->events = events; c->next = avc_callbacks; avc_callbacks = c; out: return rc; } /** * avc_update_node Update an AVC entry * @event : Updating event * @perms : Permission mask bits * @ssid,@tsid,@tclass : identifier of an AVC entry * @seqno : sequence number when decision was made * @xpd: extended_perms_decision to be added to the node * * if a valid AVC entry doesn't exist,this function returns -ENOENT. 
* if kmalloc() called internal returns NULL, this function returns -ENOMEM. * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. */ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno, struct extended_perms_decision *xpd, u32 flags) { int hvalue, rc = 0; unsigned long flag; struct avc_node *pos, *node, *orig = NULL; struct hlist_head *head; spinlock_t *lock; 59 node = avc_alloc_node(); if (!node) { rc = -ENOMEM; goto out; } /* Lock the target slot */ 59 hvalue = avc_hash(ssid, tsid, tclass); head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); 8 hlist_for_each_entry(pos, head, list) { 8 if (ssid == pos->ae.ssid && 8 tsid == pos->ae.tsid && 8 tclass == pos->ae.tclass && 8 seqno == pos->ae.avd.seqno){ orig = pos; break; } } if (!orig) { rc = -ENOENT; 56 avc_node_kill(node); goto out_unlock; } /* * Copy and replace original node. 
*/ 8 avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd); if (orig->ae.xp_node) { rc = avc_xperms_populate(node, orig->ae.xp_node); if (rc) { kmem_cache_free(avc_node_cachep, node); goto out_unlock; } } 8 switch (event) { case AVC_CALLBACK_GRANT: 8 node->ae.avd.allowed |= perms; if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: node->ae.avd.allowed &= ~perms; break; case AVC_CALLBACK_AUDITALLOW_ENABLE: node->ae.avd.auditallow |= perms; break; case AVC_CALLBACK_AUDITALLOW_DISABLE: node->ae.avd.auditallow &= ~perms; break; case AVC_CALLBACK_AUDITDENY_ENABLE: node->ae.avd.auditdeny |= perms; break; case AVC_CALLBACK_AUDITDENY_DISABLE: node->ae.avd.auditdeny &= ~perms; break; case AVC_CALLBACK_ADD_XPERMS: avc_add_xperms_decision(node, xpd); break; } 8 avc_node_replace(node, orig); out_unlock: 59 spin_unlock_irqrestore(lock, flag); out: 59 return rc; } /** * avc_flush - Flush the cache */ static void avc_flush(void) 6 { struct hlist_head *head; struct avc_node *node; spinlock_t *lock; unsigned long flag; int i; for (i = 0; i < AVC_CACHE_SLOTS; i++) { head = &avc_cache.slots[i]; lock = &avc_cache.slots_lock[i]; 6 spin_lock_irqsave(lock, flag); /* * With preemptable RCU, the outer spinlock does not * prevent RCU grace periods from ending. */ 6 rcu_read_lock(); 6 hlist_for_each_entry(node, head, list) 6 avc_node_delete(node); 6 rcu_read_unlock(); spin_unlock_irqrestore(lock, flag); } 6 } /** * avc_ss_reset - Flush the cache and revalidate migrated permissions. 
* @seqno: policy sequence number */ int avc_ss_reset(u32 seqno) { struct avc_callback_node *c; int rc = 0, tmprc; 6 avc_flush(); 6 for (c = avc_callbacks; c; c = c->next) { 6 if (c->events & AVC_CALLBACK_RESET) { 6 tmprc = c->callback(AVC_CALLBACK_RESET); /* save the first error encountered for the return value and continue processing the callbacks */ if (!rc) rc = tmprc; } } 6 avc_latest_notif_update(seqno, 0); return rc; } /* * Slow-path helper function for avc_has_perm_noaudit, * when the avc_node lookup fails. We get called with * the RCU read lock held, and need to return with it * still held, but drop if for the security compute. * * Don't inline this, since it's the slow-path and just * results in a bigger stack frame. */ static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd, struct avc_xperms_node *xp_node) { 2119 rcu_read_unlock(); INIT_LIST_HEAD(&xp_node->xpd_head); security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp); 2118 rcu_read_lock(); 2118 return avc_insert(ssid, tsid, tclass, avd, xp_node); } static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested, u8 driver, u8 xperm, unsigned flags, struct av_decision *avd) { 154 if (flags & AVC_STRICT) return -EACCES; 150 if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; 59 avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid, tsid, tclass, avd->seqno, NULL, flags); 154 return 0; } /* * The avc extended permissions logic adds an additional 256 bits of * permissions to an avc node when extended permissions for that node are * specified in the avtab. If the additional 256 permissions is not adequate, * as-is the case with ioctls, then multiple may be chained together and the * driver field is used to specify which set contains the permission. 
*/
int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
			u8 driver, u8 xperm, struct common_audit_data *ad)
{
	struct avc_node *node;
	struct av_decision avd;
	u32 denied;
	struct extended_perms_decision local_xpd;
	struct extended_perms_decision *xpd = NULL;
	struct extended_perms_data allowed;
	struct extended_perms_data auditallow;
	struct extended_perms_data dontaudit;
	struct avc_xperms_node local_xp_node;
	struct avc_xperms_node *xp_node;
	int rc = 0, rc2;

	xp_node = &local_xp_node;
	BUG_ON(!requested);

	rcu_read_lock();

	/* On a miss, compute into the on-stack xp_node; on a hit, use the cached one. */
	node = avc_lookup(ssid, tsid, tclass);
	if (unlikely(!node)) {
		node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
	} else {
		memcpy(&avd, &node->ae.avd, sizeof(avd));
		xp_node = node->ae.xp_node;
	}
	/* if extended permissions are not defined, only consider av_decision */
	if (!xp_node || !xp_node->xp.len)
		goto decision;

	/* Back the local decision with on-stack bitmaps. */
	local_xpd.allowed = &allowed;
	local_xpd.auditallow = &auditallow;
	local_xpd.dontaudit = &dontaudit;

	xpd = avc_xperms_decision_lookup(driver, xp_node);
	if (unlikely(!xpd)) {
		/*
		 * Compute the extended_perms_decision only if the driver
		 * is flagged
		 */
		if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
			avd.allowed &= ~requested;
			goto decision;
		}
		rcu_read_unlock();
		security_compute_xperms_decision(ssid, tsid, tclass, driver,
						&local_xpd);
		rcu_read_lock();
		avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm,
				ssid, tsid, tclass, avd.seqno, &local_xpd, 0);
	} else {
		avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
	}
	xpd = &local_xpd;

	if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
		avd.allowed &= ~requested;

decision:
	denied = requested & ~(avd.allowed);
	if (unlikely(denied))
		rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm,
				AVC_EXTENDED_PERMS, &avd);

	rcu_read_unlock();

	rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
			&avd, xpd, xperm, rc, ad);
	if (rc2)
		return rc2;
	return rc;
}

/**
 * avc_has_perm_noaudit - Check permissions but perform
no auditing. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @flags: AVC_STRICT or 0 * @avd: access vector decisions * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Return a copy of the decisions * in @avd. Return %0 if all @requested permissions are granted, * -%EACCES if any permissions are denied, or another -errno upon * other errors. This function is typically called by avc_has_perm(), * but may also be called directly to separate permission checking from * auditing, e.g. in cases where a lock must be held for the check but * should be released for the auditing. */ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned flags, struct av_decision *avd) { struct avc_node *node; struct avc_xperms_node xp_node; int rc = 0; u32 denied; 3717 BUG_ON(!requested); 7467 rcu_read_lock(); 7467 node = avc_lookup(ssid, tsid, tclass); if (unlikely(!node)) 1904 node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node); else 6968 memcpy(avd, &node->ae.avd, sizeof(*avd)); 7466 denied = requested & ~(avd->allowed); if (unlikely(denied)) 153 rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd); 7466 rcu_read_unlock(); return rc; } /** * avc_has_perm - Check permissions and perform any appropriate auditing. 
* @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @auditdata: auxiliary audit data * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Audit the granting or denial of * permissions in accordance with the policy. Return %0 if all @requested * permissions are granted, -%EACCES if any permissions are denied, or * another -errno upon other errors. */ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata) { struct av_decision avd; int rc, rc2; 7083 rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd); 7083 rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata, 0); if (rc2) return rc2; return rc; } int avc_has_perm_flags(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata, int flags) { struct av_decision avd; int rc, rc2; 679 rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd); 679 rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata, flags); if (rc2) return rc2; return rc; } u32 avc_policy_seqno(void) { 3291 return avc_cache.latest_notif; } void avc_disable(void) { /* * If you are looking at this because you have realized that we are * not destroying the avc_node_cachep it might be easy to fix, but * I don't know the memory barrier semantics well enough to know. It's * possible that some other task dereferenced security_ops when * it still pointed to selinux operations. If that is the case it's * possible that it is about to use the avc and is about to need the * avc_node_cachep. I know I could wrap the security.c security_ops call * in an rcu_lock, but seriously, it's not worth it. 
Instead I just flush * the cache and get that memory back. */ if (avc_node_cachep) { avc_flush(); /* kmem_cache_destroy(avc_node_cachep); */ } }
/* binder_alloc.c * * Android IPC Subsystem * * Copyright (C) 2007-2017 Google, Inc. * * This software is licensed under the terms of the GNU General Public * License version 2, as published by the Free Software Foundation, and * may be copied, distributed, and modified under those terms. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/cacheflush.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/rtmutex.h> #include <linux/rbtree.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/list_lru.h> #include "binder_alloc.h" #include "binder_trace.h" struct list_lru binder_alloc_lru; static DEFINE_MUTEX(binder_alloc_mmap_lock); enum { BINDER_DEBUG_OPEN_CLOSE = 1U << 1, BINDER_DEBUG_BUFFER_ALLOC = 1U << 2, BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3, }; static uint32_t binder_alloc_debug_mask; module_param_named(debug_mask, binder_alloc_debug_mask, uint, S_IWUSR | S_IRUGO); #define binder_alloc_debug(mask, x...) 
\ do { \ if (binder_alloc_debug_mask & mask) \ pr_info(x); \ } while (0) static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) { return list_entry(buffer->entry.next, struct binder_buffer, entry); } static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer) { return list_entry(buffer->entry.prev, struct binder_buffer, entry); } 64 static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { 64 if (list_is_last(&buffer->entry, &alloc->buffers)) 64 return (u8 *)alloc->buffer + alloc->buffer_size - (u8 *)buffer->data; 56 return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, struct binder_buffer *new_buffer) { 63 struct rb_node **p = &alloc->free_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; size_t buffer_size; size_t new_buffer_size; BUG_ON(!new_buffer->free); 63 new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer); 63 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: add free buffer, size %zd, at %pK\n", alloc->pid, new_buffer_size, new_buffer); 63 while (*p) { parent = *p; buffer = rb_entry(parent, struct binder_buffer, rb_node); 55 BUG_ON(!buffer->free); 55 buffer_size = binder_alloc_buffer_size(alloc, buffer); 55 if (new_buffer_size < buffer_size) 3 p = &parent->rb_left; else 52 p = &parent->rb_right; } 63 rb_link_node(&new_buffer->rb_node, parent, p); rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers); } static void binder_insert_allocated_buffer_locked( struct binder_alloc *alloc, struct binder_buffer *new_buffer) { struct rb_node **p = &alloc->allocated_buffers.rb_node; struct rb_node *parent = NULL; struct binder_buffer *buffer; BUG_ON(new_buffer->free); 55 while (*p) { parent = *p; buffer = rb_entry(parent, struct binder_buffer, rb_node); 4 BUG_ON(buffer->free); 4 if (new_buffer->data < buffer->data) p = &parent->rb_left; 4 else if (new_buffer->data 
> buffer->data) 4 p = &parent->rb_right; else BUG(); } 55 rb_link_node(&new_buffer->rb_node, parent, p); rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers); } static struct binder_buffer *binder_alloc_prepare_to_free_locked( struct binder_alloc *alloc, uintptr_t user_ptr) { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; void *kern_ptr; kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset); 1 while (n) { 1 buffer = rb_entry(n, struct binder_buffer, rb_node); 2 BUG_ON(buffer->free); 2 if (kern_ptr < buffer->data) 1 n = n->rb_left; 1 else if (kern_ptr > buffer->data) n = n->rb_right; else { /* * Guard against user threads attempting to * free the buffer when in use by kernel or * after it's already been freed. */ 1 if (!buffer->allow_user_free) return ERR_PTR(-EPERM); buffer->allow_user_free = 0; return buffer; } } return NULL; } /** * binder_alloc_buffer_lookup() - get buffer given user ptr * @alloc: binder_alloc for this proc * @user_ptr: User pointer to buffer data * * Validate userspace pointer to buffer data and return buffer corresponding to * that user pointer. Search the rb tree for buffer that matches user data * pointer. * * Return: Pointer to buffer or NULL */ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, uintptr_t user_ptr) { struct binder_buffer *buffer; 3 mutex_lock(&alloc->mutex); 2 buffer = binder_alloc_prepare_to_free_locked(alloc, user_ptr); 3 mutex_unlock(&alloc->mutex); return buffer; } 53 static int binder_update_page_range(struct binder_alloc *alloc, int allocate, void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; 57 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: %s pages %pK-%pK\n", alloc->pid, allocate ? 
"allocate" : "free", start, end); 57 if (end <= start) 57 return 0; 53 trace_binder_update_page_range(alloc, allocate, start, end); 53 if (allocate == 0) goto free_range; 4 for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { 52 page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; if (!page->page_ptr) { need_mm = true; break; } } /* Same as mmget_not_zero() in later kernel versions */ 48 if (need_mm && atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) 48 mm = alloc->vma_vm_mm; if (mm) { 48 down_read(&mm->mmap_sem); if (!mmget_still_valid(mm)) { if (allocate == 0) goto free_range; 48 goto err_no_vma; } vma = alloc->vma; } if (!vma && need_mm) { 4 pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", alloc->pid); goto err_no_vma; } 52 for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; bool on_lru; size_t index; 4 index = (page_addr - alloc->buffer) / PAGE_SIZE; 4 page = &alloc->pages[index]; if (page->page_ptr) { 4 trace_binder_alloc_lru_start(alloc, index); 51 on_lru = list_lru_del(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); 48 trace_binder_alloc_lru_end(alloc, index); continue; 48 } 48 if (WARN_ON(!vma)) goto err_page_ptr_cleared; trace_binder_alloc_page_start(alloc, index); page->page_ptr = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); 48 if (!page->page_ptr) { pr_err("%d: binder_alloc_buf failed for page at %pK\n", alloc->pid, page_addr); goto err_alloc_page_failed; } page->alloc = alloc; INIT_LIST_HEAD(&page->lru); ret = map_kernel_range_noflush((unsigned long)page_addr, PAGE_SIZE, PAGE_KERNEL, &page->page_ptr); flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { 48 pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n", alloc->pid, page_addr); goto err_map_kernel_failed; 1 } user_page_addr = (uintptr_t)page_addr + alloc->user_buffer_offset; ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); if (ret) { 47 
pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", 47 alloc->pid, user_page_addr); goto err_vm_insert_page_failed; 51 } if (index + 1 > alloc->pages_high) 51 alloc->pages_high = index + 1; 47 trace_binder_alloc_page_end(alloc, index); /* vm_insert_page does not seem to increment the refcount */ } if (mm) { up_read(&mm->mmap_sem); 29 mmput(mm); 30 } return 0; free_range: 29 for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { bool ret; 29 size_t index; 29 index = (page_addr - alloc->buffer) / PAGE_SIZE; page = &alloc->pages[index]; 29 trace_binder_free_lru_start(alloc, index); ret = list_lru_add(&binder_alloc_lru, &page->lru); WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); 1 continue; err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: __free_page(page->page_ptr); page->page_ptr = NULL; 30 err_alloc_page_failed: 1 err_page_ptr_cleared: ; } 30 err_no_vma: if (mm) { up_read(&mm->mmap_sem); mmput(mm); } return vma ? 
-ENOMEM : -ESRCH; } static struct binder_buffer *binder_alloc_new_buf_locked( struct binder_alloc *alloc, 57 size_t data_size, size_t offsets_size, size_t extra_buffers_size, int is_async) { struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; size_t buffer_size; struct rb_node *best_fit = NULL; void *has_page_addr; 8 void *end_page_addr; size_t size, data_offsets_size; int ret; if (alloc->vma == NULL) { 61 pr_err("%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); 60 } 2 data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); if (data_offsets_size < data_size || data_offsets_size < offsets_size) { 59 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, 58 "%d: got transaction with invalid size %zd-%zd\n", 1 alloc->pid, data_size, offsets_size); return ERR_PTR(-EINVAL); } size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *)); if (size < data_offsets_size || size < extra_buffers_size) { 58 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, 4 "%d: got transaction with invalid extra_buffers_size %zd\n", 1 alloc->pid, extra_buffers_size); return ERR_PTR(-EINVAL); } if (is_async && alloc->free_async_space < size + sizeof(struct binder_buffer)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd failed, no async space left\n", alloc->pid, size); return ERR_PTR(-ENOSPC); 57 } 57 /* Pad 0-size buffers so they get assigned unique addresses */ 57 size = max(size, sizeof(void *)); 57 while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); 56 BUG_ON(!buffer->free); 1 buffer_size = binder_alloc_buffer_size(alloc, buffer); 1 if (size < buffer_size) { best_fit = n; n = n->rb_left; } else if (size > buffer_size) n = n->rb_right; 57 else { best_fit = n; break; } } if (best_fit == NULL) { size_t allocated_buffers = 0; size_t largest_alloc_size = 0; 1 size_t total_alloc_size = 0; size_t free_buffers = 0; size_t largest_free_size = 0; size_t total_free_size = 0; for 
(n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) { buffer = rb_entry(n, struct binder_buffer, rb_node); buffer_size = binder_alloc_buffer_size(alloc, buffer); 1 allocated_buffers++; total_alloc_size += buffer_size; if (buffer_size > largest_alloc_size) 1 largest_alloc_size = buffer_size; 1 } for (n = rb_first(&alloc->free_buffers); n != NULL; n = rb_next(n)) { buffer = rb_entry(n, struct binder_buffer, rb_node); buffer_size = binder_alloc_buffer_size(alloc, buffer); 1 free_buffers++; total_free_size += buffer_size; if (buffer_size > largest_free_size) largest_free_size = buffer_size; } pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", alloc->pid, size); pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n", 56 total_alloc_size, allocated_buffers, largest_alloc_size, 56 total_free_size, free_buffers, largest_free_size); return ERR_PTR(-ENOSPC); } 56 if (n == NULL) { buffer = rb_entry(best_fit, struct binder_buffer, rb_node); buffer_size = binder_alloc_buffer_size(alloc, buffer); } 56 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n", 56 alloc->pid, size, buffer, buffer_size); has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); WARN_ON(n && buffer_size != size); end_page_addr = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); 1 if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; 55 ret = binder_update_page_range(alloc, 1, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) 55 return ERR_PTR(ret); if (buffer_size != size) { struct binder_buffer *new_buffer; new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); 55 if (!new_buffer) { 55 pr_err("%s: %d failed to alloc new buffer struct\n", 55 __func__, alloc->pid); goto err_alloc_buf_struct_failed; } new_buffer->data = (u8 *)buffer->data + size; 55 list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; 
binder_insert_free_buffer(alloc, new_buffer); 55 } rb_erase(best_fit, &alloc->free_buffers); buffer->free = 0; 55 buffer->allow_user_free = 0; binder_insert_allocated_buffer_locked(alloc, buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd got %pK\n", alloc->pid, size, buffer); 3 buffer->data_size = data_size; buffer->offsets_size = offsets_size; buffer->async_transaction = is_async; buffer->extra_buffers_size = extra_buffers_size; if (is_async) { alloc->free_async_space -= size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_alloc_buf size %zd async free %zd\n", alloc->pid, size, alloc->free_async_space); } return buffer; err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); return ERR_PTR(-ENOMEM); } /** * binder_alloc_new_buf() - Allocate a new binder buffer * @alloc: binder_alloc for this proc * @data_size: size of user data buffer * @offsets_size: user specified buffer offset * @extra_buffers_size: size of extra space for meta-data (eg, security context) * @is_async: buffer for async transaction * * Allocate a new buffer given the requested sizes. Returns * the kernel version of the buffer pointer. 
The size allocated * is the sum of the three given sizes (each rounded up to * pointer-sized boundary) * * Return: The allocated buffer or %NULL if error */ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, 65 size_t extra_buffers_size, 65 int is_async) { 65 struct binder_buffer *buffer; mutex_lock(&alloc->mutex); buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size, extra_buffers_size, is_async); mutex_unlock(&alloc->mutex); 27 return buffer; } static void *buffer_start_page(struct binder_buffer *buffer) { return (void *)((uintptr_t)buffer->data & PAGE_MASK); } static void *prev_buffer_end_page(struct binder_buffer *buffer) { return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK); } 31 static void binder_delete_free_buffer(struct binder_alloc *alloc, 31 struct binder_buffer *buffer) { 31 struct binder_buffer *prev, *next = NULL; bool to_free = true; 2 BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) { to_free = false; 31 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK share page with %pK\n", alloc->pid, buffer->data, prev->data); } if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = binder_buffer_next(buffer); if (buffer_start_page(next) == buffer_start_page(buffer)) { to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK share page with %pK\n", alloc->pid, 31 buffer->data, 2 next->data); } } if (PAGE_ALIGNED(buffer->data)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, 29 "%d: merge free, buffer start %pK is page aligned\n", 27 alloc->pid, buffer->data); to_free = false; } if (to_free) { 27 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", 31 alloc->pid, buffer->data, prev->data, next ? 
next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); kfree(buffer); } 31 static void binder_free_buf_locked(struct binder_alloc *alloc, struct binder_buffer *buffer) 31 { size_t size, buffer_size; buffer_size = binder_alloc_buffer_size(alloc, buffer); size = ALIGN(buffer->data_size, sizeof(void *)) + ALIGN(buffer->offsets_size, sizeof(void *)) + ALIGN(buffer->extra_buffers_size, sizeof(void *)); 31 31 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, 31 "%d: binder_free_buf %pK size %zd buffer_size %zd\n", 31 alloc->pid, buffer, size, buffer_size); 31 BUG_ON(buffer->free); 31 BUG_ON(size > buffer_size); 2 BUG_ON(buffer->transaction != NULL); BUG_ON(buffer->data < alloc->buffer); 2 BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { alloc->free_async_space += size + sizeof(struct binder_buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, "%d: binder_free_buf size %zd async free %zd\n", 31 alloc->pid, size, alloc->free_async_space); } binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); 31 rb_erase(&buffer->rb_node, &alloc->allocated_buffers); 31 buffer->free = 1; if (!list_is_last(&buffer->entry, &alloc->buffers)) { struct binder_buffer *next = binder_buffer_next(buffer); 31 if (next->free) { 2 rb_erase(&next->rb_node, &alloc->free_buffers); binder_delete_free_buffer(alloc, next); } } if (alloc->buffers.next != &buffer->entry) { struct binder_buffer *prev = binder_buffer_prev(buffer); if (prev->free) { 31 binder_delete_free_buffer(alloc, buffer); rb_erase(&prev->rb_node, &alloc->free_buffers); buffer = prev; } } binder_insert_free_buffer(alloc, buffer); } /** * binder_alloc_free_buf() - free a binder buffer * @alloc: binder_alloc for this proc * @buffer: kernel pointer to buffer * 31 * Free the buffer allocated via 
binder_alloc_new_buffer() */ void binder_alloc_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffer) { mutex_lock(&alloc->mutex); binder_free_buf_locked(alloc, buffer); mutex_unlock(&alloc->mutex); } /** * binder_alloc_mmap_handler() - map virtual address space for proc * @alloc: alloc structure for this proc * @vma: vma passed to mmap() * * Called by binder_mmap() to initialize the space specified in * vma for allocating binder buffers * * Return: * 0 = success * -EBUSY = address space already mapped * -ENOMEM = failed to map memory to given address space */ int binder_alloc_mmap_handler(struct binder_alloc *alloc, struct vm_area_struct *vma) { 22 int ret; struct vm_struct *area; const char *failure_string; struct binder_buffer *buffer; mutex_lock(&binder_alloc_mmap_lock); if (alloc->buffer) { 21 ret = -EBUSY; failure_string = "already mapped"; goto err_already_mapped; } area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC); 21 if (area == NULL) { ret = -ENOMEM; failure_string = "get_vm_area"; goto err_get_vm_area_failed; } alloc->buffer = area->addr; alloc->user_buffer_offset = vma->vm_start - (uintptr_t)alloc->buffer; mutex_unlock(&binder_alloc_mmap_lock); #ifdef CONFIG_CPU_CACHE_VIPT if (cache_is_vipt_aliasing()) { while (CACHE_COLOUR( (vma->vm_start ^ (uint32_t)alloc->buffer))) { pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n", alloc->pid, vma->vm_start, vma->vm_end, alloc->buffer); vma->vm_start += PAGE_SIZE; } } #endif alloc->pages = kzalloc(sizeof(alloc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); 21 if (alloc->pages == NULL) { ret = -ENOMEM; failure_string = "alloc page array"; goto err_alloc_pages_failed; } alloc->buffer_size = vma->vm_end - vma->vm_start; buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); if (!buffer) { 21 ret = -ENOMEM; 21 failure_string = "alloc buffer struct"; 21 goto err_alloc_buf_struct_failed; } buffer->data = alloc->buffer; list_add(&buffer->entry, &alloc->buffers); buffer->free = 
1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; barrier(); 22 alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; /* Same as mmgrab() in later kernel versions */ atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; err_alloc_buf_struct_failed: kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: 1 mutex_lock(&binder_alloc_mmap_lock); vfree(alloc->buffer); alloc->buffer = NULL; err_get_vm_area_failed: err_already_mapped: mutex_unlock(&binder_alloc_mmap_lock); pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } void binder_alloc_deferred_release(struct binder_alloc *alloc) { struct rb_node *n; int buffers, page_count; struct binder_buffer *buffer; BUG_ON(alloc->vma); buffers = 0; mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); /* Transaction should already have been freed */ BUG_ON(buffer->transaction); binder_free_buf_locked(alloc, buffer); buffers++; } while (!list_empty(&alloc->buffers)) { buffer = list_first_entry(&alloc->buffers, struct binder_buffer, entry); WARN_ON(!buffer->free); list_del(&buffer->entry); WARN_ON_ONCE(!list_empty(&alloc->buffers)); kfree(buffer); } page_count = 0; if (alloc->pages) { int i; for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { void *page_addr; bool on_lru; if (!alloc->pages[i].page_ptr) continue; on_lru = list_lru_del(&binder_alloc_lru, &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%s: %d: page %d at %pK %s\n", __func__, alloc->pid, i, page_addr, on_lru ? 
"on lru" : "active"); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); __free_page(alloc->pages[i].page_ptr); page_count++; } kfree(alloc->pages); vfree(alloc->buffer); } mutex_unlock(&alloc->mutex); if (alloc->vma_vm_mm) mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", __func__, alloc->pid, buffers, page_count); } static void print_binder_buffer(struct seq_file *m, const char *prefix, struct binder_buffer *buffer) { seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n", prefix, buffer->debug_id, buffer->data, buffer->data_size, buffer->offsets_size, buffer->extra_buffers_size, buffer->transaction ? "active" : "delivered"); } /** * binder_alloc_print_allocated() - print buffer info * @m: seq_file for output via seq_printf() * @alloc: binder_alloc for this proc * * Prints information about every buffer associated with * the binder_alloc state to the given seq_file */ void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc) { struct rb_node *n; mutex_lock(&alloc->mutex); for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) print_binder_buffer(m, " buffer", rb_entry(n, struct binder_buffer, rb_node)); mutex_unlock(&alloc->mutex); } /** * binder_alloc_print_pages() - print page usage * @m: seq_file for output via seq_printf() * @alloc: binder_alloc for this proc */ void binder_alloc_print_pages(struct seq_file *m, struct binder_alloc *alloc) { struct binder_lru_page *page; int i; int active = 0; int lru = 0; int free = 0; mutex_lock(&alloc->mutex); for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { page = &alloc->pages[i]; if (!page->page_ptr) free++; else if (list_empty(&page->lru)) active++; else lru++; } mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); } /** * binder_alloc_get_allocated_count() - return count of buffers * @alloc: binder_alloc for this proc * * 
Return: count of allocated buffers */ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) { struct rb_node *n; int count = 0; mutex_lock(&alloc->mutex); for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n)) count++; mutex_unlock(&alloc->mutex); return count; } /** * binder_alloc_vma_close() - invalidate address space * @alloc: binder_alloc for this proc * * Called from binder_vma_close() when releasing address space. 4 * Clears alloc->vma to prevent new incoming transactions from * allocating more buffers. */ void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); } /** * binder_alloc_free_page() - shrinker callback to free pages * @item: item to free * @lock: lock protecting the item * @cb_arg: callback argument * * Called from list_lru_walk() in binder_shrink_scan() to free * up pages when the system is under memory pressure. */ enum lru_status binder_alloc_free_page(struct list_head *item, struct list_lru_one *lru, spinlock_t *lock, void *cb_arg) { struct mm_struct *mm = NULL; struct binder_lru_page *page = container_of(item, struct binder_lru_page, lru); struct binder_alloc *alloc; uintptr_t page_addr; size_t index; struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) goto err_get_alloc_mutex_failed; if (!page->page_ptr) goto err_page_already_freed; index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; mm = alloc->vma_vm_mm; /* Same as mmget_not_zero() in later kernel versions */ if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) goto err_mmget; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; vma = alloc->vma; list_lru_isolate(lru, item); spin_unlock(lock); if (vma) { trace_binder_unmap_user_start(alloc, index); zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); trace_binder_unmap_user_end(alloc, index); } up_write(&mm->mmap_sem); mmput(mm); 
trace_binder_unmap_kernel_start(alloc, index); unmap_kernel_range(page_addr, PAGE_SIZE); __free_page(page->page_ptr); page->page_ptr = NULL; trace_binder_unmap_kernel_end(alloc, index); spin_lock(lock); mutex_unlock(&alloc->mutex); return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: mmput_async(mm); err_mmget: err_page_already_freed: mutex_unlock(&alloc->mutex); err_get_alloc_mutex_failed: return LRU_SKIP; } 2 static unsigned long binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long ret = list_lru_count(&binder_alloc_lru); return ret; } static unsigned long binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { unsigned long ret; ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, NULL, sc->nr_to_scan); return ret; } static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, }; /** * binder_alloc_init() - called by binder_open() for per-proc initialization * @alloc: binder_alloc for this proc * * Called from binder_open() to initialize binder_alloc fields for * new binder proc 41 */ void binder_alloc_init(struct binder_alloc *alloc) { alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); } int binder_alloc_shrinker_init(void) { int ret = list_lru_init(&binder_alloc_lru); if (ret == 0) { ret = register_shrinker(&binder_shrinker); if (ret) list_lru_destroy(&binder_alloc_lru); } return ret; }
#include <linux/sched.h> #include <linux/errno.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/pid.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include "../mount.h" #include "internal.h" #include "fd.h" static int seq_show(struct seq_file *m, void *v) { struct files_struct *files = NULL; 27 int f_flags = 0, ret = -ENOENT; struct file *file = NULL; struct task_struct *task; 27 task = get_proc_task(m->private); if (!task) return -ENOENT; 26 files = get_files_struct(task); 26 put_task_struct(task); 26 if (files) { 25 int fd = proc_fd(m->private); spin_lock(&files->file_lock); 25 file = fcheck_files(files, fd); if (file) { 24 struct fdtable *fdt = files_fdtable(files); f_flags = file->f_flags; if (close_on_exec(fd, fdt)) 1 f_flags |= O_CLOEXEC; 24 get_file(file); ret = 0; } 1 spin_unlock(&files->file_lock); put_files_struct(files); } if (ret) return ret; seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id); show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; 24 if (file->f_op->show_fdinfo) 19 file->f_op->show_fdinfo(m, file); out: 24 fput(file); return 0; } static int seq_fdinfo_open(struct inode *inode, struct file *file) { 6 return single_open(file, seq_show, inode); } static const struct file_operations proc_fdinfo_file_operations = { .open = seq_fdinfo_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct files_struct *files; struct task_struct *task; const struct cred *cred; struct inode *inode; int fd; 71 if (flags & LOOKUP_RCU) return -ECHILD; 71 inode = d_inode(dentry); task = get_proc_task(inode); 71 fd = proc_fd(inode); if (task) { files = get_files_struct(task); if (files) { struct file *file; 71 
rcu_read_lock(); 71 file = fcheck_files(files, fd); if (file) { unsigned f_mode = file->f_mode; 62 rcu_read_unlock(); put_files_struct(files); 62 if (task_dumpable(task)) { 61 rcu_read_lock(); 61 cred = __task_cred(task); 61 inode->i_uid = cred->euid; inode->i_gid = cred->egid; 61 rcu_read_unlock(); } else { 1 inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; } 62 if (S_ISLNK(inode->i_mode)) { unsigned i_mode = S_IFLNK; 50 if (f_mode & FMODE_READ) i_mode |= S_IRUSR | S_IXUSR; 50 if (f_mode & FMODE_WRITE) 30 i_mode |= S_IWUSR | S_IXUSR; 50 inode->i_mode = i_mode; } 62 security_task_to_inode(task, inode); 62 put_task_struct(task); return 1; } 12 rcu_read_unlock(); put_files_struct(files); } 12 put_task_struct(task); } 71 return 0; } static const struct dentry_operations tid_fd_dentry_operations = { .d_revalidate = tid_fd_revalidate, .d_delete = pid_delete_dentry, }; static int proc_fd_link(struct dentry *dentry, struct path *path) { struct files_struct *files = NULL; struct task_struct *task; int ret = -ENOENT; 47 task = get_proc_task(d_inode(dentry)); if (task) { 47 files = get_files_struct(task); 47 put_task_struct(task); } 47 if (files) { 47 int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); 47 fd_file = fcheck_files(files, fd); if (fd_file) { *path = fd_file->f_path; path_get(&fd_file->f_path); ret = 0; } 47 spin_unlock(&files->file_lock); put_files_struct(files); } 47 return ret; } static int proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { 56 unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; ei = PROC_I(inode); 56 ei->fd = fd; inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying 
before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) 56 return 0; out: return -ENOENT; } static struct dentry *proc_lookupfd_common(struct inode *dir, struct dentry *dentry, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; 70 unsigned fd = name_to_int(&dentry->d_name); 70 if (!task) goto out_no_task; if (fd == ~0U) goto out; 65 result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); out: 68 put_task_struct(task); out_no_task: 70 return ERR_PTR(result); } static int proc_readfd_common(struct file *file, struct dir_context *ctx, instantiate_t instantiate) { 8 struct task_struct *p = get_proc_task(file_inode(file)); struct files_struct *files; unsigned int fd; if (!p) return -ENOENT; 7 if (!dir_emit_dots(file, ctx)) goto out; 6 files = get_files_struct(p); if (!files) goto out; 5 rcu_read_lock(); 5 for (fd = ctx->pos - 2; 5 fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { char name[PROC_NUMBUF]; int len; 5 if (!fcheck_files(files, fd)) 5 continue; 5 rcu_read_unlock(); len = snprintf(name, sizeof(name), "%d", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, (void *)(unsigned long)fd)) 3 goto out_fd_loop; 5 rcu_read_lock(); } 3 rcu_read_unlock(); out_fd_loop: 5 put_files_struct(files); out: 7 put_task_struct(p); 8 return 0; } static int proc_readfd(struct file *file, struct dir_context *ctx) { 4 return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, .iterate = proc_readfd, .llseek = default_llseek, }; static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, unsigned int flags) { 55 return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); } /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). 
*/ int proc_fd_permission(struct inode *inode, int mask) { struct task_struct *p; int rv; 70 rv = generic_permission(inode, mask); 70 if (rv == 0) return rv; rcu_read_lock(); p = pid_task(proc_pid(inode), PIDTYPE_PID); if (p && same_thread_group(p, current)) rv = 0; rcu_read_unlock(); return rv; } const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, .setattr = proc_setattr, }; static int proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { 14 unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; ei = PROC_I(inode); 14 ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) 14 return 0; out: return -ENOENT; } static struct dentry * proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) { 15 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } static int proc_readfdinfo(struct file *file, struct dir_context *ctx) { 4 return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); } const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, .iterate = proc_readfdinfo, .llseek = default_llseek, };
#include <linux/sysctl.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/xfrm.h> static void __net_init __xfrm_sysctl_init(struct net *net) { 30 net->xfrm.sysctl_aevent_etime = XFRM_AE_ETIME; net->xfrm.sysctl_aevent_rseqth = XFRM_AE_SEQT_SIZE; net->xfrm.sysctl_larval_drop = 1; net->xfrm.sysctl_acq_expires = 30; } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm_table[] = { { .procname = "xfrm_aevent_etime", .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "xfrm_aevent_rseqth", .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "xfrm_larval_drop", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "xfrm_acq_expires", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, {} }; int __net_init xfrm_sysctl_init(struct net *net) { struct ctl_table *table; __xfrm_sysctl_init(net); table = kmemdup(xfrm_table, sizeof(xfrm_table), GFP_KERNEL); if (!table) goto out_kmemdup; 30 table[0].data = &net->xfrm.sysctl_aevent_etime; table[1].data = &net->xfrm.sysctl_aevent_rseqth; table[2].data = &net->xfrm.sysctl_larval_drop; table[3].data = &net->xfrm.sysctl_acq_expires; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) 30 table[0].procname = NULL; 30 net->xfrm.sysctl_hdr = register_net_sysctl(net, "net/core", table); if (!net->xfrm.sysctl_hdr) goto out_register; 30 return 0; out_register: kfree(table); out_kmemdup: return -ENOMEM; } void __net_exit xfrm_sysctl_fini(struct net *net) { struct ctl_table *table; table = net->xfrm.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->xfrm.sysctl_hdr); kfree(table); } #else int __net_init xfrm_sysctl_init(struct net *net) { __xfrm_sysctl_init(net); return 0; } #endif
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include "internal.h"

/* Translate per-mount MNT_* flags into the ST_* bits reported in f_flags. */
static int flags_by_mnt(int mnt_flags)
{
	int flags = 0;

	if (mnt_flags & MNT_READONLY)
		flags |= ST_RDONLY;
	if (mnt_flags & MNT_NOSUID)
		flags |= ST_NOSUID;
	if (mnt_flags & MNT_NODEV)
		flags |= ST_NODEV;
	if (mnt_flags & MNT_NOEXEC)
		flags |= ST_NOEXEC;
	if (mnt_flags & MNT_NOATIME)
		flags |= ST_NOATIME;
	if (mnt_flags & MNT_NODIRATIME)
		flags |= ST_NODIRATIME;
	if (mnt_flags & MNT_RELATIME)
		flags |= ST_RELATIME;
	return flags;
}

/* Translate superblock MS_* flags into the ST_* bits reported in f_flags. */
static int flags_by_sb(int s_flags)
{
	int flags = 0;

	if (s_flags & MS_SYNCHRONOUS)
		flags |= ST_SYNCHRONOUS;
	if (s_flags & MS_MANDLOCK)
		flags |= ST_MANDLOCK;
	return flags;
}

/* f_flags for @mnt: ST_VALID plus the mount and superblock derived bits. */
static int calculate_f_flags(struct vfsmount *mnt)
{
	return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
		flags_by_sb(mnt->mnt_sb->s_flags);
}

/*
 * Fill @buf through the filesystem's ->statfs method.
 *
 * Returns -ENOSYS when the filesystem has no ->statfs, the error from
 * security_sb_statfs() when that hook refuses, otherwise whatever ->statfs
 * returns.  @buf is zeroed before the hook and the method run.
 */
static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
{
	int retval;

	if (!dentry->d_sb->s_op->statfs)
		return -ENOSYS;

	memset(buf, 0, sizeof(*buf));
	retval = security_sb_statfs(dentry);
	if (retval)
		return retval;
	retval = dentry->d_sb->s_op->statfs(dentry, buf);
	/* A filesystem that left f_frsize unset gets f_bsize as fallback. */
	if (retval == 0 && buf->f_frsize == 0)
		buf->f_frsize = buf->f_bsize;
	return retval;
}

/* statfs for @path; on success f_flags is derived from the path's mount. */
int vfs_statfs(struct path *path, struct kstatfs *buf)
{
	int error;

	error = statfs_by_dentry(path->dentry, buf);
	if (!error)
		buf->f_flags = calculate_f_flags(path->mnt);
	return error;
}
EXPORT_SYMBOL(vfs_statfs);

/*
 * statfs for a user-space path name.  When retry_estale() asks for it, the
 * lookup is repeated with LOOKUP_REVAL added.
 */
int user_statfs(const char __user *pathname, struct kstatfs *st)
{
	struct path path;
	int error;
	unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
retry:
	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
	if (!error) {
		error = vfs_statfs(&path, st);
		path_put(&path);
		if (retry_estale(error, lookup_flags)) {
			lookup_flags |= LOOKUP_REVAL;
			goto retry;
		}
	}
	return error;
}

/* statfs for an open descriptor; -EBADF when @fd yields no file. */
int fd_statfs(int fd, struct kstatfs *st)
{
	struct fd f = fdget_raw(fd);
	int error = -EBADF;

	if (f.file) {
		error = vfs_statfs(&f.file->f_path, st);
		fdput(f);
	}
	return error;
}

/*
 * Convert @st to the native struct statfs and copy it to user space.
 *
 * Returns -EOVERFLOW when a value does not fit the 32-bit fields of a
 * narrow struct statfs, -EFAULT when the copy fails, 0 otherwise.
 */
static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
{
	struct statfs buf;

	if (sizeof(buf) == sizeof(*st))
		memcpy(&buf, st, sizeof(*st));
	else {
		/*
		 * Clear the whole structure first: assigning the fields one
		 * by one would leave padding, and any member not listed
		 * below, holding stale stack bytes that copy_to_user() then
		 * hands to user space.  This also covers f_spare.
		 */
		memset(&buf, 0, sizeof(buf));
		if (sizeof buf.f_blocks == 4) {
			if ((st->f_blocks | st->f_bfree | st->f_bavail |
			     st->f_bsize | st->f_frsize) &
			    0xffffffff00000000ULL)
				return -EOVERFLOW;
			/*
			 * f_files and f_ffree may be -1; it's okay to stuff
			 * that into 32 bits
			 */
			if (st->f_files != -1 &&
			    (st->f_files & 0xffffffff00000000ULL))
				return -EOVERFLOW;
			if (st->f_ffree != -1 &&
			    (st->f_ffree & 0xffffffff00000000ULL))
				return -EOVERFLOW;
		}

		buf.f_type = st->f_type;
		buf.f_bsize = st->f_bsize;
		buf.f_blocks = st->f_blocks;
		buf.f_bfree = st->f_bfree;
		buf.f_bavail = st->f_bavail;
		buf.f_files = st->f_files;
		buf.f_ffree = st->f_ffree;
		buf.f_fsid = st->f_fsid;
		buf.f_namelen = st->f_namelen;
		buf.f_frsize = st->f_frsize;
		buf.f_flags = st->f_flags;
	}
	if (copy_to_user(p, &buf, sizeof(buf)))
		return -EFAULT;
	return 0;
}

/*
 * Convert @st to struct statfs64 and copy it to user space.
 * Returns -EFAULT when the copy fails, 0 otherwise.
 */
static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
{
	struct statfs64 buf;

	if (sizeof(buf) == sizeof(*st))
		memcpy(&buf, st, sizeof(*st));
	else {
		/* Same reasoning as do_statfs_native(): no stale stack bytes. */
		memset(&buf, 0, sizeof(buf));
		buf.f_type = st->f_type;
		buf.f_bsize = st->f_bsize;
		buf.f_blocks = st->f_blocks;
		buf.f_bfree = st->f_bfree;
		buf.f_bavail = st->f_bavail;
		buf.f_files = st->f_files;
		buf.f_ffree = st->f_ffree;
		buf.f_fsid = st->f_fsid;
		buf.f_namelen = st->f_namelen;
		buf.f_frsize = st->f_frsize;
		buf.f_flags = st->f_flags;
	}
	if (copy_to_user(p, &buf, sizeof(buf)))
		return -EFAULT;
	return 0;
}

SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
{
	struct kstatfs st;
	int error = user_statfs(pathname, &st);

	if (!error)
		error = do_statfs_native(&st, buf);
	return error;
}

SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
{
	struct kstatfs st;
	int error;

	/* The caller states the structure size it expects; it must match. */
	if (sz != sizeof(*buf))
		return -EINVAL;
	error = user_statfs(pathname, &st);
	if (!error)
		error = do_statfs64(&st, buf);
	return error;
}

SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
{
	struct kstatfs st;
	int error = fd_statfs(fd, &st);

	if (!error)
		error = do_statfs_native(&st, buf);
	return error;
}

SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
{
	struct kstatfs st;
	int error;

	/* The caller states the structure size it expects; it must match. */
	if (sz != sizeof(*buf))
		return -EINVAL;

	error = fd_statfs(fd, &st);
	if (!error)
		error = do_statfs64(&st, buf);
	return error;
}

/*
 * statfs for the superblock found by user_get_super(@dev), queried through
 * its root dentry.  Returns -EINVAL when no superblock is found.
 */
int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
{
	struct super_block *s = user_get_super(dev);
	int err;

	if (!s)
		return -EINVAL;

	err = statfs_by_dentry(s->s_root, sbuf);
	drop_super(s);
	return err;
}

SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
{
	struct ustat tmp;
	struct kstatfs sbuf;
	int err = vfs_ustat(new_decode_dev(dev), &sbuf);

	if (err)
		return err;

	/* Only the free block and free inode counts are reported. */
	memset(&tmp, 0, sizeof(struct ustat));
	tmp.f_tfree = sbuf.f_bfree;
	tmp.f_tinode = sbuf.f_ffree;

	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
}
#ifndef _LINUX_MM_TYPES_H #define _LINUX_MM_TYPES_H #include <linux/auxvec.h> #include <linux/types.h> #include <linux/threads.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/uprobes.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <asm/page.h> #include <asm/mmu.h> #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) struct address_space; struct mem_cgroup; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * * The objects in struct page are organized in double word blocks in * order to allows us to use atomic double word operations on portions * of struct page. That is currently only used by slub but the arrangement * allows the use of atomic double word operations on the flags/mapping * and lru list pointers also. */ struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and * it points to anon_vma object: * see PAGE_MAPPING_ANON below. */ void *s_mem; /* slab first object */ }; /* Second double word */ struct { union { pgoff_t index; /* Our offset within mapping. 
*/ void *freelist; /* sl[aou]b first free object */ }; union { #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) /* Used for cmpxchg_double in slub */ unsigned long counters; #else /* * Keep _count separate from slub cmpxchg_double data. * As the rest of the double word is protected by * slab_lock but _count is not. */ unsigned counters; #endif struct { union { /* * Count of ptes mapped in * mms, to show when page is * mapped & limit reverse map * searches. * * Used also for tail pages * refcounting instead of * _count. Tail pages cannot * be mapped and keeping the * tail page _count zero at * all times guarantees * get_page_unless_zero() will * never succeed on tail * pages. */ atomic_t _mapcount; struct { /* SLUB */ unsigned inuse:16; unsigned objects:15; unsigned frozen:1; }; int units; /* SLOB */ }; atomic_t _count; /* Usage count, see below. */ }; unsigned int active; /* SLAB */ }; }; /* * Third double word block * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! * Can be used as a generic list * by the page owner. */ struct { /* slub per cpu partial pages */ struct page *next; /* Next partial slab */ #ifdef CONFIG_64BIT int pages; /* Nr of partial slabs left */ int pobjects; /* Approximate # of objects */ #else short int pages; short int pobjects; #endif }; struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ /* Tail pages of compound page */ struct { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ #ifdef CONFIG_64BIT /* * On 64 bit system we have enough space in struct page * to encode compound_dtor and compound_order with * unsigned int. It can help compiler generate better or * smaller code on some archtectures. 
*/ unsigned int compound_dtor; unsigned int compound_order; #else unsigned short int compound_dtor; unsigned short int compound_order; #endif }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS struct { unsigned long __pad; /* do not overlay pmd_huge_pte * with compound_head to avoid * possible bit 0 collision. */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ }; #endif }; /* Remainder is not double word aligned */ union { unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache; * indicates order in the buddy * system if PG_buddy is set. */ #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; #endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ }; #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with * highmem some memory is mapped into kernel virtual memory * dynamically, so we need a place to store that address. * Note that this field could be 16 bits on x86 ... ;) * * Architectures with slow multiplication can define * WANT_PAGE_VIRTUAL in asm/page.h */ #if defined(WANT_PAGE_VIRTUAL) void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ #ifdef CONFIG_KMEMCHECK /* * kmemcheck wants to track the status of each byte in a page; this * is a pointer to such a status block. NULL if not tracked. */ void *shadow; #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif } /* * The struct page can be forced to be double word aligned so that atomic ops * on double words work. The SLUB allocator can make use of such a feature. 
*/ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE __aligned(2 * sizeof(unsigned long)) #endif ; struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) __u32 offset; __u32 size; #else __u16 offset; __u16 size; #endif }; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) __u16 offset; __u16 size; #else __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line * containing page->_count every time we allocate a fragment. */ unsigned int pagecnt_bias; bool pfmemalloc; }; typedef unsigned long vm_flags_t; /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that * map parts of them. */ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ vm_flags_t vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; #ifdef CONFIG_USERFAULTFD #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) struct vm_userfaultfd_ctx { struct userfaultfd_ctx *ctx; }; #else /* CONFIG_USERFAULTFD */ #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. 
A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ unsigned long vm_start; /* Our start address within vm_mm. */ unsigned long vm_end; /* The first byte after our end address within vm_mm. */ /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; struct rb_node vm_rb; /* * Largest free memory gap in bytes to the left of this VMA. * Either between this VMA and vma->vm_prev, or between one of the * VMAs below us in the VMA rbtree and its ->vm_prev. This helps * get_unmapped_area find a free area of the right size. */ unsigned long rb_subtree_gap; /* Second cache line starts here. */ struct mm_struct *vm_mm; /* The address space we belong to. */ pgprot_t vm_page_prot; /* Access permissions of this VMA. */ unsigned long vm_flags; /* Flags, see mm.h. */ /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * * For private anonymous mappings, a pointer to a null terminated string * in the user process containing the name given to the vma, or NULL * if unnamed. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; const char __user *anon_name; }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ struct list_head anon_vma_chain; /* Serialized by mmap_sem & * page_table_lock */ struct anon_vma *anon_vma; /* Serialized by page_table_lock */ /* Function pointers to deal with this struct. 
*/ const struct vm_operations_struct *vm_ops; /* Information about our backing store: */ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS }; #if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) #define SPLIT_RSS_COUNTING /* per-thread cached information, */ struct task_rss_stat { int events; /* for synchronization threshold */ int count[NR_MM_COUNTERS]; }; #endif /* USE_SPLIT_PTE_PTLOCKS */ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; struct kioctx_table; struct mm_struct { struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? 
*/ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ struct rw_semaphore mmap_sem; struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung * together off init_mm.mmlist, and are protected * by mmlist_lock */ unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long pinned_vm; /* Refcount permanently increased */ unsigned long shared_vm; /* Shared pages (files) */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ /* * Special counters, in some configurations protected by the * page_table_lock, in other configurations by being atomic. */ struct mm_rss_stat rss_stat; struct linux_binfmt *binfmt; cpumask_var_t cpu_vm_mask_var; /* Architecture-specific MM context */ mm_context_t context; unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. 
All of the following must be true in * order for it to be changed: * * current == mm->owner * current->mm != mm * new_owner->mm == mm * new_owner->alloc_lock is held */ struct task_struct __rcu *owner; #endif struct user_namespace *user_ns; /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; #endif #ifdef CONFIG_NUMA_BALANCING /* * numa_next_scan is the next time that the PTEs will be marked * pte_numa. NUMA hinting faults will gather statistics and migrate * pages to new nodes if necessary. */ unsigned long numa_next_scan; /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; #endif #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * An operation with batched TLB flushing is going on. Anything that * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ bool tlb_flush_pending; #endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_X86_INTEL_MPX /* address of the bounds directory */ void __user *bd_addr; #endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK mm->cpu_vm_mask_var = &mm->cpumask_allocation; #endif cpumask_clear(mm->cpu_vm_mask_var); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { return mm->cpu_vm_mask_var; } #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * Memory barriers to keep this state in sync are graciously provided by * the page table locks, outside of which no page table modifications happen. * The barriers below prevent the compiler from re-ordering the instructions * around the memory barriers that are already present in the code. */ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { 1 barrier(); return mm->tlb_flush_pending; } static inline void set_tlb_flush_pending(struct mm_struct *mm) { mm->tlb_flush_pending = true; /* * Guarantee that the tlb_flush_pending store does not leak into the * critical section updating the page tables */ smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void clear_tlb_flush_pending(struct mm_struct *mm) { 25 barrier(); mm->tlb_flush_pending = false; } #else static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { return false; } static inline void set_tlb_flush_pending(struct mm_struct *mm) { } static inline void clear_tlb_flush_pending(struct mm_struct *mm) { } #endif struct vm_special_mapping { const char *name; struct page **pages; }; enum tlb_flush_reason { TLB_FLUSH_ON_TASK_SWITCH, TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, NR_TLB_FLUSH_REASONS, }; /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. */ typedef struct { unsigned long val; } swp_entry_t; /* Return the name for an anonymous mapping or NULL for a file-backed mapping */ static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma) { 84 if (vma->vm_file) return NULL; 104 return vma->anon_name; } #endif /* _LINUX_MM_TYPES_H */
/* * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules * * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2. * * Authors * Thomas Graf <tgraf@suug.ch> * Ville Nuorvala <vnuorval@tcs.hut.fi> */ #include <linux/netdevice.h> #include <linux/export.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/netlink.h> struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; u8 tclass; }; struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { 688 struct fib_lookup_arg arg = { .lookup_ptr = lookup, .flags = FIB_LOOKUP_NOREF, }; fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); 688 if (arg.result) return arg.result; 65 dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; 686 struct net *net = rule->fr_net; 686 pol_lookup_t lookup = arg->lookup_ptr; int err = 0; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } table = fib6_get_table(net, rule->table); if (!table) { err = -EAGAIN; goto out; } rt = lookup(net, table, flp6, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; /* * If we need to find a source address for this traffic, * 
we check the result if it meets requirement of the rule. */ 651 if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; if (ipv6_dev_get_saddr(net, ip6_dst_idev(&rt->dst)->dev, &flp6->daddr, rt6_flags2srcprefs(flags), &saddr)) goto again; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) goto again; flp6->saddr = saddr; } 651 err = rt->dst.error; if (err != -EAGAIN) goto out; } again: 281 ip6_rt_put(rt); err = -EAGAIN; rt = NULL; goto out; discard_pkt: dst_hold(&rt->dst); out: 686 arg->result = rt; return err; } static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { 650 struct rt6_info *rt = (struct rt6_info *) arg->result; struct net_device *dev = NULL; if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ 650 if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ 650 if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: ip6_rt_put(rt); 650 return true; } static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; 686 if (r->dst.plen && 686 !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* * If FIB_RULE_FIND_SADDR is set and we do not have a * source address for the traffic, we defer check for * source address. 
*/ 686 if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } 686 if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; return 1; } static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, }; static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb) { int err = -EINVAL; 2 struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; 4 if (rule->action == FR_ACT_TO_TBL) { 3 if (rule->table == RT6_TABLE_UNSPEC) goto errout; if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; goto errout; } } 3 if (frh->src_len) rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]); 3 if (frh->dst_len) rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]); 3 rule6->src.plen = frh->src_len; rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; err = 0; errout: 4 return err; } static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; 3 if (frh->src_len && (rule6->src.plen != frh->src_len)) 3 return 0; 3 if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; 3 if (frh->tos && (rule6->tclass != frh->tos)) return 0; 1 if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; 1 if (frh->dst_len && nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) return 0; return 1; } static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; 7 frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; frh->tos = rule6->tclass; if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || 7 (rule6->src.plen && nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr))) goto 
nla_put_failure; 7 return 0; nla_put_failure: return -ENOBUFS; } static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ 4 + nla_total_size(16); /* src */ } static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, .owner = THIS_MODULE, .fro_net = &init_net, }; static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; int err = -ENOMEM; 28 ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); 28 err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0); if (err) goto out_fib6_rules_ops; 28 err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0); if (err) goto out_fib6_rules_ops; 28 net->ipv6.fib6_rules_ops = ops; out: return err; out_fib6_rules_ops: fib_rules_unregister(ops); goto out; } static void __net_exit fib6_rules_net_exit(struct net *net) { rtnl_lock(); fib_rules_unregister(net->ipv6.fib6_rules_ops); rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit = fib6_rules_net_exit, }; int __init fib6_rules_init(void) { return register_pernet_subsys(&fib6_rules_net_ops); } void fib6_rules_cleanup(void) { unregister_pernet_subsys(&fib6_rules_net_ops); }
/* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include <linux/math64.h> EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ struct kmem_cache *request_cachep = NULL; /* * For queue allocation */ struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; static void blk_clear_congested(struct request_list *rl, int sync) { #ifdef CONFIG_CGROUP_WRITEBACK clear_wb_congested(rl->blkg->wb_congested, sync); #else /* * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't * flip its congestion state for events on other blkcgs. 
*/ 162 if (rl == &rl->q->root_rl) 162 clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); #endif } static void blk_set_congested(struct request_list *rl, int sync) { #ifdef CONFIG_CGROUP_WRITEBACK set_wb_congested(rl->blkg->wb_congested, sync); #else /* see blk_clear_congested() */ 1 if (rl == &rl->q->root_rl) 1 set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); #endif } void blk_queue_congestion_threshold(struct request_queue *q) { int nr; 23 nr = q->nr_requests - (q->nr_requests / 8) + 1; if (nr > q->nr_requests) nr = q->nr_requests; 23 q->nr_congestion_on = nr; nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; if (nr < 1) nr = 1; q->nr_congestion_off = nr; } /** * blk_get_backing_dev_info - get the address of a queue's backing_dev_info * @bdev: device * * Locates the passed device's request queue and returns the address of its * backing_dev_info. This function can only be called if @bdev is opened * and the return value is never NULL. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { 478 struct request_queue *q = bdev_get_queue(bdev); return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); void blk_rq_init(struct request_queue *q, struct request *rq) { 750 memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { 1 if (error) 1 bio->bi_error = error; 1 if (unlikely(rq->cmd_flags & REQ_QUIET)) bio_set_flag(bio, BIO_QUIET); 1 bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ 1 if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & 
REQ_FLUSH_SEQ)) 1 bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); for (bit = 0; bit < BLK_MAX_CDB; bit++) printk("%02x ", rq->cmd[bit]); printk("\n"); } } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_delay_work(struct work_struct *work) { struct request_queue *q; q = container_of(work, struct request_queue, delay_work.work); spin_lock_irq(q->queue_lock); __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } /** * blk_delay_queue - restart queueing after defined interval * @q: The &struct request_queue in question * @msecs: Delay in msecs * * Description: * Sometimes queueing needs to be postponed for a little while, to allow * resources to come back. This function will make sure that queueing is * restarted around the specified time. Queue lock must be held. */ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { if (likely(!blk_queue_dead(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_delay_queue); /** * blk_start_queue_async - asynchronously restart a previously stopped queue * @q: The &struct request_queue in question * * Description: * blk_start_queue_async() will clear the stop flag on the queue, and * ensure that the request_fn for the queue is run from an async * context. 
**/ void blk_start_queue_async(struct request_queue *q) { queue_flag_clear(QUEUE_FLAG_STOPPED, q); blk_run_queue_async(q); } EXPORT_SYMBOL(blk_start_queue_async); /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * * Description: * blk_start_queue() will clear the stop flag on the queue, and call * the request_fn for the queue if it was in a stopped state when * entered. Also see blk_stop_queue(). Queue lock must be held. **/ void blk_start_queue(struct request_queue *q) { WARN_ON(!in_interrupt() && !irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); /** * blk_stop_queue - stop a queue * @q: The &struct request_queue in question * * Description: * The Linux block layer assumes that a block driver will consume all * entries on the request queue when the request_fn strategy is called. * Often this will not happen, because of hardware limitations (queue * depth settings). If a device driver gets a 'queue full' response, * or if it simply chooses not to queue more I/O at one point, it can * call this function to prevent the request_fn from being called until * the driver has signalled it's ready to go again. This happens by calling * blk_start_queue() to restart queue operations. Queue lock must be held. **/ void blk_stop_queue(struct request_queue *q) { cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. 
The caller must already have made sure * that its ->make_request_fn will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ 32 void blk_sync_queue(struct request_queue *q) { 32 del_timer_sync(&q->timeout); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; int i; 32 queue_for_each_hw_ctx(q, hctx, i) { cancel_delayed_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { cancel_delayed_work_sync(&q->delay_work); } 32 } EXPORT_SYMBOL(blk_sync_queue); /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run * * Description: * Invoke request handling on a queue if there are any pending requests. * May be used to restart request handling after a request has completed. * This variant runs the queue whether or not the queue has been * stopped. Must be called with the queue lock held and interrupts * disabled. See also @blk_run_queue. */ inline void __blk_run_queue_uncond(struct request_queue *q) { 650 if (unlikely(blk_queue_dead(q))) return; /* * Some request_fn implementations, e.g. scsi_request_fn(), unlock * the queue lock internally. As a result multiple threads may be * running such a request function concurrently. Keep track of the * number of active request_fn invocations such that blk_drain_queue() * can wait until all these request_fn calls have finished. */ 650 q->request_fn_active++; q->request_fn(q); 650 q->request_fn_active--; } EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue * @q: The queue to run * * Description: * See @blk_run_queue. This variant must be called with the queue lock * held and interrupts disabled. 
*/ void __blk_run_queue(struct request_queue *q) { 650 if (unlikely(blk_queue_stopped(q))) return; 650 __blk_run_queue_uncond(q); } EXPORT_SYMBOL(__blk_run_queue); /** * blk_run_queue_async - run a single device queue in workqueue context * @q: The queue to run * * Description: * Tells kblockd to perform the equivalent of @blk_run_queue on behalf * of us. The caller must hold the queue lock. */ void blk_run_queue_async(struct request_queue *q) { 433 if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) 433 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); 433 } EXPORT_SYMBOL(blk_run_queue_async); /** * blk_run_queue - run a single device queue * @q: The queue to run * * Description: * Invoke request handling on this queue, if it has pending work to do. * May be used to restart queueing when a request has completed. */ void blk_run_queue(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); void blk_put_queue(struct request_queue *q) { 32 kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); /** * __blk_drain_queue - drain requests from request_queue * @q: queue to drain * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV * * Drain requests from @q. If @drain_all is set, all requests are drained. * If not, only ELVPRIV requests are drained. The caller is responsible * for ensuring that no new requests which need to be drained are queued. */ static void __blk_drain_queue(struct request_queue *q, bool drain_all) __releases(q->queue_lock) __acquires(q->queue_lock) { int i; lockdep_assert_held(q->queue_lock); while (true) { bool drain = false; /* * The caller might be trying to drain @q before its * elevator is initialized. 
*/ if (q->elevator) elv_drain_elevator(q); blkcg_drain_queue(q); /* * This function might be called on a queue which failed * driver init after queue creation or is not yet fully * active yet. Some drivers (e.g. fd and loop) get unhappy * in such cases. Kick queue iff dispatch queue has * something on it and @q has request_fn set. */ if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); drain |= q->nr_rqs_elvpriv; drain |= q->request_fn_active; /* * Unfortunately, requests are queued at and tracked from * multiple places and there's no single counter which can * be drained. Check all the queues and counters. */ if (drain_all) { struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; if (fq) drain |= !list_empty(&fq->flush_queue[i]); } } if (!drain) break; spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); } /* * With queue marked dead, any woken up waiter will fail the * allocation path, so the wakeup chaining is lost and we're * left with hung waiters. We need to wake up those waiters. */ if (q->request_fn) { struct request_list *rl; blk_queue_for_each_rl(rl, q) for (i = 0; i < ARRAY_SIZE(rl->wait); i++) wake_up_all(&rl->wait[i]); } } /** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest * * In bypass mode, only the dispatch FIFO queue of @q is used. This * function makes @q enter bypass mode and drains all requests which were * throttled or issued before. On return, it's guaranteed that no request * is being throttled or has ELVPRIV set and blk_queue_bypass() %true * inside queue or RCU read lock. */ void blk_queue_bypass_start(struct request_queue *q) { spin_lock_irq(q->queue_lock); q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); spin_unlock_irq(q->queue_lock); /* * Queues start drained. Skip actual draining till init is * complete. 
This avoids lenghty delays during queue init which * can happen many times during boot. */ if (blk_queue_init_done(q)) { spin_lock_irq(q->queue_lock); __blk_drain_queue(q, false); spin_unlock_irq(q->queue_lock); /* ensure blk_queue_bypass() is %true inside RCU read lock */ synchronize_rcu(); } } EXPORT_SYMBOL_GPL(blk_queue_bypass_start); /** * blk_queue_bypass_end - leave queue bypass mode * @q: queue of interest * * Leave bypass mode and restore the normal queueing behavior. */ void blk_queue_bypass_end(struct request_queue *q) { 23 spin_lock_irq(q->queue_lock); if (!--q->bypass_depth) 23 queue_flag_clear(QUEUE_FLAG_BYPASS, q); 23 WARN_ON_ONCE(q->bypass_depth < 0); 23 spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_queue_bypass_end); void blk_set_queue_dying(struct request_queue *q) { 32 spin_lock_irq(q->queue_lock); 32 queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(q->queue_lock); if (q->mq_ops) 32 blk_mq_wake_waiters(q); else { struct request_list *rl; blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up_all(&rl->wait[BLK_RW_SYNC]); wake_up_all(&rl->wait[BLK_RW_ASYNC]); } } } 32 } EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. */ void blk_cleanup_queue(struct request_queue *q) { 32 spinlock_t *lock = q->queue_lock; /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); spin_lock_irq(lock); /* * A dying queue is permanently in bypass mode till released. Note * that, unlike blk_queue_bypass_start(), we aren't performing * synchronize_rcu() after entering bypass mode to avoid the delay * as some drivers create and destroy a lot of queues while * probing. 
This is still safe because blk_release_queue() will be * called only after the queue refcnt drops to zero and nothing, * RCU or not, would be traversing the queue by then. */ q->bypass_depth++; 32 queue_flag_set(QUEUE_FLAG_BYPASS, q); 32 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 32 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 32 queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. */ blk_freeze_queue(q); spin_lock_irq(lock); if (!q->mq_ops) __blk_drain_queue(q, true); 32 queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) 32 blk_mq_free_queue(q); 32 percpu_ref_exit(&q->q_usage_counter); spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; 32 spin_unlock_irq(lock); bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); /* Allocate memory local to the request queue */ static void *alloc_request_struct(gfp_t gfp_mask, void *data) { int nid = (int)(long)data; 694 return kmem_cache_alloc_node(request_cachep, gfp_mask, nid); } static void free_request_struct(void *element, void *unused) { 162 kmem_cache_free(request_cachep, element); } int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { if (unlikely(rl->rq_pool)) return 0; rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, 
alloc_request_struct, free_request_struct, (void *)(long)q->node, gfp_mask, q->node); if (!rl->rq_pool) return -ENOMEM; return 0; } void blk_exit_rl(struct request_list *rl) { 32 if (rl->rq_pool) mempool_destroy(rl->rq_pool); 32 } struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_alloc_queue); int blk_queue_enter(struct request_queue *q, gfp_t gfp) 762 { while (true) { 762 if (percpu_ref_tryget_live(&q->q_usage_counter)) 761 return 0; 7 if (!gfpflags_allow_blocking(gfp)) return -EBUSY; 7 wait_event(q->mq_freeze_wq, !atomic_read(&q->mq_freeze_depth) || blk_queue_dying(q)); 7 if (blk_queue_dying(q)) return -ENODEV; } } void blk_queue_exit(struct request_queue *q) { 761 percpu_ref_put(&q->q_usage_counter); 761 } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; int err; 23 q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); if (!q) return NULL; 23 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) goto fail_q; 23 q->bio_split = bioset_create(BIO_POOL_SIZE, 0); if (!q->bio_split) goto fail_id; 23 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) goto fail_split; 23 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif INIT_DELAYED_WORK(&q->delay_work, 
blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); /* * By default initialize queue_lock to internal lock and driver can * override it later if need be. */ q->queue_lock = &q->__queue_lock; /* * A queue starts its life with bypass turned on to avoid * unnecessary bypass on/off overhead and nasty surprises during * init. The initial bypass will be finished when the queue is * registered by blk_register_queue(). */ q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); init_waitqueue_head(&q->mq_freeze_wq); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ 23 if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; if (blkcg_init_queue(q)) goto fail_ref; return q; fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); fail_split: bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } EXPORT_SYMBOL(blk_alloc_queue_node); /** * blk_init_queue - prepare a request queue for use with a block device * @rfn: The function to be called to process requests that have been * placed on the queue. * @lock: Request queue spin lock * * Description: * If a block device wishes to use the standard request handling procedures, * which sorts requests and coalesces adjacent requests, then it must * call blk_init_queue(). The function @rfn will be called when there * are requests on the queue that need to be processed. If the device * supports plugging, then @rfn may not be called immediately when requests * are available on the queue, but may be called at some time later instead. * Plugged queues are generally unplugged when a buffer belonging to one * of the requests on the queue is needed, or due to memory pressure. 
* * @rfn is not required, or even expected, to remove all requests off the * queue, but only as many as it can handle at a time. If it does leave * requests on the queue, it is responsible for arranging that the requests * get dealt with eventually. * * The queue spin lock must be held while manipulating the requests on the * request queue; this lock will be taken also from interrupt context, so irq * disabling is needed for it. * * Function returns a pointer to the initialized request queue, or %NULL if * it didn't succeed. * * Note: * blk_init_queue() must be paired with a blk_cleanup_queue() call * when the block device is deactivated (such as at module unload). **/ struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { struct request_queue *uninit_q, *q; uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); if (!uninit_q) return NULL; q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) blk_cleanup_queue(uninit_q); return q; } EXPORT_SYMBOL(blk_init_queue_node); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) { if (!q) return NULL; q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); if (!q->fq) return NULL; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; q->queue_flags |= QUEUE_FLAG_DEFAULT; /* Override internal queue lock with supplied lock pointer */ if (lock) q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size */ blk_queue_make_request(q, blk_queue_bio); q->sg_reserved_size = INT_MAX; /* Protect q->elevator from elevator_change */ mutex_lock(&q->sysfs_lock); /* init elevator */ if (elevator_init(q, NULL)) { 
mutex_unlock(&q->sysfs_lock); goto fail; } mutex_unlock(&q->sysfs_lock); return q; fail: blk_free_flush_queue(q->fq); q->fq = NULL; return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); bool blk_get_queue(struct request_queue *q) 23 { 23 if (likely(!blk_queue_dying(q))) { 23 __blk_get_queue(q); return true; } return false; } EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) 162 { 162 if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(rl->q, rq); 162 if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); } 162 mempool_free(rq, rl->rq_pool); } /* * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. */ static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 694 { if (!ioc) return 0; /* * Make sure the process is able to allocate at least 1 request * even if the batch times out, otherwise we could theoretically * lose wakeups. 694 */ return ioc->nr_batch_requests == q->nr_batching || 1 (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } /* * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This * will cause the process to be a "batcher" on all queues in the system. This * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 1 { if (!ioc || ioc_batching(q, ioc)) return; 1 ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } 162 static void __freed_request(struct request_list *rl, int sync) 162 { struct request_queue *q = rl->q; 162 if (rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_congested(rl, sync); 162 162 if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); 162 blk_clear_rl_full(rl, sync); 162 } } /* * A request has just been released. 
Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ static void freed_request(struct request_list *rl, unsigned int flags) 162 { struct request_queue *q = rl->q; int sync = rw_is_sync(flags); q->nr_rqs[sync]--; rl->count[sync]--; 162 if (flags & REQ_ELVPRIV) q->nr_rqs_elvpriv--; 162 __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) 162 __freed_request(rl, sync ^ 1); } int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); on_thresh = queue_congestion_on_threshold(q); off_thresh = queue_congestion_off_threshold(q); blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= on_thresh) blk_set_congested(rl, BLK_RW_SYNC); else if (rl->count[BLK_RW_SYNC] < off_thresh) blk_clear_congested(rl, BLK_RW_SYNC); if (rl->count[BLK_RW_ASYNC] >= on_thresh) blk_set_congested(rl, BLK_RW_ASYNC); else if (rl->count[BLK_RW_ASYNC] < off_thresh) blk_clear_congested(rl, BLK_RW_ASYNC); if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { blk_clear_rl_full(rl, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_ASYNC); } else { blk_clear_rl_full(rl, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } } spin_unlock_irq(q->queue_lock); return 0; } /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. */ static bool blk_rq_should_init_elevator(struct bio *bio) { if (!bio) return true; /* * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. 
694 */ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) return false; return true; } /** * rq_ioc - determine io_context for request allocation * @bio: request being allocated is for this bio (can be %NULL) * * Determine io_context to use for request allocation for @bio. May return * %NULL if %current->io_context doesn't exist. */ static struct io_context *rq_ioc(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio && bio->bi_ioc) return bio->bi_ioc; #endif return current->io_context; } /** * __get_request - get a free request * @rl: request list to allocate from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, int rw_flags, struct bio *bio, gfp_t gfp_mask) 694 { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); 694 may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; 694 1 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. 
1 */ 1 if (!blk_rl_full(rl, is_sync)) { 1 ioc_set_batching(q, ioc); blk_set_rl_full(rl, is_sync); 1 } else { 1 if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating * process is not a "batcher", and not * exempted by the IO scheduler */ return ERR_PTR(-ENOMEM); } } 1 } blk_set_congested(rl, is_sync); } /* * Only allow batching queuers to allocate up to 50% over the defined * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests 694 */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) return ERR_PTR(-ENOMEM); 694 q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0; /* * Decide whether the new request will be managed by elevator. If * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. This guarantees icq's won't be destroyed and * makes creating new ones safe. * * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. 
694 */ 694 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; 694 q->nr_rqs_elvpriv++; 694 if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } 694 694 if (blk_queue_io_stat(q)) 694 rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ rq = mempool_alloc(rl->rq_pool, gfp_mask); if (!rq) goto fail_alloc; 694 blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); rq->cmd_flags = rw_flags | REQ_ALLOCED; /* init elvpriv */ 694 if (rw_flags & REQ_ELVPRIV) { 652 if (unlikely(et->icq_cache && !icq)) { 652 if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); if (!icq) goto fail_elvpriv; } 694 rq->elv.icq = icq; if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) goto fail_elvpriv; /* @rq->elv.icq holds io_context until @rq is freed */ 694 if (icq) get_io_context(icq->ioc); } out: /* * ioc may be NULL here, and ioc_batching will be false. That's * OK, if the queue is under the request limit then requests need * not count toward the nr_batch_requests limit. There will always * be some limit enforced by BLK_BATCH_TIME. 694 */ 1 if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; 694 trace_block_getrq(q, bio, rw_flags & 1); return rq; fail_elvpriv: /* * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed * and may fail indefinitely under memory pressure and thus * shouldn't stall IO. Treat this request as !elvpriv. This will * disturb iosched and blkcg but weird is bettern than dead. */ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", __func__, dev_name(q->backing_dev_info.dev)); rq->cmd_flags &= ~REQ_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); q->nr_rqs_elvpriv--; spin_unlock_irq(q->queue_lock); goto out; fail_alloc: /* * Allocation failed presumably due to memory. Undo anything we * might have messed up. * * Allocating task should really be put onto the front of the wait * queue, but this is pretty rare. 
*/ spin_lock_irq(q->queue_lock); freed_request(rl, rw_flags); /* * in the very unlikely event that allocation failed and no * requests for this direction was pending, mark us starved so that * freeing of a request in the other direction will notice * us. another possible fix would be to split the rq mempool into * READ and WRITE */ rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; return ERR_PTR(-ENOMEM); } /** * get_request - get a free request * @q: request_queue to allocate request from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) 694 { const bool is_sync = rw_is_sync(rw_flags) != 0; DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ 694 retry: 694 rq = __get_request(rl, rw_flags, bio, gfp_mask); if (!IS_ERR(rq)) return rq; if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } /* wait on @rl and retry */ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); trace_block_sleeprq(q, bio, rw_flags & 1); spin_unlock_irq(q->queue_lock); io_schedule(); /* * After sleeping, we become a "batching" process and will be able * to allocate at least one request, and up to a big batch of them * for a small period time. 
See ioc_batching, ioc_set_batching 1 */ ioc_set_batching(q, current->io_context); 1 spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[is_sync], &wait); goto retry; } static struct request *blk_old_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; BUG_ON(rw != READ && rw != WRITE); /* create ioc upfront */ create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ return rq; } struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) return blk_mq_alloc_request(q, rw, gfp_mask, false); else return blk_old_get_request(q, rw, gfp_mask); } EXPORT_SYMBOL(blk_get_request); /** * blk_make_request - given a bio, allocate a corresponding struct request. * @q: target request queue * @bio: The bio describing the memory mappings that will be submitted for IO. * It may be a chained-bio properly constructed by block/bio layer. * @gfp_mask: gfp flags to be used for memory allocation * * blk_make_request is the parallel of generic_make_request for BLOCK_PC * type commands. Where the struct request needs to be farther initialized by * the caller. It is passed a &struct bio, which describes the memory info of * the I/O transfer. * * The caller of blk_make_request must make sure that bi_io_vec * are set to describe the memory buffers. That bio_data_dir() will return * the needed direction of the request. (And all bio's in the passed bio-chain * are properly set accordingly) * * If called under none-sleepable conditions, mapped bio buffers must not * need bouncing, by calling the appropriate masked or flagged allocator, * suitable for the target device. Otherwise the call to blk_queue_bounce will * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be * given to how you allocate bios. 
In particular, you cannot use * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise * you risk waiting for IO completion of a bio that hasn't been submitted yet, * thus resulting in a deadlock. Alternatively bios should be allocated using * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ struct request *blk_make_request(struct request_queue *q, struct bio *bio, gfp_t gfp_mask) { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); if (IS_ERR(rq)) return rq; blk_rq_set_block_pc(rq); for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; blk_queue_bounce(q, &bounce_bio); ret = blk_rq_append_bio(q, rq, bounce_bio); if (unlikely(ret)) { blk_put_request(rq); return ERR_PTR(ret); } } return rq; } EXPORT_SYMBOL(blk_make_request); /** * blk_rq_set_block_pc - initialize a request to type BLOCK_PC * @rq: request to be initialized * */ void blk_rq_set_block_pc(struct request *rq) { rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); } EXPORT_SYMBOL(blk_rq_set_block_pc); /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted * * Description: * Drivers often keep queueing requests until the hardware cannot accept * more, when that condition happens we need to put the request back * on the queue. Must be called with queue lock held. 
*/ void blk_requeue_request(struct request_queue *q, struct request *rq) { blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); elv_requeue_request(q, rq); } EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { int inflight; 761 if (now == part->stamp) return; 729 inflight = part_in_flight(part); 497 if (inflight) { __part_stat_add(cpu, part, time_in_queue, inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 761 } part->stamp = now; } /** * part_round_stats() - Round off the performance stats on a struct disk_stats. * @cpu: cpu number for stats access * @part: target partition * * The average IO queue length and utilisation statistics are maintained * by observing the current state of the queue length and the amount of * time it has been in this state for. * * Normally, that accounting is done on IO completion, but that can result * in more than a second's worth of IO being accounted for within any one * second, leading to >100% utilisation. To deal with that, we call this * function to do a round-off before returning the results when reading * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. 
*/ void part_round_stats(int cpu, struct hd_struct *part) 761 { unsigned long now = jiffies; 694 if (part->partno) 761 part_round_stats_single(cpu, &part_to_disk(part)->part0, now); part_round_stats_single(cpu, part, now); } EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) 162 { if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline void blk_pm_put_request(struct request *rq) {} #endif /* * queue lock must be held 162 */ void __blk_put_request(struct request_queue *q, struct request *req) 162 { if (unlikely(!q)) return; 162 if (q->mq_ops) { blk_mq_free_request(req); return; } 162 blk_pm_put_request(req); 162 elv_completed_request(q, req); /* this is a bio leak */ WARN_ON(req->bio != NULL); /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools 162 */ 162 if (req->cmd_flags & REQ_ALLOCED) { unsigned int flags = req->cmd_flags; struct request_list *rl = blk_rq_rl(req); 162 BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); 162 blk_free_request(rl, req); 162 freed_request(rl, flags); blk_put_rl(rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { struct request_queue *q = req->q; if (q->mq_ops) blk_mq_free_request(req); else { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); __blk_put_request(q, req); spin_unlock_irqrestore(q->queue_lock, flags); } } EXPORT_SYMBOL(blk_put_request); /** * blk_add_request_payload - add a payload to a request * @rq: request to update * @page: page backing the payload * @len: length of the payload. * * This allows to later add a payload to an already submitted request by * a block driver. The driver needs to take care of freeing the payload * itself. * * Note that this is a quite horrible hack and nothing but handling of * discard requests should ever use it. 
*/ void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len) { struct bio *bio = rq->bio; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = len; bio->bi_iter.bi_size = len; bio->bi_vcnt = 1; bio->bi_phys_segments = 1; rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; } EXPORT_SYMBOL_GPL(blk_add_request_payload); bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) 440 { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; if (!ll_back_merge_fn(q, req, bio)) return false; 345 trace_block_bio_backmerge(q, req, bio); 345 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); 345 req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 440 blk_account_io_start(req, false); return true; } bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) 22 { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; if (!ll_front_merge_fn(q, req, bio)) return false; 21 trace_block_bio_frontmerge(q, req, bio); 21 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); 21 bio->bi_next = req->bio; req->bio = bio; req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 22 blk_account_io_start(req, false); return true; } /** * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests * @same_queue_rq: pointer to &struct request that gets filled in when * another request associated with @q is found on the plug list * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. 
Returns %true if merge was successful, * otherwise %false. * * Plugging coalesces IOs from the same issuer for the same purpose without * going through @q->queue_lock. As such it's more of an issuing mechanism * than scheduling, and the request, while may have elvpriv data, is not * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. * * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; bool ret = false; struct list_head *plug_list; 694 plug = current->plug; if (!plug) 652 goto out; *request_count = 0; if (q->mq_ops) plug_list = &plug->mq_list; 652 else plug_list = &plug->list; 652 list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; 524 524 if (rq->q == q) { (*request_count)++; /* * Only blk-mq multiple hardware queues case checks the * rq in the same queue, there should be only one such * rq in a queue **/ if (same_queue_rq) *same_queue_rq = rq; } 524 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; 523 el_ret = blk_try_merge(rq, bio); 410 if (el_ret == ELEVATOR_BACK_MERGE) { ret = bio_attempt_back_merge(q, rq, bio); if (ret) 485 break; 3 } else if (el_ret == ELEVATOR_FRONT_MERGE) { ret = bio_attempt_front_merge(q, rq, bio); if (ret) break; } } 694 out: return ret; } unsigned int blk_plug_queued_count(struct request_queue *q) { struct blk_plug *plug; struct request *rq; struct list_head *plug_list; unsigned int ret = 0; 67 plug = current->plug; if (!plug) goto out; 66 66 if (q->mq_ops) plug_list = &plug->mq_list; else plug_list = &plug->list; 66 63 list_for_each_entry(rq, plug_list, queuelist) { 63 if (rq->q == q) ret++; } 67 out: return ret; } void init_request_from_bio(struct request *req, struct bio *bio) 761 { req->cmd_type = 
REQ_TYPE_FS; req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; if (bio->bi_rw & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; 761 req->errors = 0; req->__sector = bio->bi_iter.bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) 694 { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even * ISA dma in theory) */ blk_queue_bounce(q, &bio); blk_queue_split(q, &bio, q->bio_split); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio->bi_error = -EIO; bio_endio(bio); return BLK_QC_T_NONE; } 379 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; } /* * Check if we can merge with the plugged list before grabbing * any locks. 694 */ 694 if (!blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); 694 spin_lock_irq(q->queue_lock); el_ret = elv_merge(q, &req, bio); 90 if (el_ret == ELEVATOR_BACK_MERGE) { 90 if (bio_attempt_back_merge(q, req, bio)) { elv_bio_merged(q, req, bio); 90 if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; 694 } 19 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 19 if (bio_attempt_front_merge(q, req, bio)) { elv_bio_merged(q, req, bio); 18 if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } get_rq: /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the * rq allocator and io schedulers. 694 */ rw_flags = bio_data_dir(bio); 672 if (sync) rw_flags |= REQ_SYNC; /* * Grab a free request. 
This is might sleep but can not fail. * Returns with the queue unlocked. 694 */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). * We don't worry about that case for efficiency. It won't happen * often, and the elevators are able to handle it. 694 */ init_request_from_bio(req, bio); 694 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); 694 plug = current->plug; if (plug) { /* * If this is the first request added after a plug, fire * of a plug trace. 652 */ 648 if (!request_count) trace_block_plug(q); 494 else { 21 if (request_count >= BLK_MAX_REQUEST_COUNT) { 652 blk_flush_plug_list(plug, false); trace_block_plug(q); } 652 } 694 list_add_tail(&req->queuelist, &plug->list); blk_account_io_start(req, true); 555 } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); __blk_run_queue(q); 557 out_unlock: spin_unlock_irq(q->queue_lock); } return BLK_QC_T_NONE; } /* * If bio->bi_dev is a partition, remap the location */ static inline void blk_partition_remap(struct bio *bio) 762 { struct block_device *bdev = bio->bi_bdev; 763 695 if (bio_sectors(bio) && bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; bio->bi_iter.bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; 695 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, bio->bi_iter.bi_sector - p->start_sect); } } static void handle_bad_sector(struct bio *bio) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); } #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); 
static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Check whether this bio extends beyond the end of the device. */ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) { sector_t maxsector; 763 if (!nr_sectors) return 0; 762 /* Test device or partition size, when known. */ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 762 if (maxsector) { sector_t sector = bio->bi_iter.bi_sector; 762 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { /* * This may well happen - the kernel calls bread() * without checking the size of the device, e.g., when * mounting a device. 
*/ handle_bad_sector(bio); return 1; } } return 0; } static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { 763 struct request_queue *q; int nr_sectors = bio_sectors(bio); int err = -EIO; char b[BDEVNAME_SIZE]; struct hd_struct *part; might_sleep(); 762 if (bio_check_eod(bio, nr_sectors)) goto end_io; 763 q = bdev_get_queue(bio->bi_bdev); if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector); goto end_io; } 763 part = bio->bi_bdev->bd_part; 763 if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, bio->bi_iter.bi_size)) goto end_io; /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. 763 */ blk_partition_remap(bio); 763 if (bio_check_eod(bio, nr_sectors)) goto end_io; /* * Filter flush bio's early so that make_request based * drivers without flush support don't have to worry * about them. 763 */ 1 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); if (!nr_sectors) { err = 0; goto end_io; } } 762 1 if ((bio->bi_rw & REQ_DISCARD) && 1 (!blk_queue_discard(q) || ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { err = -EOPNOTSUPP; goto end_io; } 762 if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { err = -EOPNOTSUPP; goto end_io; } /* * Various block parts want %current->io_context and lazy ioc * allocation ends up trading a lot of pain for a small amount of * memory. Just allocate it upfront. This may fail and block * layer knows how to live with it. 
762 */ create_io_context(GFP_ATOMIC, q->node); if (!blkcg_bio_issue_check(q, bio)) return false; 762 436 trace_block_bio_queue(q, bio); return true; 1 end_io: bio->bi_error = err; 763 bio_endio(bio); return false; } /** * generic_make_request - hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * generic_make_request() is used to make I/O requests of block * devices. It is passed a &struct bio, which describes the I/O that needs * to be done. * * generic_make_request() does not return any status. The * success/failure status of the request, along with notification of * completion, is delivered asynchronously through the bio->bi_end_io * function described (one day) else where. * * The caller of generic_make_request must make sure that bi_io_vec * are set to describe the memory buffer, and that bi_dev and bi_sector are * set to describe the device address, and the * bi_end_io and optionally bi_private are set to describe how * completion notification should be signaled. * * generic_make_request and the drivers it calls may use bi_next if this * bio happens to be merged with someone else, and may resubmit the bio to * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. 762 */ blk_qc_t generic_make_request(struct bio *bio) { /* * bio_list_on_stack[0] contains bios submitted by the current * make_request_fn. * bio_list_on_stack[1] contains bios that were submitted before * the current make_request_fn, but that haven't been processed * yet. */ struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; 763 if (!generic_make_request_checks(bio)) goto out; /* * We only want one ->make_request_fn to be active at a time, else * stack usage with stacked devices could be a problem. So use * current->bio_list to keep a list of requests submited by a * make_request_fn function. 
current->bio_list is also used as a * flag to say if generic_make_request is currently active in this * task or not. If it is NULL, then no make_request is active. If * it is non-NULL, then a make_request is active, and new requests * should be added at the tail 762 */ 139 if (current->bio_list) { bio_list_add(¤t->bio_list[0], bio); goto out; } /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from * bio_list, and call into ->make_request() again. 
762 */ 762 BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; 762 do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { struct bio_list lower, same; 761 /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); blk_queue_exit(q); /* sort new bios into those for a lower level * and those for the same level */ bio_list_init(&lower); 139 bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) 139 if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); 139 /* now assemble so we handle the lowest level first */ 139 bio_list_merge(&bio_list_on_stack[0], &lower); 761 bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { bio_io_error(bio); 761 } bio = bio_list_pop(&bio_list_on_stack[0]); 760 } while (bio); current->bio_list = NULL; /* deactivate */ 762 out: return ret; } EXPORT_SYMBOL(generic_make_request); /** * submit_bio - submit a bio to the block device layer for I/O * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bio: The &struct bio which describes the I/O * * submit_bio() is very similar in purpose to generic_make_request(), and * uses that function to do most of the work. Both are fairly rough * interfaces; @bio must be presetup and ready for I/O. * */ blk_qc_t submit_bio(int rw, struct bio *bio) 763 { bio->bi_rw |= rw; /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. 
763 */ if (bio_has_data(bio)) { unsigned int count; 761 7 if (unlikely(rw & REQ_WRITE_SAME)) count = bdev_logical_block_size(bio->bi_bdev) >> 9; 761 else count = bio_sectors(bio); 761 if (rw & WRITE) { count_vm_events(PGPGOUT, count); 386 } else { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } 750 if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, bdevname(bio->bi_bdev, b), count); } } 763 return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits * @q: the queue * @rq: the request being checked * * Description: * @rq may have been made based on weaker limitations of upper-level queues * in request stacking drivers, and it may violate the limitation of @q. * Since the block layer and the underlying device driver trust @rq * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * * Request stacking drivers like request-based dm may change the queue * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ static int blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } /* * queue's settings related to segment counting like q->bounce_pfn * may differ from that of other stacking queues. * Recalculate it to check the request correctly on this queue's * limitation. 
*/ blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } return 0; } /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued */ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return -EIO; if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); blk_mq_insert_request(rq, false, true, false); return 0; } spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); return -ENODEV; } /* * Submitting request must be dequeued before calling this function * because it will be linked to another request_queue */ BUG_ON(blk_queued_rq(rq)); if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); if (where == ELEVATOR_INSERT_FLUSH) __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return 0; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. * * Context: * queue_lock must be held. 
*/ unsigned int blk_rq_err_bytes(const struct request *rq) { unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->cmd_flags & REQ_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_rw & ff) != ff) break; bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) 1 { 1 if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; 1 cpu = part_stat_lock(); part = req->part; 1 part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); 1 } } void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. 
1 */ 1 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; 1 cpu = part_stat_lock(); part = req->part; 1 part_stat_inc(cpu, part, ios[rw]); 1 part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); 1 1 hd_struct_put(part); part_stat_unlock(); 1 } } #ifdef CONFIG_PM /* * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ static struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) 645 { if (q->dev && (q->rpm_status == RPM_SUSPENDED || (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) return NULL; else return rq; } #else static inline struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) { return rq; } #endif void blk_account_io_start(struct request *rq, bool new_io) { 761 struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; 761 if (!blk_do_io_stat(rq)) return; 761 cpu = part_stat_lock(); 348 if (!new_io) { 348 part = rq->part; part_stat_inc(cpu, part, merges[rw]); 761 } else { 761 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!hd_struct_try_get(part)) { /* * The partition is already being removed, * the request will be accounted on the disk only * * We take a reference on disk->part0 although that * partition will never be deleted, so we can treat * it as any other partition. */ part = &rq->rq_disk->part0; hd_struct_get(part); 761 } 694 part_round_stats(cpu, part); 761 part_inc_in_flight(part, rw); rq->part = part; } 761 part_stat_unlock(); } /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at * * Description: * Return the request at the top of @q. The returned request * should be started using blk_start_request() before LLD starts * processing it. * * Return: * Pointer to the request at the top of @q if available. 
Null * otherwise. * * Context: * queue_lock must be held. */ 650 struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; 650 while ((rq = __elv_next_request(q)) != NULL) { 645 rq = blk_pm_peek_request(q, rq); if (!rq) break; 645 if (!(rq->cmd_flags & REQ_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. 645 */ 645 if (rq->cmd_flags & REQ_SORTED) elv_activate_rq(q, rq); /* * just mark as started even if we don't start * it, a request that has been delayed should * not be passed by new incoming requests 645 */ 645 rq->cmd_flags |= REQ_STARTED; trace_block_rq_issue(q, rq); } 645 645 if (!q->boundary_rq || q->boundary_rq == rq) { q->end_sector = rq_end_sector(rq); q->boundary_rq = NULL; } 645 if (rq->cmd_flags & REQ_DONTPREP) break; 645 if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * make sure space for the drain appears we * know we can do this because max_hw_segments * has been adjusted to be one fewer than the * device can handle */ rq->nr_phys_segments++; } 645 if (!q->prep_rq_fn) break; 645 ret = q->prep_rq_fn(q, rq); if (ret == BLKPREP_OK) { break; } else if (ret == BLKPREP_DEFER) { /* * the request may have been (partially) prepped. * we need to keep this request in the front to * avoid resource deadlock. REQ_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && !(rq->cmd_flags & REQ_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again */ --rq->nr_phys_segments; } rq = NULL; break; } else if (ret == BLKPREP_KILL) { rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. 
*/ blk_start_request(rq); __blk_end_request_all(rq, -EIO); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; } } 650 return rq; } EXPORT_SYMBOL(blk_peek_request); void blk_dequeue_request(struct request *rq) 645 { struct request_queue *q = rq->q; 645 645 BUG_ON(list_empty(&rq->queuelist)); BUG_ON(ELV_ON_HASH(rq)); 645 list_del_init(&rq->queuelist); /* * the time frame between a request being removed from the lists * and to it is freed is accounted as io that is in progress at * the driver side. 645 */ 645 if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; set_io_start_time_ns(rq); 645 } } /** * blk_start_request - start request processing on the driver * @req: request to dequeue * * Description: * Dequeue @req and start timeout timer on it. This hands off the * request to the driver. * * Block internal functions which don't want to start timer should * call blk_dequeue_request(). * * Context: * queue_lock must be held. */ void blk_start_request(struct request *req) 645 { blk_dequeue_request(req); /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. */ req->resid_len = blk_rq_bytes(req); if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); 645 645 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); /** * blk_fetch_request - fetch a request from a request queue * @q: request queue to fetch a request from * * Description: * Return the request at the top of @q. The request is started on * return and LLD can start processing it immediately. * * Return: * Pointer to the request at the top of @q if available. Null * otherwise. * * Context: * queue_lock must be held. 
*/ struct request *blk_fetch_request(struct request_queue *q) { struct request *rq; rq = blk_peek_request(q); if (rq) blk_start_request(rq); return rq; } EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. * Actual device drivers should use blk_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes; 1 trace_block_rq_complete(req->q, req, nr_bytes); 1 if (!req->bio) return false; /* * For fs requests, rq is just carrier of independent bio's * and each partial completion should be handled separately. * Reset per-request error on each partial completion. * * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. 
1 */ 1 if (req->cmd_type == REQ_TYPE_FS) req->errors = 0; 1 if (error && req->cmd_type == REQ_TYPE_FS && !(req->cmd_flags & REQ_QUIET)) { char *error_type; 1 switch (error) { case -ENOLINK: error_type = "recoverable transport"; break; case -EREMOTEIO: error_type = "critical target"; break; case -EBADE: error_type = "critical nexus"; break; case -ETIMEDOUT: error_type = "timeout"; break; case -ENOSPC: error_type = "critical space allocation"; break; case -ENODATA: error_type = "critical medium"; break; case -EIO: default: error_type = "I/O"; break; 1 } printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", __func__, error_type, req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); } 1 blk_account_io_completion(req, nr_bytes); 1 total_bytes = 0; while (req->bio) { 1 struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); 1 if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; 1 req_bio_endio(req, bio, bio_bytes, error); 1 total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done 1 */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. 1 */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (req->cmd_type == REQ_TYPE_FS) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->cmd_flags & REQ_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; } /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. 
*/ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ blk_recalc_rq_segments(req); 1 return true; } EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_request(rq, error, nr_bytes)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && blk_update_request(rq->next_rq, error, bidi_bytes)) return true; if (blk_queue_add_random(rq->q)) add_disk_randomness(rq->rq_disk); return false; } /** * blk_unprep_request - unprepare a request * @req: the request * * This function makes a request ready for complete resubmission (or * completion). It happens only after all error handling is complete, * so represents the appropriate moment to deallocate any resources * that were allocated to the request in the prep_rq_fn. The queue * lock is held when calling this. 
*/ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; req->cmd_flags &= ~REQ_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ void blk_finish_request(struct request *req, int error) { if (req->cmd_flags & REQ_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) laptop_io_completion(&req->q->backing_dev_info); blk_delete_timer(req); if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); blk_account_io_done(req); if (req->end_io) req->end_io(req, error); else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); __blk_put_request(req->q, req); } } EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. * Drivers that supports bidi can safely call this member for any * type of request, bidi or uni. In the later case @bidi_bytes is * just ignored. 
* * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ static bool blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; spin_lock_irqsave(q->queue_lock, flags); blk_finish_request(rq, error); spin_unlock_irqrestore(q->queue_lock, flags); return false; } /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Identical to blk_end_bidi_request() except that queue lock is * assumed to be locked on entry and remains so on return. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; blk_finish_request(rq, error); return false; } /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: * Ends I/O on a number of bytes attached to @rq. * If @rq has leftover, sets it up for the next range of segments. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish * @error: %0 for success, < %0 for error * * Description: * Completely finish @rq. 
*/ void blk_end_request_all(struct request *rq, int error) { bool pending; unsigned int bidi_bytes = 0; if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(blk_end_request_all); /** * blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for * @error: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool blk_end_request_cur(struct request *rq, int error) { return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } EXPORT_SYMBOL(blk_end_request_cur); /** * blk_end_request_err - Finish a request till the next failure boundary. * @rq: the request to finish till the next failure boundary for * @error: must be negative errno * * Description: * Complete @rq till the next failure boundary. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool blk_end_request_err(struct request *rq, int error) { WARN_ON(error >= 0); return blk_end_request(rq, error, blk_rq_err_bytes(rq)); } EXPORT_SYMBOL_GPL(blk_end_request_err); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: * Must be called with queue lock held unlike blk_end_request(). * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return __blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. 
* @rq: the request to finish * @error: %0 for success, < %0 for error * * Description: * Completely finish @rq. Must be called with queue lock held. */ void __blk_end_request_all(struct request *rq, int error) { bool pending; unsigned int bidi_bytes = 0; if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for * @error: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. Must * be called with queue lock held. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool __blk_end_request_cur(struct request *rq, int error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } EXPORT_SYMBOL(__blk_end_request_cur); /** * __blk_end_request_err - Finish a request till the next failure boundary. * @rq: the request to finish till the next failure boundary for * @error: must be negative errno * * Description: * Complete @rq till the next failure boundary. Must be called * with queue lock held. 
* * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool __blk_end_request_err(struct request *rq, int error) { WARN_ON(error >= 0); return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); } EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { 761 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 761 760 if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); 761 rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; 761 if (bio->bi_bdev) 761 rq->rq_disk = bio->bi_bdev->bd_disk; } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request * @rq: the request to be flushed * * Description: * Flush all pages in @rq. */ void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. 
* * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (q->lld_busy_fn) return q->lld_busy_fn(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; } /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. 
* So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else rq->bio = rq->biotail = bio; } __blk_rq_prep_clone(rq, rq_src); return 0; free_and_out: if (bio) bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); int kblockd_schedule_work(struct work_struct *work) 92 { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) 58 { return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. 
By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) 946 { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. 469 */ if (tsk->plug) return; 946 INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier 946 */ tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); 468 468 return !(rqa->q < rqb->q || (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); } /* * If 'from_schedule' is true, then postpone the dispatch of requests * until a safe kblockd context. We due this to avoid accidental big * additional stack usage in driver dispatch, in places where the originally * plugger did not intend it. 
*/ static void queue_unplugged(struct request_queue *q, unsigned int depth, bool from_schedule) __releases(q->queue_lock) 628 { trace_block_unplug(q, depth, !from_schedule); 628 433 if (from_schedule) blk_run_queue_async(q); 565 else 626 __blk_run_queue(q); spin_unlock(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); 912 while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; unsigned long flags; 912 struct request *rq; LIST_HEAD(list); unsigned int depth; 912 flush_plug_callbacks(plug, from_schedule); 66 if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); 912 910 if (list_empty(&plug->list)) return; 628 list_splice_init(&plug->list, &list); list_sort(NULL, &list, plug_rq_cmp); q = NULL; depth = 0; /* * Save and disable interrupts here, to avoid doing it for every * queue lock we have to take. 
*/ 628 local_irq_save(flags); while (!list_empty(&list)) { 628 rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); 628 BUG_ON(!rq->q); if (rq->q != q) { /* * This drops the queue lock 628 */ if (q) queue_unplugged(q, depth, from_schedule); q = rq->q; 628 depth = 0; spin_lock(q->queue_lock); } /* * Short-circuit if @q is dead 628 */ if (unlikely(blk_queue_dying(q))) { __blk_end_request_all(rq, -ENODEV); continue; } /* * rq is already accounted, so use raw insert 628 */ 139 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); 628 else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); 628 depth++; } /* * This drops the queue lock 628 */ 628 if (q) queue_unplugged(q, depth, from_schedule); 626 local_irq_restore(flags); } 899 void blk_finish_plug(struct blk_plug *plug) 904 { if (plug != current->plug) 899 return; blk_flush_plug_list(plug, false); 899 current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); bool blk_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_plug *plug; long state; 346 if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || 346 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; plug = current->plug; if (plug) blk_flush_plug_list(plug, false); state = current->state; while (!need_resched()) { unsigned int queue_num = blk_qc_t_to_queue_num(cookie); struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; int ret; hctx->poll_invoked++; ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); if (ret > 0) { hctx->poll_success++; set_current_state(TASK_RUNNING); return true; } if (signal_pending_state(state, current)) set_current_state(TASK_RUNNING); if (current->state == TASK_RUNNING) return true; if (ret < 0) break; cpu_relax(); } return false; } #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine * @q: the queue of the device * @dev: the device the queue belongs to * * Description: * Initialize runtime-PM-related fields for @q and start 
auto suspend for * @dev. Drivers that want to take advantage of request-based runtime PM * should call this function after @dev has been initialized, and its * request queue @q has been allocated, and runtime PM for it can not happen * yet(either due to disabled/forbidden or its usage_count > 0). In most * cases, driver should call this function before any I/O has taken place. * * This function takes care of setting up using auto suspend for the device, * the autosuspend delay is set to -1 to make runtime suspend impossible * until an updated value is either set by user or by driver. Drivers do * not need to touch other autosuspend settings. * * The block layer runtime PM is request based, so only works for drivers * that use request as their IO unit instead of those directly use bio's. */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { q->dev = dev; q->rpm_status = RPM_ACTIVE; pm_runtime_set_autosuspend_delay(q->dev, -1); pm_runtime_use_autosuspend(q->dev); } EXPORT_SYMBOL(blk_pm_runtime_init); /** * blk_pre_runtime_suspend - Pre runtime suspend check * @q: the queue of the device * * Description: * This function will check if runtime suspend is allowed for the device * by examining if there are any requests pending in the queue. If there * are requests pending, the device can not be runtime suspended; otherwise, * the queue's status will be updated to SUSPENDING and the driver can * proceed to suspend the device. * * For the not allowed case, we mark last busy for the device so that * runtime PM core will try to autosuspend it some time later. * * This function should be called near the start of the device's * runtime_suspend callback. 
* * Return: * 0 - OK to runtime suspend the device * -EBUSY - Device should not be runtime suspended */ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; if (!q->dev) return ret; spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; pm_runtime_mark_last_busy(q->dev); } else { q->rpm_status = RPM_SUSPENDING; } spin_unlock_irq(q->queue_lock); return ret; } EXPORT_SYMBOL(blk_pre_runtime_suspend); /** * blk_post_runtime_suspend - Post runtime suspend processing * @q: the queue of the device * @err: return value of the device's runtime_suspend function * * Description: * Update the queue's runtime status according to the return value of the * device's runtime suspend function and mark last busy for the device so * that PM core will try to auto suspend the device at a later time. * * This function should be called near the end of the device's * runtime_suspend callback. */ void blk_post_runtime_suspend(struct request_queue *q, int err) { if (!q->dev) return; spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; } else { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); } spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_post_runtime_suspend); /** * blk_pre_runtime_resume - Pre runtime resume processing * @q: the queue of the device * * Description: * Update the queue's runtime status to RESUMING in preparation for the * runtime resume of the device. * * This function should be called near the start of the device's * runtime_resume callback. 
*/ void blk_pre_runtime_resume(struct request_queue *q) { if (!q->dev) return; spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_pre_runtime_resume); /** * blk_post_runtime_resume - Post runtime resume processing * @q: the queue of the device * @err: return value of the device's runtime_resume function * * Description: * Update the queue's runtime status according to the return value of the * device's runtime_resume function. If it is successfully resumed, process * the requests that are queued into the device's queue when it is resuming * and then mark last busy and initiate autosuspend for it. * * This function should be called near the end of the device's * runtime_resume callback. */ void blk_post_runtime_resume(struct request_queue *q, int err) { if (!q->dev) return; spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; __blk_run_queue(q); pm_runtime_mark_last_busy(q->dev); pm_request_autosuspend(q->dev); } else { q->rpm_status = RPM_SUSPENDED; } spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_post_runtime_resume); #endif int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; } /* * Blk IO latency support. We want this to be as cheap as possible, so doing * this lockless (and avoiding atomics), a few off by a few errors in this * code is not harmful, and we don't want to do anything that is * perf-impactful. 
* TODO : If necessary, we can make the histograms per-cpu and aggregate
 * them when printing them out.
 */
/**
 * blk_latency_hist_show - format an I/O latency histogram as text
 * @name:     label inserted into the histogram header line
 * @s:        latency state holding the element count, sum and buckets
 * @buf:      destination buffer
 * @buf_size: number of bytes available at @buf
 *
 * Writes a header line plus one line per histogram bucket (the last bucket
 * being the overflow bucket) into @buf.  Nothing is written when no samples
 * have been recorded or when @buf_size is not positive.
 *
 * Every write is bounded by the space remaining out of @buf_size, so the
 * output never runs past the buffer the caller supplied, including when
 * the caller passes a region smaller than a page.
 *
 * Return: number of bytes written to @buf.
 */
ssize_t
blk_latency_hist_show(char* name, struct io_latency_state *s, char *buf,
		int buf_size)
{
	int i;
	int bytes_written = 0;
	u_int64_t num_elem, elem;
	int pct;
	u_int64_t average;

	if (buf_size <= 0)
		return 0;

	/*
	 * The statistics are updated locklessly, so take a single snapshot of
	 * the element count and use it for both the check and the divisions;
	 * re-reading it could observe zero after the check passed.
	 */
	num_elem = s->latency_elems;
	if (num_elem > 0) {
		average = div64_u64(s->latency_sum, num_elem);
		bytes_written += scnprintf(buf + bytes_written,
				buf_size - bytes_written,
				"IO svc_time %s Latency Histogram (n = %llu,"
				" average = %llu):\n", name, num_elem, average);
		for (i = 0;
		     i < ARRAY_SIZE(latency_x_axis_us);
		     i++) {
			elem = s->latency_y_axis[i];
			pct = div64_u64(elem * 100, num_elem);
			bytes_written += scnprintf(buf + bytes_written,
					buf_size - bytes_written,
					"\t< %6lluus%15llu%15d%%\n",
					latency_x_axis_us[i],
					elem, pct);
		}
		/* Last element in y-axis table is overflow */
		elem = s->latency_y_axis[i];
		pct = div64_u64(elem * 100, num_elem);
		bytes_written += scnprintf(buf + bytes_written,
				buf_size - bytes_written,
				"\t>=%6lluus%15llu%15d%%\n",
				latency_x_axis_us[i - 1],
				elem, pct);
	}
	return bytes_written;
}
EXPORT_SYMBOL(blk_latency_hist_show);
/* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/syscalls.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/file.h> #include <asm/uaccess.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> #include <linux/hardirq.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/kasan.h> #include "internal.h" #include "mount.h" /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_anon bl list spinlock protects: * - the s_anon list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_subdirs * - childrens' d_child and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_anon lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... 
* dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * if (dentry1 < dentry2) * dentry1->d_lock * dentry2->d_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return dentry_hashtable + hash_32(hash, d_hash_shift); } /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 
0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { 1996 a = *(unsigned long *)cs; b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; 361 if (unlikely(a != b)) return 1; 360 cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } 1995 mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { const unsigned char *cs; /* * Be careful about RCU walk racing with rename: * use ACCESS_ONCE to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. 
The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ 1230 cs = ACCESS_ONCE(dentry->d_name.name); smp_read_barrier_depends(); 1996 return dentry_string_cmp(cs, ct, tcount); } struct external_name { union { atomic_t count; struct rcu_head head; } u; unsigned char name[]; }; static inline struct external_name *external_name(struct dentry *dentry) { 4 return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); 467 kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { 840 return dentry->d_name.name != dentry->d_iname; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { 90 spin_lock(&dentry->d_lock); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); 9 atomic_inc(&p->u.count); spin_unlock(&dentry->d_lock); name->name = p->name; } else { memcpy(name->inline_name, dentry->d_iname, 81 dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } 90 } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { 90 if (unlikely(name->name != name->inline_name)) { struct external_name *p; p = container_of(name->name, struct external_name, name[0]); 9 if (unlikely(atomic_dec_and_test(&p->u.count))) 4 kfree_rcu(p, u.head); } 90 } 
EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; WRITE_ONCE(dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { 569 unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; } 467 static void dentry_free(struct dentry *dentry) { 830 WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); 830 if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); 15 if (likely(atomic_dec_and_test(&p->u.count))) { 12 call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ 819 if (!(dentry->d_flags & DCACHE_RCUACCESS)) 467 __d_free(&dentry->d_u.d_rcu); else 389 call_rcu(&dentry->d_u.d_rcu, __d_free); } /** * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). */ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) { 280 lockdep_assert_held(&dentry->d_lock); /* Go through am invalidation barrier */ 280 write_seqcount_invalidate(&dentry->d_seq); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. Dentry has no refcount * and is unhashed. 
*/ static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { 832 struct inode *inode = dentry->d_inode; if (inode) { 569 __d_clear_type_and_inode(dentry); 569 hlist_del_init(&dentry->d_u.d_alias); 569 spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) 43 fsnotify_inoderemove(inode); 569 if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else 569 iput(inode); } else { 288 spin_unlock(&dentry->d_lock); } } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. dentry remains in-use. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); 70 hlist_del_init(&dentry->d_u.d_alias); 70 raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) 70 fsnotify_inoderemove(inode); 70 if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else 70 iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. 
*/ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { 215 D_FLAG_VERIFY(dentry, 0); 215 dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { 21 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 21 dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); 21 } static void d_shrink_del(struct dentry *dentry) { 7 D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); 7 list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { 7 D_FLAG_VERIFY(dentry, 0); 7 list_add(&dentry->d_lru, list); 7 dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; list_lru_isolate_move(lru, &dentry->d_lru, list); } /* * dentry_lru_(add|del)_list) must be called with d_lock held. 
*/ static void dentry_lru_add(struct dentry *dentry) { 283 if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) 215 d_lru_add(dentry); } /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock. */ void __d_drop(struct dentry *dentry) { 880 if (!d_unhashed(dentry)) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_alias, which are always IS_ROOT: */ 280 if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_anon; else 280 b = d_hash(dentry->d_parent, dentry->d_name.hash); 280 hlist_bl_lock(b); 280 __hlist_bl_del(&dentry->d_hash); 280 dentry->d_hash.pprev = NULL; 280 hlist_bl_unlock(b); 280 dentry_rcuwalk_invalidate(dentry); } 880 } EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) { 24 spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; 832 if (!IS_ROOT(dentry)) parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. */ 832 lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. 
*/ if (dentry->d_flags & DCACHE_OP_PRUNE) 16 dentry->d_op->d_prune(dentry); 832 if (dentry->d_flags & DCACHE_LRU_LIST) { 15 if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) 15 d_lru_del(dentry); } /* if it was on the hash then remove it */ 832 __d_drop(dentry); 832 __list_del_entry(&dentry->d_child); /* * Inform d_walk() that we are no longer attached to the * dentry tree */ 832 dentry->d_flags |= DCACHE_DENTRY_KILLED; if (parent) 355 spin_unlock(&parent->d_lock); 832 dentry_iput(dentry); /* * dentry_iput drops the locks, at which point nobody (except * transient RCU lookups) can reach this dentry. */ 830 BUG_ON(dentry->d_lockref.count > 0); 830 this_cpu_dec(nr_dentry); 681 if (dentry->d_op && dentry->d_op->d_release) 8 dentry->d_op->d_release(dentry); 830 spin_lock(&dentry->d_lock); if (dentry->d_flags & DCACHE_SHRINK_LIST) { dentry->d_flags |= DCACHE_MAY_FREE; can_free = false; } 830 spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); 830 } /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. 
*/ static struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) { 831 struct inode *inode = dentry->d_inode; struct dentry *parent = NULL; 568 if (inode && unlikely(!spin_trylock(&inode->i_lock))) goto failed; 831 if (!IS_ROOT(dentry)) { parent = dentry->d_parent; 350 if (unlikely(!spin_trylock(&parent->d_lock))) { if (inode) spin_unlock(&inode->i_lock); goto failed; } } 831 __dentry_kill(dentry); return parent; failed: spin_unlock(&dentry->d_lock); return dentry; /* try again with same dentry */ } static inline struct dentry *lock_parent(struct dentry *dentry) { 1 struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; 7 if (unlikely(dentry->d_lockref.count < 0)) return NULL; 7 if (likely(spin_trylock(&parent->d_lock))) return parent; rcu_read_lock(); spin_unlock(&dentry->d_lock); again: parent = ACCESS_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure * that we won't violate the locking order. * Any changes of dentry->d_parent must have * been done with parent->d_lock held, so * spin_lock() above is enough of a barrier * for checking if it's still our child. */ if (unlikely(parent != dentry->d_parent)) { spin_unlock(&parent->d_lock); goto again; } if (parent != dentry) { spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (unlikely(dentry->d_lockref.count < 0)) { spin_unlock(&parent->d_lock); parent = NULL; } } else { parent = NULL; } rcu_read_unlock(); return parent; } /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; unsigned int d_flags; /* * If we have a d_op->d_delete() operation, we sould not * let the dentry count go to zero, so use "put_or_lock". 
*/ 2852 if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) 1965 return lockref_put_or_lock(&dentry->d_lockref); /* * .. otherwise, we can try to just decrement the * lockref optimistically. */ 1945 ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { 1945 spin_lock(&dentry->d_lock); if (dentry->d_lockref.count > 1) { 1579 dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return 1; } return 0; } /* * If we weren't the last ref, we're done. */ if (ret) return 1; /* * Careful, careful. The reference count went down * to zero, but we don't hold the dentry lock, so * somebody else could get it again, and do another * dput(), and we need to not race with that. * * However, there is a very special and common case * where we don't care, because there is nothing to * do: the dentry is still hashed, it does not have * a 'delete' op, and it's referenced and already on * the LRU list. * * NOTE! Since we aren't locked, these values are * not "stable". However, it is sufficient that at * some point after we dropped the reference the * dentry was hashed and the flags had the proper * value. Other dentry users may have re-gotten * a reference to the dentry and change that, but * our work is done - we can leave the dentry * around with a zero refcount. */ smp_rmb(); d_flags = ACCESS_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) return 1; /* * Not the fast normal case? Get the lock. We've already decremented * the refcount, but we'll need to re-check the situation after * getting the lock. 
*/ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); return 1; } /* * Re-get the reference we optimistically dropped. We hold the * lock, and we just tested that it was zero, so we can just * set it to 1. */ dentry->d_lockref.count = 1; return 0; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ 2852 void dput(struct dentry *dentry) { 2869 if (unlikely(!dentry)) return; repeat: 2852 might_sleep(); 2852 rcu_read_lock(); 2852 if (likely(fast_dput(dentry))) { 2558 rcu_read_unlock(); 2852 return; } /* Slow case: now with the dentry lock held */ 1013 rcu_read_unlock(); /* Unreachable? 
Get rid of it */ if (unlikely(d_unhashed(dentry))) goto kill_it; 441 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) goto kill_it; 441 if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { 353 if (dentry->d_op->d_delete(dentry)) goto kill_it; } 283 if (!(dentry->d_flags & DCACHE_REFERENCED)) 215 dentry->d_flags |= DCACHE_REFERENCED; 283 dentry_lru_add(dentry); 283 dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; kill_it: 831 dentry = dentry_kill(dentry); 350 if (dentry) { 350 cond_resched(); goto repeat; } } EXPORT_SYMBOL(dput); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { 68 dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; /* * Do optimistic parent lookup without any * locking. */ 66 rcu_read_lock(); 66 ret = ACCESS_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); 66 rcu_read_unlock(); if (likely(gotref)) { 66 if (likely(ret == ACCESS_ONCE(dentry->d_parent))) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. 
* Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias, *discon_alias; again: discon_alias = NULL; 68 hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { 68 spin_lock(&alias->d_lock); 68 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 68 if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; } else { 68 __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } } spin_unlock(&alias->d_lock); } if (discon_alias) { alias = discon_alias; spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); goto again; } return NULL; } struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; 68 if (!hlist_empty(&inode->i_dentry)) { 68 spin_lock(&inode->i_lock); 68 de = __d_find_alias(inode); 68 spin_unlock(&inode->i_lock); } 68 return de; } EXPORT_SYMBOL(d_find_alias); /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. 
*/ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { struct dentry *parent = lock_parent(dentry); if (likely(!dentry->d_lockref.count)) { __dentry_kill(dentry); dput(parent); goto restart; } if (parent) spin_unlock(&parent->d_lock); } spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_prune_aliases); static void shrink_dentry_list(struct list_head *list) { struct dentry *dentry, *parent; 7 while (!list_empty(list)) { struct inode *inode; 7 dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); 7 parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted * to the LRU here, so we can simply remove it from the list * here regardless of whether it is referenced or not. */ 7 d_shrink_del(dentry); /* * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ if (dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); continue; } 7 if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { bool can_free = dentry->d_flags & DCACHE_MAY_FREE; spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); if (can_free) dentry_free(dentry); continue; } 7 inode = dentry->d_inode; 7 if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); continue; } 7 __dentry_kill(dentry); /* * We need to prune ancestors too. This is necessary to prevent * quadratic behavior of shrink_dcache_parent(), but is also * expected to be beneficial in reducing dentry cache * fragmentation. 
*/ dentry = parent; 7 while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { 1 parent = lock_parent(dentry); 1 if (dentry->d_lockref.count != 1) { dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); break; } 1 inode = dentry->d_inode; /* can't be NULL */ if (unlikely(!spin_trylock(&inode->i_lock))) { spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); cpu_relax(); continue; } 1 __dentry_kill(dentry); dentry = parent; } } 7 } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add and list_lru_del. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. 
Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. 
*/ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * @finish: callback when successfully finished the walk * * The @enter() and @finish() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *), void (*finish)(void *)) { struct dentry *this_parent; struct list_head *next; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: 63 read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; 63 spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); 63 switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: 63 next = this_parent->d_subdirs.next; resume: 63 while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; 15 struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } 15 if (!list_empty(&dentry->d_subdirs)) { 3 spin_unlock(&this_parent->d_lock); 
spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } 15 spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ 63 rcu_read_lock(); ascend: 63 if (this_parent != parent) { struct dentry *child = this_parent; 3 this_parent = child->d_parent; spin_unlock(&child->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ 3 if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ do { 3 next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; 2 child = list_entry(next, struct dentry, d_child); } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); 2 rcu_read_unlock(); goto resume; } 63 if (need_seqretry(&rename_lock, seq)) goto rename_retry; 63 rcu_read_unlock(); if (finish) 17 finish(data); out_unlock: 63 spin_unlock(&this_parent->d_lock); 63 done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } /* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs * list is non-empty and continue searching. */ static enum d_walk_ret check_mount(void *data, struct dentry *dentry) { int *ret = data; if (d_mountpoint(dentry)) { *ret = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * have_submounts - check for mounts over a dentry * @parent: dentry to check. * * Return true if the parent or its subdirectories contain * a mount point */ int have_submounts(struct dentry *parent) { int ret = 0; d_walk(parent, &ret, check_mount, NULL); return ret; } EXPORT_SYMBOL(have_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. 
NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; 153 write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ 133 spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } 133 spin_unlock(&p->d_lock); } 153 spin_lock(&dentry->d_lock); 117 if (!d_unlinked(dentry)) { ret = -EBUSY; 153 if (!d_mountpoint(dentry)) { 153 dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } 153 spin_unlock(&dentry->d_lock); out: 153 write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; struct list_head dispose; int found; }; 7 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; 63 if (data->start == dentry) goto out; 15 if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else { 15 if (dentry->d_flags & DCACHE_LRU_LIST) 6 d_lru_del(dentry); 15 if (!dentry->d_lockref.count) { 7 d_shrink_add(dentry, &data->dispose); data->found++; } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. 
*/ 15 if (!list_empty(&data->dispose)) 7 ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: 63 return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data; 46 INIT_LIST_HEAD(&data.dispose); data.start = parent; data.found = 0; d_walk(parent, &data, select_collect, NULL); 46 if (!data.found) break; 6 shrink_dentry_list(&data.dispose); cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ 24 if (!list_empty(&dentry->d_subdirs)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ 24 if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? 
dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); WARN_ON(1); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { 24 shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check, NULL); d_drop(dentry); 24 dput(dentry); 24 } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; 24 WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); 24 dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); do_one_tree(dentry); } 24 } struct detach_data { struct select_data select; struct dentry *mountpoint; }; static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) { struct detach_data *data = _data; 17 if (d_mountpoint(dentry)) { __dget_dlock(dentry); data->mountpoint = dentry; return D_WALK_QUIT; } 17 return select_collect(&data->select, dentry); } 17 static void check_and_drop(void *_data) { struct detach_data *data = _data; 17 if (!data->mountpoint && list_empty(&data->select.dispose)) 17 __d_drop(data->select.start); 17 } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) * * no dcache lock. * * The final d_drop is done as an atomic operation relative to * rename_lock ensuring there are no races with d_set_mounted. This * ensures there are no unhashed dentries on the path to a mountpoint. */ 17 void d_invalidate(struct dentry *dentry) { /* * If it's already been dropped, return OK. 
*/ 17 spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } 17 spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) { d_drop(dentry); return; } for (;;) { struct detach_data data; 17 data.mountpoint = NULL; INIT_LIST_HEAD(&data.select.dispose); data.select.start = dentry; data.select.found = 0; d_walk(dentry, &data, detach_and_collect, check_and_drop); if (!list_empty(&data.select.dispose)) 1 shrink_dentry_list(&data.select.dispose); else if (!data.mountpoint) 17 return; if (data.mountpoint) { detach_mounts(data.mountpoint); dput(data.mountpoint); 17 } cond_resched(); } 1 } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) 2076 return NULL; /* * We guarantee that the inline name is always NUL-terminated. 
* This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { 2076 size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); if (!p) { 20 kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->u.count, 1); dname = p->name; 20 if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS)) kasan_unpoison_shadow(dname, round_up(name->len + 1, sizeof(unsigned long))); } else { dname = dentry->d_iname; } 2063 dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; 2076 memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_wmb(); dentry->d_name.name = dname; dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); this_cpu_inc(nr_dentry); return dentry; } 2076 /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. 
*/ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) 1140 return NULL; dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); 1140 /* * don't need child lock because it is not subject * to concurrency here */ __dget_dlock(parent); dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); 1140 1140 return dentry; } 1140 EXPORT_SYMBOL(d_alloc); /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { return __d_alloc(sb, name); } 1101 EXPORT_SYMBOL(d_alloc_pseudo); struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.len = strlen(name); 149 q.hash = full_name_hash(q.name, q.len); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | 2076 DCACHE_OP_COMPARE | 2076 DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_SELECT_INODE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) 2076 return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; 1815 if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; 1815 if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; 1815 if (op->d_weak_revalidate) 499 dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; 1815 if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; 1815 if (op->d_prune) 898 dentry->d_flags |= DCACHE_OP_PRUNE; 1815 if (op->d_select_inode) 15 dentry->d_flags |= DCACHE_OP_SELECT_INODE; 1815 if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; 1815 2076 } 
EXPORT_SYMBOL(d_set_d_op); /* * d_set_fallthru - Mark a dentry as falling through to a lower layer * @dentry - The dentry to mark * * Mark a dentry as falling through to the lower layer (as set with * d_pin_lower()). This flag may be recorded on the medium. */ void d_set_fallthru(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_FALLTHRU; spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_set_fallthru); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; 2024 if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; 1952 if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) 628 add_flags = DCACHE_AUTODIR_TYPE; 628 else inode->i_opflags |= IOP_LOOKUP; } 628 goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->follow_link)) { 1788 add_flags = DCACHE_SYMLINK_TYPE; 1758 goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } 1644 if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; 1676 type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; 2024 return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); 2024 spin_lock(&dentry->d_lock); if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); 1952 __d_set_inode_and_type(dentry, inode, add_flags); 2024 raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } 1952 2024 /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! 
This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) 1986 spin_lock(&inode->i_lock); 1986 __d_instantiate(entry, inode); 1917 if (inode) 432 spin_unlock(&inode->i_lock); security_d_instantiate(entry, inode); } 1986 EXPORT_SYMBOL(d_instantiate); /** * d_instantiate_unique - instantiate a non-aliased dentry * @entry: dentry to instantiate * @inode: inode to attach to this dentry * * Fill in inode information in the entry. On success, it returns NULL. * If an unhashed alias of "entry" already exists, then we return the * aliased dentry instead and drop one reference to inode. * * Note that in order to avoid conflicts with rename() etc, the caller * had better be holding the parent directory semaphore. * * This also assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ static struct dentry *__d_instantiate_unique(struct dentry *entry, struct inode *inode) { struct dentry *alias; int len = entry->d_name.len; const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; if (!inode) { __d_instantiate(entry, NULL); return NULL; } hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. 
*/ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (alias->d_name.len != len) continue; if (dentry_cmp(alias, name, len)) continue; __dget(alias); return alias; } __d_instantiate(entry, inode); return NULL; } struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) { struct dentry *result; BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) spin_lock(&inode->i_lock); result = __d_instantiate_unique(entry, inode); if (inode) spin_unlock(&inode->i_lock); if (!result) { security_d_instantiate(entry, inode); return NULL; } BUG_ON(!d_unhashed(result)); iput(inode); return result; } EXPORT_SYMBOL(d_instantiate_unique); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); 47 lockdep_annotate_inode_mutex_key(inode); 47 spin_lock(&inode->i_lock); 47 __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); 47 spin_unlock(&inode->i_lock); security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate_new); /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. If a directory alias is found, then * return an error (and drop inode). Together with d_materialise_unique() this * guarantees that a directory inode may never have more than one alias. 
*/ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { spin_unlock(&inode->i_lock); iput(inode); return -EBUSY; } __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); security_d_instantiate(entry, inode); return 0; } EXPORT_SYMBOL(d_instantiate_no_diralias); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { static const struct qstr name = QSTR_INIT("/", 1); 162 res = __d_alloc(root_inode->i_sb, &name); if (res) { 162 res->d_flags |= DCACHE_RCUACCESS; d_instantiate(res, root_inode); 162 } else { iput(root_inode); } } return res; } 162 EXPORT_SYMBOL(d_make_root); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; 6 alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); __dget(alias); 26 return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. 
*/ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); 26 spin_unlock(&inode->i_lock); 26 return de; 26 } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) { static const struct qstr anonstring = QSTR_INIT("/", 1); struct dentry *tmp; struct dentry *res; unsigned add_flags; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); res = d_find_any_alias(inode); if (res) goto out_iput; tmp = __d_alloc(inode->i_sb, &anonstring); if (!tmp) { res = ERR_PTR(-ENOMEM); goto out_iput; } spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(tmp); goto out_iput; } /* attach a disconnected dentry */ add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&tmp->d_lock); __d_set_inode_and_type(tmp, inode, add_flags); hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); hlist_bl_unlock(&tmp->d_sb->s_anon); spin_unlock(&tmp->d_lock); spin_unlock(&inode->i_lock); security_d_instantiate(tmp, inode); return tmp; out_iput: if (res && !IS_ERR(res)) security_d_instantiate(res, inode); iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. 
In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, 1); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, 0); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the the case-exact dentry * already exists in in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. 
*/ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found; struct dentry *new; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (!found) { new = d_alloc(dentry->d_parent, name); if (!new) { found = ERR_PTR(-ENOMEM); } else { found = d_splice_alias(inode, new); if (found) { dput(new); return found; } return new; } } iput(inode); return found; } EXPORT_SYMBOL(d_add_ci); /* * Do the slow-case of the dentry name compare. * * Unlike the dentry_cmp() function, we need to atomically * load the name and length information, so that the * filesystem can rely on them, and can use the 'name' and * 'len' information without worrying about walking off the * end of memory etc. * * Thus the read_seqcount_retry() and the "duplicate" info * in arguments (the low-level filesystem should not look * at the dentry inode or name contents directly, since * rename can change them while we're in RCU mode). */ enum slow_d_compare { D_COMP_OK, D_COMP_NOMATCH, D_COMP_SEQRETRY, }; static noinline enum slow_d_compare slow_dentry_cmp( const struct dentry *parent, struct dentry *dentry, unsigned int seq, const struct qstr *name) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); return D_COMP_SEQRETRY; } if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) return D_COMP_NOMATCH; return D_COMP_OK; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. 
* * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; 1685 struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; 1685 seqretry: /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. 
* * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) 1607 continue; if (d_unhashed(dentry)) continue; 1604 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; *seqp = seq; switch (slow_dentry_cmp(parent, dentry, seq, name)) { case D_COMP_OK: return dentry; case D_COMP_NOMATCH: continue; default: goto seqretry; } } if (dentry->d_name.hash_len != hashlen) continue; 1604 *seqp = seq; if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) 1603 return dentry; 1607 } return NULL; } 1685 /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); 1544 if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; 1032 } 1544 EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. 
* * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; 1646 const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(parent, hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); 1646 hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { 1646 if (dentry->d_name.hash != hash) continue; 1245 spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) 1231 goto next; if (d_unhashed(dentry)) goto next; 1231 /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). 
*/ if (parent->d_flags & DCACHE_OP_COMPARE) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) goto next; } else { if (dentry->d_name.len != len) goto next; 1231 if (dentry_cmp(dentry, str, len)) goto next; 1230 } dentry->d_lockref.count++; found = dentry; 1229 spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } 2 rcu_read_unlock(); 1646 return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { 67 int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } 67 EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode; 129 int isdir = 0; /* * Are we the only user? 
*/ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; 129 isdir = S_ISDIR(inode->i_mode); if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); 70 cpu_relax(); goto again; } dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); 70 fsnotify_nameremove(dentry, isdir); 70 return; 70 } if (!d_unhashed(dentry)) __d_drop(dentry); 62 62 spin_unlock(&dentry->d_lock); 62 fsnotify_nameremove(dentry, isdir); } 129 EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { 1082 BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); 1082 hlist_bl_add_head_rcu(&entry->d_hash, b); 1082 hlist_bl_unlock(b); 1082 } 1082 1082 static void _d_rehash(struct dentry * entry) { __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); _d_rehash(entry); 1068 spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); /** * dentry_update_name_case - update case insensitive dentry with a new name * @dentry: dentry to be updated * @name: new name * * Update a case insensitive dentry with new case of name. * * dentry must have been returned by d_lookup with name @name. Old and new * name lengths must match (ie. no d_compare which allows mismatched name * lengths). * * Parent inode i_mutex must be held over d_lookup and into this call (to * keep renames and concurrent inserts, and readdir(2) away). 
*/ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); write_seqcount_begin(&dentry->d_seq); memcpy((unsigned char *)dentry->d_name.name, name->name, name->len); write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dentry_update_name_case); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* 2 * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { 1 /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ memcpy(target->d_iname, dentry->d_name.name, dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; 1 target->d_name.name = target->d_iname; } } else { if (unlikely(dname_external(dentry))) { /* 6 * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); target->d_name.name = dentry->d_name.name; 2 dentry->d_name.name = dentry->d_iname; } else { /* * Both are internal. 
*/ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); 4 } } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } 8 static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); 21 if (unlikely(dname_external(target))) { 4 atomic_inc(&external_name(target)->u.count); 21 dentry->d_name = target->d_name; 3 } else { memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); dentry->d_name.name = dentry->d_iname; 18 dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head); 21 } static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) { /* * XXXX: do we really need to take target->d_lock? 
*/ if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent) spin_lock(&target->d_parent->d_lock); 29 else { 19 if (d_ancestor(dentry->d_parent, target->d_parent)) { spin_lock(&dentry->d_parent->d_lock); 10 spin_lock_nested(&target->d_parent->d_lock, 6 DENTRY_D_LOCK_NESTED); } else { spin_lock(&target->d_parent->d_lock); spin_lock_nested(&dentry->d_parent->d_lock, 4 DENTRY_D_LOCK_NESTED); } } if (target < dentry) { spin_lock_nested(&target->d_lock, 2); 29 spin_lock_nested(&dentry->d_lock, 3); 24 } else { spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); 24 } } static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) { if (target->d_parent != dentry->d_parent) spin_unlock(&dentry->d_parent->d_lock); if (target->d_parent != target) 10 spin_unlock(&target->d_parent->d_lock); 29 spin_unlock(&target->d_lock); 29 spin_unlock(&dentry->d_lock); 29 } /* * When switching names, the actual string doesn't strictly have to * be preserved in the target - because we're dropping the target * anyway. As such, we can just do a simple memcpy() to copy over * the new name before we switch, unless we are going to rehash * it. Note that if we *do* unhash the target, we are not allowed * to rehash it without giving it a new name/hash key - whether * we swap or overwrite the names here, resulting name won't match * the reality in filesystem; it's only there for d_path() purposes. * Note that all of this is happening under rename_lock, so the * any hash lookup seeing it in the middle of manipulations will * be discarded anyway. So we do not care what happens to the hash * key in that case. */ /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. 
Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) 29 { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 29 BUG_ON(d_ancestor(dentry, target)); BUG_ON(d_ancestor(target, dentry)); 29 29 dentry_lock_for_move(dentry, target); 29 write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); 29 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ /* * Move the dentry to the target hash queue. Don't bother checking * for the same hash queue because of how unlikely it is. */ __d_drop(dentry); __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); /* * Unhash the target (d_delete() is not usable here). If exchanging * the two dentries, then rehash onto the other's hash queue. */ __d_drop(target); if (exchange) { __d_rehash(target, d_hash(dentry->d_parent, dentry->d_name.hash)); 8 } /* Switch the names.. */ if (exchange) swap_names(dentry, target); else 8 copy_name(dentry, target); 21 /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ 29 dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_parent = target->d_parent; target->d_parent = target; list_del_init(&target->d_child); list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); } else { /* swapping two dentries */ swap(dentry->d_parent, target->d_parent); list_move(&target->d_child, &target->d_parent->d_subdirs); 29 list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); 29 if (exchange) 29 fsnotify_d_move(target); 29 fsnotify_d_move(dentry); 8 } 29 write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); 29 dentry_unlock_for_move(dentry, target); } 29 /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. 
Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); 21 write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); 8 WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); 8 WARN_ON(IS_ROOT(dentry2)); 8 8 __d_move(dentry1, dentry2, true); 8 write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) 64 return p; 64 } return NULL; } 14 /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... 
*/
static int __d_unalias(struct inode *inode,
		struct dentry *dentry, struct dentry *alias)
{
	struct mutex *rename_mutex = NULL;
	struct mutex *parent_mutex = NULL;
	int ret = -ESTALE;

	/* If alias and dentry share a parent, then no extra locks required */
	if (alias->d_parent == dentry->d_parent)
		goto out_unalias;

	/*
	 * See lock_rename().  Only trylocks are used here; if either
	 * mutex is contended we give up with -ESTALE.
	 */
	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
		goto out_err;
	rename_mutex = &dentry->d_sb->s_vfs_rename_mutex;

	if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
		goto out_err;
	parent_mutex = &alias->d_parent->d_inode->i_mutex;

out_unalias:
	__d_move(alias, dentry, false);
	ret = 0;

out_err:
	/* Drop whatever we managed to take, innermost first. */
	if (parent_mutex)
		mutex_unlock(parent_mutex);
	if (rename_mutex)
		mutex_unlock(rename_mutex);
	return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
*/ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); 52 BUG_ON(!d_unhashed(dentry)); 52 if (!inode) { __d_instantiate(dentry, NULL); 52 goto out; 46 } spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { 6 struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { 6 /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { int err = __d_unalias(inode, dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); security_d_instantiate(new, inode); } iput(inode); return new; } } /* already taking inode->i_lock, so d_add() by hand */ __d_instantiate(dentry, inode); spin_unlock(&inode->i_lock); 6 out: security_d_instantiate(dentry, inode); d_rehash(dentry); 52 return NULL; } 52 EXPORT_SYMBOL(d_splice_alias); static int prepend(char **buffer, int *buflen, const char *str, int namelen) { *buflen -= namelen; if (*buflen < 0) 388 return -ENAMETOOLONG; *buffer -= namelen; memcpy(*buffer, str, namelen); 93 return 0; } /** * prepend_name - prepend a pathname in front of current buffer pointer * @buffer: buffer pointer * @buflen: allocated length of the buffer * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. * The length cannot be trusted, we need to copy it byte-by-byte until * the length is reached or a null byte is found. 
It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry it again when a d_move() does happen. So any garbage in the buffer
 * due to mismatched pointer and length will be discarded.
 *
 * Data dependency barrier is needed to make sure that we see that terminating
 * NUL.  Alpha strikes again, film at 11...
 */
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
{
	const char *dname = ACCESS_ONCE(name->name);
	u32 dlen = ACCESS_ONCE(name->len);
	char *p;

	smp_read_barrier_depends();

	/* Reserve room for the leading '/' plus up to dlen name bytes */
	*buflen -= dlen + 1;
	if (*buflen < 0)
		return -ENAMETOOLONG;
	p = *buffer -= dlen + 1;
	*p++ = '/';
	/* Byte-wise copy; stop early at a NUL since dlen may not match dname */
	while (dlen--) {
		char c = *dname++;
		if (!c)
			break;
		*p++ = c;
	}
	return 0;
}

/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buffer: pointer to the end of the buffer
 * @buflen: pointer to buffer length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
*/
static int prepend_path(const struct path *path,
			const struct path *root,
			char **buffer, int *buflen)
{
	struct dentry *dentry;
	struct vfsmount *vfsmnt;
	struct mount *mnt;
	int error = 0;
	unsigned seq, m_seq = 0;
	char *bptr;
	int blen;

	rcu_read_lock();
restart_mnt:
	read_seqbegin_or_lock(&mount_lock, &m_seq);
	seq = 0;
	rcu_read_lock();
restart:
	/* Every (re)start works on private copies of the caller's cursor */
	bptr = *buffer;
	blen = *buflen;
	error = 0;
	dentry = path->dentry;
	vfsmnt = path->mnt;
	mnt = real_mount(vfsmnt);
	read_seqbegin_or_lock(&rename_lock, &seq);
	while (dentry != root->dentry || vfsmnt != root->mnt) {
		struct dentry * parent;

		if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
			struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
			/* Escaped? */
			if (dentry != vfsmnt->mnt_root) {
				/* discard what was built so far, report 3 */
				bptr = *buffer;
				blen = *buflen;
				error = 3;
				break;
			}
			/* Global root? */
			if (mnt != parent) {
				/* cross the mountpoint into the parent mount */
				dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
				mnt = parent;
				vfsmnt = &mnt->mnt;
				continue;
			}
			/* @root was never reached: 1 if mounted, else 2 */
			if (!error)
				error = is_mounted(vfsmnt) ? 1 : 2;
			break;
		}
		parent = dentry->d_parent;
		prefetch(parent);
		error = prepend_name(&bptr, &blen, &dentry->d_name);
		if (error)
			break;

		dentry = parent;
	}
	/* An even seq means the lockless pass, which took rcu_read_lock() */
	if (!(seq & 1))
		rcu_read_unlock();
	if (need_seqretry(&rename_lock, seq)) {
		/* retry with rename_lock held */
		seq = 1;
		goto restart;
	}
	done_seqretry(&rename_lock, seq);

	if (!(m_seq & 1))
		rcu_read_unlock();
	if (need_seqretry(&mount_lock, m_seq)) {
		/* retry with mount_lock held */
		m_seq = 1;
		goto restart_mnt;
	}
	done_seqretry(&mount_lock, m_seq);

	/* Nothing was prepended: emit a lone "/" */
	if (error >= 0 && bptr == *buffer) {
		if (--blen < 0)
			error = -ENAMETOOLONG;
		else
			*--bptr = '/';
	}
	*buffer = bptr;
	*buflen = blen;
	return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name.
 *
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
 * "buflen" should be positive.
* * If the path is not reachable from the supplied root, return %NULL. */ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { char *res = buf + buflen; int error; 47 prepend(&res, &buflen, "\0", 1); error = prepend_path(path, root, &res, &buflen); 47 47 if (error < 0) return ERR_PTR(error); if (error > 0) 1 return NULL; 47 return res; } 47 char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; char *res = buf + buflen; int error; prepend(&res, &buflen, "\0", 1); error = prepend_path(path, &root, &res, &buflen); if (error > 1) error = -EINVAL; if (error < 0) return ERR_PTR(error); return res; } EXPORT_SYMBOL(d_absolute_path); /* * same as __d_path but appends "(deleted)" for unlinked files. */ static int path_with_deleted(const struct path *path, const struct path *root, char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { 45 int error = prepend(buf, buflen, " (deleted)", 10); 45 if (error) 1 return error; } return prepend_path(path, root, buf, buflen); } 45 static int prepend_unreachable(char **buffer, int *buflen) { return prepend(buffer, buflen, "(unreachable)", 13); } 1 static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; 45 } while (read_seqcount_retry(&fs->seq, seq)); } /** * d_path - return the path of a dentry * @path: path to report * @buf: buffer to return value in * @buflen: buffer length * * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * * Returns a pointer into the buffer or an error code if the path was * too long. Note: Callers should use the returned pointer, not the passed * in buffer, to use the name! The implementation often starts at an offset * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. 
*/ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; 53 int error; /* * We have various synthetic filesystems that never get mounted. On * these filesystems dentries are never used for lookup purposes, and * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: * * Some pseudo inodes are mountable. When they are mounted * path->dentry == path->mnt->mnt_root. In that case don't call d_dname * and instead have d_path return the mounted path. */ if (path->dentry->d_op && path->dentry->d_op->d_dname && (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) 42 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 14 53 rcu_read_lock(); get_fs_root_rcu(current->fs, &root); 45 error = path_with_deleted(path, &root, &res, &buflen); 45 rcu_read_unlock(); 45 45 if (error < 0) res = ERR_PTR(error); 45 return res; 1 } EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; int sz; va_start(args, fmt); sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1; 8 va_end(args); if (sz > sizeof(temp) || sz > buflen) return ERR_PTR(-ENAMETOOLONG); 8 8 buffer += buflen - sz; return memcpy(buffer, temp, sz); 8 } char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { char *end = buffer + buflen; /* these dentries are never renamed, so d_lock is not needed */ 6 if (prepend(&end, &buflen, " (deleted)", 11) || prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || 6 prepend(&end, &buflen, "/", 1)) 6 end = ERR_PTR(-ENAMETOOLONG); 6 return end; } 6 EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. 
*/ static char *__dentry_path(struct dentry *d, char *buf, int buflen) { struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; if (buflen < 2) goto Elong; 339 rcu_read_lock(); restart: 339 dentry = d; end = buf + buflen; len = buflen; prepend(&end, &len, "\0", 1); /* Get '/' right */ 339 retval = end-1; *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { 339 struct dentry *parent = dentry->d_parent; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); 269 if (error) break; retval = end; dentry = parent; 269 } if (!(seq & 1)) rcu_read_unlock(); 339 if (need_seqretry(&rename_lock, seq)) { 339 seq = 1; goto restart; } done_seqretry(&rename_lock, seq); if (error) goto Elong; 339 return retval; Elong: return ERR_PTR(-ENAMETOOLONG); } 339 char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { return __dentry_path(dentry, buf, buflen); } 322 EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(struct dentry *dentry, char *buf, int buflen) { char *p = NULL; char *retval; if (d_unlinked(dentry)) { p = buf + buflen; 17 if (prepend(&p, &buflen, "//deleted", 10) != 0) 1 goto Elong; 1 buflen++; } retval = __dentry_path(dentry, buf, buflen); if (!IS_ERR(retval) && p) 17 *p = '/'; /* restore '/' overriden with '\0' */ 17 return retval; 1 Elong: return ERR_PTR(-ENAMETOOLONG); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, struct path *pwd) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); *root = fs->root; 5 *pwd = fs->pwd; } while (read_seqcount_retry(&fs->seq, seq)); } /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which * includes the ending '\0' character), or a negative * error value. 
So libc would do something like
 *
 *	char *getcwd(char * buf, size_t size)
 *	{
 *		int retval;
 *
 *		retval = sys_getcwd(buf, size);
 *		if (retval >= 0)
 *			return buf;
 *		errno = -retval;
 *		return NULL;
 *	}
 */
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
	int error;
	struct path pwd, root;
	char *page = __getname();

	if (!page)
		return -ENOMEM;

	rcu_read_lock();
	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);

	/* An unlinked working directory is reported as -ENOENT */
	error = -ENOENT;
	if (!d_unlinked(pwd.dentry)) {
		unsigned long len;
		char *cwd = page + PATH_MAX;
		int buflen = PATH_MAX;

		/* Build the path backwards from the end of the page */
		prepend(&cwd, &buflen, "\0", 1);
		error = prepend_path(&pwd, &root, &cwd, &buflen);
		rcu_read_unlock();

		if (error < 0)
			goto out;

		/* Unreachable from current root */
		if (error > 0) {
			error = prepend_unreachable(&cwd, &buflen);
			if (error)
				goto out;
		}

		/* len includes the terminating NUL */
		error = -ERANGE;
		len = PATH_MAX + page - cwd;
		if (len <= size) {
			error = len;
			if (copy_to_user(buf, cwd, len))
				error = -EFAULT;
		}
	} else {
		rcu_read_unlock();
	}

out:
	__putname(page);
	return error;
}

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns 1 if new_dentry is a subdirectory of old_dentry (at any depth).
 * Returns 0 otherwise.
* Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { int result; unsigned seq; if (new_dentry == old_dentry) return 1; 66 do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); /* 31 * Need rcu_readlock to protect against the d_parent trashing * due to d_move */ rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) 31 result = 1; 31 else result = 0; rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); 31 66 return result; } static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { 21 struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) 21 return D_WALK_SKIP; 2 if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; 21 dentry->d_lockref.count--; 2 } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill, NULL); } 21 void d_tmpfile(struct dentry *dentry, struct inode *inode) { inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || 3 !hlist_unhashed(&dentry->d_u.d_alias) || 3 !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 3 dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY, &d_hash_shift, &d_hash_mask, 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { unsigned int loop; /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, 0, &d_hash_shift, &d_hash_mask, 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; EXPORT_SYMBOL(names_cachep); EXPORT_SYMBOL(d_genocide); void __init vfs_caches_init_early(void) { dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
/* * Rusty Russell (C)2000 -- This code is GPL. * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/protocol.h> #include <net/netfilter/nf_queue.h> #include <net/dst.h> #include "nf_internals.h" /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. * * Once the queue is registered it must reinject all packets it * receives, no matter what. */ /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ 30 WARN_ON(rcu_access_pointer(net->nf.queue_handler)); 30 rcu_assign_pointer(net->nf.queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ void nf_unregister_queue_handler(struct net *net) { RCU_INIT_POINTER(net->nf.queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. 
*/ if (state->in) dev_put(state->in); if (state->out) dev_put(state->out); if (state->sk) sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct net_device *physdev; physdev = nf_bridge_get_physindev(entry->skb); if (physdev) dev_put(physdev); physdev = nf_bridge_get_physoutdev(entry->skb); if (physdev) dev_put(physdev); } #endif } EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; if (state->in) dev_hold(state->in); if (state->out) dev_hold(state->out); if (state->sk) sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct net_device *physdev; physdev = nf_bridge_get_physindev(entry->skb); if (physdev) dev_hold(physdev); physdev = nf_bridge_get_physoutdev(entry->skb); if (physdev) dev_hold(physdev); } #endif } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) qh->nf_hook_drop(net, ops); rcu_read_unlock(); } /* * Any packet that leaves via this function must come back * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum) { int status = -ENOENT; struct nf_queue_entry *entry = NULL; const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; struct net *net = state->net; /* QUEUE == DROP if no one is waiting, to be safe. 
*/ qh = rcu_dereference(net->nf.queue_handler); if (!qh) { status = -ESRCH; goto err; } afinfo = nf_get_afinfo(state->pf); if (!afinfo) goto err; entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); if (!entry) { status = -ENOMEM; goto err; } *entry = (struct nf_queue_entry) { .skb = skb, .elem = elem, .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; nf_queue_entry_get_refs(entry); skb_dst_force(skb); afinfo->saveroute(skb, entry); status = qh->outfn(entry, queuenum); if (status < 0) { nf_queue_entry_release_refs(entry); goto err; } return 0; err: kfree(entry); return status; } void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { struct sk_buff *skb = entry->skb; struct nf_hook_ops *elem = entry->elem; const struct nf_afinfo *afinfo; int err; nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) verdict = elem->hook(elem->priv, skb, &entry->state); if (verdict == NF_ACCEPT) { afinfo = nf_get_afinfo(entry->state.pf); if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0) verdict = NF_DROP; } entry->state.thresh = INT_MIN; if (verdict == NF_ACCEPT) { next_hook: verdict = nf_iterate(entry->state.hook_list, skb, &entry->state, &elem); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, elem, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) goto next_hook; kfree_skb(skb); } break; case NF_STOLEN: break; default: kfree_skb(skb); } kfree(entry); } EXPORT_SYMBOL(nf_reinject);
/* * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple * pages. A pagevec is a multipage container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H /* 14 pointers + two long's align the pagevec structure to a power of two */ #define PAGEVEC_SIZE 14 struct page; struct address_space; struct pagevec { unsigned long nr; unsigned long cold; struct page *pages[PAGEVEC_SIZE]; }; void __pagevec_release(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_entries, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages); static inline void pagevec_init(struct pagevec *pvec, int cold) { pvec->nr = 0; pvec->cold = cold; } static inline void pagevec_reinit(struct pagevec *pvec) { pvec->nr = 0; } 929 14 static inline unsigned pagevec_count(struct pagevec *pvec) { return pvec->nr; } static inline unsigned pagevec_space(struct pagevec *pvec) { return PAGEVEC_SIZE - pvec->nr; } 2260 /* * Add a page to a pagevec. Returns the number of slots still available. */ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) { 3106 pvec->pages[pvec->nr++] = page; return pagevec_space(pvec); } static inline void pagevec_release(struct pagevec *pvec) { if (pagevec_count(pvec)) __pagevec_release(pvec); 3152 } #endif /* _LINUX_PAGEVEC_H */
/* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * Alan Cox : sendmsg/recvmsg basics. * Tom Dyas : Export net symbols. * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * Alan Cox : Added thread locking to sys_* calls * for sockets. May have errors at the * moment. * Kevin Buhr : Fixed the dumb errors in the above. * Andi Kleen : Some small cleanups, optimizations, * and fixed a copy_from_user() bug. * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) * Tigran Aivazian : Made listen(2) backlog sanity checks * protocol-independent * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * * This module is effectively the top level interface to the BSD socket * paradigm. 
* * Based upon Swansea University Computer Society NET3.039 */ #include <linux/mm.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/thread_info.h> #include <linux/rcupdate.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/if_bridge.h> #include <linux/if_frad.h> #include <linux/if_vlan.h> #include <linux/ptp_classify.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/kmod.h> #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <net/compat.h> #include <net/wext.h> #include <net/cls_cgroup.h> #include <net/sock.h> #include <linux/netfilter.h> #include <linux/if_tun.h> #include <linux/ipv6_route.h> #include <linux/route.h> #include <linux/sockios.h> #include <linux/atalk.h> #include <net/busy_poll.h> #include <linux/errqueue.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; #endif static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to); static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); static unsigned int sock_poll(struct file *file, struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #endif static int sock_fasync(int fd, struct 
file *filp, int on);
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags);

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 */

/* file_operations shared by every socket file */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

/*
 *	The protocol list. Each protocol is registered in here.
 */

static DEFINE_SPINLOCK(net_family_lock);
/* One slot per protocol family number, NPROTO slots in total */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

/*
 *	Statistics counters of the socket lists
 */

static DEFINE_PER_CPU(int, sockets_in_use);

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *	move_addr_to_kernel	-	copy a socket address into kernel space
 *	@uaddr: Address in user space
 *	@kaddr: Address in kernel space
 *	@ulen: Length in user space
 *
 *	The address is copied into kernel space. If the provided address is
 *	too long an error code of -EINVAL is returned. If the copy gives
 *	invalid addresses -EFAULT is returned. On a success 0 is returned.
*/ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr) { 1310 if (ulen < 0 || ulen > sizeof(struct sockaddr_storage)) return -EINVAL; 422 if (ulen == 0) return 0; 1299 if (copy_from_user(kaddr, uaddr, ulen)) return -EFAULT; 1299 return audit_sockaddr(ulen, kaddr); } /** * move_addr_to_user - copy an address to user space * @kaddr: kernel space address * @klen: length of address in kernel * @uaddr: user space address * @ulen: pointer to user length field * * The value pointed to by ulen on entry is the buffer length available. * This is overwritten with the buffer space used. -EINVAL is returned * if an overlong buffer is specified or a negative buffer size. -EFAULT * is returned if either the buffer or the length field are not * accessible. * After copying the data up to the limit the user specifies, the true * length of the data is written over the length limit the user * specified. Zero is returned for a success. */ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { int err; int len; 129 BUG_ON(klen > sizeof(struct sockaddr_storage)); 129 err = get_user(len, ulen); if (err) return err; 125 if (len > klen) len = klen; if (len < 0) return -EINVAL; 124 if (len) { 64 if (audit_sockaddr(klen, kaddr)) return -ENOMEM; 64 if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } /* * "fromlen shall refer to the value before truncation.." 
* 1003.1g */ 129 return __put_user(klen, ulen); } static struct kmem_cache *sock_inode_cachep __read_mostly; static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; struct socket_wq *wq; 936 ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; 936 wq = kmalloc(sizeof(*wq), GFP_KERNEL); if (!wq) { kmem_cache_free(sock_inode_cachep, ei); return NULL; } 936 init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; wq->flags = 0; RCU_INIT_POINTER(ei->socket.wq, wq); ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; ei->socket.ops = NULL; ei->socket.sk = NULL; ei->socket.file = NULL; 936 return &ei->vfs_inode; } static void sock_destroy_inode(struct inode *inode) { struct socket_alloc *ei; struct socket_wq *wq; 496 ei = container_of(inode, struct socket_alloc, vfs_inode); wq = rcu_dereference_protected(ei->socket.wq, 1); kfree_rcu(wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; 28 inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; return 0; } static const struct super_operations sockfs_ops = { .alloc_inode = sock_alloc_inode, .destroy_inode = sock_destroy_inode, .statfs = simple_statfs, }; /* * sockfs_dname() is called from d_path(). 
*/ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", 4 d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { .d_dname = sockfs_dname, }; static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_pseudo(fs_type, "socket:", &sockfs_ops, &sockfs_dentry_operations, SOCKFS_MAGIC); } static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", .mount = sockfs_mount, .kill_sb = kill_anon_super, }; /* * Obtains the first available file descriptor and sets it up for use. * * These functions create file structures and maps them to fd space * of the current process. On success it returns file descriptor * and file struct implicitly stored in sock->file. * Note that another thread may close file descriptor before we return * from this function. We use the fact that now we do not refer * to socket after mapping. If one day we will need it, this * function will increment ref. count on file by 1. * * In any case returned fd MAY BE not valid! * This race condition is unavoidable * with shared fd spaces, we cannot solve it inside kernel, * but we take care of internal coherence yet. 
*/ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { 833 struct qstr name = { .name = "" }; struct path path; struct file *file; if (dname) { 35 name.name = dname; name.len = strlen(name.name); 803 } else if (sock->sk) { 803 name.name = sock->sk->sk_prot_creator->name; name.len = strlen(name.name); } 833 path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); if (unlikely(!path.dentry)) return ERR_PTR(-ENOMEM); 833 path.mnt = mntget(sock_mnt); d_instantiate(path.dentry, SOCK_INODE(sock)); file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &socket_file_ops); if (IS_ERR(file)) { /* drop dentry, keep inode */ ihold(d_inode(path.dentry)); path_put(&path); return file; } 833 sock->file = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->private_data = sock; 833 return file; } EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); if (unlikely(fd < 0)) return fd; 697 newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { 697 fd_install(fd, newfile); return fd; } put_unused_fd(fd); return PTR_ERR(newfile); } struct socket *sock_from_file(struct file *file, int *err) { 3720 if (file->f_op == &socket_file_ops) 3713 return file->private_data; /* set in sock_map_fd */ 12 *err = -ENOTSOCK; return NULL; } EXPORT_SYMBOL(sock_from_file); /** * sockfd_lookup - Go from a file number to its socket slot * @fd: file handle * @err: pointer to an error code return * * The file handle passed in is locked and the socket it is bound * too is returned. If an error occurs the err pointer is overwritten * with a negative errno code and NULL is returned. The function checks * for both invalid handles and passing a handle which is not a socket. * * On a success the socket object pointer is returned. 
*/ struct socket *sockfd_lookup(int fd, int *err) { struct file *file; struct socket *sock; 1240 file = fget(fd); if (!file) { 22 *err = -EBADF; return NULL; } 1233 sock = sock_from_file(file, err); 1231 if (!sock) 24 fput(file); return sock; } EXPORT_SYMBOL(sockfd_lookup); static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { 2877 struct fd f = fdget(fd); struct socket *sock; *err = -EBADF; if (f.file) { 2870 sock = sock_from_file(f.file, err); if (likely(sock)) { 2865 *fput_needed = f.flags; return sock; } 10 fdput(f); } 2877 return NULL; } #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) static ssize_t sockfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { if (!strcmp(name, XATTR_NAME_SOCKPROTONAME)) { if (value) { if (dentry->d_name.len + 1 > size) return -ERANGE; memcpy(value, dentry->d_name.name, dentry->d_name.len + 1); 6 } 3 return dentry->d_name.len + 1; } return -EOPNOTSUPP; } 2 static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { 1 ssize_t len; ssize_t used = 0; 2 len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; 6 used += len; if (buffer) { if (size < used) return -ERANGE; buffer += len; } len = (XATTR_NAME_SOCKPROTONAME_LEN + 1); used += len; 6 if (buffer) { if (size < used) return -ERANGE; memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len); 6 buffer += len; 5 } 4 return used; } static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) 1 { 1 int err = simple_setattr(dentry, iattr); 4 if (!err && (iattr->ia_valid & ATTR_UID)) { 3 struct socket *sock = SOCKET_I(d_inode(dentry)); 5 sock->sk->sk_uid = iattr->ia_uid; } return err; } 1 static const struct inode_operations sockfs_inode_ops = { 3 .getxattr = sockfs_getxattr, .listxattr = sockfs_listxattr, 3 .setattr = 
sockfs_setattr, 1 }; /** 1 * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. 3 */ static struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; sock = SOCKET_I(inode); kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; this_cpu_add(sockets_in_use, 1); return sock; 936 } /** * sock_release - close a socket 936 * @sock: socket to close * * The socket is released from the protocol stack if it has a release * callback, and the inode is then released if the socket is bound to * an inode not a file. */ void sock_release(struct socket *sock) { if (sock->ops) { 936 struct module *owner = sock->ops->owner; sock->ops->release(sock); sock->ops = NULL; module_put(owner); } if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); return; 495 } 414 sock->file = NULL; } EXPORT_SYMBOL(sock_release); 405 10 void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) { u8 flags = *tx_flags; 415 if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP; 496 if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; 496 if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; 96 if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) flags |= SKBTX_ACK_TSTAMP; 496 *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); 86 static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } 101 int 
sock_sendmsg(struct socket *sock, struct msghdr *msg) { 13 int err = security_socket_sendmsg(sock, msg, msg_data_left(msg)); 101 18 return err ?: sock_sendmsg_nosec(sock, msg); } 101 EXPORT_SYMBOL(sock_sendmsg); 96 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, 101 struct kvec *vec, size_t num, size_t size) 12 { iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); 101 return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) 2874 */ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); struct scm_timestamping tss; int empty = 1; struct skb_shared_hwtstamps *shhwtstamps = 2878 skb_hwtstamps(skb); 2875 /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ if (need_software_tstamp && skb->tstamp.tv64 == 0) __net_timestamp(skb); if (need_software_tstamp) { if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) { 227 struct timeval tv; skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); } else { struct timespec ts; skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, sizeof(ts), &ts); } } 36 memset(&tss, 0, sizeof(tss)); if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) 9 empty = 0; 3 if (!empty) put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); 9 } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); 4 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int ack; 5 if (!sock_flag(sk, SOCK_WIFI_STATUS)) return; if (!skb->wifi_acked_valid) return; 36 ack = skb->wifi_acked; 27 25 put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack); } 11 
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); 25 static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) 36 { 25 if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size, int flags) { return sock->ops->recvmsg(sock, msg, size, flags); } int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { int err = security_socket_recvmsg(sock, msg, size, flags); 14 return err ?: sock_recvmsg_nosec(sock, msg, size, flags); } EXPORT_SYMBOL(sock_recvmsg); /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from * @msg: Received message 14 * @vec: Input s/g array for message data 14 * @num: Size of input s/g array 14 * @size: Number of bytes to read * @flags: Message flags (MSG_DONTWAIT, etc...) * * On return the msg structure contains the scatter/gather array passed in the * vec argument. The array is modified so that it consists of the unfilled * portion of the original array. 747 * * The returned value is the total number of bytes received, or an error. 
*/ 747 int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { 747 mm_segment_t oldfs = get_fs(); int result; 749 iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); result = sock_recvmsg(sock, msg, size, flags); set_fs(oldfs); return result; } EXPORT_SYMBOL(kernel_recvmsg); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) { struct socket *sock; int flags; sock = file->private_data; flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */ flags |= more; return kernel_sendpage(sock, page, offset, size, flags); } static ssize_t sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct socket *sock = file->private_data; if (unlikely(!sock->ops->splice_read)) return -EINVAL; return sock->ops->splice_read(sock, ppos, pipe, len, flags); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; 398 struct socket *sock = file->private_data; struct msghdr msg = {.msg_iter = *to, .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK) msg.msg_flags = MSG_DONTWAIT; if (iocb->ki_pos != 0) return -ESPIPE; if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; 20 res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } 20 static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; 62 struct msghdr msg = {.msg_iter = *from, .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) return -ESPIPE; 7 if (file->f_flags & O_NONBLOCK) msg.msg_flags = MSG_DONTWAIT; 62 if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; 62 res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; 50 return res; } /* * 
Atomic setting of ioctl hooks to avoid race * with module unload. */ 706 static DEFINE_MUTEX(br_ioctl_mutex); static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg); void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; mutex_unlock(&br_ioctl_mutex); 705 } 78 EXPORT_SYMBOL(brioctl_set); 705 static DEFINE_MUTEX(vlan_ioctl_mutex); 16 static int (*vlan_ioctl_hook) (struct net *, void __user *arg); 705 void vlan_ioctl_set(int (*hook) (struct net *, void __user *)) { 601 mutex_lock(&vlan_ioctl_mutex); vlan_ioctl_hook = hook; mutex_unlock(&vlan_ioctl_mutex); } EXPORT_SYMBOL(vlan_ioctl_set); static DEFINE_MUTEX(dlci_ioctl_mutex); static int (*dlci_ioctl_hook) (unsigned int, void __user *); void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) { mutex_lock(&dlci_ioctl_mutex); dlci_ioctl_hook = hook; mutex_unlock(&dlci_ioctl_mutex); } EXPORT_SYMBOL(dlci_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { int err; void __user *argp = (void __user *)arg; err = sock->ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down * to the NIC driver. */ if (err == -ENOIOCTLCMD) err = dev_ioctl(net, cmd, argp); return err; } /* * With an ioctl, arg may well be a user mode pointer, but we don't know * what to do with it - that's up to the protocol still. 
*/ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; 101 int pid, err; struct net *net; 335 sock = file->private_data; sk = sock->sk; net = sock_net(sk); if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { err = dev_ioctl(net, cmd, argp); } else #ifdef CONFIG_WEXT_CORE if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { 317 err = dev_ioctl(net, cmd, argp); } else #endif switch (cmd) { case FIOSETOWN: case SIOCSPGRP: err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; f_setown(sock->file, pid, 1); err = 0; break; 58 case FIOGETOWN: case SIOCGPGRP: err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: err = -ENOPKG; if (!br_ioctl_hook) request_module("bridge"); mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) 58 err = br_ioctl_hook(net, cmd, argp); mutex_unlock(&br_ioctl_mutex); break; case SIOCGIFVLAN: 3 case SIOCSIFVLAN: err = -ENOPKG; 3 if (!vlan_ioctl_hook) request_module("8021q"); mutex_lock(&vlan_ioctl_mutex); if (vlan_ioctl_hook) 7 err = vlan_ioctl_hook(net, argp); mutex_unlock(&vlan_ioctl_mutex); break; case SIOCADDDLCI: case SIOCDELDLCI: err = -ENOPKG; if (!dlci_ioctl_hook) request_module("dlci"); 3 3 mutex_lock(&dlci_ioctl_mutex); if (dlci_ioctl_hook) 3 err = dlci_ioctl_hook(cmd, argp); mutex_unlock(&dlci_ioctl_mutex); break; 3 default: err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; 3 } 3 int sock_create_lite(int family, int type, int protocol, struct socket **res) 4 { int err; struct socket *sock = NULL; 4 err = security_socket_create(family, type, protocol, 1); if (err) goto out; 3 sock = sock_alloc(); 3 if (!sock) { err = -ENOMEM; 3 goto out; } 3 sock->type = type; err = security_socket_post_create(sock, family, type, protocol, 1); if (err) 39 goto out_release; out: 59 *res = sock; return err; out_release: sock_release(sock); sock = NULL; goto 
out; } EXPORT_SYMBOL(sock_create_lite); 30 /* No kernel lock held - perfect */ static unsigned int sock_poll(struct file *file, poll_table *wait) { 30 unsigned int busy_flag = 0; struct socket *sock; /* * We can't return errors to poll, so it's either yes or no. */ 30 sock = file->private_data; if (sk_can_busy_loop(sock->sk)) { /* this socket can poll_ll so tell the system call */ busy_flag = POLL_BUSY_LOOP; 30 /* once, only if requested by syscall */ if (wait && (wait->_key & POLL_BUSY_LOOP)) sk_busy_loop(sock->sk, 1); } return busy_flag | sock->ops->poll(file, sock, wait); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; return sock->ops->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) { sock_release(SOCKET_I(inode)); 136 return 0; } /* * Update the socket async list * * Fasync_list locking strategy. * * 1. fasync_list is modified only under process context socket lock * i.e. under semaphore. * 2. 
fasync_list is used under read_lock(&sk->sk_callback_lock) 136 * or under socket lock */ static int sock_fasync(int fd, struct file *filp, int on) { 10 struct socket *sock = filp->private_data; struct sock *sk = sock->sk; struct socket_wq *wq; if (sk == NULL) return -EINVAL; 405 lock_sock(sk); wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk)); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) sock_reset_flag(sk, SOCK_FASYNC); else sock_set_flag(sk, SOCK_FASYNC); release_sock(sk); return 0; } /* This function may be called only under rcu_lock */ int sock_wake_async(struct socket_wq *wq, int how, int band) { 14 if (!wq || !wq->fasync_list) return -1; switch (how) { case SOCK_WAKE_WAITD: if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; 14 goto call_kill; 14 case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ 1 case SOCK_WAKE_IO: call_kill: 13 kill_fasync(&wq->fasync_list, SIGIO, band); break; 14 case SOCK_WAKE_URG: 14 kill_fasync(&wq->fasync_list, SIGURG, band); } return 0; } EXPORT_SYMBOL(sock_wake_async); 97 int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { 96 int err; struct socket *sock; 41 const struct net_proto_family *pf; /* * Check protocol is in range 54 */ if (family < 0 || family >= NPROTO) return -EAFNOSUPPORT; if (type < 0 || type >= SOCK_MAX) return -EINVAL; 70 /* Compatibility. 1 This uglymoron is moved from INET layer to here to avoid deadlock in module load. */ 97 if (family == PF_INET && type == SOCK_PACKET) { static int warned; if (!warned) { warned = 1; pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); } family = PF_PACKET; } err = security_socket_create(family, type, protocol, kern); if (err) return err; 967 /* * Allocate the socket and allow the family to set things up. if 967 * the protocol is 0, the family is instructed to select an appropriate * default. 
*/ sock = sock_alloc(); if (!sock) { net_warn_ratelimited("socket: no more sockets\n"); return -ENFILE; /* Not exactly a match, but its the closest posix thing */ 966 } 2 sock->type = type; #ifdef CONFIG_MODULES /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user * requested real, full-featured networking support upon configuration. * Otherwise module support will break! 966 */ 967 if (rcu_access_pointer(net_families[family]) == NULL) request_module("net-pf-%d", family); #endif rcu_read_lock(); pf = rcu_dereference(net_families[family]); err = -EAFNOSUPPORT; if (!pf) 906 goto out_release; /* * We will call the ->create function, that possibly is in a loadable * module, so we have to bump that loadable module refcnt first. */ if (!try_module_get(pf->owner)) 906 goto out_release; /* Now protected by module ref count */ rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); if (err < 0) goto out_module_put; /* 10 * Now to bump the refcnt of the [loadable] module that owns this * socket at sock_release time we decrement its refcnt. 
*/ 906 if (!try_module_get(sock->ops->owner)) 906 goto out_module_busy; 906 /* * Now that we're done with the ->create function, the [loadable] * module can have its refcnt decremented */ module_put(pf->owner); err = security_socket_post_create(sock, family, type, protocol, kern); if (err) goto out_sock_release; *res = sock; return 0; 897 out_module_busy: err = -EAFNOSUPPORT; out_module_put: sock->ops = NULL; module_put(pf->owner); out_sock_release: sock_release(sock); return err; 840 out_release: rcu_read_unlock(); goto out_sock_release; } EXPORT_SYMBOL(__sock_create); int sock_create(int family, int type, int protocol, struct socket **res) 840 { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create); 840 int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); 76 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { int retval; 86 struct socket *sock; int flags; /* Check the SOCK_* constants for consistency. */ 10 BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 132 return -EINVAL; type &= SOCK_TYPE_MASK; if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 30 retval = sock_create(family, type, protocol, &sock); if (retval < 0) goto out; 828 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); if (retval < 0) goto out_release; out: /* It may be already another descriptor 8) Not kernel problem. */ return retval; out_release: sock_release(sock); return retval; } /* * Create a pair of connected sockets. 
821 */ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, int __user *, usockvec) { struct socket *sock1, *sock2; int fd1, fd2, err; struct file *newfile1, *newfile2; int flags; 699 flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; type &= SOCK_TYPE_MASK; 822 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 2 /* * Obtain the first socket and check if the underlying protocol * supports the socketpair call. */ err = sock_create(family, type, protocol, &sock1); if (err < 0) goto out; 144 err = sock_create(family, type, protocol, &sock2); if (err < 0) goto out_release_1; err = sock1->ops->socketpair(sock1, sock2); if (err < 0) goto out_release_both; 125 fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) { 141 err = fd1; goto out_release_both; } fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { err = fd2; goto out_put_unused_1; } newfile1 = sock_alloc_file(sock1, flags, NULL); if (IS_ERR(newfile1)) { err = PTR_ERR(newfile1); goto out_put_unused_both; 131 } newfile2 = sock_alloc_file(sock2, flags, NULL); if (IS_ERR(newfile2)) { 131 err = PTR_ERR(newfile2); goto out_fput_1; } err = put_user(fd1, &usockvec[0]); if (err) goto out_fput_both; err = put_user(fd2, &usockvec[1]); if (err) 125 goto out_fput_both; audit_fd_pair(fd1, fd2); fd_install(fd1, newfile1); fd_install(fd2, newfile2); 125 /* fd1 and fd2 may be already another descriptors. * Not kernel problem. */ return 0; 125 out_fput_both: fput(newfile2); fput(newfile1); put_unused_fd(fd2); put_unused_fd(fd1); goto out; out_fput_1: fput(newfile1); put_unused_fd(fd2); 85 put_unused_fd(fd1); sock_release(sock2); goto out; 85 out_put_unused_both: put_unused_fd(fd2); 85 out_put_unused_1: put_unused_fd(fd1); out_release_both: sock_release(sock2); out_release_1: sock_release(sock1); out: return err; } 40 /* * Bind a name to a socket. 
Nothing much to do here since it's * the protocol's responsibility to handle the local address. * * We move the socket address to kernel space before we call * the protocol layer (having also checked the address is ok). */ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); 8 if (err >= 0) { err = security_socket_bind(sock, 8 (struct sockaddr *)&address, addrlen); 143 if (!err) err = sock->ops->bind(sock, (struct sockaddr *) &address, addrlen); } fput_light(sock->file, fput_needed); } return err; } /* 258 * Perform a listen. Basically, we allow the protocol to do anything * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ SYSCALL_DEFINE2(listen, int, fd, int, backlog) { struct socket *sock; 257 int err, fput_needed; int somaxconn; 254 sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; 253 if ((unsigned int)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); 258 if (!err) err = sock->ops->listen(sock, backlog); 258 fput_light(sock->file, fput_needed); } return err; } /* * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new 35 * connected fd. We collect the address of the connector in kernel * space and move it to user at the very end. This is unclean because * we open the socket then return an error. * * 1003.1g adds the ability to recvmsg() to query connection pending * status to recvmsg. We need to add that support in a way thats * clean when we restucture accept also. 
*/

SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	struct sockaddr_storage address;

	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	/* Translate SOCK_NONBLOCK where its value differs from O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	newsock = sock_alloc();
	if (!newsock)
		goto out_put;

	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

	newfd = get_unused_fd_flags(flags);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
	if (IS_ERR(newfile)) {
		err = PTR_ERR(newfile);
		put_unused_fd(newfd);
		sock_release(newsock);
		goto out_put;
	}

	/* From here on newsock has a file: failures go through out_fd/fput(). */
	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;

	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	if (upeer_sockaddr) {
		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(&address,
					len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	fd_install(newfd, newfile);
	err = newfd;

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd:
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}

/* accept(2) is accept4(2) with no flags. */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen)
{
	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

/*
 *	Attempt to connect to a socket with the server address.
The address * is in user space so we verify it is OK and move it to kernel space. * * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to 18 * break bindings * * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and * other SEQPACKET protocols that take time to connect() as it doesn't 37 * include the -EINPROGRESS status for such sockets. */ 42 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, 20 int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = move_addr_to_kernel(uservaddr, addrlen, &address); if (err < 0) goto out_put; err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen); if (err) goto out_put; err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags); out_put: fput_light(sock->file, fput_needed); out: 472 return err; } /* * Get the local address ('name') of a socket object. Move the obtained * name to user space. */ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) 467 { struct socket *sock; struct sockaddr_storage address; int len, err, fput_needed; 465 sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; 463 err = security_socket_getsockname(sock); if (err) goto out_put; 447 err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); 450 if (err) goto out_put; err = move_addr_to_user(&address, len, usockaddr, usockaddr_len); out_put: fput_light(sock->file, fput_needed); out: return err; 19 } /* * Get the remote address ('name') of a socket object. Move the obtained * name to user space. 
*/ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { struct socket *sock; 18 struct sockaddr_storage address; int len, err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); 18 if (sock != NULL) { err = security_socket_getpeername(sock); if (err) { 17 fput_light(sock->file, fput_needed); return err; } 18 err = 19 sock->ops->getname(sock, (struct sockaddr *)&address, &len, 1); if (!err) err = move_addr_to_user(&address, len, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } return err; 13 } /* * Send a datagram to a given address. We move the address into kernel * space and check the user space data area is readable before invoking * the protocol. */ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, 11 unsigned int, flags, struct sockaddr __user *, addr, int, addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; 11 struct iovec iov; int fput_needed; 5 err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); if (unlikely(err)) 11 return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); 13 if (!sock) goto out; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); 488 if (err < 0) goto out_put; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; err = sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); out: return err; 487 } /* * Send a datagram down a socket. 481 */ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned int, flags) { 314 return sys_sendto(fd, buff, len, flags, NULL, 0); } 313 /* * Receive a frame from the socket and optionally record the address of the * sender. 
We verify the buffers are writable and if needed move the * sender address from kernel to user space. 27 */ 480 SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags, struct sockaddr __user *, addr, int __user *, addr_len) 401 { struct socket *sock; 407 struct iovec iov; struct msghdr msg; struct sockaddr_storage address; int err, err2; int fput_needed; err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; /* Save some cycles and don't copy the address if not needed */ msg.msg_name = addr ? (struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; 293 msg.msg_iocb = NULL; msg.msg_flags = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } fput_light(sock->file, fput_needed); 293 out: return err; } 292 /* * Receive a datagram from a socket. */ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, unsigned int, flags) { return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); } 3 292 /* * Set a socket option. Because we don't know the option lengths we have 188 * to pass the user mode parameter for the protocols to sort out. 
2 */ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, 1 char __user *, optval, int, optlen) { int err, fput_needed; 201 struct socket *sock; 202 if (optlen < 0) return -EINVAL; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { err = security_socket_setsockopt(sock, level, optname); if (err) 158 goto out_put; if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, optval, optlen); else err = sock->ops->setsockopt(sock, level, optname, optval, optlen); out_put: fput_light(sock->file, fput_needed); } return err; } /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, char __user *, optval, int __user *, optlen) { int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { err = security_socket_getsockopt(sock, level, optname); if (err) goto out_put; if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); else err = sock->ops->getsockopt(sock, level, optname, optval, optlen); out_put: fput_light(sock->file, fput_needed); } return err; } /* * Shutdown a socket. */ SYSCALL_DEFINE2(shutdown, int, fd, int, how) { int err, fput_needed; struct socket *sock; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { err = security_socket_shutdown(sock, how); if (!err) err = sock->ops->shutdown(sock, how); fput_light(sock->file, fput_needed); } return err; } /* A couple of helpful macros for getting the address of the 32/64 bit * fields which are the same type (int / unsigned) on our platforms. */ #define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) struct used_address { struct sockaddr_storage name; 50 unsigned int name_len; }; static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr __user *umsg, struct sockaddr __user **save_addr, struct iovec **iov) { struct sockaddr __user *uaddr; 49 struct iovec __user *uiov; 49 size_t nr_segs; ssize_t err; 50 if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || __get_user(uaddr, &umsg->msg_name) || __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || __get_user(uiov, &umsg->msg_iov) || __get_user(nr_segs, &umsg->msg_iovlen) || __get_user(kmsg->msg_control, &umsg->msg_control) || __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; if (!uaddr) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) return -EINVAL; if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) *save_addr = uaddr; if (uaddr && kmsg->msg_namelen) { if (!save_addr) { err = move_addr_to_kernel(uaddr, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; } } else { kmsg->msg_name = NULL; kmsg->msg_namelen = 0; } if (nr_segs > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; return import_iovec(save_addr ? 
READ : WRITE, uiov, nr_segs, UIO_FASTIOV, iov, &kmsg->msg_iter); } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20] __attribute__ ((aligned(sizeof(__kernel_size_t)))); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; int ctl_len; ssize_t err; msg_sys->msg_name = &address; if (MSG_CMSG_COMPAT & flags) err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov); else err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); if (err < 0) return err; err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) goto out_freeiov; ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = 1697 cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out_freeiov; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) goto out_freeiov; 1697 } err = -EFAULT; /* 1697 * Careful! Before this, msg_sys->msg_control contains a user pointer. 20 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted * checking falls down on this. */ if (copy_from_user(ctl_buf, 1696 (void __user __force *)msg_sys->msg_control, ctl_len)) goto out_freectl; 1695 msg_sys->msg_control = ctl_buf; } 156 msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) msg_sys->msg_flags |= MSG_DONTWAIT; 142 /* * If this is sendmmsg() and current destination address is same as * previously succeeded address, omit asking LSM's decision. * used_address->name_len is initialized to UINT_MAX so that the first * destination address never matches. 
*/ if (used_address && msg_sys->msg_name && used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } err = sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. */ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; 1694 if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); 16 } out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out_freeiov: kfree(iov); 1694 return err; 156 } 38 /* 3 * BSD sendmsg interface */ 1694 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; 1058 sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) 326 goto out; err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); fput_light(sock->file, fput_needed); 1665 out: 38 return err; } 1667 1668 SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_sendmsg(fd, msg, flags); } /* * Linux sendmmsg interface */ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags) 729 { int fput_needed, err, datagrams; struct socket *sock; struct mmsghdr __user *entry; 722 struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; 696 struct used_address used_address; 697 if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; used_address.name_len = UINT_MAX; entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; while (datagrams < vlen) { if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags, &used_address); if (err < 0) 
break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { err = ___sys_sendmsg(sock, 1074 (struct user_msghdr __user *)entry, &msg_sys, flags, &used_address); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) 1074 break; ++datagrams; if (msg_data_left(&msg_sys)) break; } 1074 1073 fput_light(sock->file, fput_needed); 1073 /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; 1058 return err; } SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_sendmmsg(fd, mmsg, vlen, flags); } 1058 static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) 1058 { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; 402 unsigned long cmsg_ptr; int total_len, len; ssize_t err; 402 /* kernel mode address */ struct sockaddr_storage addr; 403 /* user mode address pointers */ struct sockaddr __user *uaddr; int __user *uaddr_len = COMPAT_NAMELEN(msg); msg_sys->msg_name = &addr; if (MSG_CMSG_COMPAT & flags) err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov); else err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) return err; total_len = iov_iter_count(&msg_sys->msg_iter); cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); 424 /* We assume all kernel code knows the size of sockaddr_storage */ msg_sys->msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = (nosec ? 
sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, total_len, flags); if (err < 0) goto out_freeiov; 424 len = err; if (uaddr != NULL) { err = move_addr_to_user(&addr, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) goto out_freeiov; 424 } 5 err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), 423 COMPAT_FLAGS(msg)); if (err) 304 goto out_freeiov; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); else err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); 17 if (err) 423 goto out_freeiov; err = len; out_freeiov: kfree(iov); return err; 304 } 101 /* * BSD recvmsg interface */ long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags) 304 { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; 304 sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); 304 fput_light(sock->file, fput_needed); out: return err; } 313 314 SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) { if (flags & MSG_CMSG_COMPAT) return -EINVAL; return __sys_recvmsg(fd, msg, flags); } /* * Linux recvmmsg interface */ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, 186 unsigned int flags, struct timespec *timeout) { int fput_needed, err, datagrams; struct socket *sock; 185 struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; 78 struct msghdr msg_sys; struct timespec end_time; 78 if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, timeout->tv_nsec)) return -EINVAL; datagrams = 0; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) return err; err = sock_error(sock->sk); if (err) { datagrams = err; goto out_put; } entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; while (datagrams < vlen) { /* * No need to ask LSM for more than the first datagram. 
*/ 243 if (MSG_CMSG_COMPAT & flags) { 72 err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry, &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); 243 ++compat_entry; } else { 3 err = ___sys_recvmsg(sock, (struct user_msghdr __user *)entry, 241 &msg_sys, flags & ~MSG_WAITFORONE, datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); ++entry; } if (err) 241 break; ++datagrams; /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */ 239 if (flags & MSG_WAITFORONE) 239 flags |= MSG_DONTWAIT; if (timeout) { ktime_get_ts(timeout); *timeout = timespec_sub(end_time, *timeout); 232 if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; } /* Timeout, return less than vlen datagrams */ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0) break; } /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; 232 } 232 if (err == 0) goto out_put; if (datagrams == 0) { 22 datagrams = err; goto out_put; 232 } 69 /* * We may return less entries than requested (vlen) if the 7 * sock is non block and there aren't enough datagrams... */ if (err != -EAGAIN) { /* * ... or if recvmsg returns an error after we 65 * received some datagrams, where we record the * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). 
*/ sock->sk->sk_err = -err; 228 } out_put: fput_light(sock->file, fput_needed); 103 return datagrams; } 59 SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct timespec __user *, timeout) { int datagrams; struct timespec timeout_sys; if (flags & MSG_CMSG_COMPAT) return -EINVAL; 54 if (!timeout) return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys))) return -EFAULT; 36 datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); if (datagrams > 0 && 161 copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys))) datagrams = -EFAULT; return datagrams; } #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), AL(4), AL(5), AL(4) }; #undef AL /* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn't need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. 
*/ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = sys_listen(a0, a1); break; case SYS_ACCEPT: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = sys_send(a0, (void __user *)a1, a[2], a[3]); break; case SYS_SENDTO: err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = sys_recv(a0, (void __user *)a1, a[2], a[3]); break; case SYS_RECVFROM: err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]); break; case SYS_SENDMMSG: err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]); break; case SYS_RECVMMSG: err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct timespec __user *)a[4]); break; case SYS_ACCEPT4: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; 
break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */ /** * sock_register - add a socket protocol handler * @ops: description of protocol * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the * socket interface. The value ops->family corresponds to the * socket system call protocol family. */ int sock_register(const struct net_proto_family *ops) { int err; if (ops->family >= NPROTO) { pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); return -ENOBUFS; } spin_lock(&net_family_lock); if (rcu_dereference_protected(net_families[ops->family], lockdep_is_held(&net_family_lock))) err = -EEXIST; else { rcu_assign_pointer(net_families[ops->family], ops); err = 0; } spin_unlock(&net_family_lock); pr_info("NET: Registered protocol family %d\n", ops->family); return err; } EXPORT_SYMBOL(sock_register); /** * sock_unregister - remove a protocol handler * @family: protocol family to remove * * This function is called by a protocol handler that wants to * remove its address family, and have it unlinked from the * new socket creation. * * If protocol handler is a module, then it can use module reference * counts to protect against new references. If protocol handler is not * a module then it needs to provide its own protection in * the ops->create routine. */ void sock_unregister(int family) { BUG_ON(family < 0 || family >= NPROTO); spin_lock(&net_family_lock); RCU_INIT_POINTER(net_families[family], NULL); spin_unlock(&net_family_lock); synchronize_rcu(); pr_info("NET: Unregistered protocol family %d\n", family); } EXPORT_SYMBOL(sock_unregister); static int __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); if (err) goto out; /* * Initialize skbuff SLAB cache */ skb_init(); /* * Initialize the protocols module. 
*/ init_inodecache(); err = register_filesystem(&sock_fs_type); if (err) goto out_fs; sock_mnt = kern_mount(&sock_fs_type); if (IS_ERR(sock_mnt)) { err = PTR_ERR(sock_mnt); goto out_mount; } /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER err = netfilter_init(); if (err) goto out; #endif ptp_classifier_init(); out: return err; out_mount: unregister_filesystem(&sock_fs_type); out_fs: goto out; } core_initcall(sock_init); /* early initcall */ static int __init jit_init(void) { #ifdef CONFIG_BPF_JIT_ALWAYS_ON bpf_jit_enable = 1; #endif return 0; } pure_initcall(jit_init); #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { int cpu; int counter = 0; for_each_possible_cpu(cpu) counter += per_cpu(sockets_in_use, cpu); /* It can be negative, by the way. 8) */ if (counter < 0) counter = 0; seq_printf(seq, "sockets: used %d\n", counter); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_COMPAT static int do_siocgstamp(struct net *net, struct socket *sock, unsigned int cmd, void __user *up) { mm_segment_t old_fs = get_fs(); struct timeval ktv; int err; set_fs(KERNEL_DS); 2 err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); set_fs(old_fs); if (!err) err = compat_put_timeval(&ktv, up); 2 2 return err; } 2 static int do_siocgstampns(struct net *net, struct socket *sock, unsigned int cmd, void __user *up) { mm_segment_t old_fs = get_fs(); struct timespec kts; int err; set_fs(KERNEL_DS); err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); set_fs(old_fs); if (!err) 7 err = compat_put_timespec(&kts, up); return err; } static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32) { struct ifreq __user *uifr; 5 int err; uifr = compat_alloc_user_space(sizeof(struct ifreq)); if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) return -EFAULT; err = dev_ioctl(net, SIOCGIFNAME, uifr); if (err) 4 return err; if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq))) return -EFAULT; return 0; } 3 
static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) { struct compat_ifconf ifc32; struct ifconf ifc; struct ifconf __user *uifc; struct compat_ifreq __user *ifr32; struct ifreq __user *ifr; unsigned int i, j; int err; 2 if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; memset(&ifc, 0, sizeof(ifc)); 2 if (ifc32.ifcbuf == 0) { ifc32.ifc_len = 0; ifc.ifc_len = 0; ifc.ifc_req = NULL; 1 uifc = compat_alloc_user_space(sizeof(struct ifconf)); } else { size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) * sizeof(struct ifreq); uifc = compat_alloc_user_space(sizeof(struct ifconf) + len); ifc.ifc_len = len; ifr = ifc.ifc_req = (void __user *)(uifc + 1); ifr32 = compat_ptr(ifc32.ifcbuf); for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) { if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq))) return -EFAULT; ifr++; ifr32++; } } if (copy_to_user(uifc, &ifc, sizeof(struct ifconf))) 14 return -EFAULT; err = dev_ioctl(net, SIOCGIFCONF, uifc); 10 if (err) return err; 1 if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) return -EFAULT; ifr = ifc.ifc_req; 9 ifr32 = compat_ptr(ifc32.ifcbuf); for (i = 0, j = 0; i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len; i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) { if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq))) return -EFAULT; ifr32++; 9 ifr++; } 8 if (ifc32.ifcbuf == 0) { /* Translate from 64-bit structure multiple to * a 32-bit one. 
9 */ i = ifc.ifc_len; i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq)); 9 ifc32.ifc_len = i; } else { ifc32.ifc_len = i; } 9 if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf))) return -EFAULT; 9 return 0; } 8 static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) { 8 struct compat_ethtool_rxnfc __user *compat_rxnfc; bool convert_in = false, convert_out = false; 8 size_t buf_size = ALIGN(sizeof(struct ifreq), 8); struct ethtool_rxnfc __user *rxnfc; struct ifreq __user *ifr; u32 rule_cnt = 0, actual_rule_cnt; 9 u32 ethcmd; u32 data; int ret; 1 if (get_user(data, &ifr32->ifr_ifru.ifru_data)) 9 return -EFAULT; compat_rxnfc = compat_ptr(data); 8 if (get_user(ethcmd, &compat_rxnfc->cmd)) return -EFAULT; /* Most ethtool structures are defined without padding. * Unfortunately struct ethtool_rxnfc is an exception. */ switch (ethcmd) { default: break; case ETHTOOL_GRXCLSRLALL: /* Buffer size is variable */ if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) return -EFAULT; if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) return -ENOMEM; buf_size += rule_cnt * sizeof(u32); /* fall through */ case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: 109 case ETHTOOL_GRXCLSRULE: case ETHTOOL_SRXCLSRLINS: convert_out = true; 109 /* fall through */ case ETHTOOL_SRXCLSRLDEL: buf_size += sizeof(struct ethtool_rxnfc); convert_in = true; break; } ifr = compat_alloc_user_space(buf_size); 108 rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8); if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ)) return -EFAULT; 3 if (put_user(convert_in ? rxnfc : compat_ptr(data), &ifr->ifr_ifru.ifru_data)) 3 return -EFAULT; 2 if (convert_in) { /* We expect there to be holes between fs.m_ext and * fs.ring_cookie and at the end of fs, but nowhere else. 
*/ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + sizeof(compat_rxnfc->fs.m_ext) != offsetof(struct ethtool_rxnfc, fs.m_ext) + sizeof(rxnfc->fs.m_ext)); BUILD_BUG_ON( offsetof(struct compat_ethtool_rxnfc, fs.location) - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != offsetof(struct ethtool_rxnfc, fs.location) - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); 107 if (copy_in_user(rxnfc, compat_rxnfc, 5 (void __user *)(&rxnfc->fs.m_ext + 1) - (void __user *)rxnfc) || copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, (void __user *)(&rxnfc->fs.location + 1) - 107 (void __user *)&rxnfc->fs.ring_cookie)) return -EFAULT; if (ethcmd == ETHTOOL_GRXCLSRLALL) { if (put_user(rule_cnt, &rxnfc->rule_cnt)) 107 return -EFAULT; } else if (copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, sizeof(rxnfc->rule_cnt))) return -EFAULT; } ret = dev_ioctl(net, SIOCETHTOOL, ifr); if (ret) return ret; if (convert_out) { if (copy_in_user(compat_rxnfc, rxnfc, (const void __user *)(&rxnfc->fs.m_ext + 1) - (const void __user *)rxnfc) || 5 copy_in_user(&compat_rxnfc->fs.ring_cookie, &rxnfc->fs.ring_cookie, (const void __user *)(&rxnfc->fs.location + 1) - (const void __user *)&rxnfc->fs.ring_cookie) || 5 copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, sizeof(rxnfc->rule_cnt))) return -EFAULT; 5 2 if (ethcmd == ETHTOOL_GRXCLSRLALL) { /* As an optimisation, we only copy the actual * number of rules that the underlying 3 * function returned. Since Mallory might * change the rule count in user memory, we * check that it is less than the rule count * originally given (as the user buffer size), * which has been range-checked. 
107 */ if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) return -EFAULT; if (actual_rule_cnt < rule_cnt) 36 rule_cnt = actual_rule_cnt; if (copy_in_user(&compat_rxnfc->rule_locs[0], &rxnfc->rule_locs[0], rule_cnt * sizeof(u32))) return -EFAULT; } } return 0; } static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { void __user *uptr; compat_uptr_t uptr32; struct ifreq __user *uifr; uifr = compat_alloc_user_space(sizeof(*uifr)); if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq))) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) return -EFAULT; uptr = compat_ptr(uptr32); if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc)) return -EFAULT; return dev_ioctl(net, SIOCWANDEV, uifr); } static int bond_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *ifr32) { struct ifreq kifr; mm_segment_t old_fs; int err; switch (cmd) { case SIOCBONDENSLAVE: 1 case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq))) 1 return -EFAULT; old_fs = get_fs(); 1 set_fs(KERNEL_DS); err = dev_ioctl(net, cmd, (struct ifreq __user __force *) &kifr); set_fs(old_fs); 1 return err; default: return -ENOIOCTLCMD; } } /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { 6 struct ifreq __user *u_ifreq64; char tmp_buf[IFNAMSIZ]; void __user *data64; u32 data32; 6 if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]), IFNAMSIZ)) return -EFAULT; 6 if (get_user(data32, &u_ifreq32->ifr_ifru.ifru_data)) return -EFAULT; data64 = compat_ptr(data32); u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64)); 20 if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0], IFNAMSIZ)) return -EFAULT; if (put_user(data64, &u_ifreq64->ifr_ifru.ifru_data)) return -EFAULT; return dev_ioctl(net, cmd, u_ifreq64); 
} static int dev_ifsioc(struct net *net, struct socket *sock, unsigned int cmd, struct compat_ifreq __user *uifr32) { struct ifreq __user *uifr; int err; 85 uifr = compat_alloc_user_space(sizeof(*uifr)); if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) return -EFAULT; 84 err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); 84 if (!err) { switch (cmd) { case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: case SIOCGIFMEM: 84 case SIOCGIFHWADDR: case SIOCGIFINDEX: case SIOCGIFADDR: 85 case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCGMIIPHY: case SIOCGMIIREG: if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) err = -EFAULT; 228 break; } } return err; 227 } static int compat_sioc_ifmap(struct net *net, unsigned int cmd, 168 struct compat_ifreq __user *uifr32) { struct ifreq ifr; struct compat_ifmap __user *uifmap32; mm_segment_t old_fs; int err; uifmap32 = &uifr32->ifr_ifru.ifru_map; err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); err |= get_user(ifr.ifr_map.port, &uifmap32->port); 21 if (err) return -EFAULT; old_fs = get_fs(); set_fs(KERNEL_DS); err = dev_ioctl(net, cmd, (void __user __force *)&ifr); set_fs(old_fs); if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); err |= put_user(ifr.ifr_map.port, &uifmap32->port); if (err) 4 err = -EFAULT; } return err; } struct rtentry32 { u32 rt_pad1; struct 
sockaddr rt_dst; /* target address */ struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ struct sockaddr rt_genmask; /* target network mask (IP) */ 3 unsigned short rt_flags; short rt_pad2; u32 rt_pad3; unsigned char rt_tos; unsigned char rt_class; 131 short rt_pad4; 1 short rt_metric; /* +1 for binary compatibility! */ /* char * */ u32 rt_dev; /* forcing the device at add */ u32 rt_mtu; /* per route MTU/Window */ u32 rt_window; /* Window clamping */ unsigned short rt_irtt; /* Initial RTT */ }; struct in6_rtmsg32 { struct in6_addr rtmsg_dst; struct in6_addr rtmsg_src; struct in6_addr rtmsg_gateway; u32 rtmsg_type; u16 rtmsg_dst_len; u16 rtmsg_src_len; u32 rtmsg_metric; u32 rtmsg_info; u32 rtmsg_flags; s32 rtmsg_ifindex; }; static int routing_ioctl(struct net *net, struct socket *sock, unsigned int cmd, void __user *argp) { int ret; void *r = NULL; struct in6_rtmsg r6; struct rtentry r4; char devname[16]; u32 rtdev; mm_segment_t old_fs = get_fs(); if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */ struct in6_rtmsg32 __user *ur6 = argp; ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3 * sizeof(struct in6_addr)); ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); r = (void *) &r6; } else { /* ipv4 */ struct rtentry32 __user *ur4 = argp; ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3 * sizeof(struct sockaddr)); ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); ret |= get_user(r4.rt_window, &(ur4->rt_window)); ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); 33 ret |= get_user(rtdev, &(ur4->rt_dev)); if (rtdev) 
{ 33 ret |= copy_from_user(devname, compat_ptr(rtdev), 15); r4.rt_dev = (char __user __force *)devname; 11 devname[15] = 0; } else r4.rt_dev = NULL; r = (void *) &r4; } if (ret) { ret = -EFAULT; goto out; } set_fs(KERNEL_DS); 22 ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); set_fs(old_fs); out: return ret; } /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that 3 * use compatible ioctls */ static int old_bridge_ioctl(compat_ulong_t __user *argp) { 19 compat_ulong_t tmp; if (get_user(tmp, argp)) return -EFAULT; if (tmp == BRCTL_GET_VERSION) 33 return BRCTL_VERSION + 1; return -EINVAL; } static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, 30 unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); 33 if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return compat_ifr_data_ioctl(net, cmd, argp); switch (cmd) { case SIOCSIFBR: case SIOCGIFBR: return old_bridge_ioctl(argp); case SIOCGIFNAME: return dev_ifname32(net, argp); case SIOCGIFCONF: 3 return dev_ifconf(net, argp); case SIOCETHTOOL: 3 return ethtool_ioctl(net, argp); case SIOCWANDEV: return compat_siocwandev(net, argp); case SIOCGIFMAP: case SIOCSIFMAP: return compat_sioc_ifmap(net, cmd, argp); case SIOCBONDENSLAVE: case SIOCBONDRELEASE: 587 case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: return bond_ioctl(net, cmd, argp); case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); case SIOCGSTAMP: 514 return do_siocgstamp(net, sock, cmd, argp); case SIOCGSTAMPNS: return do_siocgstampns(net, sock, cmd, argp); 3 case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: 2 case SIOCSHWTSTAMP: case SIOCGHWTSTAMP: 14 return compat_ifr_data_ioctl(net, cmd, argp); 109 case FIOSETOWN: case SIOCSPGRP: 1 case FIOGETOWN: case SIOCGPGRP: case SIOCBRADDBR: 131 case SIOCBRDELBR: case SIOCGIFVLAN: case SIOCSIFVLAN: case 
SIOCADDDLCI: case SIOCDELDLCI: 20 return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: 33 case SIOCSIFFLAGS: case SIOCGIFMETRIC: 7 case SIOCSIFMETRIC: case SIOCGIFMTU: 4 case SIOCSIFMTU: case SIOCGIFMEM: case SIOCSIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: 85 case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFINDEX: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCSIFHWBROADCAST: case SIOCDIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFNETMASK: 19 case SIOCSIFNETMASK: case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: case SIOCBRADDIF: case SIOCBRDELIF: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: return dev_ifsioc(net, sock, cmd, argp); case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCATMARK: return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; } static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; sk = sock->sk; net = sock_net(sk); if (sock->ops->compat_ioctl) 228 ret = sock->ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) ret = compat_wext_handle_ioctl(net, cmd, arg); 34 if (ret == -ENOIOCTLCMD) ret = compat_sock_ioctl_trans(file, sock, cmd, arg); return ret; } #endif int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { 588 return sock->ops->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind); int kernel_listen(struct socket *sock, int backlog) { return sock->ops->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); 216 int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { 588 struct sock *sk = sock->sk; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, 588 newsock); if (err < 0) 570 goto done; err = sock->ops->accept(sock, *newsock, flags); if (err < 0) { 
sock_release(*newsock); *newsock = NULL; goto done; } (*newsock)->ops = sock->ops; __module_get((*newsock)->ops->owner); done: return err; } EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { return sock->ops->connect(sock, addr, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); int kernel_getsockname(struct socket *sock, struct sockaddr *addr, int *addrlen) { return sock->ops->getname(sock, addr, addrlen, 0); } EXPORT_SYMBOL(kernel_getsockname); int kernel_getpeername(struct socket *sock, struct sockaddr *addr, int *addrlen) { return sock->ops->getname(sock, addr, addrlen, 1); } EXPORT_SYMBOL(kernel_getpeername); int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { mm_segment_t oldfs = get_fs(); char __user *uoptval; int __user *uoptlen; int err; uoptval = (char __user __force *) optval; uoptlen = (int __user __force *) optlen; set_fs(KERNEL_DS); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); else err = sock->ops->getsockopt(sock, level, optname, uoptval, uoptlen); set_fs(oldfs); return err; } 4 EXPORT_SYMBOL(kernel_getsockopt); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen) { mm_segment_t oldfs = get_fs(); char __user *uoptval; int err; uoptval = (char __user __force *) optval; set_fs(KERNEL_DS); if (level == SOL_SOCKET) err = sock_setsockopt(sock, level, optname, uoptval, optlen); else err = sock->ops->setsockopt(sock, level, optname, uoptval, optlen); set_fs(oldfs); return err; } EXPORT_SYMBOL(kernel_setsockopt); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { if (sock->ops->sendpage) return sock->ops->sendpage(sock, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } EXPORT_SYMBOL(kernel_sendpage); int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) { 
mm_segment_t oldfs = get_fs(); int err; set_fs(KERNEL_DS); err = sock->ops->ioctl(sock, cmd, arg); set_fs(oldfs); return err; } EXPORT_SYMBOL(kernel_sock_ioctl); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return sock->ops->shutdown(sock, how); } 398 EXPORT_SYMBOL(kernel_sock_shutdown);
/*
 * xfrm4_policy.c
 *
 * IPv4-specific glue for the xfrm policy layer: route lookup, flow
 * decoding from a packet, dst_ops and per-netns setup.
 *
 * Changes:
 *	Kazunori MIYAZAWA @USAGI
 *	YOSHIFUJI Hideaki @USAGI
 *		Split up af-specific portion
 *
 */

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/inetdevice.h>
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/l3mdev.h>

static struct xfrm_policy_afinfo xfrm4_policy_afinfo;

/*
 * Build an IPv4 flow key in @fl4 from the arguments and run an output
 * route lookup on it.
 *
 * @saddr may be NULL, in which case the source address is left zeroed
 * in the key.  Returns the dst_entry embedded in the route on success,
 * or the error pointer handed back by __ip_route_output_key().
 */
static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
					    int tos, int oif,
					    const xfrm_address_t *saddr,
					    const xfrm_address_t *daddr,
					    u32 mark)
{
	struct rtable *rt;

	memset(fl4, 0, sizeof(*fl4));
	fl4->daddr = daddr->a4;
	fl4->flowi4_tos = tos;
	fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
	fl4->flowi4_mark = mark;
	if (saddr)
		fl4->saddr = saddr->a4;

	fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;

	rt = __ip_route_output_key(net, fl4);
	if (!IS_ERR(rt))
		return &rt->dst;

	return ERR_CAST(rt);
}

/* afinfo->dst_lookup: route lookup using a throw-away on-stack flow key. */
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
					  const xfrm_address_t *saddr,
					  const xfrm_address_t *daddr,
					  u32 mark)
{
	struct flowi4 fl4;

	return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
}

/*
 * afinfo->get_saddr: look up a route towards @daddr and store the
 * source address found in the flow key after the lookup into @saddr.
 *
 * Returns 0 on success or -EHOSTUNREACH when the lookup fails.
 */
static int xfrm4_get_saddr(struct net *net, int oif,
			   xfrm_address_t *saddr, xfrm_address_t *daddr,
			   u32 mark)
{
	struct dst_entry *dst;
	struct flowi4 fl4;

	dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
	if (IS_ERR(dst))
		return -EHOSTUNREACH;

	saddr->a4 = fl4.saddr;
	/* Only the address was wanted; drop the route reference again. */
	dst_release(dst);
	return 0;
}

/* afinfo->get_tos: TOS of the IPv4 flow masked with IPTOS_RT_MASK. */
static int xfrm4_get_tos(const struct flowi *fl)
{
	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
}

/* afinfo->init_path: nothing to do for IPv4. */
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
			   int nfheader_len)
{
	return 0;
}

/*
 * afinfo->fill_dst: populate the rtable part of @xdst from the route it
 * wraps (xdst->route) and from flow @fl, and attach @dev to it.
 *
 * Takes a reference on @dev.  Always returns 0.
 */
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
			  const struct flowi *fl)
{
	struct rtable *rt = (struct rtable *)xdst->route;
	const struct flowi4 *fl4 = &fl->u.ip4;

	xdst->u.rt.rt_iif = fl4->flowi4_iif;

	xdst->u.dst.dev = dev;
	dev_hold(dev);

	/* Sheit... I remember I did this right. Apparently,
	 * it was magically lost, so this code needs audit */
	xdst->u.rt.rt_is_input = rt->rt_is_input;
	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
					      RTCF_LOCAL);
	xdst->u.rt.rt_type = rt->rt_type;
	xdst->u.rt.rt_gateway = rt->rt_gateway;
	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
	xdst->u.rt.rt_table_id = rt->rt_table_id;
	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);

	return 0;
}

/*
 * afinfo->decode_session: fill the IPv4 flow key in @fl from the IP
 * header of @skb and, for unfragmented packets, from the first bytes of
 * the transport header (ports, ICMP type/code, SPI or GRE key).
 *
 * With @reverse set, source and destination (addresses and ports) are
 * swapped and skb->skb_iif is used as the output interface.
 *
 * All fields of the IP header are read before any pskb_may_pull() call;
 * the transport header pointer is recomputed after each pull because
 * pulling may reallocate the header.
 */
static void
_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
{
	const struct iphdr *iph = ip_hdr(skb);
	int ihl = iph->ihl;
	u8 *xprth = skb_network_header(skb) + ihl * 4;
	struct flowi4 *fl4 = &fl->u.ip4;
	int oif = 0;

	if (skb_dst(skb))
		oif = l3mdev_fib_oif(skb_dst(skb)->dev);

	memset(fl4, 0, sizeof(struct flowi4));
	fl4->flowi4_mark = skb->mark;
	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;

	fl4->flowi4_proto = iph->protocol;
	fl4->daddr = reverse ? iph->saddr : iph->daddr;
	fl4->saddr = reverse ? iph->daddr : iph->saddr;
	fl4->flowi4_tos = iph->tos;

	if (!ip_is_fragment(iph)) {
		switch (iph->protocol) {
		case IPPROTO_UDP:
		case IPPROTO_UDPLITE:
		case IPPROTO_TCP:
		case IPPROTO_SCTP:
		case IPPROTO_DCCP:
			if (xprth + 4 < skb->data ||
			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be16 *ports;

				xprth = skb_network_header(skb) + ihl * 4;
				ports = (__be16 *)xprth;

				fl4->fl4_sport = ports[!!reverse];
				fl4->fl4_dport = ports[!reverse];
			}
			break;

		case IPPROTO_ICMP:
			if (xprth + 2 < skb->data ||
			    pskb_may_pull(skb, xprth + 2 - skb->data)) {
				u8 *icmp;

				xprth = skb_network_header(skb) + ihl * 4;
				icmp = xprth;

				fl4->fl4_icmp_type = icmp[0];
				fl4->fl4_icmp_code = icmp[1];
			}
			break;

		case IPPROTO_ESP:
			if (xprth + 4 < skb->data ||
			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be32 *ehdr;

				xprth = skb_network_header(skb) + ihl * 4;
				ehdr = (__be32 *)xprth;

				fl4->fl4_ipsec_spi = ehdr[0];
			}
			break;

		case IPPROTO_AH:
			if (xprth + 8 < skb->data ||
			    pskb_may_pull(skb, xprth + 8 - skb->data)) {
				__be32 *ah_hdr;

				xprth = skb_network_header(skb) + ihl * 4;
				ah_hdr = (__be32 *)xprth;

				fl4->fl4_ipsec_spi = ah_hdr[1];
			}
			break;

		case IPPROTO_COMP:
			if (xprth + 4 < skb->data ||
			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
				__be16 *ipcomp_hdr;

				xprth = skb_network_header(skb) + ihl * 4;
				ipcomp_hdr = (__be16 *)xprth;

				/* 16-bit value widened into the 32-bit SPI slot */
				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
			}
			break;

		case IPPROTO_GRE:
			if (xprth + 12 < skb->data ||
			    pskb_may_pull(skb, xprth + 12 - skb->data)) {
				__be16 *greflags;
				__be32 *gre_hdr;

				xprth = skb_network_header(skb) + ihl * 4;
				greflags = (__be16 *)xprth;
				gre_hdr = (__be32 *)xprth;

				if (greflags[0] & GRE_KEY) {
					/* key sits one word later when a
					 * checksum word is present
					 */
					if (greflags[0] & GRE_CSUM)
						gre_hdr++;
					fl4->fl4_gre_key = gre_hdr[1];
				}
			}
			break;

		default:
			fl4->fl4_ipsec_spi = 0;
			break;
		}
	}
}

/*
 * dst_ops->gc: run the afinfo garbage collector for the owning netns
 * and report whether the entry count is still above twice gc_thresh.
 */
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);

	xfrm4_policy_afinfo.garbage_collect(net);
	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}

/* dst_ops->update_pmtu: forward the PMTU update to the wrapped route. */
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	struct dst_entry *path = xdst->route;

	path->ops->update_pmtu(path, sk, skb, mtu);
}

/* dst_ops->redirect: forward the redirect to the wrapped route. */
static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
	struct dst_entry *path = xdst->route;

	path->ops->redirect(path, sk, skb);
}

/* dst_ops->destroy: release the metrics, then the xfrm_dst itself. */
static void xfrm4_dst_destroy(struct dst_entry *dst)
{
	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

	dst_destroy_metrics_generic(dst);

	xfrm_dst_destroy(xdst);
}

/* dst_ops->ifdown: only acts when the device is being unregistered. */
static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			     int unregister)
{
	if (!unregister)
		return;

	xfrm_dst_ifdown(dst, dev);
}

/* Template copied into each netns by xfrm4_net_init(). */
static struct dst_ops xfrm4_dst_ops_template = {
	.family =		AF_INET,
	.gc =			xfrm4_garbage_collect,
	.update_pmtu =		xfrm4_update_pmtu,
	.redirect =		xfrm4_redirect,
	.cow_metrics =		dst_cow_metrics_generic,
	.destroy =		xfrm4_dst_destroy,
	.ifdown =		xfrm4_dst_ifdown,
	.local_out =		__ip_local_out,
	.gc_thresh =		INT_MAX,
};

static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
	.family =		AF_INET,
	.dst_ops =		&xfrm4_dst_ops_template,
	.dst_lookup =		xfrm4_dst_lookup,
	.get_saddr =		xfrm4_get_saddr,
	.decode_session =	_decode_session4,
	.get_tos =		xfrm4_get_tos,
	.init_path =		xfrm4_init_path,
	.fill_dst =		xfrm4_fill_dst,
	.blackhole_route =	ipv4_blackhole_route,
};

#ifdef CONFIG_SYSCTL
static struct ctl_table xfrm4_policy_table[] = {
	{
		.procname       = "xfrm4_gc_thresh",
		.data           = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = proc_dointvec,
	},
	{ }
};

/*
 * Register the "net/ipv4" xfrm4 sysctl table for @net.
 *
 * init_net uses the static table directly; every other netns gets a
 * private copy whose data pointer is redirected to its own gc_thresh.
 * Returns 0 on success or -ENOMEM.
 */
static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = xfrm4_policy_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

		table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr)
		goto err_reg;

	net->ipv4.xfrm4_hdr = hdr;
	return 0;

err_reg:
	/* Only the duplicated table is heap-allocated. */
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

/* Undo xfrm4_net_sysctl_init(); a no-op if nothing was registered. */
static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
	struct ctl_table *table;

	if (!net->ipv4.xfrm4_hdr)
		return;

	/* Fetch the table pointer before the header goes away. */
	table = net->ipv4.xfrm4_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
	if (!net_eq(net, &init_net))
		kfree(table);
}
#else /* CONFIG_SYSCTL */
static inline int xfrm4_net_sysctl_init(struct net *net)
{
	return 0;
}

static inline void xfrm4_net_sysctl_exit(struct net *net)
{
}
#endif

/*
 * Per-netns init: copy the dst_ops template, set up its entry counter
 * and register the sysctl table.  The counter is torn down again if
 * the sysctl registration fails.
 */
static int __net_init xfrm4_net_init(struct net *net)
{
	int ret;

	memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
	       sizeof(xfrm4_dst_ops_template));
	ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
	if (ret)
		return ret;

	ret = xfrm4_net_sysctl_init(net);
	if (ret)
		dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);

	return ret;
}

/* Per-netns teardown, in reverse order of xfrm4_net_init(). */
static void __net_exit xfrm4_net_exit(struct net *net)
{
	xfrm4_net_sysctl_exit(net);
	dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
}

static struct pernet_operations __net_initdata xfrm4_net_ops = {
	.init	= xfrm4_net_init,
	.exit	= xfrm4_net_exit,
};

static void __init xfrm4_policy_init(void)
{
	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
}

/* Boot-time entry point: set up IPv4 xfrm state, policy and protocol. */
void __init xfrm4_init(void)
{
	xfrm4_state_init();
	xfrm4_policy_init();
	xfrm4_protocol_init();
	register_pernet_subsys(&xfrm4_net_ops);
}
#ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/sched.h> #define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) #include <asm/uaccess.h> static __always_inline void pagefault_disabled_inc(void) { 102 current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; 66 WARN_ON(current->pagefault_disabled < 0); } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { 102 pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ 1822 barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ #define pagefault_disabled() (current->pagefault_disabled != 0) /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. 
*/ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } static inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk * * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ extern long probe_kernel_read(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); /* * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); /** * probe_kernel_address(): safely attempt to read from a location * @addr: address to read from * @retval: read into this variable * * Returns 0 on success, or -EFAULT. 
*/ #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) #ifndef user_access_begin #define user_access_begin() do { } while (0) #define user_access_end() do { } while (0) #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) #endif #endif /* __LINUX_UACCESS_H__ */
/*
 * 32bit compatibility wrappers for the input subsystem.
 *
 * Very heavily based on evdev.c - Copyright (c) 1999-2002 Vojtech Pavlik
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/export.h>
#include <asm/uaccess.h>
#include "input-compat.h"

#ifdef CONFIG_COMPAT

/*
 * Read one input event from user space into @event.
 *
 * When INPUT_COMPAT_TEST holds and COMPAT_USE_64BIT_TIME does not, the
 * user buffer is read as a struct input_event_compat and converted
 * field by field; otherwise it is copied as a native struct input_event.
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int input_event_from_user(const char __user *buffer,
			  struct input_event *event)
{
	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
		struct input_event_compat compat_event;

		if (copy_from_user(&compat_event, buffer,
				   sizeof(struct input_event_compat)))
			return -EFAULT;

		event->time.tv_sec = compat_event.time.tv_sec;
		event->time.tv_usec = compat_event.time.tv_usec;
		event->type = compat_event.type;
		event->code = compat_event.code;
		event->value = compat_event.value;

	} else {
		if (copy_from_user(event, buffer, sizeof(struct input_event)))
			return -EFAULT;
	}

	return 0;
}

/*
 * Write @event to user space, converting to struct input_event_compat
 * under the same condition as input_event_from_user().
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int input_event_to_user(char __user *buffer,
			const struct input_event *event)
{
	if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
		struct input_event_compat compat_event;

		compat_event.time.tv_sec = event->time.tv_sec;
		compat_event.time.tv_usec = event->time.tv_usec;
		compat_event.type = event->type;
		compat_event.code = event->code;
		compat_event.value = event->value;

		if (copy_to_user(buffer, &compat_event,
				 sizeof(struct input_event_compat)))
			return -EFAULT;

	} else {
		if (copy_to_user(buffer, event, sizeof(struct input_event)))
			return -EFAULT;
	}

	return 0;
}

/*
 * Read a force-feedback effect of @size bytes from user space.
 *
 * @size must match the layout selected by INPUT_COMPAT_TEST exactly
 * (struct ff_effect_compat or struct ff_effect), else -EINVAL.
 * Returns -EFAULT if the user copy fails and 0 on success.
 */
int input_ff_effect_from_user(const char __user *buffer, size_t size,
			      struct ff_effect *effect)
{
	if (INPUT_COMPAT_TEST) {
		struct ff_effect_compat *compat_effect;

		if (size != sizeof(struct ff_effect_compat))
			return -EINVAL;

		/*
		 * It so happens that the pointer which needs to be changed
		 * is the last field in the structure, so we can retrieve the
		 * whole thing and replace just the pointer.
		 */
		compat_effect = (struct ff_effect_compat *)effect;

		if (copy_from_user(compat_effect, buffer,
				   sizeof(struct ff_effect_compat)))
			return -EFAULT;

		/* Only custom periodic waveforms carry a user pointer. */
		if (compat_effect->type == FF_PERIODIC &&
		    compat_effect->u.periodic.waveform == FF_CUSTOM)
			effect->u.periodic.custom_data =
				compat_ptr(compat_effect->u.periodic.custom_data);
	} else {
		if (size != sizeof(struct ff_effect))
			return -EINVAL;

		if (copy_from_user(effect, buffer, sizeof(struct ff_effect)))
			return -EFAULT;
	}

	return 0;
}

#else

/* !CONFIG_COMPAT: plain copies of the native structures. */

int input_event_from_user(const char __user *buffer,
			 struct input_event *event)
{
	if (copy_from_user(event, buffer, sizeof(struct input_event)))
		return -EFAULT;

	return 0;
}

int input_event_to_user(char __user *buffer,
			const struct input_event *event)
{
	if (copy_to_user(buffer, event, sizeof(struct input_event)))
		return -EFAULT;

	return 0;
}

int input_ff_effect_from_user(const char __user *buffer, size_t size,
			      struct ff_effect *effect)
{
	if (size != sizeof(struct ff_effect))
		return -EINVAL;

	if (copy_from_user(effect, buffer, sizeof(struct ff_effect)))
		return -EFAULT;

	return 0;
}

#endif /* CONFIG_COMPAT */

EXPORT_SYMBOL_GPL(input_event_from_user);
EXPORT_SYMBOL_GPL(input_event_to_user);
EXPORT_SYMBOL_GPL(input_ff_effect_from_user);
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/quotaops.h>

/*
 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
 * and is necessary due to alignment problems.
 */
struct compat_if_dqblk {
	compat_u64 dqb_bhardlimit;
	compat_u64 dqb_bsoftlimit;
	compat_u64 dqb_curspace;
	compat_u64 dqb_ihardlimit;
	compat_u64 dqb_isoftlimit;
	compat_u64 dqb_curinodes;
	compat_u64 dqb_btime;
	compat_u64 dqb_itime;
	compat_uint_t dqb_valid;
};

/* XFS structures */
struct compat_fs_qfilestat {
	/*
	 * NOTE(review): this member's name looks copied from
	 * compat_if_dqblk; it is never referenced by name in this file,
	 * so only its size and position matter here - confirm.
	 */
	compat_u64 dqb_bhardlimit;
	compat_u64 qfs_nblks;
	compat_uint_t qfs_nextents;
};

struct compat_fs_quota_stat {
	__s8		qs_version;
	__u16		qs_flags;
	__s8		qs_pad;
	struct compat_fs_qfilestat	qs_uquota;
	struct compat_fs_qfilestat	qs_gquota;
	compat_uint_t	qs_incoredqs;
	compat_int_t	qs_btimelimit;
	compat_int_t	qs_itimelimit;
	compat_int_t	qs_rtbtimelimit;
	__u16		qs_bwarnlimit;
	__u16		qs_iwarnlimit;
};

/*
 * 32-bit entry point for quotactl().
 *
 * Q_GETQUOTA, Q_SETQUOTA and Q_XGETQSTAT are translated: a native-layout
 * buffer is obtained with compat_alloc_user_space(), sys_quotactl() is
 * called on it, and the data is moved between that buffer and the
 * caller's compat-layout buffer at @addr.  All other sub-commands are
 * passed to sys_quotactl() unchanged.
 *
 * Returns the sys_quotactl() result, or -EFAULT when moving data between
 * the two user buffers fails.
 */
asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
						qid_t id, void __user *addr)
{
	unsigned int cmds;
	struct if_dqblk __user *dqblk;
	struct compat_if_dqblk __user *compat_dqblk;
	struct fs_quota_stat __user *fsqstat;
	struct compat_fs_quota_stat __user *compat_fsqstat;
	compat_uint_t data;
	u16 xdata;
	long ret;

	/* The sub-command lives in the bits above SUBCMDSHIFT. */
	cmds = cmd >> SUBCMDSHIFT;

	switch (cmds) {
	case Q_GETQUOTA:
		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
		compat_dqblk = addr;
		ret = sys_quotactl(cmd, special, id, dqblk);
		if (ret)
			break;
		/* Bulk copy, then transfer dqb_valid explicitly. */
		if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
			get_user(data, &dqblk->dqb_valid) ||
			put_user(data, &compat_dqblk->dqb_valid))
			ret = -EFAULT;
		break;
	case Q_SETQUOTA:
		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
		compat_dqblk = addr;
		/* Preset the error so each failed step can just break. */
		ret = -EFAULT;
		if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
			get_user(data, &compat_dqblk->dqb_valid) ||
			put_user(data, &dqblk->dqb_valid))
			break;
		ret = sys_quotactl(cmd, special, id, dqblk);
		break;
	case Q_XGETQSTAT:
		fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
		compat_fsqstat = addr;
		ret = sys_quotactl(cmd, special, id, fsqstat);
		if (ret)
			break;
		/* Preset the error so each failed step can just break. */
		ret = -EFAULT;
		/* Copying qs_version, qs_flags, qs_pad */
		if (copy_in_user(compat_fsqstat, fsqstat,
			offsetof(struct compat_fs_quota_stat, qs_uquota)))
			break;
		/* Copying qs_uquota */
		if (copy_in_user(&compat_fsqstat->qs_uquota,
			&fsqstat->qs_uquota,
			sizeof(compat_fsqstat->qs_uquota)) ||
			get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
			put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
			break;
		/* Copying qs_gquota */
		if (copy_in_user(&compat_fsqstat->qs_gquota,
			&fsqstat->qs_gquota,
			sizeof(compat_fsqstat->qs_gquota)) ||
			get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
			put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
			break;
		/* Copying the rest */
		if (copy_in_user(&compat_fsqstat->qs_incoredqs,
			&fsqstat->qs_incoredqs,
			sizeof(struct compat_fs_quota_stat) -
			offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
			get_user(xdata, &fsqstat->qs_iwarnlimit) ||
			put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
			break;
		ret = 0;
		break;
	default:
		ret = sys_quotactl(cmd, special, id, addr);
	}
	return ret;
}
/* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 #define RT6_TRACE(x...) pr_debug(x) #else #define RT6_TRACE(x...) do { ; } while (0) #endif static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct rt6_info *, void *arg); int sernum; void *arg; }; static DEFINE_RWLOCK(fib6_walker_lock); #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. 
*/ static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) static void fib6_walker_link(struct fib6_walker *w) { 235 write_lock_bh(&fib6_walker_lock); 235 list_add(&w->lh, &fib6_walkers); 235 write_unlock_bh(&fib6_walker_lock); } static void fib6_walker_unlink(struct fib6_walker *w) { 235 write_lock_bh(&fib6_walker_lock); 235 list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old; do { 148 old = atomic_read(&net->ipv6.fib6_sernum); 148 new = old < INT_MAX ? old + 1 : 1; 148 } while (atomic_cmpxchg(&net->ipv6.fib6_sernum, old, new) != old); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. 
*/ 20 return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } static struct fib6_node *node_alloc(void) { struct fib6_node *fn; 84 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); return fn; } static void node_free_immediate(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); } static void rt6_rcu_free(struct rt6_info *rt) { 41 call_rcu(&rt->dst.rcu_head, dst_rcu_free); } static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; 41 if (!non_pcpu_rt->rt6i_pcpu) return; 41 for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; 41 ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { 5 rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } 41 free_percpu(non_pcpu_rt->rt6i_pcpu); non_pcpu_rt->rt6i_pcpu = NULL; } 41 static void rt6_release(struct rt6_info *rt) { 42 if (atomic_dec_and_test(&rt->rt6i_ref)) { 41 rt6_free_pcpu(rt); 41 rt6_rcu_free(rt); } 42 } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ 29 rwlock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. 
*/ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); 29 } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; 1 table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { 1 table->tb6_id = id; table->tb6_root.leaf = net->ipv6.ip6_null_entry; table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; 59 if (id == 0) id = RT6_TABLE_MAIN; 59 tb = fib6_get_table(net, id); 59 if (tb) return tb; 1 tb = fib6_alloc_table(net, id); if (tb) 1 fib6_link_table(net, tb); return tb; } struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; 808 if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); 808 rcu_read_lock(); 808 head = &net->ipv6.fib_table_hash[h]; 808 hlist_for_each_entry_rcu(tb, head, tb6_hlist) { 808 if (tb->tb6_id == id) { 807 rcu_read_unlock(); return tb; } } 2 rcu_read_unlock(); 808 return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); 28 fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put(rt); rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } return &rt->dst; } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif static int fib6_dump_node(struct fib6_walker *w) { int 
res; struct rt6_info *rt; 10 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { 10 res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ 1 w->leaf = rt; return 1; } } 10 w->leaf = NULL; 10 return 0; } static void fib6_dump_end(struct netlink_callback *cb) { 9 struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(w); } 9 cb->args[2] = 0; kfree(w); } 9 cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct fib6_walker *w; int res; 10 w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { 10 w->count = 0; w->skip = 0; read_lock_bh(&table->tb6_lock); res = fib6_walk(w); read_unlock_bh(&table->tb6_lock); if (res > 0) { 1 cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; } } else { if (cb->args[5] != w->root->fn_sernum) { /* Begin at the root if the tree changed */ cb->args[5] = w->root->fn_sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; } else w->skip = 0; read_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { 10 struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; s_h = cb->args[0]; s_e = cb->args[1]; 8 w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ 10 cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. 
*/ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; 10 w->func = fib6_dump_node; cb->args[2] = (long)w; } 10 arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; 10 rcu_read_lock(); 10 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; 10 head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { 10 if (e < s_e) goto next; 10 res = fib6_dump_table(tb, skb, cb); 10 if (res != 0) goto out; next: 10 e++; } } out: 10 rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; 10 res = res < 0 ? res : skb->len; 10 if (res <= 0) 9 fib6_dump_end(cb); return res; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, int sernum) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { 127 key = (struct rt6key *)((u8 *)fn->leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || 127 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { 83 if (!allow_create) { 1 if (replace_required) { 2 pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ 127 if (plen == fn->fn_bit) { /* clean up an intermediate node */ 57 if (!(fn->fn_flags & RTN_RTINFO)) { 1 rt6_release(fn->leaf); fn->leaf = NULL; } 57 fn->fn_sernum = sernum; return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ 123 fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? 
fn->right : fn->left; 123 } while (fn); 54 if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ 1 if (replace_required) { pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ 53 ln = node_alloc(); if (!ln) return ERR_PTR(-ENOMEM); 53 ln->fn_bit = plen; ln->parent = pn; ln->fn_sernum = sernum; if (dir) 1 pn->right = ln; else 52 pn->left = ln; return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ 82 pn = fn->parent; /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ 82 bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ 82 if (plen > bit) { 74 in = node_alloc(); ln = node_alloc(); 74 if (!in || !ln) { if (in) node_free_immediate(in); if (ln) node_free_immediate(ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. 
* RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ 74 in->fn_bit = bit; in->parent = pn; in->leaf = fn->leaf; atomic_inc(&in->leaf->rt6i_ref); in->fn_sernum = sernum; /* update parent pointer */ if (dir) 59 pn->right = in; else 69 pn->left = in; 74 ln->fn_bit = plen; ln->parent = in; fn->parent = in; ln->fn_sernum = sernum; if (addr_bit_set(addr, bit)) { 61 in->right = ln; in->left = fn; } else { 67 in->left = ln; in->right = fn; } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ 8 ln = node_alloc(); if (!ln) return ERR_PTR(-ENOMEM); 8 ln->fn_bit = plen; ln->parent = pn; ln->fn_sernum = sernum; if (dir) 2 pn->right = ln; else 6 pn->left = ln; 8 if (addr_bit_set(&key->addr, plen)) ln->right = fn; else 8 ln->left = fn; 8 fn->parent = ln; } return ln; } static bool rt6_qualify_for_ecmp(struct rt6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; } static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; 1 for (i = 0; i < RTAX_MAX; i++) { 1 if (test_bit(i, mxc->mx_valid)) 1 mp[i] = mxc->mx[i]; } } static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) { 115 if (!mxc->mx) return 0; 5 if (dst->flags & DST_HOST) { 1 u32 *mp = dst_metrics_write_ptr(dst); 1 if (unlikely(!mp)) return -ENOMEM; 1 fib6_copy_metrics(mp, mxc); } else { 4 dst_init_metrics(dst, mxc->mx, false); /* We've stolen mx now. */ mxc->mx = NULL; } 1 return 0; } static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { 41 if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. 
*/ 3 while (fn) { 3 if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { 3 fn->leaf = fib6_find_prefix(net, fn); atomic_inc(&fn->leaf->rt6i_ref); rt6_release(rt); } 3 fn = fn->parent; } /* No more references are possible at this point. */ 3 BUG_ON(atomic_read(&rt->rt6i_ref) != 1); } 41 } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { struct rt6_info *iter = NULL; struct rt6_info **ins; struct rt6_info **fallback_ins = NULL; 125 int replace = (info->nlh && 13 (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; 125 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); int err; ins = &fn->leaf; for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { /* * Search for duplicates */ 56 if (iter->rt6i_metric == rt->rt6i_metric) { /* * Same priority level */ 52 if (info->nlh && 9 (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; 51 if (replace) { 5 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } 1 if (rt_can_ecmp) 1 fallback_ins = fallback_ins ?: ins; goto next_iter; } 46 if (rt6_duplicate_nexthop(iter, rt)) { 13 if (rt->rt6i_nsiblings) rt->rt6i_nsiblings = 0; 13 if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->rt6i_flags & RTF_EXPIRES)) rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); iter->rt6i_pmtu = rt->rt6i_pmtu; return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. 
*/ 36 if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->rt6i_nsiblings++; } 9 if (iter->rt6i_metric > rt->rt6i_metric) break; next_iter: 40 ins = &iter->dst.rt6_next; } 40 if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; 1 iter = *ins; found++; } /* Reset round-robin state, if necessary */ 44 if (ins == &fn->leaf) 93 fn->rr_ptr = NULL; /* Link this route to others same route. */ 119 if (rt->rt6i_nsiblings) { unsigned int rt6i_nsiblings; struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = fn->leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->rt6i_siblings, &sibling->rt6i_siblings); break; } sibling = sibling->dst.rt6_next; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ rt6i_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->rt6i_siblings, rt6i_siblings) { sibling->rt6i_nsiblings++; BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); } /* * insert node */ 119 if (!replace) { 110 if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: 111 err = fib6_commit_metrics(&rt->dst, mxc); if (err) return err; 111 rt->dst.rt6_next = iter; *ins = rt; rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { 84 info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; 9 if (!found) { 5 if (add) goto add; 4 pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } 5 err = fib6_commit_metrics(&rt->dst, mxc); if (err) return err; 5 *ins = rt; rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = 
iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } 5 nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; 5 rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; iter = *ins; while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { ins = &iter->dst.rt6_next; } iter = *ins; } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && 100 (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { 21 if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, 12 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); 21 } /* * Add routing information to the routing tree. 
* <destination addr>/<source addr> * with source addr info in sub-trees */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; 127 int sernum = fib6_new_sernum(info->nl_net); 127 if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && !atomic_read(&rt->dst.__refcnt))) return -EINVAL; 127 if (info->nlh) { 15 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } 3 if (!allow_create && !replace_required) 2 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); 127 fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, replace_required, sernum); 125 if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen) { struct fib6_node *sn; if (!fn->subtree) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(); if (!sfn) goto failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, sernum); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. 
*/ node_free_immediate(sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ sfn->parent = fn; fn->subtree = sfn; } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required, sernum); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!fn->leaf) { fn->leaf = rt; atomic_inc(&rt->rt6i_ref); } fn = sn; } #endif 125 err = fib6_add_rt2node(fn, rt, info, mxc); if (!err) { 115 fib6_start_gc(info->nl_net, rt); 115 if (!(rt->rt6i_flags & RTF_CACHE)) 115 fib6_prune_clones(info->nl_net, pn); 115 rt->dst.flags &= ~DST_NOCACHE; } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn && pn->leaf == rt) { pn->leaf = NULL; atomic_dec(&rt->rt6i_ref); } if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { pn->leaf = fib6_find_prefix(info->nl_net, pn); #if RT6_DEBUG >= 2 if (!pn->leaf) { WARN_ON(pn->leaf == NULL); pn->leaf = info->nl_net->ipv6.ip6_null_entry; } #endif atomic_inc(&pn->leaf->rt6i_ref); } #endif goto failure; } return err; failure: /* fn->leaf could be NULL if fn is an intermediate node and we * failed to add the new route to it in both subtree creation * failure and fib6_add_rt2node() failure case. * In both cases, fib6_repair_tree() should be called to fix * fn->leaf. 
*/ 18 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) 1 fib6_repair_tree(info->nl_net, fn); 20 if (!(rt->dst.flags & DST_NOCACHE)) 20 dst_free(&rt->dst); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on rt6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; 686 dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? fn->right : fn->left; 686 if (next) { fn = next; continue; } break; } 686 while (fn) { 686 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { struct rt6key *key; 686 key = (struct rt6key *) ((u8 *) fn->leaf + args->offset); 686 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (fn->subtree) { struct fib6_node *sfn; sfn = fib6_lookup_1(fn->subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } #ifdef CONFIG_IPV6_SUBTREES backtrack: #endif 207 if (fn->fn_flags & RTN_ROOT) break; 207 fn = fn->parent; } return NULL; } struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct rt6_info, rt6i_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct rt6_info, rt6i_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; 686 fn = fib6_lookup_1(root, daddr ? 
args : args + 1); 686 if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; 686 return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset) { struct fib6_node *fn; 15 for (fn = root; fn ; ) { 15 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || 15 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) return NULL; 15 if (plen == fn->fn_bit) return fn; /* * We have more bits to go */ 13 if (addr_bit_set(addr, fn->fn_bit)) fn = fn->right; else fn = fn->left; } return NULL; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len) { struct fib6_node *fn; 15 fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct rt6_info, rt6i_dst)); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src)); } #endif 10 if (fn && fn->fn_flags & RTN_RTINFO) return fn; 15 return NULL; } /* * Deletion * */ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) { 19 if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; 19 while (fn) { 3 if (fn->left) 19 return fn->left->leaf; if (fn->right) return fn->right->leaf; fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. 
*/ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child, *pn; struct fib6_walker *w; int iter = 0; for (;;) { RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; 29 WARN_ON(fn->fn_flags & RTN_RTINFO); 29 WARN_ON(fn->fn_flags & RTN_TL_ROOT); 29 WARN_ON(fn->leaf); children = 0; child = NULL; 29 if (fn->right) child = fn->right, children |= 1; 29 if (fn->left) child = fn->left, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { 19 fn->leaf = fib6_find_prefix(net, fn); #if RT6_DEBUG >= 2 if (!fn->leaf) { WARN_ON(!fn->leaf); fn->leaf = net->ipv6.ip6_null_entry; } #endif 19 atomic_inc(&fn->leaf->rt6i_ref); 29 return fn->parent; } 28 pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); FIB6_SUBTREE(pn) = NULL; nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn->right == fn) 23 pn->right = child; 21 else if (pn->left == fn) 21 pn->left = child; #if RT6_DEBUG >= 2 else WARN_ON(1); #endif 28 if (child) 28 child->parent = pn; nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif 28 read_lock(&fib6_walker_lock); 6 FOR_WALKERS(w) { 6 if (!child) { 3 if (w->root == fn) { w->root = w->node = NULL; RT6_TRACE("W %p adjusted by delroot 1\n", w); 3 } else if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); 3 w->node = pn; w->state = nstate; } } else { 6 if (w->root == fn) { w->root = child; RT6_TRACE("W %p adjusted by delroot 2\n", w); } 6 if (w->node == fn) { 6 w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 3 w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 3 w->state = w->state >= FWS_C ? 
FWS_U : FWS_INIT; } } } } 28 read_unlock(&fib6_walker_lock); node_free(fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; 24 rt6_release(pn->leaf); pn->leaf = NULL; fn = pn; } } static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, struct nl_info *info) { struct fib6_walker *w; struct rt6_info *rt = *rtp; 37 struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) 5 fn->rr_ptr = NULL; /* Remove this entry from other siblings */ 37 if (rt->rt6i_nsiblings) { struct rt6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, &rt->rt6i_siblings, rt6i_siblings) sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); } /* Adjust walkers */ 37 read_lock(&fib6_walker_lock); 19 FOR_WALKERS(w) { 19 if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); 19 w->leaf = rt->dst.rt6_next; if (!w->leaf) 18 w->state = FWS_U; } } 37 read_unlock(&fib6_walker_lock); rt->dst.rt6_next = NULL; /* If it was last route, expunge its radix tree node */ if (!fn->leaf) { 28 fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; fn = fib6_repair_tree(net, fn); } 37 fib6_purge_rt(rt, fn, net); inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } int fib6_del(struct rt6_info *rt, struct nl_info *info) { 37 struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; struct rt6_info **rtp; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { WARN_ON(fn); 37 return -ENOENT; } #endif 37 if (!fn || rt == net->ipv6.ip6_null_entry) return -ENOENT; 37 WARN_ON(!(fn->fn_flags & RTN_RTINFO)); 37 if (!(rt->rt6i_flags & RTF_CACHE)) { struct fib6_node *pn = fn; #ifdef 
CONFIG_IPV6_SUBTREES /* clones of this route might be in another subtree */ if (rt->rt6i_src.plen) { while (!(pn->fn_flags & RTN_ROOT)) pn = pn->parent; pn = pn->parent; } #endif 35 fib6_prune_clones(info->nl_net, pn); } /* * Walk the leaf entries looking for ourself */ 37 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { 37 if (*rtp == rt) { 37 fib6_del_route(fn, rtp, info); return 0; } } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. 
*/ static int fib6_walk_continue(struct fib6_walker *w) 235 { struct fib6_node *fn, *pn; for (;;) { fn = w->node; 235 if (!fn) return 0; 235 if (w->prune && fn != w->root && 48 fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { 48 w->state = FWS_C; w->leaf = fn->leaf; } 235 switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; #endif case FWS_L: 235 if (fn->left) { 157 w->node = fn->left; w->state = FWS_INIT; continue; } 227 w->state = FWS_R; case FWS_R: 235 if (fn->right) { 132 w->node = fn->right; w->state = FWS_INIT; continue; } 233 w->state = FWS_C; w->leaf = fn->leaf; case FWS_C: 235 if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; 235 if (w->skip) { w->skip--; goto skip; } 235 err = w->func(w); if (err) return err; 235 w->count++; continue; } skip: 235 w->state = FWS_U; case FWS_U: 235 if (fn == w->root) return 0; 157 pn = fn->parent; w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (pn->left == fn) { 157 w->state = FWS_R; continue; } 132 if (pn->right == fn) { 132 w->state = FWS_C; w->leaf = w->node->leaf; continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct fib6_walker *w) { int res; 230 w->state = FWS_INIT; w->node = w->root; fib6_walker_link(w); res = fib6_walk_continue(w); if (res <= 0) 230 fib6_walker_unlink(w); 230 return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct rt6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); 220 struct nl_info info = { .nl_net = c->net, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && 34 w->node->fn_sernum != c->sernum) 34 w->node->fn_sernum = c->sernum; 210 if (!c->func) { 34 WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); 220 w->leaf = NULL; 220 return 0; } 210 for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { 210 res = c->func(rt, c->arg); if (res < 0) { 
19 w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->rt6i_node), res); #endif continue; } return 0; } 210 WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking * * prune==1 -> only immediate children of node (certainly, * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), bool prune, int sernum, void *arg) { struct fib6_cleaner c; 220 c.w.root = root; c.w.func = fib6_clean_node; c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; fib6_walk(&c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), int sernum, void *arg) { struct fib6_table *table; struct hlist_head *head; unsigned int h; 114 rcu_read_lock(); 114 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 114 head = &net->ipv6.fib_table_hash[h]; 114 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 114 write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, false, sernum, arg); write_unlock_bh(&table->tb6_lock); } } 114 rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), void *arg) { 91 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } static int fib6_prune_clone(struct rt6_info *rt, void *arg) { 133 if (rt->rt6i_flags & RTF_CACHE) { RT6_TRACE("pruning clone %p\n", rt); return -1; } return 0; } static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { fib6_clean_tree(net, fn, fib6_prune_clone, true, FIB6_NO_SERNUM_CHANGE, NULL); } static void fib6_flush_trees(struct net *net) { 34 int new_sernum = fib6_new_sernum(net); 34 __fib6_clean_all(net, NULL, new_sernum, NULL); } /* * Garbage collection */ 
static struct fib6_gc_args { int timeout; int more; } gc_args; static int fib6_age(struct rt6_info *rt, void *arg) { 13 unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. * * Also age clones. Note, that clones are aged out * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { if (time_after(now, rt->dst.expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args.more++; 13 } else if (rt->rt6i_flags & RTF_CACHE) { 1 if (atomic_read(&rt->dst.__refcnt) == 0 && 1 time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; 1 } else if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; __u8 neigh_flags = 0; 1 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); 1 if (neigh) { 1 neigh_flags = neigh->flags; 1 neigh_release(neigh); } 1 if (!(neigh_flags & NTF_ROUTER)) { RT6_TRACE("purging route %p via non-router but gateway\n", rt); return -1; } } 13 gc_args.more++; } return 0; } static DEFINE_SPINLOCK(fib6_gc_lock); void fib6_run_gc(unsigned long expires, struct net *net, bool force) 13 { unsigned long now; if (force) { spin_lock_bh(&fib6_gc_lock); 13 } else if (!spin_trylock_bh(&fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } 13 gc_args.timeout = expires ? 
(int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = icmp6_dst_gc(); fib6_clean_all(net, fib6_age, NULL); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else 13 del_timer(&net->ipv6.ip6_fib_timer); 13 spin_unlock_bh(&fib6_gc_lock); } static void fib6_gc_timer_cb(unsigned long arg) { fib6_run_gc(0, (struct net *)arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; 28 setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_timer; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); 28 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; 28 net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; 28 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; 28 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif 28 fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: 
kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_timer: return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, NULL); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct rt6_info *rt = v; 5 struct ipv6_route_iter *iter = seq->private; seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (rt->rt6i_flags & RTF_GATEWAY) seq_printf(seq, "%pi6", &rt->rt6i_gateway); else 5 seq_puts(seq, "00000000000000000000000000000000"); 5 seq_printf(seq, " %08x 
%08x %08x %08x %8s\n", rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), rt->dst.__use, rt->rt6i_flags, 5 rt->dst.dev ? rt->dst.dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { 5 struct ipv6_route_iter *iter = w->args; 5 if (!iter->skip) return 1; do { 5 iter->w.leaf = iter->w.leaf->dst.rt6_next; iter->skip--; 5 if (!iter->skip && iter->w.leaf) return 1; 5 } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) { 5 memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = iter->w.root->fn_sernum; INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(&iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; 5 if (tbl) { 5 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; 5 node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } 5 while (!node && h < FIB6_TABLE_HASHSZ) { 5 node = rcu_dereference_bh( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } 5 return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { 5 if (iter->sernum != iter->w.root->fn_sernum) { iter->sernum = iter->w.root->fn_sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct rt6_info *n; 5 struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (!v) goto iter_table; 5 n = ((struct rt6_info *)v)->dst.rt6_next; if (n) { 5 ++*pos; 5 return n; } iter_table: 5 ipv6_route_check_sernum(iter); 5 read_lock(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); read_unlock(&iter->tbl->tb6_lock); if 
(r > 0) { 5 if (v) 5 ++*pos; 5 return iter->w.leaf; 5 } else if (r < 0) { fib6_walker_unlink(&iter->w); return NULL; } 5 fib6_walker_unlink(&iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; 5 ipv6_route_seq_setup_walk(iter); goto iter_table; 5 } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU_BH) { 5 struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; 5 rcu_read_lock_bh(); 5 iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { 5 ipv6_route_seq_setup_walk(iter); 5 return ipv6_route_seq_next(seq, NULL, pos); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; 5 return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { 5 struct ipv6_route_iter *iter = seq->private; 5 if (ipv6_route_iter_active(iter)) 3 fib6_walker_unlink(&iter->w); 5 rcu_read_unlock_bh(); } static const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; int ipv6_route_open(struct inode *inode, struct file *file) { 8 return seq_open_net(inode, file, &ipv6_route_seq_ops, sizeof(struct ipv6_route_iter)); } #endif /* CONFIG_PROC_FS */
/*
 * Input Multitouch Library
 *
 * Copyright (c) 2008-2010 Henrik Rydberg
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/input/mt.h>
#include <linux/export.h>
#include <linux/slab.h>

#define TRKID_SGN	((TRKID_MAX + 1) >> 1)

/*
 * Copy the abs-axis description of @src to @dst (with fuzz cleared) and
 * mark @dst as supported.  Does nothing when the device has no absinfo
 * or does not support @src.
 */
static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src)
{
	if (dev->absinfo && test_bit(src, dev->absbit)) {
		dev->absinfo[dst] = dev->absinfo[src];
		dev->absinfo[dst].fuzz = 0;
		dev->absbit[BIT_WORD(dst)] |= BIT_MASK(dst);
	}
}

/**
 * input_mt_init_slots() - initialize MT input slots
 * @dev: input device supporting MT events and finger tracking
 * @num_slots: number of slots used by the device
 * @flags: mt tasks to handle in core
 *
 * This function allocates all necessary memory for MT slot handling
 * in the input device, prepares the ABS_MT_SLOT and
 * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers.
 * Depending on the flags set, it also performs pointer emulation and
 * frame synchronization.
 *
 * May be called repeatedly. Returns -EINVAL if attempting to
 * reinitialize with a different number of slots, or if @num_slots
 * exceeds 1024. Returns -ENOMEM if an allocation fails and 0 on
 * success (including the no-op case of @num_slots == 0).
 */
int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots,
			unsigned int flags)
{
	struct input_mt *mt = dev->mt;
	unsigned int i;

	if (!num_slots)
		return 0;
	if (mt)
		return mt->num_slots != num_slots ? -EINVAL : 0;

	/*
	 * Arbitrary limit: keeps the size computations below (which
	 * multiply by num_slots, and by num_slots squared for the
	 * tracking matrix) from overflowing or requesting an absurdly
	 * large allocation.
	 */
	if (num_slots > 1024)
		return -EINVAL;

	mt = kzalloc(sizeof(*mt) + num_slots * sizeof(*mt->slots), GFP_KERNEL);
	if (!mt)
		goto err_mem;

	mt->num_slots = num_slots;
	mt->flags = flags;
	input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0);
	input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0);

	if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) {
		__set_bit(EV_KEY, dev->evbit);
		__set_bit(BTN_TOUCH, dev->keybit);

		copy_abs(dev, ABS_X, ABS_MT_POSITION_X);
		copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y);
		copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE);
	}
	if (flags & INPUT_MT_POINTER) {
		__set_bit(BTN_TOOL_FINGER, dev->keybit);
		__set_bit(BTN_TOOL_DOUBLETAP, dev->keybit);
		if (num_slots >= 3)
			__set_bit(BTN_TOOL_TRIPLETAP, dev->keybit);
		if (num_slots >= 4)
			__set_bit(BTN_TOOL_QUADTAP, dev->keybit);
		if (num_slots >= 5)
			__set_bit(BTN_TOOL_QUINTTAP, dev->keybit);
		__set_bit(INPUT_PROP_POINTER, dev->propbit);
	}
	if (flags & INPUT_MT_DIRECT)
		__set_bit(INPUT_PROP_DIRECT, dev->propbit);
	if (flags & INPUT_MT_SEMI_MT)
		__set_bit(INPUT_PROP_SEMI_MT, dev->propbit);
	if (flags & INPUT_MT_TRACK) {
		unsigned int n2 = num_slots * num_slots;
		mt->red = kcalloc(n2, sizeof(*mt->red), GFP_KERNEL);
		if (!mt->red)
			goto err_mem;
	}

	/* Mark slots as 'inactive' */
	for (i = 0; i < num_slots; i++)
		input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1);

	/* Mark slots as 'unused' */
	mt->frame = 1;

	dev->mt = mt;
	return 0;
err_mem:
	/* kfree(NULL) is a no-op, so this also covers a failed kzalloc. */
	kfree(mt);
	return -ENOMEM;
}
EXPORT_SYMBOL(input_mt_init_slots);

/**
 * input_mt_destroy_slots() - frees the MT slots of the input device
 * @dev: input device with allocated MT slots
 *
 * This function is only needed in error path as the input core will
 * automatically free the MT slots when the device is destroyed.
*/ void input_mt_destroy_slots(struct input_dev *dev) { 18 if (dev->mt) { 1 kfree(dev->mt->red); kfree(dev->mt); } 18 dev->mt = NULL; } EXPORT_SYMBOL(input_mt_destroy_slots); /** * input_mt_report_slot_state() - report contact state * @dev: input device with allocated MT slots * @tool_type: the tool type to use in this slot * @active: true if contact is active, false otherwise * * Reports a contact via ABS_MT_TRACKING_ID, and optionally * ABS_MT_TOOL_TYPE. If active is true and the slot is currently * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. */ void input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; struct input_mt_slot *slot; int id; if (!mt) return; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); return; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); if (id < 0 || input_mt_get_value(slot, ABS_MT_TOOL_TYPE) != tool_type) id = input_mt_new_trkid(mt); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); } EXPORT_SYMBOL(input_mt_report_slot_state); /** * input_mt_report_finger_count() - report contact count * @dev: input device with allocated MT slots * @count: the number of contacts * * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, * BTN_TOOL_TRIPLETAP and BTN_TOOL_QUADTAP. * * The input core ensures only the KEY events already setup for * this device will produce output. 
*/ void input_mt_report_finger_count(struct input_dev *dev, int count) { input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1); input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2); input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3); input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4); input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5); } EXPORT_SYMBOL(input_mt_report_finger_count); /** * input_mt_report_pointer_emulation() - common pointer emulation * @dev: input device with allocated MT slots * @use_count: report number of active contacts as finger count * * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true. * * The input core ensures only the KEY and ABS axes already setup for * this device will produce output. */ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) { struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; if (!mt) return; oldest = NULL; oldid = mt->trkid; count = 0; for (i = 0; i < mt->num_slots; ++i) { struct input_mt_slot *ps = &mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id < 0) continue; if ((id - oldid) & TRKID_SGN) { oldest = ps; oldid = id; } count++; } input_event(dev, EV_KEY, BTN_TOUCH, count > 0); if (use_count) input_mt_report_finger_count(dev, count); if (oldest) { int x = input_mt_get_value(oldest, ABS_MT_POSITION_X); int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y); input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); input_event(dev, EV_ABS, ABS_PRESSURE, p); } } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); } } EXPORT_SYMBOL(input_mt_report_pointer_emulation); static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt) { int i; for (i = 0; i < mt->num_slots; i++) 
{ if (!input_mt_is_used(mt, &mt->slots[i])) { input_mt_slot(dev, i); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); } } } /** * input_mt_drop_unused() - Inactivate slots not seen in this frame * @dev: input device with allocated MT slots * * Lift all slots not seen since the last call to this function. */ void input_mt_drop_unused(struct input_dev *dev) { struct input_mt *mt = dev->mt; if (mt) { __input_mt_drop_unused(dev, mt); mt->frame++; } } EXPORT_SYMBOL(input_mt_drop_unused); /** * input_mt_sync_frame() - synchronize mt frame * @dev: input device with allocated MT slots * * Close the frame and prepare the internal state for a new one. * Depending on the flags, marks unused slots as inactive and performs * pointer emulation. */ void input_mt_sync_frame(struct input_dev *dev) { struct input_mt *mt = dev->mt; bool use_count = false; if (!mt) return; if (mt->flags & INPUT_MT_DROP_UNUSED) __input_mt_drop_unused(dev, mt); if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT)) use_count = true; input_mt_report_pointer_emulation(dev, use_count); mt->frame++; } EXPORT_SYMBOL(input_mt_sync_frame); static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; if (begin == end) return 0; f = *begin; p = begin + step; s = p == end ? 
f + 1 : *p; for (; p != end; p += step) if (*p < f) s = f, f = *p; else if (*p < s) s = *p; c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; /* Improve convergence for positive matrices by penalizing overcovers */ if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) *p -= c; return (c < s && s <= 0) || (f >= 0 && f < c); } static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, const struct input_mt_pos *pos, int num_pos, int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; int *w = mt->red; int x, y; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; x = input_mt_get_value(s, ABS_MT_POSITION_X); y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; *w++ = dx * dx + dy * dy - mu; } } return w - mt->red; } static void input_mt_set_slots(struct input_mt *mt, int *slots, int num_pos) { struct input_mt_slot *s; int *w = mt->red, j; for (j = 0; j != num_pos; j++) slots[j] = -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (w[j] < 0) { slots[j] = s - mt->slots; break; } } w += num_pos; } for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (slots[j] < 0) { slots[j] = s - mt->slots; break; } } } } /** * input_mt_assign_slots() - perform a best-match assignment * @dev: input device with allocated MT slots * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions * @dmax: maximum ABS_MT_POSITION 
displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * * The assignments are balanced so that all coordinate displacements are * below the euclidian distance dmax. If no such assignment can be found, * some contacts are assigned to unused slots. * * Returns zero on success, or negative error in case of failure. */ int input_mt_assign_slots(struct input_dev *dev, int *slots, const struct input_mt_pos *pos, int num_pos, int dmax) { struct input_mt *mt = dev->mt; int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) return -ENXIO; if (num_pos > mt->num_slots) return -EINVAL; if (num_pos < 1) return 0; nrc = input_mt_set_matrix(mt, pos, num_pos, mu); find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; } EXPORT_SYMBOL(input_mt_assign_slots); /** * input_mt_get_slot_by_key() - return slot matching key * @dev: input device with allocated MT slots * @key: the key of the sought slot * * Returns the slot of the given key, if it exists, otherwise * set the key on the first unused slot and return. * * If no available slot can be found, -1 is returned. * Note that for this function to work properly, input_mt_sync_frame() has * to be called at each frame. */ int input_mt_get_slot_by_key(struct input_dev *dev, int key) { struct input_mt *mt = dev->mt; struct input_mt_slot *s; if (!mt) return -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (input_mt_is_active(s) && s->key == key) return s - mt->slots; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) { s->key = key; return s - mt->slots; } return -1; } EXPORT_SYMBOL(input_mt_get_slot_by_key);
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H

#include <linux/kernel.h>

/*
 * This is largely generic for little-endian machines, but the
 * optimal byte mask counting is probably going to be something
 * that is architecture-specific. If you have a reliably fast
 * bit count instruction, that might be better than the multiply
 * and shift, for example.
 */
struct word_at_a_time {
	const unsigned long one_bits, high_bits;
};

/* Initializer: 0x01 repeated in every byte, 0x80 repeated in every byte */
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

#ifdef CONFIG_64BIT

/*
 * Jan Achrenius on G+: microoptimized version of
 * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
 * that works for the bytemasks without having to
 * mask them first.
 */
static inline long count_masked_bytes(unsigned long mask)
{
	return mask*0x0001020304050608ul >> 56;
}

#else	/* 32-bit case */

/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline long count_masked_bytes(long mask)
{
	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
	long a = (0x0ff0001+mask) >> 23;
	/* Fix the 1 for 00 case */
	return a & mask;
}

#endif

/*
 * Return nonzero if it has a zero
 *
 * The computed mask is also stored through @bits for the helpers below.
 */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
	unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
	*bits = mask;
	return mask;
}

/* Nothing to prepare in this implementation: @bits is returned unchanged */
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
	return bits;
}

/*
 * Turn the has_zero() bits into a byte mask: set every bit below the
 * lowest set bit, then shift right by 7.
 */
static inline unsigned long create_zero_mask(unsigned long bits)
{
	bits = (bits - 1) & ~bits;
	return bits >> 7;
}

/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)

/* Count the bytes covered by a mask produced by create_zero_mask() */
static inline unsigned long find_zero(unsigned long mask)
{
	return count_masked_bytes(mask);
}

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
*/ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret, dummy; 957 asm( "1:\tmov %2,%0\n" "2:\n" ".section .fixup,\"ax\"\n" "3:\t" "lea %2,%1\n\t" "and %3,%1\n\t" "mov (%1),%0\n\t" "leal %2,%%ecx\n\t" "andl %4,%%ecx\n\t" "shll $3,%%ecx\n\t" "shr %%cl,%0\n\t" "jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) :"=&r" (ret),"=&c" (dummy) :"m" (*(unsigned long *)addr), "i" (-sizeof(unsigned long)), "i" (sizeof(unsigned long)-1)); return ret; } #endif /* _ASM_WORD_AT_A_TIME_H */
#ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H #include <linux/genetlink.h> #include <net/netlink.h> #include <net/net_namespace.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family */ struct genl_multicast_group { char name[GENL_NAMSIZ]; }; struct genl_ops; struct genl_info; /** * struct genl_family - generic netlink family * @id: protocol family idenfitier * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't * synchronized by the core genetlink code * @pre_doit: called before an operation's doit callback, it may * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks * @mcast_bind: a socket bound to the given multicast group (which * is given as the offset into the groups array) * @mcast_unbind: a socket was unbound from the given multicast group. * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. 
* @attrbuf: buffer to store parsed attributes * @family_list: family list * @mcgrps: multicast groups used by this family (private) * @n_mcgrps: number of multicast groups (private) * @mcgrp_offset: starting number of multicast group IDs in this family * @ops: the operations supported by this family (private) * @n_ops: number of operations supported by this family (private) */ struct genl_family { unsigned int id; unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; bool netnsok; bool parallel_ops; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); struct nlattr ** attrbuf; /* private */ const struct genl_ops * ops; /* private */ const struct genl_multicast_group *mcgrps; /* private */ unsigned int n_ops; /* private */ unsigned int n_mcgrps; /* private */ unsigned int mcgrp_offset; /* private */ struct list_head family_list; /* private */ struct module *module; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @userhdr: user specific header * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers * @dst_sk: destination socket */ struct genl_info { u32 snd_seq; u32 snd_portid; struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; struct sock * dst_sk; }; static inline struct net *genl_info_net(struct genl_info *info) { 5 return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { write_pnet(&info->_net, net); } /** * struct genl_ops - generic netlink operations * @cmd: command identifier * 
@internal_flags: flags used by the family
 * @flags: flags
 * @policy: attribute validation policy
 * @doit: standard command callback
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 * @ops_list: operations list
 */
struct genl_ops {
	const struct nla_policy	*policy;
	int		       (*doit)(struct sk_buff *skb,
				       struct genl_info *info);
	int		       (*start)(struct netlink_callback *cb);
	int		       (*dumpit)(struct sk_buff *skb,
					 struct netlink_callback *cb);
	int		       (*done)(struct netlink_callback *cb);
	u8			cmd;
	u8			internal_flags;
	u8			flags;
};

int __genl_register_family(struct genl_family *family);

/*
 * Register @family with the calling module recorded as its owner.
 * Returns whatever __genl_register_family() returns.
 */
static inline int genl_register_family(struct genl_family *family)
{
	family->module = THIS_MODULE;
	return __genl_register_family(family);
}

/**
 * genl_register_family_with_ops - register a generic netlink family with ops
 * @family: generic netlink family
 * @ops: operations to be registered
 * @n_ops: number of elements to register
 *
 * Registers the specified family and operations from the specified table.
 * Only one family may be registered with the same family name or identifier.
 *
 * The family id may equal GENL_ID_GENERATE causing a unique id to
 * be automatically generated and assigned.
 *
 * Either a doit or dumpit callback must be specified for every registered
 * operation or the function will fail. Only one operation structure per
 * command identifier may be registered.
 *
 * See include/net/genetlink.h for more documentation on the operations
 * structure.
 *
 * Return 0 on success or a negative error code.
 */
static inline int
_genl_register_family_with_ops_grps(struct genl_family *family,
				    const struct genl_ops *ops, size_t n_ops,
				    const struct genl_multicast_group *mcgrps,
				    size_t n_mcgrps)
{
	family->module = THIS_MODULE;
	family->ops = ops;
	family->n_ops = n_ops;
	family->mcgrps = mcgrps;
	family->n_mcgrps = n_mcgrps;
	return __genl_register_family(family);
}

/*
 * Both wrappers below take the element counts with ARRAY_SIZE(), so @ops
 * and @grps must be real arrays, not pointers.
 */
#define genl_register_family_with_ops(family, ops)			\
	_genl_register_family_with_ops_grps((family),			\
					    (ops), ARRAY_SIZE(ops),	\
					    NULL, 0)
#define genl_register_family_with_ops_groups(family, ops, grps)	\
	_genl_register_family_with_ops_grps((family),			\
					    (ops), ARRAY_SIZE(ops),	\
					    (grps), ARRAY_SIZE(grps))

int genl_unregister_family(struct genl_family *family);
void genl_notify(struct genl_family *family,
		 struct sk_buff *skb, struct genl_info *info,
		 u32 group, gfp_t flags);

struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
				    gfp_t flags);
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
		  struct genl_family *family, int flags, u8 cmd);

/**
 * genlmsg_nlhdr - Obtain netlink header from user specified header
 * @user_hdr: user header as returned from genlmsg_put()
 * @family: generic netlink family
 *
 * Returns pointer to netlink header.
 */
static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr,
					     struct genl_family *family)
{
	/* step back over the family header, the genl header and the nl header */
	return (struct nlmsghdr *)((char *)user_hdr -
				   family->hdrsize -
				   GENL_HDRLEN -
				   NLMSG_HDRLEN);
}

/**
 * genlmsg_parse - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 *
 * Returns the result of nlmsg_parse(), called with the family header and
 * the generic netlink header accounted for as header length.
 */
static inline int genlmsg_parse(const struct nlmsghdr *nlh,
				const struct genl_family *family,
				struct nlattr *tb[], int maxtype,
				const struct nla_policy *policy)
{
	return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
			   policy);
}

/**
 * genl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @user_hdr: user header as returned from genlmsg_put()
 * @family: generic netlink family
 *
 * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it
 * simpler to use with generic netlink.
*/ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr, struct genl_family *family) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family)); } /** * genlmsg_put_reply - Add generic netlink header to a reply message * @skb: socket buffer holding the message * @info: receiver info * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, flags, cmd); } /** * genlmsg_end - Finalize a generic netlink message * @skb: socket buffer the message is stored in * @hdr: user specific header */ static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { 9 nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_cancel - Cancel construction of a generic netlink message * @skb: socket buffer the message is stored in * @hdr: generic netlink message header */ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) { if (hdr) 2 nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast_netns(struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); } /** * genlmsg_multicast - multicast a netlink message to the default netns * @family: the generic netlink family * 
@skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast(struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns(family, &init_net, skb, portid, group, flags); } /** * genlmsg_multicast_allns - multicast a netlink message to all net namespaces * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags); /** * genlmsg_unicast - unicast a netlink message * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid) { 163 return nlmsg_unicast(net->genl_sock, skb, portid); } /** * genlmsg_reply - reply to a request * @skb: netlink message to be sent back * @info: receiver information */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { 163 return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid); } /** * gennlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) { return ((unsigned char *) gnlh + GENL_HDRLEN); } /** * genlmsg_len - length of message payload * @gnlh: genetlink message header */ static inline int genlmsg_len(const struct genlmsghdr *gnlh) { struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh - NLMSG_HDRLEN); return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_msg_size - length of genetlink message not including padding * @payload: 
length of message payload */ static inline int genlmsg_msg_size(int payload) { return GENL_HDRLEN + payload; } /** * genlmsg_total_size - length of genetlink message including padding * @payload: length of message payload */ static inline int genlmsg_total_size(int payload) { return NLMSG_ALIGN(genlmsg_msg_size(payload)); } /** * genlmsg_new - Allocate a new generic netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. */ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) { return nlmsg_new(genlmsg_total_size(payload), flags); } /** * genl_set_err - report error to genetlink broadcast listeners * @family: the generic netlink family * @net: the network namespace to report the error to * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * (this is the offset of the multicast group in the groups array) * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ static inline int genl_set_err(struct genl_family *family, struct net *net, u32 portid, u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_set_err(net->genl_sock, portid, group, code); } static inline int genl_has_listeners(struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_has_listeners(net->genl_sock, group); } #endif /* __NET_GENERIC_NETLINK_H */
#ifndef __NET_RTNETLINK_H #define __NET_RTNETLINK_H #include <linux/rtnetlink.h> #include <net/netlink.h> typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); int __rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func); void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func); int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) 8 return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family; else return AF_UNSPEC; } /** * struct rtnl_link_ops - rtnetlink link operations * * @list: Used internally * @kind: Identifier * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters * @priv_size: sizeof net_device private space * @setup: net_device setup function * @newlink: Function for configuring and registering a new device * @changelink: Function for changing parameters of an existing device * @dellink: Function to remove a device * @get_size: Function to calculate required room for dumping device * specific netlink attributes * @fill_info: Function to dump device specific netlink attributes * @get_xstats_size: Function to calculate required room for dumping device * specific statistics * @fill_xstats: Function to dump device specific statistics * @get_num_tx_queues: Function to determine number of transmit queues * to create when creating a new device. * @get_num_rx_queues: Function to determine number of receive queues * to create when creating a new device. 
* @get_link_net: Function to get the i/o netns of the device
 */
struct rtnl_link_ops {
	struct list_head	list;

	const char		*kind;

	size_t			priv_size;
	void			(*setup)(struct net_device *dev);

	int			maxtype;
	const struct nla_policy	*policy;
	int			(*validate)(struct nlattr *tb[],
					    struct nlattr *data[]);

	int			(*newlink)(struct net *src_net,
					   struct net_device *dev,
					   struct nlattr *tb[],
					   struct nlattr *data[]);
	int			(*changelink)(struct net_device *dev,
					      struct nlattr *tb[],
					      struct nlattr *data[]);
	void			(*dellink)(struct net_device *dev,
					   struct list_head *head);

	size_t			(*get_size)(const struct net_device *dev);
	int			(*fill_info)(struct sk_buff *skb,
					     const struct net_device *dev);

	size_t			(*get_xstats_size)(const struct net_device *dev);
	int			(*fill_xstats)(struct sk_buff *skb,
					       const struct net_device *dev);
	unsigned int		(*get_num_tx_queues)(void);
	unsigned int		(*get_num_rx_queues)(void);

	/*
	 * NOTE(review): the slave_* members below are not described in the
	 * kernel-doc header of this structure; they look like the
	 * slave-device counterparts of maxtype/policy/validate/changelink/
	 * get_size/fill_info - confirm against the rtnetlink core before
	 * relying on that.
	 */
	int			slave_maxtype;
	const struct nla_policy	*slave_policy;
	int			(*slave_validate)(struct nlattr *tb[],
						  struct nlattr *data[]);
	int			(*slave_changelink)(struct net_device *dev,
						    struct net_device *slave_dev,
						    struct nlattr *tb[],
						    struct nlattr *data[]);
	size_t			(*get_slave_size)(const struct net_device *dev,
						  const struct net_device *slave_dev);
	int			(*fill_slave_info)(struct sk_buff *skb,
						   const struct net_device *dev,
						   const struct net_device *slave_dev);
	struct net		*(*get_link_net)(const struct net_device *dev);
};

int __rtnl_link_register(struct rtnl_link_ops *ops);
void __rtnl_link_unregister(struct rtnl_link_ops *ops);

int rtnl_link_register(struct rtnl_link_ops *ops);
void rtnl_link_unregister(struct rtnl_link_ops *ops);

/**
 * 	struct rtnl_af_ops - rtnetlink address family operations
 *
 *	@list: Used internally
 * 	@family: Address family
 * 	@fill_link_af: Function to fill IFLA_AF_SPEC with address family
 * 		       specific netlink attributes.
 * 	@get_link_af_size: Function to calculate size of address family specific
 * 			   netlink attributes.
 *	@validate_link_af: Validate an IFLA_AF_SPEC attribute, must check attr
 *			   for invalid configuration settings.
 * 	@set_link_af: Function to parse an IFLA_AF_SPEC attribute and modify
 *		      net_device accordingly.
 */
struct rtnl_af_ops {
	struct list_head	list;
	int			family;

	int			(*fill_link_af)(struct sk_buff *skb,
						const struct net_device *dev,
						u32 ext_filter_mask);
	size_t			(*get_link_af_size)(const struct net_device *dev,
						    u32 ext_filter_mask);

	int			(*validate_link_af)(const struct net_device *dev,
						    const struct nlattr *attr);
	int			(*set_link_af)(struct net_device *dev,
					       const struct nlattr *attr);
};

void __rtnl_af_unregister(struct rtnl_af_ops *ops);

void rtnl_af_register(struct rtnl_af_ops *ops);
void rtnl_af_unregister(struct rtnl_af_ops *ops);

struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
struct net_device *rtnl_create_link(struct net *net, const char *ifname,
				    unsigned char name_assign_type,
				    const struct rtnl_link_ops *ops,
				    struct nlattr *tb[]);
int rtnl_delete_link(struct net_device *dev);
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);

int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len);

/* Builds the module alias "rtnl-link-<kind>"; @kind must be a string literal */
#define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)

#endif
/* * linux/kernel/time.c * * Copyright (C) 1991, 1992 Linus Torvalds * * This file contains the interface functions for the various * time related system calls: time, stime, gettimeofday, settimeofday, * adjtime */ /* * Modification history kernel/time.c * * 1993-09-02 Philip Gladstone * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe * kernel PLL updated to 1994-12-13 specs (rfc-1589) * 1999-01-16 Ulrich Windl * Introduced error checking for many cases in adjtimex(). * Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) * (Even though the technical memorandum forbids it) * 2004-07-14 Christoph Lameter * Added getnstimeofday to allow the posix timer functions to return * with nanosecond accuracy */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/math64.h> #include <linux/ptrace.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <generated/timeconst.h> #include "timekeeping.h" /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. */ struct timezone sys_tz; EXPORT_SYMBOL(sys_tz); #ifdef __ARCH_WANT_SYS_TIME /* * sys_time() can be implemented in user-level using * sys_gettimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). 
*/ SYSCALL_DEFINE1(time, time_t __user *, tloc) { time_t i = get_seconds(); if (tloc) { if (put_user(i,tloc)) return -EFAULT; } force_successful_syscall_return(); return i; } /* * sys_stime() can be implemented in user-level using * sys_settimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). */ SYSCALL_DEFINE1(stime, time_t __user *, tptr) { struct timespec tv; int err; if (get_user(tv.tv_sec, tptr)) return -EFAULT; tv.tv_nsec = 0; err = security_settime(&tv, NULL); if (err) return err; do_settimeofday(&tv); return 0; } #endif /* __ARCH_WANT_SYS_TIME */ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); if (copy_to_user(tv, &ktv, sizeof(ktv))) return -EFAULT; } if (unlikely(tz != NULL)) { if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) return -EFAULT; } return 0; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. 
*/ static inline void warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * In case for some reason the CMOS clock has not already been running * in UTC, but in some local time: The first time we set the timezone, * we will warp the clock so that it is ticking UTC time instead of * local time. Presumably, if someone is setting the timezone then we * are running in an environment where the programs understand about * timezones. This should be done at boot time in the /etc/rc script, * as soon as possible, so that the clock can be set right. Otherwise, * various programs will get confused when the clock gets warped. */ 3 int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; 3 if (tv && !timespec_valid(tv)) return -EINVAL; 1 error = security_settime(tv, tz); if (error) return error; if (tz) { /* Verify we're witin the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { firsttime = 0; if (!tv) warp_clock(); } } if (tv) 3 return do_settimeofday(tv); return 0; } SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { struct timeval user_tv; struct timespec new_ts; struct timezone new_tz; if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; if (!timeval_valid(&user_tv)) return -EINVAL; new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. 
But bear in mind that the structures * may change */ if(copy_from_user(&txc, txc_p, sizeof(struct timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } /** * current_fs_time - Return FS time * @sb: Superblock. * * Return the current time truncated to the time granularity supported by * the fs. */ struct timespec current_fs_time(struct super_block *sb) { 2308 struct timespec now = current_kernel_time(); return timespec_trunc(now, sb->s_time_gran); } EXPORT_SYMBOL(current_fs_time); /* * Convert jiffies to milliseconds and back. * * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 494 return (MSEC_PER_SEC / HZ) * j; #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> HZ_TO_MSEC_SHR32; # else return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); unsigned int jiffies_to_usecs(const unsigned long j) { /* * Hz usually doesn't go much further MSEC_PER_SEC. * jiffies_to_usecs() and usecs_to_jiffies() depend on that. */ BUILD_BUG_ON(HZ > USEC_PER_SEC); #if !(USEC_PER_SEC % HZ) 533 return (USEC_PER_SEC / HZ) * j; #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; # else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; # endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); /** * timespec_trunc - Truncate timespec to a granularity * @t: Timespec * @gran: Granularity in ns. * * Truncate a timespec to a granularity. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ 2308 struct timespec timespec_trunc(struct timespec t, unsigned gran) { /* Avoid division in the common cases 1 ns and 1 s. 
*/ if (gran == 1) { /* nothing */ } else if (gran == NSEC_PER_SEC) { t.tv_nsec = 0; } else if (gran > 1 && gran < NSEC_PER_SEC) { t.tv_nsec -= t.tv_nsec % gran; } else { WARN(1, "illegal file time granularity: %u", gran); } 2308 return t; } EXPORT_SYMBOL(timespec_trunc); /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { unsigned int mon = mon0, year = year0; /* 1..12 -> 11,12,1..10 */ 12 if (0 >= (int) (mon -= 2)) { 2 mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } return ((((time64_t) 12 (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 )*24 + hour /* now have hours */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } EXPORT_SYMBOL(mktime64); /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) { 215 while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. 
See * also __iter_div_u64_rem() in include/linux/time.h */ 98 asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } 220 while (nsec < 0) { 175 asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } 215 ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec); /** * ns_to_timespec - Convert nanoseconds to timespec * @nsec: the nanoseconds value to be converted * * Returns the timespec representation of the nsec parameter. */ 176 struct timespec ns_to_timespec(const s64 nsec) { struct timespec ts; s32 rem; 179 if (!nsec) 36 return (struct timespec) {0, 0}; 156 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); if (unlikely(rem < 0)) { ts.tv_sec--; 2 rem += NSEC_PER_SEC; } 176 ts.tv_nsec = rem; return ts; } EXPORT_SYMBOL(ns_to_timespec); /** * ns_to_timeval - Convert nanoseconds to timeval * @nsec: the nanoseconds value to be converted * * Returns the timeval representation of the nsec parameter. */ struct timeval ns_to_timeval(const s64 nsec) { 36 struct timespec ts = ns_to_timespec(nsec); struct timeval tv; tv.tv_sec = ts.tv_sec; tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; 36 return tv; } EXPORT_SYMBOL(ns_to_timeval); #if BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. 
See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } EXPORT_SYMBOL(set_normalized_timespec64); /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ struct timespec64 ns_to_timespec64(const s64 nsec) { struct timespec64 ts; s32 rem; if (!nsec) return (struct timespec64) {0, 0}; ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); if (unlikely(rem < 0)) { ts.tv_sec--; rem += NSEC_PER_SEC; } ts.tv_nsec = rem; return ts; } EXPORT_SYMBOL(ns_to_timespec64); #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. 
* the _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h */ unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ 828 if ((int)m < 0) return MAX_JIFFY_OFFSET; 828 return _msecs_to_jiffies(m); } EXPORT_SYMBOL(__msecs_to_jiffies); unsigned long __usecs_to_jiffies(const unsigned int u) { 411 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; 411 return _usecs_to_jiffies(u); } EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ static unsigned long __timespec64_to_jiffies(u64 sec, long nsec) { 10 nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; nsec = 0; } return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } static unsigned long __timespec_to_jiffies(unsigned long sec, long nsec) { return __timespec64_to_jiffies((u64)sec, nsec); } unsigned long timespec64_to_jiffies(const struct timespec64 *value) { 10 return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); } EXPORT_SYMBOL(timespec64_to_jiffies); void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. 
*/ u32 rem; 15 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec64); /* * We could use a similar algorithm to timespec_to_jiffies (with a * different multiplier for usec instead of nsec). But this has a * problem with rounding: we can't exactly add TICK_NSEC - 1 to the * usec value, since it's not necessarily integral. * * We could instead round in the intermediate scaled representation * (i.e. in units of 1/2^(large scale) jiffies) but that's also * perilous: the scaling introduces a small positive error, which * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 * units to the intermediate before shifting) leads to accidental * overflow and overestimates. * * At the cost of one additional multiplication by a constant, just * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { 7 return __timespec_to_jiffies(value->tv_sec, 7 value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. */ u32 rem; 24 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, NSEC_PER_SEC, &rem); value->tv_usec = rem / NSEC_PER_USEC; } EXPORT_SYMBOL(jiffies_to_timeval); /* * Convert jiffies/jiffies_64 to clock_t and back. */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ return x * (USER_HZ / HZ); # else 667 return x / (HZ / USER_HZ); # endif #else return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); unsigned long clock_t_to_jiffies(unsigned long x) 4 { #if (HZ % USER_HZ)==0 if (x >= ~0UL / (HZ / USER_HZ)) return ~0UL; return x * (HZ / USER_HZ); #else /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. 
but do try to contain it here */ return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif #else /* * There are better ways that don't overflow early, * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif 2 return x; } EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 10 return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif } /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 60 return div_u64(n, NSEC_PER_SEC / HZ); #elif (HZ % 512) == 0 /* overflow after 292 years if HZ = 1024 */ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); #else /* * Generic case - optimized for cases where HZ is a multiple of 3. * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. 
*/ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies * * @n: nsecs in u64 * * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. * And this doesn't return MAX_JIFFY_OFFSET since this function is designed * for scheduler, not for use in device drivers to calculate timeout value. * * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ unsigned long nsecs_to_jiffies(u64 n) { 60 return (unsigned long)nsecs_to_jiffies64(n); } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); /* * Add two timespec values and do a safety check for overflow. * It's assumed that both values are valid (>= 0) */ struct timespec timespec_add_safe(const struct timespec lhs, 212 const struct timespec rhs) { struct timespec res; 212 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); 212 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) res.tv_sec = TIME_T_MAX; 212 return res; }
/* * ioctl32.c: Conversion between 32bit and 64bit native ioctls. * * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) * * These routines maintain argument size conversion between 32bit and 64bit * ioctls. */ #include <linux/joystick.h> #include <linux/types.h> #include <linux/compat.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/ioctl.h> #include <linux/if.h> #include <linux/if_bridge.h> #include <linux/raid/md_u.h> #include <linux/kd.h> #include <linux/route.h> #include <linux/in6.h> #include <linux/ipv6_route.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/vt.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> #include <linux/mtio.h> #include <linux/auto_fs.h> #include <linux/auto_fs4.h> #include <linux/tty.h> #include <linux/vt_kern.h> #include <linux/fb.h> #include <linux/videodev2.h> #include <linux/netdevice.h> #include <linux/raw.h> #include <linux/blkdev.h> #include <linux/elevator.h> #include <linux/rtc.h> #include <linux/pci.h> #include <linux/serial.h> #include <linux/if_tun.h> #include <linux/ctype.h> #include <linux/syscalls.h> #include <linux/i2c.h> #include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> #include <linux/gigaset_dev.h> #ifdef CONFIG_BLOCK #include <linux/cdrom.h> #include <linux/fd.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> #endif #include <asm/uaccess.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/if_bonding.h> #include <linux/watchdog.h> #include 
<linux/soundcard.h> #include <linux/lp.h> #include <linux/ppdev.h> #include <linux/atm.h> #include <linux/atmarp.h> #include <linux/atmclip.h> #include <linux/atmdev.h> #include <linux/atmioc.h> #include <linux/atmlec.h> #include <linux/atmmpc.h> #include <linux/atmsvc.h> #include <linux/atm_tcp.h> #include <linux/sonet.h> #include <linux/atm_suni.h> #include <linux/usb.h> #include <linux/usbdevice_fs.h> #include <linux/nbd.h> #include <linux/random.h> #include <linux/filter.h> #include <linux/hiddev.h> #define __DVB_CORE__ #include <linux/dvb/audio.h> #include <linux/dvb/dmx.h> #include <linux/dvb/frontend.h> #include <linux/dvb/video.h> #include <linux/sort.h> #ifdef CONFIG_SPARC #include <asm/fbio.h> #endif static int w_long(unsigned int fd, unsigned int cmd, compat_ulong_t __user *argp) { 1 mm_segment_t old_fs = get_fs(); int err; unsigned long val; set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); 3 if (!err && put_user(val, argp)) return -EFAULT; return err; } struct compat_video_event { int32_t type; compat_time_t timestamp; union { video_size_t size; unsigned int frame_rate; } u; }; static int do_video_get_event(unsigned int fd, unsigned int cmd, struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); int err; set_fs(KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long) &kevent); set_fs(old_fs); if (!err) { err = put_user(kevent.type, &up->type); err |= put_user(kevent.timestamp, &up->timestamp); err |= put_user(kevent.u.size.w, &up->u.size.w); err |= put_user(kevent.u.size.h, &up->u.size.h); err |= put_user(kevent.u.size.aspect_ratio, &up->u.size.aspect_ratio); if (err) err = -EFAULT; } 8 return err; } struct compat_video_still_picture { compat_uptr_t iFrame; int32_t size; }; static int do_video_stillpicture(unsigned int fd, unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; compat_uptr_t fp; int32_t size; int err; 3 err 
= get_user(fp, &up->iFrame); err |= get_user(size, &up->size); if (err) return -EFAULT; up_native = 2 compat_alloc_user_space(sizeof(struct video_still_picture)); err = put_user(compat_ptr(fp), &up_native->iFrame); err |= put_user(size, &up_native->size); if (err) return -EFAULT; 2 err = sys_ioctl(fd, cmd, (unsigned long) up_native); return err; } struct compat_video_spu_palette { int length; compat_uptr_t palette; }; static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; compat_uptr_t palp; int length, err; err = get_user(palp, &up->palette); err |= get_user(length, &up->length); if (err) return -EFAULT; up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); err = put_user(compat_ptr(palp), &up_native->palette); err |= put_user(length, &up_native->length); if (err) return -EFAULT; err = sys_ioctl(fd, cmd, (unsigned long) up_native); return err; } #ifdef CONFIG_BLOCK typedef struct sg_io_hdr32 { compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ compat_int_t dxfer_direction; /* [i] data transfer direction */ unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ unsigned char mx_sb_len; /* [i] max length to write to sbp */ unsigned short iovec_count; /* [i] 0 implies no scatter gather */ compat_uint_t dxfer_len; /* [i] byte count of data transfer */ compat_uint_t dxferp; /* [i], [*io] points to data transfer memory or scatter gather list */ compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... 
*/ compat_int_t pack_id; /* [i->o] unused internally (normally) */ compat_uptr_t usr_ptr; /* [i->o] unused internally */ unsigned char status; /* [o] scsi status */ unsigned char masked_status; /* [o] shifted, masked scsi status */ unsigned char msg_status; /* [o] messaging level data (optional) */ unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ unsigned short host_status; /* [o] errors from host adapter */ unsigned short driver_status; /* [o] errors from software driver */ compat_int_t resid; /* [o] dxfer_len - actual_transferred */ compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ compat_uint_t info; /* [o] auxiliary information */ } sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ typedef struct sg_iovec32 { compat_uint_t iov_base; compat_uint_t iov_len; } sg_iovec32_t; static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) { sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); sg_iovec32_t __user *iov32 = dxferp; int i; for (i = 0; i < iovec_count; i++) { u32 base, len; 1 if (get_user(base, &iov32[i].iov_base) || get_user(len, &iov32[i].iov_len) || put_user(compat_ptr(base), &iov[i].iov_base) || put_user(len, &iov[i].iov_len)) return -EFAULT; } if (put_user(iov, &sgio->dxferp)) return -EFAULT; return 0; } static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; u16 iovec_count; u32 data; void __user *dxferp; int err; int interface_id; 4 if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; 4 if (interface_id != 'S') 1 return sys_ioctl(fd, cmd, (unsigned long)sgio32); 3 if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; { 3 void __user *top = compat_alloc_user_space(0); void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + (iovec_count * sizeof(sg_iovec_t))); if (new > top) return -EINVAL; sgio = new; } /* Ok, now construct. 
*/ 3 if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, (2 * sizeof(int)) + (2 * sizeof(unsigned char)) + (1 * sizeof(unsigned short)) + (1 * sizeof(unsigned int)))) return -EFAULT; 2 if (get_user(data, &sgio32->dxferp)) return -EFAULT; 2 dxferp = compat_ptr(data); if (iovec_count) { 1 if (sg_build_iovec(sgio, dxferp, iovec_count)) return -EFAULT; } else { 1 if (put_user(dxferp, &sgio->dxferp)) return -EFAULT; } { unsigned char __user *cmdp; unsigned char __user *sbp; 1 if (get_user(data, &sgio32->cmdp)) return -EFAULT; 1 cmdp = compat_ptr(data); if (get_user(data, &sgio32->sbp)) return -EFAULT; 1 sbp = compat_ptr(data); if (put_user(cmdp, &sgio->cmdp) || 1 put_user(sbp, &sgio->sbp)) return -EFAULT; } 1 if (copy_in_user(&sgio->timeout, &sgio32->timeout, 3 * sizeof(int))) return -EFAULT; 1 if (get_user(data, &sgio32->usr_ptr)) return -EFAULT; 1 if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; 1 err = sys_ioctl(fd, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, sizeof(int)) || get_user(datap, &sgio->usr_ptr) || put_user((u32)(unsigned long)datap, &sgio32->usr_ptr) || copy_in_user(&sgio32->status, &sgio->status, (4 * sizeof(unsigned char)) + (2 * sizeof(unsigned short)) + (3 * sizeof(int)))) err = -EFAULT; } return err; } struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ char req_state; char orphan; char sg_io_owned; char problem; int pack_id; compat_uptr_t usr_ptr; unsigned int duration; int unused; }; static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { void __user *ptr; int d; if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || get_user(ptr, &r[i].usr_ptr) || get_user(d, &r[i].duration) || 
put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || put_user(d, &o[i].duration)) return -EFAULT; } return err; } #endif /* CONFIG_BLOCK */ struct sock_fprog32 { unsigned short len; compat_caddr_t filter; }; #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { 8 struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; u32 fptr32; u16 flen; if (get_user(flen, &u_fprog32->len) || 8 get_user(fptr32, &u_fprog32->filter)) return -EFAULT; 8 fptr64 = compat_ptr(fptr32); if (put_user(flen, &u_fprog64->len) || 8 put_user(fptr64, &u_fprog64->filter)) return -EFAULT; 8 if (cmd == PPPIOCSPASS32) cmd = PPPIOCSPASS; else cmd = PPPIOCSACTIVE; 8 return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { compat_caddr_t ptr; u32 length; compat_int_t transmit; }; #define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) struct ppp_idle32 { compat_time_t xmit_idle; compat_time_t recv_idle; }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) static int ppp_gidle(unsigned int fd, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; __kernel_time_t xmit, recv; int err; 1 idle = compat_alloc_user_space(sizeof(*idle)); err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); 1 if (!err) { if (get_user(xmit, &idle->xmit_idle) || get_user(recv, &idle->recv_idle) || put_user(xmit, &idle32->xmit_idle) || put_user(recv, &idle32->recv_idle)) err = -EFAULT; } return err; } static int ppp_scompress(unsigned int fd, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; __u32 data; void __user *datap; 2 odata = compat_alloc_user_space(sizeof(*odata)); if (get_user(data, &odata32->ptr)) return -EFAULT; 2 datap = compat_ptr(data); if (put_user(datap, 
&odata->ptr)) return -EFAULT; 2 if (copy_in_user(&odata->length, &odata32->length, sizeof(__u32) + sizeof(int))) return -EFAULT; 2 return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK struct mtget32 { compat_long_t mt_type; compat_long_t mt_resid; compat_long_t mt_dsreg; compat_long_t mt_gstat; compat_long_t mt_erreg; compat_daddr_t mt_fileno; compat_daddr_t mt_blkno; }; #define MTIOCGET32 _IOR('m', 2, struct mtget32) struct mtpos32 { compat_long_t mt_blkno; }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) { 2 mm_segment_t old_fs = get_fs(); struct mtget get; struct mtget32 __user *umget32; struct mtpos pos; struct mtpos32 __user *upos32; unsigned long kcmd; void *karg; int err = 0; switch(cmd) { case MTIOCPOS32: kcmd = MTIOCPOS; karg = &pos; break; default: /* MTIOCGET32 */ kcmd = MTIOCGET; karg = &get; break; } 2 set_fs (KERNEL_DS); err = sys_ioctl (fd, kcmd, (unsigned long)karg); set_fs (old_fs); if (err) return err; switch (cmd) { case MTIOCPOS32: upos32 = argp; err = __put_user(pos.mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: umget32 = argp; err = __put_user(get.mt_type, &umget32->mt_type); err |= __put_user(get.mt_resid, &umget32->mt_resid); err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); err |= __put_user(get.mt_gstat, &umget32->mt_gstat); err |= __put_user(get.mt_erreg, &umget32->mt_erreg); err |= __put_user(get.mt_fileno, &umget32->mt_fileno); err |= __put_user(get.mt_blkno, &umget32->mt_blkno); break; } return err ? 
-EFAULT: 0; } #endif /* CONFIG_BLOCK */ /* Bluetooth ioctls */ #define HCIUARTSETPROTO _IOW('U', 200, int) #define HCIUARTGETPROTO _IOR('U', 201, int) #define HCIUARTGETDEVICE _IOR('U', 202, int) #define HCIUARTSETFLAGS _IOW('U', 203, int) #define HCIUARTGETFLAGS _IOR('U', 204, int) #define BNEPCONNADD _IOW('B', 200, int) #define BNEPCONNDEL _IOW('B', 201, int) #define BNEPGETCONNLIST _IOR('B', 210, int) #define BNEPGETCONNINFO _IOR('B', 211, int) #define BNEPGETSUPPFEAT _IOR('B', 212, int) #define CMTPCONNADD _IOW('C', 200, int) #define CMTPCONNDEL _IOW('C', 201, int) #define CMTPGETCONNLIST _IOR('C', 210, int) #define CMTPGETCONNINFO _IOR('C', 211, int) #define HIDPCONNADD _IOW('H', 200, int) #define HIDPCONNDEL _IOW('H', 201, int) #define HIDPGETCONNLIST _IOR('H', 210, int) #define HIDPGETCONNINFO _IOR('H', 211, int) struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char[1]; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... 
*/ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved[1]; }; static int serial_struct_ioctl(unsigned fd, unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; int err; struct serial_struct ss; 9 mm_segment_t oldseg = get_fs(); __u32 udata; unsigned int base; if (cmd == TIOCSSERIAL) { 8 if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) return -EFAULT; 8 if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) return -EFAULT; 7 if (__get_user(udata, &ss32->iomem_base)) return -EFAULT; 7 ss.iomem_base = compat_ptr(udata); if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || 7 __get_user(ss.port_high, &ss32->port_high)) return -EFAULT; 7 ss.iomap_base = 0UL; } 8 set_fs(KERNEL_DS); err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); set_fs(oldseg); 1 if (cmd == TIOCGSERIAL && err >= 0) { if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) return -EFAULT; if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) return -EFAULT; base = (unsigned long)ss.iomem_base >> 32 ? 
0xffffffff : (unsigned)(unsigned long)ss.iomem_base; if (__put_user(base, &ss32->iomem_base) || __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || __put_user(ss.port_high, &ss32->port_high)) return -EFAULT; } return err; } /* * I2C layer ioctls */ struct i2c_msg32 { u16 addr; u16 flags; u16 len; compat_caddr_t buf; }; struct i2c_rdwr_ioctl_data32 { compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ u32 nmsgs; }; struct i2c_smbus_ioctl_data32 { u8 read_write; u8 command; u32 size; compat_caddr_t data; /* union i2c_smbus_data *data */ }; struct i2c_rdwr_aligned { struct i2c_rdwr_ioctl_data cmd; struct i2c_msg msgs[0]; }; static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; u32 nmsgs; int i; 6 if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; 5 if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) return -EINVAL; 4 if (get_user(datap, &udata->msgs)) return -EFAULT; umsgs = compat_ptr(datap); tdata = compat_alloc_user_space(sizeof(*tdata) + nmsgs * sizeof(struct i2c_msg)); tmsgs = &tdata->msgs[0]; if (put_user(nmsgs, &tdata->cmd.nmsgs) || 4 put_user(tmsgs, &tdata->cmd.msgs)) return -EFAULT; 4 for (i = 0; i < nmsgs; i++) { 3 if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) return -EFAULT; 2 if (get_user(datap, &umsgs[i].buf) || 2 put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } 3 return sys_ioctl(fd, cmd, (unsigned long)tdata); } static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; compat_caddr_t datap; 2 tdata = compat_alloc_user_space(sizeof(*tdata)); if (tdata == NULL) return -ENOMEM; 2 if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) return -EFAULT; 2 if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) return -EFAULT; 2 if (__copy_in_user(&tdata->read_write, 
&udata->read_write, 2 * sizeof(u8))) return -EFAULT; 2 if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32))) return -EFAULT; 2 if (__get_user(datap, &udata->data) || 2 __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; 2 return sys_ioctl(fd, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) { 5 mm_segment_t oldfs = get_fs(); compat_ulong_t val32; unsigned long kval; int ret; switch (cmd) { case RTC_IRQP_READ32: case RTC_EPOCH_READ32: set_fs(KERNEL_DS); 2 ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, (unsigned long)&kval); set_fs(oldfs); 1 if (ret) return ret; 1 val32 = kval; return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: 2 return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: 1 return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; } /* on ia32 l_start is on a 32-bit boundary */ #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) struct space_resv_32 { __s16 l_type; __s16 l_whence; __s64 l_start __attribute__((packed)); /* len == 0 means until end of file */ __s64 l_len __attribute__((packed)); __s32 l_sysid; __u32 l_pid; __s32 l_pad[4]; /* reserve area */ }; #define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) #define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, struct space_resv_32 __user *p32) { struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 180 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || 180 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || 180 
copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || 180 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || 180 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || 180 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) return -EFAULT; 181 return ioctl_preallocate(file, p); } #endif /* * simple reversible transform to make our table more evenly * distributed after sorting. */ #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) #define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs call it on others too. - The ioctl is not implemented in the native kernel, but programs call it commonly anyways. Most other reasons are not valid. */ #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) static unsigned int ioctl_pointer[] = { /* compatible ioctls first */ COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ /* Big T */ COMPATIBLE_IOCTL(TCGETA) COMPATIBLE_IOCTL(TCSETA) COMPATIBLE_IOCTL(TCSETAW) COMPATIBLE_IOCTL(TCSETAF) COMPATIBLE_IOCTL(TCSBRK) COMPATIBLE_IOCTL(TCXONC) COMPATIBLE_IOCTL(TCFLSH) COMPATIBLE_IOCTL(TCGETS) COMPATIBLE_IOCTL(TCSETS) COMPATIBLE_IOCTL(TCSETSW) COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) COMPATIBLE_IOCTL(TIOCGDEV) COMPATIBLE_IOCTL(TIOCCBRK) COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) COMPATIBLE_IOCTL(TIOCGPKT) COMPATIBLE_IOCTL(TIOCGPTLCK) COMPATIBLE_IOCTL(TIOCGEXCL) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) COMPATIBLE_IOCTL(TIOCSETD) COMPATIBLE_IOCTL(TIOCEXCL) COMPATIBLE_IOCTL(TIOCNXCL) COMPATIBLE_IOCTL(TIOCCONS) COMPATIBLE_IOCTL(TIOCGSOFTCAR) COMPATIBLE_IOCTL(TIOCSSOFTCAR) COMPATIBLE_IOCTL(TIOCSWINSZ) COMPATIBLE_IOCTL(TIOCGWINSZ) COMPATIBLE_IOCTL(TIOCMGET) COMPATIBLE_IOCTL(TIOCMBIC) COMPATIBLE_IOCTL(TIOCMBIS) 
COMPATIBLE_IOCTL(TIOCMSET) COMPATIBLE_IOCTL(TIOCPKT) COMPATIBLE_IOCTL(TIOCNOTTY) COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) COMPATIBLE_IOCTL(TIOCSIG) #ifdef TIOCSRS485 COMPATIBLE_IOCTL(TIOCSRS485) #endif #ifdef TIOCGRS485 COMPATIBLE_IOCTL(TIOCGRS485) #endif #ifdef TCGETS2 COMPATIBLE_IOCTL(TCGETS2) COMPATIBLE_IOCTL(TCSETS2) COMPATIBLE_IOCTL(TCSETSW2) COMPATIBLE_IOCTL(TCSETSF2) #endif /* Little f */ COMPATIBLE_IOCTL(FIOCLEX) COMPATIBLE_IOCTL(FIONCLEX) COMPATIBLE_IOCTL(FIOASYNC) COMPATIBLE_IOCTL(FIONBIO) COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ COMPATIBLE_IOCTL(FS_IOC_FIEMAP) /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) COMPATIBLE_IOCTL(FITRIM) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) COMPATIBLE_IOCTL(KDGKBTYPE) COMPATIBLE_IOCTL(KDGETMODE) COMPATIBLE_IOCTL(KDGKBMODE) COMPATIBLE_IOCTL(KDGKBMETA) COMPATIBLE_IOCTL(KDGKBENT) COMPATIBLE_IOCTL(KDSKBENT) COMPATIBLE_IOCTL(KDGKBSENT) COMPATIBLE_IOCTL(KDSKBSENT) COMPATIBLE_IOCTL(KDGKBDIACR) COMPATIBLE_IOCTL(KDSKBDIACR) COMPATIBLE_IOCTL(KDGKBDIACRUC) COMPATIBLE_IOCTL(KDSKBDIACRUC) COMPATIBLE_IOCTL(KDKBDREP) COMPATIBLE_IOCTL(KDGKBLED) COMPATIBLE_IOCTL(KDGETLED) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif /* Big V (don't complain on serial console) */ IGNORE_IOCTL(VT_OPENQRY) IGNORE_IOCTL(VT_GETMODE) /* Little p (/dev/rtc, /dev/envctrl, etc.) 
*/ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) COMPATIBLE_IOCTL(RTC_UIE_ON) COMPATIBLE_IOCTL(RTC_UIE_OFF) COMPATIBLE_IOCTL(RTC_PIE_ON) COMPATIBLE_IOCTL(RTC_PIE_OFF) COMPATIBLE_IOCTL(RTC_WIE_ON) COMPATIBLE_IOCTL(RTC_WIE_OFF) COMPATIBLE_IOCTL(RTC_ALM_SET) COMPATIBLE_IOCTL(RTC_ALM_READ) COMPATIBLE_IOCTL(RTC_RD_TIME) COMPATIBLE_IOCTL(RTC_SET_TIME) COMPATIBLE_IOCTL(RTC_WKALM_SET) COMPATIBLE_IOCTL(RTC_WKALM_RD) /* * These two are only for the sbus rtc driver, but * hwclock tries them on every rtc device first when * running on sparc. On other architectures the entries * are useless but harmless. */ COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ /* Little m */ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) /* qemu/qemu-img might call these two on plain files for probing */ IGNORE_IOCTL(CDROM_DRIVE_STATUS) IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) COMPATIBLE_IOCTL(SG_GET_TRANSFORM) COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) COMPATIBLE_IOCTL(SG_GET_SCSI_ID) COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) COMPATIBLE_IOCTL(SG_GET_LOW_DMA) COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) COMPATIBLE_IOCTL(SG_GET_PACK_ID) COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) COMPATIBLE_IOCTL(SG_SET_DEBUG) COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) COMPATIBLE_IOCTL(SG_SCSI_RESET) COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) #endif /* PPP stuff */ COMPATIBLE_IOCTL(PPPIOCGFLAGS) COMPATIBLE_IOCTL(PPPIOCSFLAGS) COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) 
COMPATIBLE_IOCTL(PPPIOCGUNIT) COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) COMPATIBLE_IOCTL(PPPIOCGMRU) COMPATIBLE_IOCTL(PPPIOCSMRU) COMPATIBLE_IOCTL(PPPIOCSMAXCID) COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) COMPATIBLE_IOCTL(PPPIOCXFERUNIT) /* PPPIOCSCOMPRESS is translated */ COMPATIBLE_IOCTL(PPPIOCGNPMODE) COMPATIBLE_IOCTL(PPPIOCSNPMODE) COMPATIBLE_IOCTL(PPPIOCGDEBUG) COMPATIBLE_IOCTL(PPPIOCSDEBUG) /* PPPIOCSPASS is translated */ /* PPPIOCSACTIVE is translated */ /* PPPIOCGIDLE is translated */ COMPATIBLE_IOCTL(PPPIOCNEWUNIT) COMPATIBLE_IOCTL(PPPIOCATTACH) COMPATIBLE_IOCTL(PPPIOCDETACH) COMPATIBLE_IOCTL(PPPIOCSMRRU) COMPATIBLE_IOCTL(PPPIOCCONNECT) COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) /* ppdev */ COMPATIBLE_IOCTL(PPSETMODE) COMPATIBLE_IOCTL(PPRSTATUS) COMPATIBLE_IOCTL(PPRCONTROL) COMPATIBLE_IOCTL(PPWCONTROL) COMPATIBLE_IOCTL(PPFCONTROL) COMPATIBLE_IOCTL(PPRDATA) COMPATIBLE_IOCTL(PPWDATA) COMPATIBLE_IOCTL(PPCLAIM) COMPATIBLE_IOCTL(PPRELEASE) COMPATIBLE_IOCTL(PPYIELD) COMPATIBLE_IOCTL(PPEXCL) COMPATIBLE_IOCTL(PPDATADIR) COMPATIBLE_IOCTL(PPNEGOT) COMPATIBLE_IOCTL(PPWCTLONIRQ) COMPATIBLE_IOCTL(PPCLRIRQ) COMPATIBLE_IOCTL(PPSETPHASE) COMPATIBLE_IOCTL(PPGETMODES) COMPATIBLE_IOCTL(PPGETMODE) COMPATIBLE_IOCTL(PPGETPHASE) COMPATIBLE_IOCTL(PPGETFLAGS) COMPATIBLE_IOCTL(PPSETFLAGS) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) 
COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) /* Big T for sound/OSS */ COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) COMPATIBLE_IOCTL(SNDCTL_TMR_START) COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) /* Little m for sound/OSS */ COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) /* Big P for sound/OSS */ COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) COMPATIBLE_IOCTL(SNDCTL_DSP_POST) COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) /* SNDCTL_DSP_MAPINBUF, XXX needs translation */ /* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) /* Big 
C for sound/OSS */ COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) /* Big M for sound/OSS */ COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) /* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ /* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) 
COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) /* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ /* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) COMPATIBLE_IOCTL(SOUND_MIXER_INFO) COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) COMPATIBLE_IOCTL(SOUND_MIXER_AGC) COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) COMPATIBLE_IOCTL(RAW_GETBIND) /* Watchdog */ COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) COMPATIBLE_IOCTL(WDIOC_GETSTATUS) COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) COMPATIBLE_IOCTL(WDIOC_GETTEMP) COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) 
COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) COMPATIBLE_IOCTL(RNDGETPOOL) COMPATIBLE_IOCTL(RNDADDENTROPY) COMPATIBLE_IOCTL(RNDZAPENTCNT) COMPATIBLE_IOCTL(RNDCLEARPOOL) /* Bluetooth */ COMPATIBLE_IOCTL(HCIDEVUP) COMPATIBLE_IOCTL(HCIDEVDOWN) COMPATIBLE_IOCTL(HCIDEVRESET) COMPATIBLE_IOCTL(HCIDEVRESTAT) COMPATIBLE_IOCTL(HCIGETDEVLIST) COMPATIBLE_IOCTL(HCIGETDEVINFO) COMPATIBLE_IOCTL(HCIGETCONNLIST) COMPATIBLE_IOCTL(HCIGETCONNINFO) COMPATIBLE_IOCTL(HCIGETAUTHINFO) COMPATIBLE_IOCTL(HCISETRAW) COMPATIBLE_IOCTL(HCISETSCAN) COMPATIBLE_IOCTL(HCISETAUTH) COMPATIBLE_IOCTL(HCISETENCRYPT) COMPATIBLE_IOCTL(HCISETPTYPE) COMPATIBLE_IOCTL(HCISETLINKPOL) COMPATIBLE_IOCTL(HCISETLINKMODE) COMPATIBLE_IOCTL(HCISETACLMTU) COMPATIBLE_IOCTL(HCISETSCOMTU) COMPATIBLE_IOCTL(HCIBLOCKADDR) COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) COMPATIBLE_IOCTL(RFCOMMCREATEDEV) COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) COMPATIBLE_IOCTL(RFCOMMSTEALDLC) COMPATIBLE_IOCTL(BNEPCONNADD) COMPATIBLE_IOCTL(BNEPCONNDEL) COMPATIBLE_IOCTL(BNEPGETCONNLIST) COMPATIBLE_IOCTL(BNEPGETCONNINFO) COMPATIBLE_IOCTL(BNEPGETSUPPFEAT) COMPATIBLE_IOCTL(CMTPCONNADD) COMPATIBLE_IOCTL(CMTPCONNDEL) COMPATIBLE_IOCTL(CMTPGETCONNLIST) COMPATIBLE_IOCTL(CMTPGETCONNINFO) COMPATIBLE_IOCTL(HIDPCONNADD) COMPATIBLE_IOCTL(HIDPCONNDEL) COMPATIBLE_IOCTL(HIDPGETCONNLIST) COMPATIBLE_IOCTL(HIDPGETCONNINFO) /* CAPI */ COMPATIBLE_IOCTL(CAPI_REGISTER) COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) COMPATIBLE_IOCTL(CAPI_GET_VERSION) COMPATIBLE_IOCTL(CAPI_GET_SERIAL) COMPATIBLE_IOCTL(CAPI_GET_PROFILE) COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) COMPATIBLE_IOCTL(CAPI_INSTALLED) COMPATIBLE_IOCTL(CAPI_GET_FLAGS) COMPATIBLE_IOCTL(CAPI_SET_FLAGS) COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) 
COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) /* Siemens Gigaset */ COMPATIBLE_IOCTL(GIGASET_REDIR) COMPATIBLE_IOCTL(GIGASET_CONFIG) COMPATIBLE_IOCTL(GIGASET_BRKCHARS) COMPATIBLE_IOCTL(GIGASET_VERSION) /* Misc. */ COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) /* NBD */ COMPATIBLE_IOCTL(NBD_DO_IT) COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) COMPATIBLE_IOCTL(NBD_CLEAR_QUE) COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) COMPATIBLE_IOCTL(I2C_TENBIT) COMPATIBLE_IOCTL(I2C_PEC) COMPATIBLE_IOCTL(I2C_RETRIES) COMPATIBLE_IOCTL(I2C_TIMEOUT) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) COMPATIBLE_IOCTL(HIDIOCGDEVINFO) COMPATIBLE_IOCTL(HIDIOCGSTRING) COMPATIBLE_IOCTL(HIDIOCINITREPORT) COMPATIBLE_IOCTL(HIDIOCGREPORT) COMPATIBLE_IOCTL(HIDIOCSREPORT) COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) COMPATIBLE_IOCTL(HIDIOCGUSAGE) COMPATIBLE_IOCTL(HIDIOCSUSAGE) COMPATIBLE_IOCTL(HIDIOCGUCODE) COMPATIBLE_IOCTL(HIDIOCGFLAG) COMPATIBLE_IOCTL(HIDIOCSFLAG) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) /* dvb */ COMPATIBLE_IOCTL(AUDIO_STOP) COMPATIBLE_IOCTL(AUDIO_PLAY) COMPATIBLE_IOCTL(AUDIO_PAUSE) COMPATIBLE_IOCTL(AUDIO_CONTINUE) COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) COMPATIBLE_IOCTL(AUDIO_SET_MUTE) COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) COMPATIBLE_IOCTL(AUDIO_GET_STATUS) COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) COMPATIBLE_IOCTL(AUDIO_SET_ID) COMPATIBLE_IOCTL(AUDIO_SET_MIXER) COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID) COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES) 
COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE) COMPATIBLE_IOCTL(DMX_START) COMPATIBLE_IOCTL(DMX_STOP) COMPATIBLE_IOCTL(DMX_SET_FILTER) COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) COMPATIBLE_IOCTL(DMX_GET_CAPS) COMPATIBLE_IOCTL(DMX_SET_SOURCE) COMPATIBLE_IOCTL(DMX_GET_STC) COMPATIBLE_IOCTL(FE_GET_INFO) COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) COMPATIBLE_IOCTL(FE_SET_TONE) COMPATIBLE_IOCTL(FE_SET_VOLTAGE) COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) COMPATIBLE_IOCTL(FE_READ_STATUS) COMPATIBLE_IOCTL(FE_READ_BER) COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) COMPATIBLE_IOCTL(FE_READ_SNR) COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) COMPATIBLE_IOCTL(FE_SET_FRONTEND) COMPATIBLE_IOCTL(FE_GET_FRONTEND) COMPATIBLE_IOCTL(FE_GET_EVENT) COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) COMPATIBLE_IOCTL(VIDEO_STOP) COMPATIBLE_IOCTL(VIDEO_PLAY) COMPATIBLE_IOCTL(VIDEO_FREEZE) COMPATIBLE_IOCTL(VIDEO_CONTINUE) COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) COMPATIBLE_IOCTL(VIDEO_SET_BLANK) COMPATIBLE_IOCTL(VIDEO_GET_STATUS) COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) COMPATIBLE_IOCTL(VIDEO_SET_ID) COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM) COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT) COMPATIBLE_IOCTL(VIDEO_SET_SPU) COMPATIBLE_IOCTL(VIDEO_GET_NAVI) COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) COMPATIBLE_IOCTL(VIDEO_GET_SIZE) COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) COMPATIBLE_IOCTL(JSIOCGAXES) COMPATIBLE_IOCTL(JSIOCGBUTTONS) COMPATIBLE_IOCTL(JSIOCGNAME(0)) #ifdef TIOCGLTC COMPATIBLE_IOCTL(TIOCGLTC) COMPATIBLE_IOCTL(TIOCSLTC) #endif #ifdef TIOCSTART /* * For these two we 
have definitions in ioctls.h and/or termios.h on * some architectures but no actual implementation. Some applications * like bash call them if they are defined in the headers, so we provide * entries here to avoid syslog message spew. */ COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) #endif /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, but we don't want warnings on other file systems. So declare them as compatible here. */ #define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) #define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) #ifdef CONFIG_SPARC /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ IGNORE_IOCTL(FBIOGTYPE) IGNORE_IOCTL(FBIOSATTR) IGNORE_IOCTL(FBIOGATTR) IGNORE_IOCTL(FBIOSVIDEO) IGNORE_IOCTL(FBIOGVIDEO) IGNORE_IOCTL(FBIOSCURPOS) IGNORE_IOCTL(FBIOGCURPOS) IGNORE_IOCTL(FBIOGCURMAX) IGNORE_IOCTL(FBIOPUTCMAP32) IGNORE_IOCTL(FBIOGETCMAP32) IGNORE_IOCTL(FBIOSCURSOR32) IGNORE_IOCTL(FBIOGCURSOR32) #endif }; /* * Convert common ioctl arguments based on their command number * * Please do not add any code in here. Instead, implement * a compat_ioctl operation in the place that handles the * ioctl for the native case.
*/ static long do_ioctl_trans(int fd, unsigned int cmd, unsigned long arg, struct file *file) { void __user *argp = compat_ptr(arg); switch (cmd) { 82 case PPPIOCGIDLE32: return ppp_gidle(fd, cmd, argp); case PPPIOCSCOMPRESS32: return ppp_scompress(fd, cmd, argp); 1 case PPPIOCSPASS32: case PPPIOCSACTIVE32: 2 return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: 8 return sg_ioctl_trans(fd, cmd, argp); case SG_GET_REQUEST_TABLE: return sg_grt_trans(fd, cmd, argp); 4 case MTIOCGET32: case MTIOCPOS32: return mt_ioctl_trans(fd, cmd, argp); #endif /* Serial */ 2 case TIOCGSERIAL: case TIOCSSERIAL: return serial_struct_ioctl(fd, cmd, argp); /* i2c */ case I2C_FUNCS: 9 return w_long(fd, cmd, argp); case I2C_RDWR: return do_i2c_rdwr_ioctl(fd, cmd, argp); 3 case I2C_SMBUS: return do_i2c_smbus_ioctl(fd, cmd, argp); 6 /* Not implemented in the native kernel */ case RTC_IRQP_READ32: 24 case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: return rtc_ioctl(fd, cmd, argp); /* dvb */ 5 case VIDEO_GET_EVENT: return do_video_get_event(fd, cmd, argp); case VIDEO_STILLPICTURE: return do_video_stillpicture(fd, cmd, argp); 8 case VIDEO_SET_SPU_PALETTE: return do_video_set_spu_palette(fd, cmd, argp); 3 } /* * These take an integer instead of a pointer as 'arg', * so we must not do a compat_ptr() translation. 
*/ switch (cmd) { /* Big T */ case TCSBRKP: 39 case TIOCMIWAIT: case TIOCSCTTY: /* RAID */ case HOT_REMOVE_DISK: case HOT_ADD_DISK: case SET_DISK_FAULTY: case SET_BITMAP_FILE: /* Big K */ case KDSIGACCEPT: case KIOCSOUND: case KDMKTONE: case KDSETMODE: case KDSKBMODE: case KDSKBMETA: case KDSKBLED: case KDSETLED: /* NBD */ case NBD_SET_SOCK: case NBD_SET_BLKSIZE: case NBD_SET_SIZE: case NBD_SET_SIZE_BLOCKS: return do_vfs_ioctl(file, fd, cmd, arg); } 5 return -ENOIOCTLCMD; } static int compat_ioctl_check_table(unsigned int xcmd) { int i; const int max = ARRAY_SIZE(ioctl_pointer) - 1; BUILD_BUG_ON(max >= (1 << 16)); /* guess initial offset into table, assuming a normalized distribution */ i = ((xcmd >> 16) * max) >> 16; /* do linear search up first, until greater or equal */ while (ioctl_pointer[i] < xcmd && i < max) i++; 224 /* then do linear search down */ 226 while (ioctl_pointer[i] > xcmd && i > 0) i--; 395 return ioctl_pointer[i] == xcmd; 200 } COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg32) { 2058 unsigned long arg = arg32; struct fd f = fdget(fd); int error = -EBADF; if (!f.file) 1948 goto out; /* RED-PEN how should LSM module know it's handling 32bit? */ error = security_file_ioctl(f.file, cmd, arg); if (error) goto out_fput; /* * To allow the compat_ioctl handlers to be self contained * we need to check the common ioctls here first. * Just handle them with the standard handlers below. 
*/ switch (cmd) { case FIOCLEX: case FIONCLEX: 2046 case FIONBIO: case FIOASYNC: case FIOQSIZE: break; #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(f.file, compat_ptr(arg)); goto out_fput; #else 182 case FS_IOC_RESVSP: case FS_IOC_RESVSP64: error = ioctl_preallocate(f.file, compat_ptr(arg)); goto out_fput; #endif case FIBMAP: case FIGETBSZ: case FIONREAD: if (S_ISREG(file_inode(f.file)->i_mode)) break; /*FALL THROUGH*/ 48 default: if (f.file->f_op->compat_ioctl) { error = f.file->f_op->compat_ioctl(f.file, cmd, arg); if (error != -ENOIOCTLCMD) 1870 goto out_fput; 1775 } 1472 if (!f.file->f_op->unlocked_ioctl) goto do_ioctl; break; 387 } if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; 395 error = do_ioctl_trans(fd, cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; 82 47 goto out_fput; found_handler: arg = (unsigned long)compat_ptr(arg); do_ioctl: error = do_vfs_ioctl(f.file, fd, cmd, arg); out_fput: fdput(f); 348 out: return error; 1943 } static int __init init_sys32_ioctl_cmp(const void *p, const void *q) { unsigned int a, b; a = *(unsigned int *)p; b = *(unsigned int *)q; if (a > b) return 1; if (a < b) return -1; return 0; } static int __init init_sys32_ioctl(void) { sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), init_sys32_ioctl_cmp, NULL); return 0; } __initcall(init_sys32_ioctl);
/* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public Licens * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG ushort ext4_mballoc_debug __read_mostly; module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); #endif /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. 
* * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. 
It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. 
This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with the
 * -o stripe=<value> option, the group prealloc request is normalized to
 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 *
 * The regular allocator uses buddy scan only if the request len is a power
 * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunables min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicates how long the mballoc __must__ look for a best
 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * Both the prealloc spaces are populated as above. So for the first
 * request we will hit the buddy cache which will result in this prealloc
 * space getting filled. The prealloc space is then later used for the
 * subsequent request.
*/

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to specific inode and can be used for this inode only.
 *    it describes part of inode's space preallocated to specific
 *    physical blocks. any block from that preallocation can be used
 *    independently. the descriptor just tracks number of blocks left
 *    unused. so, before taking some block from descriptor, one must
 *    make sure corresponding logical block isn't allocated yet. this
 *    also means that freeing any block within descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to specific locality group which does not translate to
 *    permanent set of inodes: inode can join and leave group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 *  to keep it simple, we don't use block numbers, instead we count number of
 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
* * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. 
* - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. 
no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. 
There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 827 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" 
#endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ 746 addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { 711 addr = mb_correct_addr_and_bit(&bit, addr); 355 ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { 420 addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { 421 addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; 5 start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; 783 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); 783 if (order > e4b->bd_blkbits + 1) { 43 *max = 0; return NULL; } /* at order 0 we see each particular block */ 783 if (order == 0) { 572 *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } 769 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 783 return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; 
i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments 
= 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) return 0; } while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 1 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } else if (!mb_test_bit((i << 1) + 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit(i << 1, buddy2)); } continue; } /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( !mb_test_bit(k, e4b->bd_bitmap)); } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } continue; } fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define 
mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); 10 border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ 10 max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, 10 buddy + sbi->s_mb_offsets[min]); 10 len -= chunk; first += chunk; } } /* * Cache the order of the largest free extent we have available in this block * group. 
*/ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { int i; int bits; 10 grp->bb_largest_free_order = -1; /* uninit */ bits = sb->s_blocksize_bits + 1; 719 for (i = bits; i >= 0; i--) { 795 if (grp->bb_counters[i] > 0) { 793 grp->bb_largest_free_order = i; break; } } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { 10 struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { 10 fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) 10 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else 3 grp->bb_counters[0]++; 10 if (i < max) 5 i = mb_find_next_zero_bit(bitmap, max, i); } 10 grp->bb_fragments = fragments; if (free != grp->bb_free) { 4 ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); 4 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } 10 mb_set_largest_free_order(sb, grp); 10 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; spin_lock(&EXT4_SB(sb)->s_bal_lock); EXT4_SB(sb)->s_mb_buddies_generated++; EXT4_SB(sb)->s_mb_generation_time += period; spin_unlock(&EXT4_SB(sb)->s_bal_lock); } static void 
mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; 4 while ((buddy = mb_find_buddy(e4b, order++, &count))) { 4 ext4_set_bits(buddy, 0, count); } 4 e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine! 
*/ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); 7 inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; } } else bh = &bhs; 7 first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ 6 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 7 if (group >= ngroups) break; 7 grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the page, * which may be currently in use by an allocating task. 
*/ 7 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } 7 bh[i] = ext4_read_block_bitmap_nowait(sb, group); if (IS_ERR(bh[i])) { 1 err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ 6 for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2; 6 if (!bh[i]) continue; 6 err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } 6 first_block = page->index * blocks_per_page; 6 for (i = 0; i < blocks_per_page; i++) { 6 group = (first_block + i) >> 1; if (group >= ngroups) break; 6 if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; 6 if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ 6 data = page_address(page) + (i * blocksize); 6 bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ if ((first_block + i) & 1) { /* this is block of buddy */ 6 BUG_ON(incore == NULL); mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); 6 trace_ext4_mb_buddy_bitmap_load(sb, group); 6 grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ 6 ext4_lock_group(sb, group); /* init the buddy */ 6 memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ 6 BUG_ON(incore != NULL); mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); 6 trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ 6 ext4_lock_group(sb, group); 6 memcpy(data, bitmap, blocksize); /* mark all preallocated 
blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); 6 ext4_mb_generate_from_freelist(sb, data, group); 6 ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } 6 SetPageUptodate(page); out: if (bh) { 7 for (i = 0; i < groups_per_page; i++) 7 brelse(bh[i]); 7 if (bh != &bhs) kfree(bh); } 7 return err; } /* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct page *page; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. 
*/ block = group * 2; pnum = block / blocks_per_page; 6 poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; 6 BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; 6 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } 6 block++; pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; 6 BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) { 6 unlock_page(e4b->bd_bitmap_page); page_cache_release(e4b->bd_bitmap_page); } 6 if (e4b->bd_buddy_page) { 6 unlock_page(e4b->bd_buddy_page); page_cache_release(e4b->bd_buddy_page); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct page *page; int ret = 0; 6 might_sleep(); mb_debug(1, "init group %u\n", group); 6 this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. 
*/ 6 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); 6 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } page = e4b.bd_bitmap_page; 6 ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; 6 if (!PageUptodate(page)) { ret = -EIO; goto err; } if (e4b.bd_buddy_page == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ page = e4b.bd_buddy_page; ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; 6 if (!PageUptodate(page)) { ret = -EIO; goto err; } err: 6 ext4_mb_put_buddy_page_lock(&e4b); 6 return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; int pnum; int poff; struct page *page; int ret; struct ext4_group_info *grp; 795 struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 795 grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ 5 ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. 
*/ 795 block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 795 if (page == NULL || !PageUptodate(page)) { if (page) /* * drop the page reference and try * to get the page with lock. If we * are not uptodate that implies * somebody just created the page but * is yet to initialize the same. So * wait for it to initialize. */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } if (page == NULL) { ret = -ENOMEM; goto err; } 795 if (!PageUptodate(page)) { ret = -EIO; goto err; } /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); 795 if (page == NULL || !PageUptodate(page)) { if (page) 1 page_cache_release(page); 1 page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { 1 BUG_ON(page->mapping != inode->i_mapping); 1 if (!PageUptodate(page)) { 1 ret = ext4_mb_init_cache(page, e4b->bd_bitmap, gfp); if (ret) { 1 unlock_page(page); goto err; } } unlock_page(page); } } if (page == NULL) { ret = -ENOMEM; goto err; } 795 if (!PageUptodate(page)) { ret = -EIO; goto err; } /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); return 0; err: if (page) 1 page_cache_release(page); 1 if (e4b->bd_bitmap_page) 1 
page_cache_release(e4b->bd_bitmap_page); 1 if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); 1 e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; 795 return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { 660 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { 795 if (e4b->bd_bitmap_page) 795 page_cache_release(e4b->bd_bitmap_page); 795 if (e4b->bd_buddy_page) 795 page_cache_release(e4b->bd_buddy_page); 795 } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; 660 int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 660 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); bb = e4b->bd_buddy; 660 while (order <= e4b->bd_blkbits + 1) { 660 block = block >> 1; 660 if (!mb_test_bit(block, bb)) { /* this block is part of buddy of order 'order' */ return order; } 660 bb += bb_incr; bb_incr >>= 1; order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; 403 len = cur + len; 403 while (cur < len) { 403 if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ 269 addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } 403 mb_clear_bit(cur, bm); cur++; } 403 } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; 421 while (cur < len) { 421 if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ 311 addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); 311 *addr = 0; cur += 32; continue; } 421 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; 418 cur++; } return zero_bit; } void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; 711 len = cur + 
len; 711 while (cur < len) { 711 if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ 556 addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } 711 mb_set_bit(cur, bm); cur++; } 711 } /* * _________________________________________________________________ */ static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { 400 if (mb_test_bit(*bit + side, bitmap)) { 399 mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; 355 mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. 
*/ 400 if (first & 1) 387 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); 400 if (!(last & 1)) 379 e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); 400 if (first > last) break; 385 order++; 385 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { 39 mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; 400 break; } 385 first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; 421 int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; 421 BUG_ON(last >= (sb->s_blocksize << 3)); 421 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ 421 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); 421 e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) 421 e4b->bd_info->bb_first_free = first; /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ 421 if (first != 0) 421 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); 421 block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); 421 if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) 421 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); 67 if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; 4 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), block); 4 ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) percpu_counter_sub(&sbi->s_freeclusters_counter, 4 e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 4 &e4b->bd_info->bb_state); 4 mb_regenerate_buddy(e4b); goto done; } /* let's maintain fragments counter */ 420 if (left_is_free && right_is_free) 293 e4b->bd_info->bb_fragments--; 387 else if (!left_is_free && !right_is_free) 339 e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ 377 if (first & 1) { 334 first += !left_is_free; 347 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } 420 if (!(last & 1)) { 320 last -= !right_is_free; 349 e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; } 420 if (first <= last) 400 mb_buddy_mark_free(e4b, first >> 1, last >> 1); done: 421 mb_set_largest_free_order(sb, e4b->bd_info); 421 mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; int max, order; void *buddy; 572 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); 572 buddy = mb_find_buddy(e4b, 0, &max); 246 BUG_ON(buddy == NULL); 572 BUG_ON(block >= max); 572 if (mb_test_bit(block, buddy)) { 368 ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; 572 return 0; } /* find actual order */ 550 order = mb_find_order_for_block(e4b, block); block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; ex->fe_group = e4b->bd_group; /* calc difference from given start */ next = next - ex->fe_start; ex->fe_len -= next; ex->fe_start += next; 262 while (needed > ex->fe_len && 275 mb_find_buddy(e4b, order, &max)) { 275 if (block + 1 >= max) break; 275 next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; 266 order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } 550 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; 660 int max = 0; int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 660 BUG_ON(e4b->bd_group != ex->fe_group); 660 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); 660 e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) 276 e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ 660 if (start != 0) 660 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); 660 if (start + len < 
EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 659 max = !mb_test_bit(start + len, e4b->bd_bitmap); 519 if (mlen && max) 419 e4b->bd_info->bb_fragments++; 618 else if (!mlen && !max) 447 e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ 660 while (len) { 660 ord = mb_find_order_for_block(e4b, start); 660 if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; 660 buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); 660 mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ 420 if (ret == 0) 420 ret = len | (ord << 16); /* we have to split large buddy */ 420 BUG_ON(ord <= 0); 420 buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord--; cur = (start >> ord) & ~1U; buddy = mb_find_buddy(e4b, ord, &max); mb_clear_bit(cur, buddy); mb_clear_bit(cur + 1, buddy); e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } 660 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 660 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { 660 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; 660 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 660 BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the page reference. 
We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_page = e4b->bd_bitmap_page; 660 get_page(ac->ac_bitmap_page); 660 ac->ac_buddy_page = e4b->bd_buddy_page; 660 get_page(ac->ac_buddy_page); /* store last allocated for subsequent stream allocation */ 660 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 631 spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } 660 } /* * regular allocator, for general purposes allocation */ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { 424 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; struct ext4_free_extent ex; int max; 502 if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && 22 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 22 ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ 424 if (bex->fe_len < gex->fe_len) return; 502 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 259 && bex->fe_group == e4b->bd_group) { /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ 259 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { 259 ext4_mb_use_best_found(ac, e4b); return; } } } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. 
Later, the best found extent will be used, if * mballoc can't find good enough extent. * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 502 BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); 502 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 502 ac->ac_found++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 439 *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ 502 if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ 435 if (bex->fe_len == 0) { 435 *bex = *ex; return; } /* * If new found extent is better, store it in the context */ 422 if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ 213 if (ex->fe_len > bex->fe_len) 335 *bex = *ex; 386 } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ 377 if (ex->fe_len < bex->fe_len) *bex = *ex; } 422 ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { 183 struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); 183 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 183 if (err) return err; 183 ext4_lock_group(ac->ac_sb, group); 183 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { 183 ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, 
e4b); } 183 ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { 661 ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 398 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; 661 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; 661 if (grp->bb_free == 0) return 0; 397 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; 397 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { 32 ext4_mb_unload_buddy(e4b); return 0; } 397 ext4_lock_group(ac->ac_sb, group); 397 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ 201 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { 201 BUG_ON(ex.fe_len <= 0); 201 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 201 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 201 ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); 377 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } 397 ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) 
from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { 584 struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); 584 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 584 if (grp->bb_counters[i] == 0) continue; 584 buddy = mb_find_buddy(e4b, i, &max); BUG_ON(buddy == NULL); 584 k = mb_find_next_zero_bit(buddy, max, 0); BUG_ON(k >= max); 584 ac->ac_found++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 584 if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { 502 struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; free = e4b->bd_info->bb_free; BUG_ON(free <= 0); 502 i = e4b->bd_info->bb_first_free; 502 while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, 502 EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); break; } 502 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); 502 if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. 
But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } 502 ex.fe_logical = 0xDEADC0DE; /* debug value */ 502 ext4_mb_measure_extent(ac, &ex, e4b); 502 i += ex.fe_len; free -= ex.fe_len; } 502 ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += sbi->s_stripe; } } /* * This is now called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. In addition it can also return negative * error code when something goes wrong. 
*/ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; 658 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 658 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); 658 free = grp->bb_free; if (free == 0) 658 return 0; 658 if (cr <= 2 && free < ac->ac_g_ex.fe_len) return 0; 658 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return 0; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1 int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } 657 fragments = grp->bb_fragments; if (fragments == 0) return 0; 657 switch (cr) { case 0: 604 BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ 604 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; 604 if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || 604 (free / fragments) >= ac->ac_g_ex.fe_len) return 1; 277 if (grp->bb_largest_free_order < ac->ac_2order) return 0; return 1; case 1: 498 if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; break; case 2: 169 if (free >= ac->ac_g_ex.fe_len) return 1; break; case 3: return 1; default: BUG(); } return 0; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; 661 sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 239 ngroups = sbi->s_blockfile_groups; 661 BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ 661 err = ext4_mb_find_by_goal(ac, &e4b); 661 if (err || ac->ac_status == AC_STATUS_FOUND) goto out; 658 if 
(unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac2_order is set only if the fe_len is a power of 2 * if ac2_order is set we also set criteria to 0 so that we * try exact allocation using buddy. */ 658 i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ 632 if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { /* * This should tell if fe_len is exactly power of 2 */ 632 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 616 ac->ac_2order = array_index_nospec(i - 1, sb->s_blocksize_bits + 2); } /* if stream allocation is enabled, use global goal */ 658 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ 629 spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } /* Let's just scan groups to find more-less suitable blocks */ 658 cr = ac->ac_2order ? 0 : 1; /* * cr == 0 try to get exact allocation, * cr == 3 try to get anything */ repeat: 658 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 658 ac->ac_criteria = cr; /* * searching for the right group start * from the goal value specified */ group = ac->ac_g_ex.fe_group; 282 for (i = 0; i < ngroups; group++, i++) { int ret = 0; 658 cond_resched(); /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. 
*/ if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ 658 ret = ext4_mb_good_group(ac, group, cr); if (ret <= 0) { 282 if (!first_err) first_err = ret; continue; } 657 err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; 657 ext4_lock_group(sb, group); /* * We need to check again after locking the * block group */ 657 ret = ext4_mb_good_group(ac, group, cr); if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (!first_err) first_err = ret; continue; } 657 ac->ac_groups_scanned++; if (cr == 0) 584 ext4_mb_simple_scan_group(ac, &e4b); 502 else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else 502 ext4_mb_complex_scan_group(ac, &e4b); 657 ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; } } 658 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 183 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ 183 ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. 
* The only thing we can do is just take first * found block(s) printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); */ 28 ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; 658 cr = 3; atomic_inc(&sbi->s_mb_lost_chunks); goto repeat; } } out: 16 if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; 661 return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; struct ext4_buddy e4b; struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. 
*/ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: I/O error\n", group); return 0; } buddy_loaded = 1; } memcpy(&sg, ext4_get_group_info(sb, group), i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); seq_printf(seq, " ]\n"); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } static const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { struct seq_file *m = file->private_data; m->private = sb; } return rc; } const struct file_operations ext4_seq_mb_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { 
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } if (sbi->s_group_info) { memcpy(new_groupinfo, sbi->s_group_info, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); kvfree(sbi->s_group_info); } sbi->s_group_info = new_groupinfo; sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. * If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = meta_group_info; } meta_group_info = sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } 
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ #ifdef DOUBLE_CHECK { struct buffer_head *bh; meta_group_info[i]->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); BUG_ON(IS_ERR_OR_NULL(bh)); memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } #endif return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */

/*
 * Set up the in-core allocator backend for @sb: allocate the group-info
 * table, create the buddy-cache inode and add group info for every group.
 *
 * Returns 0 on success, the error from ext4_mb_alloc_groupinfo(), or
 * -ENOMEM after undoing whatever had been set up so far.
 */
static int ext4_mb_init_backend(struct super_block *sb)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	ext4_group_t i;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err;
	struct ext4_group_desc *desc;
	struct kmem_cache *cachep;

	err = ext4_mb_alloc_groupinfo(sb, ngroups);
	if (err)
		return err;

	sbi->s_buddy_cache = new_inode(sb);
	if (sbi->s_buddy_cache == NULL) {
		ext4_msg(sb, KERN_ERR, "can't get new inode");
		goto err_freesgi;
	}
	/* To avoid potentially colliding with a valid on-disk inode number,
	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
	 * not in the inode hash, so it should never be found by iget(), but
	 * this will avoid confusion if it ever shows up during debugging.
	 */
	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
	for (i = 0; i < ngroups; i++) {
		desc = ext4_get_group_desc(sb, i, NULL);
		if (desc == NULL) {
			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
			goto err_freebuddy;
		}
		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
			goto err_freebuddy;
	}

	return 0;

err_freebuddy:
	/* i is the group that failed; free the group infos added before it */
	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
	while (i-- > 0)
		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
	/* then every second-level table hanging off s_group_info */
	i = sbi->s_group_info_size;
	while (i-- > 0)
		kfree(sbi->s_group_info[i]);
	iput(sbi->s_buddy_cache);
err_freesgi:
	kvfree(sbi->s_group_info);
	return -ENOMEM;
}

/* Destroy every groupinfo slab cache that was created and clear its slot. */
static void ext4_groupinfo_destroy_slabs(void)
{
	int i;

	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
		if (ext4_groupinfo_caches[i])
			kmem_cache_destroy(ext4_groupinfo_caches[i]);
		ext4_groupinfo_caches[i] = NULL;
	}
}

/*
 * Create (once) the groupinfo slab cache used for block size @size.
 *
 * Returns 0 if the cache exists or was created, -EINVAL if @size maps to
 * an index beyond NR_GRPINFO_CACHES, -ENOMEM if the cache cannot be created.
 */
static int ext4_groupinfo_create_slab(size_t size)
{
	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
	int slab_size;
	int blocksize_bits = order_base_2(size);
	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
	struct kmem_cache *cachep;

	if (cache_index >= NR_GRPINFO_CACHES)
		return -EINVAL;

	if (unlikely(cache_index < 0))
		cache_index = 0;

	/* the mutex serializes the check-then-create on the cache slot */
	mutex_lock(&ext4_grpinfo_slab_create_mutex);
	if (ext4_groupinfo_caches[cache_index]) {
		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
		return 0;	/* Already created */
	}

	/* object size covers bb_counters[] up to index blocksize_bits + 1 */
	slab_size = offsetof(struct ext4_group_info,
				bb_counters[blocksize_bits + 2]);

	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
					NULL);

	ext4_groupinfo_caches[cache_index] = cachep;

	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
	if (!cachep) {
		printk(KERN_EMERG
		       "EXT4-fs: no memory for groupinfo slab cache\n");
		return -ENOMEM;
	}

	return 0;
}

/*
 * Initialize the multiblock allocator state of @sb: per-order offset/max
 * tables, tunable defaults, per-CPU locality groups and the buddy backend.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated here is released and the pointers are reset to NULL.
 */
int ext4_mb_init(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned i, j;
	unsigned offset, offset_incr;
	unsigned max;
	int ret;

	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);

	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
	if (sbi->s_mb_offsets == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
	if (sbi->s_mb_maxs == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
	if (ret < 0)
		goto out;

	/* order 0 is regular bitmap */
	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
	sbi->s_mb_offsets[0] = 0;

	/* each higher order halves both the bit count and the offset step */
	i = 1;
	offset = 0;
	offset_incr = 1 << (sb->s_blocksize_bits - 1);
	max = sb->s_blocksize << 2;
	do {
		sbi->s_mb_offsets[i] = offset;
		sbi->s_mb_maxs[i] = max;
		offset += offset_incr;
		offset_incr = offset_incr >> 1;
		max = max >> 1;
		i++;
	} while (i <= sb->s_blocksize_bits + 1);

	spin_lock_init(&sbi->s_md_lock);
	spin_lock_init(&sbi->s_bal_lock);

	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
	sbi->s_mb_stats = MB_DEFAULT_STATS;
	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
	/*
	 * The default group preallocation is 512, which for 4k block
	 * sizes translates to 2 megabytes.  However for bigalloc file
	 * systems, this is probably too big (i.e, if the cluster size
	 * is 1 megabyte, then group preallocation size becomes half a
	 * gigabyte!).  As a default, we will keep a two megabyte
	 * group prealloc size for cluster sizes up to 64k, and after
	 * that, we will force a minimum group preallocation size of
	 * 32 clusters.  This translates to 8 megs when the cluster
	 * size is 256k, and 32 megs when the cluster size is 1 meg,
	 * which seems reasonable as a default.
	 */
	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
				       sbi->s_cluster_bits, 32);
	/*
	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
	 * to the lowest multiple of s_stripe which is bigger than
	 * the s_mb_group_prealloc as determined above. We want
	 * the preallocation size to be an exact multiple of the
	 * RAID stripe size so that preallocations don't fragment
	 * the stripes.
	 */
	if (sbi->s_stripe > 1) {
		sbi->s_mb_group_prealloc = roundup(
			sbi->s_mb_group_prealloc, sbi->s_stripe);
	}

	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
	if (sbi->s_locality_groups == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	for_each_possible_cpu(i) {
		struct ext4_locality_group *lg;
		lg = per_cpu_ptr(sbi->s_locality_groups, i);
		mutex_init(&lg->lg_mutex);
		for (j = 0; j < PREALLOC_TB_SIZE; j++)
			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
		spin_lock_init(&lg->lg_prealloc_lock);
	}

	/* init file for buddy data */
	ret = ext4_mb_init_backend(sb);
	if (ret != 0)
		goto out_free_locality_groups;

	return 0;

out_free_locality_groups:
	free_percpu(sbi->s_locality_groups);
	sbi->s_locality_groups = NULL;
out:
	kfree(sbi->s_mb_offsets);
	sbi->s_mb_offsets = NULL;
	kfree(sbi->s_mb_maxs);
	sbi->s_mb_maxs = NULL;
	return ret;
}

/*
 * Unlink and free every preallocation still on @grp's list.
 * Needs to be called with the ext4 group lock held.
 */
static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
	struct ext4_prealloc_space *pa;
	struct list_head *cur, *tmp;
	int count = 0;

	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		list_del(&pa->pa_group_list);
		count++;
		kmem_cache_free(ext4_pspace_cachep, pa);
	}
	if (count)
		mb_debug(1, "mballoc: %u PAs left\n", count);

}

/*
 * Release the allocator state of @sb: per-group infos (with any leftover
 * preallocations), the group-info tables, the offset/max tables, the
 * buddy-cache inode and the per-CPU locality groups.  Statistics are
 * logged when s_mb_stats is set.  Always returns 0.
 */
int ext4_mb_release(struct super_block *sb)
{
	ext4_group_t ngroups = ext4_get_groups_count(sb);
	ext4_group_t i;
	int num_meta_group_infos;
	struct ext4_group_info *grinfo;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);

	if (sbi->s_group_info) {
		for (i = 0; i < ngroups; i++) {
			grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
			kfree(grinfo->bb_bitmap);
#endif
			ext4_lock_group(sb, i);
			ext4_mb_cleanup_pa(grinfo);
			ext4_unlock_group(sb, i);
			kmem_cache_free(cachep, grinfo);
		}
		/* number of second-level tables, rounded up */
		num_meta_group_infos = (ngroups +
				EXT4_DESC_PER_BLOCK(sb) - 1) >>
			EXT4_DESC_PER_BLOCK_BITS(sb);
		for (i = 0; i < num_meta_group_infos; i++)
			kfree(sbi->s_group_info[i]);
		kvfree(sbi->s_group_info);
	}
	kfree(sbi->s_mb_offsets);
	kfree(sbi->s_mb_maxs);
	iput(sbi->s_buddy_cache);
	if (sbi->s_mb_stats) {
		ext4_msg(sb, KERN_INFO,
		       "mballoc: %u blocks %u reqs (%u success)",
				atomic_read(&sbi->s_bal_allocated),
				atomic_read(&sbi->s_bal_reqs),
				atomic_read(&sbi->s_bal_success));
		ext4_msg(sb, KERN_INFO,
		      "mballoc: %u extents scanned, %u goal hits, "
				"%u 2^N hits, %u breaks, %u lost",
				atomic_read(&sbi->s_bal_ex_scanned),
				atomic_read(&sbi->s_bal_goals),
				atomic_read(&sbi->s_bal_2orders),
				atomic_read(&sbi->s_bal_breaks),
				atomic_read(&sbi->s_mb_lost_chunks));
		ext4_msg(sb, KERN_INFO,
		       "mballoc: %lu generated and it took %Lu",
				sbi->s_mb_buddies_generated,
				sbi->s_mb_generation_time);
		ext4_msg(sb, KERN_INFO,
		       "mballoc: %u preallocated, %u discarded",
				atomic_read(&sbi->s_mb_preallocated),
				atomic_read(&sbi->s_mb_discarded));
	}

	free_percpu(sbi->s_locality_groups);

	return 0;
}

/*
 * Issue a discard for @count clusters starting at @cluster of
 * @block_group.  Cluster values are converted to filesystem blocks
 * before tracing and calling sb_issue_discard(), whose result is returned.
 */
static inline int ext4_issue_discard(struct super_block *sb,
		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
		unsigned long flags)
{
	ext4_fsblk_t discard_block;

	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
			 ext4_group_first_block_no(sb, block_group));
	count = EXT4_C2B(EXT4_SB(sb), count);
	trace_ext4_discard_blocks(sb,
			(unsigned long long) discard_block, count);
	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
}

/*
 * This function is called by the jbd2 layer once the commit has finished,
 * so we know we can free the blocks that were released with that commit.
*/ static void ext4_free_data_callback(struct super_block *sb, struct ext4_journal_cb_entry *jce, int rc) { struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, entry->efd_group, entry->efd_start_cluster, entry->efd_count, 0); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%d failed" " with %d", entry->efd_group, entry->efd_start_cluster, entry->efd_count, err); } err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; count2++; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. * If the volume is mounted with -o discard, online discard * is supported and the free blocks will be trimmed online. 
*/ if (!test_opt(sb, DISCARD)) EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ page_cache_release(e4b.bd_buddy_page); page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } return 0; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. 
*/ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; 709 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 709 BUG_ON(ac->ac_b_ex.fe_len <= 0); 709 sb = ac->ac_sb; sbi = EXT4_SB(sb); bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto out_err; } BUFFER_TRACE(bitmap_bh, "getting write access"); 709 err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; err = -EIO; 709 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); if (!gdp) goto out_err; 709 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; 709 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. 
*/ ext4_lock_group(sb, ac->ac_b_ex.fe_group); ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EFSCORRUPTED; goto out_err; } 709 ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; for (i = 0; i < ac->ac_b_ex.fe_len; i++) { BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, bitmap_bh->b_data)); } } #endif 709 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (ext4_has_group_desc_csum(sb) && 709 (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 6 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, ac->ac_b_ex.fe_group, gdp)); } 709 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ 527 percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); 709 if (sbi->s_log_groups_per_flex) { 709 ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic64_sub(ac->ac_b_ex.fe_len, &sbi->s_flex_groups[flex_group].free_clusters); } 709 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; 709 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: 709 brelse(bitmap_bh); 709 return err; } /* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. 
* s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; 39 struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); 39 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { 635 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, start_off; loff_t orig_size __maybe_unused; ext4_lblk_t start; 631 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ 661 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ 636 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 39 ext4_mb_normalize_group_request(ac); 661 return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ 221 max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? 
*/ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; 631 } else if (size <= 32 * 1024) { size = 32 * 1024; 631 } else if (size <= 64 * 1024) { size = 64 * 1024; 631 } else if (size <= 128 * 1024) { size = 128 * 1024; 458 } else if (size <= 256 * 1024) { size = 256 * 1024; 425 } else if (size <= 512 * 1024) { size = 512 * 1024; 394 } else if (size <= 1024 * 1024) { size = 1024 * 1024; 352 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 160 (21 - bsbits)) << 21; size = 2 * 1024 * 1024; 221 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 20 (22 - bsbits)) << 22; size = 4 * 1024 * 1024; 211 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 211 (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), ac->ac_o_ex.fe_len) << bsbits; } 631 size = size >> bsbits; start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ 544 if (ar->pleft && start <= ar->lleft) { 348 size -= ar->lleft + 1 - start; start = ar->lleft + 1; } 631 if (ar->pright && start + size - 1 >= ar->lright) 8 size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. 
*/ 631 if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); 631 end = start + size; /* check we don't cross already preallocated blocks */ 631 rcu_read_lock(); 631 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; 89 if (pa->pa_deleted) continue; 89 spin_lock(&pa->pa_lock); if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } 89 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), pa->pa_len); /* PA must not overlap original request */ 32 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); /* skip PAs this normalized request doesn't overlap with */ 89 if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } 60 BUG_ON(pa->pa_lstart <= start && pa_end >= end); /* adjust start or end to be adjacent to this pa */ 60 if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; 4 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } 89 spin_unlock(&pa->pa_lock); } 631 rcu_read_unlock(); size = end - start; /* XXX: extra loop to check we really don't overlap preallocations */ 631 rcu_read_lock(); 631 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; 89 spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { 89 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), pa->pa_len); 32 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } 89 spin_unlock(&pa->pa_lock); } 631 rcu_read_unlock(); if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } 631 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ 631 
ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ 58 if (ar->pright && (ar->lright == (start + size))) { /* merge to the right */ 4 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_f_ex.fe_group, &ac->ac_f_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } 631 if (ar->pleft && (ar->lleft + 1 == start)) { /* merge to the left */ 398 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_f_ex.fe_group, &ac->ac_f_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { 710 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } 710 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 661 trace_ext4_mballoc_alloc(ac); else 521 trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
*/ 1 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { 1 struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { 1 if (ac->ac_f_ex.fe_len == 0) 1 return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (err) { /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ WARN(1, "mb_load_buddy failed (%d)", err); return; } ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } 1 if (pa->pa_type == MB_INODE_PA) pa->pa_free += ac->ac_b_ex.fe_len; } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { 523 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); 523 BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); 523 BUG_ON(pa->pa_free < len); 523 pa->pa_free -= len; mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { 358 unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; 
ac->ac_pa = pa; /* we don't correct pa_pstart or pa_plen here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; 355 if (cpa == NULL) { 355 atomic_inc(&pa->pa_count); return pa; } 20 cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ 20 atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * search goal blocks in preallocated space */ static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { 699 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *pa, *cpa = NULL; ext4_fsblk_t goal_block; /* only data can be preallocated */ 710 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 710 return 0; /* first, try per-file preallocation */ 699 rcu_read_lock(); 699 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ 462 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || ac->ac_o_ex.fe_logical >= (pa->pa_lstart + 447 EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ 445 if (!(ext4_test_inode_flag(ac->ac_inode, 
EXT4_INODE_EXTENTS)) && 210 (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ 445 spin_lock(&pa->pa_lock); 445 if (pa->pa_deleted == 0 && pa->pa_free) { 445 atomic_inc(&pa->pa_count); ext4_mb_use_inode_pa(ac, pa); spin_unlock(&pa->pa_lock); ac->ac_criteria = 10; 445 rcu_read_unlock(); return 1; } spin_unlock(&pa->pa_lock); } 694 rcu_read_unlock(); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return 0; /* inode may have no locality group for some reason */ 359 lg = ac->ac_lg; if (lg == NULL) return 0; 359 order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { 359 rcu_read_lock(); 359 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], pa_inode_list) { 355 spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && 355 pa->pa_free >= ac->ac_o_ex.fe_len) { 355 cpa = ext4_mb_check_group_pa(goal_block, pa, cpa); } 355 spin_unlock(&pa->pa_lock); } 359 rcu_read_unlock(); } 359 if (cpa) { 355 ext4_mb_use_group_pa(ac, cpa); ac->ac_criteria = 20; return 1; } return 0; } /* * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. 
* buddy must be generated from this bitmap * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) { struct rb_node *n; struct ext4_group_info *grp; struct ext4_free_data *entry; 6 grp = ext4_get_group_info(sb, group); n = rb_first(&(grp->bb_free_root)); while (n) { entry = rb_entry(n, struct ext4_free_data, efd_node); ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { 6 struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. 
* we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); ext4_set_bits(bitmap, start, len); preallocated += len; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; /* in this short window concurrent discard can set pa_deleted */ 591 spin_lock(&pa->pa_lock); 591 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } 162 if (pa->pa_deleted == 1) { 589 spin_unlock(&pa->pa_lock); return; } 162 pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) 18 grp_blk--; 162 grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. 
to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ 162 ext4_lock_group(sb, grp); 162 list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); spin_lock(pa->pa_obj_lock); 162 list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } /* * creates new preallocated space for given inode */ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { 508 struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ 508 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 508 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 508 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); if (pa == NULL) return -ENOMEM; 508 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; int wins; int win; int offs; /* we can't allocate as much as normalizer wants. 
* so, found space must get proper lstart * to cover original request */ 94 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 94 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* we're limited by original request in that * logical block must be covered any way * winl is window we can move our chunk within */ 94 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); offs = ac->ac_o_ex.fe_logical % EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 80 if (offs && offs < win) win = offs; 94 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - 94 EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 94 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ 508 ac->ac_f_ex = ac->ac_b_ex; pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); 508 trace_ext4_mb_new_inode_pa(ac, pa); 508 ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); 508 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; 508 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 508 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 508 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); spin_lock(pa->pa_obj_lock); 508 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 508 spin_unlock(pa->pa_obj_lock); 508 return 0; 464 } /* * creates 
new preallocated space for locality group inodes belongs to */ static noinline_for_stack int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { 38 struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ 38 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 38 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 38 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 38 BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); if (pa == NULL) return -ENOMEM; /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ 38 ac->ac_f_ex = ac->ac_b_ex; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); 38 trace_ext4_mb_new_group_pa(ac, pa); 38 ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 38 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); lg = ac->ac_lg; BUG_ON(lg == NULL); 38 pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; 38 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 38 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 38 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ 38 return 0; } static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { int err; 514 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 38 err = ext4_mb_new_group_pa(ac); else 508 err = ext4_mb_new_inode_pa(ac); return err; } /* * finds all unused blocks in on-disk bitmap, 
frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { 247 struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); 247 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); 247 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 247 end = bit + pa->pa_len; while (bit < end) { 247 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; 247 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; 247 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 247 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); 247 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } 247 if (free != pa->pa_free) { 4 ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %lu", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. 
*/ } 247 atomic_add(free, &sbi->s_mb_discarded); return err; } static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { 8 struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; 8 trace_ext4_mb_release_group_pa(sb, pa); 8 BUG_ON(pa->pa_deleted == 0); 8 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 8 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 8 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 8 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 8 return 0; } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { 16 struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; int err; int busy = 0; int free = 0; mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; 10 bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; } 10 err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); return 0; } 10 if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; 10 INIT_LIST_HEAD(&list); repeat: 10 ext4_lock_group(sb, group); 10 list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { 10 spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); busy = 1; continue; } 10 if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* 
seems this one can be freed ... */ 10 pa->pa_deleted = 1; /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); 10 list_del(&pa->pa_group_list); 10 list_add(&pa->u.pa_tmp_list, &list); } /* if we still need more blocks and some PAs were used, try again */ 10 if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); cond_resched(); goto repeat; } /* found anything to free? */ 10 if (list_empty(&list)) { BUG_ON(free != 0); goto out; } /* now free all selected PAs */ 10 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ 10 spin_lock(pa->pa_obj_lock); 10 list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) 8 ext4_mb_release_group_pa(&e4b, pa); else 8 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 10 list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } out: 10 ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); 16 return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! 
Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); 559 struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; int err; 561 if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 561 return; } mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); 559 trace_ext4_discard_preallocations(inode); 559 INIT_LIST_HEAD(&list); repeat: /* first, collect all pa's in the inode */ 559 spin_lock(&ei->i_prealloc_lock); 559 while (!list_empty(&ei->i_prealloc_list)) { pa = list_entry(ei->i_prealloc_list.next, struct ext4_prealloc_space, pa_inode_list); 244 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 244 spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } 244 if (pa->pa_deleted == 0) { 244 pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); 244 list_del_rcu(&pa->pa_inode_list); 244 list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. 
as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } 559 spin_unlock(&ei->i_prealloc_lock); 244 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 244 BUG_ON(pa->pa_type != MB_INODE_PA); 244 group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; } 244 bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } 244 ext4_lock_group(sb, group); 244 list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); 244 list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, 
(unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); printk(KERN_ERR "PA:%u:%d:%u \n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); } #else static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { return; } #endif /* * We use locality group preallocation for small size file. 
The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { 699 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; 699 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; if ((size == isize) && 217 !ext4_fs_is_busy(sbi) && 217 (atomic_read(&ac->ac_inode->i_writecount) == 0)) { 1 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } 699 if (sbi->s_mb_group_prealloc <= 0) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } /* don't use group allocation for large files */ 699 size = max(size, isize); if (size > sbi->s_mb_stream_request) { 645 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } 359 BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. 
*/ 359 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack int ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { 710 struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ 710 goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || 710 goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); 710 ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ 699 ext4_mb_group_or_file(ac); 710 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, atomic_read(&ar->inode->i_writecount) ? 
"" : "non-"); return 0; } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. */ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. 
* which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); 3 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { 3 spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } 3 if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_inode_list, &tmp_pa->pa_inode_list); added = 1; /* * we want to count the total * number of entries in the list */ } 3 spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } 3 if (!added) 358 list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); 358 spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ 358 if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); return; } return ; } /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { 591 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 710 struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } if (pa) { /* * We want to add the pa to the right bucket. 
* Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. */ 358 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 358 spin_lock(pa->pa_obj_lock); 358 list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); 358 ext4_mb_add_n_trim(ac); } 591 ext4_mb_put_pa(ac, ac->ac_sb, pa); } 710 if (ac->ac_bitmap_page) 660 page_cache_release(ac->ac_bitmap_page); 710 if (ac->ac_buddy_page) 660 page_cache_release(ac->ac_buddy_page); 710 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 359 mutex_unlock(&ac->ac_lg->lg_mutex); 710 ext4_mb_collect_stats(ac); return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; 16 trace_ext4_mb_discard_preallocations(sb, needed); 16 for (i = 0; i < ngroups && needed > 0; i++) { 16 ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; } return freed; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; 713 might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); 713 trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ 713 if (IS_NOQUOTA(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 713 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. 
*/ 531 while (ar->len && 531 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ 152 cond_resched(); ar->len = ar->len >> 1; } 528 if (!ar->len) { 91 *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; 528 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && 528 dquot_alloc_block(ar->inode, 528 EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } 528 inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } 710 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } 710 *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; goto out; } 710 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 522 if (!ext4_mb_use_preallocated(ac)) { 661 ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ 661 *errp = ext4_mb_regular_allocator(ac); if (*errp) goto discard_and_exit; /* as we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor */ 661 if (ac->ac_status == AC_STATUS_FOUND && 660 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 514 *errp = ext4_mb_new_preallocation(ac); 514 if (*errp) { discard_and_exit: ext4_discard_allocated_blocks(ac); goto errout; } } 710 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 709 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { 1 ext4_discard_allocated_blocks(ac); goto errout; } else { 709 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { 16 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 16 if (freed) goto repeat; 7 *errp = -ENOSPC; } errout: 709 if (*errp) { 8 ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } 710 ext4_mb_release_context(ac); out: if (ac) 710 kmem_cache_free(ext4_ac_cachep, ac); 710 if (inquota && ar->len < inquota) 205 
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 710 if (!ar->len) { 8 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ 7 percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } 713 trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { if ((entry1->efd_tid == entry2->efd_tid) && (entry1->efd_group == entry2->efd_group) && ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. 
We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ page_cache_get(e4b->bd_buddy_page); page_cache_get(e4b->bd_bitmap_page); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); return 0; } } rb_link_node(new_node, parent, n); rb_insert_color(new_node, &db->bb_free_root); /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(entry, new_entry) && ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); if (can_merge(new_entry, entry) && ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ ext4_journal_callback_add(handle, ext4_free_data_callback, &new_entry->efd_jce); return 0; } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @block: start physical block to free * @count: number of blocks to count * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct buffer_head *bitmap_bh = 
NULL; 405 struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int ret; might_sleep(); if (bh) { 1 if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } 405 sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 382 !ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; } ext4_debug("freeing block %llu\n", block); 405 trace_ext4_free_blocks(inode, block, count, flags); 405 if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 1 BUG_ON(count > 1); 1 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * We need to make sure we don't reuse the freed block until * after the transaction is committed, which we can do by * treating the block as metadata, below. We make an * exception if the inode is to be written in writeback mode * since writeback mode has weak data consistency guarantees. */ 405 if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. 
*/ 405 overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } } 405 overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; } 405 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; 238 for (i = 0; i < count; i++) { 238 cond_resched(); bh = sb_find_get_block(inode->i_sb, block + i); if (!bh) continue; 238 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block + i); } } do_more: overflow = 0; 405 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 405 if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( ext4_get_group_info(sb, block_group)))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ 403 if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { 32 overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } 403 count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } 403 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); if (!gdp) { err = -EIO; goto error_return; } 403 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 403 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 403 in_range(block, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group) || 403 in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. 
ext4_std_error should be a no op */ goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); 403 err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto error_return; /* * We are about to modify some metadata. Call the journal APIs * to unshare ->b_data if a currently-committing transaction is * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); 403 err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; #ifdef AGGRESSIVE_CHECK { int i; for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif 403 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 403 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; 403 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { struct ext4_free_data *new_entry; /* * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed * * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap * with group lock held. 
generate_buddy look at * them with group lock_held */ 403 if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count, 0); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" " with %d", block_group, bit, count, err); } else 403 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 403 ext4_lock_group(sb, block_group); 403 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } 403 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (sbi->s_log_groups_per_flex) { 403 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); atomic64_add(count_clusters, &sbi->s_flex_groups[flex_group].free_clusters); } 403 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 403 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 403 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; 403 if (overflow && !err) { block += count; 31 count = overflow; put_bh(bitmap_bh); goto do_more; } error_return: 403 brelse(bitmap_bh); 405 ext4_std_error(sb, err); return; } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. 
*/
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
			 ext4_fsblk_t block, unsigned long count)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gd_bh;
	ext4_group_t block_group;
	ext4_grpblk_t bit;
	unsigned int i;
	struct ext4_group_desc *desc;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_buddy e4b;
	int err = 0, ret, blk_free_count;
	ext4_grpblk_t blocks_freed;

	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

	if (count == 0)
		return 0;

	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
	/*
	 * The whole range has to fit inside the one group that @block
	 * belongs to; unlike ext4_free_blocks() no splitting is done here.
	 */
	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
		/* no trailing newline, like the other ext4_warning() calls */
		ext4_warning(sb, "too many blocks added to group %u",
			     block_group);
		err = -EINVAL;
		goto error_return;
	}

	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
	if (IS_ERR(bitmap_bh)) {
		err = PTR_ERR(bitmap_bh);
		/* keep brelse() in the exit path from seeing an ERR_PTR */
		bitmap_bh = NULL;
		goto error_return;
	}

	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
	if (!desc) {
		err = -EIO;
		goto error_return;
	}

	/* refuse ranges touching the group's bitmaps or inode table */
	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
	    in_range(block + count - 1, ext4_inode_table(sb, desc),
		     sbi->s_itb_per_group)) {
		ext4_error(sb, "Adding blocks in system zones - "
			   "Block = %llu, count = %lu",
			   block, count);
		err = -EINVAL;
		goto error_return;
	}

	BUFFER_TRACE(bitmap_bh, "getting write access");
	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;

	/*
	 * Count only the bits that are currently set; a bit that is already
	 * clear is reported and left out of the free-space accounting.
	 */
	for (i = 0, blocks_freed = 0; i < count; i++) {
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
			ext4_error(sb, "bit already cleared for block %llu",
				   (ext4_fsblk_t)(block + i));
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
			blocks_freed++;
		}
	}

	err = ext4_mb_load_buddy(sb, block_group, &e4b);
	if (err)
		goto error_return;

	/*
	 * need to update group_info->bb_free and bitmap
	 * with group lock held. generate_buddy look at
	 * them with group lock_held
	 */
	ext4_lock_group(sb, block_group);
	mb_clear_bits(bitmap_bh->b_data, bit, count);
	mb_free_blocks(NULL, &e4b, bit, count);
	blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
	ext4_free_group_clusters_set(sb, desc, blk_free_count);
	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
	ext4_group_desc_csum_set(sb, block_group, desc);
	ext4_unlock_group(sb, block_group);
	percpu_counter_add(&sbi->s_freeclusters_counter,
			   EXT4_NUM_B2C(sbi, blocks_freed));

	if (sbi->s_log_groups_per_flex) {
		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
			     &sbi->s_flex_groups[flex_group].free_clusters);
	}

	ext4_mb_unload_buddy(&e4b);

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
	/* report the first failure of the two */
	if (!err)
		err = ret;

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, err);
	return err;
}

/**
 * ext4_trim_extent -- function to TRIM one single free extent in the group
 * @sb:		super block for the file system
 * @start:	starting block of the free extent in the alloc.
group
 * @count:	number of blocks to TRIM
 * @group:	alloc. group we are working with
 * @e4b:	ext4 buddy for the group
 * @blkdev_flags: flags for the block device
 *
 * Trim "count" blocks starting at "start" in the "group". To assure that no
 * one will allocate those blocks, mark it as used in buddy bitmap. This must
 * be called with under the group lock.
 */
static int ext4_trim_extent(struct super_block *sb, int start, int count,
			     ext4_group_t group, struct ext4_buddy *e4b,
			     unsigned long blkdev_flags)
__releases(bitlock)
__acquires(bitlock)
{
	struct ext4_free_extent ex;
	int ret = 0;

	trace_ext4_trim_extent(sb, group, start, count);

	/* The caller must already hold the group lock. */
	assert_spin_locked(ext4_group_lock_ptr(sb, group));

	ex.fe_start = start;
	ex.fe_group = group;
	ex.fe_len = count;

	/*
	 * Mark blocks used, so no one can reuse them while
	 * being trimmed.
	 */
	mb_mark_used(e4b, &ex);
	/* The group lock is dropped for the duration of the discard call. */
	ext4_unlock_group(sb, group);
	ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
	ext4_lock_group(sb, group);
	/* Hand the extent back to the buddy whatever the discard returned. */
	mb_free_blocks(NULL, e4b, start, ex.fe_len);
	return ret;
}

/**
 * ext4_trim_all_free -- function to trim all free space in alloc. group
 * @sb:			super block for file system
 * @group:		group to be trimmed
 * @start:		first group block to examine
 * @max:		last group block to examine
 * @minblocks:		minimum extent block count
 * @blkdev_flags:	flags for the block device
 *
 * ext4_trim_all_free walks through group's buddy bitmap searching for free
 * extents. When the free block is found, ext4_trim_extent is called to TRIM
 * the extent.
 *
 *
 * ext4_trim_all_free walks through group's block bitmap searching for free
 * extents. When the free extent is found, mark it as used in group buddy
 * bitmap. Then issue a TRIM command on this extent and free the extent in
 * the group buddy bitmap. This is done until whole group is scanned.
*/
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
		   ext4_grpblk_t start, ext4_grpblk_t max,
		   ext4_grpblk_t minblocks, unsigned long blkdev_flags)
{
	void *bitmap;
	ext4_grpblk_t next, count = 0, free_count = 0;
	struct ext4_buddy e4b;
	int ret = 0;

	trace_ext4_trim_all_free(sb, group, start, max);

	ret = ext4_mb_load_buddy(sb, group, &e4b);
	if (ret) {
		ext4_warning(sb, "Error %d loading buddy information for %u",
			     ret, group);
		return ret;
	}
	bitmap = e4b.bd_bitmap;

	ext4_lock_group(sb, group);
	/*
	 * Skip the group if it is already flagged as trimmed and the previous
	 * trim used a minimum extent length no larger than this one.
	 */
	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
		goto out;

	/* Nothing is free below bb_first_free, so start the scan there. */
	start = (e4b.bd_info->bb_first_free > start) ?
		e4b.bd_info->bb_first_free : start;

	while (start <= max) {
		/* [start, next) is the next run of clear (free) bits. */
		start = mb_find_next_zero_bit(bitmap, max + 1, start);
		if (start > max)
			break;
		next = mb_find_next_bit(bitmap, max + 1, start);

		if ((next - start) >= minblocks) {
			ret = ext4_trim_extent(sb, start,
					       next - start, group, &e4b,
					       blkdev_flags);
			/* -EOPNOTSUPP from the discard is not treated as fatal. */
			if (ret && ret != -EOPNOTSUPP)
				break;
			ret = 0;
			count += next - start;
		}
		free_count += next - start;
		start = next + 1;

		if (fatal_signal_pending(current)) {
			count = -ERESTARTSYS;
			break;
		}

		if (need_resched()) {
			ext4_unlock_group(sb, group);
			cond_resched();
			ext4_lock_group(sb, group);
		}

		/* Not enough free space left to form another eligible extent. */
		if ((e4b.bd_info->bb_free - free_count) < minblocks)
			break;
	}

	if (!ret) {
		ret = count;
		/*
		 * A negative count means the scan was cut short by a fatal
		 * signal.  Do not flag the group as trimmed in that case,
		 * otherwise a later trim request would skip the part of the
		 * group that was never examined.
		 */
		if (count >= 0)
			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
	}
out:
	ext4_unlock_group(sb, group);
	ext4_mb_unload_buddy(&e4b);

	ext4_debug("trimmed %d blocks in the group %d\n",
		count, group);

	return ret;
}

/**
 * ext4_trim_fs() -- trim ioctl handle function
 * @sb:			superblock for filesystem
 * @range:		fstrim_range structure
 * @blkdev_flags:	flags for the block device
 *
 * start:	First Byte to trim
 * len:		number of Bytes to trim from start
 * minlen:	minimum extent length in Bytes
 * ext4_trim_fs goes through all allocation groups containing Bytes from
 * start to start+len. For each such a group ext4_trim_all_free function
 * is invoked to trim all free space.
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
			unsigned long blkdev_flags)
{
	struct ext4_group_info *grp;
	ext4_group_t group, first_group, last_group;
	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
	uint64_t start, end, minlen, trimmed = 0;
	ext4_fsblk_t first_data_blk =
			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
	int ret = 0;

	/* Convert the byte-based request into block / cluster units. */
	start = range->start >> sb->s_blocksize_bits;
	end = start + (range->len >> sb->s_blocksize_bits) - 1;
	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
			      range->minlen >> sb->s_blocksize_bits);

	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
	    start >= max_blks ||
	    range->len < sb->s_blocksize)
		return -EINVAL;
	/* Clamp the range to [first_data_blk, max_blks - 1]. */
	if (end >= max_blks)
		end = max_blks - 1;
	if (end <= first_data_blk)
		goto out;
	if (start < first_data_blk)
		start = first_data_blk;

	/* Determine first and last group to examine based on start and end */
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
				     &first_group, &first_cluster);
	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
				     &last_group, &last_cluster);

	/* end now represents the last cluster to discard in this group */
	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

	for (group = first_group; group <= last_group; group++) {
		grp = ext4_get_group_info(sb, group);
		/* We only do this if the grp has never been initialized */
		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
			if (ret)
				break;
		}

		/*
		 * For all the groups except the last one, last cluster will
		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
		 * change it for the last group, note that last_cluster is
		 * already computed earlier by ext4_get_group_no_and_offset()
		 */
		if (group == last_group)
			end = last_cluster;

		/* Groups with fewer than minlen free clusters are skipped. */
		if (grp->bb_free >= minlen) {
			cnt = ext4_trim_all_free(sb, group, first_cluster,
						end, minlen, blkdev_flags);
			/* A negative count is an error code; stop the walk. */
			if (cnt < 0) {
				ret = cnt;
				break;
			}
			trimmed += cnt;
		}

		/*
		 * For every group except the first one, we are sure
		 * that the first cluster to discard will be cluster #0.
		 */
		first_cluster = 0;
	}

	/* Remember the minlen of the last pass that completed without error. */
	if (!ret)
		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);

out:
	/* Report the trimmed amount back to the caller, in bytes. */
	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
	return ret;
}
/* * linux/kernel/resource.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 1999 Martin Mares <mj@ucw.cz> * * Arbitrary resource management. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> #include <linux/resource_ext.h> #include <asm/io.h> struct resource ioport_resource = { .name = "PCI IO", .start = 0, .end = IO_SPACE_LIMIT, .flags = IORESOURCE_IO, }; EXPORT_SYMBOL(ioport_resource); struct resource iomem_resource = { .name = "PCI mem", .start = 0, .end = -1, .flags = IORESOURCE_MEM, }; EXPORT_SYMBOL(iomem_resource); /* constraints to be met while allocating resources */ struct resource_constraint { resource_size_t min, max, align; resource_size_t (*alignf)(void *, const struct resource *, resource_size_t, resource_size_t); void *alignf_data; }; static DEFINE_RWLOCK(resource_lock); /* * For memory hotplug, there is no way to free resource entries allocated * by boot mem after the system is up. So for reusing the resource entry * we need to remember the resource. 
*/ static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); static struct resource *next_resource(struct resource *p, bool sibling_only) { /* Caller wants to traverse through siblings only */ 6 if (sibling_only) 6 return p->sibling; if (p->child) return p->child; while (!p->sibling && p->parent) p = p->parent; return p->sibling; } static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; (*pos)++; return (void *)next_resource(p, false); } #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { struct resource *p = m->private; loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) ; return p; } static void r_stop(struct seq_file *m, void *v) __releases(resource_lock) { read_unlock(&resource_lock); } static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { start = r->start; end = r->end; } else { start = end = 0; } seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", width, start, width, end, r->name ? 
r->name : "<BAD>"); return 0; } static const struct seq_operations resource_op = { .start = r_start, .next = r_next, .stop = r_stop, .show = r_show, }; static int ioports_open(struct inode *inode, struct file *file) { int res = seq_open(file, &resource_op); if (!res) { struct seq_file *m = file->private_data; m->private = &ioport_resource; } return res; } static int iomem_open(struct inode *inode, struct file *file) { int res = seq_open(file, &resource_op); if (!res) { struct seq_file *m = file->private_data; m->private = &iomem_resource; } return res; } static const struct file_operations proc_ioports_operations = { .open = ioports_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations proc_iomem_operations = { .open = iomem_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int __init ioresources_init(void) { proc_create("ioports", 0, NULL, &proc_ioports_operations); proc_create("iomem", 0, NULL, &proc_iomem_operations); return 0; } __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ static void free_resource(struct resource *res) { if (!res) return; if (!PageSlab(virt_to_head_page(res))) { spin_lock(&bootmem_resource_lock); res->sibling = bootmem_resource_free; bootmem_resource_free = res; spin_unlock(&bootmem_resource_lock); } else { kfree(res); } } static struct resource *alloc_resource(gfp_t flags) { struct resource *res = NULL; spin_lock(&bootmem_resource_lock); if (bootmem_resource_free) { res = bootmem_resource_free; bootmem_resource_free = res->sibling; } spin_unlock(&bootmem_resource_lock); if (res) memset(res, 0, sizeof(struct resource)); else res = kzalloc(sizeof(struct resource), flags); return res; } /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { resource_size_t start = new->start; resource_size_t end = new->end; struct resource *tmp, **p; if (end < start) return 
root; if (start < root->start) return root; if (end > root->end) return root; p = &root->child; for (;;) { tmp = *p; if (!tmp || tmp->start > end) { new->sibling = tmp; *p = new; new->parent = root; return NULL; } p = &tmp->sibling; if (tmp->end < start) continue; return tmp; } } static int __release_resource(struct resource *old) { struct resource *tmp, **p; p = &old->parent->child; for (;;) { tmp = *p; if (!tmp) break; if (tmp == old) { *p = tmp->sibling; old->parent = NULL; return 0; } p = &tmp->sibling; } return -EINVAL; } static void __release_child_resources(struct resource *r) { struct resource *tmp, *p; resource_size_t size; p = r->child; r->child = NULL; while (p) { tmp = p; p = p->sibling; tmp->parent = NULL; tmp->sibling = NULL; __release_child_resources(tmp); printk(KERN_DEBUG "release child resource %pR\n", tmp); /* need to restore size, and keep flags */ size = resource_size(tmp); tmp->start = 0; tmp->end = size - 1; } } void release_child_resources(struct resource *r) { write_lock(&resource_lock); __release_child_resources(r); write_unlock(&resource_lock); } /** * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * * Returns 0 for success, conflict resource on error. */ struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); return conflict; } /** * request_resource - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * * Returns 0 for success, negative error code on error. */ int request_resource(struct resource *root, struct resource *new) { struct resource *conflict; conflict = request_resource_conflict(root, new); return conflict ? 
-EBUSY : 0; } EXPORT_SYMBOL(request_resource); /** * release_resource - release a previously reserved resource * @old: resource pointer */ int release_resource(struct resource *old) { int retval; write_lock(&resource_lock); retval = __release_resource(old); write_unlock(&resource_lock); return retval; } EXPORT_SYMBOL(release_resource); /* * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. * This walks through whole tree and not just first level children * until and unless first_level_children_only is true. */ static int find_next_iomem_res(struct resource *res, char *name, bool first_level_children_only) { resource_size_t start, end; struct resource *p; bool sibling_only = false; BUG_ON(!res); 6 start = res->start; end = res->end; BUG_ON(start >= end); if (first_level_children_only) sibling_only = true; 6 read_lock(&resource_lock); 6 for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { 6 if (p->flags != res->flags) continue; 6 if (name && strcmp(p->name, name)) continue; 6 if (p->start > end) { p = NULL; break; } 6 if ((p->end >= start) && (p->start < end)) break; } 6 read_unlock(&resource_lock); 6 if (!p) return -1; /* copy data */ if (res->start < p->start) res->start = p->start; 6 if (res->end > p->end) res->end = p->end; return 0; } /* * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * name are valid candidates. 
*
 * @name: name of resource
 * @flags: resource flags
 * @start: start addr
 * @end: end addr
 */
int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
		void *arg, int (*func)(u64, u64, void *))
{
	struct resource res;
	u64 orig_end;
	/* Returned unchanged when no matching range is found. */
	int ret = -1;

	res.start = start;
	res.end = end;
	res.flags = flags;
	orig_end = res.end;
	while ((res.start < res.end) &&
		(!find_next_iomem_res(&res, name, false))) {
		ret = (*func)(res.start, res.end, arg);
		/* A non-zero callback result stops the walk and is returned. */
		if (ret)
			break;
		/* Continue searching just past the range that was reported. */
		res.start = res.end + 1;
		res.end = orig_end;
	}
	return ret;
}

/*
 * This function calls callback against all memory range of "System RAM"
 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
 * Now, this function is only for "System RAM". This function deals with
 * full ranges and not pfn. If resources are not pfn aligned, dealing
 * with pfn can truncate ranges.
 */
int walk_system_ram_res(u64 start, u64 end, void *arg,
				int (*func)(u64, u64, void *))
{
	struct resource res;
	u64 orig_end;
	/* Returned unchanged when no matching range is found. */
	int ret = -1;

	res.start = start;
	res.end = end;
	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	orig_end = res.end;
	/* Only first-level children of iomem_resource are searched here. */
	while ((res.start < res.end) &&
		(!find_next_iomem_res(&res, "System RAM", true))) {
		ret = (*func)(res.start, res.end, arg);
		if (ret)
			break;
		res.start = res.end + 1;
		res.end = orig_end;
	}
	return ret;
}

#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)

/*
 * This function calls callback against all memory range of "System RAM"
 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
 * Now, this function is only for "System RAM".
*/ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, end_pfn; u64 orig_end; int ret = -1; 6 res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && 6 (find_next_iomem_res(&res, "System RAM", true) >= 0)) { 6 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) 6 ret = (*func)(pfn, end_pfn - pfn, arg); 6 if (ret) break; 6 res.start = res.end + 1; res.end = orig_end; } 6 return ret; } #endif static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; } /* * This generic page_is_ram() returns true if specified address is * registered as "System RAM" in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } EXPORT_SYMBOL_GPL(page_is_ram); /** * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region * @name: name of resource (in iomem_resource) * * Check if the specified region partially overlaps or fully eclipses a * resource identified by @name. Return REGION_DISJOINT if the region * does not overlap @name, return REGION_MIXED if the region overlaps * @type and another resource, and return REGION_INTERSECTS if the * region overlaps @type and no other defined resource. Note, that * REGION_INTERSECTS is also returned in the case when the specified * region overlaps RAM and undefined memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. 
*/
int region_intersects(resource_size_t start, size_t size, const char *name)
{
	unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	resource_size_t end = start + size - 1;
	int type = 0; int other = 0;
	struct resource *p;

	read_lock(&resource_lock);
	/* Only the first-level children of iomem_resource are examined. */
	for (p = iomem_resource.child; p ; p = p->sibling) {
		bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;

		/* Region start falls inside this resource. */
		if (start >= p->start && start <= p->end)
			is_type ? type++ : other++;
		/* Region end falls inside this resource. */
		if (end >= p->start && end <= p->end)
			is_type ? type++ : other++;
		/* Resource lies entirely inside the region. */
		if (p->start >= start && p->end <= end)
			is_type ? type++ : other++;
	}
	read_unlock(&resource_lock);

	if (other == 0)
		return type ? REGION_INTERSECTS : REGION_DISJOINT;

	if (type)
		return REGION_MIXED;

	return REGION_DISJOINT;
}

/* Weak default: no reservations are removed from @avail. */
void __weak arch_remove_reservations(struct resource *avail)
{
}

/* Default alignment callback: place the allocation at the start of @avail. */
static resource_size_t simple_align_resource(void *data,
					     const struct resource *avail,
					     resource_size_t size,
					     resource_size_t align)
{
	return avail->start;
}

/* Shrink @res so that it lies within [min, max]. */
static void resource_clip(struct resource *res, resource_size_t min,
			  resource_size_t max)
{
	if (res->start < min)
		res->start = min;
	if (res->end > max)
		res->end = max;
}

/*
 * Find empty slot in the resource tree with the given range and
 * alignment constraints
 *
 * When @old is non-NULL, the span it occupies is treated as available.
 * On success new->start and new->end are set and 0 is returned; otherwise
 * -EBUSY is returned.
 */
static int __find_resource(struct resource *root, struct resource *old,
			 struct resource *new,
			 resource_size_t  size,
			 struct resource_constraint *constraint)
{
	struct resource *this = root->child;
	struct resource tmp = *new, avail, alloc;

	tmp.start = root->start;
	/*
	 * Skip past an allocated resource that starts at 0, since the assignment
	 * of this->start - 1 to tmp->end below would cause an underflow.
	 */
	if (this && this->start == root->start) {
		tmp.start = (this == old) ? old->start : this->end + 1;
		this = this->sibling;
	}
	for(;;) {
		/* tmp is the gap that ends just before @this (or at root end). */
		if (this)
			tmp.end = (this == old) ?  this->end : this->start - 1;
		else
			tmp.end = root->end;

		if (tmp.end < tmp.start)
			goto next;

		resource_clip(&tmp, constraint->min, constraint->max);
		arch_remove_reservations(&tmp);

		/* Check for overflow after ALIGN() */
		avail.start = ALIGN(tmp.start, constraint->align);
		avail.end = tmp.end;
		avail.flags = new->flags & ~IORESOURCE_UNSET;
		if (avail.start >= tmp.start) {
			alloc.flags = avail.flags;
			alloc.start = constraint->alignf(constraint->alignf_data, &avail,
					size, constraint->align);
			alloc.end = alloc.start + size - 1;
			/* First clause rejects a wrapped start + size - 1. */
			if (alloc.start <= alloc.end &&
			    resource_contains(&avail, &alloc)) {
				new->start = alloc.start;
				new->end = alloc.end;
				return 0;
			}
		}

next:		if (!this || this->end == root->end)
			break;

		if (this != old)
			tmp.start = this->end + 1;
		this = this->sibling;
	}
	return -EBUSY;
}

/*
 * Find empty slot in the resource tree given range and alignment.
 */
static int find_resource(struct resource *root, struct resource *new,
			resource_size_t size,
			struct resource_constraint  *constraint)
{
	return  __find_resource(root, NULL, new, size, constraint);
}

/**
 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
 *	The resource will be relocated if the new size cannot be reallocated in the
 *	current location.
 *
 * @root: root resource descriptor
 * @old:  resource descriptor desired by caller
 * @newsize: new size of the resource descriptor
 * @constraint: the size and alignment constraints to be met.
*/
static int reallocate_resource(struct resource *root, struct resource *old,
			resource_size_t newsize,
			struct resource_constraint  *constraint)
{
	int err=0;
	struct resource new = *old;
	struct resource *conflict;

	write_lock(&resource_lock);

	if ((err = __find_resource(root, old, &new, newsize, constraint)))
		goto out;

	/* The new slot covers the old one: just grow in place. */
	if (resource_contains(&new, old)) {
		old->start = new.start;
		old->end = new.end;
		goto out;
	}

	/* Anything else would shrink or move @old; refuse if it has children. */
	if (old->child) {
		err = -EBUSY;
		goto out;
	}

	if (resource_contains(old, &new)) {
		old->start = new.start;
		old->end = new.end;
	} else {
		/* Relocate: unlink, take the new range, and re-insert. */
		__release_resource(old);
		*old = new;
		conflict = __request_resource(root, old);
		BUG_ON(conflict);
	}
out:
	write_unlock(&resource_lock);
	return err;
}


/**
 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
 * 	The resource will be reallocated with a new size if it was already allocated
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 * @size: requested resource region size
 * @min: minimum boundary to allocate
 * @max: maximum boundary to allocate
 * @align: alignment requested, in bytes
 * @alignf: alignment function, optional, called if not NULL
 * @alignf_data: arbitrary data to pass to the @alignf function
 */
int allocate_resource(struct resource *root, struct resource *new,
		      resource_size_t size, resource_size_t min,
		      resource_size_t max, resource_size_t align,
		      resource_size_t (*alignf)(void *,
						const struct resource *,
						resource_size_t,
						resource_size_t),
		      void *alignf_data)
{
	int err;
	struct resource_constraint constraint;

	/* Without a caller-supplied callback, allocate at the slot's start. */
	if (!alignf)
		alignf = simple_align_resource;

	constraint.min = min;
	constraint.max = max;
	constraint.align = align;
	constraint.alignf = alignf;
	constraint.alignf_data = alignf_data;

	if ( new->parent ) {
		/* resource is already allocated, try reallocating with
		   the new constraints */
		return reallocate_resource(root, new, size, &constraint);
	}

	write_lock(&resource_lock);
	err = find_resource(root, new, size, &constraint);
	if (err >= 0 && __request_resource(root, new))
		err = -EBUSY;
	write_unlock(&resource_lock);
	return err;
}

EXPORT_SYMBOL(allocate_resource);

/**
 * lookup_resource - find an existing resource by a resource start address
 * @root: root resource descriptor
 * @start: resource start address
 *
 * Returns a pointer to the resource if found, NULL otherwise
 */
struct resource *lookup_resource(struct resource *root, resource_size_t start)
{
	struct resource *res;

	read_lock(&resource_lock);
	/* Only direct children of @root are compared. */
	for (res = root->child; res; res = res->sibling) {
		if (res->start == start)
			break;
	}
	read_unlock(&resource_lock);

	return res;
}

/*
 * Insert a resource into the resource tree. If successful, return NULL,
 * otherwise return the conflicting resource (compare to __request_resource())
 */
static struct resource * __insert_resource(struct resource *parent, struct resource *new)
{
	struct resource *first, *next;

	/*
	 * Descend while the conflicting entry strictly contains @new; stop
	 * when @new fits, cannot fit, or must become the parent of @first.
	 */
	for (;; parent = first) {
		first = __request_resource(parent, new);
		if (!first)
			return first;

		if (first == parent)
			return first;
		if (WARN_ON(first == new))	/* duplicated insertion */
			return first;

		if ((first->start > new->start) || (first->end < new->end))
			break;
		if ((first->start == new->start) && (first->end == new->end))
			break;
	}

	/* Find the last sibling, starting at @first, that @new would cover. */
	for (next = first; ; next = next->sibling) {
		/* Partial overlap? Bad, and unfixable */
		if (next->start < new->start || next->end > new->end)
			return next;
		if (!next->sibling)
			break;
		if (next->sibling->start > new->end)
			break;
	}

	/* Splice @new in place of first..next and adopt them as its children. */
	new->parent = parent;
	new->sibling = next->sibling;
	new->child = first;

	next->sibling = NULL;
	for (next = first; next; next = next->sibling)
		next->parent = new;

	if (parent->child == first) {
		parent->child = new;
	} else {
		next = parent->child;
		while (next->sibling != first)
			next = next->sibling;
		next->sibling = new;
	}
	return NULL;
}

/**
 * insert_resource_conflict - Inserts resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
 * Returns 0 on success, conflict resource if the resource can't be inserted.
*
 * This function is equivalent to request_resource_conflict when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource.
 */
struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
{
	struct resource *conflict;

	write_lock(&resource_lock);
	conflict = __insert_resource(parent, new);
	write_unlock(&resource_lock);
	return conflict;
}

/**
 * insert_resource - Inserts a resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
 * Returns 0 on success, -EBUSY if the resource can't be inserted.
 */
int insert_resource(struct resource *parent, struct resource *new)
{
	struct resource *conflict;

	conflict = insert_resource_conflict(parent, new);
	return conflict ? -EBUSY : 0;
}

/**
 * insert_resource_expand_to_fit - Insert a resource into the resource tree
 * @root: root resource descriptor
 * @new: new resource to insert
 *
 * Insert a resource into the resource tree, possibly expanding it in order
 * to make it encompass any conflicting resources.
 */
void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
{
	/* Already linked into a tree: nothing to do. */
	if (new->parent)
		return;

	write_lock(&resource_lock);
	for (;;) {
		struct resource *conflict;

		conflict = __insert_resource(root, new);
		if (!conflict)
			break;
		/* A conflict with @root itself cannot be fixed by expanding. */
		if (conflict == root)
			break;

		/* Ok, expand resource to cover the conflict, then try again .. */
		if (conflict->start < new->start)
			new->start = conflict->start;
		if (conflict->end > new->end)
			new->end = conflict->end;

		printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
	}
	write_unlock(&resource_lock);
}

/*
 * Set @res to [start, start + size - 1] if the new range stays inside the
 * parent, does not reach the neighbouring siblings, and still covers every
 * child of @res.  Returns 0 on success and -EBUSY otherwise.  The caller is
 * expected to hold resource_lock (adjust_resource() takes it).
 */
static int __adjust_resource(struct resource *res, resource_size_t start,
				resource_size_t size)
{
	struct resource *tmp, *parent = res->parent;
	resource_size_t end = start + size - 1;
	int result = -EBUSY;

	/* A resource without a parent only has its children to respect. */
	if (!parent)
		goto skip;

	if ((start < parent->start) || (end > parent->end))
		goto out;

	/* Must not run into the next sibling ... */
	if (res->sibling && (res->sibling->start <= end))
		goto out;

	/* ... nor into the previous one. */
	tmp = parent->child;
	if (tmp != res) {
		while (tmp->sibling != res)
			tmp = tmp->sibling;
		if (start <= tmp->end)
			goto out;
	}

skip:
	for (tmp = res->child; tmp; tmp = tmp->sibling)
		if ((tmp->start < start) || (tmp->end > end))
			goto out;

	res->start = start;
	res->end = end;
	result = 0;

 out:
	return result;
}

/**
 * adjust_resource - modify a resource's start and size
 * @res: resource to modify
 * @start: new start value
 * @size: new size
 *
 * Given an existing resource, change its start and size to match the
 * arguments.  Returns 0 on success, -EBUSY if it can't fit.
 * Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
			resource_size_t size)
{
	int result;

	write_lock(&resource_lock);
	result = __adjust_resource(res, start, size);
	write_unlock(&resource_lock);
	return result;
}
EXPORT_SYMBOL(adjust_resource);

/*
 * Reserve [start, end] under @root as busy resources named @name, working
 * around existing entries: where a conflict is found the requested range is
 * trimmed or split in two and the remaining pieces are retried.  The caller
 * holds resource_lock (see reserve_region_with_split()).
 */
static void __init __reserve_region_with_split(struct resource *root,
		resource_size_t start, resource_size_t end,
		const char *name)
{
	struct resource *parent = root;
	struct resource *conflict;
	struct resource *res = alloc_resource(GFP_ATOMIC);
	struct resource *next_res = NULL;

	if (!res)
		return;

	res->name = name;
	res->start = start;
	res->end = end;
	res->flags = IORESOURCE_BUSY;

	while (1) {

		conflict = __request_resource(parent, res);
		if (!conflict) {
			/* Inserted; move on to the pending upper piece, if any. */
			if (!next_res)
				break;
			res = next_res;
			next_res = NULL;
			continue;
		}

		/* conflict covered whole area */
		if (conflict->start <= res->start &&
				conflict->end >= res->end) {
			free_resource(res);
			WARN_ON(next_res);
			break;
		}

		/* failed, split and try again */
		if (conflict->start > res->start) {
			/* Keep the part below the conflict in res ... */
			end = res->end;
			res->end = conflict->start - 1;
			/* ... and queue the part above it, if there is one. */
			if (conflict->end < end) {
				next_res = alloc_resource(GFP_ATOMIC);
				if (!next_res) {
					free_resource(res);
					break;
				}
				next_res->name = name;
				next_res->start = conflict->end + 1;
				next_res->end = end;
				next_res->flags = IORESOURCE_BUSY;
			}
		} else {
			/* Conflict overlaps the front: retry above it. */
			res->start = conflict->end + 1;
		}
	}

}

/*
 * Locked entry point for __reserve_region_with_split().  A request that
 * sticks out of @root is logged and clamped to @root; a request entirely
 * outside @root is logged and dropped.
 */
void __init reserve_region_with_split(struct resource *root,
		resource_size_t start, resource_size_t end,
		const char *name)
{
	int abort = 0;

	write_lock(&resource_lock);
	if (root->start > start || root->end < end) {
		pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
		       (unsigned long long)start, (unsigned long long)end,
		       root);
		if (start > root->end || end < root->start)
			abort = 1;
		else {
			if (end > root->end)
				end = root->end;
			if (start < root->start)
				start = root->start;
			pr_err("fixing request to [0x%llx-0x%llx]\n",
			       (unsigned long long)start,
			       (unsigned long long)end);
		}
		dump_stack();
	}
	if (!abort)
		__reserve_region_with_split(root, start, end, name);
	write_unlock(&resource_lock);
}

/**
 * resource_alignment - calculate resource's alignment
 * @res: resource pointer
 *
 * Returns alignment on success, 0 (invalid alignment) on failure.
 */
resource_size_t resource_alignment(struct resource *res)
{
	switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
	case IORESOURCE_SIZEALIGN:
		return resource_size(res);
	case IORESOURCE_STARTALIGN:
		return res->start;
	default:
		return 0;
	}
}

/*
 * This is compatibility stuff for IO resources.
 *
 * Note how this, unlike the above, knows about
 * the IO flag meanings (busy etc).
 *
 * request_region creates a new busy region.
 *
 * release_region releases a matching busy region.
 */

static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);

/**
 * __request_region - create a new busy resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 * @name: reserving caller's ID string
 * @flags: IO resource flags
 */
struct resource * __request_region(struct resource *parent,
				   resource_size_t start, resource_size_t n,
				   const char *name, int flags)
{
	DECLARE_WAITQUEUE(wait, current);
	struct resource *res = alloc_resource(GFP_KERNEL);

	if (!res)
		return NULL;

	res->name = name;
	res->start = start;
	res->end = start + n - 1;
	res->flags = resource_type(parent);
	res->flags |= IORESOURCE_BUSY | flags;

	write_lock(&resource_lock);

	for (;;) {
		struct resource *conflict;

		conflict = __request_resource(parent, res);
		if (!conflict)
			break;
		if (conflict != parent) {
			/* A non-busy conflict becomes the parent to retry under. */
			if (!(conflict->flags & IORESOURCE_BUSY)) {
				parent = conflict;
				continue;
			}
		}
		/*
		 * Both sides are IORESOURCE_MUXED: sleep until
		 * __release_region() wakes the queue, then try again.
		 */
		if (conflict->flags & flags & IORESOURCE_MUXED) {
			add_wait_queue(&muxed_resource_wait, &wait);
			write_unlock(&resource_lock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule();
			remove_wait_queue(&muxed_resource_wait, &wait);
			write_lock(&resource_lock);
			continue;
		}
		/* Uhhuh, that didn't work out.. */
		free_resource(res);
		res = NULL;
		break;
	}
	write_unlock(&resource_lock);
	return res;
}
EXPORT_SYMBOL(__request_region);

/**
 * __release_region - release a previously reserved resource region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @n: resource region size
 *
 * The described resource region must match a currently busy region.
 */
void __release_region(struct resource *parent, resource_size_t start,
			resource_size_t n)
{
	struct resource **p;
	resource_size_t end;

	p = &parent->child;
	end = start + n - 1;

	write_lock(&resource_lock);

	for (;;) {
		struct resource *res = *p;

		if (!res)
			break;
		if (res->start <= start && res->end >= end) {
			/* Non-busy container: look for the region inside it. */
			if (!(res->flags & IORESOURCE_BUSY)) {
				p = &res->child;
				continue;
			}
			/* A busy region is only released on an exact match. */
			if (res->start != start || res->end != end)
				break;
			*p = res->sibling;
			write_unlock(&resource_lock);
			if (res->flags & IORESOURCE_MUXED)
				wake_up(&muxed_resource_wait);
			free_resource(res);
			return;
		}
		p = &res->sibling;
	}

	write_unlock(&resource_lock);

	printk(KERN_WARNING "Trying to free nonexistent resource "
		"<%016llx-%016llx>\n", (unsigned long long)start,
		(unsigned long long)end);
}
EXPORT_SYMBOL(__release_region);

#ifdef CONFIG_MEMORY_HOTREMOVE
/**
 * release_mem_region_adjustable - release a previously reserved memory region
 * @parent: parent resource descriptor
 * @start: resource start address
 * @size: resource region size
 *
 * This interface is intended for memory hot-delete.  The requested region
 * is released from a currently busy memory resource.  The requested region
 * must either match exactly or fit into a single busy resource entry.  In
 * the latter case, the remaining resource is adjusted accordingly.
 * Existing children of the busy memory resource must be immutable in the
 * request.
 *
 * Note:
 * - Additional release conditions, such as overlapping region, can be
 *   supported after they are confirmed as valid cases.
* - When a busy memory resource gets split into two entries, the code * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ int release_mem_region_adjustable(struct resource *parent, resource_size_t start, resource_size_t size) { struct resource **p; struct resource *res; struct resource *new_res; resource_size_t end; int ret = -EINVAL; end = start + size - 1; if ((start < parent->start) || (end > parent->end)) return ret; /* The alloc_resource() result gets checked later */ new_res = alloc_resource(GFP_KERNEL); p = &parent->child; write_lock(&resource_lock); while ((res = *p)) { if (res->start >= end) break; /* look for the next resource if it does not fit into */ if (res->start > start || res->end < end) { p = &res->sibling; continue; } if (!(res->flags & IORESOURCE_MEM)) break; if (!(res->flags & IORESOURCE_BUSY)) { p = &res->child; continue; } /* found the target resource; let's adjust accordingly */ if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; free_resource(res); ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ ret = __adjust_resource(res, end + 1, res->end - end); } else if (res->start != start && res->end == end) { /* adjust the end */ ret = __adjust_resource(res, res->start, start - res->start); } else { /* split into two entries */ if (!new_res) { ret = -ENOMEM; break; } new_res->name = res->name; new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; ret = __adjust_resource(res, res->start, start - res->start); if (ret) break; res->sibling = new_res; new_res = NULL; } break; } write_unlock(&resource_lock); free_resource(new_res); return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ /* * Managed region resource */ static void devm_resource_release(struct device *dev, void *ptr) { struct resource **r = 
ptr; release_resource(*r); } /** * devm_request_resource() - request and reserve an I/O or memory resource * @dev: device for which to request the resource * @root: root of the resource tree from which to request the resource * @new: descriptor of the resource to request * * This is a device-managed version of request_resource(). There is usually * no need to release resources requested by this function explicitly since * that will be taken care of when the device is unbound from its driver. * If for some reason the resource needs to be released explicitly, because * of ordering issues for example, drivers must call devm_release_resource() * rather than the regular release_resource(). * * When a conflict is detected between any existing resources and the newly * requested resource, an error message will be printed. * * Returns 0 on success or a negative error code on failure. */ int devm_request_resource(struct device *dev, struct resource *root, struct resource *new) { struct resource *conflict, **ptr; ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; *ptr = new; conflict = request_resource_conflict(root, new); if (conflict) { dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", new, conflict->name, conflict); devres_free(ptr); return -EBUSY; } devres_add(dev, ptr); return 0; } EXPORT_SYMBOL(devm_request_resource); static int devm_resource_match(struct device *dev, void *res, void *data) { struct resource **ptr = res; return *ptr == data; } /** * devm_release_resource() - release a previously requested resource * @dev: device for which to release the resource * @new: descriptor of the resource to release * * Releases a resource previously requested using devm_request_resource(). 
*/ void devm_release_resource(struct device *dev, struct resource *new) { WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, new)); } EXPORT_SYMBOL(devm_release_resource); struct region_devres { struct resource *parent; resource_size_t start; resource_size_t n; }; static void devm_region_release(struct device *dev, void *res) { struct region_devres *this = res; __release_region(this->parent, this->start, this->n); } static int devm_region_match(struct device *dev, void *res, void *match_data) { struct region_devres *this = res, *match = match_data; return this->parent == match->parent && this->start == match->start && this->n == match->n; } struct resource * __devm_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name) { struct region_devres *dr = NULL; struct resource *res; dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) return NULL; dr->parent = parent; dr->start = start; dr->n = n; res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else devres_free(dr); return res; } EXPORT_SYMBOL(__devm_request_region); void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n) { struct region_devres match_data = { parent, start, n }; __release_region(parent, start, n); WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, &match_data)); } EXPORT_SYMBOL(__devm_release_region); /* * Called from init/main.c to reserve IO ports. 
*/ #define MAXRESERVE 4 static int __init reserve_setup(char *str) { static int reserved; static struct resource reserve[MAXRESERVE]; for (;;) { unsigned int io_start, io_num; int x = reserved; if (get_option (&str, &io_start) != 2) break; if (get_option (&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; res->child = NULL; if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) reserved = x+1; } } return 1; } __setup("reserve=", reserve_setup); /* * Check if the requested addr and size spans more than any slot in the * iomem resource tree. */ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) { struct resource *p = &iomem_resource; int err = 0; loff_t l; read_lock(&resource_lock); for (p = p->child; p ; p = r_next(NULL, p, &l)) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? */ if (p->start >= addr + size) continue; if (p->end < addr) continue; if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) continue; /* * if a resource is "BUSY", it's not a hardware resource * but a driver mapping of such a resource; we don't want * to warn for those; some drivers legitimately map only * partial hardware resources. (example: vesafb) */ if (p->flags & IORESOURCE_BUSY) continue; printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", (unsigned long long)addr, (unsigned long long)(addr + size - 1), p->name, p); err = -1; break; } read_unlock(&resource_lock); return err; } #ifdef CONFIG_STRICT_DEVMEM static int strict_iomem_checks = 1; #else static int strict_iomem_checks; #endif /* * check if an address is reserved in the iomem resource tree * returns 1 if reserved, 0 if not reserved. 
*/ int iomem_is_exclusive(u64 addr) { struct resource *p = &iomem_resource; int err = 0; loff_t l; int size = PAGE_SIZE; if (!strict_iomem_checks) return 0; addr = addr & PAGE_MASK; read_lock(&resource_lock); for (p = p->child; p ; p = r_next(NULL, p, &l)) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? */ if (p->start >= addr + size) break; if (p->end < addr) continue; if (p->flags & IORESOURCE_BUSY && p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } } read_unlock(&resource_lock); return err; } struct resource_entry *resource_list_create_entry(struct resource *res, size_t extra_size) { struct resource_entry *entry; entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); if (entry) { INIT_LIST_HEAD(&entry->node); entry->res = res ? res : &entry->__res; } return entry; } EXPORT_SYMBOL(resource_list_create_entry); void resource_list_free(struct list_head *head) { struct resource_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, node) resource_list_destroy_entry(entry); } EXPORT_SYMBOL(resource_list_free); static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) strict_iomem_checks = 0; if (strstr(str, "strict")) strict_iomem_checks = 1; return 1; } __setup("iomem=", strict_iomem);
/* FTP extension for connection tracking. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/ip.h> #include <linux/slab.h> #include <linux/ipv6.h> #include <linux/ctype.h> #include <linux/inet.h> #include <net/checksum.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_helper.h> #include <linux/netfilter/nf_conntrack_ftp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); MODULE_ALIAS_NFCT_HELPER("ftp"); /* This is slow, but it's simple. 
--RR */ static char *ftp_buffer; static DEFINE_SPINLOCK(nf_ftp_lock); #define MAX_PORTS 8 static u_int16_t ports[MAX_PORTS]; static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); static bool loose; module_param(loose, bool, 0600); unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, enum ip_conntrack_info ctinfo, enum nf_ct_ftp_type type, unsigned int protoff, unsigned int matchoff, unsigned int matchlen, struct nf_conntrack_expect *exp); EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); static struct ftp_search { const char *pattern; size_t plen; char skip; char term; enum nf_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); } search[IP_CT_DIR_MAX][2] = { [IP_CT_DIR_ORIGINAL] = { { .pattern = "PORT", .plen = sizeof("PORT") - 1, .skip = ' ', .term = '\r', .ftptype = NF_CT_FTP_PORT, .getnum = try_rfc959, }, { .pattern = "EPRT", .plen = sizeof("EPRT") - 1, .skip = ' ', .term = '\r', .ftptype = NF_CT_FTP_EPRT, .getnum = try_eprt, }, }, [IP_CT_DIR_REPLY] = { { .pattern = "227 ", .plen = sizeof("227 ") - 1, .ftptype = NF_CT_FTP_PASV, .getnum = try_rfc1123, }, { .pattern = "229 ", .plen = sizeof("229 ") - 1, .skip = '(', .term = ')', .ftptype = NF_CT_FTP_EPSV, .getnum = try_epsv_response, }, }, }; static int get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) { const char *end; int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end); if (ret > 0) return (int)(end - src); return 0; } static int try_number(const char *data, size_t dlen, u_int32_t array[], int array_size, char sep, 
char term) { u_int32_t i, len; memset(array, 0, sizeof(array[0])*array_size); /* Keep data pointing at next char. */ for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { if (*data >= '0' && *data <= '9') { array[i] = array[i]*10 + *data - '0'; } else if (*data == sep) i++; else { /* Unexpected character; true if it's the terminator (or we don't care about one) and we're finished. */ if ((*data == term || !term) && i == array_size - 1) return len; pr_debug("Char %u (got %u nums) `%u' unexpected\n", len, i, *data); return 0; } } pr_debug("Failed to fill %u numbers separated by %c\n", array_size, sep); return 0; } /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ static int try_rfc959(const char *data, size_t dlen, struct nf_conntrack_man *cmd, char term, unsigned int *offset) { int length; u_int32_t array[6]; length = try_number(data, dlen, array, 6, ',', term); if (length == 0) return 0; cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); cmd->u.tcp.port = htons((array[4] << 8) | array[5]); return length; } /* * From RFC 1123: * The format of the 227 reply to a PASV command is not * well standardized. In particular, an FTP client cannot * assume that the parentheses shown on page 40 of RFC-959 * will be present (and in fact, Figure 3 on page 43 omits * them). Therefore, a User-FTP program that interprets * the PASV reply must scan the reply for the first digit * of the host and port numbers. */ static int try_rfc1123(const char *data, size_t dlen, struct nf_conntrack_man *cmd, char term, unsigned int *offset) { int i; for (i = 0; i < dlen; i++) if (isdigit(data[i])) break; if (i == dlen) return 0; *offset += i; return try_rfc959(data + i, dlen - i, cmd, 0, offset); } /* Grab port: number up to delimiter */ static int get_port(const char *data, int start, size_t dlen, char delim, __be16 *port) { u_int16_t tmp_port = 0; int i; for (i = start; i < dlen; i++) { /* Finished? 
*/ if (data[i] == delim) { if (tmp_port == 0) break; *port = htons(tmp_port); pr_debug("get_port: return %d\n", tmp_port); return i + 1; } else if (data[i] >= '0' && data[i] <= '9') tmp_port = tmp_port*10 + data[i] - '0'; else { /* Some other crap */ pr_debug("get_port: invalid char.\n"); break; } } return 0; } /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, char term, unsigned int *offset) { char delim; int length; /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, then delimiter again. */ if (dlen <= 3) { pr_debug("EPRT: too short\n"); return 0; } delim = data[0]; if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { pr_debug("try_eprt: invalid delimitter.\n"); return 0; } if ((cmd->l3num == PF_INET && data[1] != '1') || (cmd->l3num == PF_INET6 && data[1] != '2')) { pr_debug("EPRT: invalid protocol number.\n"); return 0; } pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim); if (data[1] == '1') { u_int32_t array[4]; /* Now we have IP address. */ length = try_number(data + 3, dlen - 3, array, 4, '.', delim); if (length != 0) cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]); } else { /* Now we have IPv6 address. */ length = get_ipv6_addr(data + 3, dlen - 3, (struct in6_addr *)cmd->u3.ip6, delim); } if (length == 0) return 0; pr_debug("EPRT: Got IP address!\n"); /* Start offset includes initial "|1|", and trailing delimiter */ return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); } /* Returns 0, or length of numbers: |||6446| */ static int try_epsv_response(const char *data, size_t dlen, struct nf_conntrack_man *cmd, char term, unsigned int *offset) { char delim; /* Three delimiters. 
*/ if (dlen <= 3) return 0; delim = data[0]; if (isdigit(delim) || delim < 33 || delim > 126 || data[1] != delim || data[2] != delim) return 0; return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); } /* Return 1 for match, 0 for accept, -1 for partial. */ static int find_pattern(const char *data, size_t dlen, const char *pattern, size_t plen, char skip, char term, unsigned int *numoff, unsigned int *numlen, struct nf_conntrack_man *cmd, int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *)) { size_t i = plen; pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); if (dlen == 0) return 0; if (dlen <= plen) { /* Short packet: try for partial? */ if (strncasecmp(data, pattern, dlen) == 0) return -1; else return 0; } if (strncasecmp(data, pattern, plen) != 0) { #if 0 size_t i; pr_debug("ftp: string mismatch\n"); for (i = 0; i < plen; i++) { pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", i, data[i], data[i], pattern[i], pattern[i]); } #endif return 0; } pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip to the 'skip' character */ if (skip) { for (i = plen; data[i] != skip; i++) if (i == dlen - 1) return -1; /* Skip over the last character */ i++; } pr_debug("Skipped up to 0x%hhx delimiter!\n", skip); *numoff = i; *numlen = getnum(data + i, dlen - i, cmd, term, numoff); if (!*numlen) return -1; pr_debug("Match succeeded!\n"); return 1; } /* Look up to see if we're just after a \n. */ static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) { unsigned int i; for (i = 0; i < info->seq_aft_nl_num[dir]; i++) if (info->seq_aft_nl[dir][i] == seq) return 1; return 0; } /* We don't update if it's older than what we have. */ static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, struct nf_ct_ftp_master *info, int dir, struct sk_buff *skb) { unsigned int i, oldest; /* Look for oldest: if we find exact match, we're done. 
*/ for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { if (info->seq_aft_nl[dir][i] == nl_seq) return; } if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; } else { if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1])) oldest = 0; else oldest = 1; if (after(nl_seq, info->seq_aft_nl[dir][oldest])) info->seq_aft_nl[dir][oldest] = nl_seq; } } static int help(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { unsigned int dataoff, datalen; const struct tcphdr *th; struct tcphdr _tcph; const char *fb_ptr; int ret; u32 seq; int dir = CTINFO2DIR(ctinfo); unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); 2 struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct); struct nf_conntrack_expect *exp; union nf_inet_addr *daddr; 2 struct nf_conntrack_man cmd = {}; unsigned int i; int found = 0, ends_in_nl; typeof(nf_nat_ftp_hook) nf_nat_ftp; /* Until there's been traffic both ways, don't look in packets. */ 2 if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) { pr_debug("ftp: Conntrackinfo = %u\n", ctinfo); 2 return NF_ACCEPT; } th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) return NF_ACCEPT; dataoff = protoff + th->doff * 4; /* No data? */ if (dataoff >= skb->len) { pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, skb->len); return NF_ACCEPT; } datalen = skb->len - dataoff; spin_lock_bh(&nf_ftp_lock); fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); BUG_ON(fb_ptr == NULL); ends_in_nl = (fb_ptr[datalen - 1] == '\n'); seq = ntohl(th->seq) + datalen; /* Look up to see if we're just after a \n. 
*/ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { /* We're picking up this, clear flags and let it continue */ if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) { ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP; goto skip_nl_seq; } /* Now if this ends in \n, update ftp info. */ pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n", ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", ct_ftp_info->seq_aft_nl[dir][0], ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", ct_ftp_info->seq_aft_nl[dir][1]); ret = NF_ACCEPT; goto out_update_nl; } skip_nl_seq: /* Initialize IP/IPv6 addr to expected address (it's not mentioned in EPSV responses) */ cmd.l3num = nf_ct_l3num(ct); memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all)); for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, datalen, search[dir][i].pattern, search[dir][i].plen, search[dir][i].skip, search[dir][i].term, &matchoff, &matchlen, &cmd, search[dir][i].getnum); if (found) break; } if (found == -1) { /* We don't usually drop packets. After all, this is connection tracking, not packet filtering. However, it is necessary for accurate tracking in this case. */ nf_ct_helper_log(skb, ct, "partial matching of `%s'", search[dir][i].pattern); ret = NF_DROP; goto out; } else if (found == 0) { /* No match */ ret = NF_ACCEPT; goto out_update_nl; } pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", matchlen, fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); exp = nf_ct_expect_alloc(ct); if (exp == NULL) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); ret = NF_DROP; goto out; } /* We refer to the reverse direction ("!dir") tuples here, * because we're expecting something in the other direction. * Doesn't matter unless NAT is happening. 
*/ daddr = &ct->tuplehash[!dir].tuple.dst.u3; /* Update the ftp info */ if ((cmd.l3num == nf_ct_l3num(ct)) && memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all))) { /* Enrico Scholz's passive FTP to partially RNAT'd ftp server: it really wants us to connect to a different IP address. Simply don't record it for NAT. */ if (cmd.l3num == PF_INET) { pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n", &cmd.u3.ip, &ct->tuplehash[dir].tuple.src.u3.ip); } else { pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n", cmd.u3.ip6, ct->tuplehash[dir].tuple.src.u3.ip6); } /* Thanks to Cristiano Lincoln Mattos <lincoln@cesar.org.br> for reporting this potential problem (DMZ machines opening holes to internal networks, or the packet filter itself). */ if (!loose) { ret = NF_ACCEPT; goto out_put_expect; } daddr = &cmd.u3; } nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num, &ct->tuplehash[!dir].tuple.src.u3, daddr, IPPROTO_TCP, NULL, &cmd.u.tcp.port); /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook); if (nf_nat_ftp && ct->status & IPS_NAT_MASK) ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, protoff, matchoff, matchlen, exp); else { /* Can't expect this? Best to drop packet now. */ if (nf_ct_expect_related(exp) != 0) { nf_ct_helper_log(skb, ct, "cannot add expectation"); ret = NF_DROP; } else ret = NF_ACCEPT; } out_put_expect: nf_ct_expect_put(exp); out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: spin_unlock_bh(&nf_ftp_lock); return ret; } static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct) { struct nf_ct_ftp_master *ftp = nfct_help_data(ct); /* This conntrack has been injected from user-space, always pick up * sequence tracking. 
Otherwise, the first FTP command after the * failover breaks. */ ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP; ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP; return 0; } static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; static const struct nf_conntrack_expect_policy ftp_exp_policy = { .max_expected = 1, .timeout = 5 * 60, }; /* don't make this __exit, since it's called from __init ! */ static void nf_conntrack_ftp_fini(void) { int i, j; for (i = 0; i < ports_c; i++) { for (j = 0; j < 2; j++) { if (ftp[i][j].me == NULL) continue; pr_debug("nf_ct_ftp: unregistering helper for pf: %d " "port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_helper_unregister(&ftp[i][j]); } } kfree(ftp_buffer); } static int __init nf_conntrack_ftp_init(void) { int i, j = -1, ret = 0; ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) return -ENOMEM; if (ports_c == 0) ports[ports_c++] = FTP_PORT; /* FIXME should be configurable whether IPv4 and IPv6 FTP connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { ftp[i][0].tuple.src.l3num = PF_INET; ftp[i][1].tuple.src.l3num = PF_INET6; for (j = 0; j < 2; j++) { ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master); ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; ftp[i][j].expect_policy = &ftp_exp_policy; ftp[i][j].me = THIS_MODULE; ftp[i][j].help = help; ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr; if (ports[i] == FTP_PORT) sprintf(ftp[i][j].name, "ftp"); else sprintf(ftp[i][j].name, "ftp-%d", ports[i]); pr_debug("nf_ct_ftp: registering helper for pf: %d " "port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); ret = nf_conntrack_helper_register(&ftp[i][j]); if (ret) { printk(KERN_ERR "nf_ct_ftp: failed to register" " helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_ftp_fini(); return ret; } } } return 0; } module_init(nf_conntrack_ftp_init); module_exit(nf_conntrack_ftp_fini);
/* * Lockless get_user_pages_fast for x86 * * Copyright (C) 2008 Nick Piggin * Copyright (C) 2008 Novell Inc. */ #include <linux/sched.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/highmem.h> #include <linux/swap.h> #include <asm/pgtable.h> static inline pte_t gup_get_pte(pte_t *ptep) { #ifndef CONFIG_X86_PAE 153 return READ_ONCE(*ptep); #else /* * With get_user_pages_fast, we walk down the pagetables without taking * any locks. For this we would like to load the pointers atomically, * but that is not possible (without expensive cmpxchg8b) on PAE. What * we do have is the guarantee that a pte will only either go from not * present to present, or present to not present or both -- it will not * switch to a completely different present page without a TLB flush in * between; something that we are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees l iff pte_high * sees h. We load pte_high *after* loading pte_low, which ensures we * don't see an older value of pte_high. *Then* we recheck pte_low, * which ensures that we haven't picked up a changed pte high. We might * have got rubbish values from pte_low and pte_high, but we are * guaranteed that pte_low will not have the present bit set *unless* * it is 'l'. And get_user_pages_fast only operates on present ptes, so * we're safe. * * gup_get_pte should not be used or copied outside gup.c without being * very careful -- it does not atomically load the pte or anything that * is likely to be useful for you. 
*/ pte_t pte; retry: pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); if (unlikely(pte.pte_low != ptep->pte_low)) goto retry; return pte; #endif } /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much * register pressure. */ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; pte_t *ptep; mask = _PAGE_PRESENT|_PAGE_USER; 153 if (write) mask |= _PAGE_RW; 153 ptep = pte_offset_map(&pmd, addr); do { 153 pte_t pte = gup_get_pte(ptep); struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } 146 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 146 page = pte_page(pte); 146 get_page(page); 146 SetPageReferenced(page); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); pte_unmap(ptep - 1); 153 return 1; } static inline void get_head_page_multiple(struct page *page, int nr) { VM_BUG_ON_PAGE(page != compound_head(page), page); VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_add(nr, &page->_count); SetPageReferenced(page); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; if ((pmd_flags(pmd) & mask) != mask) return 0; /* hugepages are never "special" */ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, 
addr != end); get_head_page_multiple(head, refs); return 1; } static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; 160 pmdp = pmd_offset(&pud, addr); do { 160 pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); /* * The pmd_trans_splitting() check below explains why * pmdp_splitting_flush has to flush the tlb, to stop * this gup-fast code from running while we set the * splitting bit in the pmd. Returning zero will take * the slow path that will call wait_split_huge_page() * if the pmd is still in splitting state. gup-fast * can't because it has irq disabled and * wait_split_huge_page() would never return as the * tlb flush IPI wouldn't run. */ 160 if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; 160 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they * can be serialised against THP migration. 
*/ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { 153 if (!gup_pte_range(pmd, addr, next, write, pages, nr)) return 0; } 91 } while (pmdp++, addr = next, addr != end); return 1; } static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); get_head_page_multiple(head, refs); return 1; } static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; 160 pudp = pud_offset(&pgd, addr); do { 160 pud_t pud = *pudp; next = pud_addr_end(addr, end); 160 if (pud_none(pud)) return 0; 160 if (unlikely(pud_large(pud))) { if (!gup_huge_pud(pud, addr, next, write, pages, nr)) return 0; } else { 160 if (!gup_pmd_range(pud, addr, next, write, pages, nr)) return 0; } 91 } while (pudp++, addr = next, addr != end); return 1; } /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; unsigned long flags; pgd_t *pgdp; int nr = 0; start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; if (unlikely(!access_ok(write ? 
VERIFY_WRITE : VERIFY_READ, (void __user *)start, len))) return 0; /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by * important workloads (eg. DB2), and whether limiting the batch size * will decrease performance. * * It seems like we're in the clear for the moment. Direct-IO is * the main guy that batches up lots of get_user_pages, and even * they are limited to 64-at-a-time which is not so many. */ /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables and pages from being freed on x86. * * So long as we atomically load page table pointers versus teardown * (which we do on x86, with the above PAE exception), we can follow the * address down to the the page and take a ref on it. */ local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = *pgdp; next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); return nr; } /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @write: whether pages will be written to * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_sem. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. 
*/
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	/*
	 * Honour the documented contract: a zero or negative request pins
	 * nothing and returns 0.  Without this guard an empty range is
	 * handed to the lower-level walkers, which are do/while loops that
	 * always handle at least one entry, and a negative count becomes a
	 * huge unsigned length below.
	 */
	if (nr_pages <= 0)
		return 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;

	/* Address range wrapped around: leave it to the slow path. */
	end = start + len;
	if (end < start)
		goto slow_irqon;

#ifdef CONFIG_X86_64
	/* Range reaches beyond the user virtual address bits. */
	if (end >> __VIRTUAL_MASK_SHIFT)
		goto slow_irqon;
#endif

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_disable();
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		/*
		 * Try to get the remaining pages with get_user_pages.
		 * Widen nr before shifting, exactly as is done for len above,
		 * so that a large pinned count cannot overflow int.
		 */
		start += (unsigned long) nr << PAGE_SHIFT;
		pages += nr;

		ret = get_user_pages_unlocked(current, mm, start,
					      (end - start) >> PAGE_SHIFT,
					      pages, write ? FOLL_WRITE : 0);

		/*
		 * Have to be a bit careful with return values: pages already
		 * pinned by the fast path are reported even when the slow
		 * path fails.
		 */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}
/* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { 9 struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && 9 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || 9 ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; 9 return 0; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. 
*/ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; 76 const int rlen = ext4_rec_len_from_disk(de->rec_len, 76 dir->i_sb->s_blocksize); if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; 76 else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; 76 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; 76 else if (unlikely(((char *) de - buf) + rlen > size)) error_msg = "directory entry overrun"; 76 else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, name_len=%d, size=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, de->name_len, size); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, name_len=%d, size=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, de->name_len, size); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; 6 struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (ext4_encrypted_inode(inode)) { err = ext4_get_encryption_info(inode); if (err && err != -ENOKEY) return err; } if (is_dx_dir(inode)) { 6 err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { return err; } /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ ext4_clear_inode_flag(file_inode(file), EXT4_INODE_INDEX); } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (ext4_encrypted_inode(inode)) { err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) return err; } offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) 1 return PTR_ERR(bh); } if (!bh) { if (!dir_has_error) { EXT4_ERROR_FILE(file, 0, "directory contains a " "hole at offset %llu", (unsigned long long) ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirent_csum_verify(inode, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. 
*/ if (file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; file->f_version = inode->i_version; } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!ext4_encrypted_inode(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fname_crypto_str.len; /* Directory is encrypted */ err = ext4_fname_disk_to_usr(inode, NULL, de, &fname_crypto_str); fname_crypto_str.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, fname_crypto_str.name, err, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax(inode)) goto done; brelse(bh); bh = NULL; offset = 0; } done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif brelse(bh); 6 return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT 9 return is_compat_task(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper 
layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || 6 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 6 return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { 5 if ((filp->f_mode & FMODE_32BITHASH) || 5 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 5 return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || 5 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { 6 if ((filp->f_mode & FMODE_32BITHASH) || 9 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. 
*/ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { 4 struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); 4 loff_t htree_max = ext4_get_htree_eof(file); 4 if (likely(dx_dir)) 4 return generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else return ext4_llseek(file, offset, whence); } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[0]; }; /* * This functoin implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; 5 rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; 3 fname = fname->next; kfree(old); } 5 *root = RB_ROOT; } static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p; 5 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; 5 p->curr_hash = pos2maj_hash(filp, pos); 5 p->curr_minor_hash = pos2min_hash(filp, pos); return p; } void ext4_htree_free_dir_info(struct dir_private_info *p) { 1 free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. 
*/
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
			     __u32 minor_hash,
			    struct ext4_dir_entry_2 *dirent,
			    struct ext4_str *ent_name)
{
	struct dir_private_info *info = dir_file->private_data;
	struct rb_node **link = &info->root.rb_node;
	struct rb_node *parent = NULL;
	struct fname *new_fn;
	int len;

	/* Create and allocate the fname structure, name and NUL included */
	len = sizeof(struct fname) + ent_name->len + 1;
	new_fn = kzalloc(len, GFP_KERNEL);
	if (!new_fn)
		return -ENOMEM;

	new_fn->hash = hash;
	new_fn->minor_hash = minor_hash;
	new_fn->inode = le32_to_cpu(dirent->inode);
	new_fn->name_len = ent_name->len;
	new_fn->file_type = dirent->file_type;
	memcpy(new_fn->name, ent_name->name, ent_name->len);
	new_fn->name[ent_name->len] = 0;

	/* Descend the tree, ordered by (hash, minor_hash). */
	while (*link) {
		struct fname *cur;

		parent = *link;
		cur = rb_entry(parent, struct fname, rb_hash);

		/*
		 * If the hash and minor hash match up, then we put
		 * them on a linked list.  This rarely happens...
		 */
		if (hash == cur->hash && minor_hash == cur->minor_hash) {
			new_fn->next = cur->next;
			cur->next = new_fn;
			return 0;
		}

		if (hash < cur->hash)
			link = &parent->rb_left;
		else if (hash > cur->hash)
			link = &parent->rb_right;
		else if (minor_hash < cur->minor_hash)
			link = &parent->rb_left;
		else /* minor_hash > cur->minor_hash */
			link = &parent->rb_right;
	}

	rb_link_node(&new_fn->rb_hash, parent, link);
	rb_insert_color(&new_fn->rb_hash, &info->root);
	return 0;
}



/*
 * This is a helper function for ext4_dx_readdir.  It calls filldir
 * for all entries on the fname linked list.  (Normally there is only
 * one entry on the linked list, unless there are 62 bit hash collisions.)
*/ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { 6 struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } 6 ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); 6 while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, 6 fname->inode, 6 get_dtype(sb, fname->file_type))) { 4 info->extra_fname = fname; return 1; } 5 fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { 6 struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { 5 info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; file->private_data = info; } 6 if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ 6 if (info->last_pos != ctx->pos) { 1 free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; 1 info->curr_hash = pos2maj_hash(file, ctx->pos); 1 info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ 6 if (info->extra_fname) { 2 if (call_filldir(file, ctx, info->extra_fname)) goto finished; 5 info->extra_fname = NULL; goto next_node; 5 } else if (!info->curr_node) 5 info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. 
*/ 1 if ((!info->curr_node) || 5 (file->f_version != inode->i_version)) { 5 info->curr_node = NULL; free_rb_tree_fname(&info->root); file->f_version = inode->i_version; ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; 5 if (ret == 0) { 1 ctx->pos = ext4_get_htree_eof(file); break; } 5 info->curr_node = rb_first(&info->root); } 5 fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; 3 if (call_filldir(file, ctx, fname)) break; next_node: 5 info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); 5 info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { 3 if (info->next_hash == ~0) { 3 ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: 6 info->last_pos = ctx->pos; return 0; } static int ext4_dir_open(struct inode * inode, struct file * filp) 6 { if (ext4_encrypted_inode(inode)) return ext4_get_encryption_info(inode) ? 
-EACCES : 0; return 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { 2 if (filp->private_data) 1 ext4_htree_free_dir_info(filp->private_data); 2 return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int nlen, rlen; unsigned int offset = 0; char *top; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .open = ext4_dir_open, .release = ext4_release_dir, };
#undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <linux/tracepoint.h> 6 TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = sk->sk_rcvbuf; ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated), TP_ARGS(sk, prot, allocated), TP_STRUCT__entry( __array(char, name, 32) __field(long *, sysctl_mem) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) ), TP_fast_assign( strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem = prot->sysctl_mem; __entry->allocated = allocated; __entry->sysctl_rmem = prot->sysctl_rmem[0]; __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld " "sysctl_rmem=%d rmem_alloc=%d", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/user_namespace.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

int suid_dumpable = 0;

static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

/*
 * Add @fmt to the list of known binary formats: at the head when @insert
 * is non-zero, at the tail otherwise.  A format without a load_binary
 * handler is refused with a warning.
 */
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
	BUG_ON(!fmt);
	if (WARN_ON(!fmt->load_binary))
		return;

	write_lock(&binfmt_lock);
	if (insert)
		list_add(&fmt->lh, &formats);
	else
		list_add_tail(&fmt->lh, &formats);
	write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

/* Remove @fmt from the list of known binary formats. */
void unregister_binfmt(struct linux_binfmt * fmt)
{
	write_lock(&binfmt_lock);
	list_del(&fmt->lh);
	write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/* Drop the module reference taken while a format handler was in use. */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
	module_put(fmt->module);
}

/*
 * True when execution from @path is forbidden, either by the mount
 * (MNT_NOEXEC) or by its superblock (SB_I_NOEXEC).
 */
bool path_noexec(const struct path *path)
{
	if (path->mnt->mnt_flags & MNT_NOEXEC)
		return true;

	return (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC) != 0;
}

#ifdef CONFIG_USELIB
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that we take the address to load from the file itself.
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
	struct linux_binfmt *fmt;
	struct file *file;
	struct filename *tmp = getname(library);
	int error = PTR_ERR(tmp);
	static const struct open_flags uselib_flags = {
		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
		.intent = LOOKUP_OPEN,
		.lookup_flags = LOOKUP_FOLLOW,
	};

	if (IS_ERR(tmp))
		goto out;

	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
	putname(tmp);
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;

	/* Only regular files on an exec-capable mount qualify. */
	if (!S_ISREG(file_inode(file)->i_mode)) {
		error = -EINVAL;
		goto exit;
	}

	if (path_noexec(&file->f_path)) {
		error = -EACCES;
		goto exit;
	}

	fsnotify_open(file);

	error = -ENOEXEC;

	/*
	 * Offer the file to each format that can load shared libraries.
	 * The list lock is dropped around the handler call; the module
	 * reference keeps the format alive meanwhile.
	 */
	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		if (!fmt->load_shlib || !try_module_get(fmt->module))
			continue;

		read_unlock(&binfmt_lock);
		error = fmt->load_shlib(file);
		read_lock(&binfmt_lock);
		put_binfmt(fmt);

		if (error != -ENOEXEC)
			break;
	}
	read_unlock(&binfmt_lock);
exit:
	fput(file);
out:
  	return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap() but it can
 * use a lot of memory, account
these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { 30 struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); 65 if (!mm || !diff) return; 63 bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; int ret; unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { ret = expand_downwards(bprm->vma, pos); if (ret < 0) return NULL; } #endif if (write) gup_flags |= FOLL_WRITE; 63 ret = get_user_pages(current, bprm->mm, pos, 1, gup_flags, &page, NULL); if (ret <= 0) return NULL; if (write) { 63 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; unsigned long ptr_size, limit; /* * Since the stack will hold pointers to the strings, we * must account for them as well. * * The size calculation is the entire vma while each arg page is * built, so each time we get here it's calculating how far it * is currently (rather than each call being just the newly * added size from the arg page). As a result, we need to * always add the entire size of the pointers, so that on the * last call to get_arg_page() we'll actually have the entire * correct size. */ ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); if (ptr_size > ULONG_MAX - size) goto fail; 63 size += ptr_size; 63 acct_arg_size(bprm, size / PAGE_SIZE); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ 63 if (size <= ARG_MAX) return page; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. 
*/ limit = _STK_LIM / 4 * 3; limit = min(limit, rlimit(RLIMIT_STACK) / 4); 63 if (size > limit) goto fail; } 2 return page; fail: put_page(page); return NULL; } static void put_arg_page(struct page *page) { 63 put_page(page); } static void free_arg_page(struct linux_binprm *bprm, int i) { } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; 69 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) return -ENOMEM; 69 down_write(&mm->mmap_sem); vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; 69 vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) goto err; 69 mm->stack_vm = mm->total_vm = 1; arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; err: up_write(&mm->mmap_sem); bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { 63 return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) 
return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; 69 err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; 69 static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; 15 if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); 15 return compat_ptr(compat); } #endif 68 if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. 
*/ static int count(struct user_arg_ptr argv, int max) { int i = 0; 69 if (argv.ptr.native != NULL) { for (;;) { 15 const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; 15 if (IS_ERR(p)) 65 return -EFAULT; 14 if (i >= max) return -E2BIG; 14 ++i; 1 if (fatal_signal_pending(current)) return -ERESTARTNOHAND; 14 cond_resched(); } } return i; } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; 63 while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; 63 str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; 63 len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; 63 if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwords. 
*/ 63 pos = bprm->p; str += len; bprm->p -= len; 63 while (len > 0) { int offset, bytes_to_copy; 63 if (fatal_signal_pending(current)) { 1 ret = -ERESTARTNOHAND; goto out; } 63 cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; 63 if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; 3 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; 63 page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } 63 if (kmapped_page) { flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); 2 put_arg_page(kmapped_page); } kmapped_page = page; 63 kaddr = kmap(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } 63 if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: 63 if (kmapped_page) { flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); 63 put_arg_page(kmapped_page); } 63 return ret; } /* * Like copy_strings, but get argv and its values from kernel memory. */ int copy_strings_kernel(int argc, const char *const *__argv, struct linux_binprm *bprm) { int r; 63 mm_segment_t oldfs = get_fs(); struct user_arg_ptr argv = { .ptr.native = (const char __user *const __user *)__argv, }; set_fs(KERNEL_DS); r = copy_strings(argc, argv, bprm); set_fs(oldfs); return r; } EXPORT_SYMBOL(copy_strings_kernel); #ifdef CONFIG_MMU /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. 
* 5) Shrink the vma to cover only the new range.
 *
 * Returns 0 on success, -EFAULT if another vma lies between the new and old
 * positions, and -ENOMEM if the vma cannot be grown or the page tables
 * cannot be moved completely.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long old_start = vma->vm_start;
	unsigned long old_end = vma->vm_end;
	unsigned long length = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end = old_end - shift;
	struct mmu_gather tlb;

	BUG_ON(new_start > new_end);

	/*
	 * ensure there are no vmas between where we want to go
	 * and where we are
	 */
	if (vma != find_vma(mm, new_start))
		return -EFAULT;

	/*
	 * cover the whole range: [new_start, old_end)
	 */
	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
		return -ENOMEM;

	/*
	 * move the page tables downwards, on failure we rely on
	 * process cleanup to remove whatever mess we made.
	 */
	if (length != move_page_tables(vma, old_start,
				       vma, new_start, length, false))
		return -ENOMEM;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, old_start, old_end);
	if (new_end > old_start) {
		/*
		 * when the old and new regions overlap clear from new_end.
		 */
		free_pgd_range(&tlb, new_end, old_end, new_end,
			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
	} else {
		/*
		 * otherwise, clean from old_start; this is done to not touch
		 * the address space in [new_end, old_start) some architectures
		 * have constraints on va-space that make this illegal (IA64) -
		 * for the others it's just a little faster.
		 */
		free_pgd_range(&tlb, old_start, old_end, new_end,
			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
	}
	tlb_finish_mmu(&tlb, old_start, old_end);

	/*
	 * Shrink the vma to just the new range.  Always succeeds.
	 */
	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);

	return 0;
}

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
*/ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; down_write(&mm->mmap_sem); vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); if (ret) goto out_unlock; BUG_ON(prev != vma); /* Move stack pages down in memory. 
*/ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; else stack_base = vma->vm_end + stack_expand; #else if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_end - rlim_stack; else stack_base = vma->vm_start - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: up_write(&mm->mmap_sem); return ret; } EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; 93 struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); 92 if (flags & AT_SYMLINK_NOFOLLOW) 1 open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; 92 if (flags & AT_EMPTY_PATH) 58 open_exec_flags.lookup_flags |= LOOKUP_EMPTY; 92 file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; err = -EACCES; 76 if (!S_ISREG(file_inode(file)->i_mode)) goto exit; 73 if (path_noexec(&file->f_path)) goto exit; 71 err = deny_write_access(file); if (err) goto exit; 70 if (name->name[0] != '\0') 23 fsnotify_open(file); out: return file; exit: 8 fput(file); return ERR_PTR(err); } struct file *open_exec(const char *name) { 10 struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if 
(!IS_ERR(filename)) { 10 f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } 10 return f; } EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, char *addr, unsigned long count) { mm_segment_t old_fs; 63 loff_t pos = offset; int result; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ result = vfs_read(file, (void __user *)addr, count, &pos); set_fs(old_fs); return result; } EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; mm_release(tsk, old_mm); if (old_mm) { sync_mm_rss(old_mm); /* * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } } task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop(active_mm); return 0; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) 
*/ static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if (signal_group_exit(sig)) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exit_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exit_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). 
*/ tsk->start_time = leader->start_time; tsk->real_start_time = leader->real_start_time; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. * Note: The old leader also uses this pid until release_task * is called. Odd but simple and correct. */ tsk->pid = leader->pid; change_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer wont't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); threadgroup_change_end(tsk); release_task(leader); } sig->group_exit_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; exit_itimers(sig); flush_itimer_signals(); if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. 
*/ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; atomic_set(&newsighand->count, 1); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); rcu_assign_pointer(tsk->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exit_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { 24 task_lock(tsk); strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { 52 task_lock(tsk); 52 trace_task_rename(tsk, buf); 52 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) { int retval; /* * Make sure we have a private signal table and that * we are unassociated from the previous thread group. */ retval = de_thread(current); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update * to be lockless. 
*/ set_mm_exe_file(bprm->mm, bprm->file); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(current->files); return 0; out: return retval; } EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { 58 struct inode *inode = file_inode(file); if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; 1 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && 1 !privileged_wrt_inode_uidgid(user_ns, inode)) user_ns = user_ns->parent; 1 if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } 58 } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { arch_pick_mmap_layout(current->mm); /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) set_dumpable(current->mm, SUID_DUMP_USER); else set_dumpable(current->mm, suid_dumpable); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. 
We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ current->mm->task_size = TASK_SIZE; /* install the new credentials */ if (!uid_eq(bprm->cred->uid, current_euid()) || !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } /* An exec changes our domain. We are no longer part of the thread group */ current->self_exec_id++; flush_signal_handlers(current, 0); } EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred and * and unlock. */ int prepare_bprm_creds(struct linux_binprm *bprm) { 93 if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR; 93 bprm->cred = prepare_exec_creds(); 93 if (likely(bprm->cred)) return 0; mutex_unlock(¤t->signal->cred_guard_mutex); return -ENOMEM; } static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); 53 if (bprm->cred) { 53 mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } 53 if (bprm->file) { 25 allow_write_access(bprm->file); fput(bprm->file); } /* If a binfmt changed the interp, free it. */ 53 if (bprm->interp != bprm->filename) 6 kfree(bprm->interp); 53 kfree(bprm); } int bprm_change_interp(char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. 
*/ 8 if (bprm->interp != bprm->filename) 1 kfree(bprm->interp); 8 bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) 8 return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * install the new credentials for this executable */ void install_exec_creds(struct linux_binprm *bprm) { security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(current->mm) != SUID_DUMP_USER) perf_event_exit_task(current); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { 93 struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) { 1 if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else 1 bprm->unsafe |= LSM_UNSAFE_PTRACE; } /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. 
*/ 93 if (task_no_new_privs(current)) 2 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; t = p; n_fs = 1; 93 spin_lock(&p->fs->lock); 93 rcu_read_lock(); 93 while_each_thread(p, t) { 93 if (t->fs == p->fs) 92 n_fs++; } 93 rcu_read_unlock(); if (p->fs->users > n_fs) 53 bprm->unsafe |= LSM_UNSAFE_SHARE; else 40 p->fs->in_exec = 1; 93 spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm) { struct inode *inode; unsigned int mode; kuid_t uid; kgid_t gid; /* clear any previous set[ug]id data from a previous binary */ 64 bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) return; 63 if (task_no_new_privs(current)) return; 61 inode = file_inode(bprm->file); mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; /* Be careful if suid/sgid is set */ 5 mutex_lock(&inode->i_mutex); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; mutex_unlock(&inode->i_mutex); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || 5 !kgid_has_mapping(bprm->cred->user_ns, gid)) return; 5 if (mode & S_ISUID) { 2 bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = uid; } 5 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1 bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = gid; } } /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). 
*/ 63 int prepare_binprm(struct linux_binprm *bprm) { int retval; 64 bprm_fill_uid(bprm); /* fill in binprm security blob */ 64 retval = security_bprm_set_creds(bprm); 64 if (retval) return retval; 63 bprm->cred_prepared = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } EXPORT_SYMBOL(prepare_binprm); /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ 8 int remove_arg_zero(struct linux_binprm *bprm) { int ret = 0; unsigned long offset; char *kaddr; struct page *page; 8 if (!bprm->argc) return 0; do { 2 offset = bprm->p & ~PAGE_MASK; 2 page = get_arg_page(bprm, bprm->p, 0); if (!page) { ret = -EFAULT; goto out; } 2 kaddr = kmap_atomic(page); 2 for (; offset < PAGE_SIZE && kaddr[offset]; 2 offset++, bprm->p++) ; 2 kunmap_atomic(kaddr); 2 put_arg_page(page); if (offset == PAGE_SIZE) free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); } while (offset == PAGE_SIZE); 2 bprm->p++; bprm->argc--; 8 ret = 0; out: return ret; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ 58 int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ 58 if (bprm->recursion_depth > 5) return -ELOOP; 58 retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: 58 read_lock(&binfmt_lock); 50 list_for_each_entry(fmt, &formats, lh) { 58 if (!try_module_get(fmt->module)) continue; 58 read_unlock(&binfmt_lock); bprm->recursion_depth++; retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); bprm->recursion_depth--; 58 if (retval < 0 && !bprm->mm) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV, current); return retval; } 58 if (retval != -ENOEXEC || !bprm->file) { read_unlock(&binfmt_lock); return retval; } } 58 read_unlock(&binfmt_lock); 23 if (need_retry) { 47 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && 6 printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; 43 if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } EXPORT_SYMBOL(search_binary_handler); static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; 58 rcu_read_lock(); 58 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); 58 rcu_read_unlock(); ret = search_binary_handler(bprm); if (ret >= 0) { audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); } return ret; } /* * sys_execve() executes a new program. 
*/ static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; int retval; 94 if (IS_ERR(filename)) 1 return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ 93 if ((current->flags & PF_NPROC_EXCEEDED) && atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ 93 current->flags &= ~PF_NPROC_EXCEEDED; retval = unshare_files(&displaced); if (retval) goto out_ret; retval = -ENOMEM; 93 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) goto out_files; 93 retval = prepare_bprm_creds(bprm); if (retval) goto out_free; 93 check_unsafe_exec(bprm); current->in_execve = 1; file = do_open_execat(fd, filename, flags); 25 retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; 70 sched_exec(); bprm->file = file; 49 if (fd == AT_FDCWD || filename->name[0] == '/') { 20 bprm->filename = filename->name; } else { 49 if (filename->name[0] == '\0') 48 pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); else 1 pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", fd, filename->name); 49 if (!pathbuf) { retval = -ENOMEM; goto out_unmark; } /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. Relies on having exclusive access to * current->files (due to unshare_files above). 
*/ 49 if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) 1 bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; 69 bprm->filename = pathbuf; } 69 bprm->interp = bprm->filename; 69 retval = bprm_mm_init(bprm); if (retval) goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out; 65 bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) goto out; 64 retval = prepare_binprm(bprm); if (retval < 0) goto out; 63 retval = copy_strings_kernel(1, &bprm->filename, bprm); if (retval < 0) goto out; 63 bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out; 63 retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; 58 would_dump(bprm, bprm->file); 58 retval = exec_binprm(bprm); if (retval < 0) goto out; /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; acct_update_integrals(current); task_numa_free(current, false); free_bprm(bprm); kfree(pathbuf); putname(filename); if (displaced) put_files_struct(displaced); return retval; out: 30 if (bprm->mm) { 30 acct_arg_size(bprm, 0); 30 mmput(bprm->mm); } out_unmark: 53 current->fs->in_exec = 0; current->in_execve = 0; out_free: 53 free_bprm(bprm); kfree(pathbuf); out_files: 53 if (displaced) 52 reset_files_struct(displaced); out_ret: 42 putname(filename); 43 return retval; } int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef 
CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ 1 void set_dumpable(struct mm_struct *mm, int value) { unsigned long old, new; 1 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; do { 1 old = ACCESS_ONCE(mm->flags); new = (old & ~MMF_DUMPABLE_MASK) | value; 1 } while (cmpxchg(&mm->flags, old, new) != old); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { int lookup_flags = (flags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; return do_execveat(fd, getname_flags(filename, lookup_flags, NULL), argv, envp, flags); } #ifdef CONFIG_COMPAT 28 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } 67 COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return compat_do_execveat(fd, getname_flags(filename, lookup_flags, NULL), argv, envp, flags); } #endif
/* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> #include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping; lock_page(page); mapping = page_mapping(page); if (mapping) { WARN_ON(!PageUptodate(page)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this page, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * page, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. 
*/ wait_on_page_writeback(page); if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, page)) { buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } } /* * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ out_unlock: unlock_page(page); return 1; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { 957 page_cache_release(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { 947 struct page *page = buf->page; int err; 947 if (!PageUptodate(page)) { 278 lock_page(page); /* * Page got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ 278 if (!page->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ 274 if (!PageUptodate(page)) { err = -EIO; goto error; } /* * Page is ok afterall, we are done. 
*/ 274 unlock_page(page); } return 0; error: 194 unlock_page(page); 194 return err; } const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, .get = generic_pipe_buf_get, }; static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return 1; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { 50 smp_mb(); if (waitqueue_active(&pipe->wait)) 11 wake_up_interruptible(&pipe->wait); 50 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. 
* */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { 1189 unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; 1188 if (!spd_pages) return 0; ret = 0; do_wakeup = 0; page_nr = 0; 1189 pipe_lock(pipe); for (;;) { 990 if (!pipe->readers) { 2 send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } 1188 if (pipe->nrbufs < pipe->buffers) { 1185 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; if (spd->flags & SPLICE_F_GIFT) 2 buf->flags |= PIPE_BUF_FLAG_GIFT; 1185 pipe->nrbufs++; page_nr++; ret += buf->len; if (pipe->files) do_wakeup = 1; 1185 if (!--spd->nr_pages) break; 1185 if (pipe->nrbufs < pipe->buffers) continue; break; } 6 if (spd->flags & SPLICE_F_NONBLOCK) { 1 if (!ret) ret = -EAGAIN; break; } 5 if (signal_pending(current)) { 1 if (!ret) ret = -ERESTARTSYS; break; } 5 if (do_wakeup) { smp_mb(); if (waitqueue_active(&pipe->wait)) wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } 5 pipe->waiting_writers++; pipe_wait(pipe); pipe->waiting_writers--; } 1187 pipe_unlock(pipe); if (do_wakeup) 44 wakeup_pipe_readers(pipe); 1188 while (page_nr < spd_pages) 5 spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { 4 page_cache_release(spd->pages[i]); } /* * Check if we need to grow the arrays holding pages and partial page * descriptions. 
*/ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { 1233 unsigned int buffers = ACCESS_ONCE(pipe->buffers); spd->nr_pages_max = buffers; if (buffers <= PIPE_DEF_BUFFERS) 1233 return 0; 5 spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); 5 if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { 1232 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; 5 kfree(spd->pages); kfree(spd->partial); } static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { 546 struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, }; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; 546 index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nr_pages = min(req_pages, spd.nr_pages_max); /* * Lookup the (hopefully) full range of pages we need. */ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* * If find_get_pages_contig() returned fewer pages than we needed, * readahead/allocate the rest and fill in the holes. */ if (spd.nr_pages < nr_pages) page_cache_sync_readahead(mapping, &in->f_ra, in, 458 index, req_pages - spd.nr_pages); error = 0; 546 while (spd.nr_pages < nr_pages) { /* * Page could be there, find_get_pages_contig() breaks on * the first hole. 
*/ 458 page = find_get_page(mapping, index); if (!page) { /* * page didn't exist, allocate one. */ 280 page = page_cache_alloc_cold(mapping); if (!page) break; 280 error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (unlikely(error)) { 1 page_cache_release(page); if (error == -EEXIST) continue; break; } /* * add_to_page_cache() locks the page, unlock it * to avoid convoluting the logic below even more. */ 280 unlock_page(page); } 458 spd.pages[spd.nr_pages++] = page; index++; } /* * Now loop over the map and see if we need to start IO on any * pages, fill in the partial map, etc. */ 546 index = *ppos >> PAGE_CACHE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; for (page_nr = 0; page_nr < nr_pages; page_nr++) { unsigned int this_len; 546 if (!len) break; /* * this_len is the max we'll use from this page */ 546 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, 145 page, index, req_pages - page_nr); /* * If the page isn't uptodate, we may need to start io on it */ 546 if (!PageUptodate(page)) { 310 lock_page(page); /* * Page was truncated, or invalidated by the * filesystem. Redo the find/create, but this time the * page is kept locked, so there's no chance of another * race with truncate/invalidate. */ 310 if (!page->mapping) { 206 unlock_page(page); page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) { error = -ENOMEM; break; } 206 page_cache_release(spd.pages[page_nr]); 546 spd.pages[page_nr] = page; } /* * page was already under io and is now done, great */ 310 if (PageUptodate(page)) { unlock_page(page); goto fill_it; } /* * need to read in the page */ 289 error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { /* * We really should re-lookup the page here, * but it complicates things a lot. 
Instead * lets just do what we already stored, and * we'll get it the next time we are called. */ if (error == AOP_TRUNCATED_PAGE) error = 0; break; } } fill_it: /* * i_size must be checked after PageUptodate. */ 546 isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 546 if (unlikely(!isize || index > end_index)) break; /* * if this is the last page, see if we need to shrink * the length and stop */ 546 if (end_index == index) { unsigned int plen; /* * max good bytes in this page */ 377 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (plen <= loff) break; /* * force quit after adding this page */ 377 this_len = min(this_len, plen - loff); len = this_len; } 546 spd.partial[page_nr].offset = loff; spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; index++; } /* * Release any pages at the end, if we quit early. 'page_nr' is how far * we got, 'nr_pages' is how many pages are in the map. */ 19 while (page_nr < nr_pages) 19 page_cache_release(spd.pages[page_nr++]); 546 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) 546 error = splice_to_pipe(pipe, &spd); 546 splice_shrink_spd(&spd); return error; } /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given file and fill them into a pipe. Can be * used as long as the address_space operations for the source implements * a readpage() hook. 
* */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { loff_t isize, left; int ret; if (IS_DAX(in->f_mapping->host)) return default_file_splice_read(in, ppos, pipe, len, flags); 552 isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; 546 left = isize - *ppos; if (unlikely(left < len)) len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret > 0) { 546 *ppos += ret; 552 file_accessed(in); } return ret; 19 } EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return 1; } /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { mm_segment_t old_fs; loff_t pos = offset; ssize_t res; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); set_fs(old_fs); return res; } ssize_t kernel_write(struct file *file, const char *buf, size_t count, loff_t pos) { mm_segment_t old_fs; ssize_t res; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_write(file, (__force const char __user *)buf, count, &pos); set_fs(old_fs); return res; } EXPORT_SYMBOL(kernel_write); ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct 
pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int nr_pages; unsigned int nr_freed; size_t offset; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; ssize_t res; size_t this_len; int error; int i; 301 struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &default_pipe_buf_ops, .spd_release = spd_release_page, }; if (splice_grow_spd(pipe, &spd)) return -ENOMEM; res = -ENOMEM; vec = __vec; 301 if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { 1 vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); if (!vec) goto shrink_ret; } 301 offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 301 for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { struct page *page; 301 page = alloc_page(GFP_USER); error = -ENOMEM; if (!page) goto err; 301 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; } 301 res = kernel_readv(in, vec, spd.nr_pages, *ppos); if (res < 0) { 15 error = res; goto err; } error = 0; 289 if (!res) goto err; nr_freed = 0; 243 for (i = 0; i < spd.nr_pages; i++) { 243 this_len = min_t(size_t, vec[i].iov_len, res); spd.partial[i].offset = 0; spd.partial[i].len = this_len; if (!this_len) { 119 __free_page(spd.pages[i]); spd.pages[i] = NULL; nr_freed++; } 243 res -= this_len; } 243 spd.nr_pages -= nr_freed; res = splice_to_pipe(pipe, &spd); if (res > 0) 292 *ppos += res; shrink_ret: 301 if (vec != __vec) 1 kfree(vec); 301 splice_shrink_spd(&spd); return res; err: 158 for (i = 0; i < spd.nr_pages; i++) 158 __free_page(spd.pages[i]); res = error; goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using 
sendpage(). Return the number of bytes sent. */ static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { 398 struct file *file = sd->u.file; loff_t pos = sd->pos; int more; if (!likely(file->f_op->sendpage)) return -EINVAL; 398 more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; 325 if (sd->len < sd->total_len && pipe->nrbufs > 1) 274 more |= MSG_SENDPAGE_NOTLAST; 398 return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { 244 smp_mb(); if (waitqueue_active(&pipe->wait)) 218 wake_up_interruptible(&pipe->wait); 244 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. 
*/
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
		/* saved now: buf->ops is cleared before ->release() is called */
		const struct pipe_buf_operations *ops = buf->ops;

		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret)) {
			/* -ENODATA from ->confirm() is reported as 0, not an error */
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		if (!buf->len) {
			buf->ops = NULL;
			ops->release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
			pipe->nrbufs--;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
*/ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ 1385 if (signal_pending(current)) return -ERESTARTSYS; 1385 while (!pipe->nrbufs) { 228 if (!pipe->writers) return 0; 226 if (!pipe->waiting_writers && sd->num_spliced) return 0; 209 if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; 208 if (signal_pending(current)) 1384 return -ERESTARTSYS; 208 if (sd->need_wakeup) { 191 wakeup_pipe_writers(pipe); sd->need_wakeup = false; } 208 pipe_wait(pipe); } return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { 654 sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) 61 wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendpage, or * pipe_to_user. 
* */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; 654 splice_from_pipe_begin(sd); do { 655 cond_resched(); 655 ret = splice_from_pipe_next(pipe, sd); if (ret > 0) 652 ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); 487 splice_from_pipe_end(pipe, sd); 487 return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; 551 struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. 
* */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { 751 struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->buffers; 751 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); ssize_t ret; if (unlikely(!array)) return -ENOMEM; 751 pipe_lock(pipe); splice_from_pipe_begin(&sd); 745 while (sd.total_len) { struct iov_iter from; size_t left; int n, idx; 751 ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; 749 if (unlikely(nbufs < pipe->buffers)) { kfree(array); nbufs = pipe->buffers; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { ret = -ENOMEM; break; } } /* build the vector */ 749 left = sd.total_len; 749 for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { 749 struct pipe_buffer *buf = pipe->bufs + idx; size_t this_len = buf->len; if (this_len > left) this_len = left; if (idx == pipe->buffers - 1) idx = -1; 749 ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) { 194 if (ret == -ENODATA) ret = 0; 194 goto done; } 749 array[n].bv_page = buf->page; array[n].bv_len = this_len; array[n].bv_offset = buf->offset; left -= this_len; } 749 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos); if (ret <= 0) break; 745 sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ 744 while (ret) { 745 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; if (ret >= buf->len) { 744 const struct pipe_buf_operations *ops = buf->ops; ret -= buf->len; buf->len = 0; buf->ops = NULL; ops->release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->files) 3 sd.need_wakeup = true; } else { 12 buf->offset += ret; buf->len -= ret; 12 ret = 0; } } } done: 749 kfree(array); 2 splice_from_pipe_end(pipe, &sd); 749 
pipe_unlock(pipe); 749 if (sd.num_spliced) 746 ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int ret; void *data; 157 loff_t tmp = sd->pos; data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); kunmap(buf->page); return ret; } static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { ssize_t ret; 158 ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); if (ret > 0) 110 *ppos += ret; 135 return ret; } /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { 398 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); 146 if (out->f_op->splice_write) splice_write = out->f_op->splice_write; else splice_write = default_file_splice_write; 1287 return splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. 
*/ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int ret; 1247 if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; 1247 ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) 1 return ret; 1246 if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else splice_read = default_file_splice_read; 1247 return splice_read(in, ppos, pipe, len, flags); } /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; long ret, bytes; umode_t i_mode; size_t len; int i, flags, more; /* * We require the input being a regular file, as we don't want to * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! */ 1205 i_mode = file_inode(in)->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; /* * neither in nor out is a pipe, setup an internal pipe attached to * 'out' and transfer the wanted data from 'in' to 'out' through that */ 1204 pipe = current->splice_pipe; if (unlikely(!pipe)) { 1168 pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ 1168 pipe->readers = 1; current->splice_pipe = pipe; } /* * Do the splice. 
*/ ret = 0; bytes = 0; 1204 len = sd->total_len; flags = sd->flags; /* * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; 1202 while (len) { size_t read_len; 1203 loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) 639 goto out_release; 1141 read_len = ret; sd->total_len = read_len; /* * If more data is pending, set SPLICE_F_MORE * If this is the last data and SPLICE_F_MORE was not set * initially, clears it. */ if (read_len < len) 1093 sd->flags |= SPLICE_F_MORE; 96 else if (!more) 96 sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ 1141 ret = actor(pipe, sd); if (unlikely(ret <= 0)) { 313 sd->pos = prev_pos; goto out_release; } 1082 bytes += ret; len -= ret; sd->pos = pos; 1071 if (ret < read_len) { 40 sd->pos = prev_pos + ret; goto out_release; } } done: 704 pipe->nrbufs = pipe->curbuf = 0; 705 file_accessed(in); return bytes; out_release: /* * If we did an incomplete transfer we must release * the pipe buffers in question: */ 639 for (i = 0; i < pipe->buffers; i++) { 639 struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { 350 buf->ops->release(pipe, buf); buf->ops = NULL; } } 639 if (!bytes) bytes = ret; goto done; } EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { 1141 struct file *file = sd->u.file; 1141 return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } /** * do_splice_direct - splices data directly between two files * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * For use by do_sendfile(). 
splice can easily emulate sendfile, but * doing it in the application would incur an extra system call * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { 1206 struct splice_desc sd = { .len = len, .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, .opos = opos, }; long ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; 1206 if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; 1205 ret = rw_verify_area(WRITE, out, opos, len); if (unlikely(ret < 0)) return ret; 1205 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) 706 *ppos = sd.pos; return ret; } EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); /* * Determine where to splice to/from. */ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; long ret; ipipe = get_pipe_info(in); opipe = get_pipe_info(out); 157 if (ipipe && opipe) { 8 if (off_in || off_out) 1 return -ESPIPE; 7 if (!(in->f_mode & FMODE_READ)) return -EBADF; 7 if (!(out->f_mode & FMODE_WRITE)) return -EBADF; /* Splicing to self would be fun, but... 
*/ 7 if (ipipe == opipe) return -EINVAL; 5 return splice_pipe_to_pipe(ipipe, opipe, len, flags); } if (ipipe) { 150 if (off_in) return -ESPIPE; if (off_out) { 4 if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; 3 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 153 return -EFAULT; } else { 145 offset = out->f_pos; } 148 if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; 148 if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; 147 ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; 146 file_start_write(out); 146 ret = do_splice_from(ipipe, out, &offset, len, flags); 22 file_end_write(out); 96 if (!off_out) 95 out->f_pos = offset; 1 else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; } 47 if (opipe) { 46 if (off_out) return -ESPIPE; 45 if (off_in) { 5 if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; } else { 40 offset = in->f_pos; } 4 ret = do_splice_to(in, &offset, opipe, len, flags); if (!off_in) in->f_pos = offset; else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; } return -EINVAL; } /* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into * our ones pages[] map instead of splitting that operation into pieces. * Could easily be exported as a generic helper for other users, in which * case one would probably want to add a 'max_nr_pages' parameter as well. 
*/ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, struct partial_page *partial, bool aligned, unsigned int pipe_buffers) { int buffers = 0, error = 0; while (nr_vecs) { unsigned long off, npages; struct iovec entry; void __user *base; size_t len; int i; error = -EFAULT; 18 if (copy_from_user(&entry, iov, sizeof(entry))) break; 18 base = entry.iov_base; len = entry.iov_len; /* * Sanity check this iovec. 0 read succeeds. */ error = 0; if (unlikely(!len)) break; 13 error = -EFAULT; 15 if (!access_ok(VERIFY_READ, base, len)) break; /* * Get this base offset and number of pages, then map * in the user pages. */ 15 off = (unsigned long) base & ~PAGE_MASK; /* * If asked for alignment, the offset must be zero and the * length a multiple of the PAGE_SIZE. */ error = -EINVAL; if (aligned && (off || len & ~PAGE_MASK)) break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; if (npages > pipe_buffers - buffers) npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); if (unlikely(error <= 0)) break; /* * Fill this contiguous range into the partial page map. */ for (i = 0; i < error; i++) { 15 const int plen = min_t(size_t, len, PAGE_SIZE - off); partial[buffers].offset = off; partial[buffers].len = plen; off = 0; len -= plen; buffers++; } /* * We didn't complete this iov, stop here since it probably * means we have to move some of this into a pipe to * be able to continue. */ 15 if (len) break; /* * Don't continue if we mapped fewer pages than we asked for, * or if we mapped the max number of pages that we have * room for. */ 11 if (error < npages || buffers == pipe_buffers) break; 8 nr_vecs--; iov++; } 18 if (buffers) return buffers; return error; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { 123 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); 123 return n == sd->len ? 
n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; long ret; struct iovec iovstack[UIO_FASTIOV]; 127 struct iovec *iov = iovstack; struct iov_iter iter; pipe = get_pipe_info(file); if (!pipe) return -EBADF; 126 ret = import_iovec(READ, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; 126 sd.total_len = iov_iter_count(&iter); sd.len = 0; sd.flags = flags; sd.u.data = &iter; sd.pos = 0; if (sd.total_len) { 124 pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } 8 kfree(iov); 9 return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; 19 struct splice_pipe_desc spd = { .pages = pages, .partial = partial, .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; long ret; pipe = get_pipe_info(file); if (!pipe) return -EBADF; 18 if (splice_grow_spd(pipe, &spd)) return -ENOMEM; 18 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, spd.partial, false, spd.nr_pages_max); 4 if (spd.nr_pages <= 0) ret = spd.nr_pages; else 15 ret = splice_to_pipe(pipe, &spd); 17 splice_shrink_spd(&spd); return ret; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. 
Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. There are two possible * solutions for that: * * - memcpy() the data internally, at which point we might as well just * do a regular read() on the buffer anyway. * - Lots of nasty vm tricks, that are neither fast nor flexible (it * has restriction limitations on both ends of the pipe). * * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ 149 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, unsigned long, nr_segs, unsigned int, flags) { struct fd f; long error; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; 148 else if (unlikely(!nr_segs)) return 0; error = -EBADF; 146 f = fdget(fd); if (f.file) { 145 if (f.file->f_mode & FMODE_WRITE) 19 error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); 128 else if (f.file->f_mode & FMODE_READ) 127 error = vmsplice_to_user(f.file, iov, nr_segs, flags); 28 fdput(f); } return error; } #ifdef CONFIG_COMPAT 151 COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, unsigned int, nr_segs, unsigned int, flags) { unsigned i; struct iovec __user *iov; if (nr_segs > UIO_MAXIOV) return -EINVAL; 149 iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); 146 for (i = 0; i < nr_segs; i++) { struct compat_iovec v; 147 if (get_user(v.iov_base, &iov32[i].iov_base) || 146 get_user(v.iov_len, &iov32[i].iov_len) || 146 put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || 146 put_user(v.iov_len, &iov[i].iov_len)) return -EFAULT; } return sys_vmsplice(fd, iov, nr_segs, flags); } #endif 209 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { struct fd in, out; long error; if (unlikely(!len)) return 0; error = -EBADF; 208 in = fdget(fd_in); if 
(in.file) { 206 if (in.file->f_mode & FMODE_READ) { 205 out = fdget(fd_out); if (out.file) { if (out.file->f_mode & FMODE_WRITE) 203 error = do_splice(in.file, off_in, out.file, off_out, len, flags); 153 fdput(out); } } 156 fdput(in); } return error; } /* * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ 9 if (pipe->nrbufs) return 0; ret = 0; 9 pipe_lock(pipe); while (!pipe->nrbufs) { 9 if (signal_pending(current)) { ret = -ERESTARTSYS; break; } 9 if (!pipe->writers) break; 7 if (!pipe->waiting_writers) { 6 if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } } 6 pipe_wait(pipe); } 9 pipe_unlock(pipe); return ret; } /* * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ 14 if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; 4 pipe_lock(pipe); while (pipe->nrbufs >= pipe->buffers) { 4 if (!pipe->readers) { 1 send_sig(SIGPIPE, current, 0); ret = -EPIPE; break; } 3 if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } 2 if (signal_pending(current)) { ret = -ERESTARTSYS; break; } 2 pipe->waiting_writers++; pipe_wait(pipe); pipe->waiting_writers--; } 4 pipe_unlock(pipe); return ret; } /* * Splice contents of ipipe to opipe. 
*/ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; int ret = 0, nbuf; bool input_wakeup = false; retry: 5 ret = ipipe_prep(ipipe, flags); if (ret) return ret; 5 ret = opipe_prep(opipe, flags); if (ret) return ret; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ 5 pipe_double_lock(ipipe, opipe); do { 5 if (!opipe->readers) { 1 send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } 4 if (!ipipe->nrbufs && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ 3 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ 2 if (ret) break; 1 if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } /* * We raced with another reader/writer and haven't * managed to process any buffers. A zero return * value means EOF, so retry instead. */ 1 pipe_unlock(ipipe); pipe_unlock(opipe); goto retry; } 3 ibuf = ipipe->bufs + ipipe->curbuf; nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { /* * Simply move the whole buffer from ipipe to opipe */ 2 *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ 1 ibuf->ops->get(ipipe, ibuf); *obuf = *ibuf; /* * Don't inherit the gift flag, we need to * prevent multiple steals of this page. 
*/ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; ibuf->len -= obuf->len; } 3 ret += obuf->len; len -= obuf->len; } while (len); 5 pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) 3 wakeup_pipe_readers(opipe); 5 if (input_wakeup) 2 wakeup_pipe_writers(ipipe); return ret; } /* * Link contents of ipipe to opipe. */ static int link_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; int ret = 0, i = 0, nbuf; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ 7 pipe_double_lock(ipipe, opipe); do { 7 if (!opipe->readers) { 1 send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } /* * If we have iterated all input buffers or ran out of * output room, break. */ 6 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; 5 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ ibuf->ops->get(ipipe, ibuf); obuf = opipe->bufs + nbuf; *obuf = *ibuf; /* * Don't inherit the gift flag, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; if (obuf->len > len) 1 obuf->len = len; 5 opipe->nrbufs++; ret += obuf->len; len -= obuf->len; i++; } while (len); /* * return EAGAIN if we have the potential of some data in the * future, otherwise just return 0 */ 6 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; 7 pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. 
*/ if (ret > 0) 5 wakeup_pipe_readers(opipe); return ret; } /* * This is a tee(1) implementation that works on pipes. It doesn't copy * any data, it simply references the 'in' pages on the 'out' pipe. * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { 13 struct pipe_inode_info *ipipe = get_pipe_info(in); struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* * Duplicate the contents of ipipe to opipe without actually * copying the data. */ 12 if (ipipe && opipe && ipipe != opipe) { /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ 4 ret = ipipe_prep(ipipe, flags); 4 if (!ret) { 9 ret = opipe_prep(opipe, flags); if (!ret) 7 ret = link_pipe(ipipe, opipe, len, flags); } } return ret; } 18 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in; int error; if (unlikely(!len)) return 0; error = -EBADF; 17 in = fdget(fdin); if (in.file) { if (in.file->f_mode & FMODE_READ) { 15 struct fd out = fdget(fdout); if (out.file) { if (out.file->f_mode & FMODE_WRITE) 13 error = do_tee(in.file, out.file, len, flags); 13 fdput(out); } } 15 fdput(in); } return error; }
#undef TRACE_SYSTEM #define TRACE_SYSTEM random #if !defined(_TRACE_RANDOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RANDOM_H #include <linux/writeback.h> #include <linux/tracepoint.h> 418 TRACE_EVENT(add_device_randomness, TP_PROTO(int bytes, unsigned long IP), TP_ARGS(bytes, IP), TP_STRUCT__entry( __field( int, bytes ) __field(unsigned long, IP ) ), TP_fast_assign( __entry->bytes = bytes; __entry->IP = IP; ), TP_printk("bytes %d caller %pS", __entry->bytes, (void *)__entry->IP) ); DECLARE_EVENT_CLASS(random__mix_pool_bytes, TP_PROTO(const char *pool_name, int bytes, unsigned long IP), TP_ARGS(pool_name, bytes, IP), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, bytes ) __field(unsigned long, IP ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->bytes = bytes; __entry->IP = IP; ), TP_printk("%s pool: bytes %d caller %pS", __entry->pool_name, __entry->bytes, (void *)__entry->IP) ); 19 DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes, TP_PROTO(const char *pool_name, int bytes, unsigned long IP), TP_ARGS(pool_name, bytes, IP) ); 355 DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock, TP_PROTO(const char *pool_name, int bytes, unsigned long IP), TP_ARGS(pool_name, bytes, IP) ); 12 TRACE_EVENT(credit_entropy_bits, TP_PROTO(const char *pool_name, int bits, int entropy_count, int entropy_total, unsigned long IP), TP_ARGS(pool_name, bits, entropy_count, entropy_total, IP), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, bits ) __field( int, entropy_count ) __field( int, entropy_total ) __field(unsigned long, IP ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->bits = bits; __entry->entropy_count = entropy_count; __entry->entropy_total = entropy_total; __entry->IP = IP; ), TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " "caller %pS", __entry->pool_name, __entry->bits, __entry->entropy_count, __entry->entropy_total, (void *)__entry->IP) ); TRACE_EVENT(push_to_pool, 
TP_PROTO(const char *pool_name, int pool_bits, int input_bits), TP_ARGS(pool_name, pool_bits, input_bits), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, pool_bits ) __field( int, input_bits ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->pool_bits = pool_bits; __entry->input_bits = input_bits; ), TP_printk("%s: pool_bits %d input_pool_bits %d", __entry->pool_name, __entry->pool_bits, __entry->input_bits) ); 357 TRACE_EVENT(debit_entropy, TP_PROTO(const char *pool_name, int debit_bits), TP_ARGS(pool_name, debit_bits), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, debit_bits ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->debit_bits = debit_bits; ), TP_printk("%s: debit_bits %d", __entry->pool_name, __entry->debit_bits) ); 36 TRACE_EVENT(add_input_randomness, TP_PROTO(int input_bits), TP_ARGS(input_bits), TP_STRUCT__entry( __field( int, input_bits ) ), TP_fast_assign( __entry->input_bits = input_bits; ), TP_printk("input_pool_bits %d", __entry->input_bits) ); TRACE_EVENT(add_disk_randomness, TP_PROTO(dev_t dev, int input_bits), TP_ARGS(dev, input_bits), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, input_bits ) ), TP_fast_assign( __entry->dev = dev; __entry->input_bits = input_bits; ), TP_printk("dev %d,%d input_pool_bits %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->input_bits) ); 4 TRACE_EVENT(xfer_secondary_pool, TP_PROTO(const char *pool_name, int xfer_bits, int request_bits, int pool_entropy, int input_entropy), TP_ARGS(pool_name, xfer_bits, request_bits, pool_entropy, input_entropy), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, xfer_bits ) __field( int, request_bits ) __field( int, pool_entropy ) __field( int, input_entropy ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->xfer_bits = xfer_bits; __entry->request_bits = request_bits; __entry->pool_entropy = pool_entropy; __entry->input_entropy = input_entropy; ), TP_printk("pool %s xfer_bits 
%d request_bits %d pool_entropy %d " "input_entropy %d", __entry->pool_name, __entry->xfer_bits, __entry->request_bits, __entry->pool_entropy, __entry->input_entropy) ); DECLARE_EVENT_CLASS(random__get_random_bytes, TP_PROTO(int nbytes, unsigned long IP), TP_ARGS(nbytes, IP), TP_STRUCT__entry( __field( int, nbytes ) __field(unsigned long, IP ) ), TP_fast_assign( __entry->nbytes = nbytes; __entry->IP = IP; ), TP_printk("nbytes %d caller %pS", __entry->nbytes, (void *)__entry->IP) ); 337 DEFINE_EVENT(random__get_random_bytes, get_random_bytes, TP_PROTO(int nbytes, unsigned long IP), TP_ARGS(nbytes, IP) ); DEFINE_EVENT(random__get_random_bytes, get_random_bytes_arch, TP_PROTO(int nbytes, unsigned long IP), TP_ARGS(nbytes, IP) ); DECLARE_EVENT_CLASS(random__extract_entropy, TP_PROTO(const char *pool_name, int nbytes, int entropy_count, unsigned long IP), TP_ARGS(pool_name, nbytes, entropy_count, IP), TP_STRUCT__entry( __field( const char *, pool_name ) __field( int, nbytes ) __field( int, entropy_count ) __field(unsigned long, IP ) ), TP_fast_assign( __entry->pool_name = pool_name; __entry->nbytes = nbytes; __entry->entropy_count = entropy_count; __entry->IP = IP; ), TP_printk("%s pool: nbytes %d entropy_count %d caller %pS", __entry->pool_name, __entry->nbytes, __entry->entropy_count, (void *)__entry->IP) ); 339 DEFINE_EVENT(random__extract_entropy, extract_entropy, TP_PROTO(const char *pool_name, int nbytes, int entropy_count, unsigned long IP), TP_ARGS(pool_name, nbytes, entropy_count, IP) ); 21 DEFINE_EVENT(random__extract_entropy, extract_entropy_user, TP_PROTO(const char *pool_name, int nbytes, int entropy_count, unsigned long IP), TP_ARGS(pool_name, nbytes, entropy_count, IP) ); 2 TRACE_EVENT(random_read, TP_PROTO(int got_bits, int need_bits, int pool_left, int input_left), TP_ARGS(got_bits, need_bits, pool_left, input_left), TP_STRUCT__entry( __field( int, got_bits ) __field( int, need_bits ) __field( int, pool_left ) __field( int, input_left ) ), 
TP_fast_assign( __entry->got_bits = got_bits; __entry->need_bits = need_bits; __entry->pool_left = pool_left; __entry->input_left = input_left; ), TP_printk("got_bits %d still_needed_bits %d " "blocking_pool_entropy_left %d input_entropy_left %d", __entry->got_bits, __entry->got_bits, __entry->pool_left, __entry->input_left) ); 12 TRACE_EVENT(urandom_read, TP_PROTO(int got_bits, int pool_left, int input_left), TP_ARGS(got_bits, pool_left, input_left), TP_STRUCT__entry( __field( int, got_bits ) __field( int, pool_left ) __field( int, input_left ) ), TP_fast_assign( __entry->got_bits = got_bits; __entry->pool_left = pool_left; __entry->input_left = input_left; ), TP_printk("got_bits %d nonblocking_pool_entropy_left %d " "input_entropy_left %d", __entry->got_bits, __entry->pool_left, __entry->input_left) ); #endif /* _TRACE_RANDOM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
#include <linux/pm.h>
#include <linux/acpi.h>

struct usb_hub_descriptor;
struct usb_dev_state;

/* Functions local to drivers/usb/core/ */

extern int usb_create_sysfs_dev_files(struct usb_device *dev);
extern void usb_remove_sysfs_dev_files(struct usb_device *dev);
extern void usb_create_sysfs_intf_files(struct usb_interface *intf);
extern void usb_remove_sysfs_intf_files(struct usb_interface *intf);
extern int usb_create_ep_devs(struct device *parent,
				struct usb_host_endpoint *endpoint,
				struct usb_device *udev);
extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint);

extern void usb_enable_endpoint(struct usb_device *dev,
		struct usb_host_endpoint *ep, bool reset_toggle);
extern void usb_enable_interface(struct usb_device *dev,
		struct usb_interface *intf, bool reset_toggles);
extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr,
		bool reset_hardware);
extern void usb_disable_interface(struct usb_device *dev,
		struct usb_interface *intf, bool reset_hardware);
extern void usb_release_interface_cache(struct kref *ref);
extern void usb_disable_device(struct usb_device *dev, int skip_ep0);
extern int usb_deauthorize_device(struct usb_device *);
extern int usb_authorize_device(struct usb_device *);
extern void usb_deauthorize_interface(struct usb_interface *);
extern void usb_authorize_interface(struct usb_interface *);
extern void usb_detect_quirks(struct usb_device *udev);
extern void usb_detect_interface_quirks(struct usb_device *udev);
extern int usb_remove_device(struct usb_device *udev);

extern int usb_get_device_descriptor(struct usb_device *dev,
		unsigned int size);
extern int usb_get_bos_descriptor(struct usb_device *dev);
extern void usb_release_bos_descriptor(struct usb_device *dev);
extern char *usb_cache_string(struct usb_device *udev, int index);
extern int usb_set_configuration(struct usb_device *dev, int configuration);
extern int usb_choose_configuration(struct usb_device *udev);

/*
 * Return configuration c's bMaxPower scaled by the per-speed unit: a
 * multiplier of 8 when udev->speed is USB_SPEED_SUPER or above, 2
 * otherwise.
 */
static inline unsigned usb_get_max_power(struct usb_device *udev,
		struct usb_host_config *c)
{
	/* SuperSpeed power is in 8 mA units; others are in 2 mA units */
	unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 8 : 2);

	return c->desc.bMaxPower * mul;
}

extern void usb_kick_hub_wq(struct usb_device *dev);
extern int usb_match_one_id_intf(struct usb_device *dev,
				 struct usb_host_interface *intf,
				 const struct usb_device_id *id);
extern int usb_match_device(struct usb_device *dev,
			    const struct usb_device_id *id);
extern void usb_forced_unbind_intf(struct usb_interface *intf);
extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev);

extern void usb_hub_release_all_ports(struct usb_device *hdev,
		struct usb_dev_state *owner);
extern bool usb_device_is_owned(struct usb_device *udev);

extern int  usb_hub_init(void);
extern void usb_hub_cleanup(void);
extern int usb_major_init(void);
extern void usb_major_cleanup(void);
extern int usb_device_supports_lpm(struct usb_device *udev);

#ifdef	CONFIG_PM

extern int usb_suspend(struct device *dev, pm_message_t msg);
extern int usb_resume(struct device *dev, pm_message_t msg);
extern int usb_resume_complete(struct device *dev);

extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg);
extern int usb_port_resume(struct usb_device *dev, pm_message_t msg);

extern void usb_autosuspend_device(struct usb_device *udev);
extern int usb_autoresume_device(struct usb_device *udev);
extern int usb_remote_wakeup(struct usb_device *dev);
extern int usb_runtime_suspend(struct device *dev);
extern int usb_runtime_resume(struct device *dev);
extern int usb_runtime_idle(struct device *dev);
extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev);
extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev);

#else

/* Without CONFIG_PM these entry points are no-ops that report success. */

static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg)
{
	return 0;
}

static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg)
{
	return 0;
}

#define usb_autosuspend_device(udev)		do {} while (0)
static inline int usb_autoresume_device(struct usb_device *udev)
{
	return 0;
}

static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev)
{
	return 0;
}

static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
{
	return 0;
}

#endif

extern struct bus_type usb_bus_type;
extern struct mutex usb_port_peer_mutex;
extern struct device_type usb_device_type;
extern struct device_type usb_if_device_type;
extern struct device_type usb_ep_device_type;
extern struct device_type usb_port_device_type;
extern struct usb_device_driver usb_generic_driver;

/*
 * The is_usb_*() helpers classify a struct device by comparing its ->type
 * pointer against the matching device_type object declared above; each
 * returns non-zero on a match.
 */
static inline int is_usb_device(const struct device *dev)
{
	return dev->type == &usb_device_type;
}

static inline int is_usb_interface(const struct device *dev)
{
	return dev->type == &usb_if_device_type;
}

static inline int is_usb_endpoint(const struct device *dev)
{
	return dev->type == &usb_ep_device_type;
}

static inline int is_usb_port(const struct device *dev)
{
	return dev->type == &usb_port_device_type;
}

/* Do the same for device drivers and interface drivers. */

/* Reads the for_devices flag of the usbdrv_wrap that embeds drv. */
static inline int is_usb_device_driver(struct device_driver *drv)
{
	return container_of(drv, struct usbdrv_wrap, driver)->
			for_devices;
}

/* for labeling diagnostics */
extern const char *usbcore_name;

/* sysfs stuff */
extern const struct attribute_group *usb_device_groups[];
extern const struct attribute_group *usb_interface_groups[];

/* usbfs stuff */
extern struct mutex usbfs_mutex;
extern struct usb_driver usbfs_driver;
extern const struct file_operations usbfs_devices_fops;
extern const struct file_operations usbdev_file_operations;
extern void usbfs_conn_disc_event(void);

extern int usb_devio_init(void);
extern void usb_devio_cleanup(void);

/*
 * Firmware specific cookie identifying a port's location. '0' == no location
 * data available
 */
typedef u32 usb_port_location_t;

/* internal notify stuff */
extern void usb_notify_add_device(struct usb_device *udev);
extern void usb_notify_remove_device(struct usb_device *udev);
extern void usb_notify_add_bus(struct usb_bus *ubus);
extern void usb_notify_remove_bus(struct usb_bus *ubus);
extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev,
		struct usb_hub_descriptor *desc);

#ifdef CONFIG_ACPI
extern int usb_acpi_register(void);
extern void usb_acpi_unregister(void);
extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev,
	int port1);
#else
/* No-op stubs when ACPI support is not built in. */
static inline int usb_acpi_register(void) { return 0; }
static inline void usb_acpi_unregister(void) { }
#endif
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
	unsigned int low_ino;
	umode_t mode;
	nlink_t nlink;
	kuid_t uid;
	kgid_t gid;
	loff_t size;
	const struct inode_operations *proc_iops;
	const struct file_operations *proc_fops;
	struct proc_dir_entry *parent;
	struct rb_root subdir;
	struct rb_node subdir_node;
	void *data;
	atomic_t count;		/* use count */
	atomic_t in_use;	/* number of callers into module in progress; */
			/* negative -> it's going away RSN */
	struct completion *pde_unload_completion;
	struct list_head pde_openers;	/* who did ->open, but not ->release */
	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
	u8 namelen;
	char name[];
};

union proc_op {
	int (*proc_get_link)(struct dentry *, struct path *);
	int (*proc_show)(struct seq_file *m,
		struct pid_namespace *ns, struct pid *pid,
		struct task_struct *task);
};

struct proc_inode {
	struct pid *pid;
	int fd;
	union proc_op op;
	struct proc_dir_entry *pde;
	struct ctl_table_header *sysctl;
	struct ctl_table *sysctl_entry;
	const struct proc_ns_operations *ns_ops;
	struct inode vfs_inode;
};

/*
 * General functions
 */

/* Map a VFS inode back to the proc_inode that embeds it as vfs_inode. */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
	return container_of(inode, struct proc_inode, vfs_inode);
}

/* Return the proc_dir_entry recorded in the inode's proc_inode. */
static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
	return PROC_I(inode)->pde;
}

/* Return the ->data pointer of the inode's proc_dir_entry. */
static inline void *__PDE_DATA(const struct inode *inode)
{
	return PDE(inode)->data;
}

/* Return the struct pid recorded in the inode's proc_inode. */
static inline struct pid *proc_pid(struct inode *inode)
{
	return PROC_I(inode)->pid;
}

/* Look up the task for the inode's pid via get_pid_task(PIDTYPE_PID). */
static inline struct task_struct *get_proc_task(struct inode *inode)
{
	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}

/*
 * Return 1 when the task has an mm whose get_dumpable() value equals
 * SUID_DUMP_USER, 0 otherwise (including when task->mm is NULL).
 * task->mm is read under task_lock().
 */
static inline int task_dumpable(struct task_struct *task)
{
	int dumpable = 0;
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm)
		dumpable = get_dumpable(mm);
	task_unlock(task);
	if (dumpable == SUID_DUMP_USER)
		return 1;
	return 0;
}

/*
 * Parse qstr->name (qstr->len bytes) as a decimal unsigned integer.
 *
 * Returns ~0U when the name is longer than one character and starts with
 * '0', when it contains a non-digit, or when the running value reaches
 * the overflow guard (~0U-9)/10 before another digit is appended.  A
 * zero-length name yields 0.
 */
static inline unsigned name_to_int(const struct qstr *qstr)
{
	const char *name = qstr->name;
	int len = qstr->len;
	unsigned n = 0;

	if (len > 1 && *name == '0')
		goto out;
	while (len-- > 0) {
		/* Unsigned subtraction: bytes below '0' wrap to > 9. */
		unsigned c = *name++ - '0';
		if (c > 9)
			goto out;
		if (n >= (~0U-9)/10)
			goto out;
		n *= 10;
		n += c;
	}
	return n;
out:
	return ~0U;
}

/*
 * Offset of the first process in the /proc root directory..
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13

/*
 * array.c
 */
extern const struct file_operations proc_tid_children_operations;

extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
			 struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
			  struct pid *, struct task_struct *);
extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
			   struct pid *, struct task_struct *);
extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
			  struct pid *, struct task_struct *);

/*
 * base.c
 */
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int proc_setattr(struct dentry *, struct iattr *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
extern int pid_revalidate(struct dentry *, unsigned int);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);

/* Lookups */
typedef int instantiate_t(struct inode *, struct dentry *,
				     struct task_struct *, const void *);
extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
			   instantiate_t, struct task_struct *, const void *);

/*
 * generic.c
 */
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
				     struct dentry *);
extern int proc_readdir(struct file *, struct dir_context *);
extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);

/* Take a reference: bumps pde->count and returns pde for chaining. */
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
	atomic_inc(&pde->count);
	return pde;
}
extern void pde_put(struct proc_dir_entry *);

/* True for a directory entry that has no inode operations attached. */
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
	return S_ISDIR(pde->mode) && !pde->proc_iops;
}
struct proc_dir_entry *proc_create_mount_point(const char *name);

/*
 * inode.c
 */
struct pde_opener {
	struct file *file;
	struct list_head lh;
	int closing;
	struct completion *c;
};
extern const struct inode_operations proc_link_inode_operations;

extern const struct inode_operations proc_pid_link_inode_operations;

extern void proc_init_inodecache(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *);
extern void proc_entry_rundown(struct proc_dir_entry *);

/*
 * proc_namespaces.c
 */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;

/*
 * proc_net.c
 */
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;

#ifdef CONFIG_NET
extern int proc_net_init(void);
#else
static inline int proc_net_init(void) { return 0; }
#endif

/*
 * proc_self.c
 */
extern int proc_setup_self(struct super_block *);

/*
 * proc_thread_self.c
 */
extern int proc_setup_thread_self(struct super_block *);
extern void proc_thread_self_init(void);

/*
 * proc_sysctl.c
 */
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
extern void sysctl_head_put(struct ctl_table_header *);
#else
/* Stub keeps the int return type of the real prototype. */
static inline int proc_sys_init(void) { return 0; }
static inline void sysctl_head_put(struct ctl_table_header *head) { }
#endif

/*
 * uid.c
 */
#ifdef CONFIG_PROC_UID
extern int proc_uid_init(void);
#else
/* Stub keeps the int return type of the real prototype. */
static inline int proc_uid_init(void) { return 0; }
#endif

/*
 * proc_tty.c
 */
#ifdef CONFIG_TTY
extern void proc_tty_init(void);
#else
static inline void proc_tty_init(void) {}
#endif

/*
 * root.c
 */
extern struct proc_dir_entry proc_root;

extern void proc_self_init(void);
extern int proc_remount(struct super_block *, int *, char *);

/*
 * task_[no]mmu.c
 */
struct proc_maps_private {
	struct inode *inode;
	struct task_struct *task;
	struct mm_struct *mm;
#ifdef CONFIG_MMU
	struct vm_area_struct *tail_vma;
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *task_mempolicy;
#endif
};

struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);

extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
				unsigned long *, unsigned long *,
				unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
#ifndef _ASM_X86_PTRACE_H #define _ASM_X86_PTRACE_H #include <asm/segment.h> #include <asm/page_types.h> #include <uapi/asm/ptrace.h> #ifndef __ASSEMBLY__ #ifdef __i386__ struct pt_regs { unsigned long bx; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; unsigned long bp; unsigned long ax; unsigned long ds; unsigned long es; unsigned long fs; unsigned long gs; unsigned long orig_ax; unsigned long ip; unsigned long cs; unsigned long flags; unsigned long sp; unsigned long ss; }; #else /* __i386__ */ struct pt_regs { /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; unsigned long r8; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long si; unsigned long di; /* * On syscall entry, this is syscall#. On CPU exception, this is error code. 
* On hw interrupt, it's IRQ number: */ unsigned long orig_ax; /* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; unsigned long sp; unsigned long ss; /* top of stack page */ }; #endif /* !__i386__ */ #ifdef CONFIG_PARAVIRT #include <asm/paravirt_types.h> #endif struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); #define profile_pc profile_pc extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, unsigned long phase1_result); extern long syscall_trace_enter(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the * register set was from protected mode with RPL-3 CS value. This * tricky test checks that with one comparison. * * On x86_64, vm86 mode is mercifully nonexistent, and we don't need * the extra check. */ static inline int user_mode(struct pt_regs *regs) 3351 { #ifdef CONFIG_X86_32 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif } static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 return (regs->flags & X86_VM_MASK); #else return 0; /* No V86 mode support in long mode */ #endif } static inline bool user_64bit_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. */ return regs->cs == __USER_CS; #else /* Headers are too twisted for this to go in paravirt.h. 
*/ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif #else /* !CONFIG_X86_64 */ return false; #endif } #ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } #endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) #define GET_USP(regs) ((regs)->sp) #include <asm-generic/ptrace.h> /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); #define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten. * @offset: offset number of the register. * * regs_get_register returns the value of a register. The @offset is the * offset of the register in struct pt_regs address which specified by @regs. * If @offset is bigger than MAX_REG_OFFSET, this returns 0. */ static inline unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) { if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 /* * Traps from the kernel do not save sp and ss. * Use the helper function to retrieve sp. */ if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); #endif return *(unsigned long *)((unsigned long)regs + offset); } /** * regs_within_kernel_stack() - check the address in the stack * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked. * * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. 
*/ static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); } /** * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns the address of the @n th entry of the * kernel stack which is specified by @regs. If the @n th entry is NOT in * the kernel stack, this returns NULL. */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) return addr; else return NULL; } /* To avoid include hell, we can't include uaccess.h */ extern long probe_kernel_read(void *dst, const void *src, size_t size); /** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which * is specified by @regs. If the @n th entry is NOT in the kernel stack * this returns 0. */ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long *addr; unsigned long val; long ret; addr = regs_get_kernel_stack_nth_addr(regs, n); if (addr) { ret = probe_kernel_read(&val, addr, sizeof(val)); if (!ret) return val; } return 0; } #define arch_has_single_step() (1) #ifdef CONFIG_X86_DEBUGCTLMSR #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif #define ARCH_HAS_USER_SINGLE_STEP_INFO /* * When hitting ptrace_stop(), we cannot return using SYSRET because * that does not restore the full CPU state, only a minimal set. 
The * ptracer can change arbitrary register values, which is usually okay * because the usual ptrace stops run off the signal delivery path which * forces IRET; however, ptrace_event() stops happen in arbitrary places * in the kernel and don't force IRET path. * * So force IRET path after a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) \ ({ \ force_iret(); \ false; \ }) struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PTRACE_H */
/* * Hash: Hash algorithms under the crypto API * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #ifndef _CRYPTO_HASH_H #define _CRYPTO_HASH_H #include <linux/crypto.h> struct crypto_ahash; /** * DOC: Message Digest Algorithm Definitions * * These data structures define modular message digest algorithm * implementations, managed via crypto_register_ahash(), * crypto_register_shash(), crypto_unregister_ahash() and * crypto_unregister_shash(). */ /** * struct hash_alg_common - define properties of message digest * @digestsize: Size of the result of the transformation. A buffer of this size * must be available to the @final and @finup calls, so they can * store the resulting hash into it. For various predefined sizes, * search include/crypto/ using * git grep _DIGEST_SIZE include/crypto. * @statesize: Size of the block for partial state of the transformation. A * buffer of this size must be passed to the @export function as it * will save the partial state of the transformation into it. On the * other side, the @import function will load the state from a * buffer of this size as well. * @base: Start of data structure of cipher algorithm. The common data * structure of crypto_alg contains information common to all ciphers. * The hash_alg_common data structure now adds the hash-specific * information. */ struct hash_alg_common { unsigned int digestsize; unsigned int statesize; struct crypto_alg base; }; struct ahash_request { struct crypto_async_request base; unsigned int nbytes; struct scatterlist *src; u8 *result; /* This field may only be used by the ahash API code. 
*/ void *priv; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; #define AHASH_REQUEST_ON_STACK(name, ahash) \ char __##name##_desc[sizeof(struct ahash_request) + \ crypto_ahash_reqsize(ahash)] CRYPTO_MINALIGN_ATTR; \ struct ahash_request *name = (void *)__##name##_desc /** * struct ahash_alg - asynchronous message digest definition * @init: Initialize the transformation context. Intended only to initialize the * state of the HASH transformation at the beginning. This shall fill in * the internal structures used during the entire duration of the whole * transformation. No data processing happens at this point. * @update: Push a chunk of data into the driver for transformation. This * function actually pushes blocks of data from upper layers into the * driver, which then passes those to the hardware as seen fit. This * function must not finalize the HASH transformation by calculating the * final message digest as this only adds more data into the * transformation. This function shall not modify the transformation * context, as this function may be called in parallel with the same * transformation object. Data processing can happen synchronously * [SHASH] or asynchronously [AHASH] at this point. * @final: Retrieve result from the driver. This function finalizes the * transformation and retrieves the resulting hash from the driver and * pushes it back to upper layers. No data processing happens at this * point. * @finup: Combination of @update and @final. This function is effectively a * combination of @update and @final calls issued in sequence. As some * hardware cannot do @update and @final separately, this callback was * added to allow such hardware to be used at least by IPsec. Data * processing can happen synchronously [SHASH] or asynchronously [AHASH] * at this point. * @digest: Combination of @init and @update and @final. This function * effectively behaves as the entire chain of operations, @init, * @update and @final issued in sequence. 
Just like @finup, this was * added for hardware which cannot do even the @finup, but can only do * the whole transformation in one run. Data processing can happen * synchronously [SHASH] or asynchronously [AHASH] at this point. * @setkey: Set optional key used by the hashing algorithm. Intended to push * optional key used by the hashing algorithm from upper layers into * the driver. This function can store the key in the transformation * context or can outright program it into the hardware. In the former * case, one must be careful to program the key into the hardware at * appropriate time and one must be careful that .setkey() can be * called multiple times during the existence of the transformation * object. Not all hashing algorithms do implement this function as it * is only needed for keyed message digests. SHAx/MDx/CRCx do NOT * implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement * this function. This function must be called before any other of the * @init, @update, @final, @finup, @digest is called. No data * processing happens at this point. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. 
* @halg: see struct hash_alg_common */ struct ahash_alg { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); struct hash_alg_common halg; }; struct shash_desc { struct crypto_shash *tfm; u32 flags; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; #define SHASH_DESC_ON_STACK(shash, ctx) \ char __##shash##_desc[sizeof(struct shash_desc) + \ crypto_shash_descsize(ctx)] CRYPTO_MINALIGN_ATTR; \ struct shash_desc *shash = (struct shash_desc *)__##shash##_desc /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg * @update: see struct ahash_alg * @final: see struct ahash_alg * @finup: see struct ahash_alg * @digest: see struct ahash_alg * @export: see struct ahash_alg * @import: see struct ahash_alg * @setkey: see struct ahash_alg * @digestsize: see struct ahash_alg * @statesize: see struct ahash_alg * @descsize: Size of the operational state for the message digest. This state * size is the memory size that needs to be allocated for * shash_desc.__ctx * @base: internally used */ struct shash_alg { int (*init)(struct shash_desc *desc); int (*update)(struct shash_desc *desc, const u8 *data, unsigned int len); int (*final)(struct shash_desc *desc, u8 *out); int (*finup)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*digest)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); int (*export)(struct shash_desc *desc, void *out); int (*import)(struct shash_desc *desc, const void *in); int (*setkey)(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); unsigned int descsize; /* These fields must match hash_alg_common. 
*/ unsigned int digestsize __attribute__ ((aligned(__alignof__(struct hash_alg_common)))); unsigned int statesize; struct crypto_alg base; }; struct crypto_ahash { int (*init)(struct ahash_request *req); int (*update)(struct ahash_request *req); int (*final)(struct ahash_request *req); int (*finup)(struct ahash_request *req); int (*digest)(struct ahash_request *req); int (*export)(struct ahash_request *req, void *out); int (*import)(struct ahash_request *req, const void *in); int (*setkey)(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); unsigned int reqsize; bool has_setkey; struct crypto_tfm base; }; struct crypto_shash { unsigned int descsize; struct crypto_tfm base; }; /** * DOC: Asynchronous Message Digest API * * The asynchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto) * * The asynchronous cipher operation discussion provided for the * CRYPTO_ALG_TYPE_ABLKCIPHER API applies here as well. */ static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_ahash, base); } /** * crypto_alloc_ahash() - allocate ahash cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * ahash cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an ahash. The returned struct * crypto_ahash is the cipher handle that is required for any subsequent * API invocation for that ahash. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. 
*/ struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) { return &tfm->base; } /** * crypto_free_ahash() - zeroize and free the ahash handle * @tfm: cipher handle to be freed */ static inline void crypto_free_ahash(struct crypto_ahash *tfm) { crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm)); } static inline unsigned int crypto_ahash_alignmask( struct crypto_ahash *tfm) { 39 return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); } /** * crypto_ahash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm) { return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm)); } static inline struct hash_alg_common *__crypto_hash_alg_common( struct crypto_alg *alg) { return container_of(alg, struct hash_alg_common, base); } static inline struct hash_alg_common *crypto_hash_alg_common( struct crypto_ahash *tfm) { 39 return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg); } /** * crypto_ahash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. 
* * * Return: message digest size of cipher */ static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->digestsize; } static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm) { return crypto_hash_alg_common(tfm)->statesize; } static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) { return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); } static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); } static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) { 17 crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); } /** * crypto_ahash_reqtfm() - obtain cipher handle from request * @req: asynchronous request handle that contains the reference to the ahash * cipher handle * * Return the ahash cipher handle that is registered with the asynchronous * request handle ahash_request. * * Return: ahash cipher handle */ static inline struct crypto_ahash *crypto_ahash_reqtfm( struct ahash_request *req) { return __crypto_ahash_cast(req->base.tfm); } /** * crypto_ahash_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return the size of the ahash state size. With the crypto_ahash_export * function, the caller can export the state into a buffer whose size is * defined with this function. * * Return: size of the ahash state */ static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) { return tfm->reqsize; } static inline void *ahash_request_ctx(struct ahash_request *req) { return req->__ctx; } /** * crypto_ahash_setkey - set key for cipher handle * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the ahash cipher. The cipher * handle must point to a keyed hash in order for this function to succeed. 
* * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm) { return tfm->has_setkey; } /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of * crypto_ahash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_ahash_finup(struct ahash_request *req); /** * crypto_ahash_final() - calculate message digest * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer registered with the ahash_request handle. * * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_ahash_final(struct ahash_request *req); /** * crypto_ahash_digest() - calculate message digest for a buffer * @req: reference to the ahash_request handle that holds all information * needed to perform the cipher operation * * This function is a "short-hand" for the function calls of crypto_ahash_init, * crypto_ahash_update and crypto_ahash_final. The parameters have the same * meaning as discussed for those separate three functions. 
* * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_ahash_digest(struct ahash_request *req); /** * crypto_ahash_export() - extract current message digest state * @req: reference to the ahash_request handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the ahash_request handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_ahash_reqsize). * * Return: 0 if the export was successful; < 0 if an error occurred */ static inline int crypto_ahash_export(struct ahash_request *req, void *out) { return crypto_ahash_reqtfm(req)->export(req, out); } /** * crypto_ahash_import() - import message digest state * @req: reference to ahash_request handle the state is imported into * @in: buffer holding the state * * This function imports the hash state into the ahash_request handle from the * input buffer. That buffer should have been generated with the * crypto_ahash_export function. * * Return: 0 if the import was successful; < 0 if an error occurred */ static inline int crypto_ahash_import(struct ahash_request *req, const void *in) { return crypto_ahash_reqtfm(req)->import(req, in); } /** * crypto_ahash_init() - (re)initialize message digest handle * @req: ahash_request handle that already is initialized with all necessary * data using the ahash_request_* API functions * * The call (re-)initializes the message digest referenced by the ahash_request * handle. Any potentially existing state created by previous operations is * discarded. 
* * Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ static inline int crypto_ahash_init(struct ahash_request *req) { return crypto_ahash_reqtfm(req)->init(req); } /** * crypto_ahash_update() - add data to message digest for processing * @req: ahash_request handle that was previously initialized with the * crypto_ahash_init call. * * Updates the message digest state of the &ahash_request handle. The input data * is pointed to by the scatter/gather list registered in the &ahash_request * handle * * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ static inline int crypto_ahash_update(struct ahash_request *req) { return crypto_ahash_reqtfm(req)->update(req); } /** * DOC: Asynchronous Hash Request Handle * * The &ahash_request data structure contains all pointers to data * required for the asynchronous cipher operation. This includes the cipher * handle (which can be used by multiple &ahash_request instances), pointer * to plaintext and the message digest output buffer, asynchronous callback * function, etc. It acts as a handle to the ahash_request_* API calls in a * similar way as ahash handle to the crypto_ahash_* API calls. */ /** * ahash_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing ahash handle in the request * data structure with a different one. */ static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { req->base.tfm = crypto_ahash_tfm(tfm); } /** * ahash_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the ahash * message digest API calls. 
During * the allocation, the provided ahash handle * is registered in the request data structure. * * Return: allocated request handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ static inline struct ahash_request *ahash_request_alloc( struct crypto_ahash *tfm, gfp_t gfp) { struct ahash_request *req; req = kmalloc(sizeof(struct ahash_request) + crypto_ahash_reqsize(tfm), gfp); if (likely(req)) ahash_request_set_tfm(req, tfm); return req; } /** * ahash_request_free() - zeroize and free the request data structure * @req: request data structure cipher handle to be freed */ static inline void ahash_request_free(struct ahash_request *req) { kzfree(req); } static inline struct ahash_request *ahash_request_cast( struct crypto_async_request *req) { return container_of(req, struct ahash_request, base); } /** * ahash_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * &crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once * the cipher operation completes. 
* * The callback function is registered with the &ahash_request handle and * must comply with the following template * * void callback_function(struct crypto_async_request *req, int error) */ static inline void ahash_request_set_callback(struct ahash_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * ahash_request_set_crypt() - set data buffers * @req: ahash_request handle to be updated * @src: source scatter/gather list * @result: buffer that is filled with the message digest -- the caller must * ensure that the buffer has sufficient space by, for example, calling * crypto_ahash_digestsize() * @nbytes: number of bytes to process from the source scatter/gather list * * By using this call, the caller references the source scatter/gather list. * The source scatter/gather list points to the data the message digest is to * be calculated for. */ static inline void ahash_request_set_crypt(struct ahash_request *req, struct scatterlist *src, u8 *result, unsigned int nbytes) { req->src = src; req->nbytes = nbytes; req->result = result; } /** * DOC: Synchronous Message Digest API * * The synchronous message digest API is used with the ciphers of type * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto) * * The message digest API is able to maintain state information for the * caller. * * The synchronous message digest API can store user-related context in in its * shash_desc request data structure. */ /** * crypto_alloc_shash() - allocate message digest handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a message digest. The returned &struct * crypto_shash is the cipher handle that is required for any subsequent * API invocation for that message digest. 
* * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) { return &tfm->base; } /** * crypto_free_shash() - zeroize and free the message digest handle * @tfm: cipher handle to be freed */ static inline void crypto_free_shash(struct crypto_shash *tfm) { crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm)); } static inline unsigned int crypto_shash_alignmask( struct crypto_shash *tfm) { return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm)); } /** * crypto_shash_blocksize() - obtain block size for cipher * @tfm: cipher handle * * The block size for the message digest cipher referenced with the cipher * handle is returned. * * Return: block size of cipher */ static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm) { return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm)); } static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg) { return container_of(alg, struct shash_alg, base); } static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm) { 39 return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg); } /** * crypto_shash_digestsize() - obtain message digest size * @tfm: cipher handle * * The size for the message digest created by the message digest cipher * referenced with the cipher handle is returned. 
* * Return: digest size of cipher */ static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm) { 1 return crypto_shash_alg(tfm)->digestsize; } static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->statesize; } static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm) { return crypto_tfm_get_flags(crypto_shash_tfm(tfm)); } static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags); } static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags); } /** * crypto_shash_descsize() - obtain the operational state size * @tfm: cipher handle * * The size of the operational state the cipher needs during operation is * returned for the hash referenced with the cipher handle. This size is * required to calculate the memory requirements to allow the caller allocating * sufficient memory for operational state. * * The operational state is defined with struct shash_desc where the size of * that data structure is to be calculated as * sizeof(struct shash_desc) + crypto_shash_descsize(alg) * * Return: size of the operational state */ static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm) { 39 return tfm->descsize; } static inline void *shash_desc_ctx(struct shash_desc *desc) { 33 return desc->__ctx; } /** * crypto_shash_setkey() - set key for message digest * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the keyed message digest cipher. The * cipher handle must point to a keyed message digest cipher in order for this * function to succeed. 
* * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); /** * crypto_shash_digest() - calculate message digest for buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of crypto_shash_init, * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate three functions. * * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); /** * crypto_shash_export() - extract operational state for message digest * @desc: reference to the operational state handle whose state is exported * @out: output buffer of sufficient size that can hold the hash state * * This function exports the hash state of the operational state handle into the * caller-allocated output buffer out which must have sufficient size (e.g. by * calling crypto_shash_descsize). * * Return: 0 if the export creation was successful; < 0 if an error occurred */ static inline int crypto_shash_export(struct shash_desc *desc, void *out) { 37 return crypto_shash_alg(desc->tfm)->export(desc, out); } /** * crypto_shash_import() - import operational state * @desc: reference to the operational state handle the state imported into * @in: buffer holding the state * * This function imports the hash state into the operational state handle from * the input buffer. That buffer should have been generated with the * crypto_ahash_export function. 
* * Return: 0 if the import was successful; < 0 if an error occurred */ static inline int crypto_shash_import(struct shash_desc *desc, const void *in) { return crypto_shash_alg(desc->tfm)->import(desc, in); } /** * crypto_shash_init() - (re)initialize message digest * @desc: operational state handle that is already filled * * The call (re-)initializes the message digest referenced by the * operational state handle. Any potentially existing state created by * previous operations is discarded. * * Return: 0 if the message digest initialization was successful; < 0 if an * error occurred */ static inline int crypto_shash_init(struct shash_desc *desc) { 37 return crypto_shash_alg(desc->tfm)->init(desc); } /** * crypto_shash_update() - add data to message digest for processing * @desc: operational state handle that is already initialized * @data: input data to be added to the message digest * @len: length of the input data * * Updates the message digest state of the operational state handle. * * Return: 0 if the message digest update was successful; < 0 if an error * occurred */ int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len); /** * crypto_shash_final() - calculate message digest * @desc: operational state handle that is already filled with data * @out: output buffer filled with the message digest * * Finalize the message digest operation and create the message digest * based on all data added to the cipher handle. The message digest is placed * into the output buffer. The caller must ensure that the output buffer is * large enough by using crypto_shash_digestsize. 
* * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_final(struct shash_desc *desc, u8 *out); /** * crypto_shash_finup() - calculate message digest of buffer * @desc: see crypto_shash_final() * @data: see crypto_shash_update() * @len: see crypto_shash_update() * @out: see crypto_shash_final() * * This function is a "short-hand" for the function calls of * crypto_shash_update and crypto_shash_final. The parameters have the same * meaning as discussed for those separate functions. * * Return: 0 if the message digest creation was successful; < 0 if an error * occurred */ int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out); #endif /* _CRYPTO_HASH_H */
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 * The max() keeps the value at least one jiffy when HZ/5 rounds down to 0.
 */
#define MAX_PAUSE		max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising pause time to max_pause when falls below it.
 */
#define DIRTY_POLL_THRESH	(128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL	max(HZ/5, 1)

/*
 * Fixed-point shift used for pos_ratio arithmetic in this file: a ratio of
 * 1.0 is represented as 1 << RATELIMIT_CALC_SHIFT (see pos_ratio_polynom()).
 */
#define RATELIMIT_CALC_SHIFT	10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 *
 * A successful write through dirty_background_ratio_handler() resets
 * dirty_background_bytes to 0.
 */
int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 *
 * A successful write through dirty_ratio_handler() that changes the value
 * resets vm_dirty_bytes to 0.
 */
int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
*/ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; }; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long 
max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; do_div(min, tot_bw); } if (max < 100) { max *= this_bw; do_div(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effictive number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * zone_dirtyable_memory - number of dirtyable pages in a zone * @zone: the zone * * Returns the zone's number of pages potentially available for dirty * page cache. 
This is the base value for the per-zone dirty limits. */ static unsigned long zone_dirtyable_memory(struct zone *zone) { unsigned long nr_pages; 359 nr_pages = zone_page_state(zone, NR_FREE_PAGES); nr_pages -= min(nr_pages, zone->dirty_balance_reserve); nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; for_each_node_state(node, N_HIGH_MEMORY) { struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; x += zone_dirtyable_memory(z); } /* * Unreclaimable memory (kernel memory or anonymous memory * without swap) can bring down the dirtyable pages below * the zone's dirty balance reserve and the above calculation * will underflow. However we still want to add in nodes * which are below threshold (negative values) to get a more * accurate calculation but make sure that the total never * underflows. */ if ((long)x < 0) x = 0; /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Returns the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. 
*/ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_page_state(NR_FREE_PAGES); x -= min(x, dirty_balance_reserve); x += global_page_state(NR_INACTIVE_FILE); x += global_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and * real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { 399 const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ 399 unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; 399 unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. 
*/ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; 399 if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 399 if (bg_thresh >= thresh) bg_thresh = thresh / 2; 399 tsk = current; 399 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { bg_thresh += bg_thresh / 4; thresh += thresh / 4; } 399 dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) 399 trace_global_dirty_state(bg_thresh, thresh); 399 } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { 2 struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * zone_dirty_limit - maximum number of dirty pages allowed in a zone * @zone: the zone * * Returns the maximum number of dirty pages allowed in a zone, based * on the zone's dirtyable memory. 
*/ static unsigned long zone_dirty_limit(struct zone *zone) { 359 unsigned long zone_memory = zone_dirtyable_memory(zone); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * zone_memory / global_dirtyable_memory(); else 359 dirty = vm_dirty_ratio * zone_memory / 100; 359 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) dirty += dirty / 4; return dirty; } /** * zone_dirty_ok - tells whether a zone is within its dirty limits * @zone: the zone to check * * Returns %true when the dirty pages in @zone are within the zone's * dirty limit, %false if the limit is exceeded. */ bool zone_dirty_ok(struct zone *zone) { 359 unsigned long limit = zone_dirty_limit(zone); 359 return zone_page_state(zone, NR_FILE_DIRTY) + zone_page_state(zone, NR_UNSTABLE_NFS) + zone_page_state(zone, NR_WRITEBACK) <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; } int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes 
!= old_bytes) { writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_inc(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac) { 29 __fprop_inc_percpu_max(&dom->completions, completions, max_prop_frac); /* First event after period switching was turned off? */ if (!unlikely(dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } 29 } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; __inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_inc(wb); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. 
*/ static void writeout_period(unsigned long t) { struct wb_domain *dom = (void *)t; int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); init_timer_deferrable(&dom->period_timer); dom->period_timer.function = writeout_period; dom->period_timer.data = (unsigned long)dom; dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { del_timer_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. 
*/ static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { min_ratio -= bdi->min_ratio; if (bdi_min_ratio + min_ratio < 100) { bdi_min_ratio += min_ratio; bdi->min_ratio += min_ratio; } else { ret = -EINVAL; } } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { int ret = 0; if (max_ratio > 100) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); return ret; } EXPORT_SYMBOL(bdi_set_max_ratio); static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { 398 return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } /** * __wb_calc_thresh - @wb's share of dirty throttling threshold * @dtc: dirty_throttle_context of interest * * Returns @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. 
* * Note that balance_dirty_pages() will only seriously take it as a hard limit * when sleeping max_pause per page is not enough to keep the dirty pages under * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; u64 wb_thresh; long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this BDI's share of the thresh ratio. 
*/ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; wb_thresh *= numerator; do_div(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / 100; if (wb_thresh > (thresh * wb_max_ratio) / 100) wb_thresh = thresh * wb_max_ratio / 100; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb), .thresh = thresh }; return __wb_calc_thresh(&gdtc); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. 
*
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0 .............*
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
* - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. 
* * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2, 2 << RATELIMIT_CALC_SHIFT); return; } if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. 
*/ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
*
	 *        wb_thresh                    thresh - wb_thresh
	 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
	 *         thresh                           thresh
	 */
	span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
	x_intercept = wb_setpoint + span;

	if (dtc->wb_dirty < x_intercept - span / 4) {
		pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
				      (x_intercept - wb_setpoint) | 1);
	} else
		pos_ratio /= 4;

	/*
	 * wb reserve area, safeguard against dirty pool underrun and disk idle
	 * It may push the desired control point of global dirty pages higher
	 * than setpoint.
	 */
	x_intercept = wb_thresh / 2;
	if (dtc->wb_dirty < x_intercept) {
		if (dtc->wb_dirty > x_intercept / 8)
			pos_ratio = div_u64(pos_ratio * x_intercept,
					    dtc->wb_dirty);
		else
			pos_ratio *= 8;
	}

	dtc->pos_ratio = pos_ratio;
}

/*
 * Recompute the write bandwidth estimates stored in @wb.
 *
 * @wb:      writeback context; wb->write_bandwidth and
 *           wb->avg_write_bandwidth are read and rewritten
 * @elapsed: length of the sampling interval (scaled against HZ below, so it
 *           is expected to be a jiffies count)
 * @written: current written-pages count; the sample is its distance from
 *           wb->written_stamp
 *
 * When @wb has dirty IO, the change in the averaged bandwidth is also added
 * to wb->bdi->tot_write_bandwidth.
 */
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
				      unsigned long elapsed,
				      unsigned long written)
{
	/* power of two, so the division by period below is a plain shift */
	const unsigned long period = roundup_pow_of_two(3 * HZ);
	unsigned long avg = wb->avg_write_bandwidth;
	unsigned long old = wb->write_bandwidth;
	u64 bw;

	/*
	 * bw = written * HZ / elapsed
	 *
	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
	 * write_bandwidth = ---------------------------------------------------
	 *                                          period
	 *
	 * @written may have decreased due to account_page_redirty().
	 * Avoid underflowing @bw calculation.
	 */
	bw = written - min(written, wb->written_stamp);
	bw *= HZ;
	if (unlikely(elapsed > period)) {
		/*
		 * The sample spans more than one averaging period: the old
		 * estimate gets no weight, use the fresh rate for both values.
		 */
		do_div(bw, elapsed);
		avg = bw;
		goto out;
	}
	bw += (u64)wb->write_bandwidth * (period - elapsed);
	bw >>= ilog2(period);

	/*
	 * one more level of smoothing, for filtering out sudden spikes
	 */
	if (avg > old && old >= (unsigned long)bw)
		avg -= (avg - old) >> 3;

	if (avg < old && old <= (unsigned long)bw)
		avg += (old - avg) >> 3;

out:
	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
	avg = max(avg, 1LU);
	if (wb_has_dirty_io(wb)) {
		long delta = avg - wb->avg_write_bandwidth;
		WARN_ON_ONCE(atomic_long_add_return(delta,
					&wb->bdi->tot_write_bandwidth) <= 0);
	}
	wb->write_bandwidth = bw;
	wb->avg_write_bandwidth = avg;
}

/*
 * Move the domain's dirty_limit towards dtc->thresh: raise it to thresh in a
 * single step, lower it by 1/32 of the remaining gap per call, and leave it
 * alone when it already equals the target.
 */
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
	struct wb_domain *dom = dtc_dom(dtc);
	unsigned long thresh = dtc->thresh;
	unsigned long limit = dom->dirty_limit;

	/*
	 * Follow up in one step.
	 */
	if (limit < thresh) {
		limit = thresh;
		goto update;
	}

	/*
	 * Follow down slowly. Use the higher one as the target, because thresh
	 * may drop below dirty. This is exactly the reason to introduce
	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
	 */
	thresh = max(thresh, dtc->dirty);
	if (limit > thresh) {
		limit -= (limit - thresh) >> 5;
		goto update;
	}
	return;
update:
	dom->dirty_limit = limit;
}

/*
 * Refresh the domain's dirty_limit at most once per BANDWIDTH_INTERVAL.
 * @now is the timestamp compared against, and then stored in,
 * dom->dirty_limit_tstamp.
 */
static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
				    unsigned long now)
{
	struct wb_domain *dom = dtc_dom(dtc);

	/*
	 * check locklessly first to optimize away locking for the most time
	 */
	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
		return;

	spin_lock(&dom->lock);
	/* re-check under the lock: the stamp may have moved since the test above */
	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
		update_dirty_limit(dtc);
		dom->dirty_limit_tstamp = now;
	}
	spin_unlock(&dom->lock);
}

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
* Obviously it should be around (write_bw / N) when there are N dd tasks.
 *
 * @dtc:     throttle control whose wb, thresholds and pos_ratio are read
 * @dirtied: current dirtied-pages count, compared with wb->dirtied_stamp
 * @elapsed: length of the sampling interval, used with HZ to form a rate
 *
 * Stores the results in wb->dirty_ratelimit and
 * wb->balanced_dirty_ratelimit.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
				      unsigned long dirtied,
				      unsigned long elapsed)
{
	struct bdi_writeback *wb = dtc->wb;
	unsigned long dirty = dtc->dirty;
	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	unsigned long setpoint = (freerun + limit) / 2;
	unsigned long write_bw = wb->avg_write_bandwidth;
	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
	unsigned long dirty_rate;
	unsigned long task_ratelimit;
	unsigned long balanced_dirty_ratelimit;
	unsigned long step;
	unsigned long x;
	unsigned long shift;

	/*
	 * The dirty rate will match the writeout rate in long term, except
	 * when dirty pages are truncated by userspace or re-dirtied by FS.
	 */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

	/*
	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
	 */
	task_ratelimit = (u64)dirty_ratelimit *
					dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
	task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

	/*
	 * A linear estimation of the "balanced" throttle rate. The theory is,
	 * if there are N dd tasks, each throttled at task_ratelimit, the wb's
	 * dirty_rate will be measured to be (N * task_ratelimit). So the below
	 * formula will yield the balanced rate limit (write_bw / N).
	 *
	 * Note that the expanded form is not a pure rate feedback:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate)		     (1)
	 * but also takes pos_ratio into account:
	 *	rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
	 *
	 * (1) is not realistic because pos_ratio also takes part in balancing
	 * the dirty rate.  Consider the state
	 *	pos_ratio = 0.5						     (3)
	 *	rate = 2 * (write_bw / N)				     (4)
	 * If (1) is used, it will get stuck in that state! Because each dd
	 * will be throttled at
	 *	task_ratelimit = pos_ratio * rate = (write_bw / N)	     (5)
	 * yielding
	 *	dirty_rate = N * task_ratelimit = write_bw		     (6)
	 * put (6) into (1) we get
	 *	rate_(i+1) = rate_(i)					     (7)
	 *
	 * So we end up using (2) to always keep
	 *	rate_(i+1) ~= (write_bw / N)				     (8)
	 * regardless of the value of pos_ratio. As long as (8) is satisfied,
	 * pos_ratio is able to drive itself to 1.0, which is not only where
	 * the dirty count meets the setpoint, but also where the slope of
	 * pos_ratio is most flat and hence task_ratelimit fluctuates least.
	 */
	/* "| 1" keeps the divisor non-zero when no pages were dirtied */
	balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
					   dirty_rate | 1);
	/*
	 * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
	 */
	if (unlikely(balanced_dirty_ratelimit > write_bw))
		balanced_dirty_ratelimit = write_bw;

	/*
	 * We could safely do this and return immediately:
	 *
	 *	wb->dirty_ratelimit = balanced_dirty_ratelimit;
	 *
	 * However to get a more stable dirty_ratelimit, the below elaborated
	 * code makes use of task_ratelimit to filter out singular points and
	 * limit the step size.
	 *
	 * The below code essentially only uses the relative value of
	 *
	 *	task_ratelimit - dirty_ratelimit
	 *	= (pos_ratio - 1) * dirty_ratelimit
	 *
	 * which reflects the direction and size of dirty position error.
	 */

	/*
	 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
	 * task_ratelimit is on the same side of dirty_ratelimit, too.
	 * For example, when
	 * - dirty_ratelimit > balanced_dirty_ratelimit
	 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
	 * lowering dirty_ratelimit will help meet both the position and rate
	 * control targets. Otherwise, don't update dirty_ratelimit if it will
	 * only help meet the rate target. After all, what the users ultimately
	 * feel and care are stable dirty rate and small position error.
	 *
	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
	 * and filter out the singular points of balanced_dirty_ratelimit,
	 * which keeps jumping around randomly and can even leap far away at
	 * times due to the small 200ms estimation period of dirty_rate (we
	 * want to keep that period small to reduce time lags).
	 */
	step = 0;

	/*
	 * For strictlimit case, calculations above were based on wb counters
	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
	 * Hence, to calculate "step" properly, we have to use wb_dirty as
	 * "dirty" and wb_setpoint as "setpoint".
	 *
	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
	 * it's possible that wb_thresh is close to zero due to inactivity
	 * of backing device.
	 */
	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
		dirty = dtc->wb_dirty;
		if (dtc->wb_dirty < 8)
			setpoint = dtc->wb_dirty + 1;
		else
			setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
	}

	if (dirty < setpoint) {
		/* below setpoint: only ever step the rate limit upwards */
		x = min3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit < x)
			step = x - dirty_ratelimit;
	} else {
		/* at or above setpoint: only ever step it downwards */
		x = max3(wb->balanced_dirty_ratelimit,
			 balanced_dirty_ratelimit, task_ratelimit);
		if (dirty_ratelimit > x)
			step = dirty_ratelimit - x;
	}

	/*
	 * Don't pursue 100% rate matching. It's impossible since the balanced
	 * rate itself is constantly fluctuating. So decrease the track speed
	 * when it gets close to the target. Helps eliminate pointless tremors.
	 */
	shift = dirty_ratelimit / (2 * step + 1);
	/* a shift of BITS_PER_LONG or more would be undefined; treat as zero */
	if (shift < BITS_PER_LONG)
		step = DIV_ROUND_UP(step >> shift, 8);
	else
		step = 0;

	if (dirty_ratelimit < balanced_dirty_ratelimit)
		dirty_ratelimit += step;
	else
		dirty_ratelimit -= step;

	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

	trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}

/*
 * Common bandwidth-update path.  Must be called with wb->list_lock held.
 *
 * @gdtc:             global-domain throttle control; supplies the wb
 * @mdtc:             memcg-domain throttle control, or NULL
 * @start_time:       compared against wb->bw_time_stamp to detect a quiet
 *                    period
 * @update_ratelimit: when true, the dirty limits and dirty ratelimits are
 *                    updated as well as the write bandwidth
 *
 * Does nothing if less than BANDWIDTH_INTERVAL has passed since
 * wb->bw_time_stamp.  Otherwise the dirtied/written/time stamps in the wb
 * are always refreshed, even when the estimates themselves are skipped.
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
				  struct dirty_throttle_control *mdtc,
				  unsigned long start_time,
				  bool update_ratelimit)
{
	struct bdi_writeback *wb = gdtc->wb;
	unsigned long now = jiffies;
	unsigned long elapsed = now - wb->bw_time_stamp;
	unsigned long dirtied;
	unsigned long written;

	lockdep_assert_held(&wb->list_lock);

	/*
	 * rate-limit, only update once every 200ms.
	 */
	if (elapsed < BANDWIDTH_INTERVAL)
		return;

	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

	/*
	 * Skip quiet periods when disk bandwidth is under-utilized.
	 * (at least 1s idle time between two flusher runs)
	 */
	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
		goto snapshot;

	if (update_ratelimit) {
		domain_update_bandwidth(gdtc, now);
		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);

		/*
		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
		 * compiler has no way to figure that out.  Help it.
		 */
		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
			domain_update_bandwidth(mdtc, now);
			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
		}
	}
	wb_update_write_bandwidth(wb, elapsed, written);

snapshot:
	wb->dirtied_stamp = dirtied;
	wb->written_stamp = written;
	wb->bw_time_stamp = now;
}

/*
 * Update only the write bandwidth estimate of @wb: calls
 * __wb_update_bandwidth() with a global-domain control, no memcg control and
 * ratelimit updating disabled.
 */
void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
}

/*
 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
 * will look to see if it needs to start dirty throttling.
* * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ 398 static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { 398 if (thresh > dirty) 398 return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = wb->avg_write_bandwidth; unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(wb->avg_write_bandwidth); long lo = ilog2(wb->dirty_ratelimit); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. 
* * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. 
Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, struct bdi_writeback *wb, unsigned long pages_dirtied) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
&mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { 398 unsigned long now = jiffies; unsigned long dirty, thresh, bg_thresh; unsigned long m_dirty = 0; /* stop bogus uninit warnings */ unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { wb_dirty_limits(gdtc); dirty = gdtc->wb_dirty; thresh = gdtc->wb_thresh; bg_thresh = gdtc->wb_bg_thresh; } else { 398 dirty = gdtc->dirty; thresh = gdtc->thresh; bg_thresh = gdtc->bg_thresh; } if (mdtc) { unsigned long filepages, headroom, writeback; /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc->dirty += writeback; mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); if (unlikely(strictlimit)) { wb_dirty_limits(mdtc); m_dirty = mdtc->wb_dirty; m_thresh = mdtc->wb_thresh; m_bg_thresh = mdtc->wb_bg_thresh; } else { m_dirty = mdtc->dirty; m_thresh = mdtc->thresh; m_bg_thresh = mdtc->bg_thresh; } } /* * Throttle it only when the background writeback cannot * catch-up. 
This avoids (excessively) small writeouts * when the wb limits are ramping up in case of !strictlimit. * * In strictlimit case make decision based on the wb counters * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. * * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ 398 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { 398 unsigned long intv = dirty_poll_interval(dirty, thresh); unsigned long m_intv = ULONG_MAX; 398 current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); break; } if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ if (!strictlimit) wb_dirty_limits(gdtc); dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); wb_position_ratio(gdtc); sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. 
*/ if (!strictlimit) wb_dirty_limits(mdtc); dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) { spin_lock(&wb->list_lock); __wb_update_bandwidth(gdtc, mdtc, start_time, true); spin_unlock(&wb->list_lock); } /* throttle according to the chosen dtc */ dirty_ratelimit = wb->dirty_ratelimit; task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. 
*/ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, sdtc->wb_thresh, sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponding NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the comsumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ if (sdtc->wb_dirty <= wb_stat_error(wb)) break; if (fatal_signal_pending(current)) break; } 398 if (!dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0; 398 if (writeback_in_progress(wb)) return; /* * In laptop mode, we wait until hitting the higher threshold before * starting background writeout, and then write out all the way down * to the lower threshold. So slow writers cause minimal disk activity. 
* * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ 391 if (laptop_mode) return; 391 if (nr_reclaimable > gdtc->bg_thresh) 398 wb_start_background_writeback(wb); } static DEFINE_PER_CPU(int, bdp_ratelimits); /* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * On really big machines, get_writeback_state is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. 
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
	int *p;

	/* nothing to balance if this bdi does not account dirty pages */
	if (!bdi_cap_account_dirty(bdi))
		return;

	/* use the current task's cgroup wb when enabled, else the bdi's own wb */
	if (inode_cgwb_enabled(inode))
		wb = wb_get_create_current(bdi, GFP_KERNEL);
	if (!wb)
		wb = &bdi->wb;

	ratelimit = current->nr_dirtied_pause;
	/* NOTE(review): 32 >> (PAGE_SHIFT - 10) looks like "32KB in pages" - confirm */
	if (wb->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	/* the per-cpu counters below must be accessed on a single CPU */
	preempt_disable();
	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */
	p = this_cpu_ptr(&bdp_ratelimits);
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		/* ratelimit == 0 makes the final check below always fire */
		ratelimit = 0;
	}
	/*
	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelock other long-run dirtiers.
	 */
	p = this_cpu_ptr(&dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;

		/* take no more than what brings this task up to ratelimit */
		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, wb, current->nr_dirtied);

	wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  Returns %true if writeback should continue.
*/ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; /* * Similar to balance_dirty_pages() but ignores pages being written * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); gdtc->dirty = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) return true; if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { unsigned long filepages, headroom, writeback; mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, &writeback); mdtc_calc_avail(mdtc, filepages, headroom); domain_dirty_limits(mdtc); /* ditto, ignore writeback */ if (mdtc->dirty > mdtc->bg_thresh) return true; if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } return false; } void throttle_vm_writeout(gfp_t gfp_mask) { unsigned long background_thresh; unsigned long dirty_thresh; for ( ; ; ) { 2 global_dirty_limits(&background_thresh, &dirty_thresh); dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); /* * Boost the allowable dirty threshold a bit for page * allocators so they don't get DoS'ed by heavy writers */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ 2 if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion * or progress in the filesystem. So we cannot just sit here * waiting for IO to complete. 
*/ if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) break; } } /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); return 0; } #ifdef CONFIG_BLOCK void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; int nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); struct bdi_writeback *wb; /* * We want to write everything out, not just down to the dirty * threshold */ if (!bdi_has_dirty_io(&q->backing_dev_info)) return; rcu_read_lock(); list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) if (wb_has_dirty_io(wb)) wb_start_writeback(wb, nr_pages, true, WB_REASON_LAPTOP_TIMER); rcu_read_unlock(); } /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ void laptop_io_completion(struct backing_dev_info *info) { mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) del_timer(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } #endif /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * If it is too low then SMP machines will call the (expensive) * get_writeback_state too often. 
* * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. */ void writeback_set_ratelimit(void) { struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } static int ratelimit_handler(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DEAD: writeback_set_ratelimit(); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers (by * comparing nr_free_buffer_pages() to vm_total_pages. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory (by subtracting * totalhigh_pages from vm_total_pages), and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. 
*/ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } /** * tag_pages_for_writeback - tag pages to be written by write_cache_pages * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is * that write_cache_pages (or whoever calls this function) will then use * TOWRITE tag to identify pages eligible for writeback. This mechanism is * used to avoid livelocking of writeback by a process steadily creating new * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ /* * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. */ void tag_pages_for_writeback(struct address_space *mapping, 403 pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 unsigned long tagged; do { 403 spin_lock_irq(&mapping->tree_lock); tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, &start, end, WRITEBACK_TAG_BATCH, PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); 403 cond_resched(); /* We check 'start' to handle wrapping when end == ~0UL */ 1 } while (tagged >= WRITEBACK_TAG_BATCH && start); 403 } EXPORT_SYMBOL(tag_pages_for_writeback); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @writepage: function called for each page * @data: data passed to writepage function * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. 
This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. * * To avoid livelocks (when other process dirties new pages), we first tag * pages which should be written back with TOWRITE tag and only then start * writing them. For data-integrity sync we have to be careful so that we do * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). * * To avoid deadlocks between range_cyclic writeback and callers that hold * pages in PageWriteback to aggregate IO until write_cache_pages() returns, * we do not loop back to the start of the file. Doing so causes a page * lock/page writeback access order inversion - we should only ever lock * multiple pages in ascending page->index order, and looping back to the start * of the file violates that rule and causes deadlocks. 
*/ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { int ret = 0; int done = 0; int error; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); 130 pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; 130 end = -1; } else { 123 index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; 130 } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; 130 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 130 tag_pages_for_writeback(mapping, index, end); done_index = index; 130 while (!done && (index <= end)) { int i; 130 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; 122 for (i = 0; i < nr_pages; i++) { 122 struct page *page = pvec.pages[i]; /* * At this point, the page may be truncated or 122 * invalidated (changing page->mapping to NULL), or * even swizzled back from swapper_space to tmpfs file * mapping. However, page->index will not change * because we have a reference on the page. */ if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ 122 done = 1; break; 11 } done_index = page->index; 122 lock_page(page); /* * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page 122 * has disappeared concurrently, so there could be no 2 * real expectation of this data interity operation 2 * even if there is now a new, dirty page at the same * pagecache address. 
*/ if (unlikely(page->mapping != mapping)) { continue_unlock: 122 unlock_page(page); 122 continue; } 122 if (!PageDirty(page)) { 122 /* someone wrote it for us */ goto continue_unlock; } if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); else goto continue_unlock; } BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); error = (*writepage)(page, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of * writeback. There's no need to continue for * background writeback. Just push done_index * past this page so media errors won't choke * writeout for the entire file. For integrity * writeback, we must process the entire dirty * set regardless of errors because the fs may * still have state to clear for each page. In * that case we continue processing and return * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); error = 0; 122 } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; done_index = page->index + 1; done = 1; break; } 122 if (!ret) 122 ret = error; } 130 /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; 130 } 79 } pagevec_release(&pvec); 130 cond_resched(); } /* * If we hit the last page and there is more work to be done: wrap * back the index back to the start of the file for the next * time we are called. 
*/ if (wbc->range_cyclic && !done) done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; 122 return ret; 122 } EXPORT_SYMBOL(write_cache_pages); /* * Function used by generic_writepages to call the real writepage * function and set the mapping flags on error */ static int __writepage(struct page *page, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; 130 int ret = mapping->a_ops->writepage(page, wbc); mapping_set_error(mapping, ret); return ret; } /** * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. 130 * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * 130 * This is a library function, which implements the writepages() * address_space_operation. */ 130 int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct blk_plug plug; int ret; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; 711 blk_start_plug(&plug); 697 ret = write_cache_pages(mapping, wbc, __writepage, mapping); 711 blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_writepages); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (wbc->nr_to_write <= 0) return 0; if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); return ret; } /** * write_one_page - write out a single page and optionally wait on I/O * @page: the page to write * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * * write_one_page() returns a negative error code if I/O failed. 
*/ int write_one_page(struct page *page, int wait) { struct address_space *mapping = page->mapping; int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, }; BUG_ON(!PageLocked(page)); if (wait) wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { page_cache_get(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } 455 page_cache_release(page); 429 } else { unlock_page(page); } return ret; } EXPORT_SYMBOL(write_one_page); /* * For address_spaces which do not use buffers nor write back. */ int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) return !TestSetPageDirty(page); 695 return 0; } 695 /* 695 * Helper function for set_page_dirty family. * * Caller must hold mem_cgroup_begin_page_stat(). * 695 * NOTE: This relies on being atomic wrt interrupts. */ void account_page_dirtied(struct page *page, struct address_space *mapping, 695 struct mem_cgroup *memcg) { struct inode *inode = mapping->host; trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { struct bdi_writeback *wb; 695 inode_attach_wb(inode, page); wb = inode_to_wb(inode); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); 83 __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; 83 this_cpu_inc(bdp_ratelimits); } 83 } 83 EXPORT_SYMBOL(account_page_dirtied); 83 /* 83 * Helper function for deaccounting dirty page without writeback. * * Caller must hold mem_cgroup_begin_page_stat(). 
*/ void account_page_cleaned(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); } } 2 /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * 2 * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will simply * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and * the pte lock held, which also locks out truncation. */ int __set_page_dirty_nobuffers(struct page *page) 2 { struct mem_cgroup *memcg; 2 2 memcg = mem_cgroup_begin_page_stat(page); 2 if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { mem_cgroup_end_page_stat(memcg); return 1; 2 } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2 account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_end_page_stat(memcg); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } mem_cgroup_end_page_stat(memcg); 2 return 0; } 2 EXPORT_SYMBOL(__set_page_dirty_nobuffers); 2 /* * Call this whenever redirtying a page, to de-account the dirty counters * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written 2 * counters (NR_WRITTEN, BDI_WRITTEN) in long term. 
The mismatches will lead to 2 * systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. 2 */ void account_page_redirty(struct page *page) { 2 struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); unlocked_inode_to_wb_end(inode, &cookie); 2 } } EXPORT_SYMBOL(account_page_redirty); /* * When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via * redirty_page_for_writepage() and it should then unlock the page and return 0 */ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { int ret; wbc->pages_skipped++; ret = __set_page_dirty_nobuffers(page); account_page_redirty(page); return ret; } 16 EXPORT_SYMBOL(redirty_page_for_writepage); 567 /* * Dirty a page. * 550 * For pages with a mapping this should be done under the page lock * for the benefit of asynchronous memory errors who prefer a consistent * dirty state. This rule can be broken in some special cases, * but should be better not to. * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 550 /* * readahead/lru_deactivate_page could remain * PG_readahead/PG_reclaim due to race with end_page_writeback 567 * About readahead, if the page is written, the flags would be * reset. So no problem. 17 * About lru_deactivate_page, if the page is redirty, the flag 16 * will be reset. So no problem. 
but if the page is used by readahead * it will confuse readahead and make it restart the size rampup * process. But it's a trivial problem. */ if (PageReclaim(page)) ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; #endif return (*spd)(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) return 1; } return 0; } EXPORT_SYMBOL(set_page_dirty); /* 16 * set_page_dirty() is racy if the caller has no reference against 16 * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. * * Usually, the page _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the page should be locked before running set_page_dirty(). */ int set_page_dirty_lock(struct page *page) { int ret; lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; } EXPORT_SYMBOL(set_page_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT 559 * actually remove dirty bits on any mmap's that may be around. It also * leaves the page tagged dirty, so any sync activity will still find it on 559 * the dirty lists, and in particular, clear_page_dirty_for_io() will still 23 * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is 518 * horribly ugly"? Thought you could. 
*/ 518 void cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; 59 struct bdi_writeback *wb; struct mem_cgroup *memcg; 559 struct wb_lock_cookie cookie = {}; memcg = mem_cgroup_begin_page_stat(page); wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, memcg, wb); unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); } else { ClearPageDirty(page); } } EXPORT_SYMBOL(cancel_dirty_page); /* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. 404 * * This is for preparing to put the page under writeout. We leave the page * tagged as dirty in the radix tree so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), 404 * at which stage we bring the page's dirty flag and radix-tree dirty tag 404 * back into sync. * * This incoherency between the page's dirty flag and radix-tree tag is * unfortunate, but it only exists while the page is locked. */ int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); int ret = 0; BUG_ON(!PageLocked(page)); if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct mem_cgroup *memcg; struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to * mark the whole page dirty if it was * dirty in a pagetable. Only to then * (c) clean the page again and return 1 to * cause the writeback. 
* * This way we avoid all nasty races with the * dirty bit in multiple places and clearing 81 * them concurrently from different threads. * * Note! Normally the "set_page_dirty(page)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * * We basically use the page "master dirty bit" * as a serialization point for all the different 404 * threads doing their things. 404 */ if (page_mkclean(page)) 404 set_page_dirty(page); 404 /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the page dirty * at this point. We do this by having them hold the * page lock while dirtying the page, and pages are * always locked coming in here, so we get the desired * exclusion. 404 */ memcg = mem_cgroup_begin_page_stat(page); wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); 29 dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); return ret; 29 } 29 return TestClearPageDirty(page); } EXPORT_SYMBOL(clear_page_dirty_for_io); 29 int test_clear_page_writeback(struct page *page) { 29 struct address_space *mapping = page_mapping(page); struct mem_cgroup *memcg; int ret; 29 memcg = mem_cgroup_begin_page_stat(page); if (mapping) { 29 struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); 29 if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); __dec_wb_stat(wb, WB_WRITEBACK); 29 __wb_writeout_inc(wb); } } spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { 404 ret = TestClearPageWriteback(page); } 
if (ret) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); 404 } 404 mem_cgroup_end_page_stat(memcg); return ret; } 404 404 int __test_set_page_writeback(struct page *page, bool keep_write) { 404 struct address_space *mapping = page_mapping(page); struct mem_cgroup *memcg; int ret; 404 memcg = mem_cgroup_begin_page_stat(page); if (mapping) { 404 struct inode *inode = mapping->host; 404 struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; 404 spin_lock_irqsave(&mapping->tree_lock, flags); 404 ret = TestSetPageWriteback(page); if (!ret) { radix_tree_tag_set(&mapping->page_tree, 404 page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); } if (!PageDirty(page)) 404 radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); 404 if (!keep_write) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } if (!ret) { mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); 603 } mem_cgroup_end_page_stat(memcg); return ret; } EXPORT_SYMBOL(__test_set_page_writeback); /* * Return true if any of the pages in the mapping are marked with the * passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { return radix_tree_tagged(&mapping->page_tree, tag); 557 } EXPORT_SYMBOL(mapping_tagged); 557 /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. * * This function determines if the given page is related to a backing device * that requires page contents to be held stable during writeback. If so, then * it will wait for any pending writeback to complete. 
*/
void wait_for_stable_page(struct page *page)
{
	/* Backing device of the inode that owns this page's mapping. */
	struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);

	if (!bdi_cap_stable_pages_required(bdi))
		return;

	wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
#ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. 
*/

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/kernel.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <asm/barrier.h>


/*
 * Must define these before including other files, inline functions need them
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

/*
 * Assembler text that switches to subsection 1, emits @extra, and defines
 * the LOCK_SECTION_NAME label there if it has not been defined yet.
 */
#define LOCK_SECTION_START(extra) \
	".subsection 1\n\t" \
	extra \
	".ifndef " LOCK_SECTION_NAME "\n\t" \
	LOCK_SECTION_NAME ":\n\t" \
	".endif\n"

/* Assembler text that returns to the previous section. */
#define LOCK_SECTION_END \
	".previous\n\t"

/* Places the annotated function in the ".spinlock.text" section. */
#define __lockfunc __attribute__((section(".spinlock.text")))

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

/*
 * raw_spin_lock_init(): with CONFIG_DEBUG_SPINLOCK each use declares its own
 * static lock_class_key and passes the stringified lock expression as the
 * name; otherwise the lock is simply assigned its unlocked initializer.
 */
#ifdef CONFIG_DEBUG_SPINLOCK
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
				   struct lock_class_key *key);
# define raw_spin_lock_init(lock) \
do { \
	static struct lock_class_key __key; \
	\
	__raw_spin_lock_init((lock), #lock, &__key); \
} while (0)

#else
# define raw_spin_lock_init(lock) \
	do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

#define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock)

/*
 * raw_spin_is_contended(): reads the break_lock field under
 * CONFIG_GENERIC_LOCKBREAK, otherwise defers to the arch hook when the
 * architecture defines one, and otherwise evaluates @lock and yields 0.
 */
#ifdef CONFIG_GENERIC_LOCKBREAK
#define raw_spin_is_contended(lock) ((lock)->break_lock)
#else

#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock) (((void)(lock), 0))
#endif /*arch_spin_is_contended*/
#endif

/*
 * Despite its name it doesn't necessarily has to be a full barrier.
 * It should only guarantee that a STORE before the critical section
 * can not be reordered with LOADs and STOREs inside this section.
 * spin_lock() is the one-way barrier, this LOAD can not escape out
 * of the region.
So the default implementation simply ensures that
 * a STORE can not move into the critical section, smp_wmb() should
 * serialize it with another STORE done by spin_lock().
 */
#ifndef smp_mb__before_spinlock
#define smp_mb__before_spinlock() smp_wmb()
#endif

/**
 * raw_spin_unlock_wait - wait until the spinlock gets unlocked
 * @lock: the spinlock in question.
 */
#define raw_spin_unlock_wait(lock) arch_spin_unlock_wait(&(lock)->raw_lock)

/*
 * do_raw_spin_*(): with CONFIG_DEBUG_SPINLOCK these are out-of-line functions
 * declared here (the _flags variant ignores @flags and maps to
 * do_raw_spin_lock()); otherwise they are thin inline wrappers around the
 * arch_spin_*() operations on the embedded raw_lock.
 */
#ifdef CONFIG_DEBUG_SPINLOCK
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
	/* NOTE(review): __acquire() is presumably a static-checker annotation - confirm */
	__acquire(lock);
	arch_spin_lock(&lock->raw_lock);
}

static inline void
do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
{
	__acquire(lock);
	/* The arch hook receives the flags value, not the pointer. */
	arch_spin_lock_flags(&lock->raw_lock, *flags);
}

static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
	return arch_spin_trylock(&(lock)->raw_lock);
}

static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
	arch_spin_unlock(&lock->raw_lock);
	__release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The
 * various methods are defined as nops in the case they are not
 * required.
*/ #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_bh_nested(lock, subclass) \ _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. */ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) # define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define 
raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) #define raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) #define raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) /** * raw_spin_can_lock - would raw_spin_trylock() succeed? * @lock: the spinlock in question. */ #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) /* Include rwlock functions */ #include <linux/rwlock.h> /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { 5904 return &lock->rlock; } #define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ raw_spin_lock_init(&(_lock)->rlock); \ } while (0) static __always_inline void spin_lock(spinlock_t *lock) { 6409 raw_spin_lock(&lock->rlock); 368 } static __always_inline void spin_lock_bh(spinlock_t *lock) { 1870 raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) { 851 return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) #define spin_lock_bh_nested(lock, subclass) \ do { \ raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) static __always_inline void 
spin_lock_irq(spinlock_t *lock) { 1852 raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) { 7019 raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) { 4094 raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) { 1500 raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { 5479 raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) { 13 return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } #define spin_trylock_irqsave(lock, flags) \ ({ \ raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) static __always_inline void spin_unlock_wait(spinlock_t *lock) { raw_spin_unlock_wait(&lock->rlock); } static __always_inline int spin_is_locked(spinlock_t *lock) { 37 return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { 150 return raw_spin_is_contended(&lock->rlock); } static __always_inline int spin_can_lock(spinlock_t *lock) { return raw_spin_can_lock(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. 
*/
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
/* Wraps _atomic_dec_and_lock() in __cond_lock(), naming @lock as the lock. */
#define atomic_dec_and_lock(atomic, lock) \
		__cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

#endif /* __LINUX_SPINLOCK_H */
#ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ #include <stdarg.h> #include <linux/init.h> #include <linux/kern_levels.h> #include <linux/linkage.h> #include <linux/cache.h> extern const char linux_banner[]; extern const char linux_proc_banner[]; static inline int printk_get_level(const char *buffer) { 668 if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { 667 switch (buffer[1]) { case '0' ... '7': case 'd': /* KERN_DEFAULT */ return buffer[1]; } } return 0; } static inline const char *printk_skip_level(const char *buffer) { 667 if (printk_get_level(buffer)) return buffer + 2; return buffer; } #define CONSOLE_EXT_LOG_MAX 8192 /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ #define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ extern int console_printk[]; #define console_loglevel (console_printk[0]) #define default_message_loglevel (console_printk[1]) #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) static inline void console_silent(void) { console_loglevel = CONSOLE_LOGLEVEL_SILENT; } static inline void console_verbose(void) { if (console_loglevel) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } struct va_format { const char *fmt; va_list *va; }; /* * FW_BUG * Add this to a message where you are sure the firmware is buggy or behaves * really stupid or out of spec. 
Be aware that the responsible BIOS developer * should be able to fix this issue or at least get a concrete idea of the * problem by reading your message without the need of looking at the kernel * code. * * Use it for definite and high priority BIOS bugs. * * FW_WARN * Use it for not that clear (e.g. could the kernel messed up things already?) * and medium priority BIOS bugs. * * FW_INFO * Use this one if you want to tell the user or vendor about something * suspicious, but generally harmless related to the firmware. * * Use it for information or very low priority BIOS bugs. */ #define FW_BUG "[Firmware Bug]: " #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " /* * HW_ERR * Add this to a message for hardware errors, so that user can report * it to hardware vendor instead of LKML or software vendor. */ #define HW_ERR "[Hardware Error]: " /* * DEPRECATED * Add this to a message whenever you want to warn user space about the use * of a deprecated aspect of an API so they can stop using it */ #define DEPRECATED "[Deprecated]: " /* * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format and side-effect checking. */ static inline __printf(1, 2) int no_printk(const char *fmt, ...) { 6 return 0; } #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) 
{ } #endif typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args); asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(5, 6) __cold int printk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, ...); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ __printf(1, 2) __cold int printk_deferred(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use * printk_ratelimited() or plain old __ratelimit(). */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); void log_buf_kexec_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } static inline __printf(1, 2) __cold int printk(const char *s, ...) { return 0; } static inline __printf(1, 2) __cold int printk_deferred(const char *s, ...) 
{ return 0; } static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec) { return false; } static inline void wake_up_klogd(void) { } static inline char *log_buf_addr_get(void) { return NULL; } static inline u32 log_buf_len_get(void) { return 0; } static inline void log_buf_kexec_setup(void) { } static inline void setup_log_buf(int early) { } static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...) { } static inline void dump_stack_print_info(const char *log_lvl) { } static inline void show_regs_print_info(const char *log_lvl) { } #endif extern asmlinkage void dump_stack(void) __cold; #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif /* * These can be used to print at the various log levels. * All of these will print unconditionally, although note that pr_debug() * and other debug macros are compiled out unless either DEBUG is defined * or CONFIG_DYNAMIC_DEBUG is set. */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warning(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn pr_warning #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* * Like KERN_CONT, pr_cont() should only be used when continuing * a line with no newline ('\n') enclosed. Otherwise it defaults * back to KERN_DEFAULT. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) /* pr_devel() should produce zero code unless DEBUG is defined */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel(fmt, ...) 
\ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #include <linux/dynamic_debug.h> /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) /* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ #define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * Print a one-time message (analogous to WARN_ONCE() et al): */ #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ ({ \ static bool __print_once __read_mostly; \ \ if (!__print_once) { \ __print_once = true; \ printk(fmt, ##__VA_ARGS__); \ } \ }) #define printk_deferred_once(fmt, ...) \ ({ \ static bool __print_once __read_mostly; \ \ if (!__print_once) { \ __print_once = true; \ printk_deferred(fmt, ##__VA_ARGS__); \ } \ }) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_once(fmt, ...) \ printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_once(fmt, ...) \ printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_once(fmt, ...) \ printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) #define pr_cont_once(fmt, ...) \ printk_once(KERN_CONT pr_fmt(fmt), ##__VA_ARGS__) #if defined(DEBUG) #define pr_devel_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_once(fmt, ...) 
\ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) #define pr_debug_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case */ #ifdef CONFIG_PRINTK #define printk_ratelimited(fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ printk(fmt, ##__VA_ARGS__); \ }) #else #define printk_ratelimited(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_ratelimited(fmt, ...) \ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_ratelimited(fmt, ...) \ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_ratelimited(fmt, ...) \ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_ratelimited(fmt, ...) \ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_ratelimited(fmt, ...) \ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_ratelimited, don't do that... */ #if defined(DEBUG) #define pr_devel_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) 
\ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \ if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \ __ratelimit(&_rs)) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif extern const struct file_operations kmsg_fops; enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); #if defined(CONFIG_DYNAMIC_DEBUG) #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true) #else extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len); #endif /* defined(CONFIG_DYNAMIC_DEBUG) */ #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len) { } #endif #if defined(CONFIG_DYNAMIC_DEBUG) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #elif defined(DEBUG) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) 
#else
/*
 * Fallback used when neither CONFIG_DYNAMIC_DEBUG nor DEBUG is defined:
 * print_hex_dump_debug() is an empty inline function, so callers still
 * compile and get their arguments type-checked, but nothing is printed.
 */
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
					int rowsize, int groupsize,
					const void *buf, size_t len, bool ascii)
{
}
#endif

#endif
/* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/module.h> /* search_exception_table */ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> /* * Page fault error code bits: * * bit 0 == 0: no page found 1: protection fault * bit 1 == 0: read access 1: write access * bit 2 == 0: kernel-mode access 1: user-mode access * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch */ enum x86_pf_error_code { PF_PROT = 1 << 0, PF_WRITE = 1 << 1, PF_USER = 1 << 2, PF_RSVD = 1 << 3, PF_INSTR = 1 << 4, }; /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } static nokprobe_inline int kprobes_fault(struct pt_regs *regs) { int ret = 0; /* kprobe_running() needs smp_processor_id() */ if (kprobes_built_in() && !user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; preempt_enable(); 
} return ret; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In AMD64 long mode 0x40..0x4F are valid REX prefixes * Need to figure out under what instruction mode the * instruction was issued. Could check the LDT for lm, * but for now it's good enough to assume that long * mode only uses well known segments or kernel. */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ 95 return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. 
*/ 5 return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ 95 if (probe_kernel_address(instr, opcode)) return 0; 95 *prefetch = (instr_lo == 0xF) && 95 (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ 3 if (error_code & PF_INSTR) return 0; 100 instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; 100 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) return 0; 100 while (instr < max_instr) { unsigned char opcode; 100 if (probe_kernel_address(instr, opcode)) break; 100 instr++; 100 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } return prefetch; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, struct task_struct *tsk, int fault) { unsigned lsb = 0; siginfo_t info; 102 info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; 102 info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_pud. 
*/ pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } void vmalloc_sync_all(void) { unsigned long address; if (SHARED_KERNEL_PMD) return; for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), address); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } /* * 32-bit: * * Handle a fault on the vmalloc or module mapping area */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_large(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); /* * Did it hit the DOS screen memory VA from vm86 mode? 
*/ static inline void check_v8086_mode(struct pt_regs *regs, unsigned long address, struct task_struct *tsk) { #ifdef CONFIG_VM86 unsigned long bit; if (!v8086_mode(regs) || !tsk->thread.vm86) return; bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) tsk->thread.vm86->screen_bitmap |= 1 << bit; #endif } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif pmd = pmd_offset(pud_offset(pgd, address), address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: printk("\n"); } #else /* CONFIG_X86_64: */ void vmalloc_sync_all(void) { sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); } /* * 64-bit: * * Handle a fault on the vmalloc area */ static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; 1 /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. 
In the later * case just flush: */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); } else { BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } /* * Below here mismatches are bugs because these lower tables * are shared: */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) BUG(); if (pud_large(*pud)) return 0; pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) 1 return -1; if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) BUG(); if (pmd_large(*pmd)) return 0; pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; pte = pte_offset_kernel(pmd, address); /* * Don't use pte_page here, because the mappings can point * outside mem_map, and the NUMA hash lookup cannot handle * that: */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); return 0; } NOKPROBE_SYMBOL(vmalloc_fault); #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif /* * No vm86 mode in 64-bit mode: */ static inline void check_v8086_mode(struct pt_regs *regs, unsigned long address, struct task_struct *tsk) { } static int bad_address(void *p) { unsigned long dummy; return probe_kernel_address((unsigned long *)p, dummy); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; printk("PGD %lx ", pgd_val(*pgd)); if 
(!pgd_present(*pgd)) goto out; pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; printk("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; printk("PTE %lx", pte_val(*pte)); out: printk("\n"); return; bad: printk("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. 
*/ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } 100 static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; /* * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_has_bug(X86_BUG_F00F)) { nr = (address - idt_descr.address) >> 3; if (nr == 6) { do_invalid_op(regs, 0); return 1; } } #endif return 0; } static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; static const char smep_warning[] = KERN_CRIT "unable to execute userspace code (SMEP?) (uid: %d)\n"; static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & PF_INSTR) { unsigned int level; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); pte = lookup_address_in_pgd(pgd, address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip); dump_pagetable(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); tsk->thread.cr2 = address; 
tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static noinline void no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; unsigned long flags; int sig; /* Are we prepared to handle this kernel fault? */ 821 if (fixup_exception(regs)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from * task context. */ if (in_interrupt()) return; /* * Per the above we're !in_interrupt(), aka. task context. 821 * * In this case we need to make sure we're not recursively * faulting through the emulate_vsyscall() logic. */ if (current_thread_info()->sig_on_uaccess_error && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; 821 /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, tsk, 0); } /* * Barring that, we can do the fixup and be happy. */ return; } /* * 32-bit: * * Valid to do another page fault here, because if this fault * had been triggered by is_prefetch fixup_exception would have * handled it. * * 64-bit: * * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; if (is_errata93(regs, address)) return; /* * Oops. The kernel tried to access some bad page. 
We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); printk(KERN_CONT "\n"); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* 872 * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: 100 */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; 97 #ifdef CONFIG_X86_64 /* 100 * Instruction fetch faults in the vsyscall page might need * emulation. 
*/ if (unlikely((error_code & PF_INSTR) && ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; } 100 #endif /* Kernel addresses are always protection faults: */ if (address >= TASK_SIZE) error_code |= PF_PROT; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); 100 1 tsk->thread.cr2 = address; tsk->thread.error_code = error_code; 100 tsk->thread.trap_nr = X86_TRAP_PF; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); 100 return; } if (is_f00f_bug(regs, address)) return; 872 no_context(regs, error_code, address, SIGSEGV, si_code); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 788 unsigned long address) 97 { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, 101 unsigned long address, int si_code) { struct mm_struct *mm = current->mm; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. 793 */ up_read(&mm->mmap_sem); __bad_area_nosemaphore(regs, error_code, address, si_code); } static noinline void bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, 637 unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 159 unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; /* Kernel mode? 
Handle exceptions or die: */ if (!(error_code & PF_USER)) { 45 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } /* User-space => ok to do another page fault: */ 43 if (is_prefetch(regs, error_code, address)) return; tsk->thread.cr2 = address; tsk->thread.error_code = error_code; 3 tsk->thread.trap_nr = X86_TRAP_PF; #ifdef CONFIG_MEMORY_FAILURE 3 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); return; } 45 if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); 45 return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); 45 else BUG(); 45 } } 45 static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). 
Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (PF_WRITE | PF_PROT) && error_code != (PF_INSTR | PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); 1 if (!pgd_present(*pgd)) 1 return 0; 1 pud = pud_offset(pgd, address); if (!pud_present(*pud)) return 0; if (pud_large(*pud)) return spurious_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. 
* If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; 3067 } 2064 /* read, present: */ if (unlikely(error_code & PF_PROT)) return 1; /* read, not present: */ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) 1739 return 1; return 0; } 1738 static int fault_in_kernel_space(unsigned long address) { return address >= TASK_SIZE_MAX; } static inline bool smap_violation(int error_code, struct pt_regs *regs) { if (!IS_ENABLED(CONFIG_X86_SMAP)) return false; if (!static_cpu_has(X86_FEATURE_SMAP)) return false; if (error_code & PF_USER) return false; if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) return false; return true; } /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * * This function must have noinline because both callers * {,trace_}do_page_fault() have notrace on. Having this an actual function * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; /* * Detect and handle instructions that would cause a page fault for 3363 * both a tracked kernel page and a userspace page. */ if (kmemcheck_active(regs)) kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) return; /* * We fault-in kernel-space virtual memory on-demand. 
The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * This verifies that the fault happens in kernel space * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { if (vmalloc_fault(address) >= 0) return; if (kmemcheck_fault(regs, address, error_code)) 1 return; 1 } /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (kprobes_fault(regs)) 1 return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobes_fault(regs))) return; if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); if (unlikely(smap_violation(error_code, regs))) { bad_area_nosemaphore(regs, error_code, address); 3362 return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } 3362 /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. 
* * User-mode registers count as a user access even for any * potential system fault or CPU buglet: */ if (user_mode(regs)) { local_irq_enable(); error_code |= PF_USER; flags |= FAULT_FLAG_USER; } else { 3351 if (regs->flags & X86_EFLAGS_IF) 831 local_irq_enable(); } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 2820 2820 if (error_code & PF_WRITE) flags |= FAULT_FLAG_WRITE; 3351 /* * When running in the kernel we expect faults to occur only to 3351 * addresses in user space. All other faults represent errors in 2211 * the kernel and should generate an OOPS. Unfortunately, in the * case of an erroneous fault occurring in a code path which already * holds mmap_sem we will deadlock attempting to validate the fault * against the address space. Luckily the kernel only validly * references user space from well defined areas of code, which are * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform * the source reference check when there is a possibility of a * deadlock. Attempt to lock the address space, if we cannot we then * validate the source. If this is invalid we can skip the address * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && !search_exception_tables(regs->ip)) { bad_area_nosemaphore(regs, error_code, address); return; 3351 } 146 retry: 140 down_read(&mm->mmap_sem); 101 } else { /* * The above down_read_trylock() might have succeeded in * which case we'll have missed the might_sleep() from 164 * down_read(): */ might_sleep(); } vma = find_vma(mm, address); if (unlikely(!vma)) { 3347 bad_area(regs, error_code, address); return; } 3351 if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { bad_area(regs, error_code, address); return; 3343 } if (error_code & PF_USER) { 630 /* * Accessing the stack below %sp is always a bug. 
* The large cushion allows instructions like enter * and pusha to work. ("enter $65535, $31" pushes 8 * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { bad_area(regs, error_code, address); return; } } 1 if (unlikely(expand_stack(vma, address))) { bad_area(regs, error_code, address); return; } 7 /* 637 * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } 3067 /* 159 * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); major |= fault & VM_FAULT_MAJOR; /* * If we need to retry the mmap_sem has already been released, 2962 * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { /* Retry at most once */ if (flags & FAULT_FLAG_ALLOW_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; if (!fatal_signal_pending(tsk)) goto retry; 20 } 20 /* User mode? Just return to handle the fatal exception */ if (flags & FAULT_FLAG_USER) return; /* Not returning to user mode? Handle exceptions or die: */ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } 2962 /* 45 * Major/minor page fault accounting. If any of the events * returned VM_FAULT_MAJOR, we account it as a major fault. 
*/ if (major) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; 2955 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); 47 } 3363 check_v8086_mode(regs, address, tsk); 2950 } NOKPROBE_SYMBOL(__do_page_fault); dotraplinkage void notrace do_page_fault(struct pt_regs *regs, unsigned long error_code) { unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; /* * We must have this function tagged with __kprobes, notrace and call 3363 * read_cr2() before calling anything else. To avoid calling any kind * of tracing machinery before we've observed the CR2 value. * * exception_{enter,exit}() contain all sorts of tracepoints. */ prev_state = exception_enter(); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } NOKPROBE_SYMBOL(do_page_fault); #ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) { if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } dotraplinkage void notrace trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) { /* * The exception_enter and tracepoint processing could * trigger another page faults (user space callchain * reading) and destroy the original cr2 value, so read * the faulting address now. */ unsigned long address = read_cr2(); enum ctx_state prev_state; prev_state = exception_enter(); trace_page_fault_entries(address, regs, error_code); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } NOKPROBE_SYMBOL(trace_do_page_fault); #endif /* CONFIG_TRACING */
/* * Generic PPP layer for Linux. * * Copyright 1999-2002 Paul Mackerras. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * The generic PPP layer handles the PPP network interfaces, the * /dev/ppp device, packet and VJ compression, and multilink. * It talks to PPP `channels' via the interface defined in * include/linux/ppp_channel.h. Channels provide the basic means for * sending and receiving PPP frames on some kind of communications * channel. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. * * ==FILEVERSION 20041108== */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/ppp_defs.h> #include <linux/filter.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/ppp-comp.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/rwsem.h> #include <linux/stddef.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/slab.h> #include <asm/unaligned.h> #include <net/slhc_vj.h> #include <linux/atomic.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #define PPP_VERSION "2.4.2" /* * Network protocols we support. */ #define NP_IP 0 /* Internet Protocol V4 */ #define NP_IPV6 1 /* Internet Protocol V6 */ #define NP_IPX 2 /* IPX protocol */ #define NP_AT 3 /* Appletalk protocol */ #define NP_MPLS_UC 4 /* MPLS unicast */ #define NP_MPLS_MC 5 /* MPLS multicast */ #define NUM_NP 6 /* Number of NPs. 
*/

#define MPHDRLEN	6	/* multilink protocol header length */
#define MPHDRLEN_SSN	4	/* ditto with short sequence numbers */

/*
 * An instance of /dev/ppp can be associated with either a ppp
 * interface unit or a ppp channel.  In both cases, file->private_data
 * points to one of these.
 */
struct ppp_file {
	enum {
		INTERFACE=1, CHANNEL
	}		kind;
	struct sk_buff_head xq;		/* pppd transmit queue */
	struct sk_buff_head rq;		/* receive queue for pppd */
	wait_queue_head_t rwait;	/* for poll on reading /dev/ppp */
	atomic_t	refcnt;		/* # refs (incl /dev/ppp attached) */
	int		hdrlen;		/* space to leave for headers */
	int		index;		/* interface unit / channel number */
	int		dead;		/* unit/channel has been shut down */
};

/*
 * Map a struct ppp_file pointer back to the structure that embeds it
 * as its `file' member (struct ppp or struct channel).
 */
#define PF_TO_X(pf, X)		container_of(pf, X, file)

#define PF_TO_PPP(pf)		PF_TO_X(pf, struct ppp)
#define PF_TO_CHANNEL(pf)	PF_TO_X(pf, struct channel)

/*
 * Data structure to hold primary network stats for which
 * we want to use 64 bit storage.  Other network stats
 * are stored in dev->stats of the ppp structure.
 */
struct ppp_link_stats {
	u64 rx_packets;
	u64 tx_packets;
	u64 rx_bytes;
	u64 tx_bytes;
};

/*
 * Data structure describing one ppp unit.
 * A ppp unit corresponds to a ppp network interface device
 * and represents a multilink bundle.
 * It can have 0 or more ppp channels connected to it.
*/
/*
 * NOTE(review): the trailing hex numbers in some field comments (0, 48,
 * 4c, ...) look like historical byte offsets; they are presumably stale
 * and should not be relied upon.
 */
struct ppp {
	struct ppp_file	file;		/* stuff for read/write/poll 0 */
	struct file	*owner;		/* file that owns this unit 48 */
	struct list_head channels;	/* list of attached channels 4c */
	int		n_channels;	/* how many channels are attached 54 */
	spinlock_t	rlock;		/* lock for receive side 58 */
	spinlock_t	wlock;		/* lock for transmit side 5c */
	int		mru;		/* max receive unit 60 */
	unsigned int	flags;		/* control bits 64 */
	unsigned int	xstate;		/* transmit state bits 68 */
	unsigned int	rstate;		/* receive state bits 6c */
	int		debug;		/* debug flags 70 */
	struct slcompress *vj;		/* state for VJ header compression */
	enum NPmode	npmode[NUM_NP];	/* what to do with each net proto 78 */
	struct sk_buff	*xmit_pending;	/* a packet ready to go out 88 */
	struct compressor *xcomp;	/* transmit packet compressor 8c */
	void		*xc_state;	/* its internal state 90 */
	struct compressor *rcomp;	/* receive decompressor 94 */
	void		*rc_state;	/* its internal state 98 */
	unsigned long	last_xmit;	/* jiffies when last pkt sent 9c */
	unsigned long	last_recv;	/* jiffies when last pkt rcvd a0 */
	struct net_device *dev;		/* network interface device a4 */
	int		closing;	/* is device closing down? a8 */
#ifdef CONFIG_PPP_MULTILINK
	int		nxchan;		/* next channel to send something on */
	u32		nxseq;		/* next sequence number to send */
	int		mrru;		/* MP: max reconst. receive unit */
	u32		nextseq;	/* MP: seq no of next packet */
	u32		minseq;		/* MP: min of most recent seqnos */
	struct sk_buff_head mrq;	/* MP: receive reconstruction queue */
#endif /* CONFIG_PPP_MULTILINK */
#ifdef CONFIG_PPP_FILTER
	struct bpf_prog *pass_filter;	/* filter for packets to pass */
	struct bpf_prog *active_filter;/* filter for pkts to reset idle */
#endif /* CONFIG_PPP_FILTER */
	struct net	*ppp_net;	/* the net we belong to */
	struct ppp_link_stats stats64;	/* 64 bit network stats */
};

/*
 * Bits in flags: SC_NO_TCP_CCID, SC_CCP_OPEN, SC_CCP_UP, SC_LOOP_TRAFFIC,
 * SC_MULTILINK, SC_MP_SHORTSEQ, SC_MP_XSHORTSEQ, SC_COMP_TCP, SC_REJ_COMP_TCP,
 * SC_MUST_COMP
 * Bits in rstate: SC_DECOMP_RUN, SC_DC_ERROR, SC_DC_FERROR.
 * Bits in xstate: SC_COMP_RUN
 */
/* Mask of the bits that may be stored in ppp->flags (see list above). */
#define SC_FLAG_BITS	(SC_NO_TCP_CCID|SC_CCP_OPEN|SC_CCP_UP|SC_LOOP_TRAFFIC \
			 |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
			 |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)

/*
 * Private data structure for each channel.
 * This includes the data structure used for multilink.
*/
struct channel {
	struct ppp_file	file;		/* stuff for read/write/poll */
	struct list_head list;		/* link in all/new_channels list */
	struct ppp_channel *chan;	/* public channel data structure */
	struct rw_semaphore chan_sem;	/* protects `chan' during chan ioctl */
	spinlock_t	downl;		/* protects `chan', file.xq dequeue */
	struct ppp	*ppp;		/* ppp unit we're connected to */
	struct net	*chan_net;	/* the net channel belongs to */
	struct list_head clist;		/* link in list of channels per unit */
	rwlock_t	upl;		/* protects `ppp' */
#ifdef CONFIG_PPP_MULTILINK
	u8		avail;		/* flag used in multilink stuff */
	u8		had_frag;	/* >= 1 fragments have been sent */
	u32		lastseq;	/* MP: last sequence # received */
	int		speed;		/* speed of the corresponding ppp channel */
#endif /* CONFIG_PPP_MULTILINK */
};

/*
 * SMP locking issues:
 * Both the ppp.rlock and ppp.wlock locks protect the ppp.channels
 * list and the ppp.n_channels field, you need to take both locks
 * before you modify them.
 * The lock ordering is: channel.upl -> ppp.wlock -> ppp.rlock ->
 * channel.downl.
 */

/* Held for the duration of every ppp_ioctl() call. */
static DEFINE_MUTEX(ppp_mutex);
static atomic_t ppp_unit_count = ATOMIC_INIT(0);
static atomic_t channel_count = ATOMIC_INIT(0);

/* per-net private data for this module */
static int ppp_net_id __read_mostly;
struct ppp_net {
	/* units to ppp mapping */
	struct idr units_idr;

	/*
	 * all_ppp_mutex protects the units_idr mapping.
	 * It also ensures that finding a ppp unit in the units_idr
	 * map and updating its file.refcnt field is atomic.
	 */
	struct mutex all_ppp_mutex;

	/* channels */
	struct list_head all_channels;
	struct list_head new_channels;
	int last_channel_index;

	/*
	 * all_channels_lock protects all_channels and
	 * last_channel_index, and the atomicity of finding
	 * a channel and updating its file.refcnt field.
	 */
	spinlock_t all_channels_lock;
};

/* Get the PPP protocol number from a skb */
#define PPP_PROTO(skb)	get_unaligned_be16((skb)->data)

/* We limit the length of ppp->file.rq to this (arbitrary) value */
#define PPP_MAX_RQLEN	32

/*
 * Maximum number of multilink fragments queued up.
 * This has to be large enough to cope with the maximum latency of
 * the slowest channel relative to the others.  Strictly it should
 * depend on the number of channels and their characteristics.
 */
#define PPP_MP_MAX_QLEN	128

/* Multilink header bits. */
#define B	0x80		/* this fragment begins a packet */
#define E	0x40		/* this fragment ends a packet */

/* Compare multilink sequence numbers (assumed to be 32 bits wide) */
#define seq_before(a, b)	((s32)((a) - (b)) < 0)
#define seq_after(a, b)		((s32)((a) - (b)) > 0)

/* Prototypes. */
static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
			struct file *file, unsigned int cmd, unsigned long arg);
static void ppp_xmit_process(struct ppp *ppp);
static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb);
static void ppp_push(struct ppp *ppp);
static void ppp_channel_push(struct channel *pch);
static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb,
			      struct channel *pch);
static void ppp_receive_error(struct ppp *ppp);
static void ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb);
static struct sk_buff *ppp_decompress_frame(struct ppp *ppp,
					    struct sk_buff *skb);
#ifdef CONFIG_PPP_MULTILINK
static void ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb,
				struct channel *pch);
static void ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb);
static struct sk_buff *ppp_mp_reconstruct(struct ppp *ppp);
static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb);
#endif /* CONFIG_PPP_MULTILINK */
static int ppp_set_compress(struct ppp *ppp, unsigned long arg);
static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound);
static void ppp_ccp_closed(struct ppp *ppp);
static struct compressor *find_compressor(int type);
static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st);
static struct ppp *ppp_create_interface(struct net *net, int unit,
					struct file *file, int *retp);
static void init_ppp_file(struct ppp_file *pf, int kind);
static void ppp_destroy_interface(struct ppp *ppp);
static struct ppp *ppp_find_unit(struct ppp_net *pn, int unit);
static struct channel *ppp_find_channel(struct ppp_net *pn, int unit);
static int ppp_connect_channel(struct channel *pch, int unit);
static int ppp_disconnect_channel(struct channel *pch);
static void ppp_destroy_channel(struct channel *pch);
static int unit_get(struct idr *p, void *ptr);
static int unit_set(struct idr *p, void *ptr, int n);
static void unit_put(struct idr *p, int n);
static void *unit_find(struct idr *p, int n);

static const struct net_device_ops ppp_netdev_ops;

static struct class *ppp_class;

/* per net-namespace data */
static inline struct ppp_net *ppp_pernet(struct net *net)
{
	BUG_ON(!net);

	return net_generic(net, ppp_net_id);
}

/*
 * Translates a PPP protocol number to a NP index (NP == network protocol).
 * Returns -EINVAL for a protocol number that has no NP index.
 */
static inline int proto_to_npindex(int proto)
{
	switch (proto) {
	case PPP_IP:
		return NP_IP;
	case PPP_IPV6:
		return NP_IPV6;
	case PPP_IPX:
		return NP_IPX;
	case PPP_AT:
		return NP_AT;
	case PPP_MPLS_UC:
		return NP_MPLS_UC;
	case PPP_MPLS_MC:
		return NP_MPLS_MC;
	}
	return -EINVAL;
}

/* Translates an NP index into a PPP protocol number */
static const int npindex_to_proto[NUM_NP] = {
	PPP_IP,
	PPP_IPV6,
	PPP_IPX,
	PPP_AT,
	PPP_MPLS_UC,
	PPP_MPLS_MC,
};

/*
 * Translates an ethertype into an NP index.
 * Returns -1 for an ethertype that has no NP index; both ETH_P_PPPTALK
 * and ETH_P_ATALK map to NP_AT.
 */
static inline int ethertype_to_npindex(int ethertype)
{
	switch (ethertype) {
	case ETH_P_IP:
		return NP_IP;
	case ETH_P_IPV6:
		return NP_IPV6;
	case ETH_P_IPX:
		return NP_IPX;
	case ETH_P_PPPTALK:
	case ETH_P_ATALK:
		return NP_AT;
	case ETH_P_MPLS_UC:
		return NP_MPLS_UC;
	case ETH_P_MPLS_MC:
		return NP_MPLS_MC;
	}
	return -1;
}

/* Translates an NP index into an ethertype */
static const int
npindex_to_ethertype[NUM_NP] = { ETH_P_IP, ETH_P_IPV6, ETH_P_IPX, ETH_P_PPPTALK, ETH_P_MPLS_UC, ETH_P_MPLS_MC, }; /* * Locking shorthand. */ #define ppp_xmit_lock(ppp) spin_lock_bh(&(ppp)->wlock) #define ppp_xmit_unlock(ppp) spin_unlock_bh(&(ppp)->wlock) #define ppp_recv_lock(ppp) spin_lock_bh(&(ppp)->rlock) #define ppp_recv_unlock(ppp) spin_unlock_bh(&(ppp)->rlock) #define ppp_lock(ppp) do { ppp_xmit_lock(ppp); \ ppp_recv_lock(ppp); } while (0) #define ppp_unlock(ppp) do { ppp_recv_unlock(ppp); \ ppp_xmit_unlock(ppp); } while (0) /* * /dev/ppp device routines. * The /dev/ppp device is used by pppd to control the ppp unit. * It supports the read, write, ioctl and poll functions. * Open instances of /dev/ppp can be in one of three states: * unattached, attached to a ppp unit, or attached to a ppp channel. */ static int ppp_open(struct inode *inode, struct file *file) { /* * This could (should?) be enforced by the permissions on /dev/ppp. */ 1 if (!capable(CAP_NET_ADMIN)) return -EPERM; return 0; } static int ppp_release(struct inode *unused, struct file *file) { struct ppp_file *pf = file->private_data; struct ppp *ppp; if (pf) { file->private_data = NULL; if (pf->kind == INTERFACE) { ppp = PF_TO_PPP(pf); rtnl_lock(); if (file == ppp->owner) unregister_netdevice(ppp->dev); rtnl_unlock(); } if (atomic_dec_and_test(&pf->refcnt)) { switch (pf->kind) { case INTERFACE: ppp_destroy_interface(PF_TO_PPP(pf)); break; case CHANNEL: ppp_destroy_channel(PF_TO_CHANNEL(pf)); break; } } } return 0; } static ssize_t ppp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct ppp_file *pf = file->private_data; DECLARE_WAITQUEUE(wait, current); ssize_t ret; struct sk_buff *skb = NULL; struct iovec iov; struct iov_iter to; ret = count; if (!pf) return -ENXIO; add_wait_queue(&pf->rwait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); skb = skb_dequeue(&pf->rq); if (skb) break; ret = 0; if (pf->dead) break; if (pf->kind == INTERFACE) { /* * Return 0 
(EOF) on an interface that has no * channels connected, unless it is looping * network traffic (demand mode). */ struct ppp *ppp = PF_TO_PPP(pf); if (ppp->n_channels == 0 && (ppp->flags & SC_LOOP_TRAFFIC) == 0) break; } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; schedule(); } set_current_state(TASK_RUNNING); remove_wait_queue(&pf->rwait, &wait); if (!skb) goto out; ret = -EOVERFLOW; if (skb->len > count) goto outf; ret = -EFAULT; iov.iov_base = buf; iov.iov_len = count; iov_iter_init(&to, READ, &iov, 1, count); if (skb_copy_datagram_iter(skb, 0, &to, skb->len)) goto outf; ret = skb->len; outf: kfree_skb(skb); out: return ret; } static ssize_t ppp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct ppp_file *pf = file->private_data; struct sk_buff *skb; ssize_t ret; if (!pf) return -ENXIO; ret = -ENOMEM; skb = alloc_skb(count + pf->hdrlen, GFP_KERNEL); if (!skb) goto out; skb_reserve(skb, pf->hdrlen); ret = -EFAULT; if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); goto out; } skb_queue_tail(&pf->xq, skb); switch (pf->kind) { case INTERFACE: ppp_xmit_process(PF_TO_PPP(pf)); break; case CHANNEL: ppp_channel_push(PF_TO_CHANNEL(pf)); break; } ret = count; out: return ret; } /* No kernel lock - fine */ static unsigned int ppp_poll(struct file *file, poll_table *wait) { struct ppp_file *pf = file->private_data; unsigned int mask; if (!pf) return 0; poll_wait(file, &pf->rwait, wait); mask = POLLOUT | POLLWRNORM; if (skb_peek(&pf->rq)) mask |= POLLIN | POLLRDNORM; if (pf->dead) mask |= POLLHUP; else if (pf->kind == INTERFACE) { /* see comment in ppp_read */ struct ppp *ppp = PF_TO_PPP(pf); if (ppp->n_channels == 0 && (ppp->flags & SC_LOOP_TRAFFIC) == 0) mask |= POLLIN | POLLRDNORM; } return mask; } #ifdef CONFIG_PPP_FILTER static int get_filter(void __user *arg, struct sock_filter **p) { struct sock_fprog uprog; struct sock_filter *code = NULL; int len; 
if (copy_from_user(&uprog, arg, sizeof(uprog))) return -EFAULT; if (!uprog.len) { *p = NULL; return 0; } len = uprog.len * sizeof(struct sock_filter); code = memdup_user(uprog.filter, len); if (IS_ERR(code)) return PTR_ERR(code); *p = code; return uprog.len; } #endif /* CONFIG_PPP_FILTER */ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ppp_file *pf; struct ppp *ppp; int err = -EFAULT, val, val2, i; struct ppp_idle idle; struct npioctl npi; int unit, cflags; struct slcompress *vj; void __user *argp = (void __user *)arg; int __user *p = argp; mutex_lock(&ppp_mutex); pf = file->private_data; if (!pf) { err = ppp_unattached_ioctl(current->nsproxy->net_ns, pf, file, cmd, arg); goto out; } if (cmd == PPPIOCDETACH) { /* * We have to be careful here... if the file descriptor * has been dup'd, we could have another process in the * middle of a poll using the same file *, so we had * better not free the interface data structures - * instead we fail the ioctl. Even in this case, we * shut down the interface if we are the owner of it. * Actually, we should get rid of PPPIOCDETACH, userland * (i.e. pppd) could achieve the same effect by closing * this fd and reopening /dev/ppp. 
*/ err = -EINVAL; if (pf->kind == INTERFACE) { ppp = PF_TO_PPP(pf); rtnl_lock(); if (file == ppp->owner) unregister_netdevice(ppp->dev); rtnl_unlock(); } if (atomic_long_read(&file->f_count) < 2) { ppp_release(NULL, file); err = 0; } else pr_warn("PPPIOCDETACH file->f_count=%ld\n", atomic_long_read(&file->f_count)); goto out; } if (pf->kind == CHANNEL) { struct channel *pch; struct ppp_channel *chan; pch = PF_TO_CHANNEL(pf); switch (cmd) { case PPPIOCCONNECT: if (get_user(unit, p)) break; err = ppp_connect_channel(pch, unit); break; case PPPIOCDISCONN: err = ppp_disconnect_channel(pch); break; default: down_read(&pch->chan_sem); chan = pch->chan; err = -ENOTTY; if (chan && chan->ops->ioctl) err = chan->ops->ioctl(chan, cmd, arg); up_read(&pch->chan_sem); } goto out; } if (pf->kind != INTERFACE) { /* can't happen */ pr_err("PPP: not interface or channel??\n"); err = -EINVAL; goto out; } ppp = PF_TO_PPP(pf); switch (cmd) { case PPPIOCSMRU: if (get_user(val, p)) break; ppp->mru = val; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; ppp_lock(ppp); cflags = ppp->flags & ~val; #ifdef CONFIG_PPP_MULTILINK if (!(ppp->flags & SC_MULTILINK) && (val & SC_MULTILINK)) ppp->nextseq = 0; #endif ppp->flags = val & SC_FLAG_BITS; ppp_unlock(ppp); if (cflags & SC_CCP_OPEN) ppp_ccp_closed(ppp); err = 0; break; case PPPIOCGFLAGS: val = ppp->flags | ppp->xstate | ppp->rstate; if (put_user(val, p)) break; err = 0; break; case PPPIOCSCOMPRESS: err = ppp_set_compress(ppp, arg); break; case PPPIOCGUNIT: if (put_user(ppp->file.index, p)) break; err = 0; break; case PPPIOCSDEBUG: if (get_user(val, p)) break; ppp->debug = val; err = 0; break; case PPPIOCGDEBUG: if (put_user(ppp->debug, p)) break; err = 0; break; case PPPIOCGIDLE: idle.xmit_idle = (jiffies - ppp->last_xmit) / HZ; idle.recv_idle = (jiffies - ppp->last_recv) / HZ; if (copy_to_user(argp, &idle, sizeof(idle))) break; err = 0; break; case PPPIOCSMAXCID: if (get_user(val, p)) break; val2 = 15; if ((val >> 16) != 0) { 
val2 = val >> 16; val &= 0xffff; } vj = slhc_init(val2+1, val+1); if (IS_ERR(vj)) { err = PTR_ERR(vj); break; } ppp_lock(ppp); if (ppp->vj) slhc_free(ppp->vj); ppp->vj = vj; ppp_unlock(ppp); err = 0; break; case PPPIOCGNPMODE: case PPPIOCSNPMODE: if (copy_from_user(&npi, argp, sizeof(npi))) break; err = proto_to_npindex(npi.protocol); if (err < 0) break; i = err; if (cmd == PPPIOCGNPMODE) { err = -EFAULT; npi.mode = ppp->npmode[i]; if (copy_to_user(argp, &npi, sizeof(npi))) break; } else { ppp->npmode[i] = npi.mode; /* we may be able to transmit more packets now (??) */ netif_wake_queue(ppp->dev); } err = 0; break; #ifdef CONFIG_PPP_FILTER case PPPIOCSPASS: { struct sock_filter *code; err = get_filter(argp, &code); if (err >= 0) { struct bpf_prog *pass_filter = NULL; struct sock_fprog_kern fprog = { .len = err, .filter = code, }; err = 0; if (fprog.filter) err = bpf_prog_create(&pass_filter, &fprog); if (!err) { ppp_lock(ppp); if (ppp->pass_filter) bpf_prog_destroy(ppp->pass_filter); ppp->pass_filter = pass_filter; ppp_unlock(ppp); } kfree(code); } break; } case PPPIOCSACTIVE: { struct sock_filter *code; err = get_filter(argp, &code); if (err >= 0) { struct bpf_prog *active_filter = NULL; struct sock_fprog_kern fprog = { .len = err, .filter = code, }; err = 0; if (fprog.filter) err = bpf_prog_create(&active_filter, &fprog); if (!err) { ppp_lock(ppp); if (ppp->active_filter) bpf_prog_destroy(ppp->active_filter); ppp->active_filter = active_filter; ppp_unlock(ppp); } kfree(code); } break; } #endif /* CONFIG_PPP_FILTER */ #ifdef CONFIG_PPP_MULTILINK case PPPIOCSMRRU: if (get_user(val, p)) break; ppp_recv_lock(ppp); ppp->mrru = val; ppp_recv_unlock(ppp); err = 0; break; #endif /* CONFIG_PPP_MULTILINK */ default: err = -ENOTTY; } out: mutex_unlock(&ppp_mutex); return err; } static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf, struct file *file, unsigned int cmd, unsigned long arg) { int unit, err = -EFAULT; struct ppp *ppp; struct channel *chan; struct 
ppp_net *pn; int __user *p = (int __user *)arg; switch (cmd) { case PPPIOCNEWUNIT: /* Create a new ppp unit */ if (get_user(unit, p)) break; ppp = ppp_create_interface(net, unit, file, &err); if (!ppp) break; file->private_data = &ppp->file; err = -EFAULT; if (put_user(ppp->file.index, p)) break; err = 0; break; case PPPIOCATTACH: /* Attach to an existing ppp unit */ if (get_user(unit, p)) break; err = -ENXIO; pn = ppp_pernet(net); mutex_lock(&pn->all_ppp_mutex); ppp = ppp_find_unit(pn, unit); if (ppp) { atomic_inc(&ppp->file.refcnt); file->private_data = &ppp->file; err = 0; } mutex_unlock(&pn->all_ppp_mutex); break; case PPPIOCATTCHAN: if (get_user(unit, p)) break; err = -ENXIO; pn = ppp_pernet(net); spin_lock_bh(&pn->all_channels_lock); chan = ppp_find_channel(pn, unit); if (chan) { atomic_inc(&chan->file.refcnt); file->private_data = &chan->file; err = 0; } spin_unlock_bh(&pn->all_channels_lock); break; default: err = -ENOTTY; } return err; } static const struct file_operations ppp_device_fops = { .owner = THIS_MODULE, .read = ppp_read, .write = ppp_write, .poll = ppp_poll, .unlocked_ioctl = ppp_ioctl, .open = ppp_open, .release = ppp_release, .llseek = noop_llseek, }; static __net_init int ppp_init_net(struct net *net) { 12 struct ppp_net *pn = net_generic(net, ppp_net_id); idr_init(&pn->units_idr); mutex_init(&pn->all_ppp_mutex); INIT_LIST_HEAD(&pn->all_channels); INIT_LIST_HEAD(&pn->new_channels); spin_lock_init(&pn->all_channels_lock); return 0; } static __net_exit void ppp_exit_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); struct net_device *dev; struct net_device *aux; struct ppp *ppp; LIST_HEAD(list); int id; rtnl_lock(); for_each_netdev_safe(net, dev, aux) { if (dev->netdev_ops == &ppp_netdev_ops) unregister_netdevice_queue(dev, &list); } idr_for_each_entry(&pn->units_idr, ppp, id) /* Skip devices already unregistered by previous loop */ if (!net_eq(dev_net(ppp->dev), net)) unregister_netdevice_queue(ppp->dev, &list); 
unregister_netdevice_many(&list); rtnl_unlock(); mutex_destroy(&pn->all_ppp_mutex); idr_destroy(&pn->units_idr); } static struct pernet_operations ppp_net_ops = { .init = ppp_init_net, .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), }; #define PPP_MAJOR 108 /* Called at boot time if ppp is compiled into the kernel, or at module load time (from init_module) if compiled as a module. */ static int __init ppp_init(void) { int err; pr_info("PPP generic driver version " PPP_VERSION "\n"); err = register_pernet_device(&ppp_net_ops); if (err) { pr_err("failed to register PPP pernet device (%d)\n", err); goto out; } err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops); if (err) { pr_err("failed to register PPP device (%d)\n", err); goto out_net; } ppp_class = class_create(THIS_MODULE, "ppp"); if (IS_ERR(ppp_class)) { err = PTR_ERR(ppp_class); goto out_chrdev; } /* not a big deal if we fail here :-) */ device_create(ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp"); return 0; out_chrdev: unregister_chrdev(PPP_MAJOR, "ppp"); out_net: unregister_pernet_device(&ppp_net_ops); out: return err; } /* * Network interface unit routines. */ static netdev_tx_t ppp_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ppp *ppp = netdev_priv(dev); int npi, proto; unsigned char *pp; npi = ethertype_to_npindex(ntohs(skb->protocol)); if (npi < 0) goto outf; /* Drop, accept or reject the packet */ switch (ppp->npmode[npi]) { case NPMODE_PASS: break; case NPMODE_QUEUE: /* it would be nice to have a way to tell the network system to queue this one up for later. */ goto outf; case NPMODE_DROP: case NPMODE_ERROR: goto outf; } /* Put the 2-byte PPP protocol number on the front, making sure there is room for the address and control fields. 
*/ if (skb_cow_head(skb, PPP_HDRLEN)) goto outf; pp = skb_push(skb, 2); proto = npindex_to_proto[npi]; put_unaligned_be16(proto, pp); skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev))); skb_queue_tail(&ppp->file.xq, skb); ppp_xmit_process(ppp); return NETDEV_TX_OK; outf: kfree_skb(skb); ++dev->stats.tx_dropped; return NETDEV_TX_OK; } static int ppp_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ppp *ppp = netdev_priv(dev); int err = -EFAULT; void __user *addr = (void __user *) ifr->ifr_ifru.ifru_data; struct ppp_stats stats; struct ppp_comp_stats cstats; char *vers; switch (cmd) { case SIOCGPPPSTATS: ppp_get_stats(ppp, &stats); if (copy_to_user(addr, &stats, sizeof(stats))) break; err = 0; break; case SIOCGPPPCSTATS: memset(&cstats, 0, sizeof(cstats)); if (ppp->xc_state) ppp->xcomp->comp_stat(ppp->xc_state, &cstats.c); if (ppp->rc_state) ppp->rcomp->decomp_stat(ppp->rc_state, &cstats.d); if (copy_to_user(addr, &cstats, sizeof(cstats))) break; err = 0; break; case SIOCGPPPVER: vers = PPP_VERSION; if (copy_to_user(addr, vers, strlen(vers) + 1)) break; err = 0; break; default: err = -EINVAL; } return err; } static struct rtnl_link_stats64* ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) { struct ppp *ppp = netdev_priv(dev); ppp_recv_lock(ppp); stats64->rx_packets = ppp->stats64.rx_packets; stats64->rx_bytes = ppp->stats64.rx_bytes; ppp_recv_unlock(ppp); ppp_xmit_lock(ppp); stats64->tx_packets = ppp->stats64.tx_packets; stats64->tx_bytes = ppp->stats64.tx_bytes; ppp_xmit_unlock(ppp); stats64->rx_errors = dev->stats.rx_errors; stats64->tx_errors = dev->stats.tx_errors; stats64->rx_dropped = dev->stats.rx_dropped; stats64->tx_dropped = dev->stats.tx_dropped; stats64->rx_length_errors = dev->stats.rx_length_errors; return stats64; } static struct lock_class_key ppp_tx_busylock; static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; dev->qdisc_tx_busylock = &ppp_tx_busylock; ppp = netdev_priv(dev); /* 
Let the netdevice take a reference on the ppp file. This ensures * that ppp_destroy_interface() won't run before the device gets * unregistered. */ atomic_inc(&ppp->file.refcnt); return 0; } static void ppp_dev_uninit(struct net_device *dev) { struct ppp *ppp = netdev_priv(dev); struct ppp_net *pn = ppp_pernet(ppp->ppp_net); ppp_lock(ppp); ppp->closing = 1; ppp_unlock(ppp); mutex_lock(&pn->all_ppp_mutex); unit_put(&pn->units_idr, ppp->file.index); mutex_unlock(&pn->all_ppp_mutex); ppp->owner = NULL; ppp->file.dead = 1; wake_up_interruptible(&ppp->file.rwait); } static void ppp_dev_priv_destructor(struct net_device *dev) { struct ppp *ppp; ppp = netdev_priv(dev); if (atomic_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); } static const struct net_device_ops ppp_netdev_ops = { .ndo_init = ppp_dev_init, .ndo_uninit = ppp_dev_uninit, .ndo_start_xmit = ppp_start_xmit, .ndo_do_ioctl = ppp_net_ioctl, .ndo_get_stats64 = ppp_get_stats64, }; static void ppp_setup(struct net_device *dev) { dev->netdev_ops = &ppp_netdev_ops; dev->hard_header_len = PPP_HDRLEN; dev->mtu = PPP_MRU; dev->addr_len = 0; dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->destructor = ppp_dev_priv_destructor; netif_keep_dst(dev); } /* * Transmit-side routines. */ /* * Called to do any work queued up on the transmit side * that can now be done. */ static void ppp_xmit_process(struct ppp *ppp) { struct sk_buff *skb; ppp_xmit_lock(ppp); if (!ppp->closing) { ppp_push(ppp); while (!ppp->xmit_pending && (skb = skb_dequeue(&ppp->file.xq))) ppp_send_frame(ppp, skb); /* If there's no work left to do, tell the core net code that we can accept some more. 
*/ if (!ppp->xmit_pending && !skb_peek(&ppp->file.xq)) netif_wake_queue(ppp->dev); else netif_stop_queue(ppp->dev); } ppp_xmit_unlock(ppp); } static inline struct sk_buff * pad_compress_skb(struct ppp *ppp, struct sk_buff *skb) { struct sk_buff *new_skb; int len; int new_skb_size = ppp->dev->mtu + ppp->xcomp->comp_extra + ppp->dev->hard_header_len; int compressor_skb_size = ppp->dev->mtu + ppp->xcomp->comp_extra + PPP_HDRLEN; new_skb = alloc_skb(new_skb_size, GFP_ATOMIC); if (!new_skb) { if (net_ratelimit()) netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n"); return NULL; } if (ppp->dev->hard_header_len > PPP_HDRLEN) skb_reserve(new_skb, ppp->dev->hard_header_len - PPP_HDRLEN); /* compressor still expects A/C bytes in hdr */ len = ppp->xcomp->compress(ppp->xc_state, skb->data - 2, new_skb->data, skb->len + 2, compressor_skb_size); if (len > 0 && (ppp->flags & SC_CCP_UP)) { consume_skb(skb); skb = new_skb; skb_put(skb, len); skb_pull(skb, 2); /* pull off A/C bytes */ } else if (len == 0) { /* didn't compress, or CCP not up yet */ consume_skb(new_skb); new_skb = skb; } else { /* * (len < 0) * MPPE requires that we do not send unencrypted * frames. The compressor will return -1 if we * should drop the frame. We cannot simply test * the compress_proto because MPPE and MPPC share * the same number. */ if (net_ratelimit()) netdev_err(ppp->dev, "ppp: compressor dropped pkt\n"); kfree_skb(skb); consume_skb(new_skb); new_skb = NULL; } return new_skb; } /* * Compress and send a frame. * The caller should have locked the xmit path, * and xmit_pending should be 0. 
*/
/*
 * @ppp: unit the frame is transmitted on
 * @skb: frame to send, with the 2-byte PPP protocol number at the front
 *
 * Depending on the unit's state the frame is dropped, queued on
 * ppp->file.rq (SC_LOOP_TRAFFIC), or stored in ppp->xmit_pending and
 * pushed to a channel via ppp_push().
 */
static void
ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
{
	int proto = PPP_PROTO(skb);
	struct sk_buff *new_skb;
	int len;
	unsigned char *cp;

	/* protocol numbers below 0x8000 are handled as data packets */
	if (proto < 0x8000) {
#ifdef CONFIG_PPP_FILTER
		/* check if we should pass this packet */
		/* the filter instructions are constructed assuming
		   a four-byte PPP header on each packet */
		*skb_push(skb, 2) = 1;
		if (ppp->pass_filter &&
		    BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
			if (ppp->debug & 1)
				netdev_printk(KERN_DEBUG, ppp->dev,
					      "PPP: outbound frame "
					      "not passed\n");
			kfree_skb(skb);
			return;
		}
		/* if this packet passes the active filter, record the time */
		if (!(ppp->active_filter &&
		      BPF_PROG_RUN(ppp->active_filter, skb) == 0))
			ppp->last_xmit = jiffies;
		/* remove the two bytes pushed for the filter programs */
		skb_pull(skb, 2);
#else
		/* for data packets, record the time */
		ppp->last_xmit = jiffies;
#endif /* CONFIG_PPP_FILTER */
	}

	/* the byte count leaves out the 2-byte protocol field */
	++ppp->stats64.tx_packets;
	ppp->stats64.tx_bytes += skb->len - 2;

	switch (proto) {
	case PPP_IP:
		if (!ppp->vj || (ppp->flags & SC_COMP_TCP) == 0)
			break;
		/* try to do VJ TCP header compression */
		new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2,
				    GFP_ATOMIC);
		if (!new_skb) {
			netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n");
			goto drop;
		}
		skb_reserve(new_skb, ppp->dev->hard_header_len - 2);
		cp = skb->data + 2;
		len = slhc_compress(ppp->vj, cp, skb->len - 2,
				    new_skb->data + 2, &cp,
				    !(ppp->flags & SC_NO_TCP_CCID));
		/* cp still pointing into the old skb means nothing was moved */
		if (cp == skb->data + 2) {
			/* didn't compress */
			consume_skb(new_skb);
		} else {
			if (cp[0] & SL_TYPE_COMPRESSED_TCP) {
				proto = PPP_VJC_COMP;
				cp[0] &= ~SL_TYPE_COMPRESSED_TCP;
			} else {
				proto = PPP_VJC_UNCOMP;
				cp[0] = skb->data[2];
			}
			/* switch to the new skb and write its protocol field */
			consume_skb(skb);
			skb = new_skb;
			cp = skb_put(skb, len + 2);
			cp[0] = 0;
			cp[1] = proto;
		}
		break;

	case PPP_CCP:
		/* peek at outbound CCP frames */
		ppp_ccp_peek(ppp, skb, 0);
		break;
	}

	/* try to do packet compression */
	if ((ppp->xstate & SC_COMP_RUN) && ppp->xc_state &&
	    proto != PPP_LCP && proto != PPP_CCP) {
		if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) {
			if (net_ratelimit())
				netdev_err(ppp->dev,
					   "ppp: compression required but "
					   "down - pkt dropped.\n");
			goto drop;
		}
		/*
		 * NOTE(review): on a NULL return skb is NULL, so the
		 * kfree_skb() at drop: receives NULL.  The original frame
		 * must therefore have been released by pad_compress_skb()
		 * itself - confirm this for its allocation-failure path.
		 */
		skb = pad_compress_skb(ppp, skb);
		if (!skb)
			goto drop;
	}

	/*
	 * If we are waiting for traffic (demand dialling),
	 * queue it up for pppd to receive.
	 */
	if (ppp->flags & SC_LOOP_TRAFFIC) {
		if (ppp->file.rq.qlen > PPP_MAX_RQLEN)
			goto drop;
		skb_queue_tail(&ppp->file.rq, skb);
		wake_up_interruptible(&ppp->file.rwait);
		return;
	}

	ppp->xmit_pending = skb;
	ppp_push(ppp);
	return;

 drop:
	kfree_skb(skb);
	++ppp->dev->stats.tx_errors;
}

/*
 * Try to send the frame in xmit_pending.
 * The caller should have the xmit path locked.
 *
 * On return ppp->xmit_pending is NULL if the frame was accepted by a
 * channel or dropped; it is left set when the channel's start_xmit()
 * returned 0 or ppp_mp_explode() returned 0.
 */
static void
ppp_push(struct ppp *ppp)
{
	struct list_head *list;
	struct channel *pch;
	struct sk_buff *skb = ppp->xmit_pending;

	if (!skb)
		return;

	list = &ppp->channels;
	if (list_empty(list)) {
		/* nowhere to send the packet, just drop it */
		ppp->xmit_pending = NULL;
		kfree_skb(skb);
		return;
	}

	if ((ppp->flags & SC_MULTILINK) == 0) {
		/* not doing multilink: send it down the first channel */
		list = list->next;
		pch = list_entry(list, struct channel, clist);

		/* pch->chan is checked and used under pch->downl */
		spin_lock_bh(&pch->downl);
		if (pch->chan) {
			if (pch->chan->ops->start_xmit(pch->chan, skb))
				ppp->xmit_pending = NULL;
		} else {
			/* channel got unregistered */
			kfree_skb(skb);
			ppp->xmit_pending = NULL;
		}
		spin_unlock_bh(&pch->downl);
		return;
	}

#ifdef CONFIG_PPP_MULTILINK
	/* Multilink: fragment the packet over as many links
	   as can take the packet at the moment. */
	if (!ppp_mp_explode(ppp, skb))
		return;
#endif /* CONFIG_PPP_MULTILINK */

	/* the frame was fragmented or abandoned (or multilink is not built) */
	ppp->xmit_pending = NULL;
	kfree_skb(skb);
}

#ifdef CONFIG_PPP_MULTILINK
/* when true, ppp_mp_explode() drops a leading zero protocol byte */
static bool mp_protocol_compress __read_mostly = true;
module_param(mp_protocol_compress, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(mp_protocol_compress,
		 "compress protocol id in multilink fragments");

/*
 * Divide a packet to be transmitted into fragments and
 * send them out the individual links.
*/
/*
 * @ppp: unit whose channels carry the fragments
 * @skb: frame to fragment; its data is copied, the skb itself is not
 *       freed here
 *
 * Returns 0 if the frame cannot be taken now (too few free channels),
 * in which case the caller leaves it in xmit_pending.  Returns 1 once
 * the frame has been fragmented or abandoned; ppp_push() then frees it.
 */
static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
{
	int len, totlen;
	int i, bits, hdrlen, mtu;
	int flen;
	int navail, nfree, nzero;
	int nbigger;
	int totspeed;
	int totfree;
	unsigned char *p, *q;
	struct list_head *list;
	struct channel *pch;
	struct sk_buff *frag;
	struct ppp_channel *chan;

	totspeed = 0; /* total bitrate of the bundle */
	nfree = 0; /* # channels which have no packet already queued */
	navail = 0; /* total # of usable channels (not deregistered) */
	nzero = 0; /* number of channels with zero speed associated */
	totfree = 0; /* total # of channels available and
		      * having no queued packets before
		      * starting the fragmentation */

	hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;
	i = 0;
	/*
	 * First pass: classify every channel.
	 * avail == 0: deregistered, 1: usable, 2: usable and free.
	 */
	list_for_each_entry(pch, &ppp->channels, clist) {
		if (pch->chan) {
			pch->avail = 1;
			navail++;
			pch->speed = pch->chan->speed;
		} else {
			pch->avail = 0;
		}
		if (pch->avail) {
			if (skb_queue_empty(&pch->file.xq) ||
			    !pch->had_frag) {
				if (pch->speed == 0)
					nzero++;
				else
					totspeed += pch->speed;

				pch->avail = 2;
				++nfree;
				++totfree;
			}
			/* restart from the first channel that got no fragment */
			if (!pch->had_frag && i < ppp->nxchan)
				ppp->nxchan = i;
		}
		++i;
	}
	/*
	 * Don't start sending this packet unless at least half of
	 * the channels are free.  This gives much better TCP
	 * performance if we have a lot of channels.
	 */
	if (nfree == 0 || nfree < navail / 2)
		return 0; /* can't take now, leave it in xmit_pending */

	/* Do protocol field compression */
	p = skb->data;
	len = skb->len;
	if (*p == 0 && mp_protocol_compress) {
		++p;
		--len;
	}

	totlen = len;
	/* remainder bytes, handed out one at a time to zero-speed channels */
	nbigger = len % nfree;

	/* skip to the channel after the one we last used
	   and start at that one */
	list = &ppp->channels;
	for (i = 0; i < ppp->nxchan; ++i) {
		list = list->next;
		if (list == &ppp->channels) {
			i = 0;
			break;
		}
	}

	/* create a fragment for each channel */
	bits = B;
	while (len > 0) {
		list = list->next;
		if (list == &ppp->channels) {
			/* wrapped around the list head, which is not a channel */
			i = 0;
			continue;
		}
		pch = list_entry(list, struct channel, clist);
		++i;
		if (!pch->avail)
			continue;

		/*
		 * Skip this channel if it has a fragment pending already and
		 * we haven't given a fragment to all of the free channels.
		 */
		if (pch->avail == 1) {
			if (nfree > 0)
				continue;
		} else {
			pch->avail = 1;
		}

		/* check the channel's mtu and whether it is still attached. */
		spin_lock_bh(&pch->downl);
		if (pch->chan == NULL) {
			/* can't use this channel, it's being deregistered */
			if (pch->speed == 0)
				nzero--;
			else
				totspeed -= pch->speed;

			spin_unlock_bh(&pch->downl);
			pch->avail = 0;
			totlen = len;
			totfree--;
			nfree--;
			if (--navail == 0)
				break;
			continue;
		}

		/*
		 * If the channel speed is not set, divide
		 * the packet evenly among the free channels;
		 * otherwise divide it according to the speed
		 * of the channel we are going to transmit on.
		 */
		flen = len;
		if (nfree > 0) {
			if (pch->speed == 0) {
				flen = len/nfree;
				if (nbigger > 0) {
					flen++;
					nbigger--;
				}
			} else {
				flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) /
					((totspeed*totfree)/pch->speed)) - hdrlen;
				if (nbigger > 0) {
					flen += ((totfree - nzero)*pch->speed)/totspeed;
					nbigger -= ((totfree - nzero)*pch->speed)/
							totspeed;
				}
			}
			nfree--;
		}

		/*
		 * Check if we are on the last channel or
		 * we exceeded the length of the data to
		 * fragment.
		 */
		if ((nfree <= 0) || (flen > len))
			flen = len;
		/*
		 * It is not worth transmitting on slow channels:
		 * in that case the flen resulting from the
		 * above formula will be equal to or less than zero.
		 * Skip the channel in this case.
		 */
		if (flen <= 0) {
			pch->avail = 2;
			spin_unlock_bh(&pch->downl);
			continue;
		}

		/*
		 * hdrlen includes the 2-byte PPP protocol field, but the
		 * MTU counts only the payload excluding the protocol field.
		 * (RFC1661 Section 2)
		 */
		mtu = pch->chan->mtu - (hdrlen - 2);
		if (mtu < 4)
			mtu = 4;
		if (flen > mtu)
			flen = mtu;
		/* this fragment carries the rest of the packet: mark it as the end */
		if (flen == len)
			bits |= E;
		frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC);
		if (!frag)
			goto noskb;
		q = skb_put(frag, flen + hdrlen);

		/* make the MP header */
		put_unaligned_be16(PPP_MP, q);
		if (ppp->flags & SC_MP_XSHORTSEQ) {
			q[2] = bits + ((ppp->nxseq >> 8) & 0xf);
			q[3] = ppp->nxseq;
		} else {
			q[2] = bits;
			q[3] = ppp->nxseq >> 16;
			q[4] = ppp->nxseq >> 8;
			q[5] = ppp->nxseq;
		}

		memcpy(q + hdrlen, p, flen);

		/*
		 * Try to send it down the channel.  It is queued on the
		 * channel's xq instead when that queue is not empty or
		 * start_xmit() returns 0.
		 */
		chan = pch->chan;
		if (!skb_queue_empty(&pch->file.xq) ||
		    !chan->ops->start_xmit(chan, frag))
			skb_queue_tail(&pch->file.xq, frag);
		pch->had_frag = 1;
		p += flen;
		len -= flen;
		++ppp->nxseq;
		bits = 0;
		spin_unlock_bh(&pch->downl);
	}
	ppp->nxchan = i;

	return 1;

 noskb:
	/* reached with pch->downl still held */
	spin_unlock_bh(&pch->downl);
	if (ppp->debug & 1)
		netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
	++ppp->dev->stats.tx_errors;
	++ppp->nxseq;
	return 1;	/* abandon the frame */
}
#endif /* CONFIG_PPP_MULTILINK */

/*
 * Try to send data out on a channel.
*/
/*
 * @pch: channel whose transmit queue (pch->file.xq) is drained
 *
 * Once that queue is empty, the attached unit (if any) gets a chance
 * to send more via ppp_xmit_process().
 */
static void
ppp_channel_push(struct channel *pch)
{
	struct sk_buff *skb;
	struct ppp *ppp;

	spin_lock_bh(&pch->downl);
	if (pch->chan) {
		while (!skb_queue_empty(&pch->file.xq)) {
			skb = skb_dequeue(&pch->file.xq);
			if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
				/* put the packet back and try again later */
				skb_queue_head(&pch->file.xq, skb);
				break;
			}
		}
	} else {
		/* channel got deregistered */
		skb_queue_purge(&pch->file.xq);
	}
	spin_unlock_bh(&pch->downl);
	/* see if there is anything from the attached unit to be sent */
	if (skb_queue_empty(&pch->file.xq)) {
		read_lock_bh(&pch->upl);
		ppp = pch->ppp;
		if (ppp)
			ppp_xmit_process(ppp);
		read_unlock_bh(&pch->upl);
	}
}

/*
 * Receive-side routines.
 */

/* Per-fragment multilink data kept in the skb control buffer (skb->cb) */
struct ppp_mp_skb_parm {
	u32		sequence;	/* sequence number expanded to 32 bits */
	u8		BEbits;		/* byte holding the B and E flag bits */
};
#define PPP_MP_CB(skb)	((struct ppp_mp_skb_parm *)((skb)->cb))

/*
 * Take the receive-side lock of @ppp and process @skb, or free it
 * if the unit is closing.
 */
static inline void
ppp_do_recv(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
	ppp_recv_lock(ppp);
	if (!ppp->closing)
		ppp_receive_frame(ppp, skb, pch);
	else
		kfree_skb(skb);
	ppp_recv_unlock(ppp);
}

/*
 * Hand a received frame from a channel to the generic layer.
 *
 * @chan: channel the frame arrived on
 * @skb:  received frame; it is freed, queued or passed on in all cases
 *
 * Frames are queued on the channel's own receive queue when the channel
 * is not attached to a unit, when the protocol number is >= 0xc000, or
 * when it is PPP_CCPFRAG; everything else goes to the attached unit.
 */
void
ppp_input(struct ppp_channel *chan, struct sk_buff *skb)
{
	struct channel *pch = chan->ppp;
	int proto;

	if (!pch) {
		kfree_skb(skb);
		return;
	}

	read_lock_bh(&pch->upl);
	/* need at least the 2-byte protocol field */
	if (!pskb_may_pull(skb, 2)) {
		kfree_skb(skb);
		if (pch->ppp) {
			++pch->ppp->dev->stats.rx_length_errors;
			ppp_receive_error(pch->ppp);
		}
		goto done;
	}

	proto = PPP_PROTO(skb);
	if (!pch->ppp || proto >= 0xc000 || proto == PPP_CCPFRAG) {
		/* put it on the channel queue */
		skb_queue_tail(&pch->file.rq, skb);
		/* drop old frames if queue too long */
		while (pch->file.rq.qlen > PPP_MAX_RQLEN &&
		       (skb = skb_dequeue(&pch->file.rq)))
			kfree_skb(skb);
		wake_up_interruptible(&pch->file.rwait);
	} else {
		ppp_do_recv(pch->ppp, skb, pch);
	}

done:
	read_unlock_bh(&pch->upl);
}

/* Put a 0-length skb in the receive queue as an error indication */
void
ppp_input_error(struct ppp_channel *chan, int code)
{
	struct channel *pch = chan->ppp;
	struct sk_buff *skb;

	if (!pch)
		return;

	read_lock_bh(&pch->upl);
	if (pch->ppp) {
		skb = alloc_skb(0, GFP_ATOMIC);
		if (skb) {
			skb->len = 0;		/* probably unnecessary */
			skb->cb[0] = code;
			ppp_do_recv(pch->ppp, skb, pch);
		}
	}
	read_unlock_bh(&pch->upl);
}

/*
 * We come in here to process a received frame.
 * The receive side of the ppp unit is locked.
 */
static void
ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
	/* note: a 0-length skb is used as an error indication */
	if (skb->len > 0) {
		skb_checksum_complete_unset(skb);
#ifdef CONFIG_PPP_MULTILINK
		/* XXX do channel-level decompression here */
		if (PPP_PROTO(skb) == PPP_MP)
			ppp_receive_mp_frame(ppp, skb, pch);
		else
#endif /* CONFIG_PPP_MULTILINK */
			ppp_receive_nonmp_frame(ppp, skb);
	} else {
		kfree_skb(skb);
		ppp_receive_error(ppp);
	}
}

/* Count a receive error on the unit and notify the VJ state, if any */
static void
ppp_receive_error(struct ppp *ppp)
{
	++ppp->dev->stats.rx_errors;
	if (ppp->vj)
		slhc_toss(ppp->vj);
}

/*
 * Process a received frame that is not a multilink fragment:
 * decompress it if needed, then either queue it on ppp->file.rq
 * (no network protocol index for it) or give it to the network stack.
 * The skb is consumed on every path.
 */
static void
ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
{
	struct sk_buff *ns;
	int proto, len, npi;

	/*
	 * Decompress the frame, if compressed.
	 * Note that some decompressors need to see uncompressed frames
	 * that come in as well as compressed frames.
	 */
	if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN) &&
	    (ppp->rstate & (SC_DC_FERROR | SC_DC_ERROR)) == 0)
		skb = ppp_decompress_frame(ppp, skb);

	if (ppp->flags & SC_MUST_COMP && ppp->rstate & SC_DC_FERROR)
		goto err;

	proto = PPP_PROTO(skb);
	switch (proto) {
	case PPP_VJC_COMP:
		/* decompress VJ compressed packets */
		if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
			goto err;

		if (skb_tailroom(skb) < 124 || skb_cloned(skb)) {
			/* copy to a new sk_buff with more tailroom */
			ns = dev_alloc_skb(skb->len + 128);
			if (!ns) {
				netdev_err(ppp->dev, "PPP: no memory "
					   "(VJ decomp)\n");
				goto err;
			}
			skb_reserve(ns, 2);
			skb_copy_bits(skb, 0, skb_put(ns, skb->len), skb->len);
			consume_skb(skb);
			skb = ns;
		}
		else
			skb->ip_summed = CHECKSUM_NONE;

		len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2);
		if (len <= 0) {
			netdev_printk(KERN_DEBUG, ppp->dev,
				      "PPP: VJ decompression error\n");
			goto err;
		}
		/* len excludes the protocol field; resize the skb to match */
		len += 2;
		if (len > skb->len)
			skb_put(skb, len - skb->len);
		else if (len < skb->len)
			skb_trim(skb, len);
		proto = PPP_IP;
		break;

	case PPP_VJC_UNCOMP:
		if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
			goto err;

		/* Until we fix the decompressor need to make sure
		 * data portion is linear.
		 */
		if (!pskb_may_pull(skb, skb->len))
			goto err;

		if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) {
			netdev_err(ppp->dev, "PPP: VJ uncompressed error\n");
			goto err;
		}
		proto = PPP_IP;
		break;

	case PPP_CCP:
		ppp_ccp_peek(ppp, skb, 1);
		break;
	}

	/* the byte count leaves out the 2-byte protocol field */
	++ppp->stats64.rx_packets;
	ppp->stats64.rx_bytes += skb->len - 2;

	npi = proto_to_npindex(proto);
	if (npi < 0) {
		/* control or unknown frame - pass it to pppd */
		skb_queue_tail(&ppp->file.rq, skb);
		/* limit queue length by dropping old frames */
		while (ppp->file.rq.qlen > PPP_MAX_RQLEN &&
		       (skb = skb_dequeue(&ppp->file.rq)))
			kfree_skb(skb);
		/* wake up any process polling or blocking on read */
		wake_up_interruptible(&ppp->file.rwait);

	} else {
		/* network protocol frame - give it to the kernel */

#ifdef CONFIG_PPP_FILTER
		/* check if the packet passes the pass and active filters */
		/* the filter instructions are constructed assuming
		   a four-byte PPP header on each packet */
		if (ppp->pass_filter || ppp->active_filter) {
			if (skb_unclone(skb, GFP_ATOMIC))
				goto err;

			*skb_push(skb, 2) = 0;
			if (ppp->pass_filter &&
			    BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
				if (ppp->debug & 1)
					netdev_printk(KERN_DEBUG, ppp->dev,
						      "PPP: inbound frame "
						      "not passed\n");
				kfree_skb(skb);
				return;
			}
			if (!(ppp->active_filter &&
			      BPF_PROG_RUN(ppp->active_filter, skb) == 0))
				ppp->last_recv = jiffies;
			__skb_pull(skb, 2);
		} else
#endif /* CONFIG_PPP_FILTER */
			ppp->last_recv = jiffies;

		/* deliver only if the interface is up and the mode is PASS */
		if ((ppp->dev->flags & IFF_UP) == 0 ||
		    ppp->npmode[npi] != NPMODE_PASS) {
			kfree_skb(skb);
		} else {
			/* chop off protocol */
			skb_pull_rcsum(skb, 2);
			skb->dev = ppp->dev;
			skb->protocol = htons(npindex_to_ethertype[npi]);
			skb_reset_mac_header(skb);
			skb_scrub_packet(skb, !net_eq(ppp->ppp_net,
						      dev_net(ppp->dev)));
			netif_rx(skb);
		}
	}
	return;

 err:
	kfree_skb(skb);
	ppp_receive_error(ppp);
}

/*
 * Run the receive decompressor over a frame.
 *
 * Returns the skb to continue with: a new skb holding the decompressed
 * frame (the original is consumed), or the original @skb when it was
 * not a PPP_COMP frame or when an error occurred.  On error
 * SC_DC_ERROR is set in ppp->rstate and a receive error is counted;
 * the original skb is still returned, not freed.
 */
static struct sk_buff *
ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb)
{
	int proto = PPP_PROTO(skb);
	struct sk_buff *ns;
	int len;

	/* Until we fix all the decompressors, need to make sure
	 * data portion is linear.
	 */
	if (!pskb_may_pull(skb, skb->len))
		goto err;

	if (proto == PPP_COMP) {
		int obuff_size;

		/* MPPE gets one extra byte of output buffer */
		switch(ppp->rcomp->compress_proto) {
		case CI_MPPE:
			obuff_size = ppp->mru + PPP_HDRLEN + 1;
			break;
		default:
			obuff_size = ppp->mru + PPP_HDRLEN;
			break;
		}

		ns = dev_alloc_skb(obuff_size);
		if (!ns) {
			netdev_err(ppp->dev, "ppp_decompress_frame: "
				   "no memory\n");
			goto err;
		}
		/* the decompressor still expects the A/C bytes in the hdr */
		len = ppp->rcomp->decompress(ppp->rc_state, skb->data - 2,
				skb->len + 2, ns->data, obuff_size);
		if (len < 0) {
			/* Pass the compressed frame to pppd as an
			   error indication. */
			if (len == DECOMP_FATALERROR)
				ppp->rstate |= SC_DC_FERROR;
			kfree_skb(ns);
			goto err;
		}

		consume_skb(skb);
		skb = ns;
		skb_put(skb, len);
		skb_pull(skb, 2);	/* pull off the A/C bytes */

	} else {
		/* Uncompressed frame - pass to decompressor so it
		   can update its dictionary if necessary. */
		if (ppp->rcomp->incomp)
			ppp->rcomp->incomp(ppp->rc_state, skb->data - 2,
					   skb->len + 2);
	}

	return skb;

 err:
	ppp->rstate |= SC_DC_ERROR;
	ppp_receive_error(ppp);
	return skb;
}

#ifdef CONFIG_PPP_MULTILINK
/*
 * Receive a multilink frame.
 * We put it on the reconstruction queue and then pull off
 * as many completed frames as we can.
 */
static void
ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
	u32 mask, seq;
	struct channel *ch;
	int mphdrlen = (ppp->flags & SC_MP_SHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;

	/* need the MP header plus one byte; nothing is accepted until mrru is set */
	if (!pskb_may_pull(skb, mphdrlen + 1) || ppp->mrru == 0)
		goto err;		/* no good, throw it away */

	/* Decode sequence number and begin/end bits */
	if (ppp->flags & SC_MP_SHORTSEQ) {
		seq = ((skb->data[2] & 0x0f) << 8) | skb->data[3];
		mask = 0xfff;
	} else {
		seq = (skb->data[3] << 16) | (skb->data[4] << 8)| skb->data[5];
		mask = 0xffffff;
	}
	PPP_MP_CB(skb)->BEbits = skb->data[2];
	skb_pull(skb, mphdrlen);	/* pull off PPP and MP headers */

	/*
	 * Do protocol ID decompression on the first fragment of each packet.
	 */
	if ((PPP_MP_CB(skb)->BEbits & B) && (skb->data[0] & 1))
		*skb_push(skb, 1) = 0;

	/*
	 * Expand sequence number to 32 bits, making it as close
	 * as possible to ppp->minseq.
	 */
	seq |= ppp->minseq & ~mask;
	if ((int)(ppp->minseq - seq) > (int)(mask >> 1))
		seq += mask + 1;
	else if ((int)(seq - ppp->minseq) > (int)(mask >> 1))
		seq -= mask + 1;	/* should never happen */
	PPP_MP_CB(skb)->sequence = seq;
	pch->lastseq = seq;

	/*
	 * If this packet comes before the next one we were expecting,
	 * drop it.
	 */
	if (seq_before(seq, ppp->nextseq)) {
		kfree_skb(skb);
		++ppp->dev->stats.rx_dropped;
		ppp_receive_error(ppp);
		return;
	}

	/*
	 * Reevaluate minseq, the minimum over all channels of the
	 * last sequence number received on each channel.  Because of
	 * the increasing sequence number rule, we know that any fragment
	 * before `minseq' which hasn't arrived is never going to arrive.
	 * The list of channels can't change because we have the receive
	 * side of the ppp unit locked.
	 */
	list_for_each_entry(ch, &ppp->channels, clist) {
		if (seq_before(ch->lastseq, seq))
			seq = ch->lastseq;
	}
	/* minseq only ever moves forward */
	if (seq_before(ppp->minseq, seq))
		ppp->minseq = seq;

	/* Put the fragment on the reconstruction queue */
	ppp_mp_insert(ppp, skb);

	/* If the queue is getting long, don't wait any longer for packets
	   before the start of the queue. */
	if (skb_queue_len(&ppp->mrq) >= PPP_MP_MAX_QLEN) {
		struct sk_buff *mskb = skb_peek(&ppp->mrq);
		if (seq_before(ppp->minseq, PPP_MP_CB(mskb)->sequence))
			ppp->minseq = PPP_MP_CB(mskb)->sequence;
	}

	/* Pull completed packets off the queue and receive them. */
	while ((skb = ppp_mp_reconstruct(ppp))) {
		if (pskb_may_pull(skb, 2))
			ppp_receive_nonmp_frame(ppp, skb);
		else {
			++ppp->dev->stats.rx_length_errors;
			kfree_skb(skb);
			ppp_receive_error(ppp);
		}
	}

	return;

 err:
	kfree_skb(skb);
	ppp_receive_error(ppp);
}

/*
 * Insert a fragment on the MP reconstruction queue.
 * The queue is ordered by increasing sequence number.
*/
/*
 * @ppp: unit whose reconstruction queue (ppp->mrq) receives the fragment
 * @skb: fragment; PPP_MP_CB(skb)->sequence must already be set
 */
static void
ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb)
{
	struct sk_buff *p;
	struct sk_buff_head *list = &ppp->mrq;
	u32 seq = PPP_MP_CB(skb)->sequence;

	/* N.B. we don't need to lock the list lock because we have the
	   ppp unit receive-side lock. */
	/* stop at the first queued fragment whose sequence is after ours */
	skb_queue_walk(list, p) {
		if (seq_before(seq, PPP_MP_CB(p)->sequence))
			break;
	}
	__skb_queue_before(list, p, skb);
}

/*
 * Reconstruct a packet from the MP fragment queue.
 * We go through increasing sequence numbers until we find a
 * complete packet, or we get to the sequence number for a fragment
 * which hasn't arrived but might still do so.
 *
 * Returns the reassembled packet, unlinked from the queue (further
 * fragments are chained on the head's frag_list), or NULL if no
 * complete packet is available or mrru is not set yet.
 */
static struct sk_buff *
ppp_mp_reconstruct(struct ppp *ppp)
{
	u32 seq = ppp->nextseq;
	u32 minseq = ppp->minseq;
	struct sk_buff_head *list = &ppp->mrq;
	struct sk_buff *p, *tmp;
	struct sk_buff *head, *tail;
	struct sk_buff *skb = NULL;
	int lost = 0, len = 0;

	if (ppp->mrru == 0)	/* do nothing until mrru is set */
		return NULL;
	head = list->next;
	tail = NULL;
	skb_queue_walk_safe(list, p, tmp) {
	again:
		if (seq_before(PPP_MP_CB(p)->sequence, seq)) {
			/* this can't happen, anyway ignore the skb */
			netdev_err(ppp->dev, "ppp_mp_reconstruct bad "
				   "seq %u < %u\n",
				   PPP_MP_CB(p)->sequence, seq);
			__skb_unlink(p, list);
			kfree_skb(p);
			continue;
		}
		if (PPP_MP_CB(p)->sequence != seq) {
			u32 oldseq;
			/* Fragment `seq' is missing.  If it is after
			   minseq, it might arrive later, so stop here. */
			if (seq_after(seq, minseq))
				break;
			/* Fragment `seq' is lost, keep going. */
			lost = 1;
			oldseq = seq;
			seq = seq_before(minseq, PPP_MP_CB(p)->sequence)?
				minseq + 1: PPP_MP_CB(p)->sequence;

			if (ppp->debug & 1)
				netdev_printk(KERN_DEBUG, ppp->dev,
					      "lost frag %u..%u\n",
					      oldseq, seq-1);

			/* re-examine the same fragment against the new seq */
			goto again;
		}

		/*
		 * At this point we know that all the fragments from
		 * ppp->nextseq to seq are either present or lost.
		 * Also, there are no complete packets in the queue
		 * that have no missing fragments and end before this
		 * fragment.
		 */

		/* B bit set indicates this fragment starts a packet */
		if (PPP_MP_CB(p)->BEbits & B) {
			head = p;
			lost = 0;
			len = 0;
		}

		len += p->len;

		/* Got a complete packet yet? */
		if (lost == 0 && (PPP_MP_CB(p)->BEbits & E) &&
		    (PPP_MP_CB(head)->BEbits & B)) {
			if (len > ppp->mrru + 2) {
				++ppp->dev->stats.rx_length_errors;
				netdev_printk(KERN_DEBUG, ppp->dev,
					      "PPP: reconstructed packet"
					      " is too long (%d)\n", len);
			} else {
				tail = p;
				break;
			}
			/* the oversized packet is skipped over */
			ppp->nextseq = seq + 1;
		}

		/*
		 * If this is the ending fragment of a packet,
		 * and we haven't found a complete valid packet yet,
		 * we can discard up to and including this fragment.
		 */
		if (PPP_MP_CB(p)->BEbits & E) {
			struct sk_buff *tmp2;

			skb_queue_reverse_walk_from_safe(list, p, tmp2) {
				if (ppp->debug & 1)
					netdev_printk(KERN_DEBUG, ppp->dev,
						      "discarding frag %u\n",
						      PPP_MP_CB(p)->sequence);
				__skb_unlink(p, list);
				kfree_skb(p);
			}
			head = skb_peek(list);
			if (!head)
				break;
		}
		++seq;
	}

	/* If we have a complete packet, copy it all into one skb. */
	if (tail != NULL) {
		/* If we have discarded any fragments,
		   signal a receive error. */
		if (PPP_MP_CB(head)->sequence != ppp->nextseq) {
			/* drop every fragment queued ahead of the packet head */
			skb_queue_walk_safe(list, p, tmp) {
				if (p == head)
					break;
				if (ppp->debug & 1)
					netdev_printk(KERN_DEBUG, ppp->dev,
						      "discarding frag %u\n",
						      PPP_MP_CB(p)->sequence);
				__skb_unlink(p, list);
				kfree_skb(p);
			}

			if (ppp->debug & 1)
				netdev_printk(KERN_DEBUG, ppp->dev,
					      " missed pkts %u..%u\n",
					      ppp->nextseq,
					      PPP_MP_CB(head)->sequence-1);
			++ppp->dev->stats.rx_dropped;
			ppp_receive_error(ppp);
		}

		skb = head;
		if (head != tail) {
			/* chain the remaining fragments onto head's frag_list */
			struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list;
			p = skb_queue_next(list, head);
			__skb_unlink(skb, list);
			skb_queue_walk_from_safe(list, p, tmp) {
				__skb_unlink(p, list);
				*fragpp = p;
				p->next = NULL;
				fragpp = &p->next;

				/* account for the fragment in the head skb */
				skb->len += p->len;
				skb->data_len += p->len;
				skb->truesize += p->truesize;

				if (p == tail)
					break;
			}
		} else {
			__skb_unlink(skb, list);
		}

		ppp->nextseq = PPP_MP_CB(tail)->sequence + 1;
	}

	return skb;
}
#endif /* CONFIG_PPP_MULTILINK */

/*
 * Channel interface.
 */

/* Create a new, unattached ppp channel. */
int ppp_register_channel(struct ppp_channel *chan)
{
	/* uses the network namespace of the calling task */
	return ppp_register_net_channel(current->nsproxy->net_ns, chan);
}

/*
 * Create a new, unattached ppp channel for specified net.
 *
 * Returns 0 on success or -ENOMEM if the channel structure cannot be
 * allocated.  On success chan->ppp points at the new structure, which
 * has been added to the namespace's new_channels list.
 */
int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
{
	struct channel *pch;
	struct ppp_net *pn;

	pch = kzalloc(sizeof(struct channel), GFP_KERNEL);
	if (!pch)
		return -ENOMEM;

	pn = ppp_pernet(net);

	pch->ppp = NULL;
	pch->chan = chan;
	/* NOTE(review): the matching put_net() is not visible in this part of the file */
	pch->chan_net = get_net(net);
	chan->ppp = pch;
	init_ppp_file(&pch->file, CHANNEL);
	pch->file.hdrlen = chan->hdrlen;
#ifdef CONFIG_PPP_MULTILINK
	pch->lastseq = -1;
#endif /* CONFIG_PPP_MULTILINK */
	init_rwsem(&pch->chan_sem);
	spin_lock_init(&pch->downl);
	rwlock_init(&pch->upl);

	/* index assignment and list insertion happen under all_channels_lock */
	spin_lock_bh(&pn->all_channels_lock);
	pch->file.index = ++pn->last_channel_index;
	list_add(&pch->list, &pn->new_channels);
	atomic_inc(&channel_count);
	spin_unlock_bh(&pn->all_channels_lock);

	return 0;
}

/*
 * Return the index of a channel.
*/ int ppp_channel_index(struct ppp_channel *chan) { struct channel *pch = chan->ppp; if (pch) return pch->file.index; return -1; } /* * Return the PPP unit number to which a channel is connected. */ int ppp_unit_number(struct ppp_channel *chan) { struct channel *pch = chan->ppp; int unit = -1; if (pch) { read_lock_bh(&pch->upl); if (pch->ppp) unit = pch->ppp->file.index; read_unlock_bh(&pch->upl); } return unit; } /* * Return the PPP device interface name of a channel. */ char *ppp_dev_name(struct ppp_channel *chan) { struct channel *pch = chan->ppp; char *name = NULL; if (pch) { read_lock_bh(&pch->upl); if (pch->ppp && pch->ppp->dev) name = pch->ppp->dev->name; read_unlock_bh(&pch->upl); } return name; } /* * Disconnect a channel from the generic layer. * This must be called in process context. */ void ppp_unregister_channel(struct ppp_channel *chan) { struct channel *pch = chan->ppp; struct ppp_net *pn; if (!pch) return; /* should never happen */ chan->ppp = NULL; /* * This ensures that we have returned from any calls into the * the channel's start_xmit or ioctl routine before we proceed. */ down_write(&pch->chan_sem); spin_lock_bh(&pch->downl); pch->chan = NULL; spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); pn = ppp_pernet(pch->chan_net); spin_lock_bh(&pn->all_channels_lock); list_del(&pch->list); spin_unlock_bh(&pn->all_channels_lock); pch->file.dead = 1; wake_up_interruptible(&pch->file.rwait); if (atomic_dec_and_test(&pch->file.refcnt)) ppp_destroy_channel(pch); } /* * Callback from a channel when it can accept more to transmit. * This should be called at BH/softirq level, not interrupt level. */ void ppp_output_wakeup(struct ppp_channel *chan) { struct channel *pch = chan->ppp; if (!pch) return; ppp_channel_push(pch); } /* * Compression control. */ /* Process the PPPIOCSCOMPRESS ioctl. 
*/ static int ppp_set_compress(struct ppp *ppp, unsigned long arg) { int err; struct compressor *cp, *ocomp; struct ppp_option_data data; void *state, *ostate; unsigned char ccp_option[CCP_MAX_OPTION_LENGTH]; err = -EFAULT; if (copy_from_user(&data, (void __user *) arg, sizeof(data)) || (data.length <= CCP_MAX_OPTION_LENGTH && copy_from_user(ccp_option, (void __user *) data.ptr, data.length))) goto out; err = -EINVAL; if (data.length > CCP_MAX_OPTION_LENGTH || ccp_option[1] < 2 || ccp_option[1] > data.length) goto out; cp = try_then_request_module( find_compressor(ccp_option[0]), "ppp-compress-%d", ccp_option[0]); if (!cp) goto out; err = -ENOBUFS; if (data.transmit) { state = cp->comp_alloc(ccp_option, data.length); if (state) { ppp_xmit_lock(ppp); ppp->xstate &= ~SC_COMP_RUN; ocomp = ppp->xcomp; ostate = ppp->xc_state; ppp->xcomp = cp; ppp->xc_state = state; ppp_xmit_unlock(ppp); if (ostate) { ocomp->comp_free(ostate); module_put(ocomp->owner); } err = 0; } else module_put(cp->owner); } else { state = cp->decomp_alloc(ccp_option, data.length); if (state) { ppp_recv_lock(ppp); ppp->rstate &= ~SC_DECOMP_RUN; ocomp = ppp->rcomp; ostate = ppp->rc_state; ppp->rcomp = cp; ppp->rc_state = state; ppp_recv_unlock(ppp); if (ostate) { ocomp->decomp_free(ostate); module_put(ocomp->owner); } err = 0; } else module_put(cp->owner); } out: return err; } /* * Look at a CCP packet and update our state accordingly. * We assume the caller has the xmit or recv path locked. */ static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound) { unsigned char *dp; int len; if (!pskb_may_pull(skb, CCP_HDRLEN + 2)) return; /* no header */ dp = skb->data + 2; switch (CCP_CODE(dp)) { case CCP_CONFREQ: /* A ConfReq starts negotiation of compression * in one direction of transmission, * and hence brings it down...but which way? 
* * Remember: * A ConfReq indicates what the sender would like to receive */ if(inbound) /* He is proposing what I should send */ ppp->xstate &= ~SC_COMP_RUN; else /* I am proposing to what he should send */ ppp->rstate &= ~SC_DECOMP_RUN; break; case CCP_TERMREQ: case CCP_TERMACK: /* * CCP is going down, both directions of transmission */ ppp->rstate &= ~SC_DECOMP_RUN; ppp->xstate &= ~SC_COMP_RUN; break; case CCP_CONFACK: if ((ppp->flags & (SC_CCP_OPEN | SC_CCP_UP)) != SC_CCP_OPEN) break; len = CCP_LENGTH(dp); if (!pskb_may_pull(skb, len + 2)) return; /* too short */ dp += CCP_HDRLEN; len -= CCP_HDRLEN; if (len < CCP_OPT_MINLEN || len < CCP_OPT_LENGTH(dp)) break; if (inbound) { /* we will start receiving compressed packets */ if (!ppp->rc_state) break; if (ppp->rcomp->decomp_init(ppp->rc_state, dp, len, ppp->file.index, 0, ppp->mru, ppp->debug)) { ppp->rstate |= SC_DECOMP_RUN; ppp->rstate &= ~(SC_DC_ERROR | SC_DC_FERROR); } } else { /* we will soon start sending compressed packets */ if (!ppp->xc_state) break; if (ppp->xcomp->comp_init(ppp->xc_state, dp, len, ppp->file.index, 0, ppp->debug)) ppp->xstate |= SC_COMP_RUN; } break; case CCP_RESETACK: /* reset the [de]compressor */ if ((ppp->flags & SC_CCP_UP) == 0) break; if (inbound) { if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN)) { ppp->rcomp->decomp_reset(ppp->rc_state); ppp->rstate &= ~SC_DC_ERROR; } } else { if (ppp->xc_state && (ppp->xstate & SC_COMP_RUN)) ppp->xcomp->comp_reset(ppp->xc_state); } break; } } /* Free up compression resources. 
*/ static void ppp_ccp_closed(struct ppp *ppp) { void *xstate, *rstate; struct compressor *xcomp, *rcomp; ppp_lock(ppp); ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP); ppp->xstate = 0; xcomp = ppp->xcomp; xstate = ppp->xc_state; ppp->xc_state = NULL; ppp->rstate = 0; rcomp = ppp->rcomp; rstate = ppp->rc_state; ppp->rc_state = NULL; ppp_unlock(ppp); if (xstate) { xcomp->comp_free(xstate); module_put(xcomp->owner); } if (rstate) { rcomp->decomp_free(rstate); module_put(rcomp->owner); } } /* List of compressors. */ static LIST_HEAD(compressor_list); static DEFINE_SPINLOCK(compressor_list_lock); struct compressor_entry { struct list_head list; struct compressor *comp; }; static struct compressor_entry * find_comp_entry(int proto) { struct compressor_entry *ce; list_for_each_entry(ce, &compressor_list, list) { if (ce->comp->compress_proto == proto) return ce; } return NULL; } /* Register a compressor */ int ppp_register_compressor(struct compressor *cp) { struct compressor_entry *ce; int ret; spin_lock(&compressor_list_lock); ret = -EEXIST; if (find_comp_entry(cp->compress_proto)) goto out; ret = -ENOMEM; ce = kmalloc(sizeof(struct compressor_entry), GFP_ATOMIC); if (!ce) goto out; ret = 0; ce->comp = cp; list_add(&ce->list, &compressor_list); out: spin_unlock(&compressor_list_lock); return ret; } /* Unregister a compressor */ void ppp_unregister_compressor(struct compressor *cp) { struct compressor_entry *ce; spin_lock(&compressor_list_lock); ce = find_comp_entry(cp->compress_proto); if (ce && ce->comp == cp) { list_del(&ce->list); kfree(ce); } spin_unlock(&compressor_list_lock); } /* Find a compressor. */ static struct compressor * find_compressor(int type) { struct compressor_entry *ce; struct compressor *cp = NULL; spin_lock(&compressor_list_lock); ce = find_comp_entry(type); if (ce) { cp = ce->comp; if (!try_module_get(cp->owner)) cp = NULL; } spin_unlock(&compressor_list_lock); return cp; } /* * Miscelleneous stuff. 
*/
/*
 * Fill @st from the unit's counters.  The structure is zeroed first; the
 * VJ section stays zero when the unit has no slcompress state.
 */
static void
ppp_get_stats(struct ppp *ppp, struct ppp_stats *st)
{
	struct slcompress *slc = ppp->vj;

	memset(st, 0, sizeof(*st));

	/* receive side */
	st->p.ppp_ipackets = ppp->stats64.rx_packets;
	st->p.ppp_ibytes = ppp->stats64.rx_bytes;
	st->p.ppp_ierrors = ppp->dev->stats.rx_errors;

	/* transmit side */
	st->p.ppp_opackets = ppp->stats64.tx_packets;
	st->p.ppp_obytes = ppp->stats64.tx_bytes;
	st->p.ppp_oerrors = ppp->dev->stats.tx_errors;

	if (slc) {
		/* outbound VJ header compression */
		st->vj.vjs_packets = slc->sls_o_compressed +
			slc->sls_o_uncompressed;
		st->vj.vjs_compressed = slc->sls_o_compressed;
		st->vj.vjs_searches = slc->sls_o_searches;
		st->vj.vjs_misses = slc->sls_o_misses;

		/* inbound VJ header decompression */
		st->vj.vjs_errorin = slc->sls_i_error;
		st->vj.vjs_tossed = slc->sls_i_tossed;
		st->vj.vjs_uncompressedin = slc->sls_i_uncompressed;
		st->vj.vjs_compressedin = slc->sls_i_compressed;
	}
}

/*
 * Stuff for handling the lists of ppp units and channels
 * and for initialization.
 */

/*
 * Create a new ppp interface unit.  Fails if it can't allocate memory
 * or if there is already a unit with the requested number.
 * unit == -1 means allocate a new number.
*/
/*
 * On success returns the new unit and stores 0 in *retp.  On failure
 * returns NULL and stores a negative errno in *retp.  Note that a failing
 * unit_set() for an explicitly requested unit is reported as -EEXIST,
 * because ret is not updated on that path.
 *
 * Takes rtnl_lock and pn->all_ppp_mutex internally.
 */
static struct ppp *ppp_create_interface(struct net *net, int unit,
					struct file *file, int *retp)
{
	struct ppp *ppp;
	struct ppp_net *pn;
	struct net_device *dev = NULL;
	int ret = -ENOMEM;
	int i;

	/* the struct ppp lives in the net_device private area */
	dev = alloc_netdev(sizeof(struct ppp), "", NET_NAME_UNKNOWN, ppp_setup);
	if (!dev)
		goto out1;

	pn = ppp_pernet(net);

	ppp = netdev_priv(dev);
	ppp->dev = dev;
	ppp->mru = PPP_MRU;
	init_ppp_file(&ppp->file, INTERFACE);
	ppp->file.hdrlen = PPP_HDRLEN - 2;	/* don't count proto bytes */
	ppp->owner = file;
	for (i = 0; i < NUM_NP; ++i)
		ppp->npmode[i] = NPMODE_PASS;
	INIT_LIST_HEAD(&ppp->channels);
	spin_lock_init(&ppp->rlock);
	spin_lock_init(&ppp->wlock);
#ifdef CONFIG_PPP_MULTILINK
	ppp->minseq = -1;
	skb_queue_head_init(&ppp->mrq);
#endif /* CONFIG_PPP_MULTILINK */
#ifdef CONFIG_PPP_FILTER
	ppp->pass_filter = NULL;
	ppp->active_filter = NULL;
#endif /* CONFIG_PPP_FILTER */

	/*
	 * drum roll: don't forget to set
	 * the net device is belong to
	 */
	dev_net_set(dev, net);

	/* rtnl first, then the per-net unit mutex; released in reverse */
	rtnl_lock();
	mutex_lock(&pn->all_ppp_mutex);

	if (unit < 0) {
		/* caller wants any free unit number */
		unit = unit_get(&pn->units_idr, ppp);
		if (unit < 0) {
			ret = unit;
			goto out2;
		}
	} else {
		ret = -EEXIST;
		if (unit_find(&pn->units_idr, unit))
			goto out2; /* unit already exists */
		/*
		 * if caller need a specified unit number
		 * lets try to satisfy him, otherwise --
		 * he should better ask us for new unit number
		 *
		 * NOTE: yes I know that returning EEXIST it's not
		 * fair but at least pppd will ask us to allocate
		 * new unit in this case so user is happy :)
		 */
		unit = unit_set(&pn->units_idr, ppp, unit);
		if (unit < 0)
			goto out2;
	}

	/* Initialize the new ppp unit */
	ppp->file.index = unit;
	sprintf(dev->name, "ppp%d", unit);

	ret = register_netdevice(dev);
	if (ret != 0) {
		/* give the unit number back before bailing out */
		unit_put(&pn->units_idr, unit);
		netdev_err(ppp->dev, "PPP: couldn't register device %s (%d)\n",
			   dev->name, ret);
		goto out2;
	}

	ppp->ppp_net = net;

	atomic_inc(&ppp_unit_count);
	mutex_unlock(&pn->all_ppp_mutex);
	rtnl_unlock();

	*retp = 0;
	return ppp;

out2:
	mutex_unlock(&pn->all_ppp_mutex);
	rtnl_unlock();
	free_netdev(dev);
out1:
	*retp = ret;
	return NULL;
}

/*
 * Initialize a ppp_file structure.
 *
 * Records @kind, initializes the transmit (xq) and receive (rq) queues and
 * the read wait queue, and sets the reference count to 1.
 */
static void
init_ppp_file(struct ppp_file *pf, int kind)
{
	pf->kind = kind;
	skb_queue_head_init(&pf->xq);
	skb_queue_head_init(&pf->rq);
	atomic_set(&pf->refcnt, 1);
	init_waitqueue_head(&pf->rwait);
}

/*
 * Free the memory used by a ppp unit.  This is only called once
 * there are no channels connected to the unit and no file structs
 * that reference the unit.
 *
 * If the unit is not marked dead or still has channels, an error is
 * logged and nothing is freed.
 */
static void ppp_destroy_interface(struct ppp *ppp)
{
	atomic_dec(&ppp_unit_count);

	if (!ppp->file.dead || ppp->n_channels) {
		/* "can't happen" */
		netdev_err(ppp->dev, "ppp: destroying ppp struct %p "
			   "but dead=%d n_channels=%d !\n",
			   ppp, ppp->file.dead, ppp->n_channels);
		return;
	}

	/* release compressor state, VJ state and all queued packets */
	ppp_ccp_closed(ppp);
	if (ppp->vj) {
		slhc_free(ppp->vj);
		ppp->vj = NULL;
	}
	skb_queue_purge(&ppp->file.xq);
	skb_queue_purge(&ppp->file.rq);
#ifdef CONFIG_PPP_MULTILINK
	skb_queue_purge(&ppp->mrq);
#endif /* CONFIG_PPP_MULTILINK */
#ifdef CONFIG_PPP_FILTER
	if (ppp->pass_filter) {
		bpf_prog_destroy(ppp->pass_filter);
		ppp->pass_filter = NULL;
	}

	if (ppp->active_filter) {
		bpf_prog_destroy(ppp->active_filter);
		ppp->active_filter = NULL;
	}
#endif /* CONFIG_PPP_FILTER */

	kfree_skb(ppp->xmit_pending);

	/* ppp itself is the netdev private area (see ppp_create_interface) */
	free_netdev(ppp->dev);
}

/*
 * Locate an existing ppp unit.
 * The caller should have locked the all_ppp_mutex.
 * Returns NULL-or-pointer as reported by unit_find().
 */
static struct ppp *
ppp_find_unit(struct ppp_net *pn, int unit)
{
	return unit_find(&pn->units_idr, unit);
}

/*
 * Locate an existing ppp channel.
 * The caller should have locked the all_channels_lock.
 * First we look in the new_channels list, then in the
 * all_channels list.  If found in the new_channels list,
 * we move it to the all_channels list.  This is for speed
 * when we have a lot of channels in use.
*/
/* Returns the channel whose file.index equals @unit, or NULL. */
static struct channel *
ppp_find_channel(struct ppp_net *pn, int unit)
{
	struct channel *pch;

	list_for_each_entry(pch, &pn->new_channels, list) {
		if (pch->file.index == unit) {
			list_move(&pch->list, &pn->all_channels);
			return pch;
		}
	}

	list_for_each_entry(pch, &pn->all_channels, list) {
		if (pch->file.index == unit)
			return pch;
	}

	return NULL;
}

/*
 * Connect a PPP channel to a PPP interface unit.
 *
 * Returns 0 on success, -ENXIO if @unit does not exist in the channel's
 * net, -EINVAL if the channel is already connected to a unit, or
 * -ENOTCONN if the channel has been unregistered (pch->chan is NULL).
 *
 * Locks are taken in the order all_ppp_mutex, pch->upl (write),
 * ppp_lock(), pch->downl.
 */
static int
ppp_connect_channel(struct channel *pch, int unit)
{
	struct ppp *ppp;
	struct ppp_net *pn;
	int ret = -ENXIO;
	int hdrlen;

	pn = ppp_pernet(pch->chan_net);

	mutex_lock(&pn->all_ppp_mutex);
	ppp = ppp_find_unit(pn, unit);
	if (!ppp)
		goto out;
	write_lock_bh(&pch->upl);
	ret = -EINVAL;
	if (pch->ppp)
		goto outl;

	ppp_lock(ppp);
	spin_lock_bh(&pch->downl);
	if (!pch->chan) {
		/* Don't connect unregistered channels */
		spin_unlock_bh(&pch->downl);
		ppp_unlock(ppp);
		ret = -ENOTCONN;
		goto outl;
	}
	spin_unlock_bh(&pch->downl);
	/* the unit must reserve room for the largest channel header */
	if (pch->file.hdrlen > ppp->file.hdrlen)
		ppp->file.hdrlen = pch->file.hdrlen;
	hdrlen = pch->file.hdrlen + 2;	/* for protocol bytes */
	if (hdrlen > ppp->dev->hard_header_len)
		ppp->dev->hard_header_len = hdrlen;
	list_add_tail(&pch->clist, &ppp->channels);
	++ppp->n_channels;
	pch->ppp = ppp;
	/* the channel holds a reference on the unit while connected */
	atomic_inc(&ppp->file.refcnt);
	ppp_unlock(ppp);
	ret = 0;

 outl:
	write_unlock_bh(&pch->upl);
 out:
	mutex_unlock(&pn->all_ppp_mutex);
	return ret;
}

/*
 * Disconnect a channel from its ppp unit.
 *
 * Returns 0 if the channel was connected, -EINVAL if it was not.  Drops
 * the reference the channel held on the unit and destroys the unit when
 * that was the last one.
 */
static int
ppp_disconnect_channel(struct channel *pch)
{
	struct ppp *ppp;
	int err = -EINVAL;

	/* atomically fetch and clear the unit pointer */
	write_lock_bh(&pch->upl);
	ppp = pch->ppp;
	pch->ppp = NULL;
	write_unlock_bh(&pch->upl);
	if (ppp) {
		/* remove it from the ppp unit's list */
		ppp_lock(ppp);
		list_del(&pch->clist);
		if (--ppp->n_channels == 0)
			wake_up_interruptible(&ppp->file.rwait);
		ppp_unlock(ppp);
		if (atomic_dec_and_test(&ppp->file.refcnt))
			ppp_destroy_interface(ppp);
		err = 0;
	}
	return err;
}

/*
 * Free up the resources used by a ppp channel.
*/ static void ppp_destroy_channel(struct channel *pch) { put_net(pch->chan_net); pch->chan_net = NULL; atomic_dec(&channel_count); if (!pch->file.dead) { /* "can't happen" */ pr_err("ppp: destroying undead channel %p !\n", pch); return; } skb_queue_purge(&pch->file.xq); skb_queue_purge(&pch->file.rq); kfree(pch); } static void __exit ppp_cleanup(void) { /* should never happen */ if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count)) pr_err("PPP: removing module but units remain!\n"); unregister_chrdev(PPP_MAJOR, "ppp"); device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0)); class_destroy(ppp_class); unregister_pernet_device(&ppp_net_ops); } /* * Units handling. Caller must protect concurrent access * by holding all_ppp_mutex */ /* associate pointer with specified number */ static int unit_set(struct idr *p, void *ptr, int n) { int unit; unit = idr_alloc(p, ptr, n, n + 1, GFP_KERNEL); if (unit == -ENOSPC) unit = -EINVAL; return unit; } /* get new free unit number and associate pointer with it */ static int unit_get(struct idr *p, void *ptr) { return idr_alloc(p, ptr, 0, 0, GFP_KERNEL); } /* put unit number back to a pool */ static void unit_put(struct idr *p, int n) { idr_remove(p, n); } /* get pointer associated with the number */ static void *unit_find(struct idr *p, int n) { return idr_find(p, n); } /* Module/initialization stuff */ module_init(ppp_init); module_exit(ppp_cleanup); EXPORT_SYMBOL(ppp_register_net_channel); EXPORT_SYMBOL(ppp_register_channel); EXPORT_SYMBOL(ppp_unregister_channel); EXPORT_SYMBOL(ppp_channel_index); EXPORT_SYMBOL(ppp_unit_number); EXPORT_SYMBOL(ppp_dev_name); EXPORT_SYMBOL(ppp_input); EXPORT_SYMBOL(ppp_input_error); EXPORT_SYMBOL(ppp_output_wakeup); EXPORT_SYMBOL(ppp_register_compressor); EXPORT_SYMBOL(ppp_unregister_compressor); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0); MODULE_ALIAS("devname:ppp");
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in asm-generic/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache.  This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so we changed this macro slightly
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);
	if (changed) {
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of *ptep; flush the TLB entry for
 * @address only if the bit was set.  Returns the old young state.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	int young;
	young = ptep_test_and_clear_young(vma, address, ptep);
	if (young)
		flush_tlb_page(vma, address);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Clear *ptep and return the previous entry.  The TLB entry for
 * @address is flushed only when pte_accessible() reports that the old
 * entry was accessible.
 */
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pte_t *ptep)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t pte;

	pte = ptep_get_and_clear(mm, address, ptep);
	if (pte_accessible(mm, pte))
		flush_tlb_page(vma, address);
	return pte;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE

/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Otherwise also, it can help optimize normal TLB flush in
 * THP regime. stock flush_tlb_range() typically has optimization to nuke the
 * entire TLB if flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single thp flush will
 * invalidate the entire TLB which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/*
 * PMD counterpart of ptep_set_access_flags(): installs @entry if it
 * differs from *pmdp and flushes the huge-page range.  @address must be
 * HPAGE_PMD aligned.  Returns whether the entry changed.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (changed) {
		set_pmd_at(vma->vm_mm, address, pmdp, entry);
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of *pmdp; flush the huge-page range only
 * if the bit was set.  Returns the old young state.
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * Clear a transparent-huge *pmdp, flush the huge-page range and return
 * the previous entry.
 */
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
{
	pmd_t pmd;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(!pmd_trans_huge(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}
#endif

#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
/* Mark *pmdp as splitting (pmd_mksplitting) and flush the range. */
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	pmd_t pmd = pmd_mksplitting(*pmdp);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
	/* tlb flush only to serialize against gup-fast */
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Stash @pgtable for @pmdp.  The caller must hold the pmd lock; the new
 * page becomes pmd_huge_pte(mm, pmdp) and any previously deposited pages
 * stay chained through their lru list heads.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/*
 * Take back the page currently stored in pmd_huge_pte(mm, pmdp).  The
 * caller must hold the pmd lock.
 */
/* no "address" argument so destroys page coloring of some arch */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	if (list_empty(&pgtable->lru))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
						    struct page, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/* Rewrite *pmdp as not-present (pmd_mknotpresent) and flush the range. */
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	pmd_t entry = *pmdp;
	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
#endif

#ifndef pmdp_collapse_flush
/*
 * Clear a non-huge *pmdp, flush the whole range with flush_tlb_range()
 * and return the previous entry.
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	/*
	 * pmd and hugepage pte format are same. So we could
	 * use the same function.
	 */
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

	/* collapse entails shooting down ptes not pmd */
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifndef _NF_NAT_CORE_H #define _NF_NAT_CORE_H #include <linux/list.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> /* This header used to share core functionality between the standalone NAT module, and the compatibility layer's use of NAT for masquerading. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb); int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family); static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { 620 if (manip == NF_NAT_MANIP_SRC) 497 return ct->status & IPS_SRC_NAT_DONE; else 620 return ct->status & IPS_DST_NAT_DONE; } struct nlattr; extern int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); #endif /* _NF_NAT_CORE_H */
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002		Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/workqueue.txt for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>

#include "workqueue_internal.h"

/*
 * Flag bits, sizing constants and timeouts used throughout this file.
 * POOL_* values are worker_pool flags and WORKER_* values are worker
 * flags; the remaining constants size hash tables and set timeouts and
 * nice levels.
 */
enum {
	/*
	 * worker_pool flags
	 *
	 * A bound pool is either associated or disassociated with its CPU.
	 * While associated (!DISASSOCIATED), all workers are bound to the
	 * CPU and none has %WORKER_UNBOUND set and concurrency management
	 * is in effect.
	 *
	 * While DISASSOCIATED, the cpu may be offline and all workers have
	 * %WORKER_UNBOUND set and concurrency management disabled, and may
	 * be executing on any CPU.  The pool behaves as an unbound one.
	 *
	 * Note that DISASSOCIATED should be flipped only while holding
	 * attach_mutex to avoid changing binding state while
	 * worker_attach_to_pool() is in progress.
	 */
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */

	/* worker flags */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	/* union of the worker flags listed below */
	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,

	NR_STD_WORKER_POOLS	= 2,		/* # standard pools per cpu */

	UNBOUND_POOL_HASH_ORDER	= 6,		/* hashed by pool->attrs */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */

	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */

	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
						/* call for help after 10ms
						   (min two ticks) */
	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */

	/*
	 * Rescue workers are used only on emergencies and shared by
	 * all cpus.  Give MIN_NICE.
	 */
	RESCUER_NICE_LEVEL	= MIN_NICE,
	HIGHPRI_NICE_LEVEL	= MIN_NICE,

	WQ_NAME_LEN		= 24,
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * X: During normal operation, modification requires pool->lock and should
 *    be done only from local cpu.  Either disabling preemption on local
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * A: pool->attach_mutex protected.
*
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      sched-RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
 *
 * MD: wq_mayday_lock protected.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * A pool of workers.  The letter prefix on each field comment names the
 * exclusion rule (from the list above) that protects the field.
 */
struct worker_pool {
	spinlock_t		lock;		/* the pool lock */
	int			cpu;		/* I: the associated cpu */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* X: flags */

	struct list_head	worklist;	/* L: list of pending works */
	int			nr_workers;	/* L: total number of workers */

	/* nr_idle includes the ones off idle_list for rebinding */
	int			nr_idle;	/* L: currently idle ones */

	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */

	/* a worker is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */

	/* see manage_workers() for details on the two manager mutexes */
	struct worker		*manager;	/* L: purely informational */
	struct mutex		attach_mutex;	/* attach/detach exclusion */
	struct list_head	workers;	/* A: attached workers */
	struct completion	*detach_completion; /* all workers detached */

	struct ida		worker_ida;	/* worker IDs for task name */

	struct workqueue_attrs	*attrs;		/* I: worker attributes */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */

	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;

	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);	/* alignment required, see above */

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* WQ: list of flushers */
	int			flush_color;	/* WQ: flush color waiting for */
	struct completion	done;		/* flush completion */
};

/* forward declaration; only used through a pointer below */
struct wq_device;

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
*/
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* I: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
	/* flexible array member: must remain the last field */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

static struct kmem_cache *pwq_cache;

static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

/* exposed as module parameter "disable_numa" with mode 0444 */
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

/* see the comment above the definition of WQ_POWER_EFFICIENT */
/* exposed as module parameter "power_efficient" with mode 0444 */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */

/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
static bool workqueue_freezing;		/* PL: have wqs started freezing?
*/ static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ /* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); struct workqueue_struct *system_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ !lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() 
&& \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "sched RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @pi: integer used for iteration * * This must be called either with wq_pool_mutex held or sched RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor * @pool: worker_pool to iterate workers of * * This must be called with @pool->attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue * * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. 
*/ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; static void *work_debug_hint(void *addr) { return ((struct work_struct *) addr)->func; } /* * fixup_init is called when: * - an active object is initialized */ static int work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); return 1; default: return 0; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ static int work_fixup_activate(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: /* * This is not really a fixup. The work struct was * statically initialized. We just make sure that it * is tracked in the object tracker. 
*/ if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; } WARN_ON_ONCE(1); return 0; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: return 0; } } /* * fixup_free is called when: * - an active object is freed */ static int work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); return 1; default: return 0; } } static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; static inline void debug_work_activate(struct work_struct *work) { 1221 debug_object_activate(work, &work_debug_descr); } static inline void debug_work_deactivate(struct work_struct *work) { 126 debug_object_deactivate(work, &work_debug_descr); } void __init_work(struct work_struct *work, int onstack) { 1777 if (onstack) 86 debug_object_init_on_stack(work, &work_debug_descr); else 1777 debug_object_init(work, &work_debug_descr); 1777 } EXPORT_SYMBOL_GPL(__init_work); void destroy_work_on_stack(struct work_struct *work) { debug_object_free(work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { destroy_timer_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** * worker_pool_assign_id - allocate ID and assing it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned * successfully, -errno on failure. 
*/ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; } return ret; } /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue * @node: the node ID * * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * * Return: The unbound pool_workqueue for @node. */ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { 525 assert_rcu_or_wq_mutex_or_pool_mutex(wq); /* * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a * delayed item is pending. The plan is to keep CPU -> NODE * mapping valid and stable across CPU on/offlines. Once that * happens, this workaround can be removed. */ if (unlikely(node == NUMA_NO_NODE)) return wq->dfl_pwq; 525 return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; } static int get_work_color(struct work_struct *work) { return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } static int work_next_color(int color) { return (color + 1) % WORK_NR_COLORS; } /* * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() * and clear_work_data() can be used to set the pwq, pool or clear * work->data. These functions should only be called while the work is * owned - ie. while the PENDING bit is set. 
* * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been * queued anywhere after initialization until it is sync canceled. pwq is * available only while the work item is queued. * * %WORK_OFFQ_CANCELING is used to mark a work item which is being * canceled. While being canceled, a work item may have its PENDING set * but stay off timer and worklist for arbitrarily long and nobody should * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { WARN_ON_ONCE(!work_pending(work)); 1237 atomic_long_set(&work->data, data | flags | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, unsigned long extra_flags) { 1221 set_work_data(work, (unsigned long)pwq, WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, int pool_id) { 126 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, WORK_STRUCT_PENDING); } static void set_work_pool_and_clear_pending(struct work_struct *work, int pool_id) { /* * The following wmb is paired with the implied mb in * test_and_set_bit(PENDING) and ensures all updates to @work made * here are visible to and precede any updates by the next PENDING * owner. */ 36 smp_wmb(); 36 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible * reordering can lead to a missed execution on attempt to qeueue * the same @work. E.g. 
consider this case: * * CPU#0 CPU#1 * ---------------------------- -------------------------------- * * 1 STORE event_indicated * 2 queue_work_on() { * 3 test_and_set_bit(PENDING) * 4 } set_..._and_clear_pending() { * 5 set_work_data() # clear bit * 6 smp_mb() * 7 work->current_func() { * 8 LOAD event_indicated * } * * Without an explicit full barrier speculative LOAD on line 8 can * be executed before CPU#0 does STORE on line 1. If that happens, * CPU#0 observes the PENDING bit is still set and new execution of * a @work is not queued in a hope, that CPU#1 will eventually * finish the queued @work. Meanwhile CPU#1 does not see * event_indicated is set, because speculative LOAD was executed * before actual STORE. */ smp_mb(); } static void clear_work_data(struct work_struct *work) { smp_wmb(); /* see set_work_pool_and_clear_pending() */ 67 set_work_data(work, WORK_STRUCT_NO_POOL, 0); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) 200 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); else return NULL; } /** * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under sched-RCU read lock. As such, this function should be * called under wq_pool_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. * * Return: The worker_pool @work was last associated with. %NULL if none. 
*/ static struct worker_pool *get_work_pool(struct work_struct *work) { 1272 unsigned long data = atomic_long_read(&work->data); int pool_id; 1272 assert_rcu_or_pool_mutex(); 1272 if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) 201 (data & WORK_STRUCT_WQ_DATA_MASK))->pool; 1271 pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; 807 return idr_find(&worker_pool_idr, pool_id); } /** * get_work_pool_id - return the worker pool ID a given work is associated with * @work: the work item of interest * * Return: The worker_pool ID @work was last associated with. * %WORK_OFFQ_POOL_NONE if none. */ static int get_work_pool_id(struct work_struct *work) { 103 unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; 103 return data >> WORK_OFFQ_POOL_SHIFT; } static void mark_work_canceling(struct work_struct *work) { 67 unsigned long pool_id = get_work_pool_id(work); pool_id <<= WORK_OFFQ_POOL_SHIFT; 67 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); } static bool work_is_canceling(struct work_struct *work) { 10 unsigned long data = atomic_long_read(&work->data); return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING); } /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that * they're being called with pool->lock held. */ static bool __need_more_worker(struct worker_pool *pool) { return !atomic_read(&pool->nr_running); } /* * Need to wake up a worker? Called from anything but currently * running workers. * * Note that, because unbound workers never contribute to nr_running, this * function will always return %true for unbound pools as long as the * worklist isn't empty. 
*/ static bool need_more_worker(struct worker_pool *pool) { return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ static bool may_start_working(struct worker_pool *pool) { return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { return !list_empty(&pool->worklist) && atomic_read(&pool->nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ static bool need_to_create_worker(struct worker_pool *pool) { return need_more_worker(pool) && !may_start_working(pool); } /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /* * Wake up functions. */ /* Return the first idle worker. Safe with preemption disabled */ static struct worker *first_idle_worker(struct worker_pool *pool) { 1208 if (unlikely(list_empty(&pool->idle_list))) return NULL; return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker * @pool: worker pool to wake worker from * * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(pool->lock). */ static void wake_up_worker(struct worker_pool *pool) { 1208 struct worker *worker = first_idle_worker(pool); 1211 if (likely(worker)) 1211 wake_up_process(worker->task); } /** * wq_worker_waking_up - a worker is waking up * @task: task waking up * @cpu: CPU @task is waking up to * * This function is called during try_to_wake_up() when a worker is * being awoken. 
* * CONTEXT: * spin_lock_irq(rq->lock) */ void wq_worker_waking_up(struct task_struct *task, int cpu) { 1409 struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) { 118 WARN_ON_ONCE(worker->pool->cpu != cpu); 118 atomic_inc(&worker->pool->nr_running); } 1409 } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by * returning pointer to its task. * * CONTEXT: * spin_lock_irq(rq->lock) * * Return: * Worker task on @cpu to wake up, %NULL if none. */ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; /* * Rescuers, which may not have all the fields set up like normal * workers, also reach here, let's not access anything before * checking NOT_RUNNING. */ if (worker->flags & WORKER_NOT_RUNNING) return NULL; pool = worker->pool; /* this can only happen on the local cpu */ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) return NULL; /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. * * NOT_RUNNING is clear. This means that we're bound to and * running on the local cpu w/ rq lock held and preemption * disabled, which in turn means that none else could be * manipulating idle_list, so dereferencing idle_list without pool * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_idle_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. 
* * CONTEXT: * spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { atomic_dec(&pool->nr_running); } worker->flags |= flags; } /** * worker_clr_flags - clear worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. * * CONTEXT: * spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); worker->flags &= ~flags; /* * If transitioning out of NOT_RUNNING, increment nr_running. Note * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask * of multiple flags, not a single flag. */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&pool->nr_running); } /** * find_worker_executing_work - find worker which is executing a work * @pool: pool of interest * @work: work to find worker for * * Find a worker which is executing @work on @pool by searching * @pool->busy_hash which is keyed by the address of @work. For a worker * to match, its current execution should match the address of @work and * its work function. This is to avoid unwanted dependency between * unrelated work executions through a work item being recycled while still * being executed. * * This is a bit tricky. A work item may be freed once its execution * starts and nothing prevents the freed area from being recycled for * another work item. 
If the same work item address ends up being reused * before the original execution finishes, workqueue will identify the * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * * This function checks the work item address and work function to avoid * false positives. Note that this isn't complete as one may construct a * work function which can introduce dependency onto itself through a * recycled work item. Well, if somebody wants to shoot oneself in the * foot that badly, there's only so much we can do, and if such deadlock * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * spin_lock_irq(pool->lock). * * Return: * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, struct work_struct *work) { struct worker *worker; 1224 hash_for_each_possible(pool->busy_hash, worker, hentry, (unsigned long)work) 128 if (worker->current_work == work && 128 worker->current_func == work->func) return worker; return NULL; } /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * * Schedule linked works starting from @work to @head. Work series to * be scheduled starts at @work and includes any consecutive work with * WORK_STRUCT_LINKED set in its predecessor. * * If @nextp is not NULL, it's updated to point to the next work of * the last scheduled work. This allows move_linked_works() to be * nested inside outer list_for_each_entry_safe(). * * CONTEXT: * spin_lock_irq(pool->lock). */ static void move_linked_works(struct work_struct *work, struct list_head *head, struct work_struct **nextp) { struct work_struct *n; /* * Linked worklist will always end before the end of the list, * use NULL for list head. 
*/ list_for_each_entry_safe_from(work, n, NULL, entry) { list_move_tail(&work->entry, head); if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; } /* * If we're already inside safe list traversal and have moved * multiple works to the scheduled queue, the next position * needs to be updated. */ if (nextp) *nextp = n; } /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * * Obtain an extra reference on @pwq. The caller should guarantee that * @pwq has positive refcnt and be holding the matching pool->lock. */ static void get_pwq(struct pool_workqueue *pwq) { 1221 lockdep_assert_held(&pwq->pool->lock); 1221 WARN_ON_ONCE(pwq->refcnt <= 0); 1221 pwq->refcnt++; } /** * put_pwq - put a pool_workqueue reference * @pwq: pool_workqueue to put * * Drop a reference of @pwq. If its refcnt reaches zero, schedule its * destruction. The caller should be holding the matching pool->lock. */ static void put_pwq(struct pool_workqueue *pwq) { 126 lockdep_assert_held(&pwq->pool->lock); 126 if (likely(--pwq->refcnt)) return; if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) return; /* * @pwq can't be released under pool->lock, bounce to * pwq_unbound_release_workfn(). This never recurses on the same * pool->lock as this path is taken only for unbound workqueues and * the release work item is scheduled on a per-cpu workqueue. To * avoid lockdep warning, unbound pool->locks are given lockdep * subclass of 1 in get_unbound_pool(). */ schedule_work(&pwq->unbound_release_work); } /** * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock * @pwq: pool_workqueue to put (can be %NULL) * * put_pwq() with locking. This function also allows %NULL @pwq. */ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* * As both pwqs and pools are sched-RCU protected, the * following lock operations are safe. 
*/ spin_lock_irq(&pwq->pool->lock); put_pwq(pwq); spin_unlock_irq(&pwq->pool->lock); } } static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; } static void pwq_activate_first_delayed(struct pool_workqueue *pwq) { struct work_struct *work = list_first_entry(&pwq->delayed_works, struct work_struct, entry); pwq_activate_delayed_work(work); } /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest * @color: color of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * * CONTEXT: * spin_lock_irq(pool->lock). */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { /* uncolored work items don't participate in flushing or nr_active */ 126 if (color == WORK_NO_COLOR) goto out_put; 126 pwq->nr_in_flight[color]--; pwq->nr_active--; if (!list_empty(&pwq->delayed_works)) { /* one down, submit a delayed one */ if (pwq->nr_active < pwq->max_active) pwq_activate_first_delayed(pwq); } /* is flush in progress and are we at the flushing tip? */ 126 if (likely(pwq->flush_color != color)) goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; /* * If this was the last pwq, wake up the first flusher. It * will handle the rest. */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); out_put: 126 put_pwq(pwq); } /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal * @is_dwork: @work is a delayed_work * @flags: place to store irq state * * Try to grab PENDING bit of @work. 
This function can handle @work in any * stable state - idle, on timer or on worklist. * * Return: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is * responsible for releasing it using local_irq_restore(*@flags). * * This function is safe to call from any context including IRQ handler. */ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, unsigned long *flags) { struct worker_pool *pool; struct pool_workqueue *pwq; 719 local_irq_save(*flags); /* try to steal the timer if it exists */ if (is_dwork) { struct delayed_work *dwork = to_delayed_work(work); /* * dwork->timer is irqsafe. If del_timer() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ 684 if (likely(del_timer(&dwork->timer))) return 1; } /* try to claim PENDING the normal way */ 617 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) 719 return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ 128 pool = get_work_pool(work); if (!pool) goto fail; 128 spin_lock(&pool->lock); /* * work->data is guaranteed to point to pwq only while the work * item is queued on pwq->wq, and both updating work->data to point * to pwq on queueing and to pool on dequeueing are done under * pwq->pool->lock. 
This in turn guarantees that, if work->data * points to pwq which is associated with a locked pool, the work * item is currently queued on that pool. */ 126 pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { 126 debug_work_deactivate(work); /* * A delayed work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left * on the delayed_list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ if (*work_data_bits(work) & WORK_STRUCT_DELAYED) pwq_activate_delayed_work(work); 126 list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, get_work_color(work)); /* work->data points to pwq iff queued, point to pool */ 126 set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); return 1; } 10 spin_unlock(&pool->lock); fail: 10 local_irq_restore(*flags); 10 if (work_is_canceling(work)) return -ENOENT; 10 cpu_relax(); return -EAGAIN; } /** * insert_work - insert a work into a pool * @pwq: pwq @work belongs to * @work: work to insert * @head: insertion point * @extra_flags: extra WORK_STRUCT_* flags to set * * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to * work_struct flags. * * CONTEXT: * spin_lock_irq(pool->lock). */ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { 1221 struct worker_pool *pool = pwq->pool; /* we own @work, set data and link */ 1221 set_work_pwq(work, pwq, extra_flags); 1221 list_add_tail(&work->entry, head); 1221 get_pwq(pwq); /* * Ensure either wq_worker_sleeping() sees the above * list_add_tail() or we see zero nr_running to avoid workers lying * around lazily while there are works to be processed. */ smp_mb(); if (__need_more_worker(pool)) 1208 wake_up_worker(pool); 1221 } /* * Test whether @work is being queued from another work executing on the * same workqueue. 
*/ static bool is_chained_work(struct workqueue_struct *wq) { struct worker *worker; worker = current_wq_worker(); /* * Return %true iff I'm a worker execuing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; } static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; struct worker_pool *last_pool; struct list_head *worklist; unsigned int work_flags; 1213 unsigned int req_cpu = cpu; /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should * happen with IRQ disabled. */ WARN_ON_ONCE(!irqs_disabled()); 1213 debug_work_activate(work); /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; retry: 1213 if (req_cpu == WORK_CPU_UNBOUND) 1187 cpu = raw_smp_processor_id(); /* pwq which will be used unless @work is executing elsewhere */ 1213 if (!(wq->flags & WQ_UNBOUND)) 772 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else 525 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. */ 1213 last_pool = get_work_pool(work); 772 if (last_pool && last_pool != pwq->pool) { struct worker *worker; 569 spin_lock(&last_pool->lock); 1213 worker = find_worker_executing_work(last_pool, work); 107 if (worker && worker->current_pwq->wq == wq) { 577 pwq = worker->current_pwq; } else { /* meh... not running there, queue here */ 558 spin_unlock(&last_pool->lock); spin_lock(&pwq->pool->lock); } } else { 1169 spin_lock(&pwq->pool->lock); } /* * pwq is determined and locked. For unbound pools, we could have * raced with pwq release and it could already be dead. 
If its * refcnt is zero, repeat pwq selection. Note that pwqs never die * without another pwq replacing it in the numa_pwq_tbl or while * work items are executing on it, so the retrying is guaranteed to * make forward-progress. */ 1213 if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { spin_unlock(&pwq->pool->lock); cpu_relax(); goto retry; } /* oops */ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", wq->name, cpu); } /* pwq determined, queue */ 1213 trace_workqueue_queue_work(req_cpu, pwq, work); 1213 if (WARN_ON(!list_empty(&work->entry))) { spin_unlock(&pwq->pool->lock); return; } 1213 pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { 1213 trace_workqueue_activate_work(work); 1213 pwq->nr_active++; worklist = &pwq->pool->worklist; } else { 5 work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; } 1213 insert_work(pwq, work, worklist, work_flags); 1213 spin_unlock(&pwq->pool->lock); } /** * queue_work_on - queue work on specific cpu * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it * can't go away. * * Return: %false if @work was already on a queue, %true otherwise. 
*/ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { bool ret = false; unsigned long flags; 762 local_irq_save(flags); 707 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); ret = true; } 762 local_irq_restore(flags); 762 return ret; } EXPORT_SYMBOL(queue_work_on); void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); } EXPORT_SYMBOL(delayed_work_timer_fn); static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { 747 struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); 747 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); 747 WARN_ON_ONCE(timer_pending(timer)); 747 WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ 747 if (!delay) { 658 __queue_work(cpu, wq, &dwork->work); return; } 143 timer_stats_timer_set_start_info(&dwork->timer); 143 dwork->wq = wq; dwork->cpu = cpu; timer->expires = jiffies + delay; if (unlikely(cpu != WORK_CPU_UNBOUND)) add_timer_on(timer, cpu); else 747 add_timer(timer); } /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. 
*/ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { struct work_struct *work = &dwork->work; bool ret = false; unsigned long flags; /* read the comment in __queue_work() */ 152 local_irq_save(flags); 137 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_delayed_work(cpu, wq, dwork, delay); ret = true; } 152 local_irq_restore(flags); 152 return ret; } EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. * See try_to_grab_pending() for details. */ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) 673 { unsigned long flags; int ret; do { 673 ret = try_to_grab_pending(&dwork->work, true, &flags); } while (unlikely(ret == -EAGAIN)); 673 if (likely(ret >= 0)) { 673 __queue_delayed_work(cpu, wq, dwork, delay); 673 local_irq_restore(flags); } /* -ENOENT from try_to_grab_pending() becomes %true */ 673 return ret; } EXPORT_SYMBOL_GPL(mod_delayed_work_on); /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * * @worker is entering idle state. Update stats and idle timer if * necessary. * * LOCKING: * spin_lock_irq(pool->lock). 
*/ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || WARN_ON_ONCE(!list_empty(&worker->entry) && (worker->hentry.next || worker->hentry.pprev))) return; /* can't use worker_set_flags(), also called from create_worker() */ worker->flags |= WORKER_IDLE; pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* * Sanity check nr_running. Because wq_unbind_fn() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && atomic_read(&pool->nr_running)); } /** * worker_leave_idle - leave idle state * @worker: worker which is leaving idle state * * @worker is leaving idle state. Update stats. * * LOCKING: * spin_lock_irq(pool->lock). */ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } static struct worker *alloc_worker(int node) { struct worker *worker; 24 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); if (worker) { 24 INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } 24 return worker; } /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached * @pool: the target pool * * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and * cpu-binding of @worker are kept coordinated with the pool across * cpu-[un]hotplugs. 
*/ static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { mutex_lock(&pool->attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains * stable across this function. See the comments above the * flag definition for details. */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; list_add_tail(&worker->node, &pool->workers); mutex_unlock(&pool->attach_mutex); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * @pool: the pool @worker is attached to * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker, struct worker_pool *pool) { struct completion *detach_completion = NULL; mutex_lock(&pool->attach_mutex); list_del(&worker->node); if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; mutex_unlock(&pool->attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); if (detach_completion) complete(detach_completion); } /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * * Create and start a new worker which is attached to @pool. * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * * Return: * Pointer to the newly created worker. 
*/
static struct worker *create_worker(struct worker_pool *pool)
{
	struct worker *worker = NULL;
	char id_buf[16];
	int id;

	/* the kthread name is built from the ID, so get that first */
	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
	if (id < 0)
		return NULL;

	worker = alloc_worker(pool->node);
	if (!worker)
		goto fail;

	worker->pool = pool;
	worker->id = id;

	if (pool->cpu < 0)
		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
	else
		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
			 pool->attrs->nice < 0  ? "H" : "");

	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
					      "kworker/%s", id_buf);
	if (IS_ERR(worker->task))
		goto fail;

	set_user_nice(worker->task, pool->attrs->nice);
	kthread_bind_mask(worker->task, pool->attrs->cpumask);

	/* the kthread exists, hook the worker up to the pool */
	worker_attach_to_pool(worker, pool);

	/* account for the new worker, park it as idle and kick it off */
	spin_lock_irq(&pool->lock);
	pool->nr_workers++;
	worker_enter_idle(worker);
	wake_up_process(worker->task);
	spin_unlock_irq(&pool->lock);

	return worker;

fail:
	/* only reached with a valid @id; @worker may still be NULL here */
	ida_simple_remove(&pool->worker_ida, id);
	kfree(worker);
	return NULL;
}

/**
 * destroy_worker - destroy a workqueue worker
 * @worker: worker to be destroyed
 *
 * Destroy @worker and adjust @pool stats accordingly.  The worker should
 * be idle.
 *
 * CONTEXT:
 * spin_lock_irq(pool->lock).
*/ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled)) || WARN_ON(!(worker->flags & WORKER_IDLE))) return; pool->nr_workers--; pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; wake_up_process(worker->task); } static void idle_worker_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; spin_lock_irq(&pool->lock); while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } destroy_worker(worker); } spin_unlock_irq(&pool->lock); } static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { /* * If @pwq is for an unbound wq, its base ref may be put at * any time due to an attribute change. Pin @pwq until the * rescuer is done with it. */ get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); } } static void pool_mayday_timeout(unsigned long __pool) { struct worker_pool *pool = (void *)__pool; struct work_struct *work; spin_lock_irq(&pool->lock); spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. 
*/ list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock(&wq_mayday_lock); spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary * @pool: pool to create a new worker for * * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be %false and * may_start_working() %true. * * LOCKING: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. */ static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { restart: spin_unlock_irq(&pool->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { if (create_worker(pool) || !need_to_create_worker(pool)) break; schedule_timeout_interruptible(CREATE_COOLDOWN); if (!need_to_create_worker(pool)) break; } del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully * created as @pool->lock was dropped and the new worker might have * already become busy. */ if (need_to_create_worker(pool)) goto restart; } /** * manage_workers - manage worker pool * @worker: self * * Assume the manager role and manage the worker pool @worker belongs * to. At any given time, there can be only zero or one manager per * pool. The exclusion is handled automatically by this function. * * The caller can safely start processing works on false return. 
On * true return, it's guaranteed that need_to_create_worker() is false * and may_start_working() is true. * * CONTEXT: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * * Return: * %false if the pool doesn't need management and the caller can safely * start processing works, %true if management function was performed and * the conditions that the caller verified before calling the function may * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; if (pool->flags & POOL_MANAGER_ACTIVE) return false; pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; pool->flags &= ~POOL_MANAGER_ACTIVE; wake_up(&wq_manager_wait); return true; } /** * process_one_work - process single work * @worker: self * @work: work to process * * Process @work. This function contains all the logics necessary to * process a single work including synchronization against and * interaction with other workers on the same cpu, queueing and * flushing. As long as context requirement is met, any worker can * call this function to process a work. * * CONTEXT: * spin_lock_irq(pool->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) __releases(&pool->lock) __acquires(&pool->lock) { struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; int work_color; struct worker *collision; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from * inside the function that is called from it, this we need to * take into account for lockdep too. To avoid bogus "held * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. 
*/ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* ensure we're on the correct CPU */ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is * already processing the work. If so, defer the work to the * currently executing one. */ collision = find_worker_executing_work(pool, work); if (unlikely(collision)) { move_linked_works(work, &collision->scheduled, NULL); return; } /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; work_color = get_work_color(work); list_del_init(&work->entry); /* * CPU intensive works don't participate in concurrency management. * They're the scheduler's responsibility. This takes @worker out * of concurrency management and the next code block will chain * execution of the pending work items. */ if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* * Wake up another worker if necessary. The condition is always * false for normal per-cpu workers since nr_running would always * be >= 1 at this point. This is used to chain execution of the * pending work items for WORKER_NOT_RUNNING workers such as the * UNBOUND and CPU_INTENSIVE ones. */ if (need_more_worker(pool)) wake_up_worker(pool); /* * Record the last pool and clear PENDING which should be the last * update to @work. Also, do this inside @pool->lock so that * PENDING and queued state changes happen together while IRQ is * disabled. 
*/ set_work_pool_and_clear_pending(work, pool->id); spin_unlock_irq(&pool->lock); lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); worker->current_func(work); /* * While we must be careful to not use "work" after this, the trace * point will only record its address. */ trace_workqueue_execute_end(work); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" " last function: %pf\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); dump_stack(); } /* * The following prevents a kworker from hogging CPU on !PREEMPT * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ cond_resched_rcu_qs(); spin_lock_irq(&pool->lock); /* clear cpu intensive status */ if (unlikely(cpu_intensive)) worker_clr_flags(worker, WORKER_CPU_INTENSIVE); /* we're done with it, release */ hash_del(&worker->hentry); worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } /** * process_scheduled_works - process scheduled works * @worker: self * * Process all scheduled works. Please note that the scheduled list * may change while processing a work, so this function repeatedly * fetches a work from the top and executes it. * * CONTEXT: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. 
*/ static void process_scheduled_works(struct worker *worker) { while (!list_empty(&worker->scheduled)) { struct work_struct *work = list_first_entry(&worker->scheduled, struct work_struct, entry); process_one_work(worker, work); } } /** * worker_thread - the worker thread function * @__worker: self * * The worker thread function. All workers belong to a worker_pool - * either a per-cpu one or dynamic unbound one. These workers process all * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). * * Return: 0 */ static int worker_thread(void *__worker) { struct worker *worker = __worker; struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: spin_lock_irq(&pool->lock); /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); worker->task->flags &= ~PF_WQ_WORKER; set_task_comm(worker->task, "kworker/dying"); ida_simple_remove(&pool->worker_ida, worker->id); worker_detach_from_pool(worker, pool); kfree(worker); return 0; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* * ->scheduled list can only be filled while a worker is * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * Finish PREP stage. We're guaranteed to have at least one idle * worker or that someone else has already assumed the manager * role. This is where @worker starts participating in concurrency * management if applicable and concurrency management is restored * after being rebound. See rebind_workers() for details. 
*/ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); if (unlikely(!list_empty(&worker->scheduled))) process_scheduled_works(worker); } else { move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); sleep: /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding * pool->lock or from local cpu, so setting the current state * before releasing pool->lock is enough to prevent losing any * event. */ worker_enter_idle(worker); __set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&pool->lock); schedule(); goto woke_up; } /** * rescuer_thread - the rescuer thread function * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of * developing into deadlock if some works currently on the same queue * need to be processed to satisfy the GFP_KERNEL allocation. This is * the problem rescuer solves. * * When such condition is possible, the pool summons rescuers of all * workqueues which have works queued on the pool and let them process * those works so that forward progress can be guaranteed. * * This should happen rarely. * * Return: 0 */ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); /* * Mark rescuer as worker too. 
As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ rescuer->task->flags |= PF_WQ_WORKER; repeat: set_current_state(TASK_INTERRUPTIBLE); /* * By the time the rescuer is requested to stop, the workqueue * shouldn't have any work pending, but @wq->maydays may still have * pwq(s) queued. This can happen by non-rescuer workers consuming * all the work items before the rescuer got to them. Go through * @wq->maydays processing before acting on should_stop so that the * list is always empty on exit. */ should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); spin_unlock_irq(&wq_mayday_lock); worker_attach_to_pool(rescuer, pool); spin_lock_irq(&pool->lock); rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); if (!list_empty(scheduled)) { process_scheduled_works(rescuer); /* * The above execution of rescued work items could * have created more to rescue through * pwq_activate_first_delayed() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't * incur MAYDAY_INTERVAL delay inbetween. */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); get_pwq(pwq); list_move_tail(&pwq->mayday_node, &wq->maydays); spin_unlock(&wq_mayday_lock); } } /* * Put the reference grabbed by send_mayday(). @pool won't * go away while we're still attached to it. */ put_pwq(pwq); /* * Leave this pool. 
If need_more_worker() is %true, notify a * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ if (need_more_worker(pool)) wake_up_worker(pool); rescuer->pool = NULL; spin_unlock_irq(&pool->lock); worker_detach_from_pool(rescuer, pool); spin_lock_irq(&wq_mayday_lock); } spin_unlock_irq(&wq_mayday_lock); if (should_stop) { __set_current_state(TASK_RUNNING); rescuer->task->flags &= ~PF_WQ_WORKER; return 0; } /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); goto repeat; } struct wq_barrier { struct work_struct work; struct completion done; struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) { struct wq_barrier *barr = container_of(work, struct wq_barrier, work); complete(&barr->done); } /** * insert_wq_barrier - insert a barrier work * @pwq: pwq to insert barrier into * @barr: wq_barrier to insert * @target: target work to attach @barr to * @worker: worker currently executing @target, NULL if @target is not executing * * @barr is linked to @target such that @barr is completed only after * @target finishes execution. Please note that the ordering * guarantee is observed only with respect to @target and on the local * cpu. * * Currently, a queued barrier can't be canceled. This is because * try_to_grab_pending() can't determine whether the work to be * grabbed is at the head of the queue and thus can't clear LINKED * flag of the previous work while there must be a valid next work * after a work with LINKED flag set. * * Note that when @worker is non-NULL, @target may be modified * underneath us, so we can't reliably determine pwq from @target. * * CONTEXT: * spin_lock_irq(pool->lock). 
*/
static void insert_wq_barrier(struct pool_workqueue *pwq,
			      struct wq_barrier *barr,
			      struct work_struct *target, struct worker *worker)
{
	struct list_head *insert_pos;
	unsigned int inherited = 0;

	/*
	 * Calling into debugobjects with pool->lock held is fine here:
	 * none of its checks can trigger, so there is no call back into
	 * the fixup functions where a deadlock would be possible.
	 */
	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
	init_completion(&barr->done);
	barr->task = current;

	if (!worker) {
		/* @target is not executing, queue the barrier right behind it */
		unsigned long *target_bits = work_data_bits(target);

		insert_pos = target->entry.next;
		/* other works may be linked already: inherit, then set */
		inherited = *target_bits & WORK_STRUCT_LINKED;
		__set_bit(WORK_STRUCT_LINKED_BIT, target_bits);
	} else {
		/* @target is executing, schedule the barrier to its worker */
		insert_pos = worker->scheduled.next;
	}

	debug_work_activate(&barr->work);
	insert_work(pwq, &barr->work, insert_pos,
		    work_color_to_flags(WORK_NO_COLOR) | inherited);
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare pwqs for workqueue flushing.
 *
 * If @flush_color is non-negative, flush_color on all pwqs should be
 * -1.  If no pwq has in-flight commands at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
 * has in flight commands, its pwq->flush_color is set to
 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
* * If @work_color is non-negative, all pwqs should have the same * work_color which is previous to @work_color and all will be * advanced to @work_color. * * CONTEXT: * mutex_lock(wq->mutex). * * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) 93 { bool wait = false; struct pool_workqueue *pwq; if (flush_color >= 0) { 93 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); 93 atomic_set(&wq->nr_pwqs_to_flush, 1); } 93 for_each_pwq(pwq, wq) { 93 struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); if (flush_color >= 0) { 93 WARN_ON_ONCE(pwq->flush_color != -1); 93 if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; atomic_inc(&wq->nr_pwqs_to_flush); wait = true; } } 93 if (work_color >= 0) { 93 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); 93 pwq->work_color = work_color; } 93 spin_unlock_irq(&pool->lock); } 93 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) 93 complete(&wq->first_flusher->done); 93 return wait; } /** * flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ void flush_workqueue(struct workqueue_struct *wq) { 93 struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), }; int next_color; 93 lock_map_acquire(&wq->lockdep_map); 93 lock_map_release(&wq->lockdep_map); mutex_lock(&wq->mutex); /* * Start-to-wait phase */ next_color = work_next_color(wq->work_color); if (next_color != wq->flush_color) { /* * Color space is not full. The current work_color * becomes our flush_color and work_color is advanced * by one. 
*/ 93 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); 93 this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ 93 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); 93 wq->first_flusher = &this_flusher; if (!flush_workqueue_prep_pwqs(wq, wq->flush_color, wq->work_color)) { /* nothing to flush, done */ 93 wq->flush_color = next_color; wq->first_flusher = NULL; goto out_unlock; } } else { /* wait in queue */ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } } else { /* * Oops, color space is full, wait on overflow queue. * The next flush completion will assign us * flush_color and transfer to flusher_queue. */ list_add_tail(&this_flusher.list, &wq->flusher_overflow); } mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); /* * Wake-up-and-cascade phase * * First flushers are responsible for cascading flushes and * handling overflow. Non-first flushers can simply return. 
*/ if (wq->first_flusher != &this_flusher) 93 return; mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) goto out_unlock; wq->first_flusher = NULL; WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; /* complete all the flushers sharing the current flush color */ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color) break; list_del_init(&next->list); complete(&next->done); } WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); /* one color has been freed, handle overflow queue */ if (!list_empty(&wq->flusher_overflow)) { /* * Assign the same color to all overflowed * flushers, advance work_color and append to * flusher_queue. This is the start-to-wait * phase for these overflowed flushers. */ list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color; wq->work_color = work_next_color(wq->work_color); list_splice_tail_init(&wq->flusher_overflow, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } if (list_empty(&wq->flusher_queue)) { WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } /* * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1)) break; /* * Meh... this color is already done, clear first * flusher and repeat cascading. 
*/ wq->first_flusher = NULL; } out_unlock: 93 mutex_unlock(&wq->mutex); } EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue * @wq: workqueue to drain * * Wait until the workqueue becomes empty. While draining is in progress, * only chain queueing is allowed. IOW, only currently pending or running * work items on @wq can queue further work items on it. @wq is flushed * repeatedly until it becomes empty. The number of flushing is determined * by the depth of chaining and should be relatively short. Whine if it * takes too long. */ void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ 32 mutex_lock(&wq->mutex); if (!wq->nr_drainers++) 32 wq->flags |= __WQ_DRAINING; 32 mutex_unlock(&wq->mutex); reflush: 32 flush_workqueue(wq); mutex_lock(&wq->mutex); 32 for_each_pwq(pwq, wq) { bool drained; 32 spin_lock_irq(&pwq->pool->lock); 32 drained = !pwq->nr_active && list_empty(&pwq->delayed_works); 32 spin_unlock_irq(&pwq->pool->lock); if (drained) continue; if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); mutex_unlock(&wq->mutex); goto reflush; } 32 if (!--wq->nr_drainers) 32 wq->flags &= ~__WQ_DRAINING; 32 mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct worker_pool *pool; struct pool_workqueue *pwq; might_sleep(); local_irq_disable(); pool = get_work_pool(work); if (!pool) { 112 local_irq_enable(); return false; } 90 spin_lock(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ 75 pwq = get_work_pwq(work); if (pwq) { if 
(unlikely(pwq->pool != pool)) goto already_gone; } else { 47 worker = find_worker_executing_work(pool, work); if (!worker) goto already_gone; 22 pwq = worker->current_pwq; } 86 insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); /* * If @max_active is 1 or rescuer is in use, flushing another work * item on the same workqueue may lead to deadlock. Make sure the * flusher is not running on the same workqueue by verifying write * access. */ 86 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) 32 lock_map_acquire(&pwq->wq->lockdep_map); else 54 lock_map_acquire_read(&pwq->wq->lockdep_map); 86 lock_map_release(&pwq->wq->lockdep_map); return true; already_gone: 34 spin_unlock_irq(&pool->lock); return false; } /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush * * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. 
*/ bool flush_work(struct work_struct *work) { struct wq_barrier barr; 169 lock_map_acquire(&work->lockdep_map); 169 lock_map_release(&work->lockdep_map); 169 if (start_flush_work(work, &barr)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); 169 return true; } else { return false; } } EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { wait_queue_t wait; struct work_struct *work; }; static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); if (cwait->work != key) return 0; return autoremove_wake_function(wait, mode, sync, key); } static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) 67 { static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { 67 ret = try_to_grab_pending(work, is_dwork, &flags); /* * If someone else is already canceling, wait for it to * finish. flush_work() doesn't work for PREEMPT_NONE * because we may get scheduled between @work's completion * and the other canceling task resuming and clearing * CANCELING - flush_work() will return false immediately * as @work is no longer busy, try_to_grab_pending() will * return -ENOENT as @work is still being canceled and the * other canceling task won't be able to clear CANCELING as * we're hogging the CPU. * * Let's wait for completion using a waitqueue. As this * may lead to the thundering herd problem, use a custom * wake function which matches @work along with exclusive * wait and wakeup. 
*/ if (unlikely(ret == -ENOENT)) { struct cwt_wait cwait; init_wait(&cwait.wait); cwait.wait.func = cwt_wakefn; cwait.work = work; prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, TASK_UNINTERRUPTIBLE); if (work_is_canceling(work)) schedule(); finish_wait(&cancel_waitq, &cwait.wait); } 67 } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ 67 mark_work_canceling(work); 67 local_irq_restore(flags); 67 flush_work(work); 67 clear_work_data(work); /* * Paired with prepare_to_wait() above so that either * waitqueue_active() is visible here or !work_is_canceling() is * visible there. */ smp_mb(); if (waitqueue_active(&cancel_waitq)) __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); 67 return ret; } /** * cancel_work_sync - cancel a work and wait for it to finish * @work: the work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself or migrates to * another workqueue. On return from this function, @work is * guaranteed to be not pending or executing on any CPU. * * cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use cancel_delayed_work_sync() instead. * * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. * * Return: * %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) { 35 return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); /** * flush_delayed_work - wait for a dwork to finish executing the last queueing * @dwork: the delayed work to flush * * Delayed timer is cancelled and the pending work is queued for * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. 
*/ bool flush_delayed_work(struct delayed_work *dwork) { 32 local_irq_disable(); if (del_timer_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); 32 local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * * Kill off a pending delayed_work. * * Return: %true if @dwork was pending and canceled; %false if it wasn't * pending. * * Note: * The work callback function may still be running on return, unless * it returns %true and the work doesn't re-arm itself. Explicitly flush or * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ bool cancel_delayed_work(struct delayed_work *dwork) 36 { unsigned long flags; int ret; do { 36 ret = try_to_grab_pending(&dwork->work, true, &flags); } while (unlikely(ret == -EAGAIN)); 36 if (unlikely(ret < 0)) return false; 36 set_work_pool_and_clear_pending(&dwork->work, get_work_pool_id(&dwork->work)); 36 local_irq_restore(flags); 36 return ret; } EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish * @dwork: the delayed work cancel * * This is cancel_work_sync() for delayed works. * * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { 32 return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * * schedule_on_each_cpu() executes @func on each online CPU using the * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * * Return: * 0 on success, -errno on failure. 
*/ int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); put_online_cpus(); free_percpu(works); return 0; } /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { fn(&ew->work); return 0; } INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; } EXPORT_SYMBOL_GPL(execute_in_process_context); /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free * * Undo alloc_workqueue_attrs(). */ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); kfree(attrs); } } /** * alloc_workqueue_attrs - allocate a workqueue_attrs * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. 
*/ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { struct workqueue_attrs *attrs; attrs = kzalloc(sizeof(*attrs), gfp_mask); if (!attrs) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); return attrs; fail: free_workqueue_attrs(attrs); return NULL; } static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); /* * Unlike hash and equality test, this function doesn't ignore * ->no_numa as it is used for both pool and wq attrs. Instead, * get_unbound_pool() explicitly clears ->no_numa after copying. */ to->no_numa = from->no_numa; } /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } /* content equality test */ static bool wqattrs_equal(const struct workqueue_attrs *a, const struct workqueue_attrs *b) { if (a->nice != b->nice) return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; return true; } /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. * * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. 
*/ static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); init_timer_deferrable(&pool->idle_timer); pool->idle_timer.function = idle_worker_timeout; pool->idle_timer.data = (unsigned long)pool; setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pool->attrs) return -ENOMEM; return 0; } static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_pwqs); else free_workqueue_attrs(wq->unbound_attrs); kfree(wq->rescuer); kfree(wq); } static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). * * Should be called with wq_pool_mutex held. 
*/
static void put_unbound_pool(struct worker_pool *pool)
{
	/* on-stack completion handed to @pool below if workers remain attached */
	DECLARE_COMPLETION_ONSTACK(detach_completion);
	struct worker *worker;

	lockdep_assert_held(&wq_pool_mutex);

	/* other references remain, nothing to tear down yet */
	if (--pool->refcnt)
		return;

	/* sanity checks: bail out unless cpu < 0 and the worklist is empty */
	if (WARN_ON(!(pool->cpu < 0)) ||
	    WARN_ON(!list_empty(&pool->worklist)))
		return;

	/* release id and unhash; id stays negative if none was ever assigned */
	if (pool->id >= 0)
		idr_remove(&worker_pool_idr, pool->id);
	hash_del(&pool->hash_node);

	/*
	 * Become the manager and destroy all workers.  This prevents
	 * @pool's workers from blocking on attach_mutex.  We're the last
	 * manager and @pool gets freed with the flag set.
	 */
	spin_lock_irq(&pool->lock);
	/* wait with pool->lock until POOL_MANAGER_ACTIVE is clear, then take it */
	wait_event_lock_irq(wq_manager_wait,
			    !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
	pool->flags |= POOL_MANAGER_ACTIVE;

	/* destroy idle workers one at a time until none is left */
	while ((worker = first_idle_worker(pool)))
		destroy_worker(worker);
	WARN_ON(pool->nr_workers || pool->nr_idle);
	spin_unlock_irq(&pool->lock);

	/*
	 * Workers still on ->workers: publish the completion under
	 * attach_mutex.  NOTE(review): presumably completed by the last
	 * worker detaching from @pool - that code is outside this chunk.
	 */
	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);

	/* shut down the timers */
	del_timer_sync(&pool->idle_timer);
	del_timer_sync(&pool->mayday_timer);

	/* sched-RCU protected to allow dereferences from get_work_pool() */
	call_rcu_sched(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
*/ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int node; int target_node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; return pool; } } /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { if (cpumask_subset(attrs->cpumask, wq_numa_possible_cpumask[node])) { target_node = node; break; } } } /* nope, create a new one */ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); if (!pool || init_worker_pool(pool) < 0) goto fail; lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); pool->node = target_node; /* * no_numa isn't a worker_pool attribute, always clear it. See * 'struct workqueue_attrs' comments for detail. */ pool->attrs->no_numa = false; if (worker_pool_assign_id(pool) < 0) goto fail; /* create and start the initial worker */ if (!create_worker(pool)) goto fail; /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); return pool; fail: if (pool) put_unbound_pool(pool); return NULL; } static void rcu_free_pwq(struct rcu_head *rcu) { kmem_cache_free(pwq_cache, container_of(rcu, struct pool_workqueue, rcu)); } /* * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt * and needs to be destroyed. 
*/ static void pwq_unbound_release_workfn(struct work_struct *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, unbound_release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last; if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); mutex_unlock(&wq_pool_mutex); call_rcu_sched(&pwq->rcu, rcu_free_pwq); /* * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ if (is_last) call_rcu_sched(&wq->rcu, rcu_free_wq); } /** * pwq_adjust_max_active - update a pwq's max_active to the current setting * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated * workqueue's saved_max_active and activate delayed work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { 24 struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; /* for @wq->saved_max_active */ 24 lockdep_assert_held(&wq->mutex); /* fast exit for non-freezable wqs */ 24 if (!freezable && pwq->max_active == wq->saved_max_active) return; 24 spin_lock_irq(&pwq->pool->lock); /* * During [un]freezing, the caller is responsible for ensuring that * this function is called at least once after @workqueue_freezing * is updated and visible. */ if (!freezable || !workqueue_freezing) { 24 pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && pwq->nr_active < pwq->max_active) pwq_activate_first_delayed(pwq); /* * Need to kick a worker after thawed or an unbound wq's * max_active is bumped. It's a slow path. Do it always. 
*/ 24 wake_up_worker(pwq->pool); } else { pwq->max_active = 0; } 24 spin_unlock_irq(&pwq->pool->lock); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { 24 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); 24 memset(pwq, 0, sizeof(*pwq)); pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ static void link_pwq(struct pool_workqueue *pwq) { 24 struct workqueue_struct *wq = pwq->wq; 24 lockdep_assert_held(&wq->mutex); /* may be called multiple times, ignore if already linked */ 24 if (!list_empty(&pwq->pwqs_node)) return; /* set the matching work_color */ 24 pwq->work_color = wq->work_color; /* sync max_active to the current setting */ pwq_adjust_max_active(pwq); /* link in @pwq */ 24 list_add_rcu(&pwq->pwqs_node, &wq->pwqs); } /* obtain a pool matching @attr and create a pwq associating the pool and @wq */ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct worker_pool *pool; struct pool_workqueue *pwq; lockdep_assert_held(&wq_pool_mutex); pool = get_unbound_pool(attrs); if (!pool) return NULL; pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; } init_pwq(pwq, wq, pool); return pwq; } /** * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node * @attrs: the wq_attrs of the default pwq of the target workqueue * @node: the target NUMA node * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask * * Calculate the cpumask a workqueue with @attrs should use on @node. 
If * @cpu_going_down is >= 0, that cpu is considered offline during * calculation. The result is stored in @cpumask. * * If NUMA affinity is not enabled, @attrs->cpumask is always used. If * enabled and @node has online CPUs requested by @attrs, the returned * cpumask is the intersection of the possible CPUs of @node and * @attrs->cpumask. * * The caller is responsible for ensuring that the cpumask of @node stays * stable. * * Return: %true if the resulting @cpumask is different from @attrs->cpumask, * %false if equal. */ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); if (cpu_going_down >= 0) cpumask_clear_cpu(cpu_going_down, cpumask); if (cpumask_empty(cpumask)) goto use_dfl; /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: cpumask_copy(cpumask, attrs->cpumask); return false; } /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, int node, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; lockdep_assert_held(&wq_pool_mutex); lockdep_assert_held(&wq->mutex); /* link_pwq() can handle duplicate calls */ link_pwq(pwq); old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); return old_pwq; } /* context to store the prepared attrs & pwqs before applying */ struct apply_wqattrs_ctx { struct workqueue_struct *wq; /* target workqueue */ struct workqueue_attrs *attrs; /* attrs to apply */ struct list_head list; /* queued for batching commit */ struct pool_workqueue *dfl_pwq; struct pool_workqueue *pwq_tbl[]; }; /* free the resources after 
success or abort */ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { int node; for_each_node(node) put_pwq_unlocked(ctx->pwq_tbl[node]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx); } } /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; int node; lockdep_assert_held(&wq_pool_mutex); ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!ctx || !new_attrs || !tmp_attrs) goto out_free; /* * Calculate the attrs of the default pwq. * If the user configured cpumask doesn't overlap with the * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); if (unlikely(cpumask_empty(new_attrs->cpumask))) cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); /* * We may create multiple pwqs with differing cpumasks. Make a * copy of @new_attrs which will be modified and used to obtain * pools. */ copy_workqueue_attrs(tmp_attrs, new_attrs); /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; for_each_node(node) { if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); if (!ctx->pwq_tbl[node]) goto out_free; } else { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[node] = ctx->dfl_pwq; } } /* save the user configured attrs and sanitize it. 
*/ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); ctx->attrs = new_attrs; ctx->wq = wq; free_workqueue_attrs(tmp_attrs); return ctx; out_free: free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return NULL; } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { int node; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ for_each_node(node) ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, ctx->pwq_tbl[node]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); swap(ctx->wq->dfl_pwq, ctx->dfl_pwq); mutex_unlock(&ctx->wq->mutex); } static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ get_online_cpus(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); put_online_cpus(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ if (!list_empty(&wq->pwqs)) { if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->flags &= ~__WQ_ORDERED; } ctx = apply_wqattrs_prepare(wq, attrs); /* the ctx has been prepared successfully, let's commit it */ if (ctx) { apply_wqattrs_commit(ctx); ret = 0; } apply_wqattrs_cleanup(ctx); return ret; } /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() 
* * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA * machines, this function maps a separate pwq to each NUMA node with * possibles CPUs in @attrs->cpumask so that work items are affine to the * NUMA node it was issued on. Older pwqs are released as in-flight work * items finish. Note that a work item which repeatedly requeues itself * back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; apply_wqattrs_lock(); ret = apply_workqueue_attrs_locked(wq, attrs); apply_wqattrs_unlock(); return ret; } /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of * @wq accordingly. * * If NUMA affinity can't be adjusted due to memory allocation failure, it * falls back to @wq->dfl_pwq which may not be optimal but is always * correct. * * Note that when the last allowed CPU of a NUMA node goes offline for a * workqueue with a cpumask spanning multiple nodes, the workers which were * already executing the work items for the workqueue will lose their CPU * affinity and may execute on any CPU. This is similar to how per-cpu * workqueues behave on CPU_DOWN. If a workqueue user wants strict * affinity, it's the user's responsibility to flush the work item from * CPU_DOWN_PREPARE. */ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, bool online) { int node = cpu_to_node(cpu); int cpu_off = online ? 
-1 : cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; cpumask_t *cpumask; lockdep_assert_held(&wq_pool_mutex); if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->no_numa) return; /* * We don't wanna alloc/free wq_attrs for each wq for each CPU. * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ target_attrs = wq_update_unbound_numa_attrs_buf; cpumask = target_attrs->cpumask; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); /* * Let's determine what needs to be done. If the target cpumask is * different from the default pwq's, we need to compare it to @pwq's * and create a new one if they don't match. If the target cpumask * equals the default pwq's, the default pwq should be used. */ if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) return; } else { goto use_dfl_pwq; } /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. 
*/ mutex_lock(&wq->mutex); old_pwq = numa_pwq_tbl_install(wq, node, pwq); goto out_unlock; use_dfl_pwq: mutex_lock(&wq->mutex); spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); spin_unlock_irq(&wq->dfl_pwq->pool->lock); old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { 24 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); if (!wq->cpu_pwqs) return -ENOMEM; 24 for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = 24 per_cpu_ptr(wq->cpu_pwqs, cpu); struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); 24 init_pwq(pwq, wq, &cpu_pools[highpri]); mutex_lock(&wq->mutex); link_pwq(pwq); mutex_unlock(&wq->mutex); } return 0; } else if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { 24 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; 24 if (max_active < 1 || max_active > lim) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", max_active, name, 1, lim); 24 return clamp_val(max_active, 1, lim); } struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) 
{ size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* * Unbound && max_active == 1 used to imply ordered, which is no * longer the case on NUMA machines due to per-node pools. While * alloc_ordered_workqueue() is the right way to create an ordered * workqueue, keep the previous behavior to avoid subtle breakages * on NUMA. */ 24 if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; /* see the comment above the definition of WQ_POWER_EFFICIENT */ 24 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; /* allocate wq and format name */ if (flags & WQ_UNBOUND) tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); 24 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) return NULL; 24 if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!wq->unbound_attrs) goto err_free_wq; } 24 va_start(args, lock_name); vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); max_active = max_active ?: WQ_DFL_ACTIVE; 24 max_active = wq_clamp_max_active(max_active, flags, wq->name); /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); 24 if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; /* * Workqueues which may be used during memory reclaim should * have a rescuer to guarantee forward progress. 
*/ 24 if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; 24 rescuer = alloc_worker(NUMA_NO_NODE); if (!rescuer) goto err_destroy; 24 rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) { kfree(rescuer); goto err_destroy; } 24 wq->rescuer = rescuer; kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); } 24 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; /* * wq_pool_mutex protects global freeze state and workqueues list. * Grab it, adjust max_active and add the new @wq to workqueues * list. */ 24 mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); 24 for_each_pwq(pwq, wq) 24 pwq_adjust_max_active(pwq); 24 mutex_unlock(&wq->mutex); 24 list_add_tail_rcu(&wq->list, &workqueues); 24 mutex_unlock(&wq_pool_mutex); return wq; err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); 24 return NULL; err_destroy: destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. */ void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; int node; /* drain it before proceeding with destruction */ 32 drain_workqueue(wq); /* sanity checks */ mutex_lock(&wq->mutex); 32 for_each_pwq(pwq, wq) { int i; 32 for (i = 0; i < WORK_NR_COLORS; i++) { 32 if (WARN_ON(pwq->nr_in_flight[i])) { mutex_unlock(&wq->mutex); 32 return; } } 32 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || 32 WARN_ON(pwq->nr_active) || 32 WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); return; } } 32 mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. 
*/ mutex_lock(&wq_pool_mutex); 32 list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); 32 if (wq->rescuer) 32 kthread_stop(wq->rescuer->task); 32 if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. */ 32 call_rcu_sched(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly * access numa_pwq_tbl[] and dfl_pwq to put the base refs. * @wq will be freed when the last pwq is released. */ for_each_node(node) { pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); put_pwq_unlocked(pwq); } /* * Put dfl_pwq. @wq may be freed any time after dfl_pwq is * put. Don't access it afterwards. */ pwq = wq->dfl_pwq; wq->dfl_pwq = NULL; put_pwq_unlocked(pwq); } } EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue * @max_active: new max_active value. * * Set max_active of @wq to @max_active. * * CONTEXT: * Don't call from IRQ context. */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * current_work - retrieve %current task's work struct * * Determine if %current task is a workqueue worker and what it's working on. * Useful to find out the context that the %current task is running in. * * Return: work struct if %current task is a workqueue worker, %NULL otherwise. */ struct work_struct *current_work(void) { struct worker *worker = current_wq_worker(); return worker ? 
worker->current_work : NULL; } EXPORT_SYMBOL(current_work); /** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. * * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); return worker && worker->rescue_wq; } /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question * @wq: target workqueue * * Test whether @wq's cpu workqueue for @cpu is congested. There is * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. * Note that both per-cpu and unbound workqueues may be associated with * multiple pool_workqueues which have separate congested states. A * workqueue being congested on one CPU doesn't mean the workqueue is also * contested on other CPUs / NUMA nodes. * * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; bool ret; rcu_read_lock_sched(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); rcu_read_unlock_sched(); return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); /** * work_busy - test whether a work is currently pending or running * @work: the work to be tested * * Test whether @work is currently pending or running. There is no * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * * Return: * OR'd bitmask of WORK_BUSY_* bits. 
*/ unsigned int work_busy(struct work_struct *work) { struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; local_irq_save(flags); pool = get_work_pool(work); if (pool) { spin_lock(&pool->lock); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; spin_unlock(&pool->lock); } local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(work_busy); /** * set_worker_desc - set description for the current work item * @fmt: printf-style format string * @...: arguments for the format string * * This function can be called by a running work function to describe what * the work item is about. If the worker task gets dumped, this * information will be printed out together to help debugging. The * description can be at most WORKER_DESC_LEN including the trailing '\0'. */ void set_worker_desc(const char *fmt, ...) { struct worker *worker = current_wq_worker(); va_list args; if (worker) { va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); worker->desc_valid = true; } } /** * print_worker_info - print out worker information and description * @log_lvl: the log level to use when printing * @task: target task * * If @task is a worker and currently executing a work item, print out the * name of the workqueue being serviced and worker description set with * set_worker_desc() by the currently executing work item. * * This function can be safely called on any task as long as the * task_struct itself is accessible. While safe, this function isn't * synchronized and may print out mixups or garbages of limited length. 
*/ void print_worker_info(const char *log_lvl, struct task_struct *task) { 2 work_func_t *fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; bool desc_valid = false; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) 2 return; /* * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ worker = probe_kthread_data(task); /* * Carefully copy the associated workqueue's workfn and name. Keep * the original last '\0' in case the original contains garbage. */ probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); probe_kernel_read(name, wq->name, sizeof(name) - 1); /* copy worker description */ probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); if (desc_valid) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %pf", log_lvl, name, fn); if (desc[0]) pr_cont(" (%s)", desc); pr_cont("\n"); } } static void pr_cont_pool_info(struct worker_pool *pool) { pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); if (pool->node != NUMA_NO_NODE) pr_cont(" node=%d", pool->node); pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); } static void pr_cont_work(bool comma, struct work_struct *work) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { pr_cont("%s %pf", comma ? 
"," : "", work->func); } } static void show_pwq(struct pool_workqueue *pwq) { struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; bool has_in_flight = false, has_pending = false; int bkt; pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq == pwq) { has_in_flight = true; break; } } if (has_in_flight) { bool comma = false; pr_info(" in-flight:"); hash_for_each(pool->busy_hash, bkt, worker, hentry) { if (worker->current_pwq != pwq) continue; pr_cont("%s %d%s:%pf", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work); comma = true; } pr_cont("\n"); } list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { has_pending = true; break; } } if (has_pending) { bool comma = false; pr_info(" pending:"); list_for_each_entry(work, &pool->worklist, entry) { if (get_work_pwq(work) != pwq) continue; pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont("\n"); } if (!list_empty(&pwq->delayed_works)) { bool comma = false; pr_info(" delayed:"); list_for_each_entry(work, &pwq->delayed_works, entry) { pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } pr_cont("\n"); } } /** * show_workqueue_state - dump workqueue state * * Called from a sysrq handler and prints out all busy workqueues and * pools. 
*/
void show_workqueue_state(void)
{
	struct workqueue_struct *wq;
	struct worker_pool *pool;
	unsigned long flags;
	int pi;

	/* the whole dump runs inside one sched-RCU read-side section */
	rcu_read_lock_sched();

	pr_info("Showing busy workqueues and worker pools:\n");

	list_for_each_entry_rcu(wq, &workqueues, list) {
		struct pool_workqueue *pwq;
		bool idle = true;

		/* a workqueue is busy if any pwq has active or delayed works */
		for_each_pwq(pwq, wq) {
			if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
				idle = false;
				break;
			}
		}
		if (idle)
			continue;

		pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

		for_each_pwq(pwq, wq) {
			/* recheck under pool->lock before dumping the pwq */
			spin_lock_irqsave(&pwq->pool->lock, flags);
			if (pwq->nr_active || !list_empty(&pwq->delayed_works))
				show_pwq(pwq);
			spin_unlock_irqrestore(&pwq->pool->lock, flags);
		}
	}

	for_each_pool(pool, pi) {
		struct worker *worker;
		bool first = true;

		spin_lock_irqsave(&pool->lock, flags);
		/* skip pools whose workers are all idle */
		if (pool->nr_workers == pool->nr_idle)
			goto next_pool;

		pr_info("pool %d:", pool->id);
		pr_cont_pool_info(pool);
		pr_cont(" workers=%d", pool->nr_workers);
		if (pool->manager)
			pr_cont(" manager: %d",
				task_pid_nr(pool->manager->task));
		/* the "idle: " prefix is printed before the first idle pid only */
		list_for_each_entry(worker, &pool->idle_list, entry) {
			pr_cont(" %s%d", first ? "idle: " : "",
				task_pid_nr(worker->task));
			first = false;
		}
		pr_cont("\n");
	next_pool:
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	rcu_read_unlock_sched();
}

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug. Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths. Secondly,
 * worker pools serve mix of short, long and very long running works making
 * blocked draining impractical.
 *
 * This is solved by allowing the pools to be disassociated from the CPU
 * running as an unbound one and allowing it to be reattached later if the
 * cpu comes back online.
*/
/*
 * Work function which disassociates the per-cpu worker pools of the CPU
 * it runs on. workqueue_cpu_down_callback() queues it on the CPU that
 * is going down.
 */
static void wq_unbind_fn(struct work_struct *work)
{
	int cpu = smp_processor_id();
	struct worker_pool *pool;
	struct worker *worker;

	for_each_cpu_worker_pool(pool, cpu) {
		mutex_lock(&pool->attach_mutex);
		spin_lock_irq(&pool->lock);

		/*
		 * We've blocked all attach/detach operations. Make all workers
		 * unbound and set DISASSOCIATED. Before this, all workers
		 * except for the ones which are still executing works from
		 * before the last CPU down must be on the cpu. After
		 * this, they may become diasporas.
		 */
		for_each_pool_worker(worker, pool)
			worker->flags |= WORKER_UNBOUND;

		pool->flags |= POOL_DISASSOCIATED;

		spin_unlock_irq(&pool->lock);
		mutex_unlock(&pool->attach_mutex);

		/*
		 * Call schedule() so that we cross rq->lock and thus can
		 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
		 * This is necessary as scheduler callbacks may be invoked
		 * from other cpus.
		 */
		schedule();

		/*
		 * Sched callbacks are disabled now. Zap nr_running.
		 * After this, nr_running stays zero and need_more_worker()
		 * and keep_working() are always true as long as the
		 * worklist is not empty. This pool now behaves as an
		 * unbound (in terms of concurrency management) pool which
		 * are served by workers tied to the pool.
		 */
		atomic_set(&pool->nr_running, 0);

		/*
		 * With concurrency management just turned off, a busy
		 * worker blocking could lead to lengthy stalls. Kick off
		 * unbound chain execution of currently pending work items.
		 */
		spin_lock_irq(&pool->lock);
		wake_up_worker(pool);
		spin_unlock_irq(&pool->lock);
	}
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online. Rebind all workers to the CPU.
 *
 * The caller must hold @pool->attach_mutex.
 */
static void rebind_workers(struct worker_pool *pool)
{
	struct worker *worker;

	lockdep_assert_held(&pool->attach_mutex);

	/*
	 * Restore CPU affinity of all workers. As all idle workers should
	 * be on the run-queue of the associated CPU before any local
	 * wake-ups for concurrency management happen, restore CPU affinity
	 * of all workers first and then clear UNBOUND. As we're called
	 * from CPU_ONLINE, the following shouldn't fail.
	 */
	for_each_pool_worker(worker, pool)
		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
						  pool->attrs->cpumask) < 0);

	spin_lock_irq(&pool->lock);

	/*
	 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
	 * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
	 * being reworked and this can go away in time.
	 */
	if (!(pool->flags & POOL_DISASSOCIATED)) {
		spin_unlock_irq(&pool->lock);
		return;
	}

	pool->flags &= ~POOL_DISASSOCIATED;

	for_each_pool_worker(worker, pool) {
		/* work on a local copy and publish it with a single store */
		unsigned int worker_flags = worker->flags;

		/*
		 * A bound idle worker should actually be on the runqueue
		 * of the associated CPU for local wake-ups targeting it to
		 * work. Kick all idle workers so that they migrate to the
		 * associated CPU. Doing this in the same loop as
		 * replacing UNBOUND with REBOUND is safe as no worker will
		 * be bound before @pool->lock is released.
		 */
		if (worker_flags & WORKER_IDLE)
			wake_up_process(worker->task);

		/*
		 * We want to clear UNBOUND but can't directly call
		 * worker_clr_flags() or adjust nr_running. Atomically
		 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
		 * @worker will clear REBOUND using worker_clr_flags() when
		 * it initiates the next execution cycle thus restoring
		 * concurrency management. Note that when or whether
		 * @worker clears REBOUND doesn't affect correctness.
		 *
		 * ACCESS_ONCE() is necessary because @worker->flags may be
		 * tested without holding any lock in
		 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
		 * fail incorrectly leading to premature concurrency
		 * management operations.
		 */
		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
		worker_flags |= WORKER_REBOUND;
		worker_flags &= ~WORKER_UNBOUND;
		ACCESS_ONCE(worker->flags) = worker_flags;
	}

	spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs. When a worker of such pool get scheduled, the scheduler resets
 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
 * online CPU before, cpus_allowed of all its workers should be restored.
 *
 * The caller must hold @pool->attach_mutex.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
	/* scratch mask; static, so it does not live on the stack */
	static cpumask_t cpumask;
	struct worker *worker;

	lockdep_assert_held(&pool->attach_mutex);

	/* is @cpu allowed for @pool? */
	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
		return;

	/* is @cpu the only online CPU? */
	cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
	if (cpumask_weight(&cpumask) != 1)
		return;

	/* as we're called from CPU_ONLINE, the following shouldn't fail */
	for_each_pool_worker(worker, pool)
		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
						  pool->attrs->cpumask) < 0);
}

/*
 * Workqueues should be brought up before normal priority CPU notifiers.
 * This will be registered high priority CPU notifier.
*/
static int workqueue_cpu_up_callback(struct notifier_block *nfb,
					       unsigned long action,
					       void *hcpu)
{
	/* the notifier payload carries the CPU number */
	int cpu = (unsigned long)hcpu;
	struct worker_pool *pool;
	struct workqueue_struct *wq;
	int pi;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* make sure each per-cpu pool of @cpu has at least one worker */
		for_each_cpu_worker_pool(pool, cpu) {
			if (pool->nr_workers)
				continue;
			if (!create_worker(pool))
				return NOTIFY_BAD;
		}
		break;

	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		mutex_lock(&wq_pool_mutex);

		for_each_pool(pool, pi) {
			mutex_lock(&pool->attach_mutex);

			/* pool->cpu < 0 is treated as an unbound pool here */
			if (pool->cpu == cpu)
				rebind_workers(pool);
			else if (pool->cpu < 0)
				restore_unbound_workers_cpumask(pool, cpu);

			mutex_unlock(&pool->attach_mutex);
		}

		/* update NUMA affinity of unbound workqueues */
		list_for_each_entry(wq, &workqueues, list)
			wq_update_unbound_numa(wq, cpu, true);

		mutex_unlock(&wq_pool_mutex);
		break;
	}
	return NOTIFY_OK;
}

/*
 * Workqueues should be brought down after normal priority CPU notifiers.
 * This will be registered as low priority CPU notifier.
 */
static int workqueue_cpu_down_callback(struct notifier_block *nfb,
						 unsigned long action,
						 void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct work_struct unbind_work;
	struct workqueue_struct *wq;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* unbinding per-cpu workers should happen on the local CPU */
		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
		queue_work_on(cpu, system_highpri_wq, &unbind_work);

		/* update NUMA affinity of unbound workqueues */
		mutex_lock(&wq_pool_mutex);
		list_for_each_entry(wq, &workqueues, list)
			wq_update_unbound_numa(wq, cpu, false);
		mutex_unlock(&wq_pool_mutex);

		/* wait for per-cpu unbinding to finish */
		flush_work(&unbind_work);
		destroy_work_on_stack(&unbind_work);
		break;
	}
	return NOTIFY_OK;
}

#ifdef CONFIG_SMP

/* on-stack request used by work_on_cpu() to run @fn(@arg) and fetch @ret */
struct work_for_cpu {
	struct work_struct work;
	long (*fn)(void *);
	void *arg;
	long ret;
};

/* work function: call the requested function and store its return value */
static void work_for_cpu_fn(struct work_struct *work)
{
	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

	wfc->ret = wfc->fn(wfc->arg);
}

/**
* work_on_cpu - run a function in user context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
	struct work_for_cpu wfc = { .fn = fn, .arg = arg };

	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
	schedule_work_on(cpu, &wfc.work);
	/* wfc lives on this stack frame, so wait for the work to finish */
	flush_work(&wfc.work);
	destroy_work_on_stack(&wfc.work);
	return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Start freezing workqueues. After this function returns, all freezable
 * workqueues will queue new works to their delayed_works list instead of
 * pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	mutex_lock(&wq_pool_mutex);

	WARN_ON_ONCE(workqueue_freezing);
	workqueue_freezing = true;

	/* re-evaluate max_active of every pwq now that the flag is set */
	list_for_each_entry(wq, &workqueues, list) {
		mutex_lock(&wq->mutex);
		for_each_pwq(pwq, wq)
			pwq_adjust_max_active(pwq);
		mutex_unlock(&wq->mutex);
	}

	mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete. This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if some freezable workqueues are still busy. %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
	bool busy = false;
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	mutex_lock(&wq_pool_mutex);

	WARN_ON_ONCE(!workqueue_freezing);

	list_for_each_entry(wq, &workqueues, list) {
		if (!(wq->flags & WQ_FREEZABLE))
			continue;
		/*
		 * nr_active is monotonically decreasing. It's safe
		 * to peek without lock.
		 */
		rcu_read_lock_sched();
		for_each_pwq(pwq, wq) {
			WARN_ON_ONCE(pwq->nr_active < 0);
			if (pwq->nr_active) {
				busy = true;
				/* leave the RCU section before jumping out */
				rcu_read_unlock_sched();
				goto out_unlock;
			}
		}
		rcu_read_unlock_sched();
	}
out_unlock:
	mutex_unlock(&wq_pool_mutex);
	return busy;
}

/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues. Normal queueing is restored and all collected
 * frozen works are transferred to their respective pool worklists.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	mutex_lock(&wq_pool_mutex);

	/* nothing to do unless a freeze is in progress */
	if (!workqueue_freezing)
		goto out_unlock;

	workqueue_freezing = false;

	/* restore max_active and repopulate worklist */
	list_for_each_entry(wq, &workqueues, list) {
		mutex_lock(&wq->mutex);
		for_each_pwq(pwq, wq)
			pwq_adjust_max_active(pwq);
		mutex_unlock(&wq->mutex);
	}

out_unlock:
	mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

/*
 * Re-apply the attributes of every unbound, non-ordered workqueue.
 *
 * Works in two phases: first prepare a context for each workqueue, then
 * commit the contexts only if every preparation succeeded. All contexts
 * are cleaned up either way. The caller must hold wq_pool_mutex.
 *
 * Returns 0 on success or -ENOMEM if preparing a context failed.
 */
static int workqueue_apply_unbound_cpumask(void)
{
	LIST_HEAD(ctxs);
	int ret = 0;
	struct workqueue_struct *wq;
	struct apply_wqattrs_ctx *ctx, *n;

	lockdep_assert_held(&wq_pool_mutex);

	list_for_each_entry(wq, &workqueues, list) {
		if (!(wq->flags & WQ_UNBOUND))
			continue;
		/* creating multiple pwqs breaks ordering guarantee */
		if (wq->flags & __WQ_ORDERED)
			continue;

		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
		if (!ctx) {
			ret = -ENOMEM;
			break;
		}

		list_add_tail(&ctx->list, &ctxs);
	}

	/* commit only when every prepare succeeded; always clean up */
	list_for_each_entry_safe(ctx, n, &ctxs, list) {
		if (!ret)
			apply_wqattrs_commit(ctx);
		apply_wqattrs_cleanup(ctx);
	}

	return ret;
}

/**
 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 * @cpumask: the cpumask to set
 *
 * The low-level workqueues cpumask is a global cpumask that limits
 * the affinity of all unbound workqueues. This function checks the @cpumask,
 * applies it to all unbound workqueues and updates all pwqs of them.
* * Retun: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; cpumask_var_t saved_cpumask; if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); /* save the old wq_unbound_cpumask. */ cpumask_copy(saved_cpumask, wq_unbound_cpumask); /* update wq_unbound_cpumask at first and apply it to wqs. */ cpumask_copy(wq_unbound_cpumask, cpumask); ret = workqueue_apply_unbound_cpumask(); /* restore the wq_unbound_cpumask when failed. */ if (ret < 0) cpumask_copy(wq_unbound_cpumask, saved_cpumask); apply_wqattrs_unlock(); } free_cpumask_var(saved_cpumask); return ret; } #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * * per_cpu RO bool : whether the workqueue is per-cpu or unbound * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. 
*
 *  pool_ids	RO int	: the associated pool ID of each node
 *  nice	RW int	: nice value of the workers
 *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
 *  numa	RW bool	: whether NUMA affinity is enabled
 */
struct wq_device {
	struct workqueue_struct		*wq;
	struct device			dev;
};

/* map a sysfs device back to the workqueue it represents */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);

	return wq_dev->wq;
}

/* "per_cpu": prints 1 when WQ_UNBOUND is clear, 0 otherwise */
static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);

	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

/* "max_active" read: prints wq->saved_max_active */
static ssize_t max_active_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);

	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/* "max_active" write: accepts a positive decimal integer */
static ssize_t max_active_store(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	int val;

	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
		return -EINVAL;

	workqueue_set_max_active(wq, val);
	return count;
}
static DEVICE_ATTR_RW(max_active);

/* attributes present on every sysfs-visible workqueue */
static struct attribute *wq_sysfs_attrs[] = {
	&dev_attr_per_cpu.attr,
	&dev_attr_max_active.attr,
	NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/* "pool_ids": prints space separated "<node>:<pool id>" pairs */
static ssize_t wq_pool_ids_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	const char *delim = "";
	int node, written = 0;

	rcu_read_lock_sched();
	for_each_node(node) {
		written += scnprintf(buf + written, PAGE_SIZE - written,
				     "%s%d:%d", delim, node,
				     unbound_pwq_by_node(wq, node)->pool->id);
		delim = " ";
	}
	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
	rcu_read_unlock_sched();

	return written;
}

/* "nice" read: prints wq->unbound_attrs->nice under wq->mutex */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	int written;

	mutex_lock(&wq->mutex);
	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
	mutex_unlock(&wq->mutex);

	return written;
}

/* prepare workqueue_attrs for sysfs store operations */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
	struct workqueue_attrs *attrs;

	lockdep_assert_held(&wq_pool_mutex);

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return NULL;

	/* start from the current attrs so only the written field changes */
	copy_workqueue_attrs(attrs, wq->unbound_attrs);
	return attrs;
}

/* "nice" write: accepts an integer in [MIN_NICE, MAX_NICE] */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
			     const char *buf, size_t count)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	apply_wqattrs_lock();

	attrs = wq_sysfs_prep_attrs(wq);
	if (!attrs)
		goto out_unlock;

	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
		ret = apply_workqueue_attrs_locked(wq, attrs);
	else
		ret = -EINVAL;

out_unlock:
	apply_wqattrs_unlock();
	free_workqueue_attrs(attrs);
	return ret ?: count;
}

/* "cpumask" read: prints wq->unbound_attrs->cpumask under wq->mutex */
static ssize_t wq_cpumask_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	int written;

	mutex_lock(&wq->mutex);
	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
			    cpumask_pr_args(wq->unbound_attrs->cpumask));
	mutex_unlock(&wq->mutex);
	return written;
}

/* "cpumask" write: parses a cpumask and applies it to the workqueue */
static ssize_t wq_cpumask_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	apply_wqattrs_lock();

	attrs = wq_sysfs_prep_attrs(wq);
	if (!attrs)
		goto out_unlock;

	ret = cpumask_parse(buf, attrs->cpumask);
	if (!ret)
		ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
	apply_wqattrs_unlock();
	free_workqueue_attrs(attrs);
	return ret ?: count;
}

/* "numa" read: prints the inverse of wq->unbound_attrs->no_numa */
static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	int written;

	mutex_lock(&wq->mutex);
	written = scnprintf(buf, PAGE_SIZE, "%d\n",
			    !wq->unbound_attrs->no_numa);
	mutex_unlock(&wq->mutex);

	return written;
}

/* "numa" write: a zero value sets no_numa, any other integer clears it */
static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
			     const char *buf, size_t count)
{
	struct workqueue_struct *wq = dev_to_wq(dev);
	struct workqueue_attrs *attrs;
	int v, ret = -ENOMEM;

	apply_wqattrs_lock();

	attrs = wq_sysfs_prep_attrs(wq);
	if (!attrs)
		goto out_unlock;

	ret = -EINVAL;
	if (sscanf(buf, "%d", &v) == 1) {
		attrs->no_numa = !v;
		ret = apply_workqueue_attrs_locked(wq, attrs);
	}

out_unlock:
	apply_wqattrs_unlock();
	free_workqueue_attrs(attrs);
	return ret ?: count;
}

/* extra attributes created only for unbound workqueues */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
	__ATTR_NULL,
};

static struct bus_type wq_subsys = {
	.name				= "workqueue",
	.dev_groups			= wq_sysfs_groups,
};

/* subsystem-level "cpumask" read: prints wq_unbound_cpumask */
static ssize_t wq_unbound_cpumask_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	int written;

	mutex_lock(&wq_pool_mutex);
	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
			    cpumask_pr_args(wq_unbound_cpumask));
	mutex_unlock(&wq_pool_mutex);

	return written;
}

/* subsystem-level "cpumask" write: parse and set wq_unbound_cpumask */
static ssize_t wq_unbound_cpumask_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	cpumask_var_t cpumask;
	int ret;

	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
		return -ENOMEM;

	ret = cpumask_parse(buf, cpumask);
	if (!ret)
		ret = workqueue_set_unbound_cpumask(cpumask);

	free_cpumask_var(cpumask);
	return ret ? ret : count;
}

static struct device_attribute wq_sysfs_cpumask_attr =
	__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
	       wq_unbound_cpumask_store);

/* register the "workqueue" subsystem and its global cpumask file */
static int __init wq_sysfs_init(void)
{
	int err;

	err = subsys_virtual_register(&wq_subsys, NULL);
	if (err)
		return err;

	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
}
core_initcall(wq_sysfs_init);

/* device release callback: frees the wq_device wrapper */
static void wq_device_release(struct device *dev)
{
	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);

	kfree(wq_dev);
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
	struct wq_device *wq_dev;
	int ret;

	/*
	 * Adjusting max_active or creating new pwqs by applying
	 * attributes breaks ordering guarantee. Disallow exposing ordered
	 * workqueues.
	 */
	if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
		return -EINVAL;

	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
	if (!wq_dev)
		return -ENOMEM;

	wq_dev->wq = wq;
	wq_dev->dev.bus = &wq_subsys;
	wq_dev->dev.init_name = wq->name;
	wq_dev->dev.release = wq_device_release;

	/*
	 * unbound_attrs are created separately. Suppress uevent until
	 * everything is ready.
	 */
	dev_set_uevent_suppress(&wq_dev->dev, true);

	ret = device_register(&wq_dev->dev);
	if (ret) {
		/* drop the reference instead of freeing wq_dev directly */
		put_device(&wq_dev->dev);
		wq->wq_dev = NULL;
		return ret;
	}

	if (wq->flags & WQ_UNBOUND) {
		struct device_attribute *attr;

		/* the table is terminated by an entry with a NULL name */
		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
			ret = device_create_file(&wq_dev->dev, attr);
			if (ret) {
				device_unregister(&wq_dev->dev);
				wq->wq_dev = NULL;
				return ret;
			}
		}
	}

	dev_set_uevent_suppress(&wq_dev->dev, false);
	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
	return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
	struct wq_device *wq_dev = wq->wq_dev;

	if (!wq->wq_dev)
		return;

	wq->wq_dev = NULL;
	device_unregister(&wq_dev->dev);
}
#else	/* CONFIG_SYSFS */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
#endif	/* CONFIG_SYSFS */

/*
 * Build wq_numa_possible_cpumask[], the per-node masks of possible
 * CPUs, and set wq_numa_enabled. Nothing is done when there is at most
 * one possible node or when wq_disable_numa is set.
 */
static void __init wq_numa_init(void)
{
	cpumask_var_t *tbl;
	int node, cpu;

	if (num_possible_nodes() <= 1)
		return;

	if (wq_disable_numa) {
		pr_info("workqueue: NUMA affinity support disabled\n");
		return;
	}

	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
	BUG_ON(!wq_update_unbound_numa_attrs_buf);

	/*
	 * We want masks of possible CPUs of each node which isn't readily
	 * available. Build one from cpu_to_node() which should have been
	 * fully initialized by now.
	 */
	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
	BUG_ON(!tbl);

	for_each_node(node)
		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
				node_online(node) ? node : NUMA_NO_NODE));

	for_each_possible_cpu(cpu) {
		node = cpu_to_node(cpu);
		if (WARN_ON(node == NUMA_NO_NODE)) {
			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
			/* happens iff arch is bonkers, let's just proceed */
			/* NOTE(review): tbl is not freed on this path */
			return;
		}
		cpumask_set_cpu(cpu, tbl[node]);
	}

	wq_numa_possible_cpumask = tbl;
	wq_numa_enabled = true;
}

/*
 * Early boot initialization: set up the unbound cpumask, the pwq cache,
 * the CPU notifiers, NUMA support, the per-cpu worker pools with their
 * initial workers, the default attrs and the system workqueues.
 */
static int __init init_workqueues(void)
{
	/* nice levels of the standard pools: normal, then highpri */
	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
	int i, cpu;

	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);

	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);

	wq_numa_init();

	/* initialize CPU pools */
	for_each_possible_cpu(cpu) {
		struct worker_pool *pool;

		i = 0;
		for_each_cpu_worker_pool(pool, cpu) {
			BUG_ON(init_worker_pool(pool));
			pool->cpu = cpu;
			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
			pool->attrs->nice = std_nice[i++];
			pool->node = cpu_to_node(cpu);

			/* alloc pool ID */
			mutex_lock(&wq_pool_mutex);
			BUG_ON(worker_pool_assign_id(pool));
			mutex_unlock(&wq_pool_mutex);
		}
	}

	/* create the initial worker */
	for_each_online_cpu(cpu) {
		struct worker_pool *pool;

		for_each_cpu_worker_pool(pool, cpu) {
			pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(!create_worker(pool));
		}
	}

	/* create default unbound and ordered wq attrs */
	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
		struct workqueue_attrs *attrs;

		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
		attrs->nice = std_nice[i];
		unbound_std_wq_attrs[i] = attrs;

		/*
		 * An ordered wq should have only one pwq as ordering is
		 * guaranteed by max_active which is enforced by pwqs.
		 * Turn off NUMA so that dfl_pwq is used for all nodes.
		 */
		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
		attrs->nice = std_nice[i];
		attrs->no_numa = true;
		ordered_wq_attrs[i] = attrs;
	}

	system_wq = alloc_workqueue("events", 0, 0);
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
	system_long_wq = alloc_workqueue("events_long", 0, 0);
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
					      WQ_POWER_EFFICIENT, 0);
	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
					      0);
	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
	       !system_unbound_wq || !system_freezable_wq ||
	       !system_power_efficient_wq ||
	       !system_freezable_power_efficient_wq);
	return 0;
}
early_initcall(init_workqueues);
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 *
 * 4096 is the size in KiB; shifting by (PAGE_CACHE_SHIFT - 10) converts
 * KiB to pages.
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))

/*
 * Completion counter shared by the work items a caller waits for.
 * wb_queue_work() increments ->cnt for every queued work whose ->done is
 * set, finish_writeback_work() decrements it, and wb_wait_for_completion()
 * sleeps until it reaches zero.
 */
struct wb_completion {
	atomic_t		cnt;
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;			/* page budget; LONG_MAX is never split */
	struct super_block *sb;
	unsigned long *older_than_this;	/* expiry cutoff, see move_expired_inodes() */
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
*/ #define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ struct wb_completion cmpl = { \ .cnt = ATOMIC_INIT(1), \ } /* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its * timestamps updated and when they finally get written out to be two * dirtytime_expire_intervals. We set the default to 12 hours (in * seconds), which means most of the time inodes will have their * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_io_list); } /* * Include the creation of the trace points after defining the * wb_writeback_work structure and inline functions so that the definition * remains local to this file. */ #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); static bool wb_io_lists_populated(struct bdi_writeback *wb) { 609 if (wb_has_dirty_io(wb)) { return false; } else { 32 set_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, 32 &wb->bdi->tot_write_bandwidth); return true; } } static void wb_io_lists_depopulated(struct bdi_writeback *wb) { 297 if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && 8 list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { 4 clear_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth) < 0); } 297 } /** * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. 
* Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. */ static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { 609 assert_spin_locked(&wb->list_lock); 609 list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ 609 if (head != &wb->b_dirty_time) 609 return wb_io_lists_populated(wb); 1 wb_io_lists_depopulated(wb); return false; 31 } /** * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */ static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { 296 assert_spin_locked(&wb->list_lock); 296 list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_bh(&wb->work_lock); } static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { struct wb_completion *done = work->done; if (work->auto_free) kfree(work); if (done && atomic_dec_and_test(&done->cnt)) wake_up_all(&wb->bdi->wb_waitq); } static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { 77 trace_writeback_queue(wb, work); 77 if (work->done) 77 atomic_inc(&work->done->cnt); 77 spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { 77 list_add_tail(&work->list, &wb->work_list); 77 mod_delayed_work(bdi_wq, &wb->dwork, 0); } else finish_writeback_work(wb, work); 77 spin_unlock_bh(&wb->work_lock); } /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @bdi: bdi work items were issued to * @done: target wb_completion * * Wait for one or more work 
items issued to @bdi with their ->done field * set to @done, which should have been defined with * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such * work items are completed. Work items which are waited upon aren't freed * automatically on completion. */ 76 static void wb_wait_for_completion(struct backing_dev_info *bdi, struct wb_completion *done) { 91 atomic_dec(&done->cnt); /* put down the initial count */ 76 wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); 91 } #ifdef CONFIG_CGROUP_WRITEBACK /* parameters for foreign inode detection, see wb_detach_inode() */ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ #define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) /* each slot's duration is 2s / 16 */ #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; if (page) { memcg_css = mem_cgroup_css_from_page(page); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); } } if (!wb) wb = &bdi->wb; /* * There may be multiple instances of this function racing to * update the same inode. Use cmpxchg() to tell the winner. 
*/ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * * Returns @inode's wb with its list_lock held. @inode->i_lock must be * held on entry and is released on return. The returned wb is guaranteed * to stay @inode's associated wb until its list_lock is released. */ static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { while (true) { struct bdi_writeback *wb = inode_to_wb(inode); /* * inode_to_wb() association is protected by both * @inode->i_lock and @wb->list_lock but list_lock nests * outside i_lock. Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. */ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. 
*/
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
	__acquires(&wb->list_lock)
{
	spin_lock(&inode->i_lock);
	return locked_inode_to_wb_and_lock_list(inode);
}

/*
 * State of one in-flight wb switch.  Allocated by inode_switch_wbs(),
 * handed through call_rcu() to a work item and freed at the end of
 * inode_switch_wbs_work_fn().
 */
struct inode_switch_wbs_context {
	struct inode		*inode;		/* pinned with ihold() */
	struct bdi_writeback	*new_wb;	/* pinned switch target */

	struct rcu_head		rcu_head;	/* for call_rcu() */
	struct work_struct	work;		/* runs inode_switch_wbs_work_fn() */
};

/* Take @bdi->wb_switch_rwsem for writing. */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

/* Release @bdi->wb_switch_rwsem taken by bdi_down_write_wb_switch_rwsem(). */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

/*
 * Work item performing the actual switch of isw->inode from its current
 * wb to isw->new_wb: transfers the dirty/writeback page stats and the IO
 * list membership, clears I_WB_SWITCH and releases everything pinned by
 * inode_switch_wbs().  The switch itself is skipped for an inode that is
 * being freed.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(work, struct inode_switch_wbs_context, work);
	struct inode *inode = isw->inode;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct address_space *mapping = inode->i_mapping;
	struct bdi_writeback *old_wb = inode->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
	struct radix_tree_iter iter;
	bool switched = false;
	void **slot;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
	 * synchronizing against mapping->tree_lock.
	 *
	 * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
	 * gives us exclusion against all wb related operations on @inode
	 * including IO list manipulations and stat updates.
	 */
	/* take the two list_locks in address order */
	if (old_wb < new_wb) {
		spin_lock(&old_wb->list_lock);
		spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&new_wb->list_lock);
		spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
	}
	spin_lock(&inode->i_lock);
	spin_lock_irq(&mapping->tree_lock);

	/*
	 * Once I_FREEING is visible under i_lock, the eviction path owns
	 * the inode and we shouldn't modify ->i_io_list.
	 */
	if (unlikely(inode->i_state & I_FREEING))
		goto skip_switch;

	/*
	 * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
	 * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
	 * pages actually under writeback.
	 */
	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
				   PAGECACHE_TAG_DIRTY) {
		struct page *page = radix_tree_deref_slot_protected(slot,
							&mapping->tree_lock);
		if (likely(page) && PageDirty(page)) {
			__dec_wb_stat(old_wb, WB_RECLAIMABLE);
			__inc_wb_stat(new_wb, WB_RECLAIMABLE);
		}
	}

	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
				   PAGECACHE_TAG_WRITEBACK) {
		struct page *page = radix_tree_deref_slot_protected(slot,
							&mapping->tree_lock);
		if (likely(page)) {
			WARN_ON_ONCE(!PageWriteback(page));
			__dec_wb_stat(old_wb, WB_WRITEBACK);
			__inc_wb_stat(new_wb, WB_WRITEBACK);
		}
	}

	/* reference held on behalf of @inode->i_wb, assigned below */
	wb_get(new_wb);

	/*
	 * Transfer to @new_wb's IO list if necessary.  The specific list
	 * @inode was on is ignored and the inode is put on ->b_dirty which
	 * is always correct including from ->b_dirty_time.  The transfer
	 * preserves @inode->dirtied_when ordering.
	 */
	if (!list_empty(&inode->i_io_list)) {
		struct inode *pos;

		inode_io_list_del_locked(inode, old_wb);
		inode->i_wb = new_wb;
		/* find the first entry not dirtied after @inode */
		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
			if (time_after_eq(inode->dirtied_when,
					  pos->dirtied_when))
				break;
		/* list_move() onto ->prev places @inode right before @pos */
		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
	} else {
		inode->i_wb = new_wb;
	}

	/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
	switched = true;
skip_switch:
	/*
	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
	 */
	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

	spin_unlock_irq(&mapping->tree_lock);
	spin_unlock(&inode->i_lock);
	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (switched) {
		wb_wakeup(new_wb);
		/* @inode no longer points to @old_wb, drop its reference */
		wb_put(old_wb);
	}
	/* drop the reference taken by wb_get_create() in inode_switch_wbs() */
	wb_put(new_wb);

	iput(inode);
	kfree(isw);

	atomic_dec(&isw_nr_in_flight);
}

/* RCU callback of a wb switch: defers the rest to process context. */
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
{
	struct inode_switch_wbs_context *isw = container_of(rcu_head,
				struct inode_switch_wbs_context, rcu_head);

	/* needs to grab bh-unsafe locks, bounce to work item */
	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
	queue_work(isw_wq, &isw->work);
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct cgroup_subsys_state *memcg_css;
	struct inode_switch_wbs_context *isw;

	/* noop if seems to be already in progress */
	if (inode->i_state & I_WB_SWITCH)
		return;

	/*
	 * Avoid starting new switches while sync_inodes_sb() is in
	 * progress.  Otherwise, if the down_write protected issue path
	 * blocks heavily, we might end up starting a large number of
	 * switches which will block on the rwsem.
	 */
	if (!down_read_trylock(&bdi->wb_switch_rwsem))
		return;

	/* zeroed so that isw->new_wb reads NULL until a wb is found */
	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
	if (!isw)
		goto out_unlock;

	/* find and pin the new wb */
	rcu_read_lock();
	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
	if (memcg_css)
		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
	rcu_read_unlock();
	if (!isw->new_wb)
		goto out_free;

	/* while holding I_WB_SWITCH, no one else can update the association */
	spin_lock(&inode->i_lock);
	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
	    inode_to_wb(inode) == isw->new_wb) {
		spin_unlock(&inode->i_lock);
		goto out_free;
	}
	inode->i_state |= I_WB_SWITCH;
	spin_unlock(&inode->i_lock);

	/* pin @inode until inode_switch_wbs_work_fn() calls iput() */
	ihold(inode);
	isw->inode = inode;

	/*
	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
	 * the RCU protected stat update paths to grab the mapping's
	 * tree_lock so that stat transfer can synchronize against them.
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);

	atomic_inc(&isw_nr_in_flight);

	goto out_unlock;

out_free:
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
out_unlock:
	up_read(&bdi->wb_switch_rwsem);
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
*/
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
				 struct inode *inode)
{
	struct bdi_writeback *cur_wb;

	/* without cgroup writeback there is nothing to record */
	if (!inode_cgwb_enabled(inode)) {
		spin_unlock(&inode->i_lock);
		return;
	}

	/* pin the inode's current wb while i_lock keeps the association stable */
	cur_wb = inode_to_wb(inode);
	wb_get(cur_wb);

	wbc->wb = cur_wb;
	wbc->inode = inode;

	/* start a fresh round of foreign-writer accounting */
	wbc->wb_bytes = 0;
	wbc->wb_lcand_bytes = 0;
	wbc->wb_tcand_bytes = 0;
	wbc->wb_tcand_id = 0;
	wbc->wb_lcand_id = inode->i_wb_frn_winner;
	wbc->wb_id = cur_wb->memcg_css->id;

	spin_unlock(&inode->i_lock);

	/*
	 * A dying wb indicates that either the blkcg associated with the
	 * memcg changed or the associated memcg is dying.  In the first
	 * case, a replacement wb should already be available and we should
	 * refresh the wb immediately.  In the second case, trying to
	 * refresh will keep failing.
	 */
	if (unlikely(wb_dying(cur_wb))) {
		if (!css_is_dying(cur_wb->memcg_css))
			inode_switch_wbs(inode, wbc->wb_id);
	}
}

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.
To avoid unnnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. In addition to the byte * count from the majority voting, it also counts the bytes written for the * current wb and the last round's winner wb (max of last round's current * wb, the winner from two rounds ago, and the last round's majority * candidate). Keeping track of the historical winner helps the algorithm * to semi-reliably detect the most active writer even when it's not the * absolute majority. * * Once the winner of the round is determined, whether the winner is * foreign or not and how much IO time the round consumed is recorded in * inode->i_wb_frn_history. If the amount of recorded foreign IO time is * over a certain threshold, the switch verdict is given. */ void wbc_detach_inode(struct writeback_control *wbc) { struct bdi_writeback *wb = wbc->wb; struct inode *inode = wbc->inode; unsigned long avg_time, max_bytes, max_time; u16 history; int max_id; if (!wb) return; history = inode->i_wb_frn_history; avg_time = inode->i_wb_frn_avg_time; /* pick the winner of this round */ if (wbc->wb_bytes >= wbc->wb_lcand_bytes && wbc->wb_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_id; max_bytes = wbc->wb_bytes; } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_lcand_id; max_bytes = wbc->wb_lcand_bytes; } else { max_id = wbc->wb_tcand_id; max_bytes = wbc->wb_tcand_bytes; } /* * Calculate the amount of IO time the winner consumed and fold it * into the running average kept per inode. If the consumed IO * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for * deciding whether to switch or not. This is to prevent one-off * small dirtiers from skewing the verdict. 
*/ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, wb->avg_write_bandwidth); if (avg_time) avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - (avg_time >> WB_FRN_TIME_AVG_SHIFT); else avg_time = max_time; /* immediate catch up on first run */ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { int slots; /* * The switch verdict is reached if foreign wb's consume * more than a certain proportion of IO time in a * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot * history mask where each bit represents one sixteenth of * the period. Determine the number of slots to shift into * history from @max_time. */ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), (unsigned long)WB_FRN_HIST_MAX_SLOTS); history <<= slots; if (wbc->wb_id != max_id) history |= (1U << slots) - 1; /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the * inode may switch across them repeatedly over time, which * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } /* * Multiple instances of this function may race to update the * following fields but we don't mind occassional inaccuracies. */ inode->i_wb_frn_winner = max_id; inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); inode->i_wb_frn_history = history; wb_put(wbc->wb); wbc->wb = NULL; } /** * wbc_account_io - account IO issued during writeback * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out * * @bytes from @page are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes) { int id; /* * pageout() path doesn't attach @wbc to the inode being written * out. 
This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (!wbc->wb) return; rcu_read_lock(); id = mem_cgroup_css_from_page(page)->id; rcu_read_unlock(); if (id == wbc->wb_id) { wbc->wb_bytes += bytes; return; } if (id == wbc->wb_lcand_id) wbc->wb_lcand_bytes += bytes; /* Boyer-Moore majority vote algorithm */ if (!wbc->wb_tcand_bytes) wbc->wb_tcand_id = id; if (id == wbc->wb_tcand_id) wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion * bits to test and the return value is the mask of set bits. * * If cgroup writeback is enabled for @inode, the congestion state is * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. * * @inode is allowed to be NULL as this function is often called on * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { /* * Once set, ->i_wb never becomes NULL while the inode is alive. * Start transaction iff ->i_wb is visible. 
*/ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; struct wb_lock_cookie lock_cookie = {}; bool congested; wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); } EXPORT_SYMBOL_GPL(inode_congested); /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi * * Split @wb's portion of @nr_pages according to @wb's write bandwidth in * relation to the total write bandwidth of all wb's w/ dirty inodes on * @wb->bdi. */ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); if (nr_pages == LONG_MAX) return LONG_MAX; /* * This may be called on clean wb's and proportional distribution * may not make sense, just use the original @nr_pages in those * cases. In general, we wanna err on the side of writing more. */ if (!tot_bw || this_bw >= tot_bw) return nr_pages; else return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); } /** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue * @skip_if_busy: skip wb's which already have writeback in progress * * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. 
*/
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
				  struct wb_writeback_work *base_work,
				  bool skip_if_busy)
{
	struct bdi_writeback *last_wb = NULL;
	/* start from the list head so that _continue_ begins at the first wb */
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	might_sleep();
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
		struct wb_writeback_work fallback_work;
		struct wb_writeback_work *work;
		long nr_pages;

		/* the wb pinned across the previous restart is not needed anymore */
		if (last_wb) {
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* SYNC_ALL writes out I_DIRTY_TIME too */
		if (!wb_has_dirty_io(wb) &&
		    (base_work->sync_mode == WB_SYNC_NONE ||
		     list_empty(&wb->b_dirty_time)))
			continue;
		if (skip_if_busy && writeback_in_progress(wb))
			continue;

		nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

		/* common case: hand a heap copy to the wb and move on */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
		if (work) {
			*work = *base_work;
			work->nr_pages = nr_pages;
			work->auto_free = 1;
			wb_queue_work(wb, work);
			continue;
		}

		/* alloc failed, execute synchronously using on-stack fallback */
		work = &fallback_work;
		*work = *base_work;
		work->nr_pages = nr_pages;
		work->auto_free = 0;
		work->done = &fallback_work_done;

		wb_queue_work(wb, work);

		/*
		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
		 * continuing iteration from @wb after dropping and
		 * regrabbing rcu read lock.
		 */
		wb_get(wb);
		last_wb = wb;

		rcu_read_unlock();
		wb_wait_for_completion(bdi, &fallback_work_done);
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
*/ 332 void cgroup_writeback_umount(void) { if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. */ rcu_barrier(); flush_workqueue(isw_wq); } } static int __init cgroup_writeback_init(void) { isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); 91 if (!isw_wq) return -ENOMEM; 31 return 0; 64 } fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) 13 { struct bdi_writeback *wb = inode_to_wb(inode); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); return wb; } 13 static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); 13 return wb; } static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { return nr_pages; 13 } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } } #endif /* CONFIG_CGROUP_WRITEBACK */ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; if (!wb_has_dirty_io(wb)) return; /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { 17 trace_writeback_nowork(wb); wb_wakeup(wb); return; } work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason = reason; 
work->auto_free = 1; wb_queue_work(wb, work); } /** * wb_start_background_writeback - start background writeback * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. */ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { if (!list_empty(&wb->b_dirty)) { struct inode *tail; tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; /* If inode is clean an unused, put it into LRU now... 
*/ inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) { bool ret = time_after(inode->dirtied_when, t); #ifndef CONFIG_64BIT /* * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif return ret; } #define EXPIRE_DIRTY_ATIME 0x0001 /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, int flags, struct wb_writeback_work *work) { unsigned long *older_than_this = NULL; unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; int moved = 0; if ((flags & EXPIRE_DIRTY_ATIME) == 0) older_than_this = work->older_than_this; else if (!work->for_sync) { expire_time = jiffies - (dirtytime_expire_interval * HZ); older_than_this = &expire_time; } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); goto out; } /* Move inodes from one superblock together */ 318 while (!list_empty(&tmp)) { 318 sb = wb_inode(tmp.prev)->i_sb; 318 list_for_each_prev_safe(pos, node, &tmp) 
{ 318 inode = wb_inode(pos); if (inode->i_sb == sb) list_move(&inode->i_io_list, dispatch_queue); } } out: return moved; } /* * Queue all expired dirty inodes for io, eldest first. * Before * newly dirtied b_dirty b_io b_more_io * =============> gf edc BA 705 * After * newly dirtied b_dirty b_io b_more_io * =============> g fBAedc * | 705 * +--> dequeue for IO 2 */ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); if (moved) wb_io_lists_populated(wb); 703 trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { trace_writeback_write_inode_start(inode, wbc); ret = inode->i_sb->s_op->write_inode(inode, wbc); trace_writeback_write_inode(inode, wbc); return ret; } return 0; } /* * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ static void __inode_wait_for_writeback(struct inode *inode) __releases(inode->i_lock) __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } /* * Wait for writeback on an inode to complete. Caller must have inode pinned. */ void inode_wait_for_writeback(struct inode *inode) { spin_lock(&inode->i_lock); __inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. 
It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. */ static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { DEFINE_WAIT(wait); wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); int sleep; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); sleep = inode->i_state & I_SYNC; spin_unlock(&inode->i_lock); if (sleep) schedule(); finish_wait(wqh, &wait); } /* * Find proper writeback list for the inode depending on its current state and * possibly also change of its state while we were doing writeback. Here we * handle things such as livelock prevention or fairness of writeback among * inodes. This function can be called only by flusher thread - noone else * processes all inodes in writeback lists and requeueing inodes behind flusher * thread's back can have unexpected consequences. */ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc) { if (inode->i_state & I_FREEING) return; /* * Sync livelock prevention. Each inode is tagged and synced in one * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ if ((inode->i_state & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; if (wbc->pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ redirty_tail(inode, wb); return; } if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 322 /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. */ if (wbc->nr_to_write <= 0) { /* Slice used up. Queue for next turn. */ requeue_io(inode, wb); 322 } else { /* 322 * Writeback blocked by something other than * congestion. 
Delay the inode for some time to * avoid spinning on the CPU (100% iowait) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* 322 * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ 322 inode_io_list_del_locked(inode, wb); } } /* * Write out an inode and its dirty pages. Do not update the writeback list * linkage. That is left to the caller. The caller is also responsible for * setting I_SYNC flag and calling inode_sync_complete() to clear it. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; 322 long nr_to_write = wbc->nr_to_write; 322 unsigned dirty; int ret; WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc); /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data * I/O completion. We don't do it for sync(2) writeback because it has a * separate, external IO completion path and ->sync_fs for guaranteeing * inode metadata is written back correctly. 
*/ 79 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); 322 if (ret == 0) ret = err; } /* 322 * Some filesystems may redirty the inode during the writeback 318 * due to delalloc, clear dirty metadata flags right before 318 * write_inode() */ spin_lock(&inode->i_lock); 322 322 dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + dirtytime_expire_interval * HZ)))) { dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; trace_writeback_lazytime(inode); } } else inode->i_state &= ~I_DIRTY_TIME_EXPIRED; inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows * __mark_inode_dirty() to test i_state without grabbing i_lock - 323 * either they see the I_DIRTY bits cleared or we see the dirtied * inode. * * I_DIRTY_PAGES is always cleared together above even if @mapping 323 * still has dirty pages. The flag is reinstated after smp_mb() if * necessary. This guarantees that either __mark_inode_dirty() 323 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. 2 */ smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; spin_unlock(&inode->i_lock); 2 if (dirty & I_DIRTY_TIME) mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; 322 } 27 27 /* * Write out an inode's dirty pages. Either the caller has an active reference 322 * on the inode or the inode has I_WILL_FREE set. * * This function is designed to be called for writing back one inode which * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() * and does more profound writeback list handling in writeback_sb_inodes(). 
*/ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) 286 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 322 else WARN_ON(inode->i_state & I_WILL_FREE); 322 if (inode->i_state & I_SYNC) { if (wbc->sync_mode != WB_SYNC_ALL) goto out; /* * It's a data-integrity sync. We must wait. Since callers hold * inode reference or inode has I_WILL_FREE set, it cannot go * away under us. */ __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* * Skip inode if it is clean and we have no outstanding writeback in * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this * function since flusher thread may be doing for example sync in * parallel and if we move the inode, it could get skipped. So here we * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); return ret; } static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; /* * WB_SYNC_ALL mode does livelock avoidance by syncing dirty * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX * here avoids calling into writeback_inodes_wb() more than once. 
* * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } return pages; } /* * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. * * NOTE! This is called with wb->list_lock held, and will * unlock and relock that for each inode it ends up doing * IO for. */ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; if (inode->i_sb != sb) { if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ redirty_tail(inode, wb); continue; } /* * The inode belongs to a different superblock. * Bounce back to the caller to unpin this and * pin the next superblock. */ break; } /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. 
*/ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); redirty_tail(inode, wb); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to * b_more_io so that writeback can proceed with the * other inodes on s_io. * * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ spin_unlock(&inode->i_lock); requeue_io(inode, wb); trace_writeback_sb_inodes_requeue(inode); continue; } spin_unlock(&wb->list_lock); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ if (inode->i_state & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; /* * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ __writeback_single_inode(inode, &wbc); wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; if (need_resched()) { /* * We're trying to balance between building up a nice * long list of IOs to improve our merge rate, and * getting those IOs out quickly for anyone throttling * in balance_dirty_pages(). cond_resched() doesn't * unplug, so get our IOs out the door before we * give up the CPU. */ blk_flush_plug(current); cond_resched(); } /* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime. 
*/ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); if (unlikely(tmp_wb != wb)) { spin_unlock(&tmp_wb->list_lock); spin_lock(&wb->list_lock); } /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } return wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long start_time = jiffies; long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!trylock_super(sb)) { /* * trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ redirty_tail(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } /* Leave any unwritten inodes on b_io */ return wrote; } static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, }; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work.nr_pages; } /* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the * dirtying-time in the inode's address_space. 
So this periodic writeback code * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * * Try to run once per dirty_writeback_interval. But if a writeback event * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; long progress; struct blk_plug plug; oldest_jif = jiffies; work->older_than_this = &oldest_jif; blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed */ if (work->nr_pages <= 0) break; /* * Background writeout and kupdate-style writeback may * run forever. Stop them if there is other work to do * so that e.g. sync can proceed. They'll be restarted * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ if (work->for_background && !wb_over_bg_thresh(wb)) break; /* * Kupdate and background works are special and we want to * include all inodes that need writing. Livelock avoidance is * handled by these works yielding to any other work so we are * safe. */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) oldest_jif = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); wb_update_bandwidth(wb, wb_start); /* * Did we write something? 
Try for more * * Dirty inodes are moved to b_io for writeback in batches. * The completion of the current batch does not necessarily * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ if (progress) continue; /* * No more inodes for IO, bail */ if (list_empty(&wb->b_more_io)) break; /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise 113 * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); spin_lock(&wb->list_lock); } } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work->nr_pages; } /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; spin_lock_bh(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); return work; } /* * Add in the number of potentially dirty inodes, because each inode * write can dirty pagecache in the underlying blockdev. 
*/ static unsigned long get_nr_dirty_pages(void) { return global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; long nr_pages; /* * When set to zero, disable periodic writeback */ if (!dirty_writeback_interval) return 0; expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) return 0; wb->last_old_flush = jiffies; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); } return 0; } /* * Retrieve work items and do the writeback they describe */ static long wb_do_writeback(struct bdi_writeback *wb) { struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); finish_writeback_work(wb, work); } /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); clear_bit(WB_writeback_running, &wb->state); return wrote; } /* * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. 
*/ void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); long pages_written; set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); 13 current->flags |= PF_SWAPWRITE; 13 if (likely(!current_is_workqueue_rescuer() || 13 !test_bit(WB_registered, &wb->state))) { 13 /* * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken 13 * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ 13 do { 13 pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&wb->work_list)); 13 } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&wb->work_list)) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; if (!nr_pages) nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; i