#ifndef __SHMEM_FS_H
      #define __SHMEM_FS_H
      
      #include <linux/file.h>
      #include <linux/swap.h>
      #include <linux/mempolicy.h>
      #include <linux/pagemap.h>
      #include <linux/percpu_counter.h>
      #include <linux/xattr.h>
      
      /* inode in-kernel data */
      
      struct shmem_inode_info {
              spinlock_t                lock;
              unsigned int                seals;                /* shmem seals */
              unsigned long                flags;
              unsigned long                alloced;        /* data pages alloced to file */
              unsigned long                swapped;        /* subtotal assigned to swap */
              struct shared_policy        policy;                /* NUMA memory alloc policy */
              struct list_head        swaplist;        /* chain of maybes on swap */
              struct simple_xattrs        xattrs;                /* list of xattrs */
              struct inode                vfs_inode;
      };
      
      struct shmem_sb_info {
              unsigned long max_blocks;   /* How many blocks are allowed */
              struct percpu_counter used_blocks;  /* How many are allocated */
              unsigned long max_inodes;   /* How many inodes are allowed */
              unsigned long free_inodes;  /* How many are left for allocation */
              spinlock_t stat_lock;            /* Serialize shmem_sb_info changes */
              kuid_t uid;                    /* Mount uid for root directory */
              kgid_t gid;                    /* Mount gid for root directory */
              umode_t mode;                    /* Mount mode for root directory */
              struct mempolicy *mpol;     /* default memory policy for mappings */
      };
      
      static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
      {
        return container_of(inode, struct shmem_inode_info, vfs_inode);
      }
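
/*
 * Illustrative sketch, not part of the original header: because
 * vfs_inode is embedded inside shmem_inode_info, SHMEM_I() recovers the
 * shmem-private data from any shmem/tmpfs inode with container_of()
 * pointer arithmetic.  shmem_example_seals() is a hypothetical helper
 * shown only to demonstrate the calling convention.
 */
static inline unsigned int shmem_example_seals(struct inode *inode)
{
        /* container_of() subtracts offsetof(struct shmem_inode_info, vfs_inode) */
        return SHMEM_I(inode)->seals;
}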
      
      /*
       * Functions in mm/shmem.c called directly from elsewhere:
       */
      extern int shmem_init(void);
      extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
      extern struct file *shmem_file_setup(const char *name,
                                              loff_t size, unsigned long flags);
      extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
                                                  unsigned long flags);
      extern int shmem_zero_setup(struct vm_area_struct *);
      extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
      extern bool shmem_mapping(struct address_space *mapping);
      extern void shmem_unlock_mapping(struct address_space *mapping);
      extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                              pgoff_t index, gfp_t gfp_mask);
      extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
      extern int shmem_unuse(swp_entry_t entry, struct page *page);
      
      static inline struct page *shmem_read_mapping_page(
                                      struct address_space *mapping, pgoff_t index)
      {
              return shmem_read_mapping_page_gfp(mapping, index,
                                              mapping_gfp_mask(mapping));
      }
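
/*
 * Illustrative sketch, not part of the original header: a typical
 * in-kernel user creates an unlinked shmem file and then pulls pages
 * from its mapping with the helpers above.  shmem_example_first_page()
 * is hypothetical, ignores most error handling, and assumes
 * IS_ERR()/ERR_CAST() from <linux/err.h> are visible here.
 */
static inline struct page *shmem_example_first_page(loff_t size)
{
        struct file *file;
        struct page *page;

        file = shmem_kernel_file_setup("example", size, 0);
        if (IS_ERR(file))
                return ERR_CAST(file);

        /* returns the page at index 0 with an elevated refcount */
        page = shmem_read_mapping_page(file->f_mapping, 0);
        fput(file);
        return page;
}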
      
      #ifdef CONFIG_TMPFS
      
      extern int shmem_add_seals(struct file *file, unsigned int seals);
      extern int shmem_get_seals(struct file *file);
      extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
      
      #else
      
      static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
      {
              return -EINVAL;
      }
      
      #endif
      
      #endif
      /*
       * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
       * Written by Alex Tomas <alex@clusterfs.com>
       *
       * Architecture independence:
       *   Copyright (c) 2005, Bull S.A.
       *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
       */
      
      /*
       * Extents support for EXT4
       *
       * TODO:
       *   - ext4*_error() should be used in some situations
       *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
       *   - smart tree reduction
       */
      
      #include <linux/fs.h>
      #include <linux/time.h>
      #include <linux/jbd2.h>
      #include <linux/highuid.h>
      #include <linux/pagemap.h>
      #include <linux/quotaops.h>
      #include <linux/string.h>
      #include <linux/slab.h>
      #include <asm/uaccess.h>
      #include <linux/fiemap.h>
      #include <linux/backing-dev.h>
      #include "ext4_jbd2.h"
      #include "ext4_extents.h"
      #include "xattr.h"
      
      #include <trace/events/ext4.h>
      
      /*
       * used by extent splitting.
       */
      #define EXT4_EXT_MAY_ZEROOUT        0x1  /* safe to zeroout if split fails \
                                              due to ENOSPC */
      #define EXT4_EXT_MARK_UNWRIT1        0x2  /* mark first half unwritten */
      #define EXT4_EXT_MARK_UNWRIT2        0x4  /* mark second half unwritten */
      
      #define EXT4_EXT_DATA_VALID1        0x8  /* first half contains valid data */
      #define EXT4_EXT_DATA_VALID2        0x10 /* second half contains valid data */
      
      static __le32 ext4_extent_block_csum(struct inode *inode,
                                           struct ext4_extent_header *eh)
      {
              struct ext4_inode_info *ei = EXT4_I(inode);
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
              __u32 csum;
      
              csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
                                 EXT4_EXTENT_TAIL_OFFSET(eh));
              return cpu_to_le32(csum);
      }
      
      static int ext4_extent_block_csum_verify(struct inode *inode,
                                               struct ext4_extent_header *eh)
      {
              struct ext4_extent_tail *et;
      
        if (!ext4_has_metadata_csum(inode->i_sb))
                      return 1;
      
              et = find_ext4_extent_tail(eh);
              if (et->et_checksum != ext4_extent_block_csum(inode, eh))
                      return 0;
              return 1;
      }
      
static void ext4_extent_block_csum_set(struct inode *inode,
                                       struct ext4_extent_header *eh)
{
        struct ext4_extent_tail *et;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        et = find_ext4_extent_tail(eh);
        et->et_checksum = ext4_extent_block_csum(inode, eh);
      }
      
      static int ext4_split_extent(handle_t *handle,
                                      struct inode *inode,
                                      struct ext4_ext_path **ppath,
                                      struct ext4_map_blocks *map,
                                      int split_flag,
                                      int flags);
      
      static int ext4_split_extent_at(handle_t *handle,
                                   struct inode *inode,
                                   struct ext4_ext_path **ppath,
                                   ext4_lblk_t split,
                                   int split_flag,
                                   int flags);
      
      static int ext4_find_delayed_extent(struct inode *inode,
                                          struct extent_status *newes);
      
      static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                                  struct inode *inode,
                                                  int needed)
      {
              int err;
      
        if (!ext4_handle_valid(handle))
                return 0;
              if (handle->h_buffer_credits > needed)
                      return 0;
              err = ext4_journal_extend(handle, needed);
              if (err <= 0)
                      return err;
              err = ext4_truncate_restart_trans(handle, inode, needed);
              if (err == 0)
                      err = -EAGAIN;
      
              return err;
      }
      
      /*
       * could return:
       *  - EROFS
       *  - ENOMEM
       */
      static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
                                      struct ext4_ext_path *path)
      {
              if (path->p_bh) {
                      /* path points to block */
                      BUFFER_TRACE(path->p_bh, "get_write_access");
                return ext4_journal_get_write_access(handle, path->p_bh);
              }
              /* path points to leaf/index in inode body */
              /* we use in-core data, no need to protect them */
              return 0;
      }
      
      /*
       * could return:
       *  - EROFS
       *  - ENOMEM
       *  - EIO
       */
      int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
                           struct inode *inode, struct ext4_ext_path *path)
      {
              int err;
      
        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
        if (path->p_bh) {
                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
                /* path points to block */
                err = __ext4_handle_dirty_metadata(where, line, handle,
                                                   inode, path->p_bh);
        } else {
                /* path points to leaf/index in inode body */
                err = ext4_mark_inode_dirty(handle, inode);
        }
        return err;
      }
      
      static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                                    struct ext4_ext_path *path,
                                    ext4_lblk_t block)
      {
        if (path) {
                int depth = path->p_depth;
                      struct ext4_extent *ex;
      
                      /*
                       * Try to predict block placement assuming that we are
                       * filling in a file which will eventually be
                       * non-sparse --- i.e., in the case of libbfd writing
                 * an ELF object's sections out-of-order but in a way
                 * that eventually results in a contiguous object or
                       * executable file, or some database extending a table
                       * space file.  However, this is actually somewhat
                       * non-ideal if we are writing a sparse file such as
                       * qemu or KVM writing a raw image file that is going
                       * to stay fairly sparse, since it will end up
                       * fragmenting the file system's free space.  Maybe we
                 * should have some heuristics or some way to allow
                 * userspace to pass a hint to the file system,
                       * especially if the latter case turns out to be
                       * common.
                       */
                      ex = path[depth].p_ext;
                      if (ex) {
                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

                        if (block > ext_block)
                                return ext_pblk + (block - ext_block);
                        else
                                return ext_pblk - (ext_block - block);
                }

                /* it looks like index is empty;
                 * try to find starting block from index itself */
                if (path[depth].p_bh)
                              return path[depth].p_bh->b_blocknr;
              }
      
              /* OK. use inode's group */
        return ext4_inode_to_goal_block(inode);
      }
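
/*
 * Worked example for the goal heuristic above (illustrative numbers,
 * not from the original source): if the closest extent on the path maps
 * logical block 100 to physical block 5000, then allocating logical
 * block 130 yields goal = 5000 + (130 - 100) = 5030, while allocating
 * logical block 90 yields goal = 5000 - (100 - 90) = 4990.  Either way
 * the allocator is steered toward keeping the file physically
 * contiguous around the existing extent.
 */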
      
      /*
       * Allocation for a meta data block
       */
      static ext4_fsblk_t
      ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                              struct ext4_ext_path *path,
                              struct ext4_extent *ex, int *err, unsigned int flags)
      {
              ext4_fsblk_t goal, newblock;
      
              goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
              newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
                                              NULL, err);
              return newblock;
      }
      
      static inline int ext4_ext_space_block(struct inode *inode, int check)
      {
              int size;
      
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                              / sizeof(struct ext4_extent);
      #ifdef AGGRESSIVE_TEST
              if (!check && size > 6)
                      size = 6;
      #endif
              return size;
      }
      
      static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
      {
              int size;
      
              size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                              / sizeof(struct ext4_extent_idx);
      #ifdef AGGRESSIVE_TEST
              if (!check && size > 5)
                      size = 5;
      #endif
              return size;
      }
      
      static inline int ext4_ext_space_root(struct inode *inode, int check)
      {
              int size;
      
              size = sizeof(EXT4_I(inode)->i_data);
              size -= sizeof(struct ext4_extent_header);
              size /= sizeof(struct ext4_extent);
      #ifdef AGGRESSIVE_TEST
              if (!check && size > 3)
                      size = 3;
      #endif
              return size;
      }
      
      static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
      {
              int size;
      
              size = sizeof(EXT4_I(inode)->i_data);
              size -= sizeof(struct ext4_extent_header);
              size /= sizeof(struct ext4_extent_idx);
      #ifdef AGGRESSIVE_TEST
              if (!check && size > 4)
                      size = 4;
      #endif
              return size;
      }
      
      static inline int
      ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
                                 struct ext4_ext_path **ppath, ext4_lblk_t lblk,
                                 int nofail)
      {
              struct ext4_ext_path *path = *ppath;
              int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
      
        return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
                              EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
                              EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
                              (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
      }
      
      /*
       * Calculate the number of metadata blocks needed
       * to allocate @blocks
 * Worst case is one block per extent
       */
      int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
      {
              struct ext4_inode_info *ei = EXT4_I(inode);
              int idxs;
      
              idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                      / sizeof(struct ext4_extent_idx));
      
              /*
               * If the new delayed allocation block is contiguous with the
               * previous da block, it can share index blocks with the
               * previous block, so we only need to allocate a new index
         * block every idxs leaf blocks.  At idxs**2 blocks, we need
         * an additional index block, and at idxs**3 blocks, yet
         * another index block.
               */
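        /*
         * Worked example (illustrative, assuming a 4 KiB block size): both
         * struct ext4_extent_header and struct ext4_extent_idx are 12 bytes,
         * so idxs = (4096 - 12) / 12 = 340.  A contiguous run of delayed
         * blocks is then charged one extra index block every 340 leaf
         * blocks, another at every 340^2, and another at every 340^3.
         */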
              if (ei->i_da_metadata_calc_len &&
                  ei->i_da_metadata_calc_last_lblock+1 == lblock) {
                      int num = 0;
      
                      if ((ei->i_da_metadata_calc_len % idxs) == 0)
                              num++;
                      if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
                              num++;
                      if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
                              num++;
                              ei->i_da_metadata_calc_len = 0;
                      } else
                              ei->i_da_metadata_calc_len++;
                      ei->i_da_metadata_calc_last_lblock++;
                      return num;
              }
      
              /*
               * In the worst case we need a new set of index blocks at
               * every level of the inode's extent tree.
               */
              ei->i_da_metadata_calc_len = 1;
              ei->i_da_metadata_calc_last_lblock = lblock;
              return ext_depth(inode) + 1;
      }
      
      static int
      ext4_ext_max_entries(struct inode *inode, int depth)
      {
              int max;
      
        if (depth == ext_depth(inode)) {
                if (depth == 0)
                        max = ext4_ext_space_root(inode, 1);
                else
                        max = ext4_ext_space_root_idx(inode, 1);
        } else {
                if (depth == 0)
                              max = ext4_ext_space_block(inode, 1);
                      else
                              max = ext4_ext_space_block_idx(inode, 1);
              }
      
              return max;
      }
      
      static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
      {
        ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
      
              /*
               * We allow neither:
               *  - zero length
               *  - overflow/wrap-around
               */
              if (lblock + len <= lblock)
                      return 0;
              return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
      }
      
      static int ext4_valid_extent_idx(struct inode *inode,
                                      struct ext4_extent_idx *ext_idx)
      {
              ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
      
              return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
      }
      
      static int ext4_valid_extent_entries(struct inode *inode,
                                      struct ext4_extent_header *eh,
                                      int depth)
      {
              unsigned short entries;
        if (eh->eh_entries == 0)
                      return 1;
      
              entries = le16_to_cpu(eh->eh_entries);
      
        if (depth == 0) {
                /* leaf entries */
                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
                struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
                      ext4_fsblk_t pblock = 0;
                      ext4_lblk_t lblock = 0;
                      ext4_lblk_t prev = 0;
                      int len = 0;
                      while (entries) {
                        if (!ext4_valid_extent(inode, ext))
                                return 0;

                        /* Check for overlapping extents */
                        lblock = le32_to_cpu(ext->ee_block);
                        len = ext4_ext_get_actual_len(ext);
                        if ((lblock <= prev) && prev) {
                                pblock = ext4_ext_pblock(ext);
                                es->s_last_error_block = cpu_to_le64(pblock);
                                return 0;
                        }
                        ext++;
                        entries--;
                        prev = lblock + len - 1;
                }
        } else {
                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
                while (entries) {
                        if (!ext4_valid_extent_idx(inode, ext_idx))
                                return 0;
                        ext_idx++;
                              entries--;
                      }
              }
              return 1;
      }
      
      static int __ext4_ext_check(const char *function, unsigned int line,
                                  struct inode *inode, struct ext4_extent_header *eh,
                                  int depth, ext4_fsblk_t pblk)
      {
              const char *error_msg;
              int max = 0, err = -EFSCORRUPTED;
      
        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
                error_msg = "invalid magic";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
                error_msg = "unexpected eh_depth";
                goto corrupted;
        }
        if (unlikely(eh->eh_max == 0)) {
                error_msg = "invalid eh_max";
                goto corrupted;
        }
        max = ext4_ext_max_entries(inode, depth);
        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
                error_msg = "too large eh_max";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
        if (!ext4_valid_extent_entries(inode, eh, depth)) {
                error_msg = "invalid extent entries";
                goto corrupted;
        }
        if (unlikely(depth > 32)) {
                error_msg = "too large eh_depth";
                goto corrupted;
        }
        /* Verify checksum on non-root extent tree nodes */
        if (ext_depth(inode) != depth &&
            !ext4_extent_block_csum_verify(inode, eh)) {
                error_msg = "extent tree corrupted";
                err = -EFSBADCRC;
                goto corrupted;
        }
        return 0;
      
      corrupted:
              ext4_error_inode(inode, function, line, 0,
                               "pblk %llu bad header/extent: %s - magic %x, "
                               "entries %u, max %u(%u), depth %u(%u)",
                               (unsigned long long) pblk, error_msg,
                               le16_to_cpu(eh->eh_magic),
                               le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
                               max, le16_to_cpu(eh->eh_depth), depth);
              return err;
      }
      
      #define ext4_ext_check(inode, eh, depth, pblk)                        \
              __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
      
      int ext4_ext_check_inode(struct inode *inode)
      {
        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
      }
      
      static struct buffer_head *
      __read_extent_tree_block(const char *function, unsigned int line,
                               struct inode *inode, ext4_fsblk_t pblk, int depth,
                               int flags)
      {
              struct buffer_head                *bh;
              int                                err;
      
        bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!bh_uptodate_or_lock(bh)) {
                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
                err = bh_submit_read(bh);
                if (err < 0)
                        goto errout;
        }
        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
                return bh;
        err = __ext4_ext_check(function, line, inode,
                               ext_block_hdr(bh), depth, pblk);
        if (err)
                goto errout;
        set_buffer_verified(bh);
              /*
               * If this is a leaf block, cache all of its entries
               */
              if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
                struct ext4_extent_header *eh = ext_block_hdr(bh);
                struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
                ext4_lblk_t prev = 0;
                int i;

                for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
                        unsigned int status = EXTENT_STATUS_WRITTEN;
                        ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
                        int len = ext4_ext_get_actual_len(ex);

                        if (prev && (prev != lblk))
                                ext4_es_cache_extent(inode, prev,
                                                     lblk - prev, ~0,
                                                     EXTENT_STATUS_HOLE);

                        if (ext4_ext_is_unwritten(ex))
                                status = EXTENT_STATUS_UNWRITTEN;
                        ext4_es_cache_extent(inode, lblk, len,
                                             ext4_ext_pblock(ex), status);
                        prev = lblk + len;
                }
        }
        return bh;
errout:
        put_bh(bh);
        return ERR_PTR(err);
      
      }
      
      #define read_extent_tree_block(inode, pblk, depth, flags)                \
              __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
                                       (depth), (flags))
      
      /*
       * This function is called to cache a file's extent information in the
       * extent status tree
       */
      int ext4_ext_precache(struct inode *inode)
      {
              struct ext4_inode_info *ei = EXT4_I(inode);
              struct ext4_ext_path *path = NULL;
              struct buffer_head *bh;
              int i = 0, depth, ret = 0;
      
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return 0;       /* not an extent-mapped inode */

        down_read(&ei->i_data_sem);
              depth = ext_depth(inode);
      
              path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
                             GFP_NOFS);
              if (path == NULL) {
                      up_read(&ei->i_data_sem);
                      return -ENOMEM;
              }
      
              /* Don't cache anything if there are no external extent blocks */
        if (depth == 0)
                goto out;
        path[0].p_hdr = ext_inode_hdr(inode);
        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
        if (ret)
                goto out;
        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
        while (i >= 0) {
                /*
                 * If this is a leaf block or we've reached the end of
                 * the index block, go up
                 */
                if ((i == depth) ||
                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }
                bh = read_extent_tree_block(inode,
                                                  ext4_idx_pblock(path[i].p_idx++),
                                                  depth - i - 1,
                                                  EXT4_EX_FORCE_CACHE);
                      if (IS_ERR(bh)) {
                              ret = PTR_ERR(bh);
                              break;
                      }
                i++;
                path[i].p_bh = bh;
                path[i].p_hdr = ext_block_hdr(bh);
                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
        }
        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
        up_read(&ei->i_data_sem);
        ext4_ext_drop_refs(path);
        kfree(path);
        return ret;
      }
      
      #ifdef EXT_DEBUG
      static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
      {
              int k, l = path->p_depth;
      
              ext_debug("path:");
              for (k = 0; k <= l; k++, path++) {
                      if (path->p_idx) {
                        ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
                                  ext4_idx_pblock(path->p_idx));
                      } else if (path->p_ext) {
                              ext_debug("  %d:[%d]%d:%llu ",
                                        le32_to_cpu(path->p_ext->ee_block),
                                        ext4_ext_is_unwritten(path->p_ext),
                                        ext4_ext_get_actual_len(path->p_ext),
                                        ext4_ext_pblock(path->p_ext));
                      } else
                              ext_debug("  []");
              }
              ext_debug("\n");
      }
      
      static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
      {
              int depth = ext_depth(inode);
              struct ext4_extent_header *eh;
              struct ext4_extent *ex;
              int i;
      
              if (!path)
                      return;
      
              eh = path[depth].p_hdr;
              ex = EXT_FIRST_EXTENT(eh);
      
              ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
      
              for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                      ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                                ext4_ext_is_unwritten(ex),
                                ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
              }
              ext_debug("\n");
      }
      
      static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
                              ext4_fsblk_t newblock, int level)
      {
              int depth = ext_depth(inode);
              struct ext4_extent *ex;
      
              if (depth != level) {
                      struct ext4_extent_idx *idx;
                      idx = path[level].p_idx;
                      while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
                              ext_debug("%d: move %d:%llu in new index %llu\n", level,
                                              le32_to_cpu(idx->ei_block),
                                              ext4_idx_pblock(idx),
                                              newblock);
                              idx++;
                      }
      
                      return;
              }
      
              ex = path[depth].p_ext;
              while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
                      ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                      le32_to_cpu(ex->ee_block),
                                      ext4_ext_pblock(ex),
                                      ext4_ext_is_unwritten(ex),
                                      ext4_ext_get_actual_len(ex),
                                      newblock);
                      ex++;
              }
      }
      
      #else
      #define ext4_ext_show_path(inode, path)
      #define ext4_ext_show_leaf(inode, path)
      #define ext4_ext_show_move(inode, path, newblock, level)
      #endif
      
      void ext4_ext_drop_refs(struct ext4_ext_path *path)
      {
              int depth, i;
      
        if (!path)
                return;
        depth = path->p_depth;
        for (i = 0; i <= depth; i++, path++)
                if (path->p_bh) {
                        brelse(path->p_bh);
                              path->p_bh = NULL;
                      }
      }
      
      /*
       * ext4_ext_binsearch_idx:
       * binary search for the closest index of the given block
       * the header must be checked before calling this
       */
      static void
      ext4_ext_binsearch_idx(struct inode *inode,
                              struct ext4_ext_path *path, ext4_lblk_t block)
      {
              struct ext4_extent_header *eh = path->p_hdr;
              struct ext4_extent_idx *r, *l, *m;
      
      
              ext_debug("binsearch for %u(idx):  ", block);
      
              l = EXT_FIRST_INDEX(eh) + 1;
              r = EXT_LAST_INDEX(eh);
              while (l <= r) {
                      m = l + (r - l) / 2;
                      if (block < le32_to_cpu(m->ei_block))
                              r = m - 1;
                      else
                              l = m + 1;
                      ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
                                      m, le32_to_cpu(m->ei_block),
                                      r, le32_to_cpu(r->ei_block));
              }
      
        path->p_idx = l - 1;
              ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                        ext4_idx_pblock(path->p_idx));
      
      #ifdef CHECK_BINSEARCH
              {
                      struct ext4_extent_idx *chix, *ix;
                      int k;
      
                      chix = ix = EXT_FIRST_INDEX(eh);
                      for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
                        if (k != 0 &&
                            le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
                                      printk(KERN_DEBUG "k=%d, ix=0x%p, "
                                             "first=0x%p\n", k,
                                             ix, EXT_FIRST_INDEX(eh));
                                      printk(KERN_DEBUG "%u <= %u\n",
                                             le32_to_cpu(ix->ei_block),
                                             le32_to_cpu(ix[-1].ei_block));
                              }
                              BUG_ON(k && le32_to_cpu(ix->ei_block)
                                                 <= le32_to_cpu(ix[-1].ei_block));
                              if (block < le32_to_cpu(ix->ei_block))
                                      break;
                              chix = ix;
                      }
                      BUG_ON(chix != path->p_idx);
              }
      #endif
      
      }
      
      /*
       * ext4_ext_binsearch:
       * binary search for closest extent of the given block
       * the header must be checked before calling this
       */
      static void
      ext4_ext_binsearch(struct inode *inode,
                      struct ext4_ext_path *path, ext4_lblk_t block)
      {
              struct ext4_extent_header *eh = path->p_hdr;
              struct ext4_extent *r, *l, *m;
      
              if (eh->eh_entries == 0) {
                      /*
                       * this leaf is empty:
                       * we get such a leaf in split/add case
                       */
                      return;
              }
      
              ext_debug("binsearch for %u:  ", block);
      
        l = EXT_FIRST_EXTENT(eh) + 1;
        r = EXT_LAST_EXTENT(eh);

        while (l <= r) {
                m = l + (r - l) / 2;
                if (block < le32_to_cpu(m->ee_block))
                        r = m - 1;
                else
                        l = m + 1;
                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
                                m, le32_to_cpu(m->ee_block),
                                r, le32_to_cpu(r->ee_block));
        }

        path->p_ext = l - 1;
              ext_debug("  -> %d:%llu:[%d]%d ",
                              le32_to_cpu(path->p_ext->ee_block),
                              ext4_ext_pblock(path->p_ext),
                              ext4_ext_is_unwritten(path->p_ext),
                              ext4_ext_get_actual_len(path->p_ext));
      
      #ifdef CHECK_BINSEARCH
              {
                      struct ext4_extent *chex, *ex;
                      int k;
      
                      chex = ex = EXT_FIRST_EXTENT(eh);
                      for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
                              BUG_ON(k && le32_to_cpu(ex->ee_block)
                                                <= le32_to_cpu(ex[-1].ee_block));
                              if (block < le32_to_cpu(ex->ee_block))
                                      break;
                              chex = ex;
                      }
                      BUG_ON(chex != path->p_ext);
              }
      #endif
      
      }
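
/*
 * Worked example for the binary search above (illustrative numbers):
 * with leaf extents starting at logical blocks 0, 100 and 200, a lookup
 * for block 150 ends with l pointing at the 200 entry, so
 * path->p_ext = l - 1 selects the extent beginning at 100, i.e. the
 * closest extent that starts at or before the requested block.
 * ext4_ext_binsearch_idx() follows the same convention for ei_block.
 */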
      
      int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
      {
              struct ext4_extent_header *eh;
      
              eh = ext_inode_hdr(inode);
        eh->eh_depth = 0;
              eh->eh_entries = 0;
              eh->eh_magic = EXT4_EXT_MAGIC;
              eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
              ext4_mark_inode_dirty(handle, inode);
              return 0;
      }
      
      struct ext4_ext_path *
      ext4_find_extent(struct inode *inode, ext4_lblk_t block,
                       struct ext4_ext_path **orig_path, int flags)
{
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
              short int depth, i, ppos = 0;
              int ret;
      
              eh = ext_inode_hdr(inode);
              depth = ext_depth(inode);
        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
                      EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
                                       depth);
                      ret = -EFSCORRUPTED;
                      goto err;
              }
      
        if (path) {
                ext4_ext_drop_refs(path);
                      if (depth > path[0].p_maxdepth) {
                              kfree(path);
                              *orig_path = path = NULL;
                      }
              }
              if (!path) {
                      /* account possible depth increase */
                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
                                GFP_NOFS);
                if (unlikely(!path))
                        return ERR_PTR(-ENOMEM);
                path[0].p_maxdepth = depth + 1;
        }
        path[0].p_hdr = eh;
              path[0].p_bh = NULL;
      
              i = depth;
              /* walk through the tree */
              while (i) {
                      ext_debug("depth %d: num %d, max %d\n",
                                ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
      
                ext4_ext_binsearch_idx(inode, path + ppos, block);
                      path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                      path[ppos].p_depth = i;
                      path[ppos].p_ext = NULL;
      
                      bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
                                                  flags);
                      if (IS_ERR(bh)) {
                              ret = PTR_ERR(bh);
                              goto err;
                      }
      
                eh = ext_block_hdr(bh);
                      ppos++;
                      if (unlikely(ppos > depth)) {
                              put_bh(bh);
                              EXT4_ERROR_INODE(inode,
                                               "ppos %d > depth %d", ppos, depth);
                              ret = -EFSCORRUPTED;
                              goto err;
                      }
                path[ppos].p_bh = bh;
                      path[ppos].p_hdr = eh;
              }
      
        path[ppos].p_depth = i;
              path[ppos].p_ext = NULL;
              path[ppos].p_idx = NULL;
      
              /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
      
              ext4_ext_show_path(inode, path);
      
              return path;
      
      err:
              ext4_ext_drop_refs(path);
              kfree(path);
              if (orig_path)
                      *orig_path = NULL;
              return ERR_PTR(ret);
      }
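
/*
 * Illustrative sketch, not from the original source: a typical caller
 * of ext4_find_extent() inspects the leaf entry at path[depth] and then
 * releases the path with ext4_ext_drop_refs() + kfree(), as
 * ext4_ext_precache() does above.  ext4_example_lookup_pblk() is a
 * hypothetical helper (the caller is assumed to hold i_data_sem) shown
 * only to demonstrate the calling convention.
 */
static ext4_fsblk_t __maybe_unused ext4_example_lookup_pblk(struct inode *inode,
                                                            ext4_lblk_t lblk)
{
        struct ext4_ext_path *path;
        ext4_fsblk_t pblk = 0;
        int depth;

        path = ext4_find_extent(inode, lblk, NULL, 0);
        if (IS_ERR(path))
                return 0;
        depth = ext_depth(inode);
        if (path[depth].p_ext)
                /* start of the closest extent at or before lblk */
                pblk = path[depth].p_block;
        ext4_ext_drop_refs(path);
        kfree(path);
        return pblk;
}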
      
      /*
       * ext4_ext_insert_index:
       * insert new index [@logical;@ptr] into the block at @curp;
       * check where to insert: before @curp or after @curp
       */
      static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                                       struct ext4_ext_path *curp,
                                       int logical, ext4_fsblk_t ptr)
      {
              struct ext4_extent_idx *ix;
              int len, err;
      
              err = ext4_ext_get_access(handle, inode, curp);
              if (err)
                      return err;
      
              if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
                      EXT4_ERROR_INODE(inode,
                                       "logical %d == ei_block %d!",
                                       logical, le32_to_cpu(curp->p_idx->ei_block));
                      return -EFSCORRUPTED;
              }
      
              if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
                                   >= le16_to_cpu(curp->p_hdr->eh_max))) {
                      EXT4_ERROR_INODE(inode,
                                       "eh_entries %d >= eh_max %d!",
                                       le16_to_cpu(curp->p_hdr->eh_entries),
                                       le16_to_cpu(curp->p_hdr->eh_max));
                      return -EFSCORRUPTED;
              }
      
              if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
                      /* insert after */
                      ext_debug("insert new index %d after: %llu\n", logical, ptr);
                      ix = curp->p_idx + 1;
              } else {
                      /* insert before */
                      ext_debug("insert new index %d before: %llu\n", logical, ptr);
                      ix = curp->p_idx;
              }
      
              len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
              BUG_ON(len < 0);
              if (len > 0) {
                      ext_debug("insert new index %d: "
                                      "move %d indices from 0x%p to 0x%p\n",
                                      logical, len, ix, ix + 1);
                      memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
              }
      
              if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
                      EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
                      return -EFSCORRUPTED;
              }
      
              ix->ei_block = cpu_to_le32(logical);
              ext4_idx_store_pblock(ix, ptr);
              le16_add_cpu(&curp->p_hdr->eh_entries, 1);
      
              if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
                      EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
                      return -EFSCORRUPTED;
              }
      
              err = ext4_ext_dirty(handle, inode, curp);
              ext4_std_error(inode->i_sb, err);
      
              return err;
      }
      
      /*
       * ext4_ext_split:
       * inserts new subtree into the path, using free index entry
       * at depth @at:
       * - allocates all needed blocks (new leaf and all intermediate index blocks)
       * - makes decision where to split
       * - moves remaining extents and index entries (right to the split point)
       *   into the newly allocated blocks
       * - initializes subtree
       */
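/*
 * Illustrative sketch of a leaf split (not from the original source):
 * when the leaf on the path is full, every extent to the right of the
 * current one is moved into a freshly allocated leaf block, the old
 * leaf's eh_entries is reduced by the number moved, and a new index
 * entry for the new leaf (keyed by the border block) is inserted into
 * the parent via ext4_ext_insert_index().
 */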
      static int ext4_ext_split(handle_t *handle, struct inode *inode,
                                unsigned int flags,
                                struct ext4_ext_path *path,
                                struct ext4_extent *newext, int at)
      {
              struct buffer_head *bh = NULL;
              int depth = ext_depth(inode);
              struct ext4_extent_header *neh;
              struct ext4_extent_idx *fidx;
              int i = at, k, m, a;
              ext4_fsblk_t newblock, oldblock;
              __le32 border;
              ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
              int err = 0;
              size_t ext_size = 0;
      
              /* make decision: where to split? */
              /* FIXME: now decision is simplest: at current extent */
      
              /* if current leaf will be split, then we should use
               * border from split point */
              if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
                      EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
                      return -EFSCORRUPTED;
              }
              if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
                      border = path[depth].p_ext[1].ee_block;
                      ext_debug("leaf will be split."
                                      " next leaf starts at %d\n",
                                        le32_to_cpu(border));
              } else {
                      border = newext->ee_block;
                      ext_debug("leaf will be added."
                                      " next leaf starts at %d\n",
                                      le32_to_cpu(border));
              }
      
              /*
         * If an error occurs, we stop processing and mark the filesystem
         * read-only.  The index won't be inserted and the tree will stay
         * in a consistent state.  The next mount will repair the buffers too.
               */
      
              /*
               * Get array to track all allocated blocks.
               * We need this to handle errors and free blocks
               * upon them.
               */
              ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
              if (!ablocks)
                      return -ENOMEM;
      
              /* allocate all needed blocks */
              ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
              for (a = 0; a < depth - at; a++) {
                      newblock = ext4_ext_new_meta_block(handle, inode, path,
                                                         newext, &err, flags);
                      if (newblock == 0)
                              goto cleanup;
                      ablocks[a] = newblock;
              }
      
              /* initialize new leaf */
              newblock = ablocks[--a];
              if (unlikely(newblock == 0)) {
                      EXT4_ERROR_INODE(inode, "newblock == 0!");
                      err = -EFSCORRUPTED;
                      goto cleanup;
              }
              bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
              if (unlikely(!bh)) {
                      err = -ENOMEM;
                      goto cleanup;
              }
              lock_buffer(bh);
      
              err = ext4_journal_get_create_access(handle, bh);
              if (err)
                      goto cleanup;
      
              neh = ext_block_hdr(bh);
              neh->eh_entries = 0;
              neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
              neh->eh_magic = EXT4_EXT_MAGIC;
              neh->eh_depth = 0;
      
              /* move remainder of path[depth] to the new leaf */
              if (unlikely(path[depth].p_hdr->eh_entries !=
                           path[depth].p_hdr->eh_max)) {
                      EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
                                       path[depth].p_hdr->eh_entries,
                                       path[depth].p_hdr->eh_max);
                      err = -EFSCORRUPTED;
                      goto cleanup;
              }
              /* start copy from next extent */
              m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
              ext4_ext_show_move(inode, path, newblock, depth);
              if (m) {
                      struct ext4_extent *ex;
                      ex = EXT_FIRST_EXTENT(neh);
                      memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
                      le16_add_cpu(&neh->eh_entries, m);
              }
      
              /* zero out unused area in the extent block */
              ext_size = sizeof(struct ext4_extent_header) +
                      sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
              memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
              ext4_extent_block_csum_set(inode, neh);
              set_buffer_uptodate(bh);
              unlock_buffer(bh);
      
              err = ext4_handle_dirty_metadata(handle, inode, bh);
              if (err)
                      goto cleanup;
              brelse(bh);
              bh = NULL;
      
              /* correct old leaf */
              if (m) {
                      err = ext4_ext_get_access(handle, inode, path + depth);
                      if (err)
                              goto cleanup;
                      le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
                      err = ext4_ext_dirty(handle, inode, path + depth);
                      if (err)
                              goto cleanup;
      
              }
      
              /* create intermediate indexes */
              k = depth - at - 1;
              if (unlikely(k < 0)) {
                      EXT4_ERROR_INODE(inode, "k %d < 0!", k);
                      err = -EFSCORRUPTED;
                      goto cleanup;
              }
              if (k)
                      ext_debug("create %d intermediate indices\n", k);
              /* insert new index into current index block */
              /* current depth stored in i var */
              i = depth - 1;
              while (k--) {
                      oldblock = newblock;
                      newblock = ablocks[--a];
                      bh = sb_getblk(inode->i_sb, newblock);
                      if (unlikely(!bh)) {
                              err = -ENOMEM;
                              goto cleanup;
                      }
                      lock_buffer(bh);
      
                      err = ext4_journal_get_create_access(handle, bh);
                      if (err)
                              goto cleanup;
      
                      neh = ext_block_hdr(bh);
                      neh->eh_entries = cpu_to_le16(1);
                      neh->eh_magic = EXT4_EXT_MAGIC;
                      neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
                      neh->eh_depth = cpu_to_le16(depth - i);
                      fidx = EXT_FIRST_INDEX(neh);
                      fidx->ei_block = border;
                      ext4_idx_store_pblock(fidx, oldblock);
      
                      ext_debug("int.index at %d (block %llu): %u -> %llu\n",
                                      i, newblock, le32_to_cpu(border), oldblock);
      
                      /* move remainder of path[i] to the new index block */
                      if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
                                              EXT_LAST_INDEX(path[i].p_hdr))) {
                              EXT4_ERROR_INODE(inode,
                                               "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
                                               le32_to_cpu(path[i].p_ext->ee_block));
                              err = -EFSCORRUPTED;
                              goto cleanup;
                      }
                      /* start copy indexes */
                      m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
                      ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
                                      EXT_MAX_INDEX(path[i].p_hdr));
                      ext4_ext_show_move(inode, path, newblock, i);
                      if (m) {
                              memmove(++fidx, path[i].p_idx,
                                      sizeof(struct ext4_extent_idx) * m);
                              le16_add_cpu(&neh->eh_entries, m);
                      }
                      /* zero out unused area in the extent block */
                      ext_size = sizeof(struct ext4_extent_header) +
                         (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
                      memset(bh->b_data + ext_size, 0,
                              inode->i_sb->s_blocksize - ext_size);
                      ext4_extent_block_csum_set(inode, neh);
                      set_buffer_uptodate(bh);
                      unlock_buffer(bh);
      
                      err = ext4_handle_dirty_metadata(handle, inode, bh);
                      if (err)
                              goto cleanup;
                      brelse(bh);
                      bh = NULL;
      
                      /* correct old index */
                      if (m) {
                              err = ext4_ext_get_access(handle, inode, path + i);
                              if (err)
                                      goto cleanup;
                              le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
                              err = ext4_ext_dirty(handle, inode, path + i);
                              if (err)
                                      goto cleanup;
                      }
      
                      i--;
              }
      
              /* insert new index */
              err = ext4_ext_insert_index(handle, inode, path + at,
                                          le32_to_cpu(border), newblock);
      
      cleanup:
              if (bh) {
                      if (buffer_locked(bh))
                              unlock_buffer(bh);
                      brelse(bh);
              }
      
              if (err) {
                      /* free all allocated blocks in error case */
                      for (i = 0; i < depth; i++) {
                              if (!ablocks[i])
                                      continue;
                              ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
                                               EXT4_FREE_BLOCKS_METADATA);
                      }
              }
              kfree(ablocks);
      
              return err;
      }
      
      /*
       * ext4_ext_grow_indepth:
       * implements tree growing procedure:
       * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
       * - initializes new top-level, creating index that points to the
       *   just created block
       */
      static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                                       unsigned int flags)
      {
              struct ext4_extent_header *neh;
              struct buffer_head *bh;
              ext4_fsblk_t newblock, goal = 0;
               struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
               int err = 0;
              size_t ext_size = 0;
      
               /* Try to prepend new index to old one */
               if (ext_depth(inode))
                       goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
               if (goal > le32_to_cpu(es->s_first_data_block)) {
                       flags |= EXT4_MB_HINT_TRY_GOAL;
                       goal--;
               } else
                       goal = ext4_inode_to_goal_block(inode);
              newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
                                              NULL, &err);
              if (newblock == 0)
                      return err;
      
              bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
               if (unlikely(!bh))
                      return -ENOMEM;
              lock_buffer(bh);
      
              err = ext4_journal_get_create_access(handle, bh);
              if (err) {
                      unlock_buffer(bh);
                      goto out;
              }
      
               ext_size = sizeof(EXT4_I(inode)->i_data);
              /* move top-level index/leaf into new block */
              memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
              /* zero out unused area in the extent block */
              memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
      
              /* set size of new block */
              neh = ext_block_hdr(bh);
               /* old root could have indexes or leaves
                * so calculate eh_max the right way */
               if (ext_depth(inode))
                       neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
               else
                       neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
               neh->eh_magic = EXT4_EXT_MAGIC;
               ext4_extent_block_csum_set(inode, neh);
               set_buffer_uptodate(bh);
              unlock_buffer(bh);
      
              err = ext4_handle_dirty_metadata(handle, inode, bh);
              if (err)
                      goto out;
      
              /* Update top-level index: num,max,pointer */
              neh = ext_inode_hdr(inode);
               neh->eh_entries = cpu_to_le16(1);
               ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
               if (neh->eh_depth == 0) {
                       /* Root extent block becomes index block */
                      neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
                      EXT_FIRST_INDEX(neh)->ei_block =
                              EXT_FIRST_EXTENT(neh)->ee_block;
              }
              ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                        le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                        le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                        ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
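               /*
                * The root now carries a single index entry pointing at the new
                * block, so the tree is one level deeper than before.
                */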
      
              le16_add_cpu(&neh->eh_depth, 1);
              ext4_mark_inode_dirty(handle, inode);
      out:
              brelse(bh);
      
              return err;
      }
      
      /*
       * ext4_ext_create_new_leaf:
       * finds empty index and adds new leaf.
        * if no free index is found, then it requests in-depth growing.
       */
      static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
                                          unsigned int mb_flags,
                                          unsigned int gb_flags,
                                          struct ext4_ext_path **ppath,
                                          struct ext4_extent *newext)
      {
              struct ext4_ext_path *path = *ppath;
              struct ext4_ext_path *curp;
               int depth, i, err = 0;
      repeat:
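               /*
                * Walk up from the leaf looking for an index block with a free
                * slot.  If one is found, split the tree below it; otherwise
                * grow the tree in depth and retry if the relocated leaf is
                * still full.
                */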
              i = depth = ext_depth(inode);
      
              /* walk up to the tree and look for free index entry */
              curp = path + depth;
              while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
                      i--;
                      curp--;
              }
      
              /* we use already allocated block for index block,
               * so subsequent data blocks should be contiguous */
              if (EXT_HAS_FREE_INDEX(curp)) {
                      /* if we found index with free entry, then use that
                        * entry: create all needed subtree and add new leaf */
                      err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
                      if (err)
                              goto out;
      
                      /* refill path */
                       path = ext4_find_extent(inode,
                                          (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                          ppath, gb_flags);
                      if (IS_ERR(path))
                              err = PTR_ERR(path);
              } else {
                      /* tree is full, time to grow in depth */
                      err = ext4_ext_grow_indepth(handle, inode, mb_flags);
                      if (err)
                              goto out;
      
                       /* refill path */
                       path = ext4_find_extent(inode,
                                         (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                          ppath, gb_flags);
                      if (IS_ERR(path)) {
                              err = PTR_ERR(path);
                              goto out;
                      }
      
                      /*
                       * only first (depth 0 -> 1) produces free space;
                       * in all other cases we have to split the grown tree
                       */
                      depth = ext_depth(inode);
                      if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
                              /* now we need to split */
                              goto repeat;
                      }
              }
      
      out:
              return err;
      }
      
      /*
       * search the closest allocated block to the left for *logical
        * and returns it at @logical + its physical address at @phys
        * if *logical is the smallest allocated block, the function
        * returns 0 at @phys
        * return value contains 0 (success) or error code
        */
      static int ext4_ext_search_left(struct inode *inode,
                                      struct ext4_ext_path *path,
                                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
      {
              struct ext4_extent_idx *ix;
              struct ext4_extent *ex;
              int depth, ee_len;
      
              if (unlikely(path == NULL)) {
                       EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                       return -EFSCORRUPTED;
               }
               depth = path->p_depth;
              *phys = 0;
      
              if (depth == 0 && path->p_ext == NULL)
                      return 0;
      
               /* usually extent in the path covers blocks smaller
                * than *logical, but it can be that extent is the
               * first one in the file */
      
              ex = path[depth].p_ext;
              ee_len = ext4_ext_get_actual_len(ex);
              if (*logical < le32_to_cpu(ex->ee_block)) {
                      if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
                              EXT4_ERROR_INODE(inode,
                                               "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
                                               *logical, le32_to_cpu(ex->ee_block));
                              return -EFSCORRUPTED;
                      }
                      while (--depth >= 0) {
                              ix = path[depth].p_idx;
                               if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
                                      EXT4_ERROR_INODE(inode,
                                        "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
                                        ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
                                        EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
                      le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
                                        depth);
                                       return -EFSCORRUPTED;
                              }
                      }
                      return 0;
              }
      
              if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
                      EXT4_ERROR_INODE(inode,
                                       "logical %d < ee_block %d + ee_len %d!",
                                       *logical, le32_to_cpu(ex->ee_block), ee_len);
                      return -EFSCORRUPTED;
              }
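               /*
                * The extent found lies entirely to the left of *logical;
                * report its last block as the closest allocated block.
                */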
      
              *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
              *phys = ext4_ext_pblock(ex) + ee_len - 1;
              return 0;
      }
      
      /*
       * search the closest allocated block to the right for *logical
        * and returns it at @logical + its physical address at @phys
       * if *logical is the largest allocated block, the function
       * returns 0 at @phys
       * return value contains 0 (success) or error code
       */
       static int ext4_ext_search_right(struct inode *inode,
                                       struct ext4_ext_path *path,
                                       ext4_lblk_t *logical, ext4_fsblk_t *phys,
                                       struct ext4_extent **ret_ex)
       {
              struct buffer_head *bh = NULL;
              struct ext4_extent_header *eh;
               struct ext4_extent_idx *ix;
              struct ext4_extent *ex;
              ext4_fsblk_t block;
              int depth;        /* Note, NOT eh_depth; depth from top of tree */
              int ee_len;
      
              if (unlikely(path == NULL)) {
                       EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                       return -EFSCORRUPTED;
               }
               depth = path->p_depth;
              *phys = 0;
      
              if (depth == 0 && path->p_ext == NULL)
                      return 0;
      
               /* usually extent in the path covers blocks smaller
                * than *logical, but it can be that extent is the
               * first one in the file */
      
              ex = path[depth].p_ext;
              ee_len = ext4_ext_get_actual_len(ex);
               if (*logical < le32_to_cpu(ex->ee_block)) {
                      if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
                              EXT4_ERROR_INODE(inode,
                                               "first_extent(path[%d].p_hdr) != ex",
                                               depth);
                              return -EFSCORRUPTED;
                       }
                      while (--depth >= 0) {
                              ix = path[depth].p_idx;
                              if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
                                      EXT4_ERROR_INODE(inode,
                                                       "ix != EXT_FIRST_INDEX *logical %d!",
                                                       *logical);
                                       return -EFSCORRUPTED;
                               }
                       }
                      goto found_extent;
              }
      
              if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
                       EXT4_ERROR_INODE(inode,
                                        "logical %d < ee_block %d + ee_len %d!",
                                       *logical, le32_to_cpu(ex->ee_block), ee_len);
                      return -EFSCORRUPTED;
              }
      
              if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
                      /* next allocated block in this leaf */
                      ex++;
                      goto found_extent;
              }
      
              /* go up and search for index to the right */
              while (--depth >= 0) {
                      ix = path[depth].p_idx;
                      if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
                              goto got_index;
              }
      
              /* we've gone up to the root and found no index to the right */
              return 0;
      
      got_index:
              /* we've found index to the right, let's
               * follow it and find the closest allocated
               * block to the right */
              ix++;
              block = ext4_idx_pblock(ix);
              while (++depth < path->p_depth) {
                      /* subtract from p_depth to get proper eh_depth */
                      bh = read_extent_tree_block(inode, block,
                                                  path->p_depth - depth, 0);
                      if (IS_ERR(bh))
                               return PTR_ERR(bh);
                      eh = ext_block_hdr(bh);
                      ix = EXT_FIRST_INDEX(eh);
                      block = ext4_idx_pblock(ix);
                      put_bh(bh);
              }
      
              bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
              if (IS_ERR(bh))
                      return PTR_ERR(bh);
              eh = ext_block_hdr(bh);
              ex = EXT_FIRST_EXTENT(eh);
      found_extent:
              *logical = le32_to_cpu(ex->ee_block);
              *phys = ext4_ext_pblock(ex);
              *ret_ex = ex;
              if (bh)
                      put_bh(bh);
              return 0;
       }

       /*
        * ext4_ext_next_allocated_block:
        * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
       * NOTE: it considers block number from index entry as
       * allocated block. Thus, index entries have to be consistent
       * with leaves.
        */
       ext4_lblk_t
       ext4_ext_next_allocated_block(struct ext4_ext_path *path)
       {
               int depth;

               BUG_ON(path == NULL);
               depth = path->p_depth;

               if (depth == 0 && path->p_ext == NULL)
                      return EXT_MAX_BLOCKS;
      
               while (depth >= 0) {
                      if (depth == path->p_depth) {
                              /* leaf */
                              if (path[depth].p_ext &&
                                      path[depth].p_ext !=
                                              EXT_LAST_EXTENT(path[depth].p_hdr))
                                return le32_to_cpu(path[depth].p_ext[1].ee_block);
                      } else {
                              /* index */
                              if (path[depth].p_idx !=
                                              EXT_LAST_INDEX(path[depth].p_hdr))
                                return le32_to_cpu(path[depth].p_idx[1].ei_block);
                      }
                      depth--;
               }

               return EXT_MAX_BLOCKS;
      }
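       /*
        * Example usage (an illustrative sketch, mirroring the callers below
        * such as ext4_fill_fiemap_extents() and ext4_ext_put_gap_in_cache()):
        *
        *	path = ext4_find_extent(inode, lblk, NULL, 0);
        *	if (!IS_ERR(path))
        *		next = ext4_ext_next_allocated_block(path);
        *
        * A return value of EXT_MAX_BLOCKS means there is nothing allocated
        * to the right of the position described by the path.
        */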
      
      /*
       * ext4_ext_next_leaf_block:
       * returns first allocated block from next leaf or EXT_MAX_BLOCKS
       */
      static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
      {
              int depth;
      
              BUG_ON(path == NULL);
              depth = path->p_depth;
      
               /* a zero-depth tree has no leaf blocks at all */
              if (depth == 0)
                      return EXT_MAX_BLOCKS;
      
              /* go to index block */
              depth--;
      
              while (depth >= 0) {
                      if (path[depth].p_idx !=
                                      EXT_LAST_INDEX(path[depth].p_hdr))
                              return (ext4_lblk_t)
                                      le32_to_cpu(path[depth].p_idx[1].ei_block);
                      depth--;
              }
      
               return EXT_MAX_BLOCKS;
      }
      
      /*
       * ext4_ext_correct_indexes:
       * if leaf gets modified and modified extent is first in the leaf,
       * then we have to correct all indexes above.
       * TODO: do we need to correct tree in all cases?
        */
      static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
                                      struct ext4_ext_path *path)
      {
              struct ext4_extent_header *eh;
              int depth = ext_depth(inode);
               struct ext4_extent *ex;
               __le32 border;
               int k, err = 0;
      
              eh = path[depth].p_hdr;
               ex = path[depth].p_ext;
      
              if (unlikely(ex == NULL || eh == NULL)) {
                      EXT4_ERROR_INODE(inode,
                                       "ex %p == NULL or eh %p == NULL", ex, eh);
                      return -EFSCORRUPTED;
              }
      
               if (depth == 0) {
                      /* there is no tree at all */
                      return 0;
              }
      
               if (ex != EXT_FIRST_EXTENT(eh)) {
                      /* we correct tree if first leaf got modified only */
                      return 0;
              }
      
               /*
               * TODO: we need correction if border is smaller than current one
               */
              k = depth - 1;
              border = path[depth].p_ext->ee_block;
              err = ext4_ext_get_access(handle, inode, path + k);
              if (err)
                      return err;
              path[k].p_idx->ei_block = border;
              err = ext4_ext_dirty(handle, inode, path + k);
              if (err)
                      return err;
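               /*
                * Propagate the new first block upwards for as long as this
                * subtree is still the leftmost child of its parent.
                */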
      
              while (k--) {
                      /* change all left-side indexes */
                      if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
                              break;
                      err = ext4_ext_get_access(handle, inode, path + k);
                      if (err)
                              break;
                      path[k].p_idx->ei_block = border;
                      err = ext4_ext_dirty(handle, inode, path + k);
                       if (err)
                              break;
               }

               return err;
       }

       int
      ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                                      struct ext4_extent *ex2)
      {
              unsigned short ext1_ee_len, ext2_ee_len;
      
              if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
                       return 0;

               ext1_ee_len = ext4_ext_get_actual_len(ex1);
               ext2_ee_len = ext4_ext_get_actual_len(ex2);

               if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
                              le32_to_cpu(ex2->ee_block))
                      return 0;
      
              /*
               * To allow future support for preallocated extents to be added
                * as an RO_COMPAT feature, refuse to merge two extents if
                * this can result in the top bit of ee_len being set.
                */
              if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
                      return 0;
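               /*
                * Never merge an unwritten extent while unwritten-to-written
                * conversion may still be pending on this inode, and never let
                * the merged length exceed what an unwritten extent can hold.
                */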
              if (ext4_ext_is_unwritten(ex1) &&
                  (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
                   atomic_read(&EXT4_I(inode)->i_unwritten) ||
                   (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
                      return 0;
      #ifdef AGGRESSIVE_TEST
              if (ext1_ee_len >= 4)
                      return 0;
      #endif
      
              if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                      return 1;
              return 0;
      }
      
      /*
       * This function tries to merge the "ex" extent to the next extent in the tree.
        * It always tries to merge towards right. If you want to merge towards
        * left, pass "ex - 1" as argument instead of "ex".
        * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
        * 1 if they got merged.
        */
       static int ext4_ext_try_to_merge_right(struct inode *inode,
                                       struct ext4_ext_path *path,
                                       struct ext4_extent *ex)
       {
               struct ext4_extent_header *eh;
               unsigned int depth, len;
               int merge_done = 0, unwritten;

               depth = ext_depth(inode);
               BUG_ON(path[depth].p_hdr == NULL);
               eh = path[depth].p_hdr;
      
              while (ex < EXT_LAST_EXTENT(eh)) {
                      if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
                               break;
                      /* merge with next extent! */
                      unwritten = ext4_ext_is_unwritten(ex);
                       ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                      + ext4_ext_get_actual_len(ex + 1));
                      if (unwritten)
                               ext4_ext_mark_unwritten(ex);

                       if (ex + 1 < EXT_LAST_EXTENT(eh)) {
                              len = (EXT_LAST_EXTENT(eh) - ex - 1)
                                      * sizeof(struct ext4_extent);
                              memmove(ex + 1, ex + 2, len);
                      }
                      le16_add_cpu(&eh->eh_entries, -1);
                      merge_done = 1;
                      WARN_ON(eh->eh_entries == 0);
                      if (!eh->eh_entries)
                              EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
              }
      
              return merge_done;
       }

       /*
        * This function does a very simple check to see if we can collapse
       * an extent tree with a single extent tree leaf block into the inode.
       */
      static void ext4_ext_try_to_merge_up(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_ext_path *path)
      {
              size_t s;
               unsigned max_root = ext4_ext_space_root(inode, 0);
              ext4_fsblk_t blk;
      
              if ((path[0].p_depth != 1) ||
                  (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
                  (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
                       return;
      
              /*
               * We need to modify the block allocation bitmap and the block
               * group descriptor to release the extent tree block.  If we
               * can't get the journal credits, give up.
               */
              if (ext4_journal_extend(handle, 2))
                      return;
      
              /*
               * Copy the extent data up to the inode
                */
               blk = ext4_idx_pblock(path[0].p_idx);
              s = le16_to_cpu(path[1].p_hdr->eh_entries) *
                      sizeof(struct ext4_extent_idx);
              s += sizeof(struct ext4_extent_header);
      
              path[1].p_maxdepth = path[0].p_maxdepth;
              memcpy(path[0].p_hdr, path[1].p_hdr, s);
              path[0].p_depth = 0;
              path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
                      (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
              path[0].p_hdr->eh_max = cpu_to_le16(max_root);
      
              brelse(path[1].p_bh);
              ext4_free_blocks(handle, inode, NULL, blk, 1,
                               EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
       }

       /*
        * This function tries to merge the @ex extent to its neighbours in the
        * tree.  It first tries to merge @ex with the extent to its left, then
        * with the one to its right, and finally tries to collapse the tree
        * into the inode if it has shrunk enough.
        */
       static void ext4_ext_try_to_merge(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_ext_path *path,
                                         struct ext4_extent *ex) {
               struct ext4_extent_header *eh;
               unsigned int depth;
               int merge_done = 0;
      
              depth = ext_depth(inode);
              BUG_ON(path[depth].p_hdr == NULL);
              eh = path[depth].p_hdr;
      
              if (ex > EXT_FIRST_EXTENT(eh))
                      merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
      
              if (!merge_done)
                      (void) ext4_ext_try_to_merge_right(inode, path, ex);
      
              ext4_ext_try_to_merge_up(handle, inode, path);
      }
      
      /*
       * check if a portion of the "newext" extent overlaps with an
       * existing extent.
       *
       * If there is an overlap discovered, it updates the length of the newext
       * such that there will be no overlap, and then returns 1.
        * If there is no overlap found, it returns 0.
       */
      static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
                                                  struct inode *inode,
                                                 struct ext4_extent *newext,
                                                 struct ext4_ext_path *path)
      {
              ext4_lblk_t b1, b2;
              unsigned int depth, len1;
               unsigned int ret = 0;

               b1 = le32_to_cpu(newext->ee_block);
               len1 = ext4_ext_get_actual_len(newext);
               depth = ext_depth(inode);
              if (!path[depth].p_ext)
                      goto out;
               b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));

               /*
               * get the next allocated block if the extent in the path
               * is before the requested block(s)
               */
              if (b2 < b1) {
                      b2 = ext4_ext_next_allocated_block(path);
                       if (b2 == EXT_MAX_BLOCKS)
                               goto out;
                      b2 = EXT4_LBLK_CMASK(sbi, b2);
              }
      
              /* check for wrap through zero on extent logical start block*/
              if (b1 + len1 < b1) {
                      len1 = EXT_MAX_BLOCKS - b1;
                      newext->ee_len = cpu_to_le16(len1);
                      ret = 1;
              }
      
              /* check for overlap */
              if (b1 + len1 > b2) {
                      newext->ee_len = cpu_to_le16(b2 - b1);
                      ret = 1;
              }
      out:
               return ret;
      }
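       /*
        * Worked example (illustrative, assuming cluster size == block size):
        * if newext covers logical blocks 100..119 and the extent found in the
        * path starts at block 110, then b1 = 100, len1 = 20 and b2 = 110, so
        * ee_len is trimmed to b2 - b1 = 10 blocks and 1 is returned.
        */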
      
      /*
       * ext4_ext_insert_extent:
        * tries to merge requested extent into the existing extent or
       * inserts requested extent as new one into the tree,
       * creating new leaf in the no-space case.
       */
      int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                                      struct ext4_ext_path **ppath,
                                struct ext4_extent *newext, int gb_flags)
      {
              struct ext4_ext_path *path = *ppath;
              struct ext4_extent_header *eh;
               struct ext4_extent *ex, *fex;
              struct ext4_extent *nearex; /* nearest extent */
              struct ext4_ext_path *npath = NULL;
              int depth, len, err;
              ext4_lblk_t next;
              int mb_flags = 0, unwritten;
      
              if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                      mb_flags |= EXT4_MB_DELALLOC_RESERVED;
               if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                      EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
                      return -EFSCORRUPTED;
              }
              depth = ext_depth(inode);
              ex = path[depth].p_ext;
              eh = path[depth].p_hdr;
              if (unlikely(path[depth].p_hdr == NULL)) {
                      EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                       return -EFSCORRUPTED;
               }

               /* try to insert block into found extent and return */
               if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
      
                       /*
                        * Try to see whether we should rather test the extent on
                       * right from ex, or from the left of ex. This is because
                       * ext4_find_extent() can return either extent on the
                       * left, or on the right from the searched position. This
                       * will make merging more effective.
                       */
                       if (ex < EXT_LAST_EXTENT(eh) &&
                          (le32_to_cpu(ex->ee_block) +
                          ext4_ext_get_actual_len(ex) <
                          le32_to_cpu(newext->ee_block))) {
                              ex += 1;
                              goto prepend;
                      } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
                                 (le32_to_cpu(newext->ee_block) +
                                 ext4_ext_get_actual_len(newext) <
                           le32_to_cpu(ex->ee_block)))
                              ex -= 1;
      
                      /* Try to append newex to the ex */
                       if (ext4_can_extents_be_merged(inode, ex, newext)) {
                               ext_debug("append [%d]%d block to %u:[%d]%d"
                                        "(from %llu)\n",
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        le32_to_cpu(ex->ee_block),
                                        ext4_ext_is_unwritten(ex),
                                        ext4_ext_get_actual_len(ex),
                                        ext4_ext_pblock(ex));
                              err = ext4_ext_get_access(handle, inode,
                                                        path + depth);
                              if (err)
                                       return err;
                              unwritten = ext4_ext_is_unwritten(ex);
                              ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                              + ext4_ext_get_actual_len(newext));
                              if (unwritten)
                                      ext4_ext_mark_unwritten(ex);
                              eh = path[depth].p_hdr;
                              nearex = ex;
                              goto merge;
                       }

       prepend:
                      /* Try to prepend newex to the ex */
                      if (ext4_can_extents_be_merged(inode, newext, ex)) {
                              ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
                                         "(from %llu)\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_is_unwritten(newext),
                                         ext4_ext_get_actual_len(newext),
                                        le32_to_cpu(ex->ee_block),
                                        ext4_ext_is_unwritten(ex),
                                         ext4_ext_get_actual_len(ex),
                                        ext4_ext_pblock(ex));
                              err = ext4_ext_get_access(handle, inode,
                                                        path + depth);
                              if (err)
                                      return err;
      
                              unwritten = ext4_ext_is_unwritten(ex);
                              ex->ee_block = newext->ee_block;
                               ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                              ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                              + ext4_ext_get_actual_len(newext));
                              if (unwritten)
                                       ext4_ext_mark_unwritten(ex);
                              eh = path[depth].p_hdr;
                              nearex = ex;
                               goto merge;
                      }
              }
      
              depth = ext_depth(inode);
              eh = path[depth].p_hdr;
              if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
                      goto has_space;
      
              /* probably next leaf has space for us? */
              fex = EXT_LAST_EXTENT(eh);
              next = EXT_MAX_BLOCKS;
              if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
                      next = ext4_ext_next_leaf_block(path);
              if (next != EXT_MAX_BLOCKS) {
                      ext_debug("next leaf block - %u\n", next);
                      BUG_ON(npath != NULL);
                      npath = ext4_find_extent(inode, next, NULL, 0);
                      if (IS_ERR(npath))
                              return PTR_ERR(npath);
                      BUG_ON(npath->p_depth != path->p_depth);
                      eh = npath[depth].p_hdr;
                      if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
                               ext_debug("next leaf isn't full(%d)\n",
                                         le16_to_cpu(eh->eh_entries));
                               path = npath;
                              goto has_space;
                      }
                      ext_debug("next leaf has no free space(%d,%d)\n",
                                le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
               }
      
              /*
                * There is no free space in the found leaf.
                * We are going to add a new leaf to the tree.
                */
              if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                      mb_flags |= EXT4_MB_USE_RESERVED;
              err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
                                              ppath, newext);
              if (err)
                      goto cleanup;
              depth = ext_depth(inode);
              eh = path[depth].p_hdr;
      
      has_space:
               nearex = path[depth].p_ext;
      
               err = ext4_ext_get_access(handle, inode, path + depth);
              if (err)
                      goto cleanup;
      
              if (!nearex) {
                      /* there is no extent in this leaf, create first one */
                      ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
                                      le32_to_cpu(newext->ee_block),
                                      ext4_ext_pblock(newext),
                                      ext4_ext_is_unwritten(newext),
                                       ext4_ext_get_actual_len(newext));
                      nearex = EXT_FIRST_EXTENT(eh);
              } else {
                       if (le32_to_cpu(newext->ee_block)
                                 > le32_to_cpu(nearex->ee_block)) {
                              /* Insert after */
                              ext_debug("insert %u:%llu:[%d]%d before: "
                                              "nearest %p\n",
                                              le32_to_cpu(newext->ee_block),
                                              ext4_ext_pblock(newext),
                                              ext4_ext_is_unwritten(newext),
                                              ext4_ext_get_actual_len(newext),
                                               nearex);
                               nearex++;
                       } else {
                              /* Insert before */
                              BUG_ON(newext->ee_block == nearex->ee_block);
                              ext_debug("insert %u:%llu:[%d]%d after: "
                                              "nearest %p\n",
                                              le32_to_cpu(newext->ee_block),
                                              ext4_ext_pblock(newext),
                                              ext4_ext_is_unwritten(newext),
                                              ext4_ext_get_actual_len(newext),
                                              nearex);
                      }
                      len = EXT_LAST_EXTENT(eh) - nearex + 1;
                       if (len > 0) {
                              ext_debug("insert %u:%llu:[%d]%d: "
                                              "move %d extents from 0x%p to 0x%p\n",
                                              le32_to_cpu(newext->ee_block),
                                              ext4_ext_pblock(newext),
                                              ext4_ext_is_unwritten(newext),
                                              ext4_ext_get_actual_len(newext),
                                              len, nearex, nearex + 1);
                              memmove(nearex + 1, nearex,
                                       len * sizeof(struct ext4_extent));
                      }
              }
      
               le16_add_cpu(&eh->eh_entries, 1);
              path[depth].p_ext = nearex;
              nearex->ee_block = newext->ee_block;
              ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
               nearex->ee_len = newext->ee_len;
      
      merge:
               /* try to merge extents */
               if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
                       ext4_ext_try_to_merge(handle, inode, path, nearex);
      
      
              /* time to correct all indexes above */
              err = ext4_ext_correct_indexes(handle, inode, path);
              if (err)
                      goto cleanup;
      
              err = ext4_ext_dirty(handle, inode, path + path->p_depth);
      
      cleanup:
              ext4_ext_drop_refs(npath);
              kfree(npath);
              return err;
      }
      
       static int ext4_fill_fiemap_extents(struct inode *inode,
                                          ext4_lblk_t block, ext4_lblk_t num,
                                          struct fiemap_extent_info *fieinfo)
       {
              struct ext4_ext_path *path = NULL;
              struct ext4_extent *ex;
              struct extent_status es;
              ext4_lblk_t next, next_del, start = 0, end = 0;
              ext4_lblk_t last = block + num;
              int exists, depth = 0, err = 0;
              unsigned int flags = 0;
              unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
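               /*
                * Walk the extent tree leaf by leaf, reporting each mapped or
                * delayed range to fiemap until the requested range has been
                * covered or the last extent has been reached.
                */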
      
               while (block < last && block != EXT_MAX_BLOCKS) {
                      num = last - block;
                      /* find extent for this block */
                      down_read(&EXT4_I(inode)->i_data_sem);
      
                      path = ext4_find_extent(inode, block, &path, 0);
                      if (IS_ERR(path)) {
                               up_read(&EXT4_I(inode)->i_data_sem);
                              err = PTR_ERR(path);
                              path = NULL;
                              break;
                      }
      
                      depth = ext_depth(inode);
                      if (unlikely(path[depth].p_hdr == NULL)) {
                              up_read(&EXT4_I(inode)->i_data_sem);
                              EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                               err = -EFSCORRUPTED;
                              break;
                      }
                      ex = path[depth].p_ext;
                       next = ext4_ext_next_allocated_block(path);
      
                      flags = 0;
                       exists = 0;
                      if (!ex) {
                              /* there is no extent yet, so try to allocate
                               * all requested space */
                               start = block;
                              end = block + num;
                      } else if (le32_to_cpu(ex->ee_block) > block) {
                              /* need to allocate space before found extent */
                              start = block;
                              end = le32_to_cpu(ex->ee_block);
                              if (block + num < end)
                                      end = block + num;
                      } else if (block >= le32_to_cpu(ex->ee_block)
                                              + ext4_ext_get_actual_len(ex)) {
                               /* need to allocate space after found extent */
                              start = block;
                              end = block + num;
                              if (end >= next)
                                      end = next;
                      } else if (block >= le32_to_cpu(ex->ee_block)) {
                               /*
                                * some part of requested space is covered
                                * by found extent
                                */
                               start = block;
                              end = le32_to_cpu(ex->ee_block)
                                      + ext4_ext_get_actual_len(ex);
                              if (block + num < end)
                                       end = block + num;
                              exists = 1;
                      } else {
                              BUG();
                      }
                      BUG_ON(end <= start);
      
                      if (!exists) {
                              es.es_lblk = start;
                              es.es_len = end - start;
                              es.es_pblk = 0;
                       } else {
                               es.es_lblk = le32_to_cpu(ex->ee_block);
                               es.es_len = ext4_ext_get_actual_len(ex);
                               es.es_pblk = ext4_ext_pblock(ex);
                              if (ext4_ext_is_unwritten(ex))
                                      flags |= FIEMAP_EXTENT_UNWRITTEN;
                       }
      
                      /*
                       * Find delayed extent and update es accordingly. We call
                       * it even in !exists case to find out whether es is the
                       * last existing extent or not.
                       */
                      next_del = ext4_find_delayed_extent(inode, &es);
                      if (!exists && next_del) {
                              exists = 1;
                              flags |= (FIEMAP_EXTENT_DELALLOC |
                                        FIEMAP_EXTENT_UNKNOWN);
                      }
                      up_read(&EXT4_I(inode)->i_data_sem);
      
                      if (unlikely(es.es_len == 0)) {
                              EXT4_ERROR_INODE(inode, "es.es_len == 0");
                              err = -EFSCORRUPTED;
                              break;
                       }

                       /*
                        * This is possible iff next == next_del == EXT_MAX_BLOCKS.
                        * We need the next == EXT_MAX_BLOCKS check because an
                        * extent can carry both unwritten and delayed status:
                        * when a delayed allocated range is later allocated by
                        * fallocate, the extent status tree tracks both states
                        * in a single extent.
                        *
                        * So we could return an unwritten and delayed extent
                        * whose block is equal to 'next'.
                        */
                       if (next == next_del && next == EXT_MAX_BLOCKS) {
                              flags |= FIEMAP_EXTENT_LAST;
                              if (unlikely(next_del != EXT_MAX_BLOCKS ||
                                           next != EXT_MAX_BLOCKS)) {
                                       EXT4_ERROR_INODE(inode,
                                                       "next extent == %u, next "
                                                       "delalloc extent = %u",
                                                       next, next_del);
                                       err = -EFSCORRUPTED;
                                      break;
                              }
                      }
      
                      if (exists) {
                               err = fiemap_fill_next_extent(fieinfo,
                                      (__u64)es.es_lblk << blksize_bits,
                                      (__u64)es.es_pblk << blksize_bits,
                                       (__u64)es.es_len << blksize_bits,
                                      flags);
                              if (err < 0)
                                      break;
                              if (err == 1) {
                                      err = 0;
                                      break;
                              }
                      }
      
                      block = es.es_lblk + es.es_len;
              }
      
              ext4_ext_drop_refs(path);
               kfree(path);
              return err;
      }
      
      /*
       * ext4_ext_put_gap_in_cache:
       * calculate boundaries of the gap that the requested block fits into
       * and cache this gap
       */
      static void
      ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
                                      ext4_lblk_t block)
       {
              int depth = ext_depth(inode);
               ext4_lblk_t len;
              ext4_lblk_t lblock;
              struct ext4_extent *ex;
              struct extent_status es;
      
              ex = path[depth].p_ext;
               if (ex == NULL) {
                      /* there is no extent yet, so gap is [0;-] */
                      lblock = 0;
                      len = EXT_MAX_BLOCKS;
                      ext_debug("cache gap(whole file):");
               } else if (block < le32_to_cpu(ex->ee_block)) {
                      lblock = block;
                      len = le32_to_cpu(ex->ee_block) - block;
                      ext_debug("cache gap(before): %u [%u:%u]",
                                      block,
                                      le32_to_cpu(ex->ee_block),
                                       ext4_ext_get_actual_len(ex));
              } else if (block >= le32_to_cpu(ex->ee_block)
                              + ext4_ext_get_actual_len(ex)) {
                      ext4_lblk_t next;
                      lblock = le32_to_cpu(ex->ee_block)
                               + ext4_ext_get_actual_len(ex);
      
                      next = ext4_ext_next_allocated_block(path);
                       ext_debug("cache gap(after): [%u:%u] %u",
                                       le32_to_cpu(ex->ee_block),
                                       ext4_ext_get_actual_len(ex),
                                      block);
                      BUG_ON(next == lblock);
                       len = next - lblock;
              } else {
                      BUG();
              }
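               /*
                * Shrink the gap, or skip caching it entirely, if a delayed
                * extent overlaps it; then record what is left as a hole in
                * the extent status tree.
                */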
      
              ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
              if (es.es_len) {
                      /* There's delayed extent containing lblock? */
                      if (es.es_lblk <= lblock)
                              return;
                      len = min(es.es_lblk - lblock, len);
              }
              ext_debug(" -> %u:%u\n", lblock, len);
              ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
       }
      
       /*
       * ext4_ext_rm_idx:
       * removes index from the index block.
       */
      static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                               struct ext4_ext_path *path, int depth)
      {
              int err;
               ext4_fsblk_t leaf;

               /* free index block */
              depth--;
              path = path + depth;
              leaf = ext4_idx_pblock(path->p_idx);
              if (unlikely(path->p_hdr->eh_entries == 0)) {
                       EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                      return -EFSCORRUPTED;
              }
              err = ext4_ext_get_access(handle, inode, path);
              if (err)
                       return err;
      
               if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
                      int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
                      len *= sizeof(struct ext4_extent_idx);
                       memmove(path->p_idx, path->p_idx + 1, len);
              }
      
              le16_add_cpu(&path->p_hdr->eh_entries, -1);
  165         err = ext4_ext_dirty(handle, inode, path);
              if (err)
                      return err;
              ext_debug("index is empty, remove it, free block %llu\n", leaf);
              trace_ext4_ext_rm_idx(inode, leaf);
      
              ext4_free_blocks(handle, inode, NULL, leaf, 1,
                               EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
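               /*
                * If the removed entry was the leftmost one in its block, the
                * parent's key must be updated to the new first logical block;
                * keep propagating upwards while that remains the case.
                */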
      
              while (--depth >= 0) {
                      if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
                              break;
                      path--;
                      err = ext4_ext_get_access(handle, inode, path);
                      if (err)
                              break;
                      path->p_idx->ei_block = (path+1)->p_idx->ei_block;
                      err = ext4_ext_dirty(handle, inode, path);
                      if (err)
                              break;
              }
               return err;
       }
      
      /*
       * ext4_ext_calc_credits_for_single_extent:
        * This routine returns the maximum number of credits needed to insert
        * an extent into the extent tree.
        * When passing the actual path, the caller should calculate credits
        * under i_data_sem.
       */
      int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                                                      struct ext4_ext_path *path)
      {
              if (path) {
                      int depth = ext_depth(inode);
                      int ret = 0;
      
                       /* probably there is space in leaf? */
                      if (le16_to_cpu(path[depth].p_hdr->eh_entries)
                                      < le16_to_cpu(path[depth].p_hdr->eh_max)) {
      
                              /*
                                *  There is some space in the leaf, so there is no
                                *  need to account for a leaf block credit.
                               *
                               *  bitmaps and block group descriptor blocks
                               *  and other metadata blocks still need to be
                               *  accounted.
                               */
                              /* 1 bitmap, 1 block group descriptor */
                              ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
                              return ret;
                      }
              }
      
              return ext4_chunk_trans_blocks(inode, nrblocks);
      }
      
      /*
       * How many index/leaf blocks need to change/allocate to add @extents extents?
        *
        * If we add a single extent, then in the worst case each tree level's
        * index/leaf block may need to be changed in case of a tree split.
        *
        * If more extents are inserted, they could cause the whole tree to split
        * more than once, but this is really rare.
        */
      int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
      {
              int index;
              int depth;
      
              /* If we are converting the inline data, only one is needed here. */
              if (ext4_has_inline_data(inode))
                       return 1;

               depth = ext_depth(inode);

               if (extents <= 1)
                      index = depth * 2;
              else
                      index = depth * 3;
      
              return index;
      }
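       /*
        * For example (illustrative): with a depth-2 tree, a single-extent
        * insert is charged 2 * 2 = 4 index/leaf blocks, covering a possible
        * split at each level; inserting several extents is charged depth * 3
        * to leave room for an extra split per level.
        */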
      
      static inline int get_default_free_blocks_flags(struct inode *inode)
       {
               if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
                       return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
               else if (ext4_should_journal_data(inode))
                      return EXT4_FREE_BLOCKS_FORGET;
              return 0;
      }
      
      static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                                    struct ext4_extent *ex,
                                    long long *partial_cluster,
                                    ext4_lblk_t from, ext4_lblk_t to)
      {
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
               unsigned short ee_len = ext4_ext_get_actual_len(ex);
              ext4_fsblk_t pblk;
              int flags = get_default_free_blocks_flags(inode);
      
              /*
               * For bigalloc file systems, we never free a partial cluster
                * at the beginning of the extent.  Instead, we make a note
               * that we tried freeing the cluster, and check to see if we
               * need to free it on a subsequent call to ext4_remove_blocks,
               * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
               */
              flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
      
              trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
              /*
               * If we have a partial cluster, and it's different from the
               * cluster of the last block, we need to explicitly free the
               * partial cluster here.
               */
              pblk = ext4_ext_pblock(ex) + ee_len - 1;
              if (*partial_cluster > 0 &&
                  *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
                      ext4_free_blocks(handle, inode, NULL,
                                       EXT4_C2B(sbi, *partial_cluster),
                                       sbi->s_cluster_ratio, flags);
                      *partial_cluster = 0;
              }
      
      #ifdef EXTENTS_STATS
              {
                       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                       spin_lock(&sbi->s_ext_stats_lock);
                      sbi->s_ext_blocks += ee_len;
                      sbi->s_ext_extents++;
                      if (ee_len < sbi->s_ext_min)
                              sbi->s_ext_min = ee_len;
                       if (ee_len > sbi->s_ext_max)
                              sbi->s_ext_max = ee_len;
                      if (ext_depth(inode) > sbi->s_depth_max)
                              sbi->s_depth_max = ext_depth(inode);
                      spin_unlock(&sbi->s_ext_stats_lock);
              }
      #endif
              if (from >= le32_to_cpu(ex->ee_block)
                  && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
                      /* tail removal */
                      ext4_lblk_t num;
                      long long first_cluster;
      
                       num = le32_to_cpu(ex->ee_block) + ee_len - from;
                      pblk = ext4_ext_pblock(ex) + ee_len - num;
                      /*
                       * Usually we want to free partial cluster at the end of the
                       * extent, except for the situation when the cluster is still
                       * used by any other extent (partial_cluster is negative).
                       */
                      if (*partial_cluster < 0 &&
                          *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
                              flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
      
                      ext_debug("free last %u blocks starting %llu partial %lld\n",
                                num, pblk, *partial_cluster);
                      ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
                      /*
                       * If the block range to be freed didn't start at the
                       * beginning of a cluster, and we removed the entire
                       * extent and the cluster is not used by any other extent,
                       * save the partial cluster here, since we might need to
                       * delete if we determine that the truncate or punch hole
                       * operation has removed all of the blocks in the cluster.
                       * If that cluster is used by another extent, preserve its
                       * negative value so it isn't freed later on.
                       *
                       * If the whole extent wasn't freed, we've reached the
                       * start of the truncated/punched region and have finished
                       * removing blocks.  If there's a partial cluster here it's
                       * shared with the remainder of the extent and is no longer
                       * a candidate for removal.
                       */
                      if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
                              first_cluster = (long long) EXT4_B2C(sbi, pblk);
                              if (first_cluster != -*partial_cluster)
                                      *partial_cluster = first_cluster;
                      } else {
                              *partial_cluster = 0;
                      }
              } else
                      ext4_error(sbi->s_sb, "strange request: removal(2) "
                                 "%u-%u from %u:%u\n",
                                 from, to, le32_to_cpu(ex->ee_block), ee_len);
              return 0;
      }
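
       /*
        * Illustrative sketch (not part of ext4): on a bigalloc file system a
        * physical block maps to its cluster by a right shift of the cluster
        * size in bits (this is what EXT4_B2C() does), and *partial_cluster
        * above uses a sign convention: a positive value names a cluster that
        * may still have to be freed, while a negative value names a cluster
        * known to be shared with blocks we are keeping and which must never
        * be freed.  The helper names and the cluster_bits parameter below are
        * hypothetical.
        */
       #if 0 /* example only, not compiled */
       static long long example_block_to_cluster(unsigned long long pblk,
                                                 unsigned int cluster_bits)
       {
               /* with cluster_bits == 4 (16 blocks per cluster), block 35 -> cluster 2 */
               return (long long)(pblk >> cluster_bits);
       }

       static void example_mark_cluster_in_use(long long *partial_cluster,
                                               unsigned long long pblk,
                                               unsigned int cluster_bits)
       {
               /* negative value: remember the cluster but forbid freeing it later */
               *partial_cluster = -example_block_to_cluster(pblk, cluster_bits);
       }
       #endif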
      
      
      /*
       * ext4_ext_rm_leaf() Removes the extents associated with the
       * blocks appearing between "start" and "end".  Both "start"
       * and "end" must appear in the same extent or EIO is returned.
       *
       * @handle: The journal handle
        * @inode:  The file's inode
       * @path:   The path to the leaf
        * @partial_cluster: The cluster which we'll have to free if all extents
        *                   have been released from it.  However, if this value is
       *                   negative, it's a cluster just to the right of the
       *                   punched region and it must not be freed.
       * @start:  The first block to remove
       * @end:   The last block to remove
       */
      static int
      ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                       struct ext4_ext_path *path,
                       long long *partial_cluster,
                       ext4_lblk_t start, ext4_lblk_t end)
      {
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
              int err = 0, correct_index = 0;
  183         int depth = ext_depth(inode), credits;
              struct ext4_extent_header *eh;
              ext4_lblk_t a, b;
              unsigned num;
              ext4_lblk_t ex_ee_block;
              unsigned short ex_ee_len;
              unsigned unwritten = 0;
  442         struct ext4_extent *ex;
              ext4_fsblk_t pblk;
  431 
              /* the header must be checked already in ext4_ext_remove_space() */
  442         ext_debug("truncate since %u in leaf to %u\n", start, end);
  442         if (!path[depth].p_hdr)
                      path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
  442         eh = path[depth].p_hdr;
              if (unlikely(path[depth].p_hdr == NULL)) {
  442                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
  334                 return -EFSCORRUPTED;
              }
  320         /* find where to start removing */
              ex = path[depth].p_ext;
              if (!ex)
                      ex = EXT_LAST_EXTENT(eh);
      
              ex_ee_block = le32_to_cpu(ex->ee_block);
              ex_ee_len = ext4_ext_get_actual_len(ex);
      
              trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
  320 
  320         while (ex >= EXT_FIRST_EXTENT(eh) &&
                              ex_ee_block + ex_ee_len > start) {
      
                      if (ext4_ext_is_unwritten(ex))
                              unwritten = 1;
                      else
                              unwritten = 0;
      
                      ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
                                unwritten, ex_ee_len);
                      path[depth].p_ext = ex;
      
                      a = ex_ee_block > start ? ex_ee_block : start;
                      b = ex_ee_block+ex_ee_len - 1 < end ?
    2                         ex_ee_block+ex_ee_len - 1 : end;
      
                      ext_debug("  border %u:%u\n", a, b);
      
                      /* If this extent is beyond the end of the hole, skip it */
    2                 if (end < ex_ee_block) {
                              /*
                               * We're going to skip this extent and move to another,
                               * so note that its first cluster is in use to avoid
                               * freeing it when removing blocks.  Eventually, the
                               * right edge of the truncated/punched region will
                               * be just to the left.
                               */
                              if (sbi->s_cluster_ratio > 1) {
                                      pblk = ext4_ext_pblock(ex);
                                      *partial_cluster =
                                              -(long long) EXT4_B2C(sbi, pblk);
                              }
                              ex--;
   25                         ex_ee_block = le32_to_cpu(ex->ee_block);
                              ex_ee_len = ext4_ext_get_actual_len(ex);
                              continue;
                      } else if (b != ex_ee_block + ex_ee_len - 1) {
                              EXT4_ERROR_INODE(inode,
                                               "can not handle truncate %u:%u "
                                               "on extent %u:%u",
                                               start, end, ex_ee_block,
                                               ex_ee_block + ex_ee_len - 1);
                              err = -EFSCORRUPTED;
                              goto out;
  320                 } else if (a != ex_ee_block) {
                              /* remove tail of the extent */
                              num = a - ex_ee_block;
  308                 } else {
                              /* remove whole extent: excellent! */
  320                         num = 0;
                      }
                      /*
                       * 3 for leaf, sb, and inode plus 2 (bmap and group
                       * descriptor) for each block group; assume two block
                       * groups plus ex_ee_len/blocks_per_block_group for
  320                  * the worst case
                       */
                      credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
                      if (ex == EXT_FIRST_EXTENT(eh)) {
  320                         correct_index = 1;
                              credits += (ext_depth(inode)) + 1;
                      }
                      credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
      
  320                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
                      if (err)
  314                         goto out;
      
   25                 err = ext4_ext_get_access(handle, inode, path + depth);
                      if (err)
                              goto out;
      
                      err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
                                               a, b);
   12                 if (err)
                              goto out;
      
                      if (num == 0)
                              /* this extent is removed; mark slot entirely unused */
                              ext4_ext_store_pblock(ex, 0);
      
                      ex->ee_len = cpu_to_le16(num);
                      /*
                       * Do not mark unwritten if all the blocks in the
                       * extent have been removed.
                       */
   22                 if (unwritten && num)
                              ext4_ext_mark_unwritten(ex);
                      /*
                       * If the extent was completely released,
                       * we need to remove it from the leaf
                       */
                      if (num == 0) {
  314                         if (end != EXT_MAX_BLOCKS - 1) {
                                      /*
                                       * For hole punching, we need to scoot all the
  320                                  * extents up when an extent is removed so that
                                        * we don't have blank extents in the middle
                                       */
                                      memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
                                              sizeof(struct ext4_extent));
      
  320                                 /* Now get rid of the one at the end */
                                      memset(EXT_LAST_EXTENT(eh), 0,
  320                                         sizeof(struct ext4_extent));
                              }
                              le16_add_cpu(&eh->eh_entries, -1);
  317                 }
   28 
                      err = ext4_ext_dirty(handle, inode, path + depth);
                      if (err)
                              goto out;
      
                      ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
                                      ext4_ext_pblock(ex));
                      ex--;
                      ex_ee_block = le32_to_cpu(ex->ee_block);
  434                 ex_ee_len = ext4_ext_get_actual_len(ex);
              }
      
              if (correct_index && eh->eh_entries)
                      err = ext4_ext_correct_indexes(handle, inode, path);
      
              /*
               * If there's a partial cluster and at least one extent remains in
               * the leaf, free the partial cluster if it isn't shared with the
               * current extent.  If it is shared with the current extent
               * we zero partial_cluster because we've reached the start of the
               * truncated/punched region and we're done removing blocks.
               */
  441         if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
  165                 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
                      if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
                              ext4_free_blocks(handle, inode, NULL,
                                               EXT4_C2B(sbi, *partial_cluster),
                                               sbi->s_cluster_ratio,
                                               get_default_free_blocks_flags(inode));
                      }
                      *partial_cluster = 0;
              }
      
              /* if this leaf is free, then we should
               * remove it from index block above */
              if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
                      err = ext4_ext_rm_idx(handle, inode, path, depth);
      
  184 out:
              return err;
      }
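
       /*
        * Illustrative sketch (not part of ext4): the journal credit estimate
        * used in ext4_ext_rm_leaf() above is "3 (leaf, superblock, inode) plus
        * 2 (bitmap and group descriptor) for each of two assumed block groups,
        * plus 2 more per full group's worth of blocks in the extent", i.e.
        * 7 + 2 * (len / blocks_per_group), with tree depth + 1 added when the
        * first extent of the leaf is touched and index blocks may need
        * updating.  The function below only mirrors that arithmetic; its name
        * is hypothetical and quota credits are left out.
        */
       #if 0 /* example only, not compiled */
       static int example_rm_leaf_credits(unsigned int ext_len,
                                          unsigned int blocks_per_group,
                                          int tree_depth, int touches_first_extent)
       {
               int credits = 7 + 2 * (ext_len / blocks_per_group);

               if (touches_first_extent)
                       credits += tree_depth + 1;
               return credits;
       }
       #endif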
      
      /*
       * ext4_ext_more_to_rm:
       * returns 1 if current index has to be freed (even partial)
  183  */
      static int
      ext4_ext_more_to_rm(struct ext4_ext_path *path)
      {
              BUG_ON(path->p_idx == NULL);
      
              if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
                      return 0;
  444 
              /*
               * if truncate on deeper level happened, it wasn't partial,
               * so we have to consider current index for truncation
               */
              if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
                      return 0;
              return 1;
      }
      
      int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                ext4_lblk_t end)
  443 {
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
              int depth = ext_depth(inode);
  444         struct ext4_ext_path *path = NULL;
              long long partial_cluster = 0;
              handle_t *handle;
              int i = 0, err = 0;
      
              ext_debug("truncate since %u to %u\n", start, end);
      
               /* the first extent we free will probably be the last one in its block */
              handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
  444         if (IS_ERR(handle))
                      return PTR_ERR(handle);
      
      again:
              trace_ext4_ext_remove_space(inode, start, end, depth);
      
   25         /*
               * Check if we are removing extents inside the extent tree. If that
               * is the case, we are going to punch a hole inside the extent tree
               * so we have to check whether we need to split the extent covering
               * the last block to remove so we can easily remove the part of it
   25          * in ext4_ext_rm_leaf().
               */
              if (end < EXT_MAX_BLOCKS - 1) {
                      struct ext4_extent *ex;
    3                 ext4_lblk_t ee_block, ex_end, lblk;
                      ext4_fsblk_t pblk;
      
                      /* find extent for or closest extent to this block */
                      path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
                      if (IS_ERR(path)) {
    3                         ext4_journal_stop(handle);
                              return PTR_ERR(path);
                      }
   23                 depth = ext_depth(inode);
                       /* Leaf may not exist only if inode has no blocks at all */
                      ex = path[depth].p_ext;
                      if (!ex) {
                              if (depth) {
                                      EXT4_ERROR_INODE(inode,
                                                       "path[%d].p_hdr == NULL",
                                                       depth);
                                      err = -EFSCORRUPTED;
   22                         }
                              goto out;
                      }
      
                      ee_block = le32_to_cpu(ex->ee_block);
                      ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
      
    1                 /*
                       * See if the last block is inside the extent, if so split
                       * the extent at 'end' block so we can easily remove the
                       * tail of the first part of the split extent in
                       * ext4_ext_rm_leaf().
                       */
                      if (end >= ee_block && end < ex_end) {
      
                              /*
                               * If we're going to split the extent, note that
                               * the cluster containing the block after 'end' is
                               * in use to avoid freeing it when removing blocks.
    1                          */
                              if (sbi->s_cluster_ratio > 1) {
                                       pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
                                      partial_cluster =
                                              -(long long) EXT4_B2C(sbi, pblk);
   23                         }
      
                              /*
                               * Split the extent in two so that 'end' is the last
                               * block in the first new extent. Also we should not
                               * fail removing space due to ENOSPC so try to use
                               * reserved block if that happens.
                               */
                              err = ext4_force_split_extent_at(handle, inode, &path,
                                                               end + 1, 1);
                              if (err < 0)
                                      goto out;
      
                      } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
                              /*
                               * If there's an extent to the right its first cluster
   23                          * contains the immediate right boundary of the
                               * truncated/punched region.  Set partial_cluster to
                               * its negative value so it won't be freed if shared
                               * with the current extent.  The end < ee_block case
                               * is handled in ext4_ext_rm_leaf().
                               */
                              lblk = ex_end + 1;
                              err = ext4_ext_search_right(inode, path, &lblk, &pblk,
  442                                                     &ex);
                              if (err)
   23                                 goto out;
                              if (pblk)
                                      partial_cluster =
                                              -(long long) EXT4_B2C(sbi, pblk);
  431                 }
              }
              /*
               * We start scanning from right side, freeing all the blocks
               * after i_size and walking into the tree depth-wise.
               */
  431         depth = ext_depth(inode);
              if (path) {
                      int k = i = depth;
                      while (--k > 0)
                              path[k].p_block =
                                      le16_to_cpu(path[k].p_hdr->eh_entries)+1;
              } else {
                      path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
                                     GFP_NOFS);
                      if (path == NULL) {
                              ext4_journal_stop(handle);
  442                         return -ENOMEM;
  442                 }
                      path[0].p_maxdepth = path[0].p_depth = depth;
  442                 path[0].p_hdr = ext_inode_hdr(inode);
                      i = 0;
      
                      if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
  441                         err = -EFSCORRUPTED;
  441                         goto out;
                      }
              }
              err = 0;
      
              while (i >= 0 && err == 0) {
  184                 if (i == depth) {
                              /* this is leaf block */
                              err = ext4_ext_rm_leaf(handle, inode, path,
                                                     &partial_cluster, start,
                                                     end);
  184                         /* root level has p_bh == NULL, brelse() eats this */
                              brelse(path[i].p_bh);
  183                         path[i].p_bh = NULL;
                              i--;
                              continue;
                      }
      
                      /* this is index block */
                      if (!path[i].p_hdr) {
  182                         ext_debug("initialize header\n");
                              path[i].p_hdr = ext_block_hdr(path[i].p_bh);
                      }
  184 
                      if (!path[i].p_idx) {
                              /* this level hasn't been touched yet */
  184                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
                              path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
                              ext_debug("init index ptr: hdr 0x%p, num %d\n",
  183                                   path[i].p_hdr,
                                        le16_to_cpu(path[i].p_hdr->eh_entries));
                      } else {
                              /* we were already here, see at next index */
                              path[i].p_idx--;
                      }
      
                      ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
                                      i, EXT_FIRST_INDEX(path[i].p_hdr),
                                      path[i].p_idx);
                      if (ext4_ext_more_to_rm(path + i)) {
                              struct buffer_head *bh;
                              /* go to the next level */
  183                         ext_debug("move to level %d (block %llu)\n",
                                        i + 1, ext4_idx_pblock(path[i].p_idx));
                              memset(path + i + 1, 0, sizeof(*path));
                              bh = read_extent_tree_block(inode,
                                      ext4_idx_pblock(path[i].p_idx), depth - i - 1,
  183                                 EXT4_EX_NOCACHE);
                              if (IS_ERR(bh)) {
                                      /* should we reset i_size? */
                                      err = PTR_ERR(bh);
                                      break;
                              }
                              /* Yield here to deal with large extent trees.
                               * Should be a no-op if we did IO above. */
  182                         cond_resched();
                              if (WARN_ON(i + 1 > depth)) {
                                      err = -EFSCORRUPTED;
                                      break;
                              }
                              path[i + 1].p_bh = bh;
      
  182                         /* save actual number of indexes since this
  182                          * number is changed at the next iteration */
                              path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
                              i++;
                      } else {
                              /* we finished processing this index, go up */
                              if (path[i].p_hdr->eh_entries == 0 && i > 0) {
                                       /* index is empty, remove it;
                                        * handle must already be prepared by the
                                        * leaf truncation code above */
                                      err = ext4_ext_rm_idx(handle, inode, path, i);
                              }
                              /* root level has p_bh == NULL, brelse() eats this */
                              brelse(path[i].p_bh);
                              path[i].p_bh = NULL;
                              i--;
  441                         ext_debug("return to level %d\n", i);
                      }
  431         }
      
              trace_ext4_ext_remove_space_done(inode, start, end, depth,
                              partial_cluster, path->p_hdr->eh_entries);
      
              /*
               * If we still have something in the partial cluster and we have removed
  441          * even the first extent, then we should free the blocks in the partial
               * cluster as well.  (This code will only run when there are no leaves
               * to the immediate left of the truncated/punched region.)
               */
              if (partial_cluster > 0 && err == 0) {
  409                 /* don't zero partial_cluster since it's not used afterwards */
                      ext4_free_blocks(handle, inode, NULL,
  409                                  EXT4_C2B(sbi, partial_cluster),
                                       sbi->s_cluster_ratio,
                                       get_default_free_blocks_flags(inode));
              }
      
              /* TODO: flexible tree reduction should be here */
              if (path->p_hdr->eh_entries == 0) {
  443                 /*
                       * truncate to zero freed all the tree,
                       * so we need to correct eh_depth
                       */
                      err = ext4_ext_get_access(handle, inode, path);
  443                 if (err == 0) {
                              ext_inode_hdr(inode)->eh_depth = 0;
                              ext_inode_hdr(inode)->eh_max =
                                      cpu_to_le16(ext4_ext_space_root(inode, 0));
                              err = ext4_ext_dirty(handle, inode, path);
                      }
              }
      out:
              ext4_ext_drop_refs(path);
              kfree(path);
              path = NULL;
              if (err == -EAGAIN)
                      goto again;
              ext4_journal_stop(handle);
      
              return err;
      }
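
       /*
        * Illustrative sketch (not part of ext4): when the punched region ends
        * inside an extent, the block whose cluster has to be protected is the
        * one immediately after 'end'.  Within an extent the logical offset
        * (lblk - ee_block) equals the physical offset, so that block lives at
        * ext4_ext_pblock(ex) + (end - ee_block) + 1.  The helper below merely
        * restates that arithmetic with hypothetical parameter names.
        */
       #if 0 /* example only, not compiled */
       static unsigned long long example_pblk_after_end(unsigned long long ex_pblk,
                                                        unsigned int ee_block,
                                                        unsigned int end)
       {
               /* physical block backing logical block (end + 1) of this extent */
               return ex_pblk + (end - ee_block) + 1;
       }
       #endif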
      
      /*
       * called at mount time
       */
      void ext4_ext_init(struct super_block *sb)
      {
              /*
               * possible initialization would be here
               */
      
              if (ext4_has_feature_extents(sb)) {
      #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
                      printk(KERN_INFO "EXT4-fs: file extents enabled"
      #ifdef AGGRESSIVE_TEST
                             ", aggressive tests"
      #endif
      #ifdef CHECK_BINSEARCH
                             ", check binsearch"
      #endif
      #ifdef EXTENTS_STATS
                             ", stats"
      #endif
                             "\n");
      #endif
      #ifdef EXTENTS_STATS
                      spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
                      EXT4_SB(sb)->s_ext_min = 1 << 30;
                      EXT4_SB(sb)->s_ext_max = 0;
      #endif
              }
      }
      
      /*
       * called at umount time
       */
      void ext4_ext_release(struct super_block *sb)
      {
              if (!ext4_has_feature_extents(sb))
                      return;
      
      #ifdef EXTENTS_STATS
              if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
                      struct ext4_sb_info *sbi = EXT4_SB(sb);
                      printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
                              sbi->s_ext_blocks, sbi->s_ext_extents,
    7                         sbi->s_ext_blocks / sbi->s_ext_extents);
   15                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
                              sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
              }
   15 #endif
   15 }
      
      static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
      {
              ext4_lblk_t  ee_block;
              ext4_fsblk_t ee_pblock;
              unsigned int ee_len;
      
              ee_block  = le32_to_cpu(ex->ee_block);
              ee_len    = ext4_ext_get_actual_len(ex);
              ee_pblock = ext4_ext_pblock(ex);
      
              if (ee_len == 0)
    7                 return 0;
    7 
              return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
                                           EXTENT_STATUS_WRITTEN);
      }
      
      /* FIXME!! we need to try to merge to left or right after zero-out  */
      static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
      {
              ext4_fsblk_t ee_pblock;
              unsigned int ee_len;
              int ret;
      
              ee_len    = ext4_ext_get_actual_len(ex);
              ee_pblock = ext4_ext_pblock(ex);
      
              if (ext4_encrypted_inode(inode))
                      return ext4_encrypted_zeroout(inode, ex);
      
              ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
              if (ret > 0)
                      ret = 0;
      
              return ret;
      }
      
      /*
       * ext4_split_extent_at() splits an extent at given block.
       *
       * @handle: the journal handle
       * @inode: the file inode
       * @path: the path to the extent
        * @split: the logical block where the extent is split.
        * @split_flag: indicates whether the extent may be zeroed out if the split
        *                 fails, and the states (initialized or unwritten) of the
        *                 new extents.
        * @flags: flags used to insert the new extent into the extent tree.
       *
       *
       * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
        * of which are determined by split_flag.
        *
        * There are two cases:
        *  a> the extent is split into two extents.
        *  b> no split is needed, and we just mark the extent.
       *
       * return 0 on success.
       */
      static int ext4_split_extent_at(handle_t *handle,
                                   struct inode *inode,
                                   struct ext4_ext_path **ppath,
                                   ext4_lblk_t split,
                                   int split_flag,
                                   int flags)
      {
              struct ext4_ext_path *path = *ppath;
              ext4_fsblk_t newblock;
              ext4_lblk_t ee_block;
   32         struct ext4_extent *ex, newex, orig_ex, zero_ex;
              struct ext4_extent *ex2 = NULL;
              unsigned int ee_len, depth;
   32         int err = 0;
   32 
              BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
   32                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
   32 
              ext_debug("ext4_split_extents_at: inode %lu, logical"
                      "block %llu\n", inode->i_ino, (unsigned long long)split);
      
              ext4_ext_show_leaf(inode, path);
   12 
              depth = ext_depth(inode);
              ex = path[depth].p_ext;
              ee_block = le32_to_cpu(ex->ee_block);
   32         ee_len = ext4_ext_get_actual_len(ex);
              newblock = split - ee_block + ext4_ext_pblock(ex);
      
              BUG_ON(split < ee_block || split >= (ee_block + ee_len));
              BUG_ON(!ext4_ext_is_unwritten(ex) &&
                     split_flag & (EXT4_EXT_MAY_ZEROOUT |
   10                              EXT4_EXT_MARK_UNWRIT1 |
    4                              EXT4_EXT_MARK_UNWRIT2));
      
    6         err = ext4_ext_get_access(handle, inode, path + depth);
              if (err)
   10                 goto out;
    6 
              if (split == ee_block) {
   10                 /*
   32                  * case b: block @split is the block that the extent begins with
                       * then we just change the state of the extent, and splitting
                       * is not needed.
                       */
   29                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
   17                         ext4_ext_mark_unwritten(ex);
                      else
   13                         ext4_ext_mark_initialized(ex);
      
                      if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
                              ext4_ext_try_to_merge(handle, inode, path, ex);
      
                      err = ext4_ext_dirty(handle, inode, path + path->p_depth);
   29                 goto out;
              }
      
              /* case a */
              memcpy(&orig_ex, ex, sizeof(orig_ex));
              ex->ee_len = cpu_to_le16(split - ee_block);
              if (split_flag & EXT4_EXT_MARK_UNWRIT1)
                      ext4_ext_mark_unwritten(ex);
      
   11         /*
               * path may lead to new leaf, not to original leaf any more
   29          * after ext4_ext_insert_extent() returns,
               */
              err = ext4_ext_dirty(handle, inode, path + depth);
              if (err)
                      goto fix_extent_len;
      
              ex2 = &newex;
              ex2->ee_block = cpu_to_le32(split);
              ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
              ext4_ext_store_pblock(ex2, newblock);
              if (split_flag & EXT4_EXT_MARK_UNWRIT2)
                      ext4_ext_mark_unwritten(ex2);
      
              err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
              if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
                      if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
                              if (split_flag & EXT4_EXT_DATA_VALID1) {
                                      err = ext4_ext_zeroout(inode, ex2);
                                      zero_ex.ee_block = ex2->ee_block;
                                      zero_ex.ee_len = cpu_to_le16(
                                                      ext4_ext_get_actual_len(ex2));
                                      ext4_ext_store_pblock(&zero_ex,
                                                            ext4_ext_pblock(ex2));
                              } else {
                                      err = ext4_ext_zeroout(inode, ex);
                                      zero_ex.ee_block = ex->ee_block;
                                      zero_ex.ee_len = cpu_to_le16(
                                                      ext4_ext_get_actual_len(ex));
                                      ext4_ext_store_pblock(&zero_ex,
                                                            ext4_ext_pblock(ex));
                              }
                      } else {
                              err = ext4_ext_zeroout(inode, &orig_ex);
                              zero_ex.ee_block = orig_ex.ee_block;
                              zero_ex.ee_len = cpu_to_le16(
                                                      ext4_ext_get_actual_len(&orig_ex));
                              ext4_ext_store_pblock(&zero_ex,
                                                    ext4_ext_pblock(&orig_ex));
                      }
      
   29                 if (err)
                              goto fix_extent_len;
                      /* update the extent length and mark as initialized */
                      ex->ee_len = cpu_to_le16(ee_len);
                      ext4_ext_try_to_merge(handle, inode, path, ex);
                      err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                      if (err)
                              goto fix_extent_len;
      
                      /* update extent status tree */
                      err = ext4_zeroout_es(inode, &zero_ex);
      
                      goto out;
              } else if (err)
                      goto fix_extent_len;
      
      out:
              ext4_ext_show_leaf(inode, path);
              return err;
      
      fix_extent_len:
              ex->ee_len = orig_ex.ee_len;
              ext4_ext_dirty(handle, inode, path + path->p_depth);
              return err;
      }
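
       /*
        * Illustrative sketch (not part of ext4): splitting an extent that
        * starts at logical block ee_block and physical block ee_pblk at
        * logical block 'split' is pure arithmetic, as in 'case a' above: the
        * first half keeps (split - ee_block) blocks and the second half starts
        * at logical 'split' and physical ee_pblk + (split - ee_block).  The
        * struct and helper below are hypothetical.
        */
       #if 0 /* example only, not compiled */
       struct example_extent {
               unsigned int lblk;              /* first logical block */
               unsigned long long pblk;        /* first physical block */
               unsigned int len;               /* length in blocks */
       };

       static void example_split_at(const struct example_extent *ex,
                                    unsigned int split,
                                    struct example_extent *left,
                                    struct example_extent *right)
       {
               unsigned int head = split - ex->lblk;   /* blocks in the left half */

               left->lblk = ex->lblk;
               left->pblk = ex->pblk;
               left->len = head;

               right->lblk = split;
               right->pblk = ex->pblk + head;
               right->len = ex->len - head;
       }
       #endif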
      
      /*
        * ext4_split_extent() splits an extent and marks the extent covered
        * by @map as @split_flag indicates.
        *
        * It may result in splitting the extent into multiple extents (up to three).
        * There are three possibilities:
        *   a> There is no split required
        *   b> Splits in two extents: Split is happening at either end of the extent
        *   c> Splits in three extents: Someone is splitting in the middle of the extent
       *
       */
      static int ext4_split_extent(handle_t *handle,
   10                               struct inode *inode,
                                    struct ext4_ext_path **ppath,
                                    struct ext4_map_blocks *map,
                                    int split_flag,
                                    int flags)
   15 {
              struct ext4_ext_path *path = *ppath;
              ext4_lblk_t ee_block;
              struct ext4_extent *ex;
   10         unsigned int ee_len, depth;
              int err = 0;
              int unwritten;
              int split_flag1, flags1;
              int allocated = map->m_len;
   10 
    1         depth = ext_depth(inode);
   10         ex = path[depth].p_ext;
              ee_block = le32_to_cpu(ex->ee_block);
              ee_len = ext4_ext_get_actual_len(ex);
              unwritten = ext4_ext_is_unwritten(ex);
      
    5         if (map->m_lblk + map->m_len < ee_block + ee_len) {
                      split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
                      flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
                      if (unwritten)
                              split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
                                             EXT4_EXT_MARK_UNWRIT2;
   15                 if (split_flag & EXT4_EXT_DATA_VALID2)
                              split_flag1 |= EXT4_EXT_DATA_VALID1;
                      err = ext4_split_extent_at(handle, inode, ppath,
   15                                 map->m_lblk + map->m_len, split_flag1, flags1);
                      if (err)
                              goto out;
              } else {
                      allocated = ee_len - (map->m_lblk - ee_block);
              }
              /*
   15          * Update path is required because previous ext4_split_extent_at() may
               * result in split of original leaf or extent zeroout.
               */
   15         path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
              if (IS_ERR(path))
                      return PTR_ERR(path);
              depth = ext_depth(inode);
   15         ex = path[depth].p_ext;
              if (!ex) {
                      EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
   15                                  (unsigned long) map->m_lblk);
                      return -EFSCORRUPTED;
              }
              unwritten = ext4_ext_is_unwritten(ex);
              split_flag1 = 0;
      
              if (map->m_lblk >= ee_block) {
                      split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
   15                 if (unwritten) {
                              split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
                              split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
                                                           EXT4_EXT_MARK_UNWRIT2);
                      }
                      err = ext4_split_extent_at(handle, inode, ppath,
                                      map->m_lblk, split_flag1, flags);
                      if (err)
                              goto out;
              }
      
              ext4_ext_show_leaf(inode, path);
      out:
              return err ? err : allocated;
      }
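
       /*
        * Illustrative sketch (not part of ext4): whether ext4_split_extent()
        * ends up with one, two or three extents depends only on whether the
        * mapped range shares its start and/or end with the extent it lives in.
        * The function below is a hypothetical restatement of that case
        * analysis.
        */
       #if 0 /* example only, not compiled */
       static int example_split_piece_count(unsigned int ee_block,
                                            unsigned int ee_len,
                                            unsigned int m_lblk,
                                            unsigned int m_len)
       {
               int pieces = 1;                         /* case a: no split required */

               if (m_lblk > ee_block)
                       pieces++;                       /* head remains before the map */
               if (m_lblk + m_len < ee_block + ee_len)
                       pieces++;                       /* tail remains after the map */
               return pieces;                          /* 1, 2 (case b) or 3 (case c) */
       }
       #endif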
      
      /*
       * This function is called by ext4_ext_map_blocks() if someone tries to write
       * to an unwritten extent. It may result in splitting the unwritten
       * extent into multiple extents (up to three - one initialized and two
       * unwritten).
       * There are three possibilities:
       *   a> There is no split required: Entire extent should be initialized
       *   b> Splits in two extents: Write is happening at either end of the extent
        *   c> Splits in three extents: Someone is writing in the middle of the extent
       *
       * Pre-conditions:
       *  - The extent pointed to by 'path' is unwritten.
       *  - The extent pointed to by 'path' contains a superset
   15  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
       *
       * Post-conditions on success:
        *  - the returned value is the number of blocks beyond map->m_lblk
       *    that are allocated and initialized.
       *    It is guaranteed to be >= map->m_len.
       */
      static int ext4_ext_convert_to_initialized(handle_t *handle,
                                                 struct inode *inode,
                                                 struct ext4_map_blocks *map,
                                                 struct ext4_ext_path **ppath,
                                                 int flags)
      {
              struct ext4_ext_path *path = *ppath;
              struct ext4_sb_info *sbi;
              struct ext4_extent_header *eh;
              struct ext4_map_blocks split_map;
              struct ext4_extent zero_ex;
              struct ext4_extent *ex, *abut_ex;
   14         ext4_lblk_t ee_block, eof_block;
              unsigned int ee_len, depth, map_len = map->m_len;
              int allocated = 0, max_zeroout = 0;
              int err = 0;
              int split_flag = 0;
      
              ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
   15                 "block %llu, max_blocks %u\n", inode->i_ino,
   15                 (unsigned long long)map->m_lblk, map_len);
      
   15         sbi = EXT4_SB(inode->i_sb);
              eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                      inode->i_sb->s_blocksize_bits;
   15         if (eof_block < map->m_lblk + map_len)
   15                 eof_block = map->m_lblk + map_len;
      
              depth = ext_depth(inode);
              eh = path[depth].p_hdr;
              ex = path[depth].p_ext;
              ee_block = le32_to_cpu(ex->ee_block);
              ee_len = ext4_ext_get_actual_len(ex);
              zero_ex.ee_len = 0;
      
              trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
      
              /* Pre-conditions */
              BUG_ON(!ext4_ext_is_unwritten(ex));
              BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
      
              /*
               * Attempt to transfer newly initialized blocks from the currently
   15          * unwritten extent to its neighbor. This is much cheaper
               * than an insertion followed by a merge as those involve costly
               * memmove() calls. Transferring to the left is the common case in
    8          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
               * followed by append writes.
               *
               * Limitations of the current logic:
               *  - L1: we do not deal with writes covering the whole extent.
    7          *    This would require removing the extent if the transfer
    7          *    is possible.
    7          *  - L2: we only attempt to merge with an extent stored in the
    7          *    same extent tree node.
               */
              if ((map->m_lblk == ee_block) &&
                      /* See if we can merge left */
                      (map_len < ee_len) &&                /*L1*/
                      (ex > EXT_FIRST_EXTENT(eh))) {        /*L2*/
                      ext4_lblk_t prev_lblk;
                      ext4_fsblk_t prev_pblk, ee_pblk;
                      unsigned int prev_len;
      
                      abut_ex = ex - 1;
                      prev_lblk = le32_to_cpu(abut_ex->ee_block);
                      prev_len = ext4_ext_get_actual_len(abut_ex);
                      prev_pblk = ext4_ext_pblock(abut_ex);
                      ee_pblk = ext4_ext_pblock(ex);
    7 
    7                 /*
                       * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                       * upon those conditions:
                       * - C1: abut_ex is initialized,
    7                  * - C2: abut_ex is logically abutting ex,
                       * - C3: abut_ex is physically abutting ex,
                       * - C4: abut_ex can receive the additional blocks without
                       *   overflowing the (initialized) length limit.
    7                  */
                      if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                              ((prev_lblk + prev_len) == ee_block) &&                /*C2*/
    7                         ((prev_pblk + prev_len) == ee_pblk) &&                /*C3*/
                              (prev_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                              err = ext4_ext_get_access(handle, inode, path + depth);
                              if (err)
                                      goto out;
      
                              trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                      map, ex, abut_ex);
   10 
                              /* Shift the start of ex by 'map_len' blocks */
    2                         ex->ee_block = cpu_to_le32(ee_block + map_len);
                              ext4_ext_store_pblock(ex, ee_pblk + map_len);
                              ex->ee_len = cpu_to_le16(ee_len - map_len);
                              ext4_ext_mark_unwritten(ex); /* Restore the flag */
      
                              /* Extend abut_ex by 'map_len' blocks */
                              abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
      
    1                         /* Result: number of initialized blocks past m_lblk */
                              allocated = map_len;
                      }
              } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
                         (map_len < ee_len) &&        /*L1*/
                         ex < EXT_LAST_EXTENT(eh)) {        /*L2*/
                      /* See if we can merge right */
                      ext4_lblk_t next_lblk;
                      ext4_fsblk_t next_pblk, ee_pblk;
                      unsigned int next_len;
      
                      abut_ex = ex + 1;
                      next_lblk = le32_to_cpu(abut_ex->ee_block);
                      next_len = ext4_ext_get_actual_len(abut_ex);
                      next_pblk = ext4_ext_pblock(abut_ex);
                      ee_pblk = ext4_ext_pblock(ex);
      
                      /*
                       * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                       * upon those conditions:
                       * - C1: abut_ex is initialized,
                       * - C2: abut_ex is logically abutting ex,
                       * - C3: abut_ex is physically abutting ex,
                       * - C4: abut_ex can receive the additional blocks without
                       *   overflowing the (initialized) length limit.
                       */
                      if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                          ((map->m_lblk + map_len) == next_lblk) &&                /*C2*/
                          ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
                          (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                              err = ext4_ext_get_access(handle, inode, path + depth);
                              if (err)
                                      goto out;
      
                              trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                      map, ex, abut_ex);
      
    7                         /* Shift the start of abut_ex by 'map_len' blocks */
                              abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
    7                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
                              ex->ee_len = cpu_to_le16(ee_len - map_len);
                              ext4_ext_mark_unwritten(ex); /* Restore the flag */
      
                              /* Extend abut_ex by 'map_len' blocks */
                              abut_ex->ee_len = cpu_to_le16(next_len + map_len);
   14 
                              /* Result: number of initialized blocks past m_lblk */
                              allocated = map_len;
                      }
              }
              if (allocated) {
                      /* Mark the block containing both extents as dirty */
                      ext4_ext_dirty(handle, inode, path + depth);
      
                      /* Update path to point to the right extent */
   13                 path[depth].p_ext = abut_ex;
                      goto out;
              } else
                      allocated = ee_len - (map->m_lblk - ee_block);
      
              WARN_ON(map->m_lblk < ee_block);
              /*
   13          * It is safe to convert extent to initialized via explicit
    6          * zeroout only if extent is fully inside i_size or new_size.
               */
              split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
    6 
    6         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
                      max_zeroout = sbi->s_extent_max_zeroout_kb >>
                              (inode->i_sb->s_blocksize_bits - 10);
    2 
              if (ext4_encrypted_inode(inode))
                      max_zeroout = 0;
    6 
              /* If extent is less than s_max_zeroout_kb, zeroout directly */
              if (max_zeroout && (ee_len <= max_zeroout)) {
                      err = ext4_ext_zeroout(inode, ex);
                      if (err)
                              goto out;
                      zero_ex.ee_block = ex->ee_block;
                      zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
                      ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
      
                      err = ext4_ext_get_access(handle, inode, path + depth);
                      if (err)
                              goto out;
   11                 ext4_ext_mark_initialized(ex);
                      ext4_ext_try_to_merge(handle, inode, path, ex);
                      err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                      goto out;
    8         }
      
              /*
               * four cases:
               * 1. split the extent into three extents.
               * 2. split the extent into two extents, zeroout the first half.
               * 3. split the extent into two extents, zeroout the second half.
                * 4. split the extent into two extents without zeroout.
               */
              split_map.m_lblk = map->m_lblk;
              split_map.m_len = map->m_len;
      
    8         if (max_zeroout && (allocated > map->m_len)) {
                      if (allocated <= max_zeroout) {
    3                         /* case 3 */
    2                         zero_ex.ee_block =
                                               cpu_to_le32(map->m_lblk);
                              zero_ex.ee_len = cpu_to_le16(allocated);
                              ext4_ext_store_pblock(&zero_ex,
                                      ext4_ext_pblock(ex) + map->m_lblk - ee_block);
                              err = ext4_ext_zeroout(inode, &zero_ex);
    2                         if (err)
                                      goto out;
                              split_map.m_lblk = map->m_lblk;
                              split_map.m_len = allocated;
    3                 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
                              /* case 2 */
                              if (map->m_lblk != ee_block) {
                                      zero_ex.ee_block = ex->ee_block;
                                      zero_ex.ee_len = cpu_to_le16(map->m_lblk -
                                                              ee_block);
   11                                 ext4_ext_store_pblock(&zero_ex,
                                                            ext4_ext_pblock(ex));
                                      err = ext4_ext_zeroout(inode, &zero_ex);
                                      if (err)
                                              goto out;
                              }
    6 
   15                         split_map.m_lblk = ee_block;
   15                         split_map.m_len = map->m_lblk - ee_block + map->m_len;
                              allocated = map->m_len;
                      }
              }
      
              err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
                                      flags);
              if (err > 0)
                      err = 0;
      out:
              /* If we have gotten a failure, don't zero out status tree */
              if (!err)
                      err = ext4_zeroout_es(inode, &zero_ex);
              return err ? err : allocated;
      }
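
       /*
        * Illustrative sketch (not part of ext4): the left-merge fast path above
        * transfers the newly written blocks into the preceding extent only when
        * conditions C1-C4 hold: the neighbour is initialized, logically
        * abutting, physically abutting, and still short enough to absorb the
        * extra blocks.  The predicate below restates those checks with
        * hypothetical parameter names (max_init_len stands in for
        * EXT_INIT_MAX_LEN).
        */
       #if 0 /* example only, not compiled */
       static int example_can_merge_left(unsigned int prev_lblk,
                                         unsigned long long prev_pblk,
                                         unsigned int prev_len, int prev_unwritten,
                                         unsigned int ee_block,
                                         unsigned long long ee_pblk,
                                         unsigned int map_len,
                                         unsigned int max_init_len)
       {
               return !prev_unwritten &&                       /* C1 */
                      prev_lblk + prev_len == ee_block &&      /* C2 */
                      prev_pblk + prev_len == ee_pblk &&       /* C3 */
                      prev_len < max_init_len - map_len;       /* C4 */
       }
       #endif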
      
      /*
       * This function is called by ext4_ext_map_blocks() from
        * ext4_get_blocks_dio_write() when DIO is used to write
        * to an unwritten extent.
       *
       * Writing to an unwritten extent may result in splitting the unwritten
       * extent into multiple initialized/unwritten extents (up to three)
       * There are three possibilities:
       *   a> There is no split required: Entire extent should be unwritten
       *   b> Splits in two extents: Write is happening at either end of the extent
        *   c> Splits in three extents: Someone is writing in the middle of the extent
       *
       * This works the same way in the case of initialized -> unwritten conversion.
       *
        * One or more index blocks may be needed if the extent tree grows after
        * the unwritten extent is split. To prevent ENOSPC at IO completion time,
        * we split the unwritten extent before submitting the DIO. The unwritten
        * extent will be split into at most three unwritten extents. After the IO
        * completes, the part that was written is converted to initialized by the
        * end_io callback via ext4_convert_unwritten_extents().
       *
       * Returns the size of unwritten extent to be written on success.
       */
      static int ext4_split_convert_extents(handle_t *handle,
                                              struct inode *inode,
                                              struct ext4_map_blocks *map,
                                              struct ext4_ext_path **ppath,
                                              int flags)
      {
              struct ext4_ext_path *path = *ppath;
              ext4_lblk_t eof_block;
              ext4_lblk_t ee_block;
              struct ext4_extent *ex;
              unsigned int ee_len;
              int split_flag = 0, depth;
      
              ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
                        __func__, inode->i_ino,
    4                   (unsigned long long)map->m_lblk, map->m_len);
      
              eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
    4                 inode->i_sb->s_blocksize_bits;
              if (eof_block < map->m_lblk + map->m_len)
                      eof_block = map->m_lblk + map->m_len;
               /*
                * It is safe to convert the extent to initialized via explicit
                * zeroout only if the extent is fully inside i_size or new_size.
               */
              depth = ext_depth(inode);
    4         ex = path[depth].p_ext;
              ee_block = le32_to_cpu(ex->ee_block);
              ee_len = ext4_ext_get_actual_len(ex);
      
              /* Convert to unwritten */
              if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
                      split_flag |= EXT4_EXT_DATA_VALID1;
              /* Convert to initialized */
              } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
   26                 split_flag |= ee_block + ee_len <= eof_block ?
                                    EXT4_EXT_MAY_ZEROOUT : 0;
                      split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
              }
              flags |= EXT4_GET_BLOCKS_PRE_IO;
              return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
      }
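
       /*
        * Illustrative sketch (not part of ext4): eof_block above is simply
        * i_size rounded up to a whole number of file system blocks, using the
        * usual add-then-shift idiom.  The helper name below is hypothetical;
        * blkbits corresponds to s_blocksize_bits.
        */
       #if 0 /* example only, not compiled */
       static unsigned long long example_size_to_eof_block(unsigned long long i_size,
                                                           unsigned int blkbits)
       {
               /* e.g. i_size 4097 with 4096-byte blocks (blkbits == 12) -> block 2 */
               return (i_size + (1ULL << blkbits) - 1) >> blkbits;
       }
       #endif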
   26 
      static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                      struct inode *inode,
   26                                                 struct ext4_map_blocks *map,
                                                      struct ext4_ext_path **ppath)
      {
              struct ext4_ext_path *path = *ppath;
              struct ext4_extent *ex;
              ext4_lblk_t ee_block;
              unsigned int ee_len;
              int depth;
              int err = 0;
      
              depth = ext_depth(inode);
              ex = path[depth].p_ext;
   26         ee_block = le32_to_cpu(ex->ee_block);
              ee_len = ext4_ext_get_actual_len(ex);
      
              ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
                      "block %llu, max_blocks %u\n", inode->i_ino,
                        (unsigned long long)ee_block, ee_len);
      
              /* If extent is larger than requested it is a clear sign that we still
               * have some extent state machine issues left. So extent_split is still
               * required.
                * TODO: Once all related issues are fixed, this situation should be
                * illegal.
               */
              if (ee_block != map->m_lblk || ee_len > map->m_len) {
      #ifdef EXT4_DEBUG
                      ext4_warning("Inode (%ld) finished: extent logical block %llu,"
                                   " len %u; IO logical block %llu, len %u\n",
                                   inode->i_ino, (unsigned long long)ee_block, ee_len,
                                    (unsigned long long)map->m_lblk, map->m_len);
      #endif
                      err = ext4_split_convert_extents(handle, inode, map, ppath,
                                                       EXT4_GET_BLOCKS_CONVERT);
                       if (err < 0)
                              return err;
                      path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                      if (IS_ERR(path))
                              return PTR_ERR(path);
                      depth = ext_depth(inode);
                      ex = path[depth].p_ext;
              }
      
              err = ext4_ext_get_access(handle, inode, path + depth);
              if (err)
                      goto out;
              /* first mark the extent as initialized */
              ext4_ext_mark_initialized(ex);
      
              /* note: ext4_ext_correct_indexes() isn't needed here because
               * borders are not changed
               */
              ext4_ext_try_to_merge(handle, inode, path, ex);

               /* Mark modified extent as dirty */
              err = ext4_ext_dirty(handle, inode, path + path->p_depth);
      out:
              ext4_ext_show_leaf(inode, path);
              return err;
      }
      
      static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                              sector_t block, int count)
      {
              int i;
              for (i = 0; i < count; i++)
                      unmap_underlying_metadata(bdev, block + i);
      }

       /*
        * Handle EOFBLOCKS_FL flag, clearing it if necessary
        */
      static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
                                    ext4_lblk_t lblk,
                                    struct ext4_ext_path *path,
                                    unsigned int len)
      {
              int i, depth;
              struct ext4_extent_header *eh;
              struct ext4_extent *last_ex;
      
               if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
                      return 0;
      
              depth = ext_depth(inode);
              eh = path[depth].p_hdr;
      
              /*
               * We're going to remove EOFBLOCKS_FL entirely in future so we
               * do not care for this case anymore. Simply remove the flag
               * if there are no extents.
                */
               if (unlikely(!eh->eh_entries))
                       goto out;
              last_ex = EXT_LAST_EXTENT(eh);
              /*
               * We should clear the EOFBLOCKS_FL flag if we are writing the
               * last block in the last extent in the file.  We test this by
               * first checking to see if the caller to
               * ext4_ext_get_blocks() was interested in the last block (or
               * a block beyond the last block) in the current extent.  If
               * this turns out to be false, we can bail out from this
               * function immediately.
               */
              if (lblk + len < le32_to_cpu(last_ex->ee_block) +
                   ext4_ext_get_actual_len(last_ex))
                      return 0;
              /*
               * If the caller does appear to be planning to write at or
               * beyond the end of the current extent, we then test to see
               * if the current extent is the last extent in the file, by
               * checking to make sure it was reached via the rightmost node
               * at each level of the tree.
               */
              for (i = depth-1; i >= 0; i--)
                      if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
                              return 0;
      out:
              ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
              return ext4_mark_inode_dirty(handle, inode);
       }
      
      /**
        * ext4_find_delalloc_range: find delayed allocated block in the given range.
        *
       * Return 1 if there is a delalloc block in the range, otherwise 0.
       */
      int ext4_find_delalloc_range(struct inode *inode,
                                   ext4_lblk_t lblk_start,
                                    ext4_lblk_t lblk_end)
      {
              struct extent_status es;
      
              ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
              if (es.es_len == 0)
                      return 0; /* there is no delay extent in this tree */
              else if (es.es_lblk <= lblk_start &&
                       lblk_start < es.es_lblk + es.es_len)
                      return 1;
              else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
                      return 1;
              else
                      return 0;
      }
      
      int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
      {
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
              ext4_lblk_t lblk_start, lblk_end;
              lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
              lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
      
              return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
      }
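
       /*
        * Illustrative sketch, guarded out of the build: the cluster-boundary
        * arithmetic ext4_find_delalloc_cluster() relies on.  A power-of-two
        * cluster ratio is assumed; the mask expressions below are local
        * stand-ins for EXT4_LBLK_CMASK(), not the real macros.
        */
       #if 0
       #include <stdio.h>

       int main(void)
       {
               unsigned int cluster_ratio = 4;         /* blocks per cluster, assumed */
               unsigned int lblk = 10;                 /* logical block, assumed */

               unsigned int lblk_start = lblk & ~(cluster_ratio - 1);     /* 8  */
               unsigned int lblk_end = lblk_start + cluster_ratio - 1;    /* 11 */

               printf("cluster of block %u spans [%u, %u]\n",
                      lblk, lblk_start, lblk_end);
               return 0;
       }
       #endif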
      
      /**
       * Determines how many complete clusters (out of those specified by the 'map')
       * are under delalloc and were reserved quota for.
       * This function is called when we are writing out the blocks that were
       * originally written with their allocation delayed, but then the space was
       * allocated using fallocate() before the delayed allocation could be resolved.
       * The cases to look for are:
        * ('=' indicates delayed allocated blocks
       *  '-' indicates non-delayed allocated blocks)
       * (a) partial clusters towards beginning and/or end outside of allocated range
       *     are not delalloc'ed.
       *        Ex:
       *        |----c---=|====c====|====c====|===-c----|
       *                 |++++++ allocated ++++++|
       *        ==> 4 complete clusters in above example
       *
       * (b) partial cluster (outside of allocated range) towards either end is
       *     marked for delayed allocation. In this case, we will exclude that
       *     cluster.
       *        Ex:
       *        |----====c========|========c========|
       *             |++++++ allocated ++++++|
        *        ==> 1 complete cluster in above example
       *
       *        Ex:
       *        |================c================|
        *            |++++++ allocated ++++++|
       *        ==> 0 complete clusters in above example
       *
       * The ext4_da_update_reserve_space will be called only if we
       * determine here that there were some "entire" clusters that span
       * this 'allocated' range.
       * In the non-bigalloc case, this function will just end up returning num_blks
       * without ever calling ext4_find_delalloc_range.
       */
      static unsigned int
      get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
                                  unsigned int num_blks)
      {
              struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
               ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
              ext4_lblk_t lblk_from, lblk_to, c_offset;
              unsigned int allocated_clusters = 0;
      
              alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
              alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
      
              /* max possible clusters for this allocation */
              allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
      
               trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);

               /* Check towards left side */
              c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
              if (c_offset) {
                      lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
                      lblk_to = lblk_from + c_offset - 1;
      
                      if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
                               allocated_clusters--;
              }
      
              /* Now check towards right. */
              c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
              if (allocated_clusters && c_offset) {
                      lblk_from = lblk_start + num_blks;
                      lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
      
                      if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
                              allocated_clusters--;
              }
      
              return allocated_clusters;
      }
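
       /*
        * Illustrative sketch, guarded out of the build: the whole-cluster
        * counting done by get_reserved_cluster_alloc() above.  The cluster
        * ratio and allocated range are assumed values; a partial cluster at
        * either end is actually excluded only when its blocks outside the
        * allocated range are delalloc'ed.
        */
       #if 0
       #include <stdio.h>

       int main(void)
       {
               unsigned int ratio = 4;                         /* blocks per cluster, assumed */
               unsigned int lblk_start = 2, num_blks = 20;     /* assumed range */

               unsigned int c_start = lblk_start / ratio;                      /* 0 */
               unsigned int c_end = (lblk_start + num_blks - 1) / ratio;       /* 5 */
               unsigned int max_clusters = c_end - c_start + 1;                /* 6 */

               int partial_left = (lblk_start % ratio) != 0;                   /* 1 */
               int partial_right = ((lblk_start + num_blks) % ratio) != 0;     /* 1 */

               printf("at most %u clusters; partial ends: left=%d right=%d\n",
                      max_clusters, partial_left, partial_right);
               return 0;
       }
       #endif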
      
      static int
      convert_initialized_extent(handle_t *handle, struct inode *inode,
                                 struct ext4_map_blocks *map,
                                 struct ext4_ext_path **ppath, int flags,
                                 unsigned int allocated, ext4_fsblk_t newblock)
      {
              struct ext4_ext_path *path = *ppath;
              struct ext4_extent *ex;
              ext4_lblk_t ee_block;
              unsigned int ee_len;
              int depth;
              int err = 0;
      
              /*
               * Make sure that the extent is no bigger than we support with
               * unwritten extent
               */
              if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
                      map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
      
              depth = ext_depth(inode);
              ex = path[depth].p_ext;
              ee_block = le32_to_cpu(ex->ee_block);
              ee_len = ext4_ext_get_actual_len(ex);
      
              ext_debug("%s: inode %lu, logical"
                      "block %llu, max_blocks %u\n", __func__, inode->i_ino,
                        (unsigned long long)ee_block, ee_len);
      
              if (ee_block != map->m_lblk || ee_len > map->m_len) {
                      err = ext4_split_convert_extents(handle, inode, map, ppath,
                                      EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
                      if (err < 0)
                              return err;
                      path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                      if (IS_ERR(path))
                              return PTR_ERR(path);
                      depth = ext_depth(inode);
                      ex = path[depth].p_ext;
                      if (!ex) {
                              EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                               (unsigned long) map->m_lblk);
                              return -EFSCORRUPTED;
                      }
              }
      
              err = ext4_ext_get_access(handle, inode, path + depth);
              if (err)
                      return err;
              /* first mark the extent as unwritten */
              ext4_ext_mark_unwritten(ex);
      
              /* note: ext4_ext_correct_indexes() isn't needed here because
               * borders are not changed
               */
              ext4_ext_try_to_merge(handle, inode, path, ex);
      
              /* Mark modified extent as dirty */
              err = ext4_ext_dirty(handle, inode, path + path->p_depth);
              if (err)
                      return err;
              ext4_ext_show_leaf(inode, path);
      
              ext4_update_inode_fsync_trans(handle, inode, 1);
              err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
              if (err)
                      return err;
              map->m_flags |= EXT4_MAP_UNWRITTEN;
              if (allocated > map->m_len)
                      allocated = map->m_len;
               map->m_len = allocated;
              return allocated;
      }
      
      static int
      ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
                              struct ext4_map_blocks *map,
                              struct ext4_ext_path **ppath, int flags,
                              unsigned int allocated, ext4_fsblk_t newblock)
      {
              struct ext4_ext_path *path = *ppath;
              int ret = 0;
              int err = 0;
              ext4_io_end_t *io = ext4_inode_aio(inode);

               ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
                        "block %llu, max_blocks %u, flags %x, allocated %u\n",
                        inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
                         flags, allocated);
               ext4_ext_show_leaf(inode, path);
      
              /*
               * When writing into unwritten space, we should not fail to
               * allocate metadata blocks for the new extent block if needed.
               */
              flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
      
              trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
                                                           allocated, newblock);

               /* get_block() before submit the IO, split the extent */
               if (flags & EXT4_GET_BLOCKS_PRE_IO) {
                       ret = ext4_split_convert_extents(handle, inode, map, ppath,
                                               flags | EXT4_GET_BLOCKS_CONVERT);
                      if (ret <= 0)
                              goto out;
                       /*
                        * Flag the inode (non-aio case) or end_io struct (aio case)
                        * that this IO needs conversion to written when the IO is
                        * completed
                        */
                       if (io)
                               ext4_set_io_unwritten_flag(inode, io);
                       else
                               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                       map->m_flags |= EXT4_MAP_UNWRITTEN;
                      goto out;
              }
              /* IO end_io complete, convert the filled extent to written */
              if (flags & EXT4_GET_BLOCKS_CONVERT) {
                      ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
                                                                 ppath);
                      if (ret >= 0) {
                              ext4_update_inode_fsync_trans(handle, inode, 1);
                              err = check_eofblocks_fl(handle, inode, map->m_lblk,
                                                       path, map->m_len);
                      } else
                               err = ret;
                       map->m_flags |= EXT4_MAP_MAPPED;
                      map->m_pblk = newblock;
                      if (allocated > map->m_len)
                              allocated = map->m_len;
                      map->m_len = allocated;
                       goto out2;
              }
              /* buffered IO case */
              /*
               * repeat fallocate creation request
               * we already have an unwritten extent
               */
              if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                       map->m_flags |= EXT4_MAP_UNWRITTEN;
                      goto map_out;
              }
      
              /* buffered READ or buffered write_begin() lookup */
               if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                       /*
                        * We have blocks reserved already.  We
                        * return allocated blocks so that delalloc
                        * won't do block reservation for us.  But
                        * the buffer head will be unmapped so that
                        * a read from the block returns 0s.
                        */
                       map->m_flags |= EXT4_MAP_UNWRITTEN;
                      goto out1;
              }
      
              /* buffered write, writepage time, convert*/
              ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
              if (ret >= 0)
                      ext4_update_inode_fsync_trans(handle, inode, 1);
      out:
              if (ret <= 0) {
                       err = ret;
                       goto out2;
               } else
                       allocated = ret;
               map->m_flags |= EXT4_MAP_NEW;
               /*
               * if we allocated more blocks than requested
               * we need to make sure we unmap the extra block
               * allocated. The actual needed block will get
               * unmapped later when we find the buffer_head marked
               * new.
               */
              if (allocated > map->m_len) {
                      unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
                                               newblock + map->m_len,
                                               allocated - map->m_len);
                       allocated = map->m_len;
              }
              map->m_len = allocated;

               /*
               * If we have done fallocate with the offset that is already
               * delayed allocated, we would have block reservation
               * and quota reservation done in the delayed write path.
               * But fallocate would have already updated quota and block
                * count for this offset. So cancel these reservations.
                */
               if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                      unsigned int reserved_clusters;
                      reserved_clusters = get_reserved_cluster_alloc(inode,
                                      map->m_lblk, map->m_len);
                      if (reserved_clusters)
                              ext4_da_update_reserve_space(inode,
                                                            reserved_clusters,
                                                           0);
              }
      
      map_out:
              map->m_flags |= EXT4_MAP_MAPPED;
               if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
                      err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
                                               map->m_len);
                      if (err < 0)
                              goto out2;
              }
      out1:
              if (allocated > map->m_len)
                      allocated = map->m_len;
              ext4_ext_show_leaf(inode, path);
              map->m_pblk = newblock;
              map->m_len = allocated;
      out2:
              return err ? err : allocated;
      }
      
      /*
       * get_implied_cluster_alloc - check to see if the requested
       * allocation (in the map structure) overlaps with a cluster already
       * allocated in an extent.
       *        @sb        The filesystem superblock structure
       *        @map        The requested lblk->pblk mapping
       *        @ex        The extent structure which might contain an implied
       *                        cluster allocation
       *
       * This function is called by ext4_ext_map_blocks() after we failed to
       * find blocks that were already in the inode's extent tree.  Hence,
       * we know that the beginning of the requested region cannot overlap
       * the extent from the inode's extent tree.  There are three cases we
       * want to catch.  The first is this case:
       *
       *                 |--- cluster # N--|
       *    |--- extent ---|        |---- requested region ---|
       *                        |==========|
       *
       * The second case that we need to test for is this one:
       *
       *   |--------- cluster # N ----------------|
       *           |--- requested region --|   |------- extent ----|
       *           |=======================|
       *
       * The third case is when the requested region lies between two extents
       * within the same cluster:
       *          |------------- cluster # N-------------|
       * |----- ex -----|                  |---- ex_right ----|
       *                  |------ requested region ------|
       *                  |================|
       *
       * In each of the above cases, we need to set the map->m_pblk and
        * map->m_len so it corresponds to the portion of the extent labelled as
       * "|====|" from cluster #N, since it is already in use for data in
       * cluster EXT4_B2C(sbi, map->m_lblk).        We will then return 1 to
       * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
       * as a new "allocated" block region.  Otherwise, we will return 0 and
       * ext4_ext_map_blocks() will then allocate one or more new clusters
       * by calling ext4_mb_new_blocks().
       */
      static int get_implied_cluster_alloc(struct super_block *sb,
                                           struct ext4_map_blocks *map,
                                           struct ext4_extent *ex,
                                           struct ext4_ext_path *path)
      {
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
              ext4_lblk_t ex_cluster_start, ex_cluster_end;
              ext4_lblk_t rr_cluster_start;
              ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
              ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
              unsigned short ee_len = ext4_ext_get_actual_len(ex);
      
              /* The extent passed in that we are trying to match */
              ex_cluster_start = EXT4_B2C(sbi, ee_block);
              ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
      
              /* The requested region passed into ext4_map_blocks() */
              rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
      
              if ((rr_cluster_start == ex_cluster_end) ||
                  (rr_cluster_start == ex_cluster_start)) {
                      if (rr_cluster_start == ex_cluster_end)
                              ee_start += ee_len - 1;
                      map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
                      map->m_len = min(map->m_len,
                                       (unsigned) sbi->s_cluster_ratio - c_offset);
                      /*
                       * Check for and handle this case:
                       *
                       *   |--------- cluster # N-------------|
                       *                       |------- extent ----|
                       *           |--- requested region ---|
                       *           |===========|
                       */
      
                      if (map->m_lblk < ee_block)
                              map->m_len = min(map->m_len, ee_block - map->m_lblk);
      
                      /*
                       * Check for the case where there is already another allocated
                       * block to the right of 'ex' but before the end of the cluster.
                       *
                       *          |------------- cluster # N-------------|
                       * |----- ex -----|                  |---- ex_right ----|
                       *                  |------ requested region ------|
                       *                  |================|
                       */
                      if (map->m_lblk > ee_block) {
                              ext4_lblk_t next = ext4_ext_next_allocated_block(path);
                              map->m_len = min(map->m_len, next - map->m_lblk);
                      }
      
                      trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
                      return 1;
              }
      
              trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
              return 0;
      }
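
       /*
        * Illustrative sketch, guarded out of the build: the cluster-sharing
        * test used by get_implied_cluster_alloc() above.  The cluster ratio,
        * extent and requested block are assumed values; plain division stands
        * in for EXT4_B2C().
        */
       #if 0
       #include <stdio.h>

       int main(void)
       {
               unsigned int ratio = 4;                 /* blocks per cluster, assumed */
               unsigned int ee_block = 5, ee_len = 2;  /* extent covers blocks 5-6 */
               unsigned int m_lblk = 7;                /* requested logical block */

               unsigned int ex_c_start = ee_block / ratio;                     /* 1 */
               unsigned int ex_c_end = (ee_block + ee_len - 1) / ratio;        /* 1 */
               unsigned int rr_c_start = m_lblk / ratio;                       /* 1 */

               int implied = (rr_c_start == ex_c_start || rr_c_start == ex_c_end);
               printf("block %u shares a cluster with the extent: %d\n",
                      m_lblk, implied);
               return 0;
       }
       #endif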
      
      
      /*
       * Block allocation/map/preallocation routine for extents based files
       *
       *
       * Need to be called with
       * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
       * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
       *
        * return > 0, number of blocks already mapped/allocated
       *          if create == 0 and these are pre-allocated blocks
       *                  buffer head is unmapped
       *          otherwise blocks are mapped
       *
       * return = 0, if plain look up failed (blocks have not been allocated)
       *          buffer head is unmapped
       *
       * return < 0, error case.
       */
      int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                              struct ext4_map_blocks *map, int flags)
      {
              struct ext4_ext_path *path = NULL;
              struct ext4_extent newex, *ex, *ex2;
               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
              ext4_fsblk_t newblock = 0;
              int free_on_err = 0, err = 0, depth, ret;
               unsigned int allocated = 0, offset = 0;
              unsigned int allocated_clusters = 0;
              struct ext4_allocation_request ar;
              ext4_io_end_t *io = ext4_inode_aio(inode);
              ext4_lblk_t cluster_offset;
              int set_unwritten = 0;
              bool map_from_cluster = false;

               ext_debug("blocks %u/%u requested for inode %lu\n",
                        map->m_lblk, map->m_len, inode->i_ino);
              trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
      
              /* find extent for this block */
              path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
              if (IS_ERR(path)) {
                      err = PTR_ERR(path);
                      path = NULL;
                      goto out2;
              }
      
              depth = ext_depth(inode);
      
              /*
               * consistent leaf must not be empty;
               * this situation is possible, though, _during_ tree modification;
                * this is why assert can't be put in ext4_find_extent()
               */
              if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
                      EXT4_ERROR_INODE(inode, "bad extent address "
                                       "lblock: %lu, depth: %d pblock %lld",
                                       (unsigned long) map->m_lblk, depth,
                                       path[depth].p_block);
                      err = -EFSCORRUPTED;
                      goto out2;
               }
      
               ex = path[depth].p_ext;
              if (ex) {
                      ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
                       ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                       unsigned short ee_len;
      
      
                      /*
                       * unwritten extents are treated as holes, except that
                       * we split out initialized portions during a write.
                       */
                      ee_len = ext4_ext_get_actual_len(ex);
      
                      trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
      
                       /* if found extent covers block, simply return it */
                      if (in_range(map->m_lblk, ee_block, ee_len)) {
                              newblock = map->m_lblk - ee_block + ee_start;
                              /* number of remaining blocks in the extent */
                               allocated = ee_len - (map->m_lblk - ee_block);
                              ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                        ee_block, ee_len, newblock);
      
                               /*
                                * If the extent is initialized check whether the
                                * caller wants to convert it to unwritten.
                                */
                               if ((!ext4_ext_is_unwritten(ex)) &&
                                   (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
                                       allocated = convert_initialized_extent(
                                                      handle, inode, map, &path,
                                                      flags, allocated, newblock);
                                      goto out2;
                              } else if (!ext4_ext_is_unwritten(ex))
                                      goto out;
      
                              ret = ext4_ext_handle_unwritten_extents(
                                      handle, inode, map, &path, flags,
                                       allocated, newblock);
                              if (ret < 0)
                                      err = ret;
                              else
                                      allocated = ret;
                               goto out2;
                      }
              }
      
              /*
               * requested block isn't allocated yet;
                * we cannot create it unless the create flag is set
                */
              if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                      /*
                       * put just found gap into cache to speed up
                       * subsequent requests
                       */
                      ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
                      goto out2;
              }
      
              /*
               * Okay, we need to do block allocation.
               */
              newex.ee_block = cpu_to_le32(map->m_lblk);
              cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
      
               /*
                * If we are doing bigalloc, check to see if the extent returned
                * by ext4_find_extent() implies a cluster we can use.
                */
               if (cluster_offset && ex &&
                  get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
                      ar.len = allocated = map->m_len;
                      newblock = map->m_pblk;
                      map_from_cluster = true;
                      goto got_allocated_blocks;
              }
      
               /* find neighbour allocated blocks */
              ar.lleft = map->m_lblk;
              err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
              if (err)
                      goto out2;
              ar.lright = map->m_lblk;
              ex2 = NULL;
              err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
              if (err)
                      goto out2;
      
              /* Check if the extent after searching to the right implies a
               * cluster we can use. */
              if ((sbi->s_cluster_ratio > 1) && ex2 &&
                   get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
                       ar.len = allocated = map->m_len;
                       newblock = map->m_pblk;
                       map_from_cluster = true;
                       goto got_allocated_blocks;
               }
      
              /*
                * See if request is beyond maximum number of blocks we can have in
                * a single extent. For an initialized extent this limit is
                * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
                * EXT_UNWRITTEN_MAX_LEN.
               */
              if (map->m_len > EXT_INIT_MAX_LEN &&
                  !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                      map->m_len = EXT_INIT_MAX_LEN;
               else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
                       (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                      map->m_len = EXT_UNWRITTEN_MAX_LEN;
      
              /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
              newex.ee_len = cpu_to_le16(map->m_len);
              err = ext4_ext_check_overlap(sbi, inode, &newex, path);
              if (err)
                      allocated = ext4_ext_get_actual_len(&newex);
              else
                      allocated = map->m_len;
      
              /* allocate new block */
              ar.inode = inode;
              ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
              ar.logical = map->m_lblk;
              /*
               * We calculate the offset from the beginning of the cluster
               * for the logical block number, since when we allocate a
               * physical cluster, the physical block should start at the
                * same offset from the beginning of the cluster.  This is
                * needed so that future calls to get_implied_cluster_alloc()
                * work correctly.
                */
               offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
               ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
               ar.goal -= offset;
              ar.logical -= offset;
              if (S_ISREG(inode->i_mode))
                      ar.flags = EXT4_MB_HINT_DATA;
              else
                      /* disable in-core preallocation for non-regular files */
                       ar.flags = 0;
               if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
                      ar.flags |= EXT4_MB_HINT_NOPREALLOC;
              if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                      ar.flags |= EXT4_MB_DELALLOC_RESERVED;
              if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                      ar.flags |= EXT4_MB_USE_RESERVED;
               newblock = ext4_mb_new_blocks(handle, &ar, &err);
               if (!newblock)
                       goto out2;
               ext_debug("allocate new block: goal %llu, found %llu/%u\n",
                         ar.goal, newblock, allocated);
              free_on_err = 1;
              allocated_clusters = ar.len;
              ar.len = EXT4_C2B(sbi, ar.len) - offset;
              if (ar.len > allocated)
                      ar.len = allocated;
      
      got_allocated_blocks:
              /* try to insert new extent into found leaf and return */
              ext4_ext_store_pblock(&newex, newblock + offset);
              newex.ee_len = cpu_to_le16(ar.len);
              /* Mark unwritten */
               if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                       ext4_ext_mark_unwritten(&newex);
                       map->m_flags |= EXT4_MAP_UNWRITTEN;
                       /*
                        * io_end structure was created for every IO write to an
                        * unwritten extent. To avoid unnecessary conversion,
                        * here we flag the IO that really needs the conversion.
                        * For the non-async direct IO case, flag the inode state
                        * that we need to perform conversion when IO is done.
                        */
                       if (flags & EXT4_GET_BLOCKS_PRE_IO)
                               set_unwritten = 1;
              }

               err = 0;
              if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
                      err = check_eofblocks_fl(handle, inode, map->m_lblk,
                                        path, ar.len);
              if (!err)
                      err = ext4_ext_insert_extent(handle, inode, &path,
                                                   &newex, flags);
      
              if (!err && set_unwritten) {
                      if (io)
                              ext4_set_io_unwritten_flag(inode, io);
                      else
                              ext4_set_inode_state(inode,
                                                   EXT4_STATE_DIO_UNWRITTEN);
              }
      
               if (err && free_on_err) {
                       int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                               EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
                      /* free data blocks we just allocated */
                      /* not a good idea to call discard here directly,
                       * but otherwise we'd need to call it every free() */
                      ext4_discard_preallocations(inode);
                      ext4_free_blocks(handle, inode, NULL, newblock,
                                       EXT4_C2B(sbi, allocated_clusters), fb_flags);
                      goto out2;
              }
      
              /* previous routine could use block we allocated */
              newblock = ext4_ext_pblock(&newex);
              allocated = ext4_ext_get_actual_len(&newex);
               if (allocated > map->m_len)
                      allocated = map->m_len;
              map->m_flags |= EXT4_MAP_NEW;

               /*
               * Update reserved blocks/metadata blocks after successful
               * block allocation which had been deferred till now.
               */
              if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                      unsigned int reserved_clusters;
                      /*
                       * Check how many clusters we had reserved this allocated range
                       */
                      reserved_clusters = get_reserved_cluster_alloc(inode,
                                                      map->m_lblk, allocated);
                      if (!map_from_cluster) {
                              BUG_ON(allocated_clusters < reserved_clusters);
                              if (reserved_clusters < allocated_clusters) {
                                      struct ext4_inode_info *ei = EXT4_I(inode);
                                      int reservation = allocated_clusters -
                                                        reserved_clusters;
                                      /*
                                        * It seems we claimed a few clusters outside of
                                       * the range of this allocation. We should give
                                       * it back to the reservation pool. This can
                                       * happen in the following case:
                                       *
                                       * * Suppose s_cluster_ratio is 4 (i.e., each
                                        *   cluster has 4 blocks). Thus, the clusters
                                       *   are [0-3],[4-7],[8-11]...
                                       * * First comes delayed allocation write for
                                       *   logical blocks 10 & 11. Since there were no
                                       *   previous delayed allocated blocks in the
                                       *   range [8-11], we would reserve 1 cluster
                                       *   for this write.
                                       * * Next comes write for logical blocks 3 to 8.
                                       *   In this case, we will reserve 2 clusters
                                        *   (for [0-3] and [4-7]; and not for [8-11] as
                                        *   that range has delayed allocated blocks).
                                       *   Thus total reserved clusters now becomes 3.
                                       * * Now, during the delayed allocation writeout
                                       *   time, we will first write blocks [3-8] and
                                       *   allocate 3 clusters for writing these
                                       *   blocks. Also, we would claim all these
                                       *   three clusters above.
                                       * * Now when we come here to writeout the
                                       *   blocks [10-11], we would expect to claim
                                       *   the reservation of 1 cluster we had made
                                       *   (and we would claim it since there are no
                                       *   more delayed allocated blocks in the range
                                        *   [8-11]). But our reserved cluster count had
                                       *   already gone to 0.
                                       *
                                       *   Thus, at the step 4 above when we determine
                                       *   that there are still some unwritten delayed
                                       *   allocated blocks outside of our current
                                       *   block range, we should increment the
                                       *   reserved clusters count so that when the
                                       *   remaining blocks finally gets written, we
                                       *   could claim them.
                                       */
                                       dquot_reserve_block(inode,
                                                      EXT4_C2B(sbi, reservation));
                                      spin_lock(&ei->i_block_reservation_lock);
                                      ei->i_reserved_data_blocks += reservation;
                                      spin_unlock(&ei->i_block_reservation_lock);
                              }
                              /*
                               * We will claim quota for all newly allocated blocks.
                               * We're updating the reserved space *after* the
                                * correction above so we do not accidentally free
                                * all the metadata reservation because we might
                                * actually need it later on.
                                */
                               ext4_da_update_reserve_space(inode, allocated_clusters,
                                                               1);
                      }
              }
      
              /*
               * Cache the extent and update transaction to commit on fdatasync only
               * when it is _not_ an unwritten extent.
                */
              if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
                      ext4_update_inode_fsync_trans(handle, inode, 1);
               else
                      ext4_update_inode_fsync_trans(handle, inode, 0);
       out:
              if (allocated > map->m_len)
                      allocated = map->m_len;
              ext4_ext_show_leaf(inode, path);
              map->m_flags |= EXT4_MAP_MAPPED;
               map->m_pblk = newblock;
              map->m_len = allocated;
      out2:
              ext4_ext_drop_refs(path);
              kfree(path);
      
              trace_ext4_ext_map_blocks_exit(inode, flags, map,
                                             err ? err : allocated);
              return err ? err : allocated;
      }
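
       /*
        * Illustrative sketch, guarded out of the build: the reservation
        * give-back described in the long comment inside ext4_ext_map_blocks()
        * above, using the same assumed numbers (cluster ratio 4, delalloc at
        * blocks 10-11, then a writeout of blocks 3-8).
        */
       #if 0
       #include <stdio.h>

       int main(void)
       {
               unsigned int ratio = 4;
               unsigned int lblk = 3, len = 6;         /* writing out blocks 3-8 */

               /* Clusters touched by this allocation: [0-3], [4-7], [8-11]. */
               unsigned int allocated_clusters =
                       (lblk + len - 1) / ratio - lblk / ratio + 1;    /* 3 */

               /*
                * get_reserved_cluster_alloc() would exclude the right-hand
                * partial cluster [8-11], since blocks 10-11 there are still
                * delalloc'ed (assumed scenario).
                */
               unsigned int reserved_clusters = 2;

               /* One cluster's worth of reservation goes back to the pool. */
               unsigned int give_back = allocated_clusters - reserved_clusters;

               printf("allocated %u, claimed %u, returned %u\n",
                      allocated_clusters, reserved_clusters, give_back);
               return 0;
       }
       #endif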
      
      void ext4_ext_truncate(handle_t *handle, struct inode *inode)
      {
              struct super_block *sb = inode->i_sb;
              ext4_lblk_t last_block;
              int err = 0;
      
               /*
               * TODO: optimization is possible here.
               * Probably we need not scan at all,
               * because page truncation is enough.
               */
      
              /* we have to know where to truncate from in crash case */
               EXT4_I(inode)->i_disksize = inode->i_size;
              ext4_mark_inode_dirty(handle, inode);
      
              last_block = (inode->i_size + sb->s_blocksize - 1)
                               >> EXT4_BLOCK_SIZE_BITS(sb);
       retry:
              err = ext4_es_remove_extent(inode, last_block,
                                          EXT_MAX_BLOCKS - last_block);
              if (err == -ENOMEM) {
                      cond_resched();
                      congestion_wait(BLK_RW_ASYNC, HZ/50);
                      goto retry;
              }
              if (err) {
                      ext4_std_error(inode->i_sb, err);
                      return;
               }
              err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
              ext4_std_error(inode->i_sb, err);
      }
      
      static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
                                        ext4_lblk_t len, loff_t new_size,
                                        int flags, int mode)
      {
              struct inode *inode = file_inode(file);
              handle_t *handle;
              int ret = 0;
              int ret2 = 0;
              int retries = 0;
               int depth = 0;
              struct ext4_map_blocks map;
              unsigned int credits;
              loff_t epos;
      
               map.m_lblk = offset;
              map.m_len = len;
              /*
               * Don't normalize the request if it can fit in one extent so
               * that it doesn't get unnecessarily split into multiple
                * extents.
               */
              if (len <= EXT_UNWRITTEN_MAX_LEN)
                      flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
      
               /*
               * credits to insert 1 extent into extent tree
               */
              credits = ext4_chunk_trans_blocks(inode, len);
               /*
                * We can only call ext_depth() on extent based inodes
                */
               if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                       depth = ext_depth(inode);
               else
                      depth = -1;
      
      retry:
              while (ret >= 0 && len) {
                      /*
                        * Recalculate credits when extent tree depth changes.
                       */
                      if (depth >= 0 && depth != ext_depth(inode)) {
                              credits = ext4_chunk_trans_blocks(inode, len);
                              depth = ext_depth(inode);
                      }

                       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                                  credits);
                      if (IS_ERR(handle)) {
                               ret = PTR_ERR(handle);
                              break;
                      }
                       ret = ext4_map_blocks(handle, inode, &map, flags);
                      if (ret <= 0) {
                              ext4_debug("inode #%lu: block %u: len %u: "
                                         "ext4_ext_map_blocks returned %d",
                                         inode->i_ino, map.m_lblk,
                                         map.m_len, ret);
                              ext4_mark_inode_dirty(handle, inode);
                               ret2 = ext4_journal_stop(handle);
                               break;
                      }
                      map.m_lblk += ret;
                       map.m_len = len = len - ret;
                       epos = (loff_t)map.m_lblk << inode->i_blkbits;
                       inode->i_ctime = ext4_current_time(inode);
                       if (new_size) {
                               if (epos > new_size)
                                       epos = new_size;
                               if (ext4_update_inode_size(inode, epos) & 0x1)
                                       inode->i_mtime = inode->i_ctime;
                      } else {
                              if (epos > inode->i_size)
                                      ext4_set_inode_flag(inode,
                                                          EXT4_INODE_EOFBLOCKS);
                       }
                       ext4_mark_inode_dirty(handle, inode);
                      ext4_update_inode_fsync_trans(handle, inode, 1);
                      ret2 = ext4_journal_stop(handle);
                      if (ret2)
                              break;
              }
              if (ret == -ENOSPC &&
                              ext4_should_retry_alloc(inode->i_sb, &retries)) {
                      ret = 0;
                      goto retry;
              }
      
              return ret > 0 ? ret2 : ret;
      }
      
      static long ext4_zero_range(struct file *file, loff_t offset,
                                  loff_t len, int mode)
      {
              struct inode *inode = file_inode(file);
              handle_t *handle = NULL;
              unsigned int max_blocks;
              loff_t new_size = 0;
              int ret = 0;
              int flags;
              int credits;
              int partial_begin, partial_end;
               loff_t start, end;
              ext4_lblk_t lblk;
              unsigned int blkbits = inode->i_blkbits;
      
              trace_ext4_zero_range(inode, offset, len, mode);
      
              if (!S_ISREG(inode->i_mode))
                      return -EINVAL;
      
              /* Call ext4_force_commit to flush all data in case of data=journal. */
              if (ext4_should_journal_data(inode)) {
                      ret = ext4_force_commit(inode->i_sb);
                      if (ret)
                              return ret;
              }
      
              /*
                * Round up offset. This is not fallocate, we need to zero out
               * blocks, so convert interior block aligned part of the range to
               * unwritten and possibly manually zero out unaligned parts of the
               * range.
               */
              start = round_up(offset, 1 << blkbits);
              end = round_down((offset + len), 1 << blkbits);
      
              if (start < offset || end > offset + len)
                      return -EINVAL;
              partial_begin = offset & ((1 << blkbits) - 1);
              partial_end = (offset + len) & ((1 << blkbits) - 1);
      
              lblk = start >> blkbits;
              max_blocks = (end >> blkbits);
              if (max_blocks < lblk)
                      max_blocks = 0;
              else
                      max_blocks -= lblk;
      
              mutex_lock(&inode->i_mutex);
      
              /*
                * Indirect files do not support unwritten extents
               */
              if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                      ret = -EOPNOTSUPP;
                      goto out_mutex;
              }
      
              if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                  (offset + len > i_size_read(inode) ||
                   offset + len > EXT4_I(inode)->i_disksize)) {
                      new_size = offset + len;
                      ret = inode_newsize_ok(inode, new_size);
                      if (ret)
                              goto out_mutex;
              }
      
              flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
              if (mode & FALLOC_FL_KEEP_SIZE)
                      flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
      
              /* Wait all existing dio workers, newcomers will block on i_mutex */
              ext4_inode_block_unlocked_dio(inode);
              inode_dio_wait(inode);
      
              /* Preallocate the range including the unaligned edges */
              if (partial_begin || partial_end) {
                      ret = ext4_alloc_file_blocks(file,
                                      round_down(offset, 1 << blkbits) >> blkbits,
                                      (round_up((offset + len), 1 << blkbits) -
                                       round_down(offset, 1 << blkbits)) >> blkbits,
                                      new_size, flags, mode);
                      if (ret)
                              goto out_dio;
      
              }
      
              /* Zero range excluding the unaligned edges */
              if (max_blocks > 0) {
                      flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                                EXT4_EX_NOCACHE);
      
                      /*
                       * Prevent page faults from reinstantiating pages we have
                       * released from page cache.
                       */
                      down_write(&EXT4_I(inode)->i_mmap_sem);
                      ret = ext4_update_disksize_before_punch(inode, offset, len);
                      if (ret) {
                              up_write(&EXT4_I(inode)->i_mmap_sem);
                              goto out_dio;
                      }
                      /* Now release the pages and zero block aligned part of pages */
                      truncate_pagecache_range(inode, start, end - 1);
                      inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
      
                      ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                                   flags, mode);
                      up_write(&EXT4_I(inode)->i_mmap_sem);
                      if (ret)
                              goto out_dio;
              }
              if (!partial_begin && !partial_end)
                      goto out_dio;
      
              /*
               * In worst case we have to writeout two nonadjacent unwritten
               * blocks and update the inode
               */
              credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
              if (ext4_should_journal_data(inode))
                      credits += 2;
              handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      ext4_std_error(inode->i_sb, ret);
                      goto out_dio;
              }
      
              inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
              if (new_size) {
                      ext4_update_inode_size(inode, new_size);
              } else {
                      /*
                      * Mark that we allocate beyond EOF so the subsequent truncate
                      * can proceed even if the new size is the same as i_size.
                      */
                      if ((offset + len) > i_size_read(inode))
                              ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
              }
              ext4_mark_inode_dirty(handle, inode);
      
              /* Zero out partial block at the edges of the range */
              ret = ext4_zero_partial_blocks(handle, inode, offset, len);
              if (ret >= 0)
                      ext4_update_inode_fsync_trans(handle, inode, 1);
      
              if (file->f_flags & O_SYNC)
                      ext4_handle_sync(handle);
      
              ext4_journal_stop(handle);
      out_dio:
              ext4_inode_resume_unlocked_dio(inode);
      out_mutex:
               mutex_unlock(&inode->i_mutex);
              return ret;
      }
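
       /*
        * Illustrative sketch, guarded out of the build: the alignment maths
        * at the top of ext4_zero_range() above.  Block size, offset and len
        * are assumed values; round_up()/round_down() are expanded by hand.
        */
       #if 0
       #include <stdio.h>

       int main(void)
       {
               unsigned int blkbits = 12;                      /* 4K blocks, assumed */
               unsigned long long blocksize = 1ULL << blkbits;
               unsigned long long offset = 3000, len = 10000;  /* assumed request */

               unsigned long long start = (offset + blocksize - 1) & ~(blocksize - 1);
               unsigned long long end = (offset + len) & ~(blocksize - 1);
               unsigned long long partial_begin = offset & (blocksize - 1);
               unsigned long long partial_end = (offset + len) & (blocksize - 1);

               unsigned long long lblk = start >> blkbits;                     /* 1 */
               unsigned long long max_blocks = (end >> blkbits) - lblk;        /* 2 */

               printf("start=%llu end=%llu partial_begin=%llu partial_end=%llu "
                      "lblk=%llu max_blocks=%llu\n",
                      start, end, partial_begin, partial_end, lblk, max_blocks);
               return 0;
       }
       #endif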
      
      /*
       * preallocate space for a file. This implements ext4's fallocate file
        * operation, which gets called from sys_fallocate system call.
       * For block-mapped files, posix_fallocate should fall back to the method
       * of writing zeroes to the required new blocks (the same behavior which is
       * expected for file systems which do not support fallocate() system call).
       */
      long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
      {
              struct inode *inode = file_inode(file);
              loff_t new_size = 0;
              unsigned int max_blocks;
              int ret = 0;
              int flags;
              ext4_lblk_t lblk;
              unsigned int blkbits = inode->i_blkbits;
      
              /*
               * Encrypted inodes can't handle collapse range or insert
               * range since we would need to re-encrypt blocks with a
               * different IV or XTS tweak (which are based on the logical
               * block number).
               *
               * XXX It's not clear why zero range isn't working, but we'll
               * leave it disabled for encrypted inodes for now.  This is a
                * bug we should fix....
                */
              if (ext4_encrypted_inode(inode) &&
                  (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
                           FALLOC_FL_ZERO_RANGE)))
                      return -EOPNOTSUPP;
      
               /* Return error if mode is not supported */
              if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
                           FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
                            FALLOC_FL_INSERT_RANGE))
                      return -EOPNOTSUPP;
      
               if (mode & FALLOC_FL_PUNCH_HOLE)
                       return ext4_punch_hole(inode, offset, len);
      
  170         ret = ext4_convert_inline_data(inode);
  170         if (ret)
                      return ret;
      
              if (mode & FALLOC_FL_COLLAPSE_RANGE)
                      return ext4_collapse_range(inode, offset, len);
      
              if (mode & FALLOC_FL_INSERT_RANGE)
                      return ext4_insert_range(inode, offset, len);
      
              if (mode & FALLOC_FL_ZERO_RANGE)
                      return ext4_zero_range(file, offset, len, mode);
      
        trace_ext4_fallocate_enter(inode, offset, len, mode);
              lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because the range may
         * straddle block boundaries: with blocksize = 4096, offset = 3072
         * and len = 2048, two blocks are covered even though len is less
         * than one block.
         */
              max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
                      - lblk;
      
              flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
              if (mode & FALLOC_FL_KEEP_SIZE)
                      flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
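        /*
         * The blocks are preallocated as unwritten extents, so reads of the
         * new range return zeroes rather than stale data; KEEP_SIZE
         * additionally leaves i_size untouched.
         */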
      
              mutex_lock(&inode->i_mutex);
      
        /*
         * We only support preallocation for extent-based files
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                ret = -EOPNOTSUPP;
                goto out;
        }
      
              if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                  (offset + len > i_size_read(inode) ||
                   offset + len > EXT4_I(inode)->i_disksize)) {
                      new_size = offset + len;
                      ret = inode_newsize_ok(inode, new_size);
                      if (ret)
                        goto out;
              }
      
        /* Wait for all existing dio workers; newcomers will block on i_mutex */
              ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);

              ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                           flags, mode);
              ext4_inode_resume_unlocked_dio(inode);
              if (ret)
                      goto out;
      
              if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
                      ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
                                                      EXT4_I(inode)->i_sync_tid);
              }
      out:
              mutex_unlock(&inode->i_mutex);
              trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
              return ret;
      }
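
/*
 * Illustrative userspace sketch (not part of this file): roughly how the
 * modes handled above are reached through the fallocate(2) system call.
 * The function name example() and the offsets/lengths are arbitrary example
 * values; the collapse case assumes they are filesystem-block aligned.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>

int example(int fd)
{
        /* Plain preallocation: extends i_size to 1 MiB if needed. */
        if (fallocate(fd, 0, 0, 1 << 20))
                perror("fallocate");

        /* Preallocate past EOF without changing i_size. */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 1 << 20, 1 << 20))
                perror("fallocate KEEP_SIZE");

        /* Punch a hole; PUNCH_HOLE must be combined with KEEP_SIZE. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4096, 8192))
                perror("fallocate PUNCH_HOLE");

        /* Remove a block-aligned range and shift the tail left. */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096))
                perror("fallocate COLLAPSE_RANGE");

        return 0;
}
#endif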
      
/*
 * This function converts a range of blocks to written extents.
 * The caller of this function will pass the start offset and the size.
 * All unwritten extents within this range will be converted to
 * written extents.
 *
 * This function is called from the direct IO end io callback
 * function, to convert the fallocated extents after IO is completed.
 * Returns 0 on success.
 */
      int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                         loff_t offset, ssize_t len)
      {
              unsigned int max_blocks;
              int ret = 0;
              int ret2 = 0;
              struct ext4_map_blocks map;
              unsigned int credits, blkbits = inode->i_blkbits;
      
              map.m_lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because the range may
         * straddle block boundaries: with blocksize = 4096, offset = 3072
         * and len = 2048, two blocks are covered even though len is less
         * than one block.
         */
              max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
                            map.m_lblk);
        /*
         * This is somewhat ugly but the idea is clear: when a transaction is
         * reserved, everything goes into it. Otherwise we'd rather start
         * several smaller transactions for conversion of each extent
         * separately.
         */
        if (handle) {
                handle = ext4_journal_start_reserved(handle,
                                                     EXT4_HT_EXT_CONVERT);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
                credits = 0;
        } else {
                /*
                 * credits to insert 1 extent into extent tree
                 */
                credits = ext4_chunk_trans_blocks(inode, max_blocks);
        }
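        /*
         * From here on, credits == 0 means the caller's reserved handle is
         * used for the whole conversion; otherwise each extent converted in
         * the loop below gets its own small transaction.
         */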
              while (ret >= 0 && ret < max_blocks) {
                      map.m_lblk += ret;
                      map.m_len = (max_blocks -= ret);
                      if (credits) {
                              handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                                          credits);
                        if (IS_ERR(handle)) {
                                ret = PTR_ERR(handle);
                                break;
                        }
                }
                ret = ext4_map_blocks(handle, inode, &map,
                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
                if (ret <= 0)
                        ext4_warning(inode->i_sb,
                                           "inode #%lu: block %u: len %u: "
                                           "ext4_ext_map_blocks returned %d",
                                           inode->i_ino, map.m_lblk,
                                           map.m_len, ret);
                      ext4_mark_inode_dirty(handle, inode);
                      if (credits)
                              ret2 = ext4_journal_stop(handle);
                      if (ret <= 0 || ret2)
                              break;
              }
              if (!credits)
                      ret2 = ext4_journal_stop(handle);
              return ret > 0 ? ret2 : ret;
      }
      
/*
 * If newes is not an existing extent (newes->es_pblk equals zero), find the
 * delayed extent at the start of newes, update newes accordingly, and
 * return the start of the next delayed extent.
 *
 * If newes is an existing extent (newes->es_pblk is not equal to zero),
 * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
 * delayed extent is found. Leave newes unmodified.
 */
      static int ext4_find_delayed_extent(struct inode *inode,
                                          struct extent_status *newes)
      {
              struct extent_status es;
              ext4_lblk_t block, next_del;

        if (newes->es_pblk == 0) {
                ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
                                newes->es_lblk + newes->es_len - 1, &es);

                /*
                 * If no extent in the extent tree contains the block, then
                 * the block may lie in a hole or in a delayed extent.
                 */
                if (es.es_len == 0)
                        /* A hole found. */
                        return 0;

                if (es.es_lblk > newes->es_lblk) {
                        /* A hole found. */
                        newes->es_len = min(es.es_lblk - newes->es_lblk,
                                            newes->es_len);
                        return 0;
                }

                newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
        }
      
              block = newes->es_lblk + newes->es_len;
              ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
              if (es.es_len == 0)
                      next_del = EXT_MAX_BLOCKS;
              else
                      next_del = es.es_lblk;
      
              return next_del;
      }
/* fiemap flags we can handle specified here */
      #define EXT4_FIEMAP_FLAGS        (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
      
      static int ext4_xattr_fiemap(struct inode *inode,
                                struct fiemap_extent_info *fieinfo)
      {
              __u64 physical = 0;
        __u64 length;
              __u32 flags = FIEMAP_EXTENT_LAST;
              int blockbits = inode->i_sb->s_blocksize_bits;
              int error = 0;
      
              /* in-inode? */
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                      struct ext4_iloc iloc;
                      int offset;        /* offset of xattr in inode */
      
                      error = ext4_get_inode_loc(inode, &iloc);
                      if (error)
                        return error;
                physical = (__u64)iloc.bh->b_blocknr << blockbits;
                      offset = EXT4_GOOD_OLD_INODE_SIZE +
                                      EXT4_I(inode)->i_extra_isize;
                      physical += offset;
                      length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
                      flags |= FIEMAP_EXTENT_DATA_INLINE;
                      brelse(iloc.bh);
              } else { /* external block */
                      physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
                      length = inode->i_sb->s_blocksize;
              }

        if (physical)
                      error = fiemap_fill_next_extent(fieinfo, 0, physical,
                                                      length, flags);
              return (error < 0 ? error : 0);
      }
      
      int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                      __u64 start, __u64 len)
      {
        ext4_lblk_t start_blk;
        int error = 0;

        if (ext4_has_inline_data(inode)) {
                int has_inline = 1;

                error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
                                                start, len);

                if (has_inline)
                        return error;
        }

        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                error = ext4_ext_precache(inode);
                if (error)
                        return error;
        }

        /* fallback to generic here if not in extents fmt */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                      return generic_block_fiemap(inode, fieinfo, start, len,
                              ext4_get_block);
      
              if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
                      return -EBADR;
      
              if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
                      error = ext4_xattr_fiemap(inode, fieinfo);
              } else {
                ext4_lblk_t len_blks;
                      __u64 last_blk;
      
                      start_blk = start >> inode->i_sb->s_blocksize_bits;
                      last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
                      if (last_blk >= EXT_MAX_BLOCKS)
                              last_blk = EXT_MAX_BLOCKS-1;
                      len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
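                /*
                 * len_blks is an inclusive block count; last_blk was clamped
                 * above, so the cast to ext4_lblk_t cannot overflow.
                 */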
      
                      /*
                       * Walk the extent tree gathering extent information
                       * and pushing extents back to the user.
                       */
                      error = ext4_fill_fiemap_extents(inode, start_blk,
                                                       len_blks, fieinfo);
              }
              return error;
      }
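
/*
 * Illustrative userspace sketch (not part of this file): the extent walk
 * above is what services FS_IOC_FIEMAP. A caller sizes fm_extent_count and
 * reads back fm_mapped_extents entries. The function name dump_extents()
 * and the extent count of 32 are arbitrary example values.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int dump_extents(int fd)
{
        unsigned int i, count = 32;
        struct fiemap *fm;

        fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
        if (!fm)
                return -1;
        fm->fm_start = 0;
        fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
        fm->fm_flags = FIEMAP_FLAG_SYNC;        /* flush dirty data first */
        fm->fm_extent_count = count;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                perror("FS_IOC_FIEMAP");
                free(fm);
                return -1;
        }
        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("logical %llu physical %llu len %llu flags %x\n",
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_physical,
                       (unsigned long long)fm->fm_extents[i].fe_length,
                       fm->fm_extents[i].fe_flags);
        free(fm);
        return 0;
}
#endif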
      
      /*
       * ext4_access_path:
       * Function to access the path buffer for marking it dirty.
       * It also checks if there are sufficient credits left in the journal handle
       * to update path.
       */
      static int
      ext4_access_path(handle_t *handle, struct inode *inode,
                      struct ext4_ext_path *path)
      {
              int credits, err;
      
              if (!ext4_handle_valid(handle))
                      return 0;
      
              /*
               * Check if need to extend journal credits
               * 3 for leaf, sb, and inode plus 2 (bmap and group
               * descriptor) for each block group; assume two block
               * groups
               */
              if (handle->h_buffer_credits < 7) {
                      credits = ext4_writepage_trans_blocks(inode);
                      err = ext4_ext_truncate_extend_restart(handle, inode, credits);
                      /* EAGAIN is success */
                      if (err && err != -EAGAIN)
                              return err;
              }
      
              err = ext4_ext_get_access(handle, inode, path);
              return err;
      }
      
      /*
       * ext4_ext_shift_path_extents:
       * Shift the extents of a path structure lying between path[depth].p_ext
 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 * whether it is a right-shift or a left-shift operation.
       */
      static int
      ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                                  struct inode *inode, handle_t *handle,
                                  enum SHIFT_DIRECTION SHIFT)
      {
              int depth, err = 0;
              struct ext4_extent *ex_start, *ex_last;
        bool update = false;
              depth = path->p_depth;
      
              while (depth >= 0) {
                      if (depth == path->p_depth) {
                              ex_start = path[depth].p_ext;
                              if (!ex_start)
                                      return -EFSCORRUPTED;
      
                              ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
      
                              err = ext4_access_path(handle, inode, path + depth);
                              if (err)
                                      goto out;
      
                              if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
                                update = true;
      
                              while (ex_start <= ex_last) {
                                      if (SHIFT == SHIFT_LEFT) {
                                              le32_add_cpu(&ex_start->ee_block,
                                                      -shift);
                                              /* Try to merge to the left. */
                                              if ((ex_start >
                                                  EXT_FIRST_EXTENT(path[depth].p_hdr))
                                                  &&
                                                  ext4_ext_try_to_merge_right(inode,
                                                  path, ex_start - 1))
                                                      ex_last--;
                                              else
                                                      ex_start++;
                                      } else {
                                              le32_add_cpu(&ex_last->ee_block, shift);
                                              ext4_ext_try_to_merge_right(inode, path,
                                                      ex_last);
                                              ex_last--;
                                      }
                              }
                              err = ext4_ext_dirty(handle, inode, path + depth);
                              if (err)
                                      goto out;
      
                              if (--depth < 0 || !update)
                                      break;
                      }
      
                      /* Update index too */
                      err = ext4_access_path(handle, inode, path + depth);
                      if (err)
                              goto out;
      
                      if (SHIFT == SHIFT_LEFT)
                              le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
                      else
                              le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                      err = ext4_ext_dirty(handle, inode, path + depth);
                      if (err)
                              goto out;
      
                      /* we are done if current index is not a starting index */
                      if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
                              break;
      
                      depth--;
              }
      
      out:
              return err;
      }
      
      /*
       * ext4_ext_shift_extents:
 * All the extents which lie in the range from @start to the last allocated
 * block for the @inode are shifted either to the left or to the right
 * (depending upon @SHIFT) by @shift blocks.
       * On success, 0 is returned, error otherwise.
       */
      static int
      ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
                             ext4_lblk_t start, ext4_lblk_t shift,
                             enum SHIFT_DIRECTION SHIFT)
      {
              struct ext4_ext_path *path;
              int ret = 0, depth;
              struct ext4_extent *extent;
              ext4_lblk_t stop, *iterator, ex_start, ex_end;
      
              /* Let path point to the last extent */
              path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                      EXT4_EX_NOCACHE);
              if (IS_ERR(path))
                      return PTR_ERR(path);
      
              depth = path->p_depth;
              extent = path[depth].p_ext;
              if (!extent)
                      goto out;
      
              stop = le32_to_cpu(extent->ee_block);
      
             /*
              * For left shifts, make sure the hole on the left is big enough to
              * accommodate the shift.  For right shifts, make sure the last extent
              * won't be shifted beyond EXT_MAX_BLOCKS.
              */
              if (SHIFT == SHIFT_LEFT) {
                      path = ext4_find_extent(inode, start - 1, &path,
                                              EXT4_EX_NOCACHE);
                      if (IS_ERR(path))
                              return PTR_ERR(path);
                      depth = path->p_depth;
                      extent =  path[depth].p_ext;
                      if (extent) {
                              ex_start = le32_to_cpu(extent->ee_block);
                              ex_end = le32_to_cpu(extent->ee_block) +
                                      ext4_ext_get_actual_len(extent);
                      } else {
                              ex_start = 0;
                              ex_end = 0;
                      }
      
                      if ((start == ex_start && shift > ex_start) ||
                          (shift > start - ex_end)) {
                              ret = -EINVAL;
                              goto out;
                      }
              } else {
                      if (shift > EXT_MAX_BLOCKS -
                          (stop + ext4_ext_get_actual_len(extent))) {
                              ret = -EINVAL;
                              goto out;
                      }
              }
      
              /*
               * In case of left shift, iterator points to start and it is increased
               * till we reach stop. In case of right shift, iterator points to stop
               * and it is decreased till we reach start.
               */
              if (SHIFT == SHIFT_LEFT)
                      iterator = &start;
              else
                      iterator = &stop;
      
        /*
         * It's safe to start updating extents. Start and stop are unsigned,
         * so in case of right shift, if an extent with block 0 is reached,
         * iterator becomes NULL to indicate the end of the loop.
         */
              while (iterator && start <= stop) {
                      path = ext4_find_extent(inode, *iterator, &path,
                                              EXT4_EX_NOCACHE);
                      if (IS_ERR(path))
                              return PTR_ERR(path);
                      depth = path->p_depth;
                      extent = path[depth].p_ext;
                      if (!extent) {
                              EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                               (unsigned long) *iterator);
                              return -EFSCORRUPTED;
                      }
                      if (SHIFT == SHIFT_LEFT && *iterator >
                          le32_to_cpu(extent->ee_block)) {
                              /* Hole, move to the next extent */
                              if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                      path[depth].p_ext++;
                              } else {
                                      *iterator = ext4_ext_next_allocated_block(path);
                                      continue;
                              }
                      }
      
                      if (SHIFT == SHIFT_LEFT) {
                              extent = EXT_LAST_EXTENT(path[depth].p_hdr);
                              *iterator = le32_to_cpu(extent->ee_block) +
                                              ext4_ext_get_actual_len(extent);
                      } else {
                              extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
                              if (le32_to_cpu(extent->ee_block) > 0)
                                      *iterator = le32_to_cpu(extent->ee_block) - 1;
                              else
                                      /* Beginning is reached, end of the loop */
                                      iterator = NULL;
                              /* Update path extent in case we need to stop */
                              while (le32_to_cpu(extent->ee_block) < start)
                                      extent++;
                              path[depth].p_ext = extent;
                      }
                      ret = ext4_ext_shift_path_extents(path, shift, inode,
                                      handle, SHIFT);
                      if (ret)
                              break;
              }
      out:
              ext4_ext_drop_refs(path);
              kfree(path);
              return ret;
      }
      
      /*
       * ext4_collapse_range:
 * This implements the fallocate's collapse range functionality for ext4.
 * Returns 0 on success, a negative error code on failure.
       */
      int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
      {
              struct super_block *sb = inode->i_sb;
              ext4_lblk_t punch_start, punch_stop;
              handle_t *handle;
              unsigned int credits;
              loff_t new_size, ioffset;
              int ret;
      
              /*
               * We need to test this early because xfstests assumes that a
               * collapse range of (0, 1) will return EOPNOTSUPP if the file
               * system does not support collapse range.
               */
              if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                      return -EOPNOTSUPP;
      
              /* Collapse range works only on fs block size aligned offsets. */
              if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
                  len & (EXT4_CLUSTER_SIZE(sb) - 1))
                      return -EINVAL;
      
              if (!S_ISREG(inode->i_mode))
                      return -EINVAL;
      
              trace_ext4_collapse_range(inode, offset, len);
      
              punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
              punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
      
              /* Call ext4_force_commit to flush all data in case of data=journal. */
              if (ext4_should_journal_data(inode)) {
                      ret = ext4_force_commit(inode->i_sb);
                      if (ret)
                              return ret;
              }
      
              mutex_lock(&inode->i_mutex);
        /*
         * There is no need to allow the collapse range to overlap EOF, since
         * in that case it would effectively be a truncate operation.
         */
              if (offset + len >= i_size_read(inode)) {
                      ret = -EINVAL;
                      goto out_mutex;
              }
      
              /* Currently just for extent based files */
              if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                      ret = -EOPNOTSUPP;
                      goto out_mutex;
              }
      
              /* Wait for existing dio to complete */
              ext4_inode_block_unlocked_dio(inode);
              inode_dio_wait(inode);
      
              /*
               * Prevent page faults from reinstantiating pages we have released from
               * page cache.
               */
              down_write(&EXT4_I(inode)->i_mmap_sem);
              /*
               * Need to round down offset to be aligned with page size boundary
               * for page size > block size.
               */
              ioffset = round_down(offset, PAGE_SIZE);
              /*
               * Write tail of the last page before removed range since it will get
               * removed from the page cache below.
               */
              ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
              if (ret)
                      goto out_mmap;
              /*
               * Write data that will be shifted to preserve them when discarding
               * page cache below. We are also protected from pages becoming dirty
               * by i_mmap_sem.
               */
              ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
                                                 LLONG_MAX);
              if (ret)
                      goto out_mmap;
              truncate_pagecache(inode, ioffset);
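        /*
         * Page faults cannot repopulate this range while i_mmap_sem is held
         * for write, so the page cache stays consistent with the extent
         * shift performed below.
         */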
      
              credits = ext4_writepage_trans_blocks(inode);
              handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      goto out_mmap;
              }
      
              down_write(&EXT4_I(inode)->i_data_sem);
              ext4_discard_preallocations(inode);
      
              ret = ext4_es_remove_extent(inode, punch_start,
                                          EXT_MAX_BLOCKS - punch_start);
              if (ret) {
                      up_write(&EXT4_I(inode)->i_data_sem);
                      goto out_stop;
              }
      
              ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
              if (ret) {
                      up_write(&EXT4_I(inode)->i_data_sem);
                      goto out_stop;
              }
              ext4_discard_preallocations(inode);
      
              ret = ext4_ext_shift_extents(inode, handle, punch_stop,
                                           punch_stop - punch_start, SHIFT_LEFT);
              if (ret) {
                      up_write(&EXT4_I(inode)->i_data_sem);
                      goto out_stop;
              }
      
              new_size = i_size_read(inode) - len;
              i_size_write(inode, new_size);
              EXT4_I(inode)->i_disksize = new_size;
      
              up_write(&EXT4_I(inode)->i_data_sem);
              if (IS_SYNC(inode))
                      ext4_handle_sync(handle);
              inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
              ext4_mark_inode_dirty(handle, inode);
              ext4_update_inode_fsync_trans(handle, inode, 1);
      
      out_stop:
              ext4_journal_stop(handle);
      out_mmap:
              up_write(&EXT4_I(inode)->i_mmap_sem);
              ext4_inode_resume_unlocked_dio(inode);
      out_mutex:
              mutex_unlock(&inode->i_mutex);
              return ret;
      }
      
      /*
       * ext4_insert_range:
       * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
 * The data blocks starting from @offset to EOF are shifted by @len
 * towards the right to create a hole in the @inode. The inode size is
 * increased by @len bytes.
       * Returns 0 on success, error otherwise.
       */
      int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
      {
              struct super_block *sb = inode->i_sb;
              handle_t *handle;
              struct ext4_ext_path *path;
              struct ext4_extent *extent;
              ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
              unsigned int credits, ee_len;
              int ret = 0, depth, split_flag = 0;
              loff_t ioffset;
      
              /*
               * We need to test this early because xfstests assumes that an
               * insert range of (0, 1) will return EOPNOTSUPP if the file
               * system does not support insert range.
               */
              if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                      return -EOPNOTSUPP;
      
              /* Insert range works only on fs block size aligned offsets. */
              if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
                              len & (EXT4_CLUSTER_SIZE(sb) - 1))
                      return -EINVAL;
      
              if (!S_ISREG(inode->i_mode))
                      return -EOPNOTSUPP;
      
              trace_ext4_insert_range(inode, offset, len);
      
              offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
              len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
      
              /* Call ext4_force_commit to flush all data in case of data=journal */
              if (ext4_should_journal_data(inode)) {
                      ret = ext4_force_commit(inode->i_sb);
                      if (ret)
                              return ret;
              }
      
              mutex_lock(&inode->i_mutex);
              /* Currently just for extent based files */
              if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                      ret = -EOPNOTSUPP;
                      goto out_mutex;
              }
      
              /* Check for wrap through zero */
              if (inode->i_size + len > inode->i_sb->s_maxbytes) {
                      ret = -EFBIG;
                      goto out_mutex;
              }
      
              /* Offset should be less than i_size */
              if (offset >= i_size_read(inode)) {
                      ret = -EINVAL;
                      goto out_mutex;
              }
      
              /* Wait for existing dio to complete */
              ext4_inode_block_unlocked_dio(inode);
              inode_dio_wait(inode);
      
              /*
               * Prevent page faults from reinstantiating pages we have released from
               * page cache.
               */
              down_write(&EXT4_I(inode)->i_mmap_sem);
              /*
               * Need to round down to align start offset to page size boundary
               * for page size > block size.
               */
              ioffset = round_down(offset, PAGE_SIZE);
              /* Write out all dirty pages */
              ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
                              LLONG_MAX);
              if (ret)
                      goto out_mmap;
              truncate_pagecache(inode, ioffset);
      
              credits = ext4_writepage_trans_blocks(inode);
              handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      goto out_mmap;
              }
      
              /* Expand file to avoid data loss if there is error while shifting */
              inode->i_size += len;
              EXT4_I(inode)->i_disksize += len;
              inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
              ret = ext4_mark_inode_dirty(handle, inode);
              if (ret)
                      goto out_stop;
      
              down_write(&EXT4_I(inode)->i_data_sem);
              ext4_discard_preallocations(inode);
      
              path = ext4_find_extent(inode, offset_lblk, NULL, 0);
              if (IS_ERR(path)) {
                      up_write(&EXT4_I(inode)->i_data_sem);
                      goto out_stop;
              }
      
              depth = ext_depth(inode);
              extent = path[depth].p_ext;
              if (extent) {
                      ee_start_lblk = le32_to_cpu(extent->ee_block);
                      ee_len = ext4_ext_get_actual_len(extent);
      
                      /*
                       * If offset_lblk is not the starting block of extent, split
                       * the extent @offset_lblk
                       */
                      if ((offset_lblk > ee_start_lblk) &&
                                      (offset_lblk < (ee_start_lblk + ee_len))) {
                              if (ext4_ext_is_unwritten(extent))
                                      split_flag = EXT4_EXT_MARK_UNWRIT1 |
                                              EXT4_EXT_MARK_UNWRIT2;
                              ret = ext4_split_extent_at(handle, inode, &path,
                                              offset_lblk, split_flag,
                                              EXT4_EX_NOCACHE |
                                              EXT4_GET_BLOCKS_PRE_IO |
                                              EXT4_GET_BLOCKS_METADATA_NOFAIL);
                      }
      
                      ext4_ext_drop_refs(path);
                      kfree(path);
                      if (ret < 0) {
                              up_write(&EXT4_I(inode)->i_data_sem);
                              goto out_stop;
                      }
              } else {
                      ext4_ext_drop_refs(path);
                      kfree(path);
              }
      
              ret = ext4_es_remove_extent(inode, offset_lblk,
                              EXT_MAX_BLOCKS - offset_lblk);
              if (ret) {
                      up_write(&EXT4_I(inode)->i_data_sem);
                      goto out_stop;
              }
      
              /*
               * if offset_lblk lies in a hole which is at start of file, use
               * ee_start_lblk to shift extents
               */
              ret = ext4_ext_shift_extents(inode, handle,
                      ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
                      len_lblk, SHIFT_RIGHT);
      
              up_write(&EXT4_I(inode)->i_data_sem);
              if (IS_SYNC(inode))
                      ext4_handle_sync(handle);
              if (ret >= 0)
                      ext4_update_inode_fsync_trans(handle, inode, 1);
      
      out_stop:
              ext4_journal_stop(handle);
      out_mmap:
              up_write(&EXT4_I(inode)->i_mmap_sem);
              ext4_inode_resume_unlocked_dio(inode);
      out_mutex:
              mutex_unlock(&inode->i_mutex);
              return ret;
      }
      
/**
 * ext4_swap_extents - Swap extents between two inodes
 *
 * @handle:        Journal handle
 * @inode1:        First inode
 * @inode2:        Second inode
 * @lblk1:        Start block for first inode
 * @lblk2:        Start block for second inode
 * @count:        Number of blocks to swap
 * @unwritten:        Mark second inode's extents as unwritten after swap
 * @erp:        Pointer to save error value
 *
 * This helper routine does exactly what it promises: swap extents. All other
 * stuff such as page-cache locking consistency, bh mapping consistency or
 * extent's data copying must be performed by the caller.
 * Locking:
 *                 i_mutex is held for both inodes
 *                 i_data_sem is locked for write for both inodes
 * Assumptions:
 *                All pages from requested range are locked for both inodes
 */
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
                  ext4_lblk_t count, int unwritten, int *erp)
{
        struct ext4_ext_path *path1 = NULL;
        struct ext4_ext_path *path2 = NULL;
        int replaced_count = 0;

        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
        BUG_ON(!mutex_is_locked(&inode1->i_mutex));
        BUG_ON(!mutex_is_locked(&inode2->i_mutex));

        *erp = ext4_es_remove_extent(inode1, lblk1, count);
              if (unlikely(*erp))
                      return 0;
              *erp = ext4_es_remove_extent(inode2, lblk2, count);
              if (unlikely(*erp))
                      return 0;
      
        while (count) {
                struct ext4_extent *ex1, *ex2, tmp_ex;
                ext4_lblk_t e1_blk, e2_blk;
                int e1_len, e2_len, len;
                int split = 0;

                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path1)) {
                        *erp = PTR_ERR(path1);
                        path1 = NULL;
                finish:
                        count = 0;
                        goto repeat;
                }
                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path2)) {
                        *erp = PTR_ERR(path2);
                        path2 = NULL;
                        goto finish;
                }
                ex1 = path1[path1->p_depth].p_ext;
                ex2 = path2[path2->p_depth].p_ext;
                /* Do we have something to swap? */
                if (unlikely(!ex2 || !ex1))
                        goto finish;
      
                e1_blk = le32_to_cpu(ex1->ee_block);
                e2_blk = le32_to_cpu(ex2->ee_block);
                e1_len = ext4_ext_get_actual_len(ex1);
                e2_len = ext4_ext_get_actual_len(ex2);

                /* Hole handling */
                if (!in_range(lblk1, e1_blk, e1_len) ||
                    !in_range(lblk2, e2_blk, e2_len)) {
                        ext4_lblk_t next1, next2;

                        /* if hole after extent, then go to next extent */
                        next1 = ext4_ext_next_allocated_block(path1);
                        next2 = ext4_ext_next_allocated_block(path2);
                        /* If hole before extent, then shift to that extent */
                        if (e1_blk > lblk1)
                                next1 = e1_blk;
                        if (e2_blk > lblk2)
                                next2 = e2_blk;
                        /* Do we have something to swap */
                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
                                goto finish;
                        /* Move to the rightmost boundary */
                        len = next1 - lblk1;
                        if (len < next2 - lblk2)
                                len = next2 - lblk2;
                        if (len > count)
                                len = count;
                        lblk1 += len;
                        lblk2 += len;
                        count -= len;
                        goto repeat;
                }
      
                      /* Prepare left boundary */
                      if (e1_blk < lblk1) {
                              split = 1;
                              *erp = ext4_force_split_extent_at(handle, inode1,
                                                      &path1, lblk1, 0);
                              if (unlikely(*erp))
                                      goto finish;
                      }
                if (e2_blk < lblk2) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2, lblk2, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;
      
                /* Prepare right boundary */
                len = count;
                if (len > e1_blk + e1_len - lblk1)
                        len = e1_blk + e1_len - lblk1;
                if (len > e2_blk + e2_len - lblk2)
                        len = e2_blk + e2_len - lblk2;

                if (len != e1_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode1,
                                                &path1, lblk1 + len, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                if (len != e2_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2, lblk2 + len, 0);
                        if (*erp)
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;
      
                      BUG_ON(e2_len != e1_len);
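                /*
                 * After the boundary splits above, both extents start exactly
                 * at lblk1/lblk2 and have the same length len, so they can be
                 * swapped in place.
                 */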
                      *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
                if (unlikely(*erp))
                        goto finish;
      
                      /* Both extents are fully inside boundaries. Swap it now */
                      tmp_ex = *ex1;
                      ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
                      ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
                ex1->ee_len = cpu_to_le16(e2_len);
                      ex2->ee_len = cpu_to_le16(e1_len);
                      if (unwritten)
                              ext4_ext_mark_unwritten(ex2);
                      if (ext4_ext_is_unwritten(&tmp_ex))
                              ext4_ext_mark_unwritten(ex1);
      
                      ext4_ext_try_to_merge(handle, inode2, path2, ex2);
                      ext4_ext_try_to_merge(handle, inode1, path1, ex1);
                *erp = ext4_ext_dirty(handle, inode2, path2 +
                                      path2->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_dirty(handle, inode1, path1 +
                                      path1->p_depth);
                /*
                 * Looks scary, huh? The second inode already points to the
                 * new blocks and was successfully dirtied. Luckily an error
                 * here can only be a journal error, in which case the whole
                 * transaction will be aborted anyway.
                 */
                if (unlikely(*erp))
                        goto finish;
                      lblk1 += len;
                      lblk2 += len;
                      replaced_count += len;
                      count -= len;
      
              repeat:
                      ext4_ext_drop_refs(path1);
                      kfree(path1);
                      ext4_ext_drop_refs(path2);
                      kfree(path2);
                      path1 = path2 = NULL;
              }
              return replaced_count;
      }
      /*
       *  mm/mprotect.c
       *
       *  (C) Copyright 1994 Linus Torvalds
       *  (C) Copyright 2002 Christoph Hellwig
       *
       *  Address space accounting code        <alan@lxorguk.ukuu.org.uk>
       *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
       */
      
      #include <linux/mm.h>
      #include <linux/hugetlb.h>
      #include <linux/shm.h>
      #include <linux/mman.h>
      #include <linux/fs.h>
      #include <linux/highmem.h>
      #include <linux/security.h>
      #include <linux/mempolicy.h>
      #include <linux/personality.h>
      #include <linux/syscalls.h>
      #include <linux/swap.h>
      #include <linux/swapops.h>
      #include <linux/mmu_notifier.h>
      #include <linux/migrate.h>
      #include <linux/perf_event.h>
      #include <linux/ksm.h>
      #include <asm/uaccess.h>
      #include <asm/pgtable.h>
      #include <asm/cacheflush.h>
      #include <asm/tlbflush.h>
      
      #include "internal.h"
      
      /*
       * For a prot_numa update we only hold mmap_sem for read so there is a
       * potential race with faulting where a pmd was temporarily none. This
       * function checks for a transhuge pmd under the appropriate lock. It
       * returns a pte if it was successfully locked or NULL if it raced with
       * a transhuge insertion.
       */
      static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
                              unsigned long addr, int prot_numa, spinlock_t **ptl)
      {
              pte_t *pte;
              spinlock_t *pmdl;
      
              /* !prot_numa is protected by mmap_sem held for write */
              if (!prot_numa)
                return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
      
              pmdl = pmd_lock(vma->vm_mm, pmd);
              if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
                      spin_unlock(pmdl);
                      return NULL;
              }
      
              pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
              spin_unlock(pmdl);
              return pte;
      }
      
      static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                      unsigned long addr, unsigned long end, pgprot_t newprot,
                      int dirty_accountable, int prot_numa)
      {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;

        pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
        if (!pte)
                return 0;

        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();
        do {
                oldpte = *pte;
                if (pte_present(oldpte)) {
                        pte_t ptent;
                        bool preserve_write = prot_numa && pte_write(oldpte);
      
                              /*
                               * Avoid trapping faults against the zero or KSM
                               * pages. See similar comment in change_huge_pmd.
                               */
                              if (prot_numa) {
                                      struct page *page;
      
                                      page = vm_normal_page(vma, addr, oldpte);
                                      if (!page || PageKsm(page))
                                              continue;
      
                                      /* Avoid TLB flush if possible */
                                      if (pte_protnone(oldpte))
                                              continue;
                              }
      
                        ptent = ptep_modify_prot_start(mm, addr, pte);
                        ptent = pte_modify(ptent, newprot);
                        if (preserve_write)
                                ptent = pte_mkwrite(ptent);

                        /* Avoid taking write faults for known dirty pages */
                        if (dirty_accountable && pte_dirty(ptent) &&
                                        (pte_soft_dirty(ptent) ||
                                         !(vma->vm_flags & VM_SOFTDIRTY))) {
                                ptent = pte_mkwrite(ptent);
                        }
                        ptep_modify_prot_commit(mm, addr, pte, ptent);
                        pages++;
                } else if (IS_ENABLED(CONFIG_MIGRATION)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
      
                              if (is_write_migration_entry(entry)) {
                                      pte_t newpte;
                                      /*
                                       * A protection check is difficult so
                                       * just be safe and disable write
                                       */
                                      make_migration_entry_read(&entry);
                                      newpte = swp_entry_to_pte(entry);
                                      if (pte_swp_soft_dirty(oldpte))
                                              newpte = pte_swp_mksoft_dirty(newpte);
                                      set_pte_at(mm, addr, pte, newpte);
      
                                      pages++;
                              }
                      }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
              pte_unmap_unlock(pte - 1, ptl);
      
              return pages;
      }
      
      static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                      pud_t *pud, unsigned long addr, unsigned long end,
                      pgprot_t newprot, int dirty_accountable, int prot_numa)
      {
              pmd_t *pmd;
              struct mm_struct *mm = vma->vm_mm;
              unsigned long next;
              unsigned long pages = 0;
              unsigned long nr_huge_updates = 0;
              unsigned long mni_start = 0;
      
        pmd = pmd_offset(pud, addr);
        do {
                unsigned long this_pages;

                next = pmd_addr_end(addr, end);
                if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
                              continue;
      
                      /* invoke the mmu notifier if the pmd is populated */
                      if (!mni_start) {
                              mni_start = addr;
                              mmu_notifier_invalidate_range_start(mm, mni_start, end);
                      }
      
                      if (pmd_trans_huge(*pmd)) {
                              if (next - addr != HPAGE_PMD_SIZE)
                                      split_huge_page_pmd(vma, addr, pmd);
                              else {
                                      int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                      newprot, prot_numa);
      
                                      if (nr_ptes) {
                                              if (nr_ptes == HPAGE_PMD_NR) {
                                                      pages += HPAGE_PMD_NR;
                                                      nr_huge_updates++;
                                              }
      
                                              /* huge pmd was handled */
                                              continue;
                                      }
                              }
                              /* fall through, the trans huge pmd just split */
                      }
                this_pages = change_pte_range(vma, pmd, addr, next, newprot,
                                 dirty_accountable, prot_numa);
                pages += this_pages;
        } while (pmd++, addr = next, addr != end);
      
              if (mni_start)
                      mmu_notifier_invalidate_range_end(mm, mni_start, end);
      
              if (nr_huge_updates)
                      count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
              return pages;
      }
      
      static inline unsigned long change_pud_range(struct vm_area_struct *vma,
                      pgd_t *pgd, unsigned long addr, unsigned long end,
                      pgprot_t newprot, int dirty_accountable, int prot_numa)
      {
              pud_t *pud;
              unsigned long next;
              unsigned long pages = 0;
      
              pud = pud_offset(pgd, addr);
              do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                pages += change_pmd_range(vma, pud, addr, next, newprot,
                                 dirty_accountable, prot_numa);
        } while (pud++, addr = next, addr != end);
      
              return pages;
      }
      
      static unsigned long change_protection_range(struct vm_area_struct *vma,
                      unsigned long addr, unsigned long end, pgprot_t newprot,
                      int dirty_accountable, int prot_numa)
      {
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        unsigned long next;
        unsigned long start = addr;
        unsigned long pages = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
              flush_cache_range(vma, addr, end);
              set_tlb_flush_pending(mm);
              do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                pages += change_pud_range(vma, pgd, addr, next, newprot,
                                 dirty_accountable, prot_numa);
        } while (pgd++, addr = next, addr != end);

        /* Only flush the TLB if we actually modified any entries: */
        if (pages)
                flush_tlb_range(vma, start, end);
        clear_tlb_flush_pending(mm);
      
              return pages;
      }
      
      unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                             unsigned long end, pgprot_t newprot,
                             int dirty_accountable, int prot_numa)
      {
              unsigned long pages;
      
              if (is_vm_hugetlb_page(vma))
                      pages = hugetlb_change_protection(vma, start, end, newprot);
              else
                pages = change_protection_range(vma, start, end, newprot,
                                dirty_accountable, prot_numa);
      
              return pages;
      }
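
/*
 * Illustrative userspace sketch (not part of this file): change_protection()
 * above is ultimately what backs an mprotect(2) call such as the one below.
 * The function name example() and the 4096-byte mapping size are arbitrary
 * example values.
 */
#if 0
#include <stdio.h>
#include <sys/mman.h>

int example(void)
{
        size_t len = 4096;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return -1;
        p[0] = 1;                               /* writable for now */

        if (mprotect(p, len, PROT_READ)) {      /* drop write permission */
                perror("mprotect");
                munmap(p, len);
                return -1;
        }
        /* A write to p[0] here would now fault with SIGSEGV. */
        munmap(p, len);
        return 0;
}
#endif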
      
      static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
                                     unsigned long next, struct mm_walk *walk)
      {
              return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
                      0 : -EACCES;
      }
      
      static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                         unsigned long addr, unsigned long next,
                                         struct mm_walk *walk)
      {
              return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
                      0 : -EACCES;
      }
      
      static int prot_none_test(unsigned long addr, unsigned long next,
                                struct mm_walk *walk)
      {
              return 0;
      }
      
      static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
                                 unsigned long end, unsigned long newflags)
      {
              pgprot_t new_pgprot = vm_get_page_prot(newflags);
              struct mm_walk prot_none_walk = {
                      .pte_entry = prot_none_pte_entry,
                      .hugetlb_entry = prot_none_hugetlb_entry,
                      .test_walk = prot_none_test,
                      .mm = current->mm,
                      .private = &new_pgprot,
              };
      
              return walk_page_range(start, end, &prot_none_walk);
      }
      
      int
      mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
              unsigned long start, unsigned long end, unsigned long newflags)
      {
   25         struct mm_struct *mm = vma->vm_mm;
   26         unsigned long oldflags = vma->vm_flags;
              long nrpages = (end - start) >> PAGE_SHIFT;
              unsigned long charged = 0;
              pgoff_t pgoff;
              int error;
              int dirty_accountable = 0;
      
              if (newflags == oldflags) {
    2                 *pprev = vma;
                      return 0;
              }
      
              /*
               * Do PROT_NONE PFN permission checks here when we can still
               * bail out without undoing a lot of state. This is a rather
               * uncommon case, so doesn't need to be very optimized.
               */
              if (arch_has_pfn_modify_check() &&
   25             (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
    1             (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
                      error = prot_none_walk(vma, start, end, newflags);
                      if (error)
                              return error;
              }
      
              /*
               * If we make a private mapping writable we increase our commit;
               * but (without finer accounting) cannot reduce our commit if we
         * make it unwritable again. hugetlb mappings were accounted for
         * even if read-only, so there is no need to account for them here.
               */
   25         if (newflags & VM_WRITE) {
    8                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                                      VM_SHARED|VM_NORESERVE))) {
                              charged = nrpages;
    1                         if (security_vm_enough_memory_mm(mm, charged))
                                      return -ENOMEM;
    1                         newflags |= VM_ACCOUNT;
                      }
              }
      
              /*
               * First try to merge with previous and/or next vma.
               */
   25         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
   25         *pprev = vma_merge(mm, *pprev, start, end, newflags,
                                 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
                                 vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
              if (*pprev) {
                      vma = *pprev;
                      goto success;
              }
      
   25         *pprev = vma;
      
              if (start != vma->vm_start) {
    7                 error = split_vma(mm, vma, start, 1);
                      if (error)
                              goto fail;
              }
      
   25         if (end != vma->vm_end) {
   22                 error = split_vma(mm, vma, end, 0);
                      if (error)
                              goto fail;
              }
      
      success:
              /*
               * vm_flags and vm_page_prot are protected by the mmap_sem
               * held in write mode.
               */
   25         vma->vm_flags = newflags;
              dirty_accountable = vma_wants_writenotify(vma);
              vma_set_page_prot(vma);
      
              change_protection(vma, start, end, vma->vm_page_prot,
                                dirty_accountable, 0);
      
              /*
               * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
               * fault on access.
               */
              if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
    2                         (newflags & VM_WRITE)) {
    1                 populate_vma_page_range(vma, start, end, NULL);
              }
      
   25         vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
              vm_stat_account(mm, newflags, vma->vm_file, nrpages);
              perf_event_mmap(vma);
   26         return 0;
      
      fail:
              vm_unacct_memory(charged);
              return error;
      }
      
   33 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                      unsigned long, prot)
      {
              unsigned long vm_flags, nstart, end, tmp, reqprot;
              struct vm_area_struct *vma, *prev;
              int error = -EINVAL;
              const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
   30         prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
              if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
                      return -EINVAL;
      
   32         if (start & ~PAGE_MASK)
                      return -EINVAL;
   32         if (!len)
                      return 0;
   31         len = PAGE_ALIGN(len);
              end = start + len;
              if (end <= start)
                      return -ENOMEM;
              if (!arch_validate_prot(prot))
                      return -EINVAL;
      
              reqprot = prot;
              /*
               * Does the application expect PROT_READ to imply PROT_EXEC:
               */
    6         if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
    2                 prot |= PROT_EXEC;
      
              vm_flags = calc_vm_prot_bits(prot);
      
   30         down_write(&current->mm->mmap_sem);
      
              vma = find_vma(current->mm, start);
              error = -ENOMEM;
              if (!vma)
                      goto out;
   30         prev = vma->vm_prev;
              if (unlikely(grows & PROT_GROWSDOWN)) {
    3                 if (vma->vm_start >= end)
                              goto out;
                      start = vma->vm_start;
                      error = -EINVAL;
    2                 if (!(vma->vm_flags & VM_GROWSDOWN))
                              goto out;
              } else {
   27                 if (vma->vm_start > start)
                              goto out;
   26                 if (unlikely(grows & PROT_GROWSUP)) {
                              end = vma->vm_end;
                              error = -EINVAL;
                              if (!(vma->vm_flags & VM_GROWSUP))
                                      goto out;
                      }
              }
   25         if (start > vma->vm_start)
    8                 prev = vma;
      
              for (nstart = start ; ; ) {
                      unsigned long newflags;
      
                      /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
      
                      newflags = vm_flags;
   26                 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
      
                /* newflags >> 4 shifts the VM_MAY* bits into the VM_* positions */
                      if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
                              error = -EACCES;
                              goto out;
                      }
      
   26                 error = security_file_mprotect(vma, reqprot, prot);
                      if (error)
                              goto out;
      
   26                 tmp = vma->vm_end;
                      if (tmp > end)
                              tmp = end;
                      error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                      if (error)
                              goto out;
                      nstart = tmp;
      
   26                 if (nstart < prev->vm_end)
                              nstart = prev->vm_end;
                      if (nstart >= end)
                              goto out;
      
    7                 vma = prev->vm_next;
    7                 if (!vma || vma->vm_start != nstart) {
                              error = -ENOMEM;
                              goto out;
                      }
              }
      out:
   30         up_write(&current->mm->mmap_sem);
   33         return error;
      }
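/*
 * Illustrative userspace sketch (not part of mm/mprotect.c): exercising the
 * sys_mprotect() path above.  The syscall requires a page-aligned start,
 * rejects PROT_GROWSDOWN together with PROT_GROWSUP, and then walks every
 * VMA covering [start, start + len), fixing each one up via mprotect_fixup().
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        memset(p, 'x', len);                    /* populate while writable */

        /* Drop write permission; the kernel may merge or split VMAs here. */
        if (mprotect(p, len, PROT_READ) != 0)
                perror("mprotect");

        printf("first byte: %c\n", p[0]);       /* reads still succeed */
        munmap(p, len);
        return 0;
}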
      #ifndef __NET_FRAG_H__
      #define __NET_FRAG_H__
      
      #include <linux/rhashtable.h>
      
      struct netns_frags {
              /* sysctls */
              long                        high_thresh;
              long                        low_thresh;
              int                        timeout;
              struct inet_frags        *f;
      
              struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
      
              /* Keep atomic mem on separate cachelines in structs that include it */
              atomic_long_t                mem ____cacheline_aligned_in_smp;
      };
      
      /**
       * fragment queue flags
       *
       * @INET_FRAG_FIRST_IN: first fragment has arrived
       * @INET_FRAG_LAST_IN: final fragment has arrived
       * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
       */
      enum {
              INET_FRAG_FIRST_IN        = BIT(0),
              INET_FRAG_LAST_IN        = BIT(1),
              INET_FRAG_COMPLETE        = BIT(2),
      };
      
      struct frag_v4_compare_key {
              __be32                saddr;
              __be32                daddr;
              u32                user;
              u32                vif;
              __be16                id;
              u16                protocol;
      };
      
      struct frag_v6_compare_key {
              struct in6_addr        saddr;
              struct in6_addr        daddr;
              u32                user;
              __be32                id;
              u32                iif;
      };
      
      /**
       * struct inet_frag_queue - fragment queue
       *
       * @node: rhash node
       * @key: keys identifying this frag.
       * @timer: queue expiration timer
       * @lock: spinlock protecting this frag
       * @refcnt: reference count of the queue
       * @fragments: received fragments head
       * @rb_fragments: received fragments rb-tree root
       * @fragments_tail: received fragments tail
       * @last_run_head: the head of the last "run". see ip_fragment.c
       * @stamp: timestamp of the last received fragment
       * @len: total length of the original datagram
       * @meat: length of received fragments so far
       * @flags: fragment queue flags
       * @max_size: maximum received fragment size
       * @net: namespace that this frag belongs to
 * @rcu: rcu head for deferred freeing
       */
      struct inet_frag_queue {
              struct rhash_head        node;
              union {
                      struct frag_v4_compare_key v4;
                      struct frag_v6_compare_key v6;
              } key;
              struct timer_list        timer;
              spinlock_t                lock;
              atomic_t                refcnt;
              struct sk_buff                *fragments;  /* Used in IPv6. */
              struct rb_root                rb_fragments; /* Used in IPv4. */
              struct sk_buff                *fragments_tail;
              struct sk_buff                *last_run_head;
              ktime_t                        stamp;
              int                        len;
              int                        meat;
              __u8                        flags;
              u16                        max_size;
              struct netns_frags      *net;
              struct rcu_head                rcu;
      };
      
      struct inet_frags {
              int                        qsize;
      
              void                        (*constructor)(struct inet_frag_queue *q,
                                                     const void *arg);
              void                        (*destructor)(struct inet_frag_queue *);
              void                        (*skb_free)(struct sk_buff *);
              void                        (*frag_expire)(unsigned long data);
              struct kmem_cache        *frags_cachep;
              const char                *frags_cache_name;
              struct rhashtable_params rhash_params;
      };
      
      int inet_frags_init(struct inet_frags *);
      void inet_frags_fini(struct inet_frags *);
      
      static inline int inet_frags_init_net(struct netns_frags *nf)
      {
              atomic_long_set(&nf->mem, 0);
              return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
      }
      void inet_frags_exit_net(struct netns_frags *nf);
      
      void inet_frag_kill(struct inet_frag_queue *q);
      void inet_frag_destroy(struct inet_frag_queue *q);
      struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
      
      /* Free all skbs in the queue; return the sum of their truesizes. */
      unsigned int inet_frag_rbtree_purge(struct rb_root *root);
      
      static inline void inet_frag_put(struct inet_frag_queue *q)
      {
   36         if (atomic_dec_and_test(&q->refcnt))
                      inet_frag_destroy(q);
      }
      
      /* Memory Tracking Functions. */
      
      static inline long frag_mem_limit(const struct netns_frags *nf)
      {
   35         return atomic_long_read(&nf->mem);
      }
      
      static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
      {
              atomic_long_sub(val, &nf->mem);
      }
      
      static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
      {
              atomic_long_add(val, &nf->mem);
      }
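/*
 * Illustrative sketch (the frag_enqueue_allowed() name is made up here): a
 * reassembly path is expected to add each queued fragment's skb->truesize
 * via add_frag_mem_limit(), subtract the same amount when the fragment is
 * freed, and refuse new work once frag_mem_limit() crosses the per-netns
 * high_thresh sysctl.
 */
static inline bool frag_enqueue_allowed(const struct netns_frags *nf)
{
        return frag_mem_limit(nf) <= nf->high_thresh;
}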
      
/* RFC 3168 support:
 * We want to check ECN values of all fragments, to detect invalid combinations.
       * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
       */
      #define        IPFRAG_ECN_NOT_ECT        0x01 /* one frag had ECN_NOT_ECT */
      #define        IPFRAG_ECN_ECT_1        0x02 /* one frag had ECN_ECT_1 */
      #define        IPFRAG_ECN_ECT_0        0x04 /* one frag had ECN_ECT_0 */
      #define        IPFRAG_ECN_CE                0x08 /* one frag had ECN_CE */
      
      extern const u8 ip_frag_ecn_table[16];
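/*
 * Illustrative sketch (frag_ecn_flag(), frag_queue_ecn_valid() and the 0xff
 * sentinel are assumptions, not definitions from this header): each
 * fragment's two ECN bits can be folded into the IPFRAG_ECN_* mask above,
 * and once every fragment has been OR-ed in, ip_frag_ecn_table[] yields
 * either the codepoint for the reassembled datagram or an "invalid
 * combination" marker.
 */
static inline u8 frag_ecn_flag(u8 ecn_bits)
{
        /* 0..3 (Not-ECT, ECT(1), ECT(0), CE) map onto bits 0..3 above. */
        return 1U << (ecn_bits & 3);
}

static inline bool frag_queue_ecn_valid(u8 accumulated)
{
        /* Assumes the table flags invalid mixes with an out-of-band value. */
        return ip_frag_ecn_table[accumulated & 0xf] != 0xff;
}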
      
      #endif
      /*
       * drivers/staging/android/ion/ion_system_heap.c
       *
       * Copyright (C) 2011 Google, Inc.
       *
       * This software is licensed under the terms of the GNU General Public
       * License version 2, as published by the Free Software Foundation, and
       * may be copied, distributed, and modified under those terms.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       *
       */
      
      #include <asm/page.h>
      #include <linux/dma-mapping.h>
      #include <linux/err.h>
      #include <linux/highmem.h>
      #include <linux/mm.h>
      #include <linux/scatterlist.h>
      #include <linux/seq_file.h>
      #include <linux/slab.h>
      #include <linux/vmalloc.h>
      #include "ion.h"
      #include "ion_priv.h"
      
      static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
                                           __GFP_NORETRY) & ~__GFP_RECLAIM;
      static gfp_t low_order_gfp_flags  = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
      static const unsigned int orders[] = {8, 4, 0};
      static const int num_orders = ARRAY_SIZE(orders);
      static int order_to_index(unsigned int order)
      {
              int i;
      
              for (i = 0; i < num_orders; i++)
                      if (order == orders[i])
                              return i;
              BUG();
              return -1;
      }
      
      static inline unsigned int order_to_size(int order)
      {
              return PAGE_SIZE << order;
      }
      
      struct ion_system_heap {
              struct ion_heap heap;
              struct ion_page_pool *pools[0];
      };
      
      static struct page *alloc_buffer_page(struct ion_system_heap *heap,
                                            struct ion_buffer *buffer,
                                            unsigned long order)
      {
              bool cached = ion_buffer_cached(buffer);
              struct ion_page_pool *pool = heap->pools[order_to_index(order)];
              struct page *page;
      
              if (!cached) {
                      page = ion_page_pool_alloc(pool);
              } else {
                      gfp_t gfp_flags = low_order_gfp_flags;
      
                      if (order > 4)
                              gfp_flags = high_order_gfp_flags;
                      page = alloc_pages(gfp_flags | __GFP_COMP, order);
                      if (!page)
                              return NULL;
                      ion_pages_sync_for_device(NULL, page, PAGE_SIZE << order,
                                                      DMA_BIDIRECTIONAL);
              }
      
              return page;
      }
      
      static void free_buffer_page(struct ion_system_heap *heap,
                                   struct ion_buffer *buffer, struct page *page)
      {
              unsigned int order = compound_order(page);
              bool cached = ion_buffer_cached(buffer);
      
              if (!cached) {
                      struct ion_page_pool *pool = heap->pools[order_to_index(order)];
                      if (buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE)
                              ion_page_pool_free_immediate(pool, page);
                      else
                              ion_page_pool_free(pool, page);
              } else {
                      __free_pages(page, order);
              }
      }
      
      
      static struct page *alloc_largest_available(struct ion_system_heap *heap,
                                                  struct ion_buffer *buffer,
                                                  unsigned long size,
                                                  unsigned int max_order)
      {
              struct page *page;
              int i;
      
              for (i = 0; i < num_orders; i++) {
                      if (size < order_to_size(orders[i]))
                              continue;
                      if (max_order < orders[i])
                              continue;
      
                      page = alloc_buffer_page(heap, buffer, orders[i]);
                      if (!page)
                              continue;
      
                      return page;
              }
      
              return NULL;
      }
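/*
 * Illustrative sketch (count_chunks() is not part of this driver): the
 * allocation loop below decomposes a request greedily.  With orders {8, 4, 0}
 * and 4 KiB pages, a 2 MiB + 12 KiB request becomes two order-8 blocks
 * followed by three order-0 pages; because max_order only ever shrinks, the
 * resulting scatterlist runs from the largest chunk to the smallest.
 */
static unsigned long count_chunks(unsigned long size)
{
        unsigned long chunks = 0;
        unsigned int max_order = orders[0];
        int i;

        size = PAGE_ALIGN(size);
        while (size > 0) {
                for (i = 0; i < num_orders; i++) {
                        if (size < order_to_size(orders[i]))
                                continue;
                        if (max_order < orders[i])
                                continue;
                        size -= order_to_size(orders[i]);
                        max_order = orders[i];
                        chunks++;
                        break;
                }
        }
        return chunks;
}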
      
      static int ion_system_heap_allocate(struct ion_heap *heap,
                                           struct ion_buffer *buffer,
                                           unsigned long size, unsigned long align,
                                           unsigned long flags)
      {
              struct ion_system_heap *sys_heap = container_of(heap,
                                                              struct ion_system_heap,
                                                              heap);
              struct sg_table *table;
              struct scatterlist *sg;
              struct list_head pages;
              struct page *page, *tmp_page;
              int i = 0;
              unsigned long size_remaining = PAGE_ALIGN(size);
              unsigned int max_order = orders[0];
      
              if (align > PAGE_SIZE)
                      return -EINVAL;
      
              if (size / PAGE_SIZE > totalram_pages / 2)
                      return -ENOMEM;
      
              INIT_LIST_HEAD(&pages);
              while (size_remaining > 0) {
                      page = alloc_largest_available(sys_heap, buffer, size_remaining,
                                                      max_order);
                      if (!page)
                              goto free_pages;
                      list_add_tail(&page->lru, &pages);
                      size_remaining -= PAGE_SIZE << compound_order(page);
                      max_order = compound_order(page);
                      i++;
              }
              table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
              if (!table)
                      goto free_pages;
      
              if (sg_alloc_table(table, i, GFP_KERNEL))
                      goto free_table;
      
              sg = table->sgl;
              list_for_each_entry_safe(page, tmp_page, &pages, lru) {
                      sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0);
                      sg = sg_next(sg);
                      list_del(&page->lru);
              }
      
              buffer->priv_virt = table;
              return 0;
      
      free_table:
              kfree(table);
      free_pages:
              list_for_each_entry_safe(page, tmp_page, &pages, lru)
                      free_buffer_page(sys_heap, buffer, page);
              return -ENOMEM;
      }
      
      static void ion_system_heap_free(struct ion_buffer *buffer)
      {
              struct ion_system_heap *sys_heap = container_of(buffer->heap,
                                                              struct ion_system_heap,
                                                              heap);
              struct sg_table *table = buffer->sg_table;
              bool cached = ion_buffer_cached(buffer);
              struct scatterlist *sg;
              int i;
      
              /*
         *  uncached pages come from the page pools, zero them before returning
         *  for security purposes (other allocations are zeroed at
         *  alloc time)
               */
              if (!cached && !(buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE))
                      ion_heap_buffer_zero(buffer);
      
              for_each_sg(table->sgl, sg, table->nents, i)
                      free_buffer_page(sys_heap, buffer, sg_page(sg));
              sg_free_table(table);
              kfree(table);
      }
      
      static struct sg_table *ion_system_heap_map_dma(struct ion_heap *heap,
                                                      struct ion_buffer *buffer)
      {
              return buffer->priv_virt;
      }
      
      static void ion_system_heap_unmap_dma(struct ion_heap *heap,
                                            struct ion_buffer *buffer)
      {
      }
      
      static int ion_system_heap_shrink(struct ion_heap *heap, gfp_t gfp_mask,
                                              int nr_to_scan)
      {
              struct ion_system_heap *sys_heap;
              int nr_total = 0;
              int i, nr_freed;
              int only_scan = 0;
      
              sys_heap = container_of(heap, struct ion_system_heap, heap);
      
              if (!nr_to_scan)
                      only_scan = 1;
      
    2         for (i = 0; i < num_orders; i++) {
    2                 struct ion_page_pool *pool = sys_heap->pools[i];
      
                      nr_freed = ion_page_pool_shrink(pool, gfp_mask, nr_to_scan);
                      nr_total += nr_freed;
      
                      if (!only_scan) {
                              nr_to_scan -= nr_freed;
                              /* shrink completed */
                              if (nr_to_scan <= 0)
                                      break;
                      }
              }
      
    2         return nr_total;
      }
      
      static struct ion_heap_ops system_heap_ops = {
              .allocate = ion_system_heap_allocate,
              .free = ion_system_heap_free,
              .map_dma = ion_system_heap_map_dma,
              .unmap_dma = ion_system_heap_unmap_dma,
              .map_kernel = ion_heap_map_kernel,
              .unmap_kernel = ion_heap_unmap_kernel,
              .map_user = ion_heap_map_user,
              .shrink = ion_system_heap_shrink,
      };
      
      static int ion_system_heap_debug_show(struct ion_heap *heap, struct seq_file *s,
                                            void *unused)
      {
      
              struct ion_system_heap *sys_heap = container_of(heap,
                                                              struct ion_system_heap,
                                                              heap);
              int i;
      
              for (i = 0; i < num_orders; i++) {
                      struct ion_page_pool *pool = sys_heap->pools[i];
      
                      seq_printf(s, "%d order %u highmem pages in pool = %lu total\n",
                                 pool->high_count, pool->order,
                                 (PAGE_SIZE << pool->order) * pool->high_count);
                      seq_printf(s, "%d order %u lowmem pages in pool = %lu total\n",
                                 pool->low_count, pool->order,
                                 (PAGE_SIZE << pool->order) * pool->low_count);
              }
              return 0;
      }
      
      struct ion_heap *ion_system_heap_create(struct ion_platform_heap *unused)
      {
              struct ion_system_heap *heap;
              int i;
      
              heap = kzalloc(sizeof(struct ion_system_heap) +
                              sizeof(struct ion_page_pool *) * num_orders,
                              GFP_KERNEL);
              if (!heap)
                      return ERR_PTR(-ENOMEM);
              heap->heap.ops = &system_heap_ops;
              heap->heap.type = ION_HEAP_TYPE_SYSTEM;
              heap->heap.flags = ION_HEAP_FLAG_DEFER_FREE;
      
              for (i = 0; i < num_orders; i++) {
                      struct ion_page_pool *pool;
                      gfp_t gfp_flags = low_order_gfp_flags;
      
                      if (orders[i] > 4)
                              gfp_flags = high_order_gfp_flags;
                      pool = ion_page_pool_create(gfp_flags, orders[i]);
                      if (!pool)
                              goto destroy_pools;
                      heap->pools[i] = pool;
              }
      
              heap->heap.debug_show = ion_system_heap_debug_show;
              return &heap->heap;
      
      destroy_pools:
              while (i--)
                      ion_page_pool_destroy(heap->pools[i]);
              kfree(heap);
              return ERR_PTR(-ENOMEM);
      }
      
      void ion_system_heap_destroy(struct ion_heap *heap)
      {
              struct ion_system_heap *sys_heap = container_of(heap,
                                                              struct ion_system_heap,
                                                              heap);
              int i;
      
              for (i = 0; i < num_orders; i++)
                      ion_page_pool_destroy(sys_heap->pools[i]);
              kfree(sys_heap);
      }
      
      static int ion_system_contig_heap_allocate(struct ion_heap *heap,
                                                 struct ion_buffer *buffer,
                                                 unsigned long len,
                                                 unsigned long align,
                                                 unsigned long flags)
      {
              int order = get_order(len);
              struct page *page;
              struct sg_table *table;
              unsigned long i;
              int ret;
      
              if (align > (PAGE_SIZE << order))
                      return -EINVAL;
      
              page = alloc_pages(low_order_gfp_flags, order);
              if (!page)
                      return -ENOMEM;
      
              split_page(page, order);
      
              len = PAGE_ALIGN(len);
              for (i = len >> PAGE_SHIFT; i < (1 << order); i++)
                      __free_page(page + i);
      
              table = kmalloc(sizeof(struct sg_table), GFP_KERNEL);
              if (!table) {
                      ret = -ENOMEM;
                      goto free_pages;
              }
      
              ret = sg_alloc_table(table, 1, GFP_KERNEL);
              if (ret)
                      goto free_table;
      
              sg_set_page(table->sgl, page, len, 0);
      
              buffer->priv_virt = table;
      
              ion_pages_sync_for_device(NULL, page, len, DMA_BIDIRECTIONAL);
      
              return 0;
      
      free_table:
              kfree(table);
      free_pages:
              for (i = 0; i < len >> PAGE_SHIFT; i++)
                      __free_page(page + i);
      
              return ret;
      }
      
      static void ion_system_contig_heap_free(struct ion_buffer *buffer)
      {
              struct sg_table *table = buffer->priv_virt;
              struct page *page = sg_page(table->sgl);
              unsigned long pages = PAGE_ALIGN(buffer->size) >> PAGE_SHIFT;
              unsigned long i;
      
              for (i = 0; i < pages; i++)
                      __free_page(page + i);
              sg_free_table(table);
              kfree(table);
      }
      
      static int ion_system_contig_heap_phys(struct ion_heap *heap,
                                             struct ion_buffer *buffer,
                                             ion_phys_addr_t *addr, size_t *len)
      {
              struct sg_table *table = buffer->priv_virt;
              struct page *page = sg_page(table->sgl);
              *addr = page_to_phys(page);
              *len = buffer->size;
              return 0;
      }
      
      static struct sg_table *ion_system_contig_heap_map_dma(struct ion_heap *heap,
                                                      struct ion_buffer *buffer)
      {
              return buffer->priv_virt;
      }
      
      static void ion_system_contig_heap_unmap_dma(struct ion_heap *heap,
                                                   struct ion_buffer *buffer)
      {
      }
      
      static struct ion_heap_ops kmalloc_ops = {
              .allocate = ion_system_contig_heap_allocate,
              .free = ion_system_contig_heap_free,
              .phys = ion_system_contig_heap_phys,
              .map_dma = ion_system_contig_heap_map_dma,
              .unmap_dma = ion_system_contig_heap_unmap_dma,
              .map_kernel = ion_heap_map_kernel,
              .unmap_kernel = ion_heap_unmap_kernel,
              .map_user = ion_heap_map_user,
      };
      
      struct ion_heap *ion_system_contig_heap_create(struct ion_platform_heap *unused)
      {
              struct ion_heap *heap;
      
              heap = kzalloc(sizeof(struct ion_heap), GFP_KERNEL);
              if (!heap)
                      return ERR_PTR(-ENOMEM);
              heap->ops = &kmalloc_ops;
              heap->type = ION_HEAP_TYPE_SYSTEM_CONTIG;
              return heap;
      }
      
      void ion_system_contig_heap_destroy(struct ion_heap *heap)
      {
              kfree(heap);
      }
      #ifndef _LINUX_PAGEMAP_H
      #define _LINUX_PAGEMAP_H
      
      /*
       * Copyright 1995 Linus Torvalds
       */
      #include <linux/mm.h>
      #include <linux/fs.h>
      #include <linux/list.h>
      #include <linux/highmem.h>
      #include <linux/compiler.h>
      #include <asm/uaccess.h>
      #include <linux/gfp.h>
      #include <linux/bitops.h>
      #include <linux/hardirq.h> /* for in_interrupt() */
      #include <linux/hugetlb_inline.h>
      
      /*
       * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
       * allocation mode flags.
       */
      enum mapping_flags {
              AS_EIO                = __GFP_BITS_SHIFT + 0,        /* IO error on async write */
              AS_ENOSPC        = __GFP_BITS_SHIFT + 1,        /* ENOSPC on async write */
              AS_MM_ALL_LOCKS        = __GFP_BITS_SHIFT + 2,        /* under mm_take_all_locks() */
              AS_UNEVICTABLE        = __GFP_BITS_SHIFT + 3,        /* e.g., ramdisk, SHM_LOCK */
              AS_EXITING        = __GFP_BITS_SHIFT + 4, /* final truncate in progress */
      };
      
      static inline void mapping_set_error(struct address_space *mapping, int error)
      {
              if (unlikely(error)) {
                      if (error == -ENOSPC)
                              set_bit(AS_ENOSPC, &mapping->flags);
                      else
                              set_bit(AS_EIO, &mapping->flags);
              }
      }
      
      static inline void mapping_set_unevictable(struct address_space *mapping)
      {
              set_bit(AS_UNEVICTABLE, &mapping->flags);
      }
      
      static inline void mapping_clear_unevictable(struct address_space *mapping)
      {
              clear_bit(AS_UNEVICTABLE, &mapping->flags);
      }
      
      static inline int mapping_unevictable(struct address_space *mapping)
      {
              if (mapping)
    7                 return test_bit(AS_UNEVICTABLE, &mapping->flags);
              return !!mapping;
      }
      
      static inline void mapping_set_exiting(struct address_space *mapping)
      {
  673         set_bit(AS_EXITING, &mapping->flags);
      }
      
      static inline int mapping_exiting(struct address_space *mapping)
      {
    2         return test_bit(AS_EXITING, &mapping->flags);
      }
      
      static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
      {
  820         return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
      }
      
      /* Restricts the given gfp_mask to what the mapping allows. */
      static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                      gfp_t gfp_mask)
      {
  288         return mapping_gfp_mask(mapping) & gfp_mask;
      }
      
      /*
       * This is non-atomic.  Only to be used before the mapping is activated.
       * Probably needs a barrier...
       */
      static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
      {
              m->flags = (m->flags & ~(__force unsigned long)__GFP_BITS_MASK) |
                                      (__force unsigned long)mask;
      }
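/*
 * Illustrative sketch (example_init_mapping() is not a pagemap.h API): the
 * AS_* bits live above __GFP_BITS_SHIFT, so updating the allocation mask and
 * setting state bits never clobber each other.  GFP_HIGHUSER_MOVABLE is just
 * a typical mask picked at inode-initialisation time.
 */
static inline void example_init_mapping(struct address_space *mapping)
{
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);   /* low bits  */
        mapping_set_unevictable(mapping);                      /* high bits */

        /* Both settings survive side by side. */
        VM_BUG_ON(mapping_gfp_mask(mapping) != GFP_HIGHUSER_MOVABLE);
        VM_BUG_ON(!mapping_unevictable(mapping));
}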
      
      /*
       * The page cache can be done in larger chunks than
       * one page, because it allows for more efficient
       * throughput (it can then be mapped into user
       * space in smaller chunks for same flexibility).
       *
       * Or rather, it _will_ be done in larger chunks.
       */
      #define PAGE_CACHE_SHIFT        PAGE_SHIFT
      #define PAGE_CACHE_SIZE                PAGE_SIZE
      #define PAGE_CACHE_MASK                PAGE_MASK
      #define PAGE_CACHE_ALIGN(addr)        (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK)
      
      #define page_cache_get(page)                get_page(page)
      #define page_cache_release(page)        put_page(page)
      void release_pages(struct page **pages, int nr, bool cold);
      
      /*
       * speculatively take a reference to a page.
       * If the page is free (_count == 0), then _count is untouched, and 0
       * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
       *
       * This function must be called inside the same rcu_read_lock() section as has
       * been used to lookup the page in the pagecache radix-tree (or page table):
       * this allows allocators to use a synchronize_rcu() to stabilize _count.
       *
       * Unless an RCU grace period has passed, the count of all pages coming out
       * of the allocator must be considered unstable. page_count may return higher
       * than expected, and put_page must be able to do the right thing when the
       * page has been finished with, no matter what it is subsequently allocated
       * for (because put_page is what is used here to drop an invalid speculative
       * reference).
       *
       * This is the interesting part of the lockless pagecache (and lockless
       * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
       * has the following pattern:
       * 1. find page in radix tree
       * 2. conditionally increment refcount
       * 3. check the page is still in pagecache (if no, goto 1)
       *
       * Remove-side that cares about stability of _count (eg. reclaim) has the
       * following (with tree_lock held for write):
       * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
       * B. remove page from pagecache
       * C. free the page
       *
       * There are 2 critical interleavings that matter:
       * - 2 runs before A: in this case, A sees elevated refcount and bails out
       * - A runs before 2: in this case, 2 sees zero refcount and retries;
       *   subsequently, B will complete and 1 will find no page, causing the
       *   lookup to return NULL.
       *
       * It is possible that between 1 and 2, the page is removed then the exact same
       * page is inserted into the same position in pagecache. That's OK: the
       * old find_get_page using tree_lock could equally have run before or after
       * such a re-insertion, depending on order that locks are granted.
       *
       * Lookups racing against pagecache insertion isn't a big problem: either 1
       * will find the page or it will not. Likewise, the old find_get_page could run
       * either before the insertion or afterwards, depending on timing.
       */
      static inline int page_cache_get_speculative(struct page *page)
      {
 1685         VM_BUG_ON(in_interrupt());
      
      #ifdef CONFIG_TINY_RCU
      # ifdef CONFIG_PREEMPT_COUNT
              VM_BUG_ON(!in_atomic() && !irqs_disabled());
      # endif
              /*
               * Preempt must be disabled here - we rely on rcu_read_lock doing
               * this for us.
               *
               * Pagecache won't be truncated from interrupt context, so if we have
               * found a page in the radix tree here, we have pinned its refcount by
               * disabling preempt, and hence no need for the "speculative get" that
               * SMP requires.
               */
              VM_BUG_ON_PAGE(page_count(page) == 0, page);
              atomic_inc(&page->_count);
      
      #else
 1685         if (unlikely(!get_page_unless_zero(page))) {
                      /*
                       * Either the page has been freed, or will be freed.
                       * In either case, retry here and the caller should
                       * do the right thing (see comments above).
                       */
                      return 0;
              }
      #endif
 1685         VM_BUG_ON_PAGE(PageTail(page), page);
      
              return 1;
      }
      
      /*
       * Same as above, but add instead of inc (could just be merged)
       */
      static inline int page_cache_add_speculative(struct page *page, int count)
      {
              VM_BUG_ON(in_interrupt());
      
      #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
      # ifdef CONFIG_PREEMPT_COUNT
              VM_BUG_ON(!in_atomic() && !irqs_disabled());
      # endif
              VM_BUG_ON_PAGE(page_count(page) == 0, page);
              atomic_add(count, &page->_count);
      
      #else
              if (unlikely(!atomic_add_unless(&page->_count, count, 0)))
                      return 0;
      #endif
              VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
      
              return 1;
      }
      
      static inline int page_freeze_refs(struct page *page, int count)
      {
    1         return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
      }
      
      static inline void page_unfreeze_refs(struct page *page, int count)
      {
   19         VM_BUG_ON_PAGE(page_count(page) != 0, page);
    1         VM_BUG_ON(count == 0);
      
   18         atomic_set(&page->_count, count);
      }
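/*
 * Illustrative sketch of the three-step lookup protocol described above;
 * the real find_get_entry()/find_get_page() in mm/filemap.c additionally
 * cope with shadow/exceptional radix-tree entries, which this simplified
 * version (and its made-up name) ignores.  Assumes <linux/radix-tree.h>
 * and <linux/rcupdate.h> are available.
 */
static inline struct page *example_find_get_page(struct address_space *mapping,
                                                 pgoff_t offset)
{
        struct page *page;

        rcu_read_lock();
repeat:
        /* 1. find page in radix tree */
        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page) {
                /* 2. conditionally increment refcount */
                if (!page_cache_get_speculative(page))
                        goto repeat;            /* page was being freed */
                /* 3. check the page is still in pagecache (if no, goto 1) */
                if (unlikely(page != radix_tree_lookup(&mapping->page_tree,
                                                       offset))) {
                        page_cache_release(page);
                        goto repeat;
                }
        }
        rcu_read_unlock();
        return page;
}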
      
      #ifdef CONFIG_NUMA
      extern struct page *__page_cache_alloc(gfp_t gfp);
      #else
      static inline struct page *__page_cache_alloc(gfp_t gfp)
      {
  689         return alloc_pages(gfp, 0);
      }
      #endif
      
      static inline struct page *page_cache_alloc(struct address_space *x)
      {
              return __page_cache_alloc(mapping_gfp_mask(x));
      }
      
      static inline struct page *page_cache_alloc_cold(struct address_space *x)
      {
  288         return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
      }
      
      static inline struct page *page_cache_alloc_readahead(struct address_space *x)
      {
  528         return __page_cache_alloc(mapping_gfp_mask(x) |
                                        __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN);
      }
      
      typedef int filler_t(void *, struct page *);
      
      pgoff_t page_cache_next_hole(struct address_space *mapping,
                                   pgoff_t index, unsigned long max_scan);
      pgoff_t page_cache_prev_hole(struct address_space *mapping,
                                   pgoff_t index, unsigned long max_scan);
      
      #define FGP_ACCESSED                0x00000001
      #define FGP_LOCK                0x00000002
      #define FGP_CREAT                0x00000004
      #define FGP_WRITE                0x00000008
      #define FGP_NOFS                0x00000010
      #define FGP_NOWAIT                0x00000020
      
      struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                      int fgp_flags, gfp_t cache_gfp_mask);
      
      /**
       * find_get_page - find and get a page reference
       * @mapping: the address_space to search
       * @offset: the page index
       *
       * Looks up the page cache slot at @mapping & @offset.  If there is a
       * page cache page, it is returned with an increased refcount.
       *
       * Otherwise, %NULL is returned.
       */
      static inline struct page *find_get_page(struct address_space *mapping,
                                              pgoff_t offset)
      {
  527         return pagecache_get_page(mapping, offset, 0, 0);
      }
      
      static inline struct page *find_get_page_flags(struct address_space *mapping,
                                              pgoff_t offset, int fgp_flags)
      {
              return pagecache_get_page(mapping, offset, fgp_flags, 0);
      }
      
      /**
       * find_lock_page - locate, pin and lock a pagecache page
       * @mapping: the address_space to search
       * @offset: the page index
       *
       * Looks up the page cache slot at @mapping & @offset.  If there is a
       * page cache page, it is returned locked and with an increased
       * refcount.
       *
       * Otherwise, %NULL is returned.
       *
       * find_lock_page() may sleep.
       */
      static inline struct page *find_lock_page(struct address_space *mapping,
                                              pgoff_t offset)
      {
              return pagecache_get_page(mapping, offset, FGP_LOCK, 0);
      }
      
      /**
       * find_or_create_page - locate or add a pagecache page
       * @mapping: the page's address_space
 * @offset: the page's index into the mapping
       * @gfp_mask: page allocation mode
       *
       * Looks up the page cache slot at @mapping & @offset.  If there is a
       * page cache page, it is returned locked and with an increased
       * refcount.
       *
       * If the page is not present, a new page is allocated using @gfp_mask
       * and added to the page cache and the VM's LRU list.  The page is
       * returned locked and with an increased refcount.
       *
       * On memory exhaustion, %NULL is returned.
       *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
       * atomic allocation!
       */
      static inline struct page *find_or_create_page(struct address_space *mapping,
                                              pgoff_t offset, gfp_t gfp_mask)
      {
              return pagecache_get_page(mapping, offset,
                                              FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
                                              gfp_mask);
      }
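/*
 * Illustrative usage sketch (example_cache_one_page() is made up; it relies
 * on unlock_page(), declared further down in this header, and on the
 * page-flag helpers from <linux/page-flags.h>): the caller receives the page
 * locked with an elevated refcount and must drop both when finished.
 */
static inline int example_cache_one_page(struct address_space *mapping,
                                         pgoff_t index)
{
        struct page *page;

        page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
        if (!page)
                return -ENOMEM;

        if (!PageUptodate(page)) {
                /* ... populate the page here (illustrative placeholder) ... */
                SetPageUptodate(page);
        }

        unlock_page(page);              /* drop the FGP_LOCK page lock */
        page_cache_release(page);       /* drop the reference */
        return 0;
}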
      
      /**
       * grab_cache_page_nowait - returns locked page at given index in given cache
       * @mapping: target address_space
       * @index: the page index
       *
       * Same as grab_cache_page(), but do not wait if the page is unavailable.
       * This is intended for speculative data generators, where the data can
       * be regenerated if the page couldn't be grabbed.  This routine should
       * be safe to call while holding the lock for another page.
       *
       * Clear __GFP_FS when allocating the page to avoid recursion into the fs
       * and deadlock against the caller's locked page.
       */
      static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                      pgoff_t index)
      {
              return pagecache_get_page(mapping, index,
                              FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
                              mapping_gfp_mask(mapping));
      }
      
      struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
      struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
      unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
                                unsigned int nr_entries, struct page **entries,
                                pgoff_t *indices);
      unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
                              unsigned int nr_pages, struct page **pages);
      unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
                                     unsigned int nr_pages, struct page **pages);
      unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                              int tag, unsigned int nr_pages, struct page **pages);
      
      struct page *grab_cache_page_write_begin(struct address_space *mapping,
                              pgoff_t index, unsigned flags);
      
      /*
       * Returns locked page at given index in given cache, creating it if needed.
       */
      static inline struct page *grab_cache_page(struct address_space *mapping,
                                                                      pgoff_t index)
      {
              return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
      }
      
      extern struct page * read_cache_page(struct address_space *mapping,
                                      pgoff_t index, filler_t *filler, void *data);
      extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                      pgoff_t index, gfp_t gfp_mask);
      extern int read_cache_pages(struct address_space *mapping,
                      struct list_head *pages, filler_t *filler, void *data);
      
      static inline struct page *read_mapping_page(struct address_space *mapping,
                                      pgoff_t index, void *data)
      {
              filler_t *filler = (filler_t *)mapping->a_ops->readpage;
              return read_cache_page(mapping, index, filler, data);
      }
      
      /*
       * Get the offset in PAGE_SIZE.
       * (TODO: hugepage should have ->index in PAGE_SIZE)
       */
      static inline pgoff_t page_to_pgoff(struct page *page)
      {
              if (unlikely(PageHeadHuge(page)))
                      return page->index << compound_order(page);
              else
                      return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
      }
      
      /*
       * Return byte-offset into filesystem object for page.
       */
      static inline loff_t page_offset(struct page *page)
      {
   97         return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
      }
      
      static inline loff_t page_file_offset(struct page *page)
      {
              return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
      }
      
    4 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                           unsigned long address);
      
      static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                              unsigned long address)
      {
              pgoff_t pgoff;
              if (unlikely(is_vm_hugetlb_page(vma)))
                      return linear_hugepage_index(vma, address);
              pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
              pgoff += vma->vm_pgoff;
              return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
      }
      
      extern void __lock_page(struct page *page);
      extern int __lock_page_killable(struct page *page);
      extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
    4                                 unsigned int flags);
      extern void unlock_page(struct page *page);
      
      static inline void __set_page_locked(struct page *page)
      {
              __set_bit(PG_locked, &page->flags);
      }
      
      static inline void __clear_page_locked(struct page *page)
      {
              __clear_bit(PG_locked, &page->flags);
      }
      
      static inline int trylock_page(struct page *page)
      {
              return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
      }
      
/*
       * lock_page may only be called if we have the page's inode pinned.
       */
      static inline void lock_page(struct page *page)
      {
 1737         might_sleep();
              if (!trylock_page(page))
                      __lock_page(page);
      }
      
      /*
       * lock_page_killable is like lock_page but can be interrupted by fatal
       * signals.  It returns 0 if it locked the page and -EINTR if it was
 * killed while waiting.
 */
  338 static inline int lock_page_killable(struct page *page)
      {
              might_sleep();
              if (!trylock_page(page))
                      return __lock_page_killable(page);
              return 0;
      }
      
      /*
       * lock_page_or_retry - Lock the page, unless this would block and the
       * caller indicated that it can handle a retry.
       *
       * Return value and mmap_sem implications depend on flags; see
       * __lock_page_or_retry().
       */
      static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                           unsigned int flags)
      {
              might_sleep();
              return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
      }
      
      /*
       * This is exported only for wait_on_page_locked/wait_on_page_writeback,
       * and for filesystems which need to wait on PG_private.
       */
      extern void wait_on_page_bit(struct page *page, int bit_nr);

extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
      extern int wait_on_page_bit_killable_timeout(struct page *page,
                                                   int bit_nr, unsigned long timeout);
      
      static inline int wait_on_page_locked_killable(struct page *page)
      {
              if (PageLocked(page))
                      return wait_on_page_bit_killable(page, PG_locked);
              return 0;
      }
      
      extern wait_queue_head_t *page_waitqueue(struct page *page);
      static inline void wake_up_page(struct page *page, int bit)
      {
   20         __wake_up_bit(page_waitqueue(page), &page->flags, bit);
   20 }
      
      /* 
       * Wait for a page to be unlocked.
       *
       * This must be called with the caller "holding" the page,
 * i.e. with increased "page->count" so that the page won't
 * go away during the wait.
       */
      static inline void wait_on_page_locked(struct page *page)
      {
              if (PageLocked(page))
                      wait_on_page_bit(page, PG_locked);
      }
      
      /* 
       * Wait for a page to complete writeback
       */
      static inline void wait_on_page_writeback(struct page *page)
    2 {
   21         if (PageWriteback(page))
                      wait_on_page_bit(page, PG_writeback);
      }
      
      extern void end_page_writeback(struct page *page);
      void wait_for_stable_page(struct page *page);
      
      void page_endio(struct page *page, int rw, int err);

/*
       * Add an arbitrary waiter to a page's wait queue
       */
      extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
      
      /*
       * Fault a userspace page into pagetables.  Return non-zero on a fault.
       *
       * This assumes that two userspace pages are always sufficient.  That's
       * not true if PAGE_CACHE_SIZE > PAGE_SIZE.
       */
      static inline int fault_in_pages_writeable(char __user *uaddr, int size)
      {
              int ret;
      
              if (unlikely(size == 0))
                      return 0;
      
              /*
               * Writing zeroes into userspace here is OK, because we know that if
               * the zero gets there, we'll be overwriting it.
               */
              ret = __put_user(0, uaddr);
              if (ret == 0) {
                      char __user *end = uaddr + size - 1;
      
                      /*
                       * If the page was already mapped, this will get a cache miss
                       * for sure, so try to avoid doing it.
                       */
  602                 if (((unsigned long)uaddr & PAGE_MASK) !=
                                      ((unsigned long)end & PAGE_MASK))
  595                         ret = __put_user(0, end);
              }
              return ret;
      }
      
      static inline int fault_in_pages_readable(const char __user *uaddr, int size)
      {
              volatile char c;
  525         int ret;
      
              if (unlikely(size == 0))
                      return 0;
      
              ret = __get_user(c, uaddr);
              if (ret == 0) {
                      const char __user *end = uaddr + size - 1;
      
                      if (((unsigned long)uaddr & PAGE_MASK) !=
                                      ((unsigned long)end & PAGE_MASK)) {
                              ret = __get_user(c, end);
                              (void)c;
  267                 }
              }
  264         return ret;
      }
      
      /*
 * Multipage variants of the above prefault helpers, useful if more than
       * PAGE_SIZE of data needs to be prefaulted. These are separate from the above
       * functions (which only handle up to PAGE_SIZE) to avoid clobbering the
       * filemap.c hotpaths.
       */
      static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
      {
              char __user *end = uaddr + size - 1;
      
              if (unlikely(size == 0))
                      return 0;
      
              if (unlikely(uaddr > end))
                      return -EFAULT;
              /*
               * Writing zeroes into userspace here is OK, because we know that if
               * the zero gets there, we'll be overwriting it.
               */
              do {
                      if (unlikely(__put_user(0, uaddr) != 0))
                              return -EFAULT;
                      uaddr += PAGE_SIZE;
              } while (uaddr <= end);
      
              /* Check whether the range spilled into the next page. */
              if (((unsigned long)uaddr & PAGE_MASK) ==
                              ((unsigned long)end & PAGE_MASK))
                      return __put_user(0, end);
      
              return 0;
      }
      
      static inline int fault_in_multipages_readable(const char __user *uaddr,
                                                     int size)
      {
              volatile char c;
              const char __user *end = uaddr + size - 1;
      
              if (unlikely(size == 0))
                      return 0;
      
              if (unlikely(uaddr > end))
                      return -EFAULT;
      
              do {
                      if (unlikely(__get_user(c, uaddr) != 0))
                              return -EFAULT;
                      uaddr += PAGE_SIZE;
              } while (uaddr <= end);
  363 
              /* Check whether the range spilled into the next page. */
              if (((unsigned long)uaddr & PAGE_MASK) ==
                              ((unsigned long)end & PAGE_MASK)) {
  363                 return __get_user(c, end);
              }
  363 
              return 0;
      }
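
       /*
        * Illustrative sketch (hypothetical, not part of the original header):
        * a buffered-write style loop.  Each chunk of the source buffer is
        * prefaulted with the multipage helper before any page-cache page is
        * locked, which is how write paths avoid faulting on the source while
        * already holding a page they would need in order to service that fault.
        */
       static inline int example_prefault_in_chunks(const char __user *buf,
                                                    size_t count)
       {
               while (count) {
                       size_t chunk = count > 16 * PAGE_SIZE ?
                                               16 * PAGE_SIZE : count;

                       if (fault_in_multipages_readable(buf, chunk))
                               return -EFAULT;
                       /* ... copy 'chunk' bytes into locked page-cache pages ... */
                       buf += chunk;
                       count -= chunk;
               }
               return 0;
       }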
      
  363 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                                      pgoff_t index, gfp_t gfp_mask);
  255 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                      pgoff_t index, gfp_t gfp_mask);
      extern void delete_from_page_cache(struct page *page);
      extern void __delete_from_page_cache(struct page *page, void *shadow,
                                           struct mem_cgroup *memcg);
      int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
      
      /*
       * Like add_to_page_cache_locked, but used to add newly allocated pages:
       * the page is new, so we can just run __set_page_locked() against it.
       */
      static inline int add_to_page_cache(struct page *page,
                      struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
      {
              int error;
      
              __set_page_locked(page);
              error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
              if (unlikely(error))
                      __clear_page_locked(page);
              return error;
      }
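
       /*
        * Illustrative sketch (hypothetical helper, not part of the original
        * header): allocate a fresh page and insert it at @offset, relying on
        * add_to_page_cache() above to take the new-page fast path.  On
        * success the page is returned still locked, exactly as
        * __set_page_locked() left it.
        */
       static inline struct page *example_grab_new_page(struct address_space *mapping,
                                                        pgoff_t offset, gfp_t gfp)
       {
               struct page *page = __page_cache_alloc(gfp);

               if (!page)
                       return NULL;
               if (add_to_page_cache(page, mapping, offset, gfp)) {
                       page_cache_release(page);
                       return NULL;
               }
               return page;
       }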
      
      static inline unsigned long dir_pages(struct inode *inode)
      {
              return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
                                     PAGE_CACHE_SHIFT;
      }
      
      #endif /* _LINUX_PAGEMAP_H */
      /*
       * INET                An implementation of the TCP/IP protocol suite for the LINUX
       *                operating system.  INET  is implemented using the  BSD Socket
       *                interface as the means of communication with the user level.
       *
       *                Definitions for the Forwarding Information Base.
       *
       * Authors:        A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru>
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       */
      
      #ifndef _NET_IP_FIB_H
      #define _NET_IP_FIB_H
      
      #include <net/flow.h>
      #include <linux/seq_file.h>
      #include <linux/rcupdate.h>
      #include <net/fib_rules.h>
      #include <net/inetpeer.h>
      #include <linux/percpu.h>
      
      struct fib_config {
              u8                        fc_dst_len;
              u8                        fc_tos;
              u8                        fc_protocol;
              u8                        fc_scope;
              u8                        fc_type;
              /* 3 bytes unused */
              u32                        fc_table;
              __be32                        fc_dst;
              __be32                        fc_gw;
              int                        fc_oif;
              u32                        fc_flags;
              u32                        fc_priority;
              __be32                        fc_prefsrc;
              struct nlattr                *fc_mx;
              struct rtnexthop        *fc_mp;
              int                        fc_mx_len;
              int                        fc_mp_len;
              u32                        fc_flow;
              u32                        fc_nlflags;
              struct nl_info                fc_nlinfo;
              struct nlattr                *fc_encap;
              u16                        fc_encap_type;
      };
      
      struct fib_info;
      struct rtable;
      
      struct fib_nh_exception {
              struct fib_nh_exception __rcu        *fnhe_next;
              int                                fnhe_genid;
              __be32                                fnhe_daddr;
              u32                                fnhe_pmtu;
              bool                                fnhe_mtu_locked;
              __be32                                fnhe_gw;
              unsigned long                        fnhe_expires;
              struct rtable __rcu                *fnhe_rth_input;
              struct rtable __rcu                *fnhe_rth_output;
              unsigned long                        fnhe_stamp;
              struct rcu_head                        rcu;
      };
      
      struct fnhe_hash_bucket {
              struct fib_nh_exception __rcu        *chain;
      };
      
      #define FNHE_HASH_SHIFT                11
      #define FNHE_HASH_SIZE                (1 << FNHE_HASH_SHIFT)
      #define FNHE_RECLAIM_DEPTH        5
      
      struct fib_nh {
              struct net_device        *nh_dev;
              struct hlist_node        nh_hash;
              struct fib_info                *nh_parent;
              unsigned int                nh_flags;
              unsigned char                nh_scope;
      #ifdef CONFIG_IP_ROUTE_MULTIPATH
              int                        nh_weight;
              atomic_t                nh_upper_bound;
      #endif
      #ifdef CONFIG_IP_ROUTE_CLASSID
              __u32                        nh_tclassid;
      #endif
              int                        nh_oif;
              __be32                        nh_gw;
              __be32                        nh_saddr;
              int                        nh_saddr_genid;
              struct rtable __rcu * __percpu *nh_pcpu_rth_output;
              struct rtable __rcu        *nh_rth_input;
              struct fnhe_hash_bucket        __rcu *nh_exceptions;
              struct lwtunnel_state        *nh_lwtstate;
      };
      
      /*
        * This structure contains data shared by many routes.
       */
      
      struct fib_info {
              struct hlist_node        fib_hash;
              struct hlist_node        fib_lhash;
              struct net                *fib_net;
              int                        fib_treeref;
              atomic_t                fib_clntref;
              unsigned int                fib_flags;
              unsigned char                fib_dead;
              unsigned char                fib_protocol;
              unsigned char                fib_scope;
              unsigned char                fib_type;
              __be32                        fib_prefsrc;
              u32                        fib_priority;
              struct dst_metrics        *fib_metrics;
      #define fib_mtu fib_metrics->metrics[RTAX_MTU-1]
      #define fib_window fib_metrics->metrics[RTAX_WINDOW-1]
      #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
      #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
              int                        fib_nhs;
      #ifdef CONFIG_IP_ROUTE_MULTIPATH
              int                        fib_weight;
      #endif
              struct rcu_head                rcu;
              struct fib_nh                fib_nh[0];
      #define fib_dev                fib_nh[0].nh_dev
      };
      
      
      #ifdef CONFIG_IP_MULTIPLE_TABLES
      struct fib_rule;
      #endif
      
      struct fib_table;
      struct fib_result {
              unsigned char        prefixlen;
              unsigned char        nh_sel;
              unsigned char        type;
              unsigned char        scope;
              u32                tclassid;
              struct fib_info *fi;
              struct fib_table *table;
              struct hlist_head *fa_head;
      };
      
      struct fib_result_nl {
              __be32                fl_addr;   /* To be looked up*/
              u32                fl_mark;
              unsigned char        fl_tos;
              unsigned char   fl_scope;
              unsigned char   tb_id_in;
      
              unsigned char   tb_id;      /* Results */
              unsigned char        prefixlen;
              unsigned char        nh_sel;
              unsigned char        type;
              unsigned char        scope;
              int             err;      
      };
      
      #ifdef CONFIG_IP_ROUTE_MULTIPATH
      #define FIB_RES_NH(res)                ((res).fi->fib_nh[(res).nh_sel])
      #else /* CONFIG_IP_ROUTE_MULTIPATH */
      #define FIB_RES_NH(res)                ((res).fi->fib_nh[0])
      #endif /* CONFIG_IP_ROUTE_MULTIPATH */
      
      #ifdef CONFIG_IP_MULTIPLE_TABLES
      #define FIB_TABLE_HASHSZ 256
      #else
      #define FIB_TABLE_HASHSZ 2
      #endif
      
      __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
      
      #define FIB_RES_SADDR(net, res)                                \
              ((FIB_RES_NH(res).nh_saddr_genid ==                \
                atomic_read(&(net)->ipv4.dev_addr_genid)) ?        \
               FIB_RES_NH(res).nh_saddr :                        \
               fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
      #define FIB_RES_GW(res)                        (FIB_RES_NH(res).nh_gw)
      #define FIB_RES_DEV(res)                (FIB_RES_NH(res).nh_dev)
      #define FIB_RES_OIF(res)                (FIB_RES_NH(res).nh_oif)
      
      #define FIB_RES_PREFSRC(net, res)        ((res).fi->fib_prefsrc ? : \
                                               FIB_RES_SADDR(net, res))
      
      struct fib_table {
              struct hlist_node        tb_hlist;
              u32                        tb_id;
              int                        tb_num_default;
              struct rcu_head                rcu;
              unsigned long                 *tb_data;
              unsigned long                __data[0];
      };
      
      int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
                           struct fib_result *res, int fib_flags);
      int fib_table_insert(struct fib_table *, struct fib_config *);
      int fib_table_delete(struct fib_table *, struct fib_config *);
      int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
                         struct netlink_callback *cb);
      int fib_table_flush(struct fib_table *table, bool flush_all);
      struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
      void fib_table_flush_external(struct fib_table *table);
      void fib_free_table(struct fib_table *tb);
      
      #ifndef CONFIG_IP_MULTIPLE_TABLES
      
      #define TABLE_LOCAL_INDEX        (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1))
      #define TABLE_MAIN_INDEX        (RT_TABLE_MAIN  & (FIB_TABLE_HASHSZ - 1))
      
      static inline struct fib_table *fib_get_table(struct net *net, u32 id)
      {
              struct hlist_node *tb_hlist;
              struct hlist_head *ptr;
      
              ptr = id == RT_TABLE_LOCAL ?
                      &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
                      &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
      
              tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr));
      
              return hlist_entry(tb_hlist, struct fib_table, tb_hlist);
      }
      
      static inline struct fib_table *fib_new_table(struct net *net, u32 id)
      {
              return fib_get_table(net, id);
      }
      
      static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
                                   struct fib_result *res, unsigned int flags)
      {
              struct fib_table *tb;
              int err = -ENETUNREACH;
      
              rcu_read_lock();
      
              tb = fib_get_table(net, RT_TABLE_MAIN);
              if (tb)
                      err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF);
      
              if (err == -EAGAIN)
                      err = -ENETUNREACH;
      
              rcu_read_unlock();
      
              return err;
      }
      
      #else /* CONFIG_IP_MULTIPLE_TABLES */
      int __net_init fib4_rules_init(struct net *net);
      void __net_exit fib4_rules_exit(struct net *net);
      
      struct fib_table *fib_new_table(struct net *net, u32 id);
      struct fib_table *fib_get_table(struct net *net, u32 id);
      
      int __fib_lookup(struct net *net, struct flowi4 *flp,
                       struct fib_result *res, unsigned int flags);
      
      static inline int fib_lookup(struct net *net, struct flowi4 *flp,
                                   struct fib_result *res, unsigned int flags)
      {
              struct fib_table *tb;
              int err = -ENETUNREACH;
      
              flags |= FIB_LOOKUP_NOREF;
  519         if (net->ipv4.fib_has_custom_rules)
  211                 return __fib_lookup(net, flp, res, flags);
      
  390         rcu_read_lock();
      
  390         res->tclassid = 0;
      
  390         tb = rcu_dereference_rtnl(net->ipv4.fib_main);
  390         if (tb)
                      err = fib_table_lookup(tb, flp, res, flags);
      
              if (!err)
                      goto out;
      
  167         tb = rcu_dereference_rtnl(net->ipv4.fib_default);
  167         if (tb)
                      err = fib_table_lookup(tb, flp, res, flags);
      
      out:
  167         if (err == -EAGAIN)
                      err = -ENETUNREACH;
      
  390         rcu_read_unlock();
      
              return err;
      }
      
      #endif /* CONFIG_IP_MULTIPLE_TABLES */
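
       /*
        * Illustrative sketch (hypothetical, not part of the original header):
        * resolve an IPv4 destination and peek at the selected next hop's
        * device via the FIB_RES_*() accessors above.  Because fib_lookup()
        * passes FIB_LOOKUP_NOREF, the result is only stable under RCU, so the
        * whole inspection runs inside rcu_read_lock().
        */
       static inline struct net_device *example_output_dev(struct net *net,
                                                           __be32 daddr)
       {
               struct flowi4 fl4 = { .daddr = daddr };
               struct fib_result res;
               struct net_device *dev = NULL;

               rcu_read_lock();
               if (!fib_lookup(net, &fl4, &res, 0))
                       dev = FIB_RES_DEV(res);
               rcu_read_unlock();

               return dev;     /* NULL if the lookup failed (-ENETUNREACH) */
       }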
      
      /* Exported by fib_frontend.c */
      extern const struct nla_policy rtm_ipv4_policy[];
      void ip_fib_init(void);
      __be32 fib_compute_spec_dst(struct sk_buff *skb);
      int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                              u8 tos, int oif, struct net_device *dev,
                              struct in_device *idev, u32 *itag);
      void fib_select_default(const struct flowi4 *flp, struct fib_result *res);
      #ifdef CONFIG_IP_ROUTE_CLASSID
      static inline int fib_num_tclassid_users(struct net *net)
      {
              return net->ipv4.fib_num_tclassid_users;
      }
      #else
      static inline int fib_num_tclassid_users(struct net *net)
      {
              return 0;
      }
      #endif
      int fib_unmerge(struct net *net);
      void fib_flush_external(struct net *net);
      
      /* Exported by fib_semantics.c */
      int ip_fib_check_default(__be32 gw, struct net_device *dev);
      int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
      int fib_sync_down_addr(struct net *net, __be32 local);
      int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
      void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
      
      extern u32 fib_multipath_secret __read_mostly;
      
      static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
      {
              return jhash_2words(saddr, daddr, fib_multipath_secret) >> 1;
      }
      
      void fib_select_multipath(struct fib_result *res, int hash);
      void fib_select_path(struct net *net, struct fib_result *res,
                           struct flowi4 *fl4, int mp_hash);
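
       /*
        * Illustrative sketch (hypothetical, not part of the original header):
        * a caller that wants every packet of a flow to stick to one next hop
        * feeds the flow addresses through fib_multipath_hash() and lets
        * fib_select_multipath() pick the nexthop index for that hash.
        */
       static inline void example_pin_flow_nexthop(struct fib_result *res,
                                                   __be32 saddr, __be32 daddr)
       {
       #ifdef CONFIG_IP_ROUTE_MULTIPATH
               if (res->fi && res->fi->fib_nhs > 1)
                       fib_select_multipath(res, fib_multipath_hash(saddr, daddr));
       #endif
       }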
      
      /* Exported by fib_trie.c */
      void fib_trie_init(void);
      struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
      
      static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
      {
      #ifdef CONFIG_IP_ROUTE_CLASSID
      #ifdef CONFIG_IP_MULTIPLE_TABLES
              u32 rtag;
      #endif
              *itag = FIB_RES_NH(*res).nh_tclassid<<16;
      #ifdef CONFIG_IP_MULTIPLE_TABLES
              rtag = res->tclassid;
              if (*itag == 0)
                      *itag = (rtag<<16);
              *itag |= (rtag>>16);
      #endif
      #endif
      }
      
      void free_fib_info(struct fib_info *fi);
      
      static inline void fib_info_put(struct fib_info *fi)
      {
              if (atomic_dec_and_test(&fi->fib_clntref))
   56                 free_fib_info(fi);
      }
      
      #ifdef CONFIG_PROC_FS
      int __net_init fib_proc_init(struct net *net);
      void __net_exit fib_proc_exit(struct net *net);
      #else
      static inline int fib_proc_init(struct net *net)
      {
              return 0;
      }
      static inline void fib_proc_exit(struct net *net)
      {
      }
      #endif
      
      #endif  /* _NET_FIB_H */
      /*
       * mm/interval_tree.c - interval tree for mapping->i_mmap
       *
       * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
       *
       * This file is released under the GPL v2.
       */
      
      #include <linux/mm.h>
      #include <linux/fs.h>
      #include <linux/rmap.h>
      #include <linux/interval_tree_generic.h>
      
      static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
      {
  355         return v->vm_pgoff;
      }
      
      static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
      {
  293         return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
      }
      
  493 INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
                           unsigned long, shared.rb_subtree_last,
                           vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
      
      /* Insert node immediately after prev in the interval tree */
      void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                          struct vm_area_struct *prev,
                                          struct rb_root *root)
      {
              struct rb_node **link;
              struct vm_area_struct *parent;
  150         unsigned long last = vma_last_pgoff(node);
      
              VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
      
  150         if (!prev->shared.rb.rb_right) {
                      parent = prev;
  143                 link = &prev->shared.rb.rb_right;
              } else {
  122                 parent = rb_entry(prev->shared.rb.rb_right,
                                        struct vm_area_struct, shared.rb);
  122                 if (parent->shared.rb_subtree_last < last)
    1                         parent->shared.rb_subtree_last = last;
  122                 while (parent->shared.rb.rb_left) {
    2                         parent = rb_entry(parent->shared.rb.rb_left,
                                      struct vm_area_struct, shared.rb);
    2                         if (parent->shared.rb_subtree_last < last)
    1                                 parent->shared.rb_subtree_last = last;
                      }
  122                 link = &parent->shared.rb.rb_left;
              }
      
  150         node->shared.rb_subtree_last = last;
              rb_link_node(&node->shared.rb, &parent->shared.rb, link);
              rb_insert_augmented(&node->shared.rb, root,
                                  &vma_interval_tree_augment);
      }
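
       /*
        * Illustrative sketch (hypothetical, not part of the original file):
        * walk every VMA whose shared mapping overlaps the page offset range
        * [first, last], using the iterators generated by the
        * INTERVAL_TREE_DEFINE() above.  Callers such as rmap hold
        * i_mmap_rwsem around a walk like this.
        */
       static unsigned long example_count_shared_vmas(struct rb_root *root,
                                                      unsigned long first,
                                                      unsigned long last)
       {
               struct vm_area_struct *vma;
               unsigned long nr = 0;

               for (vma = vma_interval_tree_iter_first(root, first, last);
                    vma;
                    vma = vma_interval_tree_iter_next(vma, first, last))
                       nr++;
               return nr;
       }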
      
      static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
      {
  592         return vma_start_pgoff(avc->vma);
      }
      
      static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
      {
  127         return vma_last_pgoff(avc->vma);
      }
      
  595 INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
                           avc_start_pgoff, avc_last_pgoff,
                           static inline, __anon_vma_interval_tree)
      
      void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                         struct rb_root *root)
      {
      #ifdef CONFIG_DEBUG_VM_RB
  591         node->cached_vma_start = avc_start_pgoff(node);
              node->cached_vma_last = avc_last_pgoff(node);
      #endif
  591         __anon_vma_interval_tree_insert(node, root);
      }
      
      void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                         struct rb_root *root)
      {
  293         __anon_vma_interval_tree_remove(node, root);
  293 }
      
      struct anon_vma_chain *
      anon_vma_interval_tree_iter_first(struct rb_root *root,
                                        unsigned long first, unsigned long last)
      {
    5         return __anon_vma_interval_tree_iter_first(root, first, last);
      }
      
      struct anon_vma_chain *
      anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
                                       unsigned long first, unsigned long last)
      {
    5         return __anon_vma_interval_tree_iter_next(node, first, last);
      }
      
      #ifdef CONFIG_DEBUG_VM_RB
      void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
      {
  410         WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
  410         WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
  410 }
      #endif
      /*
       * INET                An implementation of the TCP/IP protocol suite for the LINUX
       *                operating system.  INET is implemented using the  BSD Socket
       *                interface as the means of communication with the user level.
       *
        *                Generic TIME_WAIT socket functions
        *
        *                From code originally in TCP
       */
      
      #include <linux/kernel.h>
      #include <linux/kmemcheck.h>
      #include <linux/slab.h>
      #include <linux/module.h>
      #include <net/inet_hashtables.h>
      #include <net/inet_timewait_sock.h>
      #include <net/ip.h>
      
      
      /**
       *        inet_twsk_bind_unhash - unhash a timewait socket from bind hash
       *        @tw: timewait socket
       *        @hashinfo: hashinfo pointer
       *
        *        Unhash a timewait socket from the bind hash, if hashed.
        *        The bind hash lock must be held by the caller; the bind-hash
        *        reference on the socket is dropped here via __sock_put().
        */
      void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
                                struct inet_hashinfo *hashinfo)
      {
   54         struct inet_bind_bucket *tb = tw->tw_tb;
      
              if (!tb)
                      return;
      
   54         __hlist_del(&tw->tw_bind_node);
   54         tw->tw_tb = NULL;
              inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
   54         __sock_put((struct sock *)tw);
      }
      
      /* Must be called with locally disabled BHs. */
      static void inet_twsk_kill(struct inet_timewait_sock *tw)
      {
   54         struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
              spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
              struct inet_bind_hashbucket *bhead;
      
              spin_lock(lock);
              sk_nulls_del_node_init_rcu((struct sock *)tw);
   54         spin_unlock(lock);
      
              /* Disassociate with bind bucket. */
              bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
                              hashinfo->bhash_size)];
      
              spin_lock(&bhead->lock);
              inet_twsk_bind_unhash(tw, hashinfo);
              spin_unlock(&bhead->lock);
      
              atomic_dec(&tw->tw_dr->tw_count);
              inet_twsk_put(tw);
      }
      
      void inet_twsk_free(struct inet_timewait_sock *tw)
      {
   54         struct module *owner = tw->tw_prot->owner;
   54         twsk_destructor((struct sock *)tw);
      #ifdef SOCK_REFCNT_DEBUG
              pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
      #endif
   54         kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
              module_put(owner);
      }
      
      void inet_twsk_put(struct inet_timewait_sock *tw)
      {
   62         if (atomic_dec_and_test(&tw->tw_refcnt))
   54                 inet_twsk_free(tw);
   62 }
      EXPORT_SYMBOL_GPL(inet_twsk_put);
      
      static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
                                         struct hlist_nulls_head *list)
      {
    8         hlist_nulls_add_head_rcu(&tw->tw_node, list);
      }
      
      static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
                                          struct hlist_head *list)
      {
    8         hlist_add_head(&tw->tw_bind_node, list);
      }
      
      /*
       * Enter the time wait state. This is called with locally disabled BH.
       * Essentially we whip up a timewait bucket, copy the relevant info into it
       * from the SK, and mess with hash chains and list linkage.
       */
      void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
                                 struct inet_hashinfo *hashinfo)
      {
              const struct inet_sock *inet = inet_sk(sk);
              const struct inet_connection_sock *icsk = inet_csk(sk);
    8         struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
              spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
              struct inet_bind_hashbucket *bhead;
               /* Step 1: Put TW into bind hash. Original socket stays there too.
                  Note that any socket with inet->num != 0 MUST be bound in the
                  binding cache, even if it is closed.
                */
              bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
                              hashinfo->bhash_size)];
              spin_lock(&bhead->lock);
              tw->tw_tb = icsk->icsk_bind_hash;
              WARN_ON(!icsk->icsk_bind_hash);
    8         inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
              spin_unlock(&bhead->lock);
      
              spin_lock(lock);
      
              /*
               * Step 2: Hash TW into tcp ehash chain.
               * Notes :
               * - tw_refcnt is set to 4 because :
               * - We have one reference from bhash chain.
               * - We have one reference from ehash chain.
               * - We have one reference from timer.
               * - One reference for ourself (our caller will release it).
               * We can use atomic_set() because prior spin_lock()/spin_unlock()
               * committed into memory all tw fields.
               */
              atomic_set(&tw->tw_refcnt, 4);
    8         inet_twsk_add_node_rcu(tw, &ehead->chain);
      
              /* Step 3: Remove SK from hash chain */
    8         if (__sk_nulls_del_node_init_rcu(sk))
                      sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
      
    8         spin_unlock(lock);
      }
      EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
      
      static void tw_timer_handler(unsigned long data)
      {
              struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
      
              if (tw->tw_kill)
                      NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
              else
                      NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
              inet_twsk_kill(tw);
      }
      
      struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
                                                 struct inet_timewait_death_row *dr,
                                                 const int state)
      {
              struct inet_timewait_sock *tw;
      
    8         if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
                      return NULL;
      
    8         tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
                                    GFP_ATOMIC);
              if (tw) {
                      const struct inet_sock *inet = inet_sk(sk);
      
                      kmemcheck_annotate_bitfield(tw, flags);
      
    8                 tw->tw_dr            = dr;
                      /* Give us an identity. */
                      tw->tw_daddr            = inet->inet_daddr;
                      tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
                      tw->tw_bound_dev_if = sk->sk_bound_dev_if;
                      tw->tw_tos            = inet->tos;
                      tw->tw_num            = inet->inet_num;
                      tw->tw_state            = TCP_TIME_WAIT;
                      tw->tw_substate            = state;
                      tw->tw_sport            = inet->inet_sport;
                      tw->tw_dport            = inet->inet_dport;
                      tw->tw_family            = sk->sk_family;
                      tw->tw_reuse            = sk->sk_reuse;
                      tw->tw_reuseport    = sk->sk_reuseport;
                      tw->tw_hash            = sk->sk_hash;
                      tw->tw_ipv6only            = 0;
                      tw->tw_transparent  = inet->transparent;
                      tw->tw_prot            = sk->sk_prot_creator;
                      atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
                      twsk_net_set(tw, sock_net(sk));
                      setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
                      /*
                       * Because we use RCU lookups, we should not set tw_refcnt
                       * to a non null value before everything is setup for this
                       * timewait socket.
                       */
                      atomic_set(&tw->tw_refcnt, 0);
      
    8                 __module_get(tw->tw_prot->owner);
              }
      
              return tw;
      }
      EXPORT_SYMBOL_GPL(inet_twsk_alloc);
      
      /* These are always called from BH context.  See callers in
       * tcp_input.c to verify this.
       */
      
       /* This is for handling early kills of TIME_WAIT sockets.
        * Warning: this consumes the caller's reference.
        * The caller must not access tw afterwards.
        */
      void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
      {
   54         if (del_timer_sync(&tw->tw_timer))
   54                 inet_twsk_kill(tw);
   54         inet_twsk_put(tw);
      }
      EXPORT_SYMBOL(inet_twsk_deschedule_put);
      
      void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
      {
              /* timeout := RTO * 3.5
               *
               * 3.5 = 1+2+0.5 to wait for two retransmits.
               *
               * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
               * our ACK acking that FIN can be lost. If N subsequent retransmitted
                * FINs (or previous segments) are lost (the probability of such an
                * event is p^(N+1), where p is the probability of losing a single
                * packet and the time to detect the loss is about RTO*(2^N - 1)
                * with exponential backoff), the normal timewait length is chosen
                * so that we wait at least for one retransmitted FIN (maximal RTO
                * is 120 sec).
                * [ BTW Linux, following BSD, violates this requirement by waiting
                *   only 60 sec; we should wait at least 240 sec.
                *   Well, 240 consumes too many resources 8)
                * ]
                * This interval is not reduced, to catch old duplicates and
                * responses to our wandering segments living for two MSLs.
               * However, if we use PAWS to detect
               * old duplicates, we can reduce the interval to bounds required
               * by RTO, rather than MSL. So, if peer understands PAWS, we
               * kill tw bucket after 3.5*RTO (it is important that this number
               * is greater than TS tick!) and detect old duplicates with help
               * of PAWS.
               */
      
    8         tw->tw_kill = timeo <= 4*HZ;
              if (!rearm) {
    8                 BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
    8                 atomic_inc(&tw->tw_dr->tw_count);
              } else {
                      mod_timer_pending(&tw->tw_timer, jiffies + timeo);
              }
    8 }
      EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
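
       /*
        * Illustrative sketch (hypothetical, not part of the original file):
        * the typical caller sequence, modelled loosely on TCP's entry into
        * TIME_WAIT.  A real caller copies its protocol-specific state into
        * the timewait socket between allocation and the hashdance, and runs
        * with BHs disabled.
        */
       static void example_enter_timewait(struct sock *sk,
                                          struct inet_timewait_death_row *dr,
                                          struct inet_hashinfo *hashinfo,
                                          int timeo)
       {
               struct inet_timewait_sock *tw;

               tw = inet_twsk_alloc(sk, dr, TCP_TIME_WAIT);
               if (!tw)
                       return;         /* tw_count limit hit: caller just closes sk */

               /* Swap sk for tw in the hash tables; tw_refcnt becomes 4 here. */
               __inet_twsk_hashdance(tw, sk, hashinfo);
               /* Arm the reaper timer and account the bucket in dr->tw_count. */
               __inet_twsk_schedule(tw, timeo, false);
               /* Drop the reference __inet_twsk_hashdance() left for us. */
               inet_twsk_put(tw);
       }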
      
      void inet_twsk_purge(struct inet_hashinfo *hashinfo,
                           struct inet_timewait_death_row *twdr, int family)
      {
              struct inet_timewait_sock *tw;
              struct sock *sk;
              struct hlist_nulls_node *node;
              unsigned int slot;
      
              for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
                      struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
      restart_rcu:
                      cond_resched();
                      rcu_read_lock();
      restart:
                      sk_nulls_for_each_rcu(sk, node, &head->chain) {
                              if (sk->sk_state != TCP_TIME_WAIT)
                                      continue;
                              tw = inet_twsk(sk);
                              if ((tw->tw_family != family) ||
                                      atomic_read(&twsk_net(tw)->count))
                                      continue;
      
                              if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
                                      continue;
      
                              if (unlikely((tw->tw_family != family) ||
                                           atomic_read(&twsk_net(tw)->count))) {
                                      inet_twsk_put(tw);
                                      goto restart;
                              }
      
                              rcu_read_unlock();
                              local_bh_disable();
                              inet_twsk_deschedule_put(tw);
                              local_bh_enable();
                              goto restart_rcu;
                      }
                      /* If the nulls value we got at the end of this lookup is
                       * not the expected one, we must restart lookup.
                       * We probably met an item that was moved to another chain.
                       */
                      if (get_nulls_value(node) != slot)
                              goto restart;
                      rcu_read_unlock();
              }
      }
      EXPORT_SYMBOL_GPL(inet_twsk_purge);
      /*
       * NET                Generic infrastructure for Network protocols.
       *
       * Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
       *
       *                 From code originally in include/net/tcp.h
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       */
      
      #include <linux/module.h>
      #include <linux/random.h>
      #include <linux/slab.h>
      #include <linux/string.h>
      #include <linux/tcp.h>
      #include <linux/vmalloc.h>
      
      #include <net/request_sock.h>
      
      /*
        * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
        * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
        * It would be better to replace it with a global counter for all sockets,
        * but then some measure against one socket starving all other sockets
        * would be needed.
        *
        * Its minimum value is 128. Experiments with real servers show that
        * it is absolutely not enough even at 100 conn/sec; 256 cures most
        * of the problems.
        * This value is adjusted to 128 for low-memory machines,
        * and it will increase in proportion to the machine's memory.
        * Note: don't forget somaxconn, which may limit the backlog too.
       */
      int sysctl_max_syn_backlog = 256;
      EXPORT_SYMBOL(sysctl_max_syn_backlog);
      
      void reqsk_queue_alloc(struct request_sock_queue *queue)
      {
   24         spin_lock_init(&queue->rskq_lock);
      
              spin_lock_init(&queue->fastopenq.lock);
              queue->fastopenq.rskq_rst_head = NULL;
              queue->fastopenq.rskq_rst_tail = NULL;
              queue->fastopenq.qlen = 0;
      
              queue->rskq_accept_head = NULL;
      }
      
      /*
       * This function is called to set a Fast Open socket's "fastopen_rsk" field
       * to NULL when a TFO socket no longer needs to access the request_sock.
       * This happens only after 3WHS has been either completed or aborted (e.g.,
       * RST is received).
       *
       * Before TFO, a child socket is created only after 3WHS is completed,
        * hence it never needs to access the request_sock. Things get a lot more
       * complex with TFO. A child socket, accepted or not, has to access its
       * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
       * until 3WHS is either completed or aborted. Afterwards the req will stay
       * until either the child socket is accepted, or in the rare case when the
       * listener is closed before the child is accepted.
       *
       * In short, a request socket is only freed after BOTH 3WHS has completed
       * (or aborted) and the child socket has been accepted (or listener closed).
       * When a child socket is accepted, its corresponding req->sk is set to
       * NULL since it's no longer needed. More importantly, "req->sk == NULL"
       * will be used by the code below to determine if a child socket has been
       * accepted or not, and the check is protected by the fastopenq->lock
       * described below.
       *
       * Note that fastopen_rsk is only accessed from the child socket's context
       * with its socket lock held. But a request_sock (req) can be accessed by
       * both its child socket through fastopen_rsk, and a listener socket through
        * icsk_accept_queue.rskq_accept_head. To protect the access, a simple spin
        * lock per listener, "icsk->icsk_accept_queue.fastopenq->lock", is created.
        * Only in the rare case when both the listener and the child locks are held,
        * e.g., in inet_csk_listen_stop(), do we not need to acquire the lock.
       * The lock also protects other fields such as fastopenq->qlen, which is
       * decremented by this function when fastopen_rsk is no longer needed.
       *
       * Note that another solution was to simply use the existing socket lock
        * from the listener. But first, the socket lock is difficult to use. It is
        * not a simple spin lock - one must consider sock_owned_by_user() and
        * arrange to use sk_add_backlog(). What really makes it infeasible is the
       * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
       * acquire a child's lock while holding listener's socket lock. A corner
       * case might also exist in tcp_v4_hnd_req() that will trigger this locking
       * order.
       *
       * This function also sets "treq->tfo_listener" to false.
       * treq->tfo_listener is used by the listener so it is protected by the
       * fastopenq->lock in this function.
       */
      void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
                                 bool reset)
      {
              struct sock *lsk = req->rsk_listener;
              struct fastopen_queue *fastopenq;
      
              fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
      
              tcp_sk(sk)->fastopen_rsk = NULL;
              spin_lock_bh(&fastopenq->lock);
              fastopenq->qlen--;
              tcp_rsk(req)->tfo_listener = false;
              if (req->sk)        /* the child socket hasn't been accepted yet */
                      goto out;
      
              if (!reset || lsk->sk_state != TCP_LISTEN) {
                      /* If the listener has been closed don't bother with the
                       * special RST handling below.
                       */
                      spin_unlock_bh(&fastopenq->lock);
                      reqsk_put(req);
                      return;
              }
              /* Wait for 60secs before removing a req that has triggered RST.
               * This is a simple defense against TFO spoofing attack - by
               * counting the req against fastopen.max_qlen, and disabling
               * TFO when the qlen exceeds max_qlen.
               *
               * For more details see CoNext'11 "TCP Fast Open" paper.
               */
              req->rsk_timer.expires = jiffies + 60*HZ;
              if (fastopenq->rskq_rst_head == NULL)
                      fastopenq->rskq_rst_head = req;
              else
                      fastopenq->rskq_rst_tail->dl_next = req;
      
              req->dl_next = NULL;
              fastopenq->rskq_rst_tail = req;
              fastopenq->qlen++;
      out:
              spin_unlock_bh(&fastopenq->lock);
      }
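
       /*
        * Illustrative sketch (hypothetical, not part of the original file):
        * the "req->sk == NULL means the child has been accepted" convention
        * described above, sampled under the listener's fastopenq lock.
        */
       static bool example_tfo_child_accepted(struct sock *listener,
                                              struct request_sock *req)
       {
               struct fastopen_queue *q =
                       &inet_csk(listener)->icsk_accept_queue.fastopenq;
               bool accepted;

               spin_lock_bh(&q->lock);
               accepted = (req->sk == NULL);
               spin_unlock_bh(&q->lock);

               return accepted;
       }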
      /*
       * mm/truncate.c - code for taking down pages from address_spaces
       *
       * Copyright (C) 2002, Linus Torvalds
       *
       * 10Sep2002        Andrew Morton
       *                Initial version.
       */
      
      #include <linux/kernel.h>
      #include <linux/backing-dev.h>
      #include <linux/gfp.h>
      #include <linux/mm.h>
      #include <linux/swap.h>
      #include <linux/export.h>
      #include <linux/pagemap.h>
      #include <linux/highmem.h>
      #include <linux/pagevec.h>
      #include <linux/task_io_accounting_ops.h>
      #include <linux/buffer_head.h>        /* grr. try_to_release_page,
                                         do_invalidatepage */
      #include <linux/cleancache.h>
      #include <linux/rmap.h>
      #include "internal.h"
      
    2 static void clear_exceptional_entry(struct address_space *mapping,
                                          pgoff_t index, void *entry)
      {
              struct radix_tree_node *node;
              void **slot;
      
              /* Handled by shmem itself */
    2         if (shmem_mapping(mapping))
                      return;
      
    2         spin_lock_irq(&mapping->tree_lock);
              /*
               * Regular page slots are stabilized by the page lock even
               * without the tree itself locked.  These unlocked entries
               * need verification under the tree lock.
               */
              if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
                      goto unlock;
    2         if (*slot != entry)
                      goto unlock;
    2         radix_tree_replace_slot(slot, NULL);
              mapping->nrshadows--;
              if (!node)
                      goto unlock;
    2         workingset_node_shadows_dec(node);
              /*
               * Don't track node without shadow entries.
               *
               * Avoid acquiring the list_lru lock if already untracked.
               * The list_empty() test is safe as node->private_list is
               * protected by mapping->tree_lock.
               */
              if (!workingset_node_shadows(node) &&
    2             !list_empty(&node->private_list))
    2                 list_lru_del(&workingset_shadow_nodes, &node->private_list);
    2         __radix_tree_delete_node(&mapping->page_tree, node);
      unlock:
    2         spin_unlock_irq(&mapping->tree_lock);
      }
      
      /**
       * do_invalidatepage - invalidate part or all of a page
       * @page: the page which is affected
       * @offset: start of the range to invalidate
       * @length: length of the range to invalidate
       *
       * do_invalidatepage() is called when all or part of the page has become
       * invalidated by a truncate operation.
       *
       * do_invalidatepage() does not have to release all buffers, but it must
       * ensure that no dirty buffer is left outside @offset and that no I/O
       * is underway against any of the blocks which are outside the truncation
       * point.  Because the caller is about to free (and possibly reuse) those
       * blocks on-disk.
       */
      void do_invalidatepage(struct page *page, unsigned int offset,
                             unsigned int length)
      {
              void (*invalidatepage)(struct page *, unsigned int, unsigned int);
      
  346         invalidatepage = page->mapping->a_ops->invalidatepage;
      #ifdef CONFIG_BLOCK
              if (!invalidatepage)
                      invalidatepage = block_invalidatepage;
      #endif
   17         if (invalidatepage)
  343                 (*invalidatepage)(page, offset, length);
    1 }
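
       /*
        * Illustrative sketch (hypothetical, not part of the original file):
        * a block-based filesystem typically just points ->invalidatepage at
        * block_invalidatepage(), which is also the fallback picked above
        * when the aop is NULL and CONFIG_BLOCK is set.
        */
       #ifdef CONFIG_BLOCK
       static const struct address_space_operations example_aops = {
               .invalidatepage = block_invalidatepage,
       };
       #endif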
      
      /*
       * If truncate cannot remove the fs-private metadata from the page, the page
       * becomes orphaned.  It will be left on the LRU and may even be mapped into
       * user pagetables if we're racing with filemap_fault().
       *
        * We need to bail out if page->mapping is no longer equal to the original
       * mapping.  This happens a) when the VM reclaimed the page while we waited on
       * its lock, b) when a concurrent invalidate_mapping_pages got there first and
       * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
       */
      static int
      truncate_complete_page(struct address_space *mapping, struct page *page)
      {
  483         if (page->mapping != mapping)
                      return -EIO;
      
  483         if (page_has_private(page))
  342                 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
      
              /*
               * Some filesystems seem to re-dirty the page even after
               * the VM has canceled the dirty bit (eg ext3 journaling).
               * Hence dirty accounting check is placed after invalidation.
               */
  483         cancel_dirty_page(page);
              ClearPageMappedToDisk(page);
              delete_from_page_cache(page);
              return 0;
      }
      
      /*
       * This is for invalidate_mapping_pages().  That function can be called at
       * any time, and is not supposed to throw away dirty pages.  But pages can
       * be marked dirty at any time too, so use remove_mapping which safely
       * discards clean, unused pages.
       *
       * Returns non-zero if the page was successfully invalidated.
       */
      static int
      invalidate_complete_page(struct address_space *mapping, struct page *page)
      {
              int ret;
      
   18         if (page->mapping != mapping)
                      return 0;
      
   18         if (page_has_private(page) && !try_to_release_page(page, 0))
                      return 0;
      
   18         ret = remove_mapping(mapping, page);
      
              return ret;
      }
      
      int truncate_inode_page(struct address_space *mapping, struct page *page)
      {
  483         if (page_mapped(page)) {
                      unmap_mapping_range(mapping,
    2                                    (loff_t)page->index << PAGE_CACHE_SHIFT,
                                         PAGE_CACHE_SIZE, 0);
              }
  483         return truncate_complete_page(mapping, page);
      }
      
      /*
       * Used to get rid of pages on hardware memory corruption.
       */
      int generic_error_remove_page(struct address_space *mapping, struct page *page)
      {
              if (!mapping)
                      return -EINVAL;
              /*
               * Only punch for normal data pages for now.
               * Handling other types like directories would need more auditing.
               */
              if (!S_ISREG(mapping->host->i_mode))
                      return -EIO;
              return truncate_inode_page(mapping, page);
      }
      EXPORT_SYMBOL(generic_error_remove_page);
      
      /*
       * Safely invalidate one page from its pagecache mapping.
       * It only drops clean, unused pages. The page must be locked.
       *
       * Returns 1 if the page is successfully invalidated, otherwise 0.
       */
      int invalidate_inode_page(struct page *page)
      {
   21         struct address_space *mapping = page_mapping(page);
              if (!mapping)
   21                 return 0;
   21         if (PageDirty(page) || PageWriteback(page))
                      return 0;
   21         if (page_mapped(page))
                      return 0;
   18         return invalidate_complete_page(mapping, page);
      }
      
      /**
       * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
       * @mapping: mapping to truncate
       * @lstart: offset from which to truncate
       * @lend: offset to which to truncate (inclusive)
       *
       * Truncate the page cache, removing the pages that are between
       * specified offsets (and zeroing out partial pages
       * if lstart or lend + 1 is not page aligned).
       *
       * Truncate takes two passes - the first pass is nonblocking.  It will not
       * block on page locks and it will not block on writeback.  The second pass
       * will wait.  This is to prevent as much IO as possible in the affected region.
       * The first pass will remove most pages, so the search cost of the second pass
       * is low.
       *
       * We pass down the cache-hot hint to the page freeing code.  Even if the
       * mapping is large, it is probably the case that the final pages are the most
       * recently touched, and freeing happens in ascending file offset order.
       *
        * Note that since ->invalidatepage() accepts a range to invalidate,
        * truncate_inode_pages_range() is able to properly handle cases where
        * lend + 1 is not page aligned.
       */
      void truncate_inode_pages_range(struct address_space *mapping,
                                      loff_t lstart, loff_t lend)
      {
              pgoff_t                start;                /* inclusive */
              pgoff_t                end;                /* exclusive */
              unsigned int        partial_start;        /* inclusive */
              unsigned int        partial_end;        /* exclusive */
              struct pagevec        pvec;
              pgoff_t                indices[PAGEVEC_SIZE];
              pgoff_t                index;
              int                i;
      
              cleancache_invalidate_inode(mapping);
 1196         if (mapping->nrpages == 0 && mapping->nrshadows == 0)
                      return;
      
              /* Offsets within partial pages */
  480         partial_start = lstart & (PAGE_CACHE_SIZE - 1);
              partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
      
              /*
               * 'start' and 'end' always covers the range of pages to be fully
               * truncated. Partial pages are covered with 'partial_start' at the
               * start of the range and 'partial_end' at the end of the range.
               * Note that 'end' is exclusive while 'lend' is inclusive.
               */
              start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
              if (lend == -1)
                      /*
                       * lend == -1 indicates end-of-file so we have to set 'end'
                       * to the highest possible pgoff_t and since the type is
                       * unsigned we're using -1.
                       */
                      end = -1;
              else
   30                 end = (lend + 1) >> PAGE_CACHE_SHIFT;
      
  460         pagevec_init(&pvec, 0);
              index = start;
              while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
  480                         min(end - index, (pgoff_t)PAGEVEC_SIZE),
                              indices)) {
  449                 for (i = 0; i < pagevec_count(&pvec); i++) {
  449                         struct page *page = pvec.pages[i];
      
                              /* We rely upon deletion not changing page->index */
                              index = indices[i];
                              if (index >= end)
                                      break;
      
  449                         if (radix_tree_exceptional_entry(page)) {
    2                                 clear_exceptional_entry(mapping, index, page);
                                      continue;
                              }
      
  449                         if (!trylock_page(page))
                                      continue;
                              WARN_ON(page->index != index);
  449                         if (PageWriteback(page)) {
    7                                 unlock_page(page);
                                      continue;
                              }
  449                         truncate_inode_page(mapping, page);
                              unlock_page(page);
                      }
  449                 pagevec_remove_exceptionals(&pvec);
  449                 pagevec_release(&pvec);
  449                 cond_resched();
                      index++;
              }
      
  428         if (partial_start) {
   35                 struct page *page = find_lock_page(mapping, start - 1);
                      if (page) {
                              unsigned int top = PAGE_CACHE_SIZE;
   21                         if (start > end) {
                                      /* Truncation within a single page */
                                      top = partial_end;
                                      partial_end = 0;
                              }
   21                         wait_on_page_writeback(page);
   21                         zero_user_segment(page, partial_start, top);
                              cleancache_invalidate_page(mapping, page);
   21                         if (page_has_private(page))
   14                                 do_invalidatepage(page, partial_start,
                                                        top - partial_start);
   21                         unlock_page(page);
                              page_cache_release(page);
                      }
              }
  428         if (partial_end) {
                      struct page *page = find_lock_page(mapping, end);
                      if (page) {
                              wait_on_page_writeback(page);
                              zero_user_segment(page, 0, partial_end);
                              cleancache_invalidate_page(mapping, page);
                              if (page_has_private(page))
                                      do_invalidatepage(page, 0,
                                                        partial_end);
                              unlock_page(page);
                              page_cache_release(page);
                      }
              }
              /*
               * If the truncation happened within a single page no pages
               * will be released, just zeroed, so we can bail out now.
               */
  428         if (start >= end)
                      return;
      
              index = start;
              for ( ; ; ) {
  428                 cond_resched();
                      if (!pagevec_lookup_entries(&pvec, mapping, index,
                              min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
                              /* If all gone from start onwards, we're done */
  413                         if (index == start)
                                      break;
                              /* Otherwise restart to make sure all gone */
                              index = start;
                              continue;
                      }
   44                 if (index == start && indices[0] >= end) {
                              /* All gone out of hole to be punched, we're done */
   25                         pagevec_remove_exceptionals(&pvec);
   25                         pagevec_release(&pvec);
                              break;
                      }
   20                 for (i = 0; i < pagevec_count(&pvec); i++) {
   20                         struct page *page = pvec.pages[i];
      
                              /* We rely upon deletion not changing page->index */
                              index = indices[i];
                              if (index >= end) {
                                      /* Restart punch to make sure all gone */
                                      index = start - 1;
                                      break;
                              }
      
   20                         if (radix_tree_exceptional_entry(page)) {
                                      clear_exceptional_entry(mapping, index, page);
                                      continue;
                              }
      
   20                         lock_page(page);
   20                         WARN_ON(page->index != index);
   20                         wait_on_page_writeback(page);
   20                         truncate_inode_page(mapping, page);
                              unlock_page(page);
                      }
   20                 pagevec_remove_exceptionals(&pvec);
   20                 pagevec_release(&pvec);
                      index++;
              }
              cleancache_invalidate_inode(mapping);
      }
      EXPORT_SYMBOL(truncate_inode_pages_range);
      
      /**
       * truncate_inode_pages - truncate *all* the pages from an offset
       * @mapping: mapping to truncate
       * @lstart: offset from which to truncate
       *
       * Called under (and serialised by) inode->i_mutex.
       *
       * Note: When this function returns, there can be a page in the process of
       * deletion (inside __delete_from_page_cache()) in the specified range.  Thus
       * mapping->nrpages can be non-zero when this function returns even after
       * truncation of the whole mapping.
       */
      void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
      {
  786         truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
      }
      EXPORT_SYMBOL(truncate_inode_pages);
      
      /**
       * truncate_inode_pages_final - truncate *all* pages before inode dies
       * @mapping: mapping to truncate
       *
       * Called under (and serialized by) inode->i_mutex.
       *
       * Filesystems have to use this in the .evict_inode path to inform the
       * VM that this is the final truncate and the inode is going away.
       */
      void truncate_inode_pages_final(struct address_space *mapping)
      {
              unsigned long nrshadows;
              unsigned long nrpages;
      
              /*
               * Page reclaim can not participate in regular inode lifetime
               * management (can't call iput()) and thus can race with the
               * inode teardown.  Tell it when the address space is exiting,
               * so that it does not install eviction information after the
               * final truncate has begun.
               */
  673         mapping_set_exiting(mapping);
      
              /*
               * When reclaim installs eviction entries, it increases
               * nrshadows first, then decreases nrpages.  Make sure we see
               * this in the right order or we might miss an entry.
               */
              nrpages = mapping->nrpages;
              smp_rmb();
              nrshadows = mapping->nrshadows;
      
              if (nrpages || nrshadows) {
                      /*
                       * As truncation uses a lockless tree lookup, cycle
                       * the tree lock to make sure any ongoing tree
                       * modification that does not see AS_EXITING is
                       * completed before starting the final truncate.
                       */
   15                 spin_lock_irq(&mapping->tree_lock);
                      spin_unlock_irq(&mapping->tree_lock);
              }
      
              /*
               * Cleancache needs notification even if there are no pages or shadow
               * entries.
               */
  673         truncate_inode_pages(mapping, 0);
      }
      EXPORT_SYMBOL(truncate_inode_pages_final);
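
       /*
        * Illustrative sketch, not part of this file: as the kernel-doc above
        * notes, filesystems call truncate_inode_pages_final() from their
        * ->evict_inode() method.  "foofs" below is a hypothetical filesystem.
        */
       static void foofs_evict_inode(struct inode *inode)
       {
               /* drop every remaining page and shadow entry */
               truncate_inode_pages_final(&inode->i_data);
               clear_inode(inode);
               /* filesystem-private teardown would follow here */
       }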
      
      /**
       * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
       * @mapping: the address_space which holds the pages to invalidate
       * @start: the offset 'from' which to invalidate
       * @end: the offset 'to' which to invalidate (inclusive)
       *
        * This function only removes the unlocked pages; if you want to
        * remove all the pages of one inode, you must call truncate_inode_pages.
       *
       * invalidate_mapping_pages() will not block on IO activity. It will not
       * invalidate pages which are dirty, locked, under writeback or mapped into
       * pagetables.
       */
      unsigned long invalidate_mapping_pages(struct address_space *mapping,
                      pgoff_t start, pgoff_t end)
      {
              pgoff_t indices[PAGEVEC_SIZE];
              struct pagevec pvec;
              pgoff_t index = start;
              unsigned long ret;
              unsigned long count = 0;
              int i;
      
   21         pagevec_init(&pvec, 0);
              while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
   21                         min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
                              indices)) {
   21                 for (i = 0; i < pagevec_count(&pvec); i++) {
   21                         struct page *page = pvec.pages[i];
      
                              /* We rely upon deletion not changing page->index */
                              index = indices[i];
                              if (index > end)
                                      break;
      
   21                         if (radix_tree_exceptional_entry(page)) {
                                      clear_exceptional_entry(mapping, index, page);
                                      continue;
                              }
      
   21                         if (!trylock_page(page))
                                      continue;
                              WARN_ON(page->index != index);
   21                         ret = invalidate_inode_page(page);
                              unlock_page(page);
                              /*
                                * Invalidation is a hint that the page is no longer
                                * of interest, so try to speed up its reclaim.
                               */
                              if (!ret)
   11                                 deactivate_file_page(page);
   21                         count += ret;
                      }
   21                 pagevec_remove_exceptionals(&pvec);
   21                 pagevec_release(&pvec);
   21                 cond_resched();
                      index++;
              }
   21         return count;
      }
      EXPORT_SYMBOL(invalidate_mapping_pages);
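
       /*
        * Illustrative sketch, not part of this file: dropping clean, unmapped
        * pagecache for an entire file, in the spirit of POSIX_FADV_DONTNEED.
        * The helper name is hypothetical; the return value is the number of
        * pages that were actually invalidated.
        */
       static unsigned long foo_drop_clean_cache(struct file *file)
       {
               struct address_space *mapping = file->f_mapping;

               /* dirty, locked, writeback or mapped pages are left alone */
               return invalidate_mapping_pages(mapping, 0, -1);
       }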
      
      /*
       * This is like invalidate_complete_page(), except it ignores the page's
       * refcount.  We do this because invalidate_inode_pages2() needs stronger
       * invalidation guarantees, and cannot afford to leave pages behind because
       * shrink_page_list() has a temp ref on them, or because they're transiently
       * sitting in the lru_cache_add() pagevecs.
       */
      static int
      invalidate_complete_page2(struct address_space *mapping, struct page *page)
      {
              struct mem_cgroup *memcg;
              unsigned long flags;
      
  343         if (page->mapping != mapping)
                      return 0;
      
  343         if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                      return 0;
      
              memcg = mem_cgroup_begin_page_stat(page);
  343         spin_lock_irqsave(&mapping->tree_lock, flags);
              if (PageDirty(page))
                      goto failed;
      
  343         BUG_ON(page_has_private(page));
  343         __delete_from_page_cache(page, NULL, memcg);
              spin_unlock_irqrestore(&mapping->tree_lock, flags);
              mem_cgroup_end_page_stat(memcg);
      
              if (mapping->a_ops->freepage)
                      mapping->a_ops->freepage(page);
      
  343         page_cache_release(page);        /* pagecache ref */
              return 1;
      failed:
              spin_unlock_irqrestore(&mapping->tree_lock, flags);
              mem_cgroup_end_page_stat(memcg);
              return 0;
      }
      
      static int do_launder_page(struct address_space *mapping, struct page *page)
      {
  343         if (!PageDirty(page))
                      return 0;
   38         if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
                      return 0;
              return mapping->a_ops->launder_page(page);
      }
      
      /**
       * invalidate_inode_pages2_range - remove range of pages from an address_space
       * @mapping: the address_space
       * @start: the page offset 'from' which to invalidate
       * @end: the page offset 'to' which to invalidate (inclusive)
       *
       * Any pages which are found to be mapped into pagetables are unmapped prior to
       * invalidation.
       *
       * Returns -EBUSY if any pages could not be invalidated.
       */
      int invalidate_inode_pages2_range(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end)
      {
              pgoff_t indices[PAGEVEC_SIZE];
              struct pagevec pvec;
              pgoff_t index;
              int i;
              int ret = 0;
              int ret2 = 0;
              int did_range_unmap = 0;
      
              cleancache_invalidate_inode(mapping);
  362         pagevec_init(&pvec, 0);
              index = start;
              while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
  362                         min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
                              indices)) {
  343                 for (i = 0; i < pagevec_count(&pvec); i++) {
  343                         struct page *page = pvec.pages[i];
      
                              /* We rely upon deletion not changing page->index */
                              index = indices[i];
                              if (index > end)
                                      break;
      
  343                         if (radix_tree_exceptional_entry(page)) {
                                      clear_exceptional_entry(mapping, index, page);
                                      continue;
                              }
      
  343                         lock_page(page);
  343                         WARN_ON(page->index != index);
  343                         if (page->mapping != mapping) {
                                      unlock_page(page);
                                      continue;
                              }
  343                         wait_on_page_writeback(page);
  343                         if (page_mapped(page)) {
   30                                 if (!did_range_unmap) {
                                              /*
                                               * Zap the rest of the file in one hit.
                                               */
   30                                         unmap_mapping_range(mapping,
                                                 (loff_t)index << PAGE_CACHE_SHIFT,
                                                 (loff_t)(1 + end - index)
                                                               << PAGE_CACHE_SHIFT,
                                                  0);
                                              did_range_unmap = 1;
                                      } else {
                                              /*
                                               * Just zap this page
                                               */
    8                                         unmap_mapping_range(mapping,
                                                 (loff_t)index << PAGE_CACHE_SHIFT,
                                                 PAGE_CACHE_SIZE, 0);
                                      }
                              }
  343                         BUG_ON(page_mapped(page));
  343                         ret2 = do_launder_page(mapping, page);
                              if (ret2 == 0) {
  343                                 if (!invalidate_complete_page2(mapping, page))
                                              ret2 = -EBUSY;
                              }
                              if (ret2 < 0)
                                      ret = ret2;
  343                         unlock_page(page);
                      }
  343                 pagevec_remove_exceptionals(&pvec);
  343                 pagevec_release(&pvec);
  343                 cond_resched();
                      index++;
              }
              cleancache_invalidate_inode(mapping);
  361         return ret;
      }
      EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
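
       /*
        * Illustrative sketch, not part of this file: a direct-I/O style caller
        * invalidates the pagecache covering the byte range it is about to
        * overwrite, so later buffered reads cannot see stale data.  The helper
        * name and parameters are hypothetical; count is assumed non-zero.
        */
       static int foo_invalidate_io_range(struct address_space *mapping,
                                          loff_t pos, size_t count)
       {
               pgoff_t first = pos >> PAGE_CACHE_SHIFT;
               pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;

               /* -EBUSY means at least one page could not be invalidated */
               return invalidate_inode_pages2_range(mapping, first, last);
       }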
      
      /**
       * invalidate_inode_pages2 - remove all pages from an address_space
       * @mapping: the address_space
       *
       * Any pages which are found to be mapped into pagetables are unmapped prior to
       * invalidation.
       *
       * Returns -EBUSY if any pages could not be invalidated.
       */
      int invalidate_inode_pages2(struct address_space *mapping)
      {
              return invalidate_inode_pages2_range(mapping, 0, -1);
      }
      EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
      
      /**
       * truncate_pagecache - unmap and remove pagecache that has been truncated
       * @inode: inode
       * @newsize: new file size
       *
       * inode's new i_size must already be written before truncate_pagecache
       * is called.
       *
       * This function should typically be called before the filesystem
       * releases resources associated with the freed range (eg. deallocates
       * blocks). This way, pagecache will always stay logically coherent
       * with on-disk format, and the filesystem would not have to deal with
       * situations such as writepage being called for a page that has already
       * had its underlying blocks deallocated.
       */
      void truncate_pagecache(struct inode *inode, loff_t newsize)
      {
  513         struct address_space *mapping = inode->i_mapping;
              loff_t holebegin = round_up(newsize, PAGE_SIZE);
      
              /*
               * unmap_mapping_range is called twice, first simply for
               * efficiency so that truncate_inode_pages does fewer
               * single-page unmaps.  However after this first call, and
               * before truncate_inode_pages finishes, it is possible for
               * private pages to be COWed, which remain after
               * truncate_inode_pages finishes, hence the second
               * unmap_mapping_range call must be made for correctness.
               */
              unmap_mapping_range(mapping, holebegin, 0, 1);
              truncate_inode_pages(mapping, newsize);
              unmap_mapping_range(mapping, holebegin, 0, 1);
      }
      EXPORT_SYMBOL(truncate_pagecache);
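
       /*
        * Illustrative sketch, not part of this file: the calling order the
        * kernel-doc above describes - publish the new i_size first, then strip
        * the pagecache beyond it, then free the on-disk blocks.  "foofs" is a
        * hypothetical filesystem.
        */
       static void foofs_shrink(struct inode *inode, loff_t newsize)
       {
               i_size_write(inode, newsize);
               truncate_pagecache(inode, newsize);
               /* filesystem-specific block deallocation would follow here */
       }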
      
      /**
       * truncate_setsize - update inode and pagecache for a new file size
       * @inode: inode
       * @newsize: new file size
       *
       * truncate_setsize updates i_size and performs pagecache truncation (if
        * necessary) to @newsize. It will typically be called from the filesystem's
       * setattr function when ATTR_SIZE is passed in.
       *
       * Must be called with a lock serializing truncates and writes (generally
       * i_mutex but e.g. xfs uses a different lock) and before all filesystem
       * specific block truncation has been performed.
       */
      void truncate_setsize(struct inode *inode, loff_t newsize)
      {
   25         loff_t oldsize = inode->i_size;
      
              i_size_write(inode, newsize);
              if (newsize > oldsize)
    6                 pagecache_isize_extended(inode, oldsize, newsize);
   25         truncate_pagecache(inode, newsize);
      }
      EXPORT_SYMBOL(truncate_setsize);
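
       /*
        * Illustrative sketch, not part of this file: the usual ->setattr()
        * pattern for ATTR_SIZE that the kernel-doc above refers to.  "foofs"
        * is a hypothetical filesystem; error handling is kept to a minimum.
        */
       static int foofs_setattr(struct dentry *dentry, struct iattr *attr)
       {
               struct inode *inode = d_inode(dentry);
               int error;

               error = inode_change_ok(inode, attr);
               if (error)
                       return error;

               if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
                       truncate_setsize(inode, attr->ia_size);
                       /* filesystem-specific block truncation would follow here */
               }

               setattr_copy(inode, attr);
               mark_inode_dirty(inode);
               return 0;
       }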
      
      /**
       * pagecache_isize_extended - update pagecache after extension of i_size
       * @inode:        inode for which i_size was extended
       * @from:        original inode size
       * @to:                new inode size
       *
       * Handle extension of inode size either caused by extending truncate or by
       * write starting after current i_size. We mark the page straddling current
       * i_size RO so that page_mkwrite() is called on the nearest write access to
       * the page.  This way filesystem can be sure that page_mkwrite() is called on
       * the page before user writes to the page via mmap after the i_size has been
       * changed.
       *
       * The function must be called after i_size is updated so that page fault
       * coming after we unlock the page will already see the new i_size.
        * The function must be called while we still hold i_mutex - this not only
        * makes sure i_size is stable but also that userspace cannot observe the
        * new i_size value before we are prepared to store mmap writes at the new
        * inode size.
       */
    1 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
      {
   76         int bsize = i_blocksize(inode);
              loff_t rounded_from;
              struct page *page;
              pgoff_t index;
      
              WARN_ON(to > inode->i_size);
      
   76         if (from >= to || bsize == PAGE_CACHE_SIZE)
                      return;
              /* Page straddling @from will not have any hole block created? */
    5         rounded_from = round_up(from, bsize);
   76         if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
                      return;
      
    1         index = from >> PAGE_CACHE_SHIFT;
              page = find_lock_page(inode->i_mapping, index);
              /* Page not cached? Nothing to do */
              if (!page)
                      return;
              /*
               * See clear_page_dirty_for_io() for details why set_page_dirty()
               * is needed.
               */
              if (page_mkclean(page))
                      set_page_dirty(page);
              unlock_page(page);
              page_cache_release(page);
      }
      EXPORT_SYMBOL(pagecache_isize_extended);
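
       /*
        * Illustrative sketch, not part of this file: a write path on a
        * filesystem with blocksize < PAGE_CACHE_SIZE that extends the file.
        * The old size is remembered, i_size is updated first, then the helper
        * re-protects the page straddling the old EOF.  Names are hypothetical.
        */
       static void foo_finish_extending_write(struct inode *inode, loff_t new_size)
       {
               loff_t old_size = inode->i_size;

               if (new_size > old_size) {
                       i_size_write(inode, new_size);
                       pagecache_isize_extended(inode, old_size, new_size);
               }
       }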
      
      /**
       * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
       * @inode: inode
       * @lstart: offset of beginning of hole
       * @lend: offset of last byte of hole
       *
       * This function should typically be called before the filesystem
       * releases resources associated with the freed range (eg. deallocates
       * blocks). This way, pagecache will always stay logically coherent
       * with on-disk format, and the filesystem would not have to deal with
       * situations such as writepage being called for a page that has already
       * had its underlying blocks deallocated.
       */
      void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
      {
   31         struct address_space *mapping = inode->i_mapping;
              loff_t unmap_start = round_up(lstart, PAGE_SIZE);
              loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
              /*
               * This rounding is currently just for example: unmap_mapping_range
               * expands its hole outwards, whereas we want it to contract the hole
               * inwards.  However, existing callers of truncate_pagecache_range are
               * doing their own page rounding first.  Note that unmap_mapping_range
               * allows holelen 0 for all, and we allow lend -1 for end of file.
               */
      
              /*
               * Unlike in truncate_pagecache, unmap_mapping_range is called only
               * once (before truncating pagecache), and without "even_cows" flag:
               * hole-punching should not remove private COWed pages from the hole.
               */
              if ((u64)unmap_end > (u64)unmap_start)
   31                 unmap_mapping_range(mapping, unmap_start,
                                          1 + unmap_end - unmap_start, 0);
   31         truncate_inode_pages_range(mapping, lstart, lend);
      }
      EXPORT_SYMBOL(truncate_pagecache_range);
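
       /*
        * Illustrative sketch, not part of this file: a hole-punch path in the
        * style of fallocate(FALLOC_FL_PUNCH_HOLE) removes the pagecache over
        * the hole before freeing the underlying blocks.  The caller is assumed
        * to have rounded offset/len to block boundaries already; "foofs" is a
        * hypothetical filesystem.
        */
       static void foofs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
       {
               truncate_pagecache_range(inode, offset, offset + len - 1);
               /* filesystem-specific block deallocation would follow here */
       }
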
      /*
       * NTP state machine interfaces and logic.
       *
       * This code was mainly moved from kernel/timer.c and kernel/time.c
       * Please see those files for relevant copyright info and historical
       * changelogs.
       */
      #include <linux/capability.h>
      #include <linux/clocksource.h>
      #include <linux/workqueue.h>
      #include <linux/hrtimer.h>
      #include <linux/jiffies.h>
      #include <linux/math64.h>
      #include <linux/timex.h>
      #include <linux/time.h>
      #include <linux/mm.h>
      #include <linux/module.h>
      #include <linux/rtc.h>
      
      #include "ntp_internal.h"
      
      /*
       * NTP timekeeping variables:
       *
       * Note: All of the NTP state is protected by the timekeeping locks.
       */
      
      
      /* USER_HZ period (usecs): */
      unsigned long                        tick_usec = TICK_USEC;
      
      /* SHIFTED_HZ period (nsecs): */
      unsigned long                        tick_nsec;
      
      static u64                        tick_length;
      static u64                        tick_length_base;
      
      #define SECS_PER_DAY                86400
      #define MAX_TICKADJ                500LL                /* usecs */
      #define MAX_TICKADJ_SCALED \
              (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
      #define MAX_TAI_OFFSET                100000
      
      /*
       * phase-lock loop variables
       */
      
      /*
       * clock synchronization status
       *
       * (TIME_ERROR prevents overwriting the CMOS clock)
       */
      static int                        time_state = TIME_OK;
      
      /* clock status bits:                                                        */
      static int                        time_status = STA_UNSYNC;
      
      /* time adjustment (nsecs):                                                */
      static s64                        time_offset;
      
      /* pll time constant:                                                        */
      static long                        time_constant = 2;
      
      /* maximum error (usecs):                                                */
      static long                        time_maxerror = NTP_PHASE_LIMIT;
      
      /* estimated error (usecs):                                                */
      static long                        time_esterror = NTP_PHASE_LIMIT;
      
      /* frequency offset (scaled nsecs/secs):                                */
      static s64                        time_freq;
      
      /* time at last adjustment (secs):                                        */
      static long                        time_reftime;
      
      static long                        time_adjust;
      
      /* constant (boot-param configurable) NTP tick adjustment (upscaled)        */
      static s64                        ntp_tick_adj;
      
      /* second value of the next pending leapsecond, or TIME64_MAX if no leap */
      static time64_t                        ntp_next_leap_sec = TIME64_MAX;
      
      #ifdef CONFIG_NTP_PPS
      
      /*
       * The following variables are used when a pulse-per-second (PPS) signal
       * is available. They establish the engineering parameters of the clock
       * discipline loop when controlled by the PPS signal.
       */
      #define PPS_VALID        10        /* PPS signal watchdog max (s) */
      #define PPS_POPCORN        4        /* popcorn spike threshold (shift) */
      #define PPS_INTMIN        2        /* min freq interval (s) (shift) */
      #define PPS_INTMAX        8        /* max freq interval (s) (shift) */
      #define PPS_INTCOUNT        4        /* number of consecutive good intervals to
                                         increase pps_shift or consecutive bad
                                         intervals to decrease it */
      #define PPS_MAXWANDER        100000        /* max PPS freq wander (ns/s) */
      
      static int pps_valid;                /* signal watchdog counter */
      static long pps_tf[3];                /* phase median filter */
      static long pps_jitter;                /* current jitter (ns) */
      static struct timespec64 pps_fbase; /* beginning of the last freq interval */
      static int pps_shift;                /* current interval duration (s) (shift) */
      static int pps_intcnt;                /* interval counter */
      static s64 pps_freq;                /* frequency offset (scaled ns/s) */
      static long pps_stabil;                /* current stability (scaled ns/s) */
      
      /*
       * PPS signal quality monitors
       */
      static long pps_calcnt;                /* calibration intervals */
      static long pps_jitcnt;                /* jitter limit exceeded */
      static long pps_stbcnt;                /* stability limit exceeded */
      static long pps_errcnt;                /* calibration errors */
      
      
      /* PPS kernel consumer compensates the whole phase error immediately.
       * Otherwise, reduce the offset by a fixed factor times the time constant.
       */
      static inline s64 ntp_offset_chunk(s64 offset)
      {
              if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
                      return offset;
              else
                      return shift_right(offset, SHIFT_PLL + time_constant);
      }
      
      static inline void pps_reset_freq_interval(void)
      {
              /* the PPS calibration interval may end
                 surprisingly early */
              pps_shift = PPS_INTMIN;
              pps_intcnt = 0;
      }
      
      /**
       * pps_clear - Clears the PPS state variables
       */
      static inline void pps_clear(void)
      {
              pps_reset_freq_interval();
              pps_tf[0] = 0;
              pps_tf[1] = 0;
              pps_tf[2] = 0;
              pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
              pps_freq = 0;
      }
      
      /* Decrease pps_valid to indicate that another second has passed since
        * the last PPS signal. When it reaches 0, indicate that the PPS signal
        * is missing.
       */
      static inline void pps_dec_valid(void)
      {
              if (pps_valid > 0)
                      pps_valid--;
              else {
                      time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
                                       STA_PPSWANDER | STA_PPSERROR);
                      pps_clear();
              }
      }
      
      static inline void pps_set_freq(s64 freq)
      {
              pps_freq = freq;
      }
      
      static inline int is_error_status(int status)
      {
              return (status & (STA_UNSYNC|STA_CLOCKERR))
                      /* PPS signal lost when either PPS time or
                       * PPS frequency synchronization requested
                       */
                      || ((status & (STA_PPSFREQ|STA_PPSTIME))
                              && !(status & STA_PPSSIGNAL))
                      /* PPS jitter exceeded when
                       * PPS time synchronization requested */
                      || ((status & (STA_PPSTIME|STA_PPSJITTER))
                              == (STA_PPSTIME|STA_PPSJITTER))
                      /* PPS wander exceeded or calibration error when
                       * PPS frequency synchronization requested
                       */
                      || ((status & STA_PPSFREQ)
                              && (status & (STA_PPSWANDER|STA_PPSERROR)));
      }
      
      static inline void pps_fill_timex(struct timex *txc)
      {
              txc->ppsfreq           = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
                                               PPM_SCALE_INV, NTP_SCALE_SHIFT);
              txc->jitter           = pps_jitter;
              if (!(time_status & STA_NANO))
                      txc->jitter /= NSEC_PER_USEC;
              txc->shift           = pps_shift;
              txc->stabil           = pps_stabil;
              txc->jitcnt           = pps_jitcnt;
              txc->calcnt           = pps_calcnt;
              txc->errcnt           = pps_errcnt;
              txc->stbcnt           = pps_stbcnt;
      }
      
      #else /* !CONFIG_NTP_PPS */
      
      static inline s64 ntp_offset_chunk(s64 offset)
      {
              return shift_right(offset, SHIFT_PLL + time_constant);
      }
      
      static inline void pps_reset_freq_interval(void) {}
      static inline void pps_clear(void) {}
      static inline void pps_dec_valid(void) {}
      static inline void pps_set_freq(s64 freq) {}
      
      static inline int is_error_status(int status)
      {
              return status & (STA_UNSYNC|STA_CLOCKERR);
      }
      
      static inline void pps_fill_timex(struct timex *txc)
      {
              /* PPS is not implemented, so these are zero */
              txc->ppsfreq           = 0;
              txc->jitter           = 0;
              txc->shift           = 0;
              txc->stabil           = 0;
              txc->jitcnt           = 0;
              txc->calcnt           = 0;
              txc->errcnt           = 0;
              txc->stbcnt           = 0;
      }
      
      #endif /* CONFIG_NTP_PPS */
      
      
      /**
       * ntp_synced - Returns 1 if the NTP status is not UNSYNC
       *
       */
      static inline int ntp_synced(void)
      {
              return !(time_status & STA_UNSYNC);
      }
      
      
      /*
       * NTP methods:
       */
      
      /*
       * Update (tick_length, tick_length_base, tick_nsec), based
       * on (tick_usec, ntp_tick_adj, time_freq):
       */
      static void ntp_update_frequency(void)
      {
              u64 second_length;
              u64 new_base;
      
              second_length                 = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
                                                      << NTP_SCALE_SHIFT;
      
              second_length                += ntp_tick_adj;
              second_length                += time_freq;
      
              tick_nsec                 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
              new_base                 = div_u64(second_length, NTP_INTERVAL_FREQ);
      
              /*
               * Don't wait for the next second_overflow, apply
               * the change to the tick length immediately:
               */
              tick_length                += new_base - tick_length_base;
              tick_length_base         = new_base;
      }
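
       /*
        * Worked example (illustrative only, assuming USER_HZ == 100, HZ == 1000
        * and no ntp_tick_adj/time_freq correction): tick_usec is 10000 usec, so
        * the unscaled second length is 10000 * 1000 * 100 = 1,000,000,000 ns.
        * tick_nsec then becomes 1,000,000 ns per HZ tick, while tick_length is
        * kept in the NTP_SCALE_SHIFT fixed-point form so sub-nanosecond
        * frequency corrections accumulate without rounding loss.
        */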
      
      static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
      {
              time_status &= ~STA_MODE;
      
              if (secs < MINSEC)
                      return 0;
      
              if (!(time_status & STA_FLL) && (secs <= MAXSEC))
                      return 0;
      
              time_status |= STA_MODE;
      
              return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
      }
      
      static void ntp_update_offset(long offset)
      {
              s64 freq_adj;
              s64 offset64;
              long secs;
      
              if (!(time_status & STA_PLL))
                      return;
      
              if (!(time_status & STA_NANO))
                      offset *= NSEC_PER_USEC;
      
              /*
               * Scale the phase adjustment and
               * clamp to the operating range.
               */
              offset = min(offset, MAXPHASE);
              offset = max(offset, -MAXPHASE);
      
              /*
               * Select how the frequency is to be controlled
               * and in which mode (PLL or FLL).
               */
              secs = get_seconds() - time_reftime;
              if (unlikely(time_status & STA_FREQHOLD))
                      secs = 0;
      
              time_reftime = get_seconds();
      
              offset64    = offset;
              freq_adj    = ntp_update_offset_fll(offset64, secs);
      
              /*
               * Clamp update interval to reduce PLL gain with low
               * sampling rate (e.g. intermittent network connection)
               * to avoid instability.
               */
              if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
                      secs = 1 << (SHIFT_PLL + 1 + time_constant);
      
              freq_adj    += (offset64 * secs) <<
                              (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
      
              freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED);
      
              time_freq   = max(freq_adj, -MAXFREQ_SCALED);
      
              time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
      }
      
      /**
       * ntp_clear - Clears the NTP state variables
       */
      void ntp_clear(void)
      {
              time_adjust        = 0;                /* stop active adjtime() */
              time_status        |= STA_UNSYNC;
              time_maxerror        = NTP_PHASE_LIMIT;
              time_esterror        = NTP_PHASE_LIMIT;
      
              ntp_update_frequency();
      
              tick_length        = tick_length_base;
              time_offset        = 0;
      
              ntp_next_leap_sec = TIME64_MAX;
              /* Clear PPS state variables */
              pps_clear();
      }
      
      
      u64 ntp_tick_length(void)
      {
              return tick_length;
      }
      
      /**
       * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
       *
       * Provides the time of the next leapsecond against CLOCK_REALTIME in
       * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
       */
      ktime_t ntp_get_next_leap(void)
      {
              ktime_t ret;
    5 
              if ((time_state == TIME_INS) && (time_status & STA_INS))
                      return ktime_set(ntp_next_leap_sec, 0);
    5         ret.tv64 = KTIME_MAX;
              return ret;
      }
      
      /*
       * this routine handles the overflow of the microsecond field
       *
       * The tricky bits of code to handle the accurate clock support
       * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
       * They were originally developed for SUN and DEC kernels.
       * All the kudos should go to Dave for this stuff.
       *
       * Also handles leap second processing, and returns leap offset
       */
      int second_overflow(unsigned long secs)
      {
              s64 delta;
              int leap = 0;
      
              /*
               * Leap second processing. If in leap-insert state at the end of the
               * day, the system clock is set back one second; if in leap-delete
               * state, the system clock is set ahead one second.
               */
              switch (time_state) {
              case TIME_OK:
                      if (time_status & STA_INS) {
                              time_state = TIME_INS;
                              ntp_next_leap_sec = secs + SECS_PER_DAY -
                                                      (secs % SECS_PER_DAY);
                      } else if (time_status & STA_DEL) {
                              time_state = TIME_DEL;
                              ntp_next_leap_sec = secs + SECS_PER_DAY -
                                                       ((secs+1) % SECS_PER_DAY);
                      }
                      break;
              case TIME_INS:
                      if (!(time_status & STA_INS)) {
                              ntp_next_leap_sec = TIME64_MAX;
                              time_state = TIME_OK;
                      } else if (secs % SECS_PER_DAY == 0) {
                              leap = -1;
                              time_state = TIME_OOP;
                              printk(KERN_NOTICE
                                      "Clock: inserting leap second 23:59:60 UTC\n");
                      }
                      break;
              case TIME_DEL:
                      if (!(time_status & STA_DEL)) {
                              ntp_next_leap_sec = TIME64_MAX;
                              time_state = TIME_OK;
                      } else if ((secs + 1) % SECS_PER_DAY == 0) {
                              leap = 1;
                              ntp_next_leap_sec = TIME64_MAX;
                              time_state = TIME_WAIT;
                              printk(KERN_NOTICE
                                      "Clock: deleting leap second 23:59:59 UTC\n");
                      }
                      break;
              case TIME_OOP:
                      ntp_next_leap_sec = TIME64_MAX;
                      time_state = TIME_WAIT;
                      break;
              case TIME_WAIT:
                      if (!(time_status & (STA_INS | STA_DEL)))
                              time_state = TIME_OK;
                      break;
              }
      
      
              /* Bump the maxerror field */
              time_maxerror += MAXFREQ / NSEC_PER_USEC;
              if (time_maxerror > NTP_PHASE_LIMIT) {
                      time_maxerror = NTP_PHASE_LIMIT;
                      time_status |= STA_UNSYNC;
              }
      
              /* Compute the phase adjustment for the next second */
              tick_length         = tick_length_base;
      
              delta                 = ntp_offset_chunk(time_offset);
              time_offset        -= delta;
              tick_length        += delta;
      
              /* Check PPS signal */
              pps_dec_valid();
      
              if (!time_adjust)
                      goto out;
      
              if (time_adjust > MAX_TICKADJ) {
                      time_adjust -= MAX_TICKADJ;
                      tick_length += MAX_TICKADJ_SCALED;
                      goto out;
              }
      
              if (time_adjust < -MAX_TICKADJ) {
                      time_adjust += MAX_TICKADJ;
                      tick_length -= MAX_TICKADJ_SCALED;
                      goto out;
              }
      
              tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
                                                               << NTP_SCALE_SHIFT;
              time_adjust = 0;
      
      out:
              return leap;
      }
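
       /*
        * Illustrative sketch, not part of this file: the timekeeping core calls
        * second_overflow() once for every accumulated wall-clock second and
        * applies the returned leap offset (-1, 0 or +1) to its seconds counter.
        * The helper below is a simplified, hypothetical stand-in for that code.
        */
       static void foo_accumulate_second(unsigned long *xtime_sec)
       {
               int leap;

               (*xtime_sec)++;
               leap = second_overflow(*xtime_sec);
               if (leap)
                       *xtime_sec += leap;     /* insert or delete the leap second */
       }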
      
      #ifdef CONFIG_GENERIC_CMOS_UPDATE
      int __weak update_persistent_clock(struct timespec now)
      {
              return -ENODEV;
      }
      
      int __weak update_persistent_clock64(struct timespec64 now64)
      {
              struct timespec now;
      
              now = timespec64_to_timespec(now64);
              return update_persistent_clock(now);
      }
      #endif
      
      #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
      static void sync_cmos_clock(struct work_struct *work);
      
      static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
      
      static void sync_cmos_clock(struct work_struct *work)
      {
              struct timespec64 now;
              struct timespec64 next;
              int fail = 1;
      
              /*
               * If we have an externally synchronized Linux clock, then update
               * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
               * called as close as possible to 500 ms before the new second starts.
               * This code is run on a timer.  If the clock is set, that timer
               * may not expire at the correct time.  Thus, we adjust...
               * We want the clock to be within a couple of ticks from the target.
               */
              if (!ntp_synced()) {
                      /*
                       * Not synced, exit, do not restart a timer (if one is
                       * running, let it run out).
                       */
                      return;
              }
      
              getnstimeofday64(&now);
              if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
                      struct timespec64 adjust = now;
      
                      fail = -ENODEV;
                      if (persistent_clock_is_local)
                              adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
      #ifdef CONFIG_GENERIC_CMOS_UPDATE
                      fail = update_persistent_clock64(adjust);
      #endif
      
      #ifdef CONFIG_RTC_SYSTOHC
                      if (fail == -ENODEV)
                              fail = rtc_set_ntp_time(adjust);
      #endif
              }
      
              next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
              if (next.tv_nsec <= 0)
                      next.tv_nsec += NSEC_PER_SEC;
      
              if (!fail || fail == -ENODEV)
                      next.tv_sec = 659;
              else
                      next.tv_sec = 0;
      
              if (next.tv_nsec >= NSEC_PER_SEC) {
                      next.tv_sec++;
                      next.tv_nsec -= NSEC_PER_SEC;
              }
              queue_delayed_work(system_power_efficient_wq,
                                 &sync_cmos_work, timespec64_to_jiffies(&next));
      }
      
      void ntp_notify_cmos_timer(void)
    5 {
              queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
      }
      
      #else
      void ntp_notify_cmos_timer(void) { }
      #endif
      
      
      /*
       * Propagate a new txc->status value into the NTP state:
       */
      static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
      {
              if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
                      time_state = TIME_OK;
                      time_status = STA_UNSYNC;
                      ntp_next_leap_sec = TIME64_MAX;
                      /* restart PPS frequency calibration */
                      pps_reset_freq_interval();
              }
      
              /*
               * If we turn on PLL adjustments then reset the
               * reference time to current time.
               */
              if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
                      time_reftime = get_seconds();
      
              /* only set allowed bits */
              time_status &= STA_RONLY;
              time_status |= txc->status & ~STA_RONLY;
      }
      
      
      static inline void process_adjtimex_modes(struct timex *txc,
                                                      struct timespec64 *ts,
                                                      s32 *time_tai)
      {
              if (txc->modes & ADJ_STATUS)
                      process_adj_status(txc, ts);
      
              if (txc->modes & ADJ_NANO)
                      time_status |= STA_NANO;
      
              if (txc->modes & ADJ_MICRO)
                      time_status &= ~STA_NANO;
      
              if (txc->modes & ADJ_FREQUENCY) {
                      time_freq = txc->freq * PPM_SCALE;
                      time_freq = min(time_freq, MAXFREQ_SCALED);
                      time_freq = max(time_freq, -MAXFREQ_SCALED);
                      /* update pps_freq */
                      pps_set_freq(time_freq);
              }
      
              if (txc->modes & ADJ_MAXERROR)
                      time_maxerror = txc->maxerror;
      
              if (txc->modes & ADJ_ESTERROR)
                      time_esterror = txc->esterror;
      
              if (txc->modes & ADJ_TIMECONST) {
                      time_constant = txc->constant;
                      if (!(time_status & STA_NANO))
                              time_constant += 4;
                      time_constant = min(time_constant, (long)MAXTC);
                      time_constant = max(time_constant, 0l);
              }
      
              if (txc->modes & ADJ_TAI &&
                              txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
                      *time_tai = txc->constant;
      
              if (txc->modes & ADJ_OFFSET)
                      ntp_update_offset(txc->offset);
      
              if (txc->modes & ADJ_TICK)
                      tick_usec = txc->tick;
      
              if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
                      ntp_update_frequency();
      }
      
      
      
      /**
       * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
       */
    9 int ntp_validate_timex(struct timex *txc)
      {
    5         if (txc->modes & ADJ_ADJTIME) {
                      /* singleshot must not be used with any other mode bits */
    4                 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
    1                         return -EINVAL;
                      if (!(txc->modes & ADJ_OFFSET_READONLY) &&
                          !capable(CAP_SYS_TIME))
                              return -EPERM;
    4         } else {
                      /* In order to modify anything, you gotta be super-user! */
                       if (txc->modes && !capable(CAP_SYS_TIME))
                              return -EPERM;
                      /*
                       * if the quartz is off by more than 10% then
                       * something is VERY wrong!
                       */
                      if (txc->modes & ADJ_TICK &&
                          (txc->tick <  900000/USER_HZ ||
                           txc->tick > 1100000/USER_HZ))
                              return -EINVAL;
    6         }
      
    1         if (txc->modes & ADJ_SETOFFSET) {
                      /* In order to inject time, you gotta be super-user! */
                      if (!capable(CAP_SYS_TIME))
                              return -EPERM;
      
                      if (txc->modes & ADJ_NANO) {
                              struct timespec ts;
      
                              ts.tv_sec = txc->time.tv_sec;
                              ts.tv_nsec = txc->time.tv_usec;
                              if (!timespec_inject_offset_valid(&ts))
                                      return -EINVAL;
      
                      } else {
                              if (!timeval_inject_offset_valid(&txc->time))
                                      return -EINVAL;
                      }
              }
      
              /*
               * Check for potential multiplication overflows that can
    5          * only happen on 64-bit systems:
    1          */
              if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
    1                 if (LLONG_MIN / PPM_SCALE > txc->freq)
                              return -EINVAL;
                      if (LLONG_MAX / PPM_SCALE < txc->freq)
                              return -EINVAL;
    9         }
      
              return 0;
      }
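
       /*
        * Illustrative sketch, not part of this file: a minimal struct timex, as
        * an adjtimex() caller might build it, that passes the checks above - a
        * single frequency adjustment (scaled ppm, i.e. ppm with a 16-bit
        * fractional part) and none of the ADJ_ADJTIME/ADJ_SETOFFSET bits.  The
        * helper name is hypothetical.
        */
       static int foo_build_freq_adjustment(struct timex *txc, long scaled_ppm)
       {
               memset(txc, 0, sizeof(*txc));
               txc->modes = ADJ_FREQUENCY;
               txc->freq = scaled_ppm;

               return ntp_validate_timex(txc);
       }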
      
      
      /*
        * adjtimex mainly allows reading (and writing, if superuser) of
        * kernel time-keeping variables. Used by xntpd.
       */
      int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
      {
    5         int result;
    2 
              if (txc->modes & ADJ_ADJTIME) {
                      long save_adjust = time_adjust;
      
                      if (!(txc->modes & ADJ_OFFSET_READONLY)) {
                              /* adjtime() is independent from ntp_adjtime() */
                              time_adjust = txc->offset;
    2                         ntp_update_frequency();
                      }
                      txc->offset = save_adjust;
              } else {
    3 
                      /* If there are input parameters, then process them: */
                      if (txc->modes)
    3                         process_adjtimex_modes(txc, ts, time_tai);
      
    3                 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
    3                                   NTP_SCALE_SHIFT);
                      if (!(time_status & STA_NANO))
                              txc->offset /= NSEC_PER_USEC;
    5         }
      
              result = time_state;        /* mostly `TIME_OK' */
              /* check for errors */
              if (is_error_status(time_status))
    5                 result = TIME_ERROR;
      
              txc->freq           = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
                                               PPM_SCALE_INV, NTP_SCALE_SHIFT);
              txc->maxerror           = time_maxerror;
              txc->esterror           = time_esterror;
              txc->status           = time_status;
              txc->constant           = time_constant;
              txc->precision           = 1;
              txc->tolerance           = MAXFREQ_SCALED / PPM_SCALE;
              txc->tick           = tick_usec;
              txc->tai           = *time_tai;
      
              /* fill PPS status fields */
              pps_fill_timex(txc);
      
              txc->time.tv_sec = (time_t)ts->tv_sec;
    5         txc->time.tv_usec = ts->tv_nsec;
              if (!(time_status & STA_NANO))
                      txc->time.tv_usec /= NSEC_PER_USEC;
    5 
              /* Handle leapsec adjustments */
              if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
                      if ((time_state == TIME_INS) && (time_status & STA_INS)) {
                              result = TIME_OOP;
                              txc->tai++;
                              txc->time.tv_sec--;
                      }
                      if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
                              result = TIME_WAIT;
                              txc->tai--;
                              txc->time.tv_sec++;
                      }
                      if ((time_state == TIME_OOP) &&
                                              (ts->tv_sec == ntp_next_leap_sec)) {
                              result = TIME_WAIT;
                      }
    5         }
      
              return result;
      }
      
      #ifdef        CONFIG_NTP_PPS
      
      /* actually struct pps_normtime is good old struct timespec, but it is
       * semantically different (and it is the reason why it was invented):
       * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
       * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
      struct pps_normtime {
              s64                sec;        /* seconds */
              long                nsec;        /* nanoseconds */
      };
      
      /* normalize the timestamp so that nsec is in the
         ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
      static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
      {
              struct pps_normtime norm = {
                      .sec = ts.tv_sec,
                      .nsec = ts.tv_nsec
              };
      
              if (norm.nsec > (NSEC_PER_SEC >> 1)) {
                      norm.nsec -= NSEC_PER_SEC;
                      norm.sec++;
              }
      
              return norm;
      }
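
       /*
        * Worked example (illustrative only): a raw timestamp of 5.7 s, i.e.
        * { .tv_sec = 5, .tv_nsec = 700000000 }, normalizes to
        * { .sec = 6, .nsec = -300000000 }, so the nanosecond part always lies
        * in ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ].
        */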
      
      /* get current phase correction and jitter */
      static inline long pps_phase_filter_get(long *jitter)
      {
              *jitter = pps_tf[0] - pps_tf[1];
              if (*jitter < 0)
                      *jitter = -*jitter;
      
              /* TODO: test various filters */
              return pps_tf[0];
      }
      
      /* add the sample to the phase filter */
      static inline void pps_phase_filter_add(long err)
      {
              pps_tf[2] = pps_tf[1];
              pps_tf[1] = pps_tf[0];
              pps_tf[0] = err;
      }
      
      /* decrease frequency calibration interval length.
       * It is halved after four consecutive unstable intervals.
       */
      static inline void pps_dec_freq_interval(void)
      {
              if (--pps_intcnt <= -PPS_INTCOUNT) {
                      pps_intcnt = -PPS_INTCOUNT;
                      if (pps_shift > PPS_INTMIN) {
                              pps_shift--;
                              pps_intcnt = 0;
                      }
              }
      }
      
      /* increase frequency calibration interval length.
       * It is doubled after four consecutive stable intervals.
       */
      static inline void pps_inc_freq_interval(void)
      {
              if (++pps_intcnt >= PPS_INTCOUNT) {
                      pps_intcnt = PPS_INTCOUNT;
                      if (pps_shift < PPS_INTMAX) {
                              pps_shift++;
                              pps_intcnt = 0;
                      }
              }
      }
      
      /* update clock frequency based on MONOTONIC_RAW clock PPS signal
       * timestamps
       *
       * At the end of the calibration interval the difference between the
       * first and last MONOTONIC_RAW clock timestamps divided by the length
       * of the interval becomes the frequency update. If the interval was
       * too long, the data are discarded.
       * Returns the difference between old and new frequency values.
       */
      static long hardpps_update_freq(struct pps_normtime freq_norm)
      {
              long delta, delta_mod;
              s64 ftemp;
      
              /* check if the frequency interval was too long */
              if (freq_norm.sec > (2 << pps_shift)) {
                      time_status |= STA_PPSERROR;
                      pps_errcnt++;
                      pps_dec_freq_interval();
                      printk_deferred(KERN_ERR
                              "hardpps: PPSERROR: interval too long - %lld s\n",
                              freq_norm.sec);
                      return 0;
              }
      
              /* here the raw frequency offset and wander (stability) is
               * calculated. If the wander is less than the wander threshold
               * the interval is increased; otherwise it is decreased.
               */
              ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
                              freq_norm.sec);
              delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
              pps_freq = ftemp;
              if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
                      printk_deferred(KERN_WARNING
                                      "hardpps: PPSWANDER: change=%ld\n", delta);
                      time_status |= STA_PPSWANDER;
                      pps_stbcnt++;
                      pps_dec_freq_interval();
              } else {        /* good sample */
                      pps_inc_freq_interval();
              }
      
              /* the stability metric is calculated as the average of recent
               * frequency changes, but is used only for performance
               * monitoring
               */
              delta_mod = delta;
              if (delta_mod < 0)
                      delta_mod = -delta_mod;
              pps_stabil += (div_s64(((s64)delta_mod) <<
                                      (NTP_SCALE_SHIFT - SHIFT_USEC),
                                      NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
      
              /* if enabled, the system clock frequency is updated */
              if ((time_status & STA_PPSFREQ) != 0 &&
                  (time_status & STA_FREQHOLD) == 0) {
                      time_freq = pps_freq;
                      ntp_update_frequency();
              }
      
              return delta;
      }
      
      /* correct REALTIME clock phase error against PPS signal */
      static void hardpps_update_phase(long error)
      {
              long correction = -error;
              long jitter;
      
              /* add the sample to the median filter */
              pps_phase_filter_add(correction);
              correction = pps_phase_filter_get(&jitter);
      
              /* Nominal jitter is due to PPS signal noise. If it exceeds the
               * threshold, the sample is discarded; otherwise, if so enabled,
               * the time offset is updated.
               */
              if (jitter > (pps_jitter << PPS_POPCORN)) {
                      printk_deferred(KERN_WARNING
                                      "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
                                      jitter, (pps_jitter << PPS_POPCORN));
                      time_status |= STA_PPSJITTER;
                      pps_jitcnt++;
              } else if (time_status & STA_PPSTIME) {
                      /* correct the time using the phase offset */
                      time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
                                      NTP_INTERVAL_FREQ);
                      /* cancel running adjtime() */
                      time_adjust = 0;
              }
              /* update jitter */
              pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
      }
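
       /*
        * Illustrative example (made-up values, and assuming PPS_POPCORN is the
        * usual shift of 2): with a running pps_jitter of 200 ns the popcorn
        * limit is 200 << 2 = 800 ns.  A sample whose filter jitter exceeds
        * 800 ns only logs the PPSJITTER warning and sets STA_PPSJITTER;
        * otherwise, with STA_PPSTIME set, time_offset is reloaded from the
        * filtered correction.  Either way pps_jitter is then averaged toward
        * the new jitter value.
        */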
      
      /*
       * __hardpps() - discipline CPU clock oscillator to external PPS signal
       *
       * This routine is called at each PPS signal arrival in order to
       * discipline the CPU clock oscillator to the PPS signal. It takes two
       * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
       * is used to correct clock phase error and the latter is used to
       * correct the frequency.
       *
       * This code is based on David Mills's reference nanokernel
       * implementation. It was mostly rewritten but keeps the same idea.
       */
      void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
      {
              struct pps_normtime pts_norm, freq_norm;
      
              pts_norm = pps_normalize_ts(*phase_ts);
      
              /* clear the error bits, they will be set again if needed */
              time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
      
              /* indicate signal presence */
              time_status |= STA_PPSSIGNAL;
              pps_valid = PPS_VALID;
      
              /* when called for the first time,
               * just start the frequency interval */
              if (unlikely(pps_fbase.tv_sec == 0)) {
                      pps_fbase = *raw_ts;
                      return;
              }
      
              /* ok, now we have a base for frequency calculation */
              freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
      
              /* check that the signal is in the range
               * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
              if ((freq_norm.sec == 0) ||
                              (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
                              (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
                      time_status |= STA_PPSJITTER;
                      /* restart the frequency calibration interval */
                      pps_fbase = *raw_ts;
                      printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
                      return;
              }
      
              /* signal is ok */
      
              /* check if the current frequency interval is finished */
              if (freq_norm.sec >= (1 << pps_shift)) {
                      pps_calcnt++;
                      /* restart the frequency calibration interval */
                      pps_fbase = *raw_ts;
                      hardpps_update_freq(freq_norm);
              }
      
              hardpps_update_phase(pts_norm.nsec);
      
      }
      #endif        /* CONFIG_NTP_PPS */
      
      static int __init ntp_tick_adj_setup(char *str)
      {
              int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
      
              if (rc)
                      return rc;
              ntp_tick_adj <<= NTP_SCALE_SHIFT;
      
              return 1;
      }
      
      __setup("ntp_tick_adj=", ntp_tick_adj_setup);
      
      void __init ntp_init(void)
      {
              ntp_clear();
      }
      /*
       * RT Mutexes: blocking mutual exclusion locks with PI support
       *
       * started by Ingo Molnar and Thomas Gleixner:
       *
       *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
       *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
       *
       * This file contains the private data structure and API definitions.
       */
      
      #ifndef __KERNEL_RTMUTEX_COMMON_H
      #define __KERNEL_RTMUTEX_COMMON_H
      
      #include <linux/rtmutex.h>
      
       /*
        * This is the control structure for tasks blocked on a rt_mutex,
        * which is allocated on the kernel stack of the blocked task.
        *
        * @tree_entry:                pi node to enqueue into the mutex waiters tree
        * @pi_tree_entry:        pi node to enqueue into the mutex owner waiters tree
        * @task:                task reference to the blocked task
        * @lock:                the rt_mutex this task is blocked on
        * @prio:                priority of the blocked task, cached to keep the
        *                        waiter trees ordered
        */
      struct rt_mutex_waiter {
              struct rb_node          tree_entry;
              struct rb_node          pi_tree_entry;
              struct task_struct        *task;
              struct rt_mutex                *lock;
      #ifdef CONFIG_DEBUG_RT_MUTEXES
              unsigned long                ip;
              struct pid                *deadlock_task_pid;
              struct rt_mutex                *deadlock_lock;
      #endif
              int prio;
      };
      
      /*
       * Various helpers to access the waiters-tree:
       */
      static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
      {
              return !RB_EMPTY_ROOT(&lock->waiters);
      }
      
      static inline struct rt_mutex_waiter *
      rt_mutex_top_waiter(struct rt_mutex *lock)
      {
              struct rt_mutex_waiter *w;
      
              w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
                           tree_entry);
              BUG_ON(w->lock != lock);
      
              return w;
      }
      
      static inline int task_has_pi_waiters(struct task_struct *p)
      {
              return !RB_EMPTY_ROOT(&p->pi_waiters);
      }
      
      static inline struct rt_mutex_waiter *
      task_top_pi_waiter(struct task_struct *p)
      {
              return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
                              pi_tree_entry);
      }
      
      /*
       * lock->owner state tracking:
       */
      #define RT_MUTEX_HAS_WAITERS        1UL
      #define RT_MUTEX_OWNER_MASKALL        1UL
      
      static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
      {
    9         unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
      
              return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
      }
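
       /*
        * Example of the owner encoding (addresses are illustrative): if the task
        * at 0xffff888012345678 owns the lock and at least one waiter is enqueued,
        * lock->owner holds 0xffff888012345679, i.e. the task pointer with
        * RT_MUTEX_HAS_WAITERS or-ed into bit 0.  rt_mutex_owner() masks that bit
        * off again:
        *
        *   struct task_struct *owner = rt_mutex_owner(lock);
        *   bool has_waiters = (unsigned long)READ_ONCE(lock->owner) &
        *                      RT_MUTEX_HAS_WAITERS;
        */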
      
      /*
       * Constants for rt mutex functions which have a selectable deadlock
       * detection.
       *
       * RT_MUTEX_MIN_CHAINWALK:        Stops the lock chain walk when there are
       *                                no further PI adjustments to be made.
       *
       * RT_MUTEX_FULL_CHAINWALK:        Invoke deadlock detection with a full
       *                                walk of the lock chain.
       */
      enum rtmutex_chainwalk {
              RT_MUTEX_MIN_CHAINWALK,
              RT_MUTEX_FULL_CHAINWALK,
      };
      
      /*
       * PI-futex support (proxy locking functions, etc.):
       */
      extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
      extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                             struct task_struct *proxy_owner);
      extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                        struct task_struct *proxy_owner);
      extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                           struct rt_mutex_waiter *waiter,
                                           struct task_struct *task);
      extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
                                     struct hrtimer_sleeper *to,
                                     struct rt_mutex_waiter *waiter);
      extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
                                       struct rt_mutex_waiter *waiter);
      extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
      extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
                                        struct wake_q_head *wqh);
      extern void rt_mutex_adjust_prio(struct task_struct *task);
      
      #ifdef CONFIG_DEBUG_RT_MUTEXES
      # include "rtmutex-debug.h"
      #else
      # include "rtmutex.h"
      #endif
      
      #endif
      /*
       * lib/hexdump.c
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation. See README and COPYING for
       * more details.
       */
      
      #include <linux/types.h>
      #include <linux/ctype.h>
      #include <linux/kernel.h>
      #include <linux/export.h>
      #include <asm/unaligned.h>
      
      const char hex_asc[] = "0123456789abcdef";
      EXPORT_SYMBOL(hex_asc);
      const char hex_asc_upper[] = "0123456789ABCDEF";
      EXPORT_SYMBOL(hex_asc_upper);
      
      /**
       * hex_to_bin - convert a hex digit to its real value
        * @ch: ASCII character representing a hex digit
       *
       * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad
       * input.
       */
      int hex_to_bin(char ch)
      {
    4         if ((ch >= '0') && (ch <= '9'))
    2                 return ch - '0';
    3         ch = tolower(ch);
    3         if ((ch >= 'a') && (ch <= 'f'))
    3                 return ch - 'a' + 10;
              return -1;
      }
      EXPORT_SYMBOL(hex_to_bin);
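
       /*
        * Example (a minimal sketch): parsing a single hex digit; the caller must
        * check for the -1 error return.
        *
        *   int v = hex_to_bin('a');        // v == 10
        *   int e = hex_to_bin('g');        // e == -1, invalid digit
        */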
      
      /**
       * hex2bin - convert an ascii hexadecimal string to its binary representation
       * @dst: binary result
       * @src: ascii hexadecimal string
       * @count: result length
       *
       * Return 0 on success, -1 in case of bad input.
       */
      int hex2bin(u8 *dst, const char *src, size_t count)
      {
              while (count--) {
                      int hi = hex_to_bin(*src++);
                      int lo = hex_to_bin(*src++);
      
                      if ((hi < 0) || (lo < 0))
                              return -1;
      
                      *dst++ = (hi << 4) | lo;
              }
              return 0;
      }
      EXPORT_SYMBOL(hex2bin);
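
       /*
        * Example (a minimal sketch): @count is the number of output bytes, so the
        * source string must supply 2 * @count hex digits.
        *
        *   u8 mac[6];
        *
        *   if (hex2bin(mac, "001122334455", sizeof(mac)))
        *           return -EINVAL;         // a non-hex digit was found
        *   // mac[] is now { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }
        */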
      
      /**
       * bin2hex - convert binary data to an ascii hexadecimal string
       * @dst: ascii hexadecimal result
       * @src: binary data
       * @count: binary data length
       */
      char *bin2hex(char *dst, const void *src, size_t count)
      {
              const unsigned char *_src = src;
      
              while (count--)
                      dst = hex_byte_pack(dst, *_src++);
              return dst;
      }
      EXPORT_SYMBOL(bin2hex);
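
       /*
        * Example (a minimal sketch): bin2hex() does not NUL-terminate and returns
        * a pointer just past the last character written, so terminate explicitly.
        *
        *   u8 key[4] = { 0xde, 0xad, 0xbe, 0xef };
        *   char out[2 * sizeof(key) + 1];
        *
        *   *bin2hex(out, key, sizeof(key)) = '\0';    // out == "deadbeef"
        */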
      
      /**
       * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory
       * @buf: data blob to dump
       * @len: number of bytes in the @buf
       * @rowsize: number of bytes to print per line; must be 16 or 32
       * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1)
       * @linebuf: where to put the converted data
       * @linebuflen: total size of @linebuf, including space for terminating NUL
       * @ascii: include ASCII after the hex output
       *
       * hex_dump_to_buffer() works on one "line" of output at a time, i.e.,
       * 16 or 32 bytes of input data converted to hex + ASCII output.
       *
       * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data
       * to a hex + ASCII dump at the supplied memory location.
       * The converted output is always NUL-terminated.
       *
       * E.g.:
       *   hex_dump_to_buffer(frame->data, frame->len, 16, 1,
       *                        linebuf, sizeof(linebuf), true);
       *
       * example output buffer:
       * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
       *
       * Return:
        * The number of bytes placed in the buffer without the terminating NUL. If the
       * output was truncated, then the return value is the number of bytes
       * (excluding the terminating NUL) which would have been written to the final
       * string if enough space had been available.
       */
      int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize,
                             char *linebuf, size_t linebuflen, bool ascii)
      {
              const u8 *ptr = buf;
              int ngroups;
              u8 ch;
              int j, lx = 0;
              int ascii_column;
              int ret;
      
              if (rowsize != 16 && rowsize != 32)
                      rowsize = 16;
      
              if (len > rowsize)                /* limit to one line at a time */
                      len = rowsize;
              if (!is_power_of_2(groupsize) || groupsize > 8)
                      groupsize = 1;
              if ((len % groupsize) != 0)        /* no mixed size output */
                      groupsize = 1;
      
              ngroups = len / groupsize;
              ascii_column = rowsize * 2 + rowsize / groupsize + 1;
      
              if (!linebuflen)
                      goto overflow1;
      
              if (!len)
                      goto nil;
      
              if (groupsize == 8) {
                      const u64 *ptr8 = buf;
      
                      for (j = 0; j < ngroups; j++) {
                              ret = snprintf(linebuf + lx, linebuflen - lx,
                                             "%s%16.16llx", j ? " " : "",
                                             get_unaligned(ptr8 + j));
                              if (ret >= linebuflen - lx)
                                      goto overflow1;
                              lx += ret;
                      }
              } else if (groupsize == 4) {
                      const u32 *ptr4 = buf;
      
                      for (j = 0; j < ngroups; j++) {
                              ret = snprintf(linebuf + lx, linebuflen - lx,
                                             "%s%8.8x", j ? " " : "",
                                             get_unaligned(ptr4 + j));
                              if (ret >= linebuflen - lx)
                                      goto overflow1;
                              lx += ret;
                      }
              } else if (groupsize == 2) {
                      const u16 *ptr2 = buf;
      
                      for (j = 0; j < ngroups; j++) {
                              ret = snprintf(linebuf + lx, linebuflen - lx,
                                             "%s%4.4x", j ? " " : "",
                                             get_unaligned(ptr2 + j));
                              if (ret >= linebuflen - lx)
                                      goto overflow1;
                              lx += ret;
                      }
              } else {
                      for (j = 0; j < len; j++) {
                              if (linebuflen < lx + 2)
                                      goto overflow2;
                              ch = ptr[j];
                              linebuf[lx++] = hex_asc_hi(ch);
                              if (linebuflen < lx + 2)
                                      goto overflow2;
                              linebuf[lx++] = hex_asc_lo(ch);
                              if (linebuflen < lx + 2)
                                      goto overflow2;
                              linebuf[lx++] = ' ';
                      }
                      if (j)
                              lx--;
              }
              if (!ascii)
                      goto nil;
      
              while (lx < ascii_column) {
                      if (linebuflen < lx + 2)
                              goto overflow2;
                      linebuf[lx++] = ' ';
              }
              for (j = 0; j < len; j++) {
                      if (linebuflen < lx + 2)
                              goto overflow2;
                      ch = ptr[j];
                      linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
              }
      nil:
              linebuf[lx] = '\0';
              return lx;
      overflow2:
              linebuf[lx++] = '\0';
      overflow1:
              return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1;
      }
      EXPORT_SYMBOL(hex_dump_to_buffer);
      
      #ifdef CONFIG_PRINTK
      /**
       * print_hex_dump - print a text hex dump to syslog for a binary blob of data
       * @level: kernel log level (e.g. KERN_DEBUG)
       * @prefix_str: string to prefix each line with;
       *  caller supplies trailing spaces for alignment if desired
       * @prefix_type: controls whether prefix of an offset, address, or none
       *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
       * @rowsize: number of bytes to print per line; must be 16 or 32
       * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1)
       * @buf: data blob to dump
       * @len: number of bytes in the @buf
       * @ascii: include ASCII after the hex output
       *
       * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump
       * to the kernel log at the specified kernel log level, with an optional
       * leading prefix.
       *
       * print_hex_dump() works on one "line" of output at a time, i.e.,
       * 16 or 32 bytes of input data converted to hex + ASCII output.
       * print_hex_dump() iterates over the entire input @buf, breaking it into
       * "line size" chunks to format and print.
       *
       * E.g.:
       *   print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS,
       *                    16, 1, frame->data, frame->len, true);
       *
       * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode:
       * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
       * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode:
       * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c  pqrstuvwxyz{|}~.
       */
      void print_hex_dump(const char *level, const char *prefix_str, int prefix_type,
                          int rowsize, int groupsize,
                          const void *buf, size_t len, bool ascii)
      {
              const u8 *ptr = buf;
              int i, linelen, remaining = len;
              unsigned char linebuf[32 * 3 + 2 + 32 + 1];
      
              if (rowsize != 16 && rowsize != 32)
                      rowsize = 16;
      
              for (i = 0; i < len; i += rowsize) {
                      linelen = min(remaining, rowsize);
                      remaining -= rowsize;
      
                      hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
                                         linebuf, sizeof(linebuf), ascii);
      
                      switch (prefix_type) {
                      case DUMP_PREFIX_ADDRESS:
                              printk("%s%s%p: %s\n",
                                     level, prefix_str, ptr + i, linebuf);
                              break;
                      case DUMP_PREFIX_OFFSET:
                              printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf);
                              break;
                      default:
                              printk("%s%s%s\n", level, prefix_str, linebuf);
                              break;
                      }
              }
      }
      EXPORT_SYMBOL(print_hex_dump);
      
      #if !defined(CONFIG_DYNAMIC_DEBUG)
      /**
       * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
       * @prefix_str: string to prefix each line with;
       *  caller supplies trailing spaces for alignment if desired
       * @prefix_type: controls whether prefix of an offset, address, or none
       *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
       * @buf: data blob to dump
       * @len: number of bytes in the @buf
       *
       * Calls print_hex_dump(), with log level of KERN_DEBUG,
       * rowsize of 16, groupsize of 1, and ASCII output included.
       */
      void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                const void *buf, size_t len)
      {
              print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1,
                             buf, len, true);
      }
      EXPORT_SYMBOL(print_hex_dump_bytes);
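
       /*
        * Example (a minimal sketch; the prefix string and skb variable are just
        * for illustration): dump a received buffer with offsets, 16 bytes and an
        * ASCII column per line.
        *
        *   print_hex_dump_bytes("skb data: ", DUMP_PREFIX_OFFSET,
        *                        skb->data, skb->len);
        */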
      #endif /* !defined(CONFIG_DYNAMIC_DEBUG) */
      #endif /* defined(CONFIG_PRINTK) */
      /*
       * Implementation of the kernel access vector cache (AVC).
       *
       * Authors:  Stephen Smalley, <sds@epoch.ncsc.mil>
       *             James Morris <jmorris@redhat.com>
       *
       * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
       *        Replaced the avc_lock spinlock by RCU.
       *
       * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
       *
       *        This program is free software; you can redistribute it and/or modify
       *        it under the terms of the GNU General Public License version 2,
       *        as published by the Free Software Foundation.
       */
      #include <linux/types.h>
      #include <linux/stddef.h>
      #include <linux/kernel.h>
      #include <linux/slab.h>
      #include <linux/fs.h>
      #include <linux/dcache.h>
      #include <linux/init.h>
      #include <linux/skbuff.h>
      #include <linux/percpu.h>
      #include <linux/list.h>
      #include <net/sock.h>
      #include <linux/un.h>
      #include <net/af_unix.h>
      #include <linux/ip.h>
      #include <linux/audit.h>
      #include <linux/ipv6.h>
      #include <net/ipv6.h>
      #include "avc.h"
      #include "avc_ss.h"
      #include "classmap.h"
      
      #define AVC_CACHE_SLOTS                        512
      #define AVC_DEF_CACHE_THRESHOLD                512
      #define AVC_CACHE_RECLAIM                16
      
      #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
      #define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
      #else
      #define avc_cache_stats_incr(field)        do {} while (0)
      #endif
      
      struct avc_entry {
              u32                        ssid;
              u32                        tsid;
              u16                        tclass;
              struct av_decision        avd;
              struct avc_xperms_node        *xp_node;
      };
      
      struct avc_node {
              struct avc_entry        ae;
              struct hlist_node        list; /* anchored in avc_cache->slots[i] */
              struct rcu_head                rhead;
      };
      
      struct avc_xperms_decision_node {
              struct extended_perms_decision xpd;
              struct list_head xpd_list; /* list of extended_perms_decision */
      };
      
      struct avc_xperms_node {
              struct extended_perms xp;
              struct list_head xpd_head; /* list head of extended_perms_decision */
      };
      
      struct avc_cache {
              struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
              spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
              atomic_t                lru_hint;        /* LRU hint for reclaim scan */
              atomic_t                active_nodes;
              u32                        latest_notif;        /* latest revocation notification */
      };
      
      struct avc_callback_node {
              int (*callback) (u32 event);
              u32 events;
              struct avc_callback_node *next;
      };
      
       /* Exported via selinuxfs */
      unsigned int avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
      
      #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
      DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
      #endif
      
      static struct avc_cache avc_cache;
      static struct avc_callback_node *avc_callbacks;
      static struct kmem_cache *avc_node_cachep;
      static struct kmem_cache *avc_xperms_data_cachep;
      static struct kmem_cache *avc_xperms_decision_cachep;
      static struct kmem_cache *avc_xperms_cachep;
      
      static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
      {
 2118         return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1);
      }
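
       /*
        * Example (illustrative SIDs): the SID pair and class are folded into one of
        * the AVC_CACHE_SLOTS (512) buckets, so only the low bits matter:
        *
        *   hvalue = avc_hash(ssid, tsid, tclass);
        *          = (ssid ^ (tsid << 2) ^ (tclass << 4)) & 511;
        *
        * The lookup path and the per-slot lock taken by the update paths both use
        * this same index.
        */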
      
      /**
       * avc_dump_av - Display an access vector in human-readable form.
       * @tclass: target security class
       * @av: access vector
       */
      static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
      {
              const char **perms;
              int i, perm;
      
              if (av == 0) {
                      audit_log_format(ab, " null");
                      return;
              }
      
              BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
              perms = secclass_map[tclass-1].perms;
      
              audit_log_format(ab, " {");
              i = 0;
              perm = 1;
              while (i < (sizeof(av) * 8)) {
  149                 if ((perm & av) && perms[i]) {
  149                         audit_log_format(ab, " %s", perms[i]);
                              av &= ~perm;
                      }
  149                 i++;
                      perm <<= 1;
              }
      
  149         if (av)
                      audit_log_format(ab, " 0x%x", av);
      
  149         audit_log_format(ab, " }");
      }
      
      /**
       * avc_dump_query - Display a SID pair and a class in human-readable form.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       */
      static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass)
      {
              int rc;
              char *scontext;
              u32 scontext_len;
      
              rc = security_sid_to_context(ssid, &scontext, &scontext_len);
              if (rc)
                      audit_log_format(ab, "ssid=%d", ssid);
              else {
  149                 audit_log_format(ab, "scontext=%s", scontext);
                      kfree(scontext);
              }
      
  149         rc = security_sid_to_context(tsid, &scontext, &scontext_len);
              if (rc)
                      audit_log_format(ab, " tsid=%d", tsid);
              else {
  149                 audit_log_format(ab, " tcontext=%s", scontext);
                      kfree(scontext);
              }
      
  149         BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
  149         audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name);
      }
      
      /**
       * avc_init - Initialize the AVC.
       *
       * Initialize the access vector cache.
       */
      void __init avc_init(void)
      {
              int i;
      
              for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                      INIT_HLIST_HEAD(&avc_cache.slots[i]);
                      spin_lock_init(&avc_cache.slots_lock[i]);
              }
              atomic_set(&avc_cache.active_nodes, 0);
              atomic_set(&avc_cache.lru_hint, 0);
      
              avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
                                              sizeof(struct avc_xperms_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_decision_cachep = kmem_cache_create(
                                              "avc_xperms_decision_node",
                                              sizeof(struct avc_xperms_decision_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
                                              sizeof(struct extended_perms_data),
                                              0, SLAB_PANIC, NULL);
      
              audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n");
      }
      
      int avc_get_hash_stats(char *page)
      {
              int i, chain_len, max_chain_len, slots_used;
              struct avc_node *node;
              struct hlist_head *head;
      
    8         rcu_read_lock();
      
              slots_used = 0;
              max_chain_len = 0;
    8         for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                      head = &avc_cache.slots[i];
    8                 if (!hlist_empty(head)) {
    8                         slots_used++;
                              chain_len = 0;
    8                         hlist_for_each_entry_rcu(node, head, list)
    8                                 chain_len++;
    8                         if (chain_len > max_chain_len)
                                      max_chain_len = chain_len;
                      }
              }
      
    8         rcu_read_unlock();
      
              return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                               "longest chain: %d\n",
                               atomic_read(&avc_cache.active_nodes),
                               slots_used, AVC_CACHE_SLOTS, max_chain_len);
      }
      
       /*
        * A linked list is used for the extended_perms_decision lookup because the
        * list is always small, i.e. fewer than 5 entries and typically just 1.
        */
      static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
                                              struct avc_xperms_node *xp_node)
      {
              struct avc_xperms_decision_node *xpd_node;
      
              list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) {
                      if (xpd_node->xpd.driver == driver)
                              return &xpd_node->xpd;
              }
              return NULL;
      }
      
      static inline unsigned int
      avc_xperms_has_perm(struct extended_perms_decision *xpd,
                                              u8 perm, u8 which)
      {
              unsigned int rc = 0;
      
              if ((which == XPERMS_ALLOWED) &&
                              (xpd->used & XPERMS_ALLOWED))
                      rc = security_xperm_test(xpd->allowed->p, perm);
              else if ((which == XPERMS_AUDITALLOW) &&
                              (xpd->used & XPERMS_AUDITALLOW))
                      rc = security_xperm_test(xpd->auditallow->p, perm);
              else if ((which == XPERMS_DONTAUDIT) &&
                              (xpd->used & XPERMS_DONTAUDIT))
                      rc = security_xperm_test(xpd->dontaudit->p, perm);
              return rc;
      }
      
      static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                      u8 driver, u8 perm)
      {
              struct extended_perms_decision *xpd;
              security_xperm_set(xp_node->xp.drivers.p, driver);
              xpd = avc_xperms_decision_lookup(driver, xp_node);
              if (xpd && xpd->allowed)
                      security_xperm_set(xpd->allowed->p, perm);
      }
      
      static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
      {
              struct extended_perms_decision *xpd;
      
              xpd = &xpd_node->xpd;
              if (xpd->allowed)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->allowed);
              if (xpd->auditallow)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow);
              if (xpd->dontaudit)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit);
              kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
      }
      
      static void avc_xperms_free(struct avc_xperms_node *xp_node)
      {
              struct avc_xperms_decision_node *xpd_node, *tmp;
      
   56         if (!xp_node)
                      return;
      
              list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
                      list_del(&xpd_node->xpd_list);
                      avc_xperms_decision_free(xpd_node);
              }
   56         kmem_cache_free(avc_xperms_cachep, xp_node);
      }
      
      static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                              struct extended_perms_decision *src)
      {
              dest->driver = src->driver;
              dest->used = src->used;
              if (dest->used & XPERMS_ALLOWED)
                      memcpy(dest->allowed->p, src->allowed->p,
                                      sizeof(src->allowed->p));
              if (dest->used & XPERMS_AUDITALLOW)
                      memcpy(dest->auditallow->p, src->auditallow->p,
                                      sizeof(src->auditallow->p));
              if (dest->used & XPERMS_DONTAUDIT)
                      memcpy(dest->dontaudit->p, src->dontaudit->p,
                                      sizeof(src->dontaudit->p));
      }
      
      /*
       * similar to avc_copy_xperms_decision, but only copy decision
       * information relevant to this perm
       */
      static inline void avc_quick_copy_xperms_decision(u8 perm,
                              struct extended_perms_decision *dest,
                              struct extended_perms_decision *src)
      {
              /*
               * compute index of the u32 of the 256 bits (8 u32s) that contain this
               * command permission
               */
              u8 i = perm >> 5;
      
              dest->used = src->used;
              if (dest->used & XPERMS_ALLOWED)
                      dest->allowed->p[i] = src->allowed->p[i];
              if (dest->used & XPERMS_AUDITALLOW)
                      dest->auditallow->p[i] = src->auditallow->p[i];
              if (dest->used & XPERMS_DONTAUDIT)
                      dest->dontaudit->p[i] = src->dontaudit->p[i];
      }
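
       /*
        * Example (illustrative extended permission): for perm = 0x45 the containing
        * word is i = 0x45 >> 5 = 2, so only word 2 of the 8-word (256-bit) bitmaps
        * is copied; within that word the permission is bit 0x45 & 0x1f = 5.
        */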
      
      static struct avc_xperms_decision_node
                      *avc_xperms_decision_alloc(u8 which)
      {
              struct avc_xperms_decision_node *xpd_node;
              struct extended_perms_decision *xpd;
      
              xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
              if (!xpd_node)
                      return NULL;
      
              xpd = &xpd_node->xpd;
              if (which & XPERMS_ALLOWED) {
                      xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->allowed)
                              goto error;
              }
              if (which & XPERMS_AUDITALLOW) {
                      xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->auditallow)
                              goto error;
              }
              if (which & XPERMS_DONTAUDIT) {
                      xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->dontaudit)
                              goto error;
              }
              return xpd_node;
      error:
              avc_xperms_decision_free(xpd_node);
              return NULL;
      }
      
      static int avc_add_xperms_decision(struct avc_node *node,
                              struct extended_perms_decision *src)
      {
              struct avc_xperms_decision_node *dest_xpd;
      
              node->ae.xp_node->xp.len++;
              dest_xpd = avc_xperms_decision_alloc(src->used);
              if (!dest_xpd)
                      return -ENOMEM;
              avc_copy_xperms_decision(&dest_xpd->xpd, src);
              list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
              return 0;
      }
      
      static struct avc_xperms_node *avc_xperms_alloc(void)
      {
              struct avc_xperms_node *xp_node;
      
              xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
              if (!xp_node)
                      return xp_node;
              INIT_LIST_HEAD(&xp_node->xpd_head);
              return xp_node;
      }
      
      static int avc_xperms_populate(struct avc_node *node,
                                      struct avc_xperms_node *src)
      {
              struct avc_xperms_node *dest;
              struct avc_xperms_decision_node *dest_xpd;
              struct avc_xperms_decision_node *src_xpd;
      
              if (src->xp.len == 0)
                      return 0;
              dest = avc_xperms_alloc();
              if (!dest)
                      return -ENOMEM;
      
              memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
              dest->xp.len = src->xp.len;
      
              /* for each source xpd allocate a destination xpd and copy */
              list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
                      dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used);
                      if (!dest_xpd)
                              goto error;
                      avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd);
                      list_add(&dest_xpd->xpd_list, &dest->xpd_head);
              }
              node->ae.xp_node = dest;
              return 0;
      error:
              avc_xperms_free(dest);
              return -ENOMEM;
      
      }
      
      static inline u32 avc_xperms_audit_required(u32 requested,
                                              struct av_decision *avd,
                                              struct extended_perms_decision *xpd,
                                              u8 perm,
                                              int result,
                                              u32 *deniedp)
      {
              u32 denied, audited;
      
              denied = requested & ~avd->allowed;
              if (unlikely(denied)) {
    1                 audited = denied & avd->auditdeny;
    1                 if (audited && xpd) {
                              if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
                                      audited &= ~requested;
                      }
 1978         } else if (result) {
                      audited = denied = requested;
              } else {
 1978                 audited = requested & avd->auditallow;
                      if (audited && xpd) {
                              if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                                      audited &= ~requested;
                      }
              }
      
              *deniedp = denied;
              return audited;
      }
      
      static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
                                      u32 requested, struct av_decision *avd,
                                      struct extended_perms_decision *xpd,
                                      u8 perm, int result,
                                      struct common_audit_data *ad)
      {
              u32 audited, denied;
      
 1979         audited = avc_xperms_audit_required(
                              requested, avd, xpd, perm, result, &denied);
 1979         if (likely(!audited))
                      return 0;
    1         return slow_avc_audit(ssid, tsid, tclass, requested,
                              audited, denied, result, ad, 0);
      }
      
      static void avc_node_free(struct rcu_head *rhead)
      {
              struct avc_node *node = container_of(rhead, struct avc_node, rhead);
              avc_xperms_free(node->ae.xp_node);
              kmem_cache_free(avc_node_cachep, node);
              avc_cache_stats_incr(frees);
      }
      
      static void avc_node_delete(struct avc_node *node)
      {
 2064         hlist_del_rcu(&node->list);
              call_rcu(&node->rhead, avc_node_free);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static void avc_node_kill(struct avc_node *node)
      {
   56         avc_xperms_free(node->ae.xp_node);
              kmem_cache_free(avc_node_cachep, node);
              avc_cache_stats_incr(frees);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static void avc_node_replace(struct avc_node *new, struct avc_node *old)
      {
  120         hlist_replace_rcu(&old->list, &new->list);
              call_rcu(&old->rhead, avc_node_free);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static inline int avc_reclaim_node(void)
      {
              struct avc_node *node;
              int hvalue, try, ecx;
              unsigned long flags;
              struct hlist_head *head;
              spinlock_t *lock;
      
 2097         for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
 2097                 hvalue = atomic_inc_return(&avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1);
                      head = &avc_cache.slots[hvalue];
                      lock = &avc_cache.slots_lock[hvalue];
      
    3                 if (!spin_trylock_irqsave(lock, flags))
                              continue;
      
 2097                 rcu_read_lock();
 2097                 hlist_for_each_entry(node, head, list) {
 2058                         avc_node_delete(node);
                              avc_cache_stats_incr(reclaims);
                              ecx++;
                              if (ecx >= AVC_CACHE_RECLAIM) {
    1                                 rcu_read_unlock();
                                      spin_unlock_irqrestore(lock, flags);
                                      goto out;
                              }
                      }
 2097                 rcu_read_unlock();
                      spin_unlock_irqrestore(lock, flags);
              }
      out:
              return ecx;
      }
      
 2097 static struct avc_node *avc_alloc_node(void)
      {
              struct avc_node *node;
      
 2120         node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
              if (!node)
                      goto out;
      
 2120         INIT_HLIST_NODE(&node->list);
              avc_cache_stats_incr(allocations);
      
              if (atomic_inc_return(&avc_cache.active_nodes) > avc_cache_threshold)
 2097                 avc_reclaim_node();
      
      out:
 2120         return node;
      }
      
      static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
      {
              node->ae.ssid = ssid;
              node->ae.tsid = tsid;
              node->ae.tclass = tclass;
              memcpy(&node->ae.avd, avd, sizeof(node->ae.avd));
      }
      
      static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
      {
              struct avc_node *node, *ret = NULL;
              int hvalue;
              struct hlist_head *head;
      
              hvalue = avc_hash(ssid, tsid, tclass);
              head = &avc_cache.slots[hvalue];
 7546         hlist_for_each_entry_rcu(node, head, list) {
 7546                 if (ssid == node->ae.ssid &&
 8180                     tclass == node->ae.tclass &&
 7543                     tsid == node->ae.tsid) {
                              ret = node;
                              break;
                      }
              }
      
              return ret;
      }
      
      /**
       * avc_lookup - Look up an AVC entry.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       *
        * Look up an AVC entry that is valid for the SID pair
        * (@ssid, @tsid), interpreting the permissions
       * based on @tclass.  If a valid AVC entry exists,
       * then this function returns the avc_node.
       * Otherwise, this function returns NULL.
       */
      static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass)
      {
              struct avc_node *node;
      
 8180         avc_cache_stats_incr(lookups);
 8180         node = avc_search_node(ssid, tsid, tclass);
      
              if (node)
                      return node;
      
 2119         avc_cache_stats_incr(misses);
              return NULL;
      }
      
      static int avc_latest_notif_update(int seqno, int is_insert)
      {
              int ret = 0;
              static DEFINE_SPINLOCK(notif_lock);
              unsigned long flag;
      
    6         spin_lock_irqsave(&notif_lock, flag);
              if (is_insert) {
                      if (seqno < avc_cache.latest_notif) {
                              printk(KERN_WARNING "SELinux: avc:  seqno %d < latest_notif %d\n",
                                     seqno, avc_cache.latest_notif);
                              ret = -EAGAIN;
                      }
              } else {
                      if (seqno > avc_cache.latest_notif)
    6                         avc_cache.latest_notif = seqno;
              }
 2118         spin_unlock_irqrestore(&notif_lock, flag);
      
              return ret;
      }
      
      /**
       * avc_insert - Insert an AVC entry.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @avd: resulting av decision
       * @xp_node: resulting extended permissions
       *
       * Insert an AVC entry for the SID pair
       * (@ssid, @tsid) and class @tclass.
       * The access vectors and the sequence number are
       * normally provided by the security server in
       * response to a security_compute_av() call.  If the
       * sequence number @avd->seqno is not less than the latest
        * revocation notification, then the function copies
        * the access vectors into a cache entry and returns
        * the inserted avc_node. Otherwise, this function returns NULL.
       */
      static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass,
                                      struct av_decision *avd,
                                      struct avc_xperms_node *xp_node)
      {
              struct avc_node *pos, *node = NULL;
              int hvalue;
              unsigned long flag;
      
 2118         if (avc_latest_notif_update(avd->seqno, 1))
                      goto out;
      
              node = avc_alloc_node();
              if (node) {
                      struct hlist_head *head;
                      spinlock_t *lock;
                      int rc = 0;
      
 2118                 hvalue = avc_hash(ssid, tsid, tclass);
                      avc_node_populate(node, ssid, tsid, tclass, avd);
                      rc = avc_xperms_populate(node, xp_node);
                      if (rc) {
                              kmem_cache_free(avc_node_cachep, node);
                              return NULL;
                      }
 2118                 head = &avc_cache.slots[hvalue];
                      lock = &avc_cache.slots_lock[hvalue];
      
                      spin_lock_irqsave(lock, flag);
  119                 hlist_for_each_entry(pos, head, list) {
  119                         if (pos->ae.ssid == ssid &&
  119                             pos->ae.tsid == tsid &&
  112                             pos->ae.tclass == tclass) {
  112                                 avc_node_replace(node, pos);
                                      goto found;
                              }
                      }
 2115                 hlist_add_head_rcu(&node->list, head);
      found:
 2118                 spin_unlock_irqrestore(lock, flag);
              }
      out:
              return node;
      }
      
      /**
        * avc_audit_pre_callback - log the SELinux-specific prefix of an audit
        * record; called by the generic audit code
       * @ab: the audit buffer
       * @a: audit_data
       */
      static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
      {
              struct common_audit_data *ad = a;
  149         audit_log_format(ab, "avc:  %s ",
  149                          ad->selinux_audit_data->denied ? "denied" : "granted");
  149         avc_dump_av(ab, ad->selinux_audit_data->tclass,
                              ad->selinux_audit_data->audited);
  149         audit_log_format(ab, " for ");
      }
      
      /**
        * avc_audit_post_callback - log the SELinux-specific suffix of an audit
        * record; called by the generic audit code
       * @ab: the audit buffer
       * @a: audit_data
       */
      static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
      {
              struct common_audit_data *ad = a;
  149         audit_log_format(ab, " ");
  149         avc_dump_query(ab, ad->selinux_audit_data->ssid,
                                 ad->selinux_audit_data->tsid,
                                 ad->selinux_audit_data->tclass);
              if (ad->selinux_audit_data->denied) {
                      audit_log_format(ab, " permissive=%u",
  149                                  ad->selinux_audit_data->result ? 0 : 1);
              }
  149 }
      
      /* This is the slow part of avc audit with big stack footprint */
      noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
                      u32 requested, u32 audited, u32 denied, int result,
                      struct common_audit_data *a,
                      unsigned flags)
      {
              struct common_audit_data stack_data;
              struct selinux_audit_data sad;
      
  149         if (!a) {
                      a = &stack_data;
  133                 a->type = LSM_AUDIT_DATA_NONE;
              }
      
              /*
                * When in an RCU walk, do the audit on the RCU retry.  This is because
               * the collection of the dname in an inode audit message is not RCU
               * safe.  Note this may drop some audits when the situation changes
               * during retry. However this is logically just as if the operation
               * happened a little later.
               */
   44         if ((a->type == LSM_AUDIT_DATA_INODE) &&
    1             (flags & MAY_NOT_BLOCK))
                      return -ECHILD;
      
  149         sad.tclass = tclass;
              sad.requested = requested;
              sad.ssid = ssid;
              sad.tsid = tsid;
              sad.audited = audited;
              sad.denied = denied;
              sad.result = result;
      
              a->selinux_audit_data = &sad;
      
              common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);
  150         return 0;
      }
      
      /**
       * avc_add_callback - Register a callback for security events.
       * @callback: callback function
       * @events: security events
       *
       * Register a callback function for events in the set @events.
       * Returns %0 on success or -%ENOMEM if insufficient memory
       * exists to add the callback.
       */
      int __init avc_add_callback(int (*callback)(u32 event), u32 events)
      {
              struct avc_callback_node *c;
              int rc = 0;
      
              c = kmalloc(sizeof(*c), GFP_KERNEL);
              if (!c) {
                      rc = -ENOMEM;
                      goto out;
              }
      
              c->callback = callback;
              c->events = events;
              c->next = avc_callbacks;
              avc_callbacks = c;
      out:
              return rc;
      }
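
       /*
        * Example (a minimal sketch; the callback name is hypothetical): register a
        * handler that runs when the AVC is reset after a policy (re)load.
        *
        *   static int my_flush_cb(u32 event)
        *   {
        *           return 0;       // event is AVC_CALLBACK_RESET here
        *   }
        *
        *   rc = avc_add_callback(my_flush_cb, AVC_CALLBACK_RESET);
        */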
      
      /**
        * avc_update_node - Update an AVC entry
        * @event: updating event
        * @perms: permission mask bits
        * @driver: extended permission driver (set selector)
        * @xperm: extended permission within the driver's set
        * @ssid, @tsid, @tclass: identifier of an AVC entry
        * @seqno: sequence number when decision was made
        * @xpd: extended_perms_decision to be added to the node
        * @flags: AVC_* flags, e.g. AVC_EXTENDED_PERMS
        *
        * If a valid AVC entry doesn't exist, this function returns -ENOENT.
        * If the replacement node cannot be allocated, it returns -ENOMEM.
        * Otherwise, this function updates the AVC entry; the original entry
        * is released later by RCU.
       */
      static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
                              u32 tsid, u16 tclass, u32 seqno,
                              struct extended_perms_decision *xpd,
                              u32 flags)
      {
              int hvalue, rc = 0;
              unsigned long flag;
              struct avc_node *pos, *node, *orig = NULL;
              struct hlist_head *head;
              spinlock_t *lock;
      
   59         node = avc_alloc_node();
              if (!node) {
                      rc = -ENOMEM;
                      goto out;
              }
      
              /* Lock the target slot */
   59         hvalue = avc_hash(ssid, tsid, tclass);
      
              head = &avc_cache.slots[hvalue];
              lock = &avc_cache.slots_lock[hvalue];
      
              spin_lock_irqsave(lock, flag);
      
    8         hlist_for_each_entry(pos, head, list) {
    8                 if (ssid == pos->ae.ssid &&
    8                     tsid == pos->ae.tsid &&
    8                     tclass == pos->ae.tclass &&
    8                     seqno == pos->ae.avd.seqno){
                              orig = pos;
                              break;
                      }
              }
      
              if (!orig) {
                      rc = -ENOENT;
   56                 avc_node_kill(node);
                      goto out_unlock;
              }
      
              /*
               * Copy and replace original node.
               */
      
    8         avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);
      
              if (orig->ae.xp_node) {
                      rc = avc_xperms_populate(node, orig->ae.xp_node);
                      if (rc) {
                              kmem_cache_free(avc_node_cachep, node);
                              goto out_unlock;
                      }
              }
      
    8         switch (event) {
              case AVC_CALLBACK_GRANT:
    8                 node->ae.avd.allowed |= perms;
                      if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                              avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
                      break;
              case AVC_CALLBACK_TRY_REVOKE:
              case AVC_CALLBACK_REVOKE:
                      node->ae.avd.allowed &= ~perms;
                      break;
              case AVC_CALLBACK_AUDITALLOW_ENABLE:
                      node->ae.avd.auditallow |= perms;
                      break;
              case AVC_CALLBACK_AUDITALLOW_DISABLE:
                      node->ae.avd.auditallow &= ~perms;
                      break;
              case AVC_CALLBACK_AUDITDENY_ENABLE:
                      node->ae.avd.auditdeny |= perms;
                      break;
              case AVC_CALLBACK_AUDITDENY_DISABLE:
                      node->ae.avd.auditdeny &= ~perms;
                      break;
              case AVC_CALLBACK_ADD_XPERMS:
                      avc_add_xperms_decision(node, xpd);
                      break;
              }
    8         avc_node_replace(node, orig);
      out_unlock:
   59         spin_unlock_irqrestore(lock, flag);
      out:
   59         return rc;
      }
      
      /**
       * avc_flush - Flush the cache
       */
      static void avc_flush(void)
    6 {
              struct hlist_head *head;
              struct avc_node *node;
              spinlock_t *lock;
              unsigned long flag;
              int i;
      
              for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                      head = &avc_cache.slots[i];
                      lock = &avc_cache.slots_lock[i];
      
    6                 spin_lock_irqsave(lock, flag);
                      /*
                       * With preemptable RCU, the outer spinlock does not
                       * prevent RCU grace periods from ending.
                       */
    6                 rcu_read_lock();
    6                 hlist_for_each_entry(node, head, list)
    6                         avc_node_delete(node);
    6                 rcu_read_unlock();
                      spin_unlock_irqrestore(lock, flag);
              }
    6 }
      
      /**
       * avc_ss_reset - Flush the cache and revalidate migrated permissions.
       * @seqno: policy sequence number
       */
      int avc_ss_reset(u32 seqno)
      {
              struct avc_callback_node *c;
              int rc = 0, tmprc;
      
    6         avc_flush();
      
    6         for (c = avc_callbacks; c; c = c->next) {
    6                 if (c->events & AVC_CALLBACK_RESET) {
    6                         tmprc = c->callback(AVC_CALLBACK_RESET);
                              /* save the first error encountered for the return
                                 value and continue processing the callbacks */
                              if (!rc)
                                      rc = tmprc;
                      }
              }
      
    6         avc_latest_notif_update(seqno, 0);
              return rc;
      }
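
/*
 * Illustrative sketch (not part of the original source): the shape of a
 * reset callback as invoked by avc_ss_reset() above.  A real callback is
 * registered elsewhere with the AVC_CALLBACK_RESET event bit set; the
 * function name below is hypothetical.
 */
static int example_reset_callback(u32 event)
{
	if (event == AVC_CALLBACK_RESET) {
		/* revalidate any permissions cached outside the AVC here */
	}
	return 0;
}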
      
      /*
       * Slow-path helper function for avc_has_perm_noaudit,
       * when the avc_node lookup fails. We get called with
       * the RCU read lock held, and need to return with it
 * still held, but drop it for the security compute.
       *
       * Don't inline this, since it's the slow-path and just
       * results in a bigger stack frame.
       */
      static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid,
                               u16 tclass, struct av_decision *avd,
                               struct avc_xperms_node *xp_node)
      {
 2119         rcu_read_unlock();
              INIT_LIST_HEAD(&xp_node->xpd_head);
              security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
 2118         rcu_read_lock();
 2118         return avc_insert(ssid, tsid, tclass, avd, xp_node);
      }
      
      static noinline int avc_denied(u32 ssid, u32 tsid,
                                      u16 tclass, u32 requested,
                                      u8 driver, u8 xperm, unsigned flags,
                                      struct av_decision *avd)
      {
  154         if (flags & AVC_STRICT)
                      return -EACCES;
      
  150         if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE))
                      return -EACCES;
      
   59         avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid,
                                      tsid, tclass, avd->seqno, NULL, flags);
  154         return 0;
      }
      
      /*
       * The avc extended permissions logic adds an additional 256 bits of
       * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions are not adequate,
 * as is the case with ioctls, then multiple sets may be chained together and
 * the driver field is used to specify which set contains the permission.
       */
      int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                              u8 driver, u8 xperm, struct common_audit_data *ad)
      {
              struct avc_node *node;
              struct av_decision avd;
              u32 denied;
              struct extended_perms_decision local_xpd;
              struct extended_perms_decision *xpd = NULL;
              struct extended_perms_data allowed;
              struct extended_perms_data auditallow;
              struct extended_perms_data dontaudit;
              struct avc_xperms_node local_xp_node;
              struct avc_xperms_node *xp_node;
              int rc = 0, rc2;
      
              xp_node = &local_xp_node;
 1979         BUG_ON(!requested);
      
 1979         rcu_read_lock();
      
 1979         node = avc_lookup(ssid, tsid, tclass);
              if (unlikely(!node)) {
  466                 node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
              } else {
 1654                 memcpy(&avd, &node->ae.avd, sizeof(avd));
                      xp_node = node->ae.xp_node;
              }
              /* if extended permissions are not defined, only consider av_decision */
  466         if (!xp_node || !xp_node->xp.len)
                      goto decision;
      
              local_xpd.allowed = &allowed;
              local_xpd.auditallow = &auditallow;
              local_xpd.dontaudit = &dontaudit;
      
              xpd = avc_xperms_decision_lookup(driver, xp_node);
              if (unlikely(!xpd)) {
                      /*
                       * Compute the extended_perms_decision only if the driver
                       * is flagged
                       */
                      if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
                              avd.allowed &= ~requested;
                              goto decision;
                      }
                      rcu_read_unlock();
                      security_compute_xperms_decision(ssid, tsid, tclass, driver,
                                                      &local_xpd);
                      rcu_read_lock();
                      avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm,
                                      ssid, tsid, tclass, avd.seqno, &local_xpd, 0);
              } else {
                      avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
              }
              xpd = &local_xpd;
      
              if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                      avd.allowed &= ~requested;
      
      decision:
 1979         denied = requested & ~(avd.allowed);
              if (unlikely(denied))
    1                 rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm,
                                      AVC_EXTENDED_PERMS, &avd);
      
 1979         rcu_read_unlock();
      
 1979         rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
                              &avd, xpd, xperm, rc, ad);
              if (rc2)
                      return rc2;
              return rc;
      }
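
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * source): one plausible way a caller could split an ioctl command number
 * into the driver and xperm bytes consumed by avc_has_extended_perms(),
 * following the convention described in the comment above where the driver
 * byte selects which 256-bit extended-permission set to consult.
 */
static int example_ioctl_has_perm(u32 ssid, u32 tsid, u16 tclass,
				  u32 requested, unsigned int cmd,
				  struct common_audit_data *ad)
{
	u8 driver = cmd >> 8;	/* selects the 256-bit xperm set */
	u8 xperm = cmd & 0xff;	/* bit index within that set */

	return avc_has_extended_perms(ssid, tsid, tclass, requested,
				      driver, xperm, ad);
}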
      
      /**
       * avc_has_perm_noaudit - Check permissions but perform no auditing.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @requested: requested permissions, interpreted based on @tclass
       * @flags:  AVC_STRICT or 0
       * @avd: access vector decisions
       *
       * Check the AVC to determine whether the @requested permissions are granted
       * for the SID pair (@ssid, @tsid), interpreting the permissions
       * based on @tclass, and call the security server on a cache miss to obtain
       * a new decision and add it to the cache.  Return a copy of the decisions
       * in @avd.  Return %0 if all @requested permissions are granted,
       * -%EACCES if any permissions are denied, or another -errno upon
       * other errors.  This function is typically called by avc_has_perm(),
       * but may also be called directly to separate permission checking from
       * auditing, e.g. in cases where a lock must be held for the check but
       * should be released for the auditing.
       */
      inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                               u16 tclass, u32 requested,
                               unsigned flags,
                               struct av_decision *avd)
      {
              struct avc_node *node;
              struct avc_xperms_node xp_node;
              int rc = 0;
              u32 denied;
      
 3717         BUG_ON(!requested);
      
 7467         rcu_read_lock();
      
 7467         node = avc_lookup(ssid, tsid, tclass);
              if (unlikely(!node))
 1904                 node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node);
              else
 6968                 memcpy(avd, &node->ae.avd, sizeof(*avd));
      
 7466         denied = requested & ~(avd->allowed);
              if (unlikely(denied))
  153                 rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd);
      
 7466         rcu_read_unlock();
              return rc;
      }
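
/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * source): the split check/audit pattern described in the kernel-doc
 * above -- perform the permission check while a caller-supplied lock is
 * held, then drop the lock before auditing.
 */
static int example_check_then_audit(spinlock_t *lock, u32 ssid, u32 tsid,
				    u16 tclass, u32 requested,
				    struct common_audit_data *ad)
{
	struct av_decision avd;
	int rc, rc2;

	spin_lock(lock);
	rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);
	spin_unlock(lock);

	rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, ad, 0);
	if (rc2)
		return rc2;
	return rc;
}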
      
      /**
       * avc_has_perm - Check permissions and perform any appropriate auditing.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @requested: requested permissions, interpreted based on @tclass
       * @auditdata: auxiliary audit data
       *
       * Check the AVC to determine whether the @requested permissions are granted
       * for the SID pair (@ssid, @tsid), interpreting the permissions
       * based on @tclass, and call the security server on a cache miss to obtain
       * a new decision and add it to the cache.  Audit the granting or denial of
       * permissions in accordance with the policy.  Return %0 if all @requested
       * permissions are granted, -%EACCES if any permissions are denied, or
       * another -errno upon other errors.
       */
      int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
                       u32 requested, struct common_audit_data *auditdata)
      {
              struct av_decision avd;
              int rc, rc2;
      
 7083         rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);
      
 7083         rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata, 0);
              if (rc2)
                      return rc2;
              return rc;
      }
      
      int avc_has_perm_flags(u32 ssid, u32 tsid, u16 tclass,
                             u32 requested, struct common_audit_data *auditdata,
                             int flags)
      {
              struct av_decision avd;
              int rc, rc2;
      
  679         rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);
      
  679         rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc,
                              auditdata, flags);
              if (rc2)
                      return rc2;
              return rc;
      }
      
      u32 avc_policy_seqno(void)
      {
 3291         return avc_cache.latest_notif;
      }
      
      void avc_disable(void)
      {
              /*
               * If you are looking at this because you have realized that we are
               * not destroying the avc_node_cachep it might be easy to fix, but
               * I don't know the memory barrier semantics well enough to know.  It's
               * possible that some other task dereferenced security_ops when
               * it still pointed to selinux operations.  If that is the case it's
               * possible that it is about to use the avc and is about to need the
               * avc_node_cachep.  I know I could wrap the security.c security_ops call
               * in an rcu_lock, but seriously, it's not worth it.  Instead I just flush
               * the cache and get that memory back.
               */
              if (avc_node_cachep) {
                      avc_flush();
                      /* kmem_cache_destroy(avc_node_cachep); */
              }
      }
      /* binder_alloc.c
       *
       * Android IPC Subsystem
       *
       * Copyright (C) 2007-2017 Google, Inc.
       *
       * This software is licensed under the terms of the GNU General Public
       * License version 2, as published by the Free Software Foundation, and
       * may be copied, distributed, and modified under those terms.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       *
       */
      
      #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      
      #include <asm/cacheflush.h>
      #include <linux/list.h>
      #include <linux/mm.h>
      #include <linux/module.h>
      #include <linux/rtmutex.h>
      #include <linux/rbtree.h>
      #include <linux/seq_file.h>
      #include <linux/vmalloc.h>
      #include <linux/slab.h>
      #include <linux/sched.h>
      #include <linux/list_lru.h>
      #include "binder_alloc.h"
      #include "binder_trace.h"
      
      struct list_lru binder_alloc_lru;
      
      static DEFINE_MUTEX(binder_alloc_mmap_lock);
      
      enum {
              BINDER_DEBUG_OPEN_CLOSE             = 1U << 1,
              BINDER_DEBUG_BUFFER_ALLOC           = 1U << 2,
              BINDER_DEBUG_BUFFER_ALLOC_ASYNC     = 1U << 3,
      };
      static uint32_t binder_alloc_debug_mask;
      
      module_param_named(debug_mask, binder_alloc_debug_mask,
                         uint, S_IWUSR | S_IRUGO);
      
      #define binder_alloc_debug(mask, x...) \
              do { \
                      if (binder_alloc_debug_mask & mask) \
                              pr_info(x); \
              } while (0)
      
      static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer)
      {
              return list_entry(buffer->entry.next, struct binder_buffer, entry);
      }
      
      static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer)
      {
              return list_entry(buffer->entry.prev, struct binder_buffer, entry);
      }
      
   64 static size_t binder_alloc_buffer_size(struct binder_alloc *alloc,
                                             struct binder_buffer *buffer)
      {
   64         if (list_is_last(&buffer->entry, &alloc->buffers))
   64                 return (u8 *)alloc->buffer +
                              alloc->buffer_size - (u8 *)buffer->data;
   56         return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data;
      }
      
      static void binder_insert_free_buffer(struct binder_alloc *alloc,
                                            struct binder_buffer *new_buffer)
      {
   63         struct rb_node **p = &alloc->free_buffers.rb_node;
              struct rb_node *parent = NULL;
              struct binder_buffer *buffer;
              size_t buffer_size;
              size_t new_buffer_size;
      
              BUG_ON(!new_buffer->free);
      
   63         new_buffer_size = binder_alloc_buffer_size(alloc, new_buffer);
      
   63         binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                           "%d: add free buffer, size %zd, at %pK\n",
                            alloc->pid, new_buffer_size, new_buffer);
      
   63         while (*p) {
                      parent = *p;
                      buffer = rb_entry(parent, struct binder_buffer, rb_node);
   55                 BUG_ON(!buffer->free);
      
   55                 buffer_size = binder_alloc_buffer_size(alloc, buffer);
      
   55                 if (new_buffer_size < buffer_size)
    3                         p = &parent->rb_left;
                      else
   52                         p = &parent->rb_right;
              }
   63         rb_link_node(&new_buffer->rb_node, parent, p);
              rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers);
      }
      
      static void binder_insert_allocated_buffer_locked(
                      struct binder_alloc *alloc, struct binder_buffer *new_buffer)
      {
              struct rb_node **p = &alloc->allocated_buffers.rb_node;
              struct rb_node *parent = NULL;
              struct binder_buffer *buffer;
      
              BUG_ON(new_buffer->free);
      
   55         while (*p) {
                      parent = *p;
                      buffer = rb_entry(parent, struct binder_buffer, rb_node);
    4                 BUG_ON(buffer->free);
      
    4                 if (new_buffer->data < buffer->data)
                              p = &parent->rb_left;
    4                 else if (new_buffer->data > buffer->data)
    4                         p = &parent->rb_right;
                      else
                              BUG();
              }
   55         rb_link_node(&new_buffer->rb_node, parent, p);
              rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers);
      }
      
      static struct binder_buffer *binder_alloc_prepare_to_free_locked(
                      struct binder_alloc *alloc,
                      uintptr_t user_ptr)
      {
              struct rb_node *n = alloc->allocated_buffers.rb_node;
              struct binder_buffer *buffer;
              void *kern_ptr;
      
              kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset);
      
    1         while (n) {
    1                 buffer = rb_entry(n, struct binder_buffer, rb_node);
    2                 BUG_ON(buffer->free);
      
    2                 if (kern_ptr < buffer->data)
    1                         n = n->rb_left;
    1                 else if (kern_ptr > buffer->data)
                              n = n->rb_right;
                      else {
                              /*
                               * Guard against user threads attempting to
                               * free the buffer when in use by kernel or
                               * after it's already been freed.
                               */
    1                         if (!buffer->allow_user_free)
                                      return ERR_PTR(-EPERM);
                              buffer->allow_user_free = 0;
                              return buffer;
                      }
              }
              return NULL;
      }
      
      /**
 * binder_alloc_prepare_to_free() - get buffer given user ptr
 * @alloc:        binder_alloc for this proc
 * @user_ptr:        User pointer to buffer data
 *
 * Validate the userspace pointer to buffer data and return the buffer
 * corresponding to that user pointer. Search the rb tree of allocated
 * buffers for the one whose data pointer matches.
 *
 * Return:        Pointer to the buffer, an ERR_PTR() if the buffer is not
 *                allowed to be freed by userspace, or NULL if no matching
 *                buffer is found
       */
      struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc,
                                                         uintptr_t user_ptr)
      {
              struct binder_buffer *buffer;
      
    3         mutex_lock(&alloc->mutex);
    2         buffer = binder_alloc_prepare_to_free_locked(alloc, user_ptr);
    3         mutex_unlock(&alloc->mutex);
              return buffer;
      }
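
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * source): the fixed user/kernel address relationship relied on by the
 * lookup above.  A buffer's userspace address and its kernel address
 * differ by the constant alloc->user_buffer_offset established in
 * binder_alloc_mmap_handler().
 */
static uintptr_t example_user_ptr_of(struct binder_alloc *alloc,
				     struct binder_buffer *buffer)
{
	return (uintptr_t)buffer->data + alloc->user_buffer_offset;
}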
      
   53 static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
                                          void *start, void *end)
      {
              void *page_addr;
              unsigned long user_page_addr;
              struct binder_lru_page *page;
              struct vm_area_struct *vma = NULL;
              struct mm_struct *mm = NULL;
              bool need_mm = false;
      
   57         binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                           "%d: %s pages %pK-%pK\n", alloc->pid,
                           allocate ? "allocate" : "free", start, end);
      
   57         if (end <= start)
   57                 return 0;
      
   53         trace_binder_update_page_range(alloc, allocate, start, end);
      
   53         if (allocate == 0)
                      goto free_range;
      
    4         for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
   52                 page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE];
                      if (!page->page_ptr) {
                              need_mm = true;
                              break;
                      }
              }
      
              /* Same as mmget_not_zero() in later kernel versions */
   48         if (need_mm && atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users))
   48                 mm = alloc->vma_vm_mm;
      
              if (mm) {
   48                 down_read(&mm->mmap_sem);
                      if (!mmget_still_valid(mm)) {
                              if (allocate == 0)
                                      goto free_range;
   48                         goto err_no_vma;
                      }
                      vma = alloc->vma;
              }
      
              if (!vma && need_mm) {
    4                 pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
                              alloc->pid);
                      goto err_no_vma;
              }
      
   52         for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
                      int ret;
                      bool on_lru;
                      size_t index;
    4 
                      index = (page_addr - alloc->buffer) / PAGE_SIZE;
    4                 page = &alloc->pages[index];
      
                      if (page->page_ptr) {
    4                         trace_binder_alloc_lru_start(alloc, index);
   51 
                              on_lru = list_lru_del(&binder_alloc_lru, &page->lru);
                              WARN_ON(!on_lru);
   48 
                              trace_binder_alloc_lru_end(alloc, index);
                              continue;
   48                 }
   48 
                      if (WARN_ON(!vma))
                              goto err_page_ptr_cleared;
      
                      trace_binder_alloc_page_start(alloc, index);
                      page->page_ptr = alloc_page(GFP_KERNEL |
                                                  __GFP_HIGHMEM |
                                                  __GFP_ZERO);
   48                 if (!page->page_ptr) {
                              pr_err("%d: binder_alloc_buf failed for page at %pK\n",
                                      alloc->pid, page_addr);
                              goto err_alloc_page_failed;
                      }
                      page->alloc = alloc;
                      INIT_LIST_HEAD(&page->lru);
      
                      ret = map_kernel_range_noflush((unsigned long)page_addr,
                                                     PAGE_SIZE, PAGE_KERNEL,
                                                     &page->page_ptr);
                      flush_cache_vmap((unsigned long)page_addr,
                                      (unsigned long)page_addr + PAGE_SIZE);
                      if (ret != 1) {
   48                         pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n",
                                     alloc->pid, page_addr);
                              goto err_map_kernel_failed;
    1                 }
                      user_page_addr =
                              (uintptr_t)page_addr + alloc->user_buffer_offset;
                      ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr);
                      if (ret) {
   47                         pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
   47                                alloc->pid, user_page_addr);
                              goto err_vm_insert_page_failed;
   51                 }
      
                      if (index + 1 > alloc->pages_high)
   51                         alloc->pages_high = index + 1;
   47 
                      trace_binder_alloc_page_end(alloc, index);
                      /* vm_insert_page does not seem to increment the refcount */
              }
              if (mm) {
                      up_read(&mm->mmap_sem);
   29                 mmput(mm);
   30         }
              return 0;
      
      free_range:
   29         for (page_addr = end - PAGE_SIZE; page_addr >= start;
                   page_addr -= PAGE_SIZE) {
                      bool ret;
   29                 size_t index;
      
   29                 index = (page_addr - alloc->buffer) / PAGE_SIZE;
                      page = &alloc->pages[index];
      
   29                 trace_binder_free_lru_start(alloc, index);
      
                      ret = list_lru_add(&binder_alloc_lru, &page->lru);
                      WARN_ON(!ret);
      
                      trace_binder_free_lru_end(alloc, index);
    1                 continue;
      
      err_vm_insert_page_failed:
                      unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
      err_map_kernel_failed:
                      __free_page(page->page_ptr);
                      page->page_ptr = NULL;
   30 err_alloc_page_failed:
    1 err_page_ptr_cleared:
                      ;
              }
   30 err_no_vma:
              if (mm) {
                      up_read(&mm->mmap_sem);
                      mmput(mm);
              }
              return vma ? -ENOMEM : -ESRCH;
      }
      
      static struct binder_buffer *binder_alloc_new_buf_locked(
                                      struct binder_alloc *alloc,
   57                                 size_t data_size,
                                      size_t offsets_size,
                                      size_t extra_buffers_size,
                                      int is_async)
      {
              struct rb_node *n = alloc->free_buffers.rb_node;
              struct binder_buffer *buffer;
              size_t buffer_size;
              struct rb_node *best_fit = NULL;
              void *has_page_addr;
    8         void *end_page_addr;
              size_t size, data_offsets_size;
              int ret;
      
              if (alloc->vma == NULL) {
   61                 pr_err("%d: binder_alloc_buf, no vma\n",
                             alloc->pid);
                      return ERR_PTR(-ESRCH);
   60         }
    2 
              data_offsets_size = ALIGN(data_size, sizeof(void *)) +
                      ALIGN(offsets_size, sizeof(void *));
      
              if (data_offsets_size < data_size || data_offsets_size < offsets_size) {
   59                 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
   58                                 "%d: got transaction with invalid size %zd-%zd\n",
    1                                 alloc->pid, data_size, offsets_size);
                      return ERR_PTR(-EINVAL);
              }
              size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *));
              if (size < data_offsets_size || size < extra_buffers_size) {
   58                 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
    4                                 "%d: got transaction with invalid extra_buffers_size %zd\n",
    1                                 alloc->pid, extra_buffers_size);
                      return ERR_PTR(-EINVAL);
              }
              if (is_async &&
                  alloc->free_async_space < size + sizeof(struct binder_buffer)) {
                      binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                                   "%d: binder_alloc_buf size %zd failed, no async space left\n",
                                    alloc->pid, size);
                      return ERR_PTR(-ENOSPC);
   57         }
      
   57         /* Pad 0-size buffers so they get assigned unique addresses */
   57         size = max(size, sizeof(void *));
      
   57         while (n) {
                      buffer = rb_entry(n, struct binder_buffer, rb_node);
   56                 BUG_ON(!buffer->free);
    1                 buffer_size = binder_alloc_buffer_size(alloc, buffer);
    1 
                      if (size < buffer_size) {
                              best_fit = n;
                              n = n->rb_left;
                      } else if (size > buffer_size)
                              n = n->rb_right;
   57                 else {
                              best_fit = n;
                              break;
                      }
              }
              if (best_fit == NULL) {
                      size_t allocated_buffers = 0;
                      size_t largest_alloc_size = 0;
    1                 size_t total_alloc_size = 0;
                      size_t free_buffers = 0;
                      size_t largest_free_size = 0;
                      size_t total_free_size = 0;
      
                      for (n = rb_first(&alloc->allocated_buffers); n != NULL;
                           n = rb_next(n)) {
                              buffer = rb_entry(n, struct binder_buffer, rb_node);
                              buffer_size = binder_alloc_buffer_size(alloc, buffer);
    1                         allocated_buffers++;
                              total_alloc_size += buffer_size;
                              if (buffer_size > largest_alloc_size)
    1                                 largest_alloc_size = buffer_size;
    1                 }
                      for (n = rb_first(&alloc->free_buffers); n != NULL;
                           n = rb_next(n)) {
                              buffer = rb_entry(n, struct binder_buffer, rb_node);
                              buffer_size = binder_alloc_buffer_size(alloc, buffer);
    1                         free_buffers++;
                              total_free_size += buffer_size;
                              if (buffer_size > largest_free_size)
                                      largest_free_size = buffer_size;
                      }
                      pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
                              alloc->pid, size);
                      pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
   56                        total_alloc_size, allocated_buffers, largest_alloc_size,
   56                        total_free_size, free_buffers, largest_free_size);
                      return ERR_PTR(-ENOSPC);
              }
   56         if (n == NULL) {
                      buffer = rb_entry(best_fit, struct binder_buffer, rb_node);
                      buffer_size = binder_alloc_buffer_size(alloc, buffer);
              }
      
   56         binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                           "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n",
   56                       alloc->pid, size, buffer, buffer_size);
      
              has_page_addr =
                      (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
              WARN_ON(n && buffer_size != size);
              end_page_addr =
                      (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
    1         if (end_page_addr > has_page_addr)
                      end_page_addr = has_page_addr;
   55         ret = binder_update_page_range(alloc, 1,
                  (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr);
              if (ret)
   55                 return ERR_PTR(ret);
      
              if (buffer_size != size) {
                      struct binder_buffer *new_buffer;
      
                      new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
   55                 if (!new_buffer) {
   55                         pr_err("%s: %d failed to alloc new buffer struct\n",
   55                                __func__, alloc->pid);
                              goto err_alloc_buf_struct_failed;
                      }
                      new_buffer->data = (u8 *)buffer->data + size;
   55                 list_add(&new_buffer->entry, &buffer->entry);
                      new_buffer->free = 1;
                      binder_insert_free_buffer(alloc, new_buffer);
   55         }
      
              rb_erase(best_fit, &alloc->free_buffers);
              buffer->free = 0;
   55         buffer->allow_user_free = 0;
              binder_insert_allocated_buffer_locked(alloc, buffer);
              binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                           "%d: binder_alloc_buf size %zd got %pK\n",
                            alloc->pid, size, buffer);
    3         buffer->data_size = data_size;
              buffer->offsets_size = offsets_size;
              buffer->async_transaction = is_async;
              buffer->extra_buffers_size = extra_buffers_size;
              if (is_async) {
                      alloc->free_async_space -= size + sizeof(struct binder_buffer);
                      binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
                                   "%d: binder_alloc_buf size %zd async free %zd\n",
                                    alloc->pid, size, alloc->free_async_space);
              }
              return buffer;
      
      err_alloc_buf_struct_failed:
              binder_update_page_range(alloc, 0,
                                       (void *)PAGE_ALIGN((uintptr_t)buffer->data),
                                       end_page_addr);
              return ERR_PTR(-ENOMEM);
      }
      
      /**
       * binder_alloc_new_buf() - Allocate a new binder buffer
       * @alloc:              binder_alloc for this proc
       * @data_size:          size of user data buffer
       * @offsets_size:       user specified buffer offset
       * @extra_buffers_size: size of extra space for meta-data (eg, security context)
       * @is_async:           buffer for async transaction
       *
       * Allocate a new buffer given the requested sizes. Returns
       * the kernel version of the buffer pointer. The size allocated
 * is the sum of the three given sizes (each rounded up to a
 * pointer-sized boundary).
 *
 * Return:        The allocated buffer or an ERR_PTR(-errno) on failure
       */
      struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
                                                 size_t data_size,
                                                 size_t offsets_size,
   65                                            size_t extra_buffers_size,
   65                                            int is_async)
      {
   65         struct binder_buffer *buffer;
      
              mutex_lock(&alloc->mutex);
              buffer = binder_alloc_new_buf_locked(alloc, data_size, offsets_size,
                                                   extra_buffers_size, is_async);
              mutex_unlock(&alloc->mutex);
   27         return buffer;
      }
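
/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * source): the effective size the allocator reserves for a request,
 * mirroring the ALIGN() arithmetic in binder_alloc_new_buf_locked() above.
 */
static size_t example_effective_buf_size(size_t data_size,
					 size_t offsets_size,
					 size_t extra_buffers_size)
{
	size_t size = ALIGN(data_size, sizeof(void *)) +
		ALIGN(offsets_size, sizeof(void *)) +
		ALIGN(extra_buffers_size, sizeof(void *));

	/* 0-size buffers are padded so each gets a unique address */
	return max(size, sizeof(void *));
}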
      
      static void *buffer_start_page(struct binder_buffer *buffer)
      {
              return (void *)((uintptr_t)buffer->data & PAGE_MASK);
      }
      
      static void *prev_buffer_end_page(struct binder_buffer *buffer)
      {
              return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK);
      }
      
   31 static void binder_delete_free_buffer(struct binder_alloc *alloc,
   31                                       struct binder_buffer *buffer)
      {
   31         struct binder_buffer *prev, *next = NULL;
              bool to_free = true;
    2         BUG_ON(alloc->buffers.next == &buffer->entry);
              prev = binder_buffer_prev(buffer);
              BUG_ON(!prev->free);
              if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) {
                      to_free = false;
   31                 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                                         "%d: merge free, buffer %pK share page with %pK\n",
                                         alloc->pid, buffer->data, prev->data);
              }
      
              if (!list_is_last(&buffer->entry, &alloc->buffers)) {
                      next = binder_buffer_next(buffer);
                      if (buffer_start_page(next) == buffer_start_page(buffer)) {
                              to_free = false;
                              binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                                                 "%d: merge free, buffer %pK share page with %pK\n",
                                                 alloc->pid,
   31                                            buffer->data,
    2                                            next->data);
                      }
              }
      
              if (PAGE_ALIGNED(buffer->data)) {
                      binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
   29                                    "%d: merge free, buffer start %pK is page aligned\n",
   27                                    alloc->pid, buffer->data);
                      to_free = false;
              }
      
              if (to_free) {
   27                 binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                                         "%d: merge free, buffer %pK do not share page with %pK or %pK\n",
   31                                    alloc->pid, buffer->data,
                                         prev->data, next ? next->data : NULL);
                      binder_update_page_range(alloc, 0, buffer_start_page(buffer),
                                               buffer_start_page(buffer) + PAGE_SIZE);
              }
              list_del(&buffer->entry);
              kfree(buffer);
      }
      
   31 static void binder_free_buf_locked(struct binder_alloc *alloc,
                                         struct binder_buffer *buffer)
   31 {
              size_t size, buffer_size;
      
              buffer_size = binder_alloc_buffer_size(alloc, buffer);
      
              size = ALIGN(buffer->data_size, sizeof(void *)) +
                      ALIGN(buffer->offsets_size, sizeof(void *)) +
                      ALIGN(buffer->extra_buffers_size, sizeof(void *));
   31 
   31         binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
   31                      "%d: binder_free_buf %pK size %zd buffer_size %zd\n",
   31                       alloc->pid, buffer, size, buffer_size);
   31 
              BUG_ON(buffer->free);
   31         BUG_ON(size > buffer_size);
    2         BUG_ON(buffer->transaction != NULL);
              BUG_ON(buffer->data < alloc->buffer);
    2         BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size);
      
              if (buffer->async_transaction) {
                      alloc->free_async_space += size + sizeof(struct binder_buffer);
      
                      binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
                                   "%d: binder_free_buf size %zd async free %zd\n",
   31                               alloc->pid, size, alloc->free_async_space);
              }
      
              binder_update_page_range(alloc, 0,
                      (void *)PAGE_ALIGN((uintptr_t)buffer->data),
                      (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK));
      
   31         rb_erase(&buffer->rb_node, &alloc->allocated_buffers);
   31         buffer->free = 1;
              if (!list_is_last(&buffer->entry, &alloc->buffers)) {
                      struct binder_buffer *next = binder_buffer_next(buffer);
      
   31                 if (next->free) {
    2                         rb_erase(&next->rb_node, &alloc->free_buffers);
                              binder_delete_free_buffer(alloc, next);
                      }
              }
              if (alloc->buffers.next != &buffer->entry) {
                      struct binder_buffer *prev = binder_buffer_prev(buffer);
      
                      if (prev->free) {
   31                         binder_delete_free_buffer(alloc, buffer);
                              rb_erase(&prev->rb_node, &alloc->free_buffers);
                              buffer = prev;
                      }
              }
              binder_insert_free_buffer(alloc, buffer);
      }
      
      /**
       * binder_alloc_free_buf() - free a binder buffer
       * @alloc:        binder_alloc for this proc
       * @buffer:        kernel pointer to buffer
       *
 31  * Free the buffer allocated via binder_alloc_new_buf()
       */
      void binder_alloc_free_buf(struct binder_alloc *alloc,
                                  struct binder_buffer *buffer)
      {
              mutex_lock(&alloc->mutex);
              binder_free_buf_locked(alloc, buffer);
              mutex_unlock(&alloc->mutex);
      }
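
/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * source): the allocate/use/free pairing described above.  Sizes are
 * arbitrary; binder_alloc_new_buf() returns an ERR_PTR() on failure.
 */
static int example_alloc_then_free(struct binder_alloc *alloc)
{
	struct binder_buffer *buffer;

	buffer = binder_alloc_new_buf(alloc, 128, 16, 0, 0 /* !is_async */);
	if (IS_ERR(buffer))
		return PTR_ERR(buffer);

	/* ... copy transaction data into the buffer's pages ... */

	binder_alloc_free_buf(alloc, buffer);
	return 0;
}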
      
      /**
       * binder_alloc_mmap_handler() - map virtual address space for proc
       * @alloc:        alloc structure for this proc
       * @vma:        vma passed to mmap()
       *
       * Called by binder_mmap() to initialize the space specified in
       * vma for allocating binder buffers
       *
       * Return:
       *      0 = success
       *      -EBUSY = address space already mapped
       *      -ENOMEM = failed to map memory to given address space
       */
      int binder_alloc_mmap_handler(struct binder_alloc *alloc,
                                    struct vm_area_struct *vma)
      {
   22         int ret;
              struct vm_struct *area;
              const char *failure_string;
              struct binder_buffer *buffer;
      
              mutex_lock(&binder_alloc_mmap_lock);
              if (alloc->buffer) {
   21                 ret = -EBUSY;
                      failure_string = "already mapped";
                      goto err_already_mapped;
              }
      
              area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
   21         if (area == NULL) {
                      ret = -ENOMEM;
                      failure_string = "get_vm_area";
                      goto err_get_vm_area_failed;
              }
              alloc->buffer = area->addr;
              alloc->user_buffer_offset =
                      vma->vm_start - (uintptr_t)alloc->buffer;
              mutex_unlock(&binder_alloc_mmap_lock);
      
      #ifdef CONFIG_CPU_CACHE_VIPT
              if (cache_is_vipt_aliasing()) {
                      while (CACHE_COLOUR(
                                      (vma->vm_start ^ (uint32_t)alloc->buffer))) {
                              pr_info("binder_mmap: %d %lx-%lx maps %pK bad alignment\n",
                                      alloc->pid, vma->vm_start, vma->vm_end,
                                      alloc->buffer);
                              vma->vm_start += PAGE_SIZE;
                      }
              }
      #endif
              alloc->pages = kzalloc(sizeof(alloc->pages[0]) *
                                         ((vma->vm_end - vma->vm_start) / PAGE_SIZE),
                                     GFP_KERNEL);
   21         if (alloc->pages == NULL) {
                      ret = -ENOMEM;
                      failure_string = "alloc page array";
                      goto err_alloc_pages_failed;
              }
              alloc->buffer_size = vma->vm_end - vma->vm_start;
      
              buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
              if (!buffer) {
   21                 ret = -ENOMEM;
   21                 failure_string = "alloc buffer struct";
   21                 goto err_alloc_buf_struct_failed;
              }
      
              buffer->data = alloc->buffer;
              list_add(&buffer->entry, &alloc->buffers);
              buffer->free = 1;
              binder_insert_free_buffer(alloc, buffer);
              alloc->free_async_space = alloc->buffer_size / 2;
              barrier();
   22         alloc->vma = vma;
              alloc->vma_vm_mm = vma->vm_mm;
              /* Same as mmgrab() in later kernel versions */
              atomic_inc(&alloc->vma_vm_mm->mm_count);
      
              return 0;
      
      err_alloc_buf_struct_failed:
              kfree(alloc->pages);
              alloc->pages = NULL;
      err_alloc_pages_failed:
    1         mutex_lock(&binder_alloc_mmap_lock);
              vfree(alloc->buffer);
              alloc->buffer = NULL;
      err_get_vm_area_failed:
      err_already_mapped:
              mutex_unlock(&binder_alloc_mmap_lock);
              pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
                     alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
              return ret;
      }
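
/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * source): how an ->mmap() handler such as binder_mmap() might consume
 * the return codes documented above.
 */
static int example_mmap(struct binder_alloc *alloc, struct vm_area_struct *vma)
{
	int ret = binder_alloc_mmap_handler(alloc, vma);

	if (ret == -EBUSY)
		pr_err("%d: buffer space already mapped\n", alloc->pid);
	return ret;
}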
      
      
      void binder_alloc_deferred_release(struct binder_alloc *alloc)
      {
              struct rb_node *n;
              int buffers, page_count;
              struct binder_buffer *buffer;
      
              BUG_ON(alloc->vma);
      
              buffers = 0;
              mutex_lock(&alloc->mutex);
              while ((n = rb_first(&alloc->allocated_buffers))) {
                      buffer = rb_entry(n, struct binder_buffer, rb_node);
      
                      /* Transaction should already have been freed */
                      BUG_ON(buffer->transaction);
      
                      binder_free_buf_locked(alloc, buffer);
                      buffers++;
              }
      
              while (!list_empty(&alloc->buffers)) {
                      buffer = list_first_entry(&alloc->buffers,
                                                struct binder_buffer, entry);
                      WARN_ON(!buffer->free);
      
                      list_del(&buffer->entry);
                      WARN_ON_ONCE(!list_empty(&alloc->buffers));
                      kfree(buffer);
              }
      
              page_count = 0;
              if (alloc->pages) {
                      int i;
      
                      for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
                              void *page_addr;
                              bool on_lru;
      
                              if (!alloc->pages[i].page_ptr)
                                      continue;
      
                              on_lru = list_lru_del(&binder_alloc_lru,
                                                    &alloc->pages[i].lru);
                              page_addr = alloc->buffer + i * PAGE_SIZE;
                              binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
                                           "%s: %d: page %d at %pK %s\n",
                                           __func__, alloc->pid, i, page_addr,
                                           on_lru ? "on lru" : "active");
                              unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
                              __free_page(alloc->pages[i].page_ptr);
                              page_count++;
                      }
                      kfree(alloc->pages);
                      vfree(alloc->buffer);
              }
              mutex_unlock(&alloc->mutex);
              if (alloc->vma_vm_mm)
                      mmdrop(alloc->vma_vm_mm);
      
              binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE,
                           "%s: %d buffers %d, pages %d\n",
                           __func__, alloc->pid, buffers, page_count);
      }
      
      static void print_binder_buffer(struct seq_file *m, const char *prefix,
                                      struct binder_buffer *buffer)
      {
              seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n",
                         prefix, buffer->debug_id, buffer->data,
                         buffer->data_size, buffer->offsets_size,
                         buffer->extra_buffers_size,
                         buffer->transaction ? "active" : "delivered");
      }
      
      /**
       * binder_alloc_print_allocated() - print buffer info
       * @m:     seq_file for output via seq_printf()
       * @alloc: binder_alloc for this proc
       *
       * Prints information about every buffer associated with
       * the binder_alloc state to the given seq_file
       */
      void binder_alloc_print_allocated(struct seq_file *m,
                                        struct binder_alloc *alloc)
      {
              struct rb_node *n;
      
              mutex_lock(&alloc->mutex);
              for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n))
                      print_binder_buffer(m, "  buffer",
                                          rb_entry(n, struct binder_buffer, rb_node));
              mutex_unlock(&alloc->mutex);
      }
      
      /**
       * binder_alloc_print_pages() - print page usage
       * @m:     seq_file for output via seq_printf()
       * @alloc: binder_alloc for this proc
       */
      void binder_alloc_print_pages(struct seq_file *m,
                                    struct binder_alloc *alloc)
      {
              struct binder_lru_page *page;
              int i;
              int active = 0;
              int lru = 0;
              int free = 0;
      
              mutex_lock(&alloc->mutex);
              for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
                      page = &alloc->pages[i];
                      if (!page->page_ptr)
                              free++;
                      else if (list_empty(&page->lru))
                              active++;
                      else
                              lru++;
              }
              mutex_unlock(&alloc->mutex);
              seq_printf(m, "  pages: %d:%d:%d\n", active, lru, free);
              seq_printf(m, "  pages high watermark: %zu\n", alloc->pages_high);
      }
      
      /**
       * binder_alloc_get_allocated_count() - return count of buffers
       * @alloc: binder_alloc for this proc
       *
       * Return: count of allocated buffers
       */
      int binder_alloc_get_allocated_count(struct binder_alloc *alloc)
      {
              struct rb_node *n;
              int count = 0;
      
              mutex_lock(&alloc->mutex);
              for (n = rb_first(&alloc->allocated_buffers); n != NULL; n = rb_next(n))
                      count++;
              mutex_unlock(&alloc->mutex);
              return count;
      }
      
      
      /**
       * binder_alloc_vma_close() - invalidate address space
       * @alloc: binder_alloc for this proc
       *
       * Called from binder_vma_close() when releasing address space.
    4  * Clears alloc->vma to prevent new incoming transactions from
       * allocating more buffers.
       */
      void binder_alloc_vma_close(struct binder_alloc *alloc)
      {
              WRITE_ONCE(alloc->vma, NULL);
      }
      
      /**
       * binder_alloc_free_page() - shrinker callback to free pages
 * @item:   item to free
 * @lru:    list_lru_one from which @item will be isolated
 * @lock:   lock protecting the item
       * @cb_arg: callback argument
       *
       * Called from list_lru_walk() in binder_shrink_scan() to free
       * up pages when the system is under memory pressure.
       */
      enum lru_status binder_alloc_free_page(struct list_head *item,
                                             struct list_lru_one *lru,
                                             spinlock_t *lock,
                                             void *cb_arg)
      {
              struct mm_struct *mm = NULL;
              struct binder_lru_page *page = container_of(item,
                                                          struct binder_lru_page,
                                                          lru);
              struct binder_alloc *alloc;
              uintptr_t page_addr;
              size_t index;
              struct vm_area_struct *vma;
      
              alloc = page->alloc;
              if (!mutex_trylock(&alloc->mutex))
                      goto err_get_alloc_mutex_failed;
      
              if (!page->page_ptr)
                      goto err_page_already_freed;
      
              index = page - alloc->pages;
              page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE;
      
              mm = alloc->vma_vm_mm;
              /* Same as mmget_not_zero() in later kernel versions */
              if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users))
                      goto err_mmget;
              if (!down_write_trylock(&mm->mmap_sem))
                      goto err_down_write_mmap_sem_failed;
              vma = alloc->vma;
      
              list_lru_isolate(lru, item);
              spin_unlock(lock);
      
              if (vma) {
                      trace_binder_unmap_user_start(alloc, index);
      
                      zap_page_range(vma,
                                     page_addr +
                                     alloc->user_buffer_offset,
                                     PAGE_SIZE, NULL);
      
                      trace_binder_unmap_user_end(alloc, index);
              }
              up_write(&mm->mmap_sem);
              mmput(mm);
      
              trace_binder_unmap_kernel_start(alloc, index);
      
              unmap_kernel_range(page_addr, PAGE_SIZE);
              __free_page(page->page_ptr);
              page->page_ptr = NULL;
      
              trace_binder_unmap_kernel_end(alloc, index);
      
              spin_lock(lock);
              mutex_unlock(&alloc->mutex);
              return LRU_REMOVED_RETRY;
      
      err_down_write_mmap_sem_failed:
              mmput_async(mm);
      err_mmget:
      err_page_already_freed:
              mutex_unlock(&alloc->mutex);
      err_get_alloc_mutex_failed:
              return LRU_SKIP;
      }
      
    2 static unsigned long
      binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
      {
              unsigned long ret = list_lru_count(&binder_alloc_lru);
              return ret;
      }
      
      static unsigned long
      binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
      {
              unsigned long ret;
      
              ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page,
                                  NULL, sc->nr_to_scan);
              return ret;
      }
      
      static struct shrinker binder_shrinker = {
              .count_objects = binder_shrink_count,
              .scan_objects = binder_shrink_scan,
              .seeks = DEFAULT_SEEKS,
      };
      
      /**
       * binder_alloc_init() - called by binder_open() for per-proc initialization
       * @alloc: binder_alloc for this proc
       *
       * Called from binder_open() to initialize binder_alloc fields for
       * new binder proc
   41  */
      void binder_alloc_init(struct binder_alloc *alloc)
      {
              alloc->pid = current->group_leader->pid;
              mutex_init(&alloc->mutex);
              INIT_LIST_HEAD(&alloc->buffers);
      }
      
      int binder_alloc_shrinker_init(void)
      {
              int ret = list_lru_init(&binder_alloc_lru);
      
              if (ret == 0) {
                      ret = register_shrinker(&binder_shrinker);
                      if (ret)
                              list_lru_destroy(&binder_alloc_lru);
              }
              return ret;
      }
      #include <linux/sched.h>
      #include <linux/errno.h>
      #include <linux/dcache.h>
      #include <linux/path.h>
      #include <linux/fdtable.h>
      #include <linux/namei.h>
      #include <linux/pid.h>
      #include <linux/security.h>
      #include <linux/file.h>
      #include <linux/seq_file.h>
      #include <linux/fs.h>
      
      #include <linux/proc_fs.h>
      
      #include "../mount.h"
      #include "internal.h"
      #include "fd.h"
      
      static int seq_show(struct seq_file *m, void *v)
      {
              struct files_struct *files = NULL;
   27         int f_flags = 0, ret = -ENOENT;
              struct file *file = NULL;
              struct task_struct *task;
      
   27         task = get_proc_task(m->private);
              if (!task)
                      return -ENOENT;
      
   26         files = get_files_struct(task);
   26         put_task_struct(task);
      
   26         if (files) {
   25                 int fd = proc_fd(m->private);
      
                      spin_lock(&files->file_lock);
   25                 file = fcheck_files(files, fd);
                      if (file) {
   24                         struct fdtable *fdt = files_fdtable(files);
      
                              f_flags = file->f_flags;
                              if (close_on_exec(fd, fdt))
    1                                 f_flags |= O_CLOEXEC;
      
   24                         get_file(file);
                              ret = 0;
                      }
    1                 spin_unlock(&files->file_lock);
                      put_files_struct(files);
              }
      
              if (ret)
                      return ret;
      
              seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
                         (long long)file->f_pos, f_flags,
                         real_mount(file->f_path.mnt)->mnt_id);
      
              show_fd_locks(m, file, files);
              if (seq_has_overflowed(m))
                      goto out;
      
   24         if (file->f_op->show_fdinfo)
   19                 file->f_op->show_fdinfo(m, file);
      
      out:
   24         fput(file);
              return 0;
      }
      
      static int seq_fdinfo_open(struct inode *inode, struct file *file)
      {
    6         return single_open(file, seq_show, inode);
      }
      
      static const struct file_operations proc_fdinfo_file_operations = {
              .open                = seq_fdinfo_open,
              .read                = seq_read,
              .llseek                = seq_lseek,
              .release        = single_release,
      };
      
      static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
      {
              struct files_struct *files;
              struct task_struct *task;
              const struct cred *cred;
              struct inode *inode;
              int fd;
      
   71         if (flags & LOOKUP_RCU)
                      return -ECHILD;
      
   71         inode = d_inode(dentry);
              task = get_proc_task(inode);
   71         fd = proc_fd(inode);
      
              if (task) {
                      files = get_files_struct(task);
                      if (files) {
                              struct file *file;
      
   71                         rcu_read_lock();
   71                         file = fcheck_files(files, fd);
                              if (file) {
                                      unsigned f_mode = file->f_mode;
      
   62                                 rcu_read_unlock();
                                      put_files_struct(files);
      
   62                                 if (task_dumpable(task)) {
   61                                         rcu_read_lock();
   61                                         cred = __task_cred(task);
   61                                         inode->i_uid = cred->euid;
                                              inode->i_gid = cred->egid;
   61                                         rcu_read_unlock();
                                      } else {
    1                                         inode->i_uid = GLOBAL_ROOT_UID;
                                              inode->i_gid = GLOBAL_ROOT_GID;
                                      }
      
   62                                 if (S_ISLNK(inode->i_mode)) {
                                              unsigned i_mode = S_IFLNK;
   50                                         if (f_mode & FMODE_READ)
                                                      i_mode |= S_IRUSR | S_IXUSR;
   50                                         if (f_mode & FMODE_WRITE)
   30                                                 i_mode |= S_IWUSR | S_IXUSR;
   50                                         inode->i_mode = i_mode;
                                      }
      
   62                                 security_task_to_inode(task, inode);
   62                                 put_task_struct(task);
                                      return 1;
                              }
   12                         rcu_read_unlock();
                              put_files_struct(files);
                      }
   12                 put_task_struct(task);
              }
   71         return 0;
      }
      
      static const struct dentry_operations tid_fd_dentry_operations = {
              .d_revalidate        = tid_fd_revalidate,
              .d_delete        = pid_delete_dentry,
      };
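
       /*
        * Illustrative userspace sketch (an assumption, not code from this file):
        * the FMODE_READ/FMODE_WRITE translation done in tid_fd_revalidate() above
        * shows up as the user r/w/x bits of the /proc/self/fd/<n> symlink.
        */
       #include <stdio.h>
       #include <sys/stat.h>

       static void show_fd_link_mode(int fd)
       {
               char path[64];
               struct stat st;

               snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
               if (lstat(path, &st) == 0)
                       printf("%s: mode %o (S_IRUSR=%d S_IWUSR=%d)\n", path,
                              st.st_mode & 0777,
                              !!(st.st_mode & S_IRUSR), !!(st.st_mode & S_IWUSR));
       }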
      
      static int proc_fd_link(struct dentry *dentry, struct path *path)
      {
              struct files_struct *files = NULL;
              struct task_struct *task;
              int ret = -ENOENT;
      
   47         task = get_proc_task(d_inode(dentry));
              if (task) {
   47                 files = get_files_struct(task);
   47                 put_task_struct(task);
              }
      
   47         if (files) {
   47                 int fd = proc_fd(d_inode(dentry));
                      struct file *fd_file;
      
                      spin_lock(&files->file_lock);
   47                 fd_file = fcheck_files(files, fd);
                      if (fd_file) {
                              *path = fd_file->f_path;
                              path_get(&fd_file->f_path);
                              ret = 0;
                      }
   47                 spin_unlock(&files->file_lock);
                      put_files_struct(files);
              }
      
   47         return ret;
      }
      
      static int
      proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
                          struct task_struct *task, const void *ptr)
      {
   56         unsigned fd = (unsigned long)ptr;
              struct proc_inode *ei;
              struct inode *inode;
      
              inode = proc_pid_make_inode(dir->i_sb, task);
              if (!inode)
                      goto out;
      
              ei = PROC_I(inode);
   56         ei->fd = fd;
      
              inode->i_mode = S_IFLNK;
              inode->i_op = &proc_pid_link_inode_operations;
              inode->i_size = 64;
      
              ei->op.proc_get_link = proc_fd_link;
      
              d_set_d_op(dentry, &tid_fd_dentry_operations);
              d_add(dentry, inode);
      
              /* Close the race of the process dying before we return the dentry */
              if (tid_fd_revalidate(dentry, 0))
   56                 return 0;
       out:
              return -ENOENT;
      }
      
      static struct dentry *proc_lookupfd_common(struct inode *dir,
                                                 struct dentry *dentry,
                                                 instantiate_t instantiate)
      {
              struct task_struct *task = get_proc_task(dir);
              int result = -ENOENT;
   70         unsigned fd = name_to_int(&dentry->d_name);
      
   70         if (!task)
                      goto out_no_task;
              if (fd == ~0U)
                      goto out;
      
   65         result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
      out:
   68         put_task_struct(task);
      out_no_task:
   70         return ERR_PTR(result);
      }
      
      static int proc_readfd_common(struct file *file, struct dir_context *ctx,
                                    instantiate_t instantiate)
      {
    8         struct task_struct *p = get_proc_task(file_inode(file));
              struct files_struct *files;
              unsigned int fd;
      
              if (!p)
                      return -ENOENT;
      
    7         if (!dir_emit_dots(file, ctx))
                      goto out;
    6         files = get_files_struct(p);
              if (!files)
                      goto out;
      
    5         rcu_read_lock();
    5         for (fd = ctx->pos - 2;
    5              fd < files_fdtable(files)->max_fds;
                   fd++, ctx->pos++) {
                      char name[PROC_NUMBUF];
                      int len;
      
    5                 if (!fcheck_files(files, fd))
    5                         continue;
    5                 rcu_read_unlock();
      
                      len = snprintf(name, sizeof(name), "%d", fd);
                      if (!proc_fill_cache(file, ctx,
                                           name, len, instantiate, p,
                                           (void *)(unsigned long)fd))
    3                         goto out_fd_loop;
    5                 rcu_read_lock();
              }
    3         rcu_read_unlock();
      out_fd_loop:
    5         put_files_struct(files);
      out:
    7         put_task_struct(p);
    8         return 0;
      }
      
      static int proc_readfd(struct file *file, struct dir_context *ctx)
      {
    4         return proc_readfd_common(file, ctx, proc_fd_instantiate);
      }
      
      const struct file_operations proc_fd_operations = {
              .read                = generic_read_dir,
              .iterate        = proc_readfd,
              .llseek                = default_llseek,
      };
      
      static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
                                          unsigned int flags)
      {
   55         return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
      }
      
      /*
       * /proc/pid/fd needs a special permission handler so that a process can still
       * access /proc/self/fd after it has executed a setuid().
       */
      int proc_fd_permission(struct inode *inode, int mask)
      {
              struct task_struct *p;
              int rv;
      
   70         rv = generic_permission(inode, mask);
   70         if (rv == 0)
                      return rv;
      
              rcu_read_lock();
              p = pid_task(proc_pid(inode), PIDTYPE_PID);
              if (p && same_thread_group(p, current))
                      rv = 0;
              rcu_read_unlock();
      
              return rv;
      }
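
       /*
        * Illustrative userspace sketch (not part of this file): listing one's own
        * /proc/self/fd, which the same_thread_group() check above keeps accessible
        * even when generic_permission() on the directory would fail.
        */
       #include <dirent.h>
       #include <stdio.h>

       static void list_own_fds(void)
       {
               DIR *d = opendir("/proc/self/fd");
               struct dirent *de;

               if (!d)
                       return;
               while ((de = readdir(d)) != NULL)
                       printf("fd entry: %s\n", de->d_name);
               closedir(d);
       }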
      
      const struct inode_operations proc_fd_inode_operations = {
              .lookup                = proc_lookupfd,
              .permission        = proc_fd_permission,
              .setattr        = proc_setattr,
      };
      
      static int
      proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
                              struct task_struct *task, const void *ptr)
      {
   14         unsigned fd = (unsigned long)ptr;
              struct proc_inode *ei;
              struct inode *inode;
      
              inode = proc_pid_make_inode(dir->i_sb, task);
              if (!inode)
                      goto out;
      
              ei = PROC_I(inode);
   14         ei->fd = fd;
      
              inode->i_mode = S_IFREG | S_IRUSR;
              inode->i_fop = &proc_fdinfo_file_operations;
      
              d_set_d_op(dentry, &tid_fd_dentry_operations);
              d_add(dentry, inode);
      
              /* Close the race of the process dying before we return the dentry */
              if (tid_fd_revalidate(dentry, 0))
   14                 return 0;
       out:
              return -ENOENT;
      }
      
      static struct dentry *
      proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
      {
   15         return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
      }
      
      static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
      {
    4         return proc_readfd_common(file, ctx,
                                        proc_fdinfo_instantiate);
      }
      
      const struct inode_operations proc_fdinfo_inode_operations = {
              .lookup                = proc_lookupfdinfo,
              .setattr        = proc_setattr,
      };
      
      const struct file_operations proc_fdinfo_operations = {
              .read                = generic_read_dir,
              .iterate        = proc_readfdinfo,
              .llseek                = default_llseek,
      };
      #include <linux/sysctl.h>
      #include <linux/slab.h>
      #include <net/net_namespace.h>
      #include <net/xfrm.h>
      
      static void __net_init __xfrm_sysctl_init(struct net *net)
      {
   30         net->xfrm.sysctl_aevent_etime = XFRM_AE_ETIME;
              net->xfrm.sysctl_aevent_rseqth = XFRM_AE_SEQT_SIZE;
              net->xfrm.sysctl_larval_drop = 1;
              net->xfrm.sysctl_acq_expires = 30;
      }
      
      #ifdef CONFIG_SYSCTL
      static struct ctl_table xfrm_table[] = {
              {
                      .procname        = "xfrm_aevent_etime",
                      .maxlen                = sizeof(u32),
                      .mode                = 0644,
                      .proc_handler        = proc_dointvec
              },
              {
                      .procname        = "xfrm_aevent_rseqth",
                      .maxlen                = sizeof(u32),
                      .mode                = 0644,
                      .proc_handler        = proc_dointvec
              },
              {
                      .procname        = "xfrm_larval_drop",
                      .maxlen                = sizeof(int),
                      .mode                = 0644,
                      .proc_handler        = proc_dointvec
              },
              {
                      .procname        = "xfrm_acq_expires",
                      .maxlen                = sizeof(int),
                      .mode                = 0644,
                      .proc_handler        = proc_dointvec
              },
              {}
      };
      
      int __net_init xfrm_sysctl_init(struct net *net)
      {
              struct ctl_table *table;
      
              __xfrm_sysctl_init(net);
      
              table = kmemdup(xfrm_table, sizeof(xfrm_table), GFP_KERNEL);
              if (!table)
                      goto out_kmemdup;
   30         table[0].data = &net->xfrm.sysctl_aevent_etime;
              table[1].data = &net->xfrm.sysctl_aevent_rseqth;
              table[2].data = &net->xfrm.sysctl_larval_drop;
              table[3].data = &net->xfrm.sysctl_acq_expires;
      
              /* Don't export sysctls to unprivileged users */
              if (net->user_ns != &init_user_ns)
   30                 table[0].procname = NULL;
      
   30         net->xfrm.sysctl_hdr = register_net_sysctl(net, "net/core", table);
              if (!net->xfrm.sysctl_hdr)
                      goto out_register;
   30         return 0;
      
      out_register:
              kfree(table);
      out_kmemdup:
              return -ENOMEM;
      }
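
       /*
        * Illustrative userspace sketch (an assumption, not code from this file):
        * the knobs registered above surface under /proc/sys/net/core/, e.g. the
        * xfrm_acq_expires default of 30 set in __xfrm_sysctl_init().
        */
       #include <stdio.h>

       static void read_xfrm_acq_expires(void)
       {
               FILE *f = fopen("/proc/sys/net/core/xfrm_acq_expires", "r");
               int val;

               if (f) {
                       if (fscanf(f, "%d", &val) == 1)
                               printf("xfrm_acq_expires = %d\n", val);
                       fclose(f);
               }
       }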
      
      void __net_exit xfrm_sysctl_fini(struct net *net)
      {
              struct ctl_table *table;
      
              table = net->xfrm.sysctl_hdr->ctl_table_arg;
              unregister_net_sysctl_table(net->xfrm.sysctl_hdr);
              kfree(table);
      }
      #else
      int __net_init xfrm_sysctl_init(struct net *net)
      {
              __xfrm_sysctl_init(net);
              return 0;
      }
      #endif
      #include <linux/syscalls.h>
      #include <linux/export.h>
      #include <linux/fs.h>
      #include <linux/file.h>
      #include <linux/mount.h>
      #include <linux/namei.h>
      #include <linux/statfs.h>
      #include <linux/security.h>
      #include <linux/uaccess.h>
      #include "internal.h"
      
      static int flags_by_mnt(int mnt_flags)
      {
              int flags = 0;
      
              if (mnt_flags & MNT_READONLY)
                      flags |= ST_RDONLY;
              if (mnt_flags & MNT_NOSUID)
    1                 flags |= ST_NOSUID;
    8         if (mnt_flags & MNT_NODEV)
    2                 flags |= ST_NODEV;
    8         if (mnt_flags & MNT_NOEXEC)
    1                 flags |= ST_NOEXEC;
    8         if (mnt_flags & MNT_NOATIME)
    1                 flags |= ST_NOATIME;
    8         if (mnt_flags & MNT_NODIRATIME)
    1                 flags |= ST_NODIRATIME;
    8         if (mnt_flags & MNT_RELATIME)
    4                 flags |= ST_RELATIME;
              return flags;
      }
      
      static int flags_by_sb(int s_flags)
      {
              int flags = 0;
              if (s_flags & MS_SYNCHRONOUS)
                      flags |= ST_SYNCHRONOUS;
    8         if (s_flags & MS_MANDLOCK)
    1                 flags |= ST_MANDLOCK;
              return flags;
      }
      
      static int calculate_f_flags(struct vfsmount *mnt)
      {
    8         return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
    8                 flags_by_sb(mnt->mnt_sb->s_flags);
      }
      
   13 static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
      {
              int retval;
      
   13         if (!dentry->d_sb->s_op->statfs)
                      return -ENOSYS;
      
   13         memset(buf, 0, sizeof(*buf));
              retval = security_sb_statfs(dentry);
              if (retval)
                      return retval;
   13         retval = dentry->d_sb->s_op->statfs(dentry, buf);
   13         if (retval == 0 && buf->f_frsize == 0)
   13                 buf->f_frsize = buf->f_bsize;
              return retval;
      }
      
    8 int vfs_statfs(struct path *path, struct kstatfs *buf)
      {
              int error;
      
    8         error = statfs_by_dentry(path->dentry, buf);
              if (!error)
    8                 buf->f_flags = calculate_f_flags(path->mnt);
    8         return error;
      }
      EXPORT_SYMBOL(vfs_statfs);
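
       /*
        * Illustrative userspace sketch (not part of this file): the ST_* bits that
        * calculate_f_flags() places in f_flags are what statvfs(3) reports in
        * f_flag; the glibc wrapper is assumed here because it provides portable
        * ST_* constant definitions.
        */
       #include <stdio.h>
       #include <sys/statvfs.h>

       static void show_mount_flags(const char *path)
       {
               struct statvfs sv;

               if (statvfs(path, &sv) == 0)
                       printf("%s: %sread-only, %snosuid\n", path,
                              (sv.f_flag & ST_RDONLY) ? "" : "not ",
                              (sv.f_flag & ST_NOSUID) ? "" : "not ");
       }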
      
      int user_statfs(const char __user *pathname, struct kstatfs *st)
      {
              struct path path;
              int error;
              unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
      retry:
    4         error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
              if (!error) {
    2                 error = vfs_statfs(&path, st);
                      path_put(&path);
                      if (retry_estale(error, lookup_flags)) {
                              lookup_flags |= LOOKUP_REVAL;
                              goto retry;
                      }
              }
    4         return error;
      }
      
      int fd_statfs(int fd, struct kstatfs *st)
      {
    7         struct fd f = fdget_raw(fd);
              int error = -EBADF;
              if (f.file) {
    6                 error = vfs_statfs(&f.file->f_path, st);
    5                 fdput(f);
              }
    7         return error;
      }
      
      static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
      {
              struct statfs buf;
      
              if (sizeof(buf) == sizeof(*st))
                      memcpy(&buf, st, sizeof(*st));
              else {
                      if (sizeof buf.f_blocks == 4) {
                              if ((st->f_blocks | st->f_bfree | st->f_bavail |
                                   st->f_bsize | st->f_frsize) &
                                  0xffffffff00000000ULL)
                                      return -EOVERFLOW;
                              /*
                               * f_files and f_ffree may be -1; it's okay to stuff
                               * that into 32 bits
                               */
                              if (st->f_files != -1 &&
                                  (st->f_files & 0xffffffff00000000ULL))
                                      return -EOVERFLOW;
                              if (st->f_ffree != -1 &&
                                  (st->f_ffree & 0xffffffff00000000ULL))
                                      return -EOVERFLOW;
                      }
      
                      buf.f_type = st->f_type;
                      buf.f_bsize = st->f_bsize;
                      buf.f_blocks = st->f_blocks;
                      buf.f_bfree = st->f_bfree;
                      buf.f_bavail = st->f_bavail;
                      buf.f_files = st->f_files;
                      buf.f_ffree = st->f_ffree;
                      buf.f_fsid = st->f_fsid;
                      buf.f_namelen = st->f_namelen;
                      buf.f_frsize = st->f_frsize;
                      buf.f_flags = st->f_flags;
                      memset(buf.f_spare, 0, sizeof(buf.f_spare));
              }
              if (copy_to_user(p, &buf, sizeof(buf)))
                      return -EFAULT;
              return 0;
      }
      
      static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
      {
              struct statfs64 buf;
              if (sizeof(buf) == sizeof(*st))
                      memcpy(&buf, st, sizeof(*st));
              else {
                      buf.f_type = st->f_type;
                      buf.f_bsize = st->f_bsize;
                      buf.f_blocks = st->f_blocks;
                      buf.f_bfree = st->f_bfree;
                      buf.f_bavail = st->f_bavail;
                      buf.f_files = st->f_files;
                      buf.f_ffree = st->f_ffree;
                      buf.f_fsid = st->f_fsid;
                      buf.f_namelen = st->f_namelen;
                      buf.f_frsize = st->f_frsize;
                      buf.f_flags = st->f_flags;
                      memset(buf.f_spare, 0, sizeof(buf.f_spare));
              }
              if (copy_to_user(p, &buf, sizeof(buf)))
                      return -EFAULT;
              return 0;
      }
      
      SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
      {
              struct kstatfs st;
              int error = user_statfs(pathname, &st);
              if (!error)
                      error = do_statfs_native(&st, buf);
              return error;
      }
      
      SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
      {
              struct kstatfs st;
              int error;
              if (sz != sizeof(*buf))
                      return -EINVAL;
              error = user_statfs(pathname, &st);
              if (!error)
                      error = do_statfs64(&st, buf);
              return error;
      }
      
      SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
      {
              struct kstatfs st;
              int error = fd_statfs(fd, &st);
              if (!error)
                      error = do_statfs_native(&st, buf);
              return error;
      }
      
      SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
      {
              struct kstatfs st;
              int error;
      
              if (sz != sizeof(*buf))
                      return -EINVAL;
      
              error = fd_statfs(fd, &st);
              if (!error)
                      error = do_statfs64(&st, buf);
              return error;
      }
      
      int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
      {
    7         struct super_block *s = user_get_super(dev);
              int err;
              if (!s)
                      return -EINVAL;
      
    5         err = statfs_by_dentry(s->s_root, sbuf);
              drop_super(s);
    7         return err;
      }
      
      SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
      {
              struct ustat tmp;
              struct kstatfs sbuf;
              int err = vfs_ustat(new_decode_dev(dev), &sbuf);
              if (err)
                      return err;
      
              memset(&tmp,0,sizeof(struct ustat));
              tmp.f_tfree = sbuf.f_bfree;
              tmp.f_tinode = sbuf.f_ffree;
      
              return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
      }
      #ifndef _LINUX_MM_TYPES_H
      #define _LINUX_MM_TYPES_H
      
      #include <linux/auxvec.h>
      #include <linux/types.h>
      #include <linux/threads.h>
      #include <linux/list.h>
      #include <linux/spinlock.h>
      #include <linux/rbtree.h>
      #include <linux/rwsem.h>
      #include <linux/completion.h>
      #include <linux/cpumask.h>
      #include <linux/uprobes.h>
      #include <linux/page-flags-layout.h>
      #include <linux/workqueue.h>
      #include <asm/page.h>
      #include <asm/mmu.h>
      
      #ifndef AT_VECTOR_SIZE_ARCH
      #define AT_VECTOR_SIZE_ARCH 0
      #endif
      #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
      
      struct address_space;
      struct mem_cgroup;
      
      #define USE_SPLIT_PTE_PTLOCKS        (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
      #define USE_SPLIT_PMD_PTLOCKS        (USE_SPLIT_PTE_PTLOCKS && \
                      IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
      #define ALLOC_SPLIT_PTLOCKS        (SPINLOCK_SIZE > BITS_PER_LONG/8)
      
      /*
       * Each physical page in the system has a struct page associated with
       * it to keep track of whatever it is we are using the page for at the
       * moment. Note that we have no way to track which tasks are using
       * a page, though if it is a pagecache page, rmap structures can tell us
       * who is mapping it.
       *
       * The objects in struct page are organized in double word blocks in
        * order to allow us to use atomic double word operations on portions
       * of struct page. That is currently only used by slub but the arrangement
       * allows the use of atomic double word operations on the flags/mapping
       * and lru list pointers also.
       */
      struct page {
              /* First double word block */
              unsigned long flags;                /* Atomic flags, some possibly
                                               * updated asynchronously */
              union {
                      struct address_space *mapping;        /* If low bit clear, points to
                                                       * inode address_space, or NULL.
                                                       * If page mapped as anonymous
                                                       * memory, low bit is set, and
                                                       * it points to anon_vma object:
                                                       * see PAGE_MAPPING_ANON below.
                                                       */
                      void *s_mem;                        /* slab first object */
              };
      
              /* Second double word */
              struct {
                      union {
                              pgoff_t index;                /* Our offset within mapping. */
                              void *freelist;                /* sl[aou]b first free object */
                      };
      
                      union {
      #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
              defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
                              /* Used for cmpxchg_double in slub */
                              unsigned long counters;
      #else
                              /*
                                * Keep _count separate from slub cmpxchg_double data,
                                * as the rest of the double word is protected by
                               * slab_lock but _count is not.
                               */
                              unsigned counters;
      #endif
      
                              struct {
      
                                      union {
                                              /*
                                               * Count of ptes mapped in
                                               * mms, to show when page is
                                               * mapped & limit reverse map
                                               * searches.
                                               *
                                               * Used also for tail pages
                                               * refcounting instead of
                                               * _count. Tail pages cannot
                                               * be mapped and keeping the
                                               * tail page _count zero at
                                               * all times guarantees
                                               * get_page_unless_zero() will
                                               * never succeed on tail
                                               * pages.
                                               */
                                              atomic_t _mapcount;
      
                                              struct { /* SLUB */
                                                      unsigned inuse:16;
                                                      unsigned objects:15;
                                                      unsigned frozen:1;
                                              };
                                              int units;        /* SLOB */
                                      };
                                      atomic_t _count;                /* Usage count, see below. */
                              };
                              unsigned int active;        /* SLAB */
                      };
              };
      
              /*
               * Third double word block
               *
                * WARNING: bit 0 of the first word encodes PageTail(). That means
                * the other users of this storage space MUST NOT use that bit, to
                * avoid collisions and false-positive PageTail().
               */
              union {
                      struct list_head lru;        /* Pageout list, eg. active_list
                                               * protected by zone->lru_lock !
                                               * Can be used as a generic list
                                               * by the page owner.
                                               */
                      struct {                /* slub per cpu partial pages */
                              struct page *next;        /* Next partial slab */
      #ifdef CONFIG_64BIT
                              int pages;        /* Nr of partial slabs left */
                              int pobjects;        /* Approximate # of objects */
      #else
                              short int pages;
                              short int pobjects;
      #endif
                      };
      
                      struct rcu_head rcu_head;        /* Used by SLAB
                                                       * when destroying via RCU
                                                       */
                      /* Tail pages of compound page */
                      struct {
                              unsigned long compound_head; /* If bit zero is set */
      
                              /* First tail page only */
      #ifdef CONFIG_64BIT
                              /*
                                * On 64 bit systems we have enough space in struct page
                                * to encode compound_dtor and compound_order with
                                * unsigned int. It can help the compiler generate better or
                                * smaller code on some architectures.
                               */
                              unsigned int compound_dtor;
                              unsigned int compound_order;
      #else
                              unsigned short int compound_dtor;
                              unsigned short int compound_order;
      #endif
                      };
      
      #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
                      struct {
                              unsigned long __pad;        /* do not overlay pmd_huge_pte
                                                       * with compound_head to avoid
                                                       * possible bit 0 collision.
                                                       */
                              pgtable_t pmd_huge_pte; /* protected by page->ptl */
                      };
      #endif
              };
      
              /* Remainder is not double word aligned */
              union {
                      unsigned long private;                /* Mapping-private opaque data:
                                                        * usually used for buffer_heads
                                                       * if PagePrivate set; used for
                                                       * swp_entry_t if PageSwapCache;
                                                       * indicates order in the buddy
                                                       * system if PG_buddy is set.
                                                       */
      #if USE_SPLIT_PTE_PTLOCKS
      #if ALLOC_SPLIT_PTLOCKS
                      spinlock_t *ptl;
      #else
                      spinlock_t ptl;
      #endif
      #endif
                      struct kmem_cache *slab_cache;        /* SL[AU]B: Pointer to slab */
              };
      
      #ifdef CONFIG_MEMCG
              struct mem_cgroup *mem_cgroup;
      #endif
      
              /*
               * On machines where all RAM is mapped into kernel address space,
               * we can simply calculate the virtual address. On machines with
               * highmem some memory is mapped into kernel virtual memory
               * dynamically, so we need a place to store that address.
               * Note that this field could be 16 bits on x86 ... ;)
               *
               * Architectures with slow multiplication can define
               * WANT_PAGE_VIRTUAL in asm/page.h
               */
      #if defined(WANT_PAGE_VIRTUAL)
              void *virtual;                        /* Kernel virtual address (NULL if
                                                 not kmapped, ie. highmem) */
      #endif /* WANT_PAGE_VIRTUAL */
      
      #ifdef CONFIG_KMEMCHECK
              /*
               * kmemcheck wants to track the status of each byte in a page; this
               * is a pointer to such a status block. NULL if not tracked.
               */
              void *shadow;
      #endif
      
      #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
              int _last_cpupid;
      #endif
      }
      /*
       * The struct page can be forced to be double word aligned so that atomic ops
       * on double words work. The SLUB allocator can make use of such a feature.
       */
      #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
              __aligned(2 * sizeof(unsigned long))
      #endif
      ;
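
       /*
        * Illustrative sketch only (an assumption, not a check from this tree):
        * what the forced alignment above guarantees when
        * CONFIG_HAVE_ALIGNED_STRUCT_PAGE is set.
        */
       #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
       _Static_assert(__alignof__(struct page) >= 2 * sizeof(unsigned long),
                      "struct page must be double-word aligned for cmpxchg_double");
       #endif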
      
      struct page_frag {
              struct page *page;
      #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
              __u32 offset;
              __u32 size;
      #else
              __u16 offset;
              __u16 size;
      #endif
      };
      
      #define PAGE_FRAG_CACHE_MAX_SIZE        __ALIGN_MASK(32768, ~PAGE_MASK)
      #define PAGE_FRAG_CACHE_MAX_ORDER        get_order(PAGE_FRAG_CACHE_MAX_SIZE)
      
      struct page_frag_cache {
              void * va;
      #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
              __u16 offset;
              __u16 size;
      #else
              __u32 offset;
      #endif
               /* we maintain a pagecount bias, so that we don't dirty the cache line
               * containing page->_count every time we allocate a fragment.
               */
              unsigned int                pagecnt_bias;
              bool pfmemalloc;
      };
      
      typedef unsigned long vm_flags_t;
      
      /*
       * A region containing a mapping of a non-memory backed file under NOMMU
       * conditions.  These are held in a global tree and are pinned by the VMAs that
       * map parts of them.
       */
      struct vm_region {
              struct rb_node        vm_rb;                /* link in global region tree */
              vm_flags_t        vm_flags;        /* VMA vm_flags */
              unsigned long        vm_start;        /* start address of region */
              unsigned long        vm_end;                /* region initialised to here */
              unsigned long        vm_top;                /* region allocated to here */
              unsigned long        vm_pgoff;        /* the offset in vm_file corresponding to vm_start */
              struct file        *vm_file;        /* the backing file or NULL */
      
              int                vm_usage;        /* region usage count (access under nommu_region_sem) */
              bool                vm_icache_flushed : 1; /* true if the icache has been flushed for
                                                      * this region */
      };
      
      #ifdef CONFIG_USERFAULTFD
      #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
      struct vm_userfaultfd_ctx {
              struct userfaultfd_ctx *ctx;
      };
      #else /* CONFIG_USERFAULTFD */
      #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
      struct vm_userfaultfd_ctx {};
      #endif /* CONFIG_USERFAULTFD */
      
      /*
        * This struct defines a virtual memory area (VMA). There is one of these
       * per VM-area/task.  A VM area is any part of the process virtual memory
       * space that has a special rule for the page-fault handlers (ie a shared
       * library, the executable area etc).
       */
      struct vm_area_struct {
              /* The first cache line has the info for VMA tree walking. */
      
              unsigned long vm_start;                /* Our start address within vm_mm. */
              unsigned long vm_end;                /* The first byte after our end address
                                                 within vm_mm. */
      
              /* linked list of VM areas per task, sorted by address */
              struct vm_area_struct *vm_next, *vm_prev;
      
              struct rb_node vm_rb;
      
              /*
               * Largest free memory gap in bytes to the left of this VMA.
               * Either between this VMA and vma->vm_prev, or between one of the
               * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
               * get_unmapped_area find a free area of the right size.
               */
              unsigned long rb_subtree_gap;
      
              /* Second cache line starts here. */
      
              struct mm_struct *vm_mm;        /* The address space we belong to. */
              pgprot_t vm_page_prot;                /* Access permissions of this VMA. */
              unsigned long vm_flags;                /* Flags, see mm.h. */
      
              /*
               * For areas with an address space and backing store,
               * linkage into the address_space->i_mmap interval tree.
               *
               * For private anonymous mappings, a pointer to a null terminated string
               * in the user process containing the name given to the vma, or NULL
               * if unnamed.
               */
              union {
                      struct {
                              struct rb_node rb;
                              unsigned long rb_subtree_last;
                      } shared;
                      const char __user *anon_name;
              };
      
              /*
               * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
               * list, after a COW of one of the file pages.        A MAP_SHARED vma
               * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
               * or brk vma (with NULL file) can only be in an anon_vma list.
               */
              struct list_head anon_vma_chain; /* Serialized by mmap_sem &
                                                * page_table_lock */
              struct anon_vma *anon_vma;        /* Serialized by page_table_lock */
      
              /* Function pointers to deal with this struct. */
              const struct vm_operations_struct *vm_ops;
      
              /* Information about our backing store: */
              unsigned long vm_pgoff;                /* Offset (within vm_file) in PAGE_SIZE
                                                 units, *not* PAGE_CACHE_SIZE */
              struct file * vm_file;                /* File we map to (can be NULL). */
              void * vm_private_data;                /* was vm_pte (shared mem) */
      
      #ifndef CONFIG_MMU
              struct vm_region *vm_region;        /* NOMMU mapping region */
      #endif
      #ifdef CONFIG_NUMA
              struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
      #endif
              struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
      };
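
       /*
        * Illustrative userspace sketch (not part of this file): the vm_start,
        * vm_end, vm_flags, vm_pgoff and vm_file fields above are what each line
        * of /proc/self/maps reports for a VMA.
        */
       #include <stdio.h>

       static void dump_own_vmas(void)
       {
               char line[512];
               FILE *f = fopen("/proc/self/maps", "r");

               if (!f)
                       return;
               while (fgets(line, sizeof(line), f))
                       fputs(line, stdout); /* start-end perms offset dev inode path */
               fclose(f);
       }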
      
      struct core_thread {
              struct task_struct *task;
              struct core_thread *next;
      };
      
      struct core_state {
              atomic_t nr_threads;
              struct core_thread dumper;
              struct completion startup;
      };
      
      enum {
              MM_FILEPAGES,
              MM_ANONPAGES,
              MM_SWAPENTS,
              NR_MM_COUNTERS
      };
      
      #if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
      #define SPLIT_RSS_COUNTING
      /* per-thread cached information, */
      struct task_rss_stat {
              int events;        /* for synchronization threshold */
              int count[NR_MM_COUNTERS];
      };
      #endif /* USE_SPLIT_PTE_PTLOCKS */
      
      struct mm_rss_stat {
              atomic_long_t count[NR_MM_COUNTERS];
      };
      
      struct kioctx_table;
      struct mm_struct {
              struct vm_area_struct *mmap;                /* list of VMAs */
              struct rb_root mm_rb;
              u64 vmacache_seqnum;                   /* per-thread vmacache */
      #ifdef CONFIG_MMU
              unsigned long (*get_unmapped_area) (struct file *filp,
                                      unsigned long addr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags);
      #endif
              unsigned long mmap_base;                /* base of mmap area */
              unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
              unsigned long task_size;                /* size of task vm space */
              unsigned long highest_vm_end;                /* highest vma end address */
              pgd_t * pgd;
              atomic_t mm_users;                        /* How many users with user space? */
              atomic_t mm_count;                        /* How many references to "struct mm_struct" (users count as 1) */
              atomic_long_t nr_ptes;                        /* PTE page table pages */
      #if CONFIG_PGTABLE_LEVELS > 2
              atomic_long_t nr_pmds;                        /* PMD page table pages */
      #endif
              int map_count;                                /* number of VMAs */
      
              spinlock_t page_table_lock;                /* Protects page tables and some counters */
              struct rw_semaphore mmap_sem;
      
              struct list_head mmlist;                /* List of maybe swapped mm's.        These are globally strung
                                                       * together off init_mm.mmlist, and are protected
                                                       * by mmlist_lock
                                                       */
      
      
              unsigned long hiwater_rss;        /* High-watermark of RSS usage */
              unsigned long hiwater_vm;        /* High-water virtual memory usage */
      
              unsigned long total_vm;                /* Total pages mapped */
              unsigned long locked_vm;        /* Pages that have PG_mlocked set */
              unsigned long pinned_vm;        /* Refcount permanently increased */
              unsigned long shared_vm;        /* Shared pages (files) */
              unsigned long exec_vm;                /* VM_EXEC & ~VM_WRITE */
              unsigned long stack_vm;                /* VM_GROWSUP/DOWN */
              unsigned long def_flags;
              unsigned long start_code, end_code, start_data, end_data;
              unsigned long start_brk, brk, start_stack;
              unsigned long arg_start, arg_end, env_start, env_end;
      
              unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
      
              /*
               * Special counters, in some configurations protected by the
               * page_table_lock, in other configurations by being atomic.
               */
              struct mm_rss_stat rss_stat;
      
              struct linux_binfmt *binfmt;
      
              cpumask_var_t cpu_vm_mask_var;
      
              /* Architecture-specific MM context */
              mm_context_t context;
      
              unsigned long flags; /* Must use atomic bitops to access the bits */
      
              struct core_state *core_state; /* coredumping support */
      #ifdef CONFIG_AIO
              spinlock_t                        ioctx_lock;
              struct kioctx_table __rcu        *ioctx_table;
      #endif
      #ifdef CONFIG_MEMCG
              /*
               * "owner" points to a task that is regarded as the canonical
               * user/owner of this mm. All of the following must be true in
               * order for it to be changed:
               *
               * current == mm->owner
               * current->mm != mm
               * new_owner->mm == mm
               * new_owner->alloc_lock is held
               */
              struct task_struct __rcu *owner;
      #endif
              struct user_namespace *user_ns;
      
              /* store ref to file /proc/<pid>/exe symlink points to */
              struct file __rcu *exe_file;
      #ifdef CONFIG_MMU_NOTIFIER
              struct mmu_notifier_mm *mmu_notifier_mm;
      #endif
      #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
              pgtable_t pmd_huge_pte; /* protected by page_table_lock */
      #endif
      #ifdef CONFIG_CPUMASK_OFFSTACK
              struct cpumask cpumask_allocation;
      #endif
      #ifdef CONFIG_NUMA_BALANCING
              /*
               * numa_next_scan is the next time that the PTEs will be marked
               * pte_numa. NUMA hinting faults will gather statistics and migrate
               * pages to new nodes if necessary.
               */
              unsigned long numa_next_scan;
      
              /* Restart point for scanning and setting pte_numa */
              unsigned long numa_scan_offset;
      
              /* numa_scan_seq prevents two threads setting pte_numa */
              int numa_scan_seq;
      #endif
      #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
              /*
               * An operation with batched TLB flushing is going on. Anything that
               * can move process memory needs to flush the TLB when moving a
               * PROT_NONE or PROT_NUMA mapped page.
               */
              bool tlb_flush_pending;
      #endif
      #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
              /* See flush_tlb_batched_pending() */
              bool tlb_flush_batched;
      #endif
              struct uprobes_state uprobes_state;
      #ifdef CONFIG_X86_INTEL_MPX
              /* address of the bounds directory */
              void __user *bd_addr;
      #endif
      #ifdef CONFIG_HUGETLB_PAGE
              atomic_long_t hugetlb_usage;
      #endif
              struct work_struct async_put_work;
      };
      
      static inline void mm_init_cpumask(struct mm_struct *mm)
      {
      #ifdef CONFIG_CPUMASK_OFFSTACK
              mm->cpu_vm_mask_var = &mm->cpumask_allocation;
      #endif
              cpumask_clear(mm->cpu_vm_mask_var);
      }
      
      /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
      static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
      {
              return mm->cpu_vm_mask_var;
      }
      
      #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
      /*
       * Memory barriers to keep this state in sync are graciously provided by
       * the page table locks, outside of which no page table modifications happen.
       * The barriers below prevent the compiler from re-ordering the instructions
       * around the memory barriers that are already present in the code.
       */
      static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
      {
    1         barrier();
              return mm->tlb_flush_pending;
      }
      static inline void set_tlb_flush_pending(struct mm_struct *mm)
      {
              mm->tlb_flush_pending = true;
      
              /*
               * Guarantee that the tlb_flush_pending store does not leak into the
               * critical section updating the page tables
               */
              smp_mb__before_spinlock();
      }
      /* Clearing is done after a TLB flush, which also provides a barrier. */
      static inline void clear_tlb_flush_pending(struct mm_struct *mm)
      {
   25         barrier();
              mm->tlb_flush_pending = false;
      }
      #else
      static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
      {
              return false;
      }
      static inline void set_tlb_flush_pending(struct mm_struct *mm)
      {
      }
      static inline void clear_tlb_flush_pending(struct mm_struct *mm)
      {
      }
      #endif
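
       /*
        * Illustrative sketch of the intended call pattern only (an assumption,
        * not a helper from this tree): callers that change protections, such as
        * mprotect's change_protection(), bracket their page table updates
        * roughly like this.
        */
       static inline void example_update_prot_range(struct mm_struct *mm)
       {
               set_tlb_flush_pending(mm);
               /* ... rewrite the PTEs for the range under the page table lock ... */
               /* ... issue the TLB flush for the range ... */
               clear_tlb_flush_pending(mm);
       }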
      
      struct vm_special_mapping
      {
              const char *name;
              struct page **pages;
      };
      
      enum tlb_flush_reason {
              TLB_FLUSH_ON_TASK_SWITCH,
              TLB_REMOTE_SHOOTDOWN,
              TLB_LOCAL_SHOOTDOWN,
              TLB_LOCAL_MM_SHOOTDOWN,
              TLB_REMOTE_SEND_IPI,
              NR_TLB_FLUSH_REASONS,
      };
      
       /*
         * A swap entry has to fit into an "unsigned long", as the entry is hidden
        * in the "index" field of the swapper address space.
        */
      typedef struct {
              unsigned long val;
      } swp_entry_t;
      
      /* Return the name for an anonymous mapping or NULL for a file-backed mapping */
      static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
      {
   84         if (vma->vm_file)
                      return NULL;
      
  104         return vma->anon_name;
      }
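
       /*
        * Illustrative userspace sketch (an assumption, not code from this file):
        * on kernels carrying this anon_name patch the string is installed with
        * prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...); the constant values below
        * are taken from Android's prctl.h and are assumptions here.
        */
       #include <stdio.h>
       #include <sys/mman.h>
       #include <sys/prctl.h>

       #ifndef PR_SET_VMA
       #define PR_SET_VMA              0x53564d41
       #define PR_SET_VMA_ANON_NAME    0
       #endif

       static void name_anon_mapping_example(void)
       {
               size_t len = 4096;
               void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

               if (p == MAP_FAILED)
                       return;
               if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                         (unsigned long)p, len, (unsigned long)"example-region"))
                       perror("prctl(PR_SET_VMA)"); /* kernels without the patch return an error */
       }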
      
      #endif /* _LINUX_MM_TYPES_H */
      /*
       * net/ipv6/fib6_rules.c        IPv6 Routing Policy Rules
       *
       * Copyright (C)2003-2006 Helsinki University of Technology
       * Copyright (C)2003-2006 USAGI/WIDE Project
       *
       *        This program is free software; you can redistribute it and/or
       *        modify it under the terms of the GNU General Public License as
       *        published by the Free Software Foundation, version 2.
       *
       * Authors
       *        Thomas Graf                <tgraf@suug.ch>
       *        Ville Nuorvala                <vnuorval@tcs.hut.fi>
       */
      
      #include <linux/netdevice.h>
      #include <linux/export.h>
      
      #include <net/fib_rules.h>
      #include <net/ipv6.h>
      #include <net/addrconf.h>
      #include <net/ip6_route.h>
      #include <net/netlink.h>
      
      struct fib6_rule {
              struct fib_rule                common;
              struct rt6key                src;
              struct rt6key                dst;
              u8                        tclass;
      };
      
      struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                         int flags, pol_lookup_t lookup)
      {
  688         struct fib_lookup_arg arg = {
                      .lookup_ptr = lookup,
                      .flags = FIB_LOOKUP_NOREF,
              };
      
              fib_rules_lookup(net->ipv6.fib6_rules_ops,
                               flowi6_to_flowi(fl6), flags, &arg);
      
  688         if (arg.result)
                      return arg.result;
      
   65         dst_hold(&net->ipv6.ip6_null_entry->dst);
              return &net->ipv6.ip6_null_entry->dst;
      }
      
      static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
                                  int flags, struct fib_lookup_arg *arg)
      {
              struct flowi6 *flp6 = &flp->u.ip6;
              struct rt6_info *rt = NULL;
              struct fib6_table *table;
  686         struct net *net = rule->fr_net;
  686         pol_lookup_t lookup = arg->lookup_ptr;
              int err = 0;
      
              switch (rule->action) {
              case FR_ACT_TO_TBL:
                      break;
              case FR_ACT_UNREACHABLE:
                      err = -ENETUNREACH;
                      rt = net->ipv6.ip6_null_entry;
                      goto discard_pkt;
              default:
              case FR_ACT_BLACKHOLE:
                      err = -EINVAL;
                      rt = net->ipv6.ip6_blk_hole_entry;
                      goto discard_pkt;
              case FR_ACT_PROHIBIT:
                      err = -EACCES;
                      rt = net->ipv6.ip6_prohibit_entry;
                      goto discard_pkt;
              }
      
              table = fib6_get_table(net, rule->table);
              if (!table) {
                      err = -EAGAIN;
                      goto out;
              }
      
              rt = lookup(net, table, flp6, flags);
              if (rt != net->ipv6.ip6_null_entry) {
                      struct fib6_rule *r = (struct fib6_rule *)rule;
      
                      /*
                       * If we need to find a source address for this traffic,
                        * we check whether the result meets the requirements of the rule.
                       */
  651                 if ((rule->flags & FIB_RULE_FIND_SADDR) &&
                          r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
                              struct in6_addr saddr;
      
                              if (ipv6_dev_get_saddr(net,
                                                     ip6_dst_idev(&rt->dst)->dev,
                                                     &flp6->daddr,
                                                     rt6_flags2srcprefs(flags),
                                                     &saddr))
                                      goto again;
                              if (!ipv6_prefix_equal(&saddr, &r->src.addr,
                                                     r->src.plen))
                                      goto again;
                              flp6->saddr = saddr;
                      }
  651                 err = rt->dst.error;
                      if (err != -EAGAIN)
                              goto out;
              }
      again:
  281         ip6_rt_put(rt);
              err = -EAGAIN;
              rt = NULL;
              goto out;
      
      discard_pkt:
              dst_hold(&rt->dst);
      out:
  686         arg->result = rt;
              return err;
      }
      
      static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
      {
  650         struct rt6_info *rt = (struct rt6_info *) arg->result;
              struct net_device *dev = NULL;
      
              if (rt->rt6i_idev)
                      dev = rt->rt6i_idev->dev;
      
              /* do not accept result if the route does
               * not meet the required prefix length
               */
  650         if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
                      goto suppress_route;
      
              /* do not accept result if the route uses a device
               * belonging to a forbidden interface group
               */
  650         if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
                      goto suppress_route;
      
              return false;
      
      suppress_route:
              ip6_rt_put(rt);
  650         return true;
      }
      
      static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
      {
              struct fib6_rule *r = (struct fib6_rule *) rule;
              struct flowi6 *fl6 = &fl->u.ip6;
      
  686         if (r->dst.plen &&
  686             !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
                      return 0;
      
              /*
               * If FIB_RULE_FIND_SADDR is set and we do not have a
                * source address for the traffic, we defer the
                * source address check.
               */
  686         if (r->src.plen) {
                      if (flags & RT6_LOOKUP_F_HAS_SADDR) {
                              if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
                                                     r->src.plen))
                                      return 0;
                      } else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
                              return 0;
              }
      
  686         if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
                      return 0;
      
              return 1;
      }
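
       /*
        * Illustrative userspace sketch (an assumption, not the kernel helper):
        * the "first plen bits are equal" test that the ipv6_prefix_equal()
        * calls above perform on the rule's src/dst prefixes.
        */
       #include <netinet/in.h>
       #include <stdbool.h>
       #include <string.h>

       static bool prefix_bits_equal(const struct in6_addr *a,
                                     const struct in6_addr *b, unsigned int plen)
       {
               unsigned int bytes = plen / 8, bits = plen % 8;

               if (bytes && memcmp(a->s6_addr, b->s6_addr, bytes))
                       return false;
               if (bits) {
                       unsigned char mask = 0xff << (8 - bits);

                       if ((a->s6_addr[bytes] ^ b->s6_addr[bytes]) & mask)
                               return false;
               }
               return true;
       }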
      
      static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
              FRA_GENERIC_POLICY,
      };
      
      static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                                     struct fib_rule_hdr *frh,
                                     struct nlattr **tb)
      {
              int err = -EINVAL;
    2         struct net *net = sock_net(skb->sk);
              struct fib6_rule *rule6 = (struct fib6_rule *) rule;
      
    4         if (rule->action == FR_ACT_TO_TBL) {
    3                 if (rule->table == RT6_TABLE_UNSPEC)
                              goto errout;
      
                      if (fib6_new_table(net, rule->table) == NULL) {
                              err = -ENOBUFS;
                              goto errout;
                      }
              }
      
    3         if (frh->src_len)
                      rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]);
      
    3         if (frh->dst_len)
                      rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]);
      
    3         rule6->src.plen = frh->src_len;
              rule6->dst.plen = frh->dst_len;
              rule6->tclass = frh->tos;
      
              err = 0;
      errout:
    4         return err;
      }
      
      static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                                   struct nlattr **tb)
      {
              struct fib6_rule *rule6 = (struct fib6_rule *) rule;
      
    3         if (frh->src_len && (rule6->src.plen != frh->src_len))
    3                 return 0;
      
    3         if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
                      return 0;
      
    3         if (frh->tos && (rule6->tclass != frh->tos))
                      return 0;
      
    1         if (frh->src_len &&
                  nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
                      return 0;
      
    1         if (frh->dst_len &&
                  nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
                      return 0;
      
              return 1;
      }
      
      static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                                struct fib_rule_hdr *frh)
      {
              struct fib6_rule *rule6 = (struct fib6_rule *) rule;
      
    7         frh->dst_len = rule6->dst.plen;
              frh->src_len = rule6->src.plen;
              frh->tos = rule6->tclass;
      
              if ((rule6->dst.plen &&
                   nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
    7             (rule6->src.plen &&
                   nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr)))
                      goto nla_put_failure;
    7         return 0;
      
      nla_put_failure:
              return -ENOBUFS;
      }
      
      static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
      {
              return nla_total_size(16) /* dst */
    4                + nla_total_size(16); /* src */
      }
      
      static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
              .family                        = AF_INET6,
              .rule_size                = sizeof(struct fib6_rule),
              .addr_size                = sizeof(struct in6_addr),
              .action                        = fib6_rule_action,
              .match                        = fib6_rule_match,
              .suppress                = fib6_rule_suppress,
              .configure                = fib6_rule_configure,
              .compare                = fib6_rule_compare,
              .fill                        = fib6_rule_fill,
              .nlmsg_payload                = fib6_rule_nlmsg_payload,
              .nlgroup                = RTNLGRP_IPV6_RULE,
              .policy                        = fib6_rule_policy,
              .owner                        = THIS_MODULE,
              .fro_net                = &init_net,
      };
      
      static int __net_init fib6_rules_net_init(struct net *net)
      {
              struct fib_rules_ops *ops;
              int err = -ENOMEM;
      
   28         ops = fib_rules_register(&fib6_rules_ops_template, net);
              if (IS_ERR(ops))
                      return PTR_ERR(ops);
      
   28         err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0);
              if (err)
                      goto out_fib6_rules_ops;
      
   28         err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0);
              if (err)
                      goto out_fib6_rules_ops;
      
   28         net->ipv6.fib6_rules_ops = ops;
      out:
              return err;
      
      out_fib6_rules_ops:
              fib_rules_unregister(ops);
              goto out;
      }
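
       /*
        * Illustrative note: with the two default rules added above (preference 0
        * pointing at RT6_TABLE_LOCAL and preference 0x7FFE at RT6_TABLE_MAIN), a
        * freshly initialized namespace shows the familiar baseline rule set:
        *
        *   # ip -6 rule show
        *   0:      from all lookup local
        *   32766:  from all lookup main
        */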
      
      static void __net_exit fib6_rules_net_exit(struct net *net)
      {
              rtnl_lock();
              fib_rules_unregister(net->ipv6.fib6_rules_ops);
              rtnl_unlock();
      }
      
      static struct pernet_operations fib6_rules_net_ops = {
              .init = fib6_rules_net_init,
              .exit = fib6_rules_net_exit,
      };
      
      int __init fib6_rules_init(void)
      {
              return register_pernet_subsys(&fib6_rules_net_ops);
      }
      
      
      void fib6_rules_cleanup(void)
      {
              unregister_pernet_subsys(&fib6_rules_net_ops);
      }
      /*
       * Copyright (C) 1991, 1992 Linus Torvalds
       * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
       * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
       * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
       * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
        *        - July 2000
       * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
       */
      
      /*
       * This handles all read/write requests to block devices
       */
      #include <linux/kernel.h>
      #include <linux/module.h>
      #include <linux/backing-dev.h>
      #include <linux/bio.h>
      #include <linux/blkdev.h>
      #include <linux/blk-mq.h>
      #include <linux/highmem.h>
      #include <linux/mm.h>
      #include <linux/kernel_stat.h>
      #include <linux/string.h>
      #include <linux/init.h>
      #include <linux/completion.h>
      #include <linux/slab.h>
      #include <linux/swap.h>
      #include <linux/writeback.h>
      #include <linux/task_io_accounting_ops.h>
      #include <linux/fault-inject.h>
      #include <linux/list_sort.h>
      #include <linux/delay.h>
      #include <linux/ratelimit.h>
      #include <linux/pm_runtime.h>
      #include <linux/blk-cgroup.h>
      
      #define CREATE_TRACE_POINTS
      #include <trace/events/block.h>
      
      #include "blk.h"
      #include "blk-mq.h"
      
      #include <linux/math64.h>
      
      EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
      EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
      EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
      EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
      EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
      
      DEFINE_IDA(blk_queue_ida);
      
      /*
       * For the allocated request tables
       */
      struct kmem_cache *request_cachep = NULL;
      
      /*
       * For queue allocation
       */
      struct kmem_cache *blk_requestq_cachep;
      
      /*
       * Controlling structure to kblockd
       */
      static struct workqueue_struct *kblockd_workqueue;
      
      static void blk_clear_congested(struct request_list *rl, int sync)
      {
      #ifdef CONFIG_CGROUP_WRITEBACK
              clear_wb_congested(rl->blkg->wb_congested, sync);
      #else
              /*
               * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
               * flip its congestion state for events on other blkcgs.
               */
  162         if (rl == &rl->q->root_rl)
  162                 clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
      #endif
      }
      
      static void blk_set_congested(struct request_list *rl, int sync)
      {
      #ifdef CONFIG_CGROUP_WRITEBACK
              set_wb_congested(rl->blkg->wb_congested, sync);
      #else
              /* see blk_clear_congested() */
    1         if (rl == &rl->q->root_rl)
    1                 set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
      #endif
      }
      
      void blk_queue_congestion_threshold(struct request_queue *q)
      {
              int nr;
      
   23         nr = q->nr_requests - (q->nr_requests / 8) + 1;
              if (nr > q->nr_requests)
                      nr = q->nr_requests;
   23         q->nr_congestion_on = nr;
      
              nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
              if (nr < 1)
                      nr = 1;
              q->nr_congestion_off = nr;
      }
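
       /*
        * Worked example, assuming the usual default of q->nr_requests == 128
        * (BLKDEV_MAX_RQ):
        *
        *   nr_congestion_on  = 128 - 128/8 + 1          = 113
        *   nr_congestion_off = 128 - 128/8 - 128/16 - 1 = 103
        *
        * i.e. a request list is flagged congested once it holds 113 allocated
        * requests, and the flag is cleared again when the count drops below 103.
        */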
      
      /**
       * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
       * @bdev:        device
       *
       * Locates the passed device's request queue and returns the address of its
       * backing_dev_info.  This function can only be called if @bdev is opened
       * and the return value is never NULL.
       */
      struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
      {
  478         struct request_queue *q = bdev_get_queue(bdev);
      
              return &q->backing_dev_info;
      }
      EXPORT_SYMBOL(blk_get_backing_dev_info);
      
      void blk_rq_init(struct request_queue *q, struct request *rq)
      {
  750         memset(rq, 0, sizeof(*rq));
      
              INIT_LIST_HEAD(&rq->queuelist);
              INIT_LIST_HEAD(&rq->timeout_list);
              rq->cpu = -1;
              rq->q = q;
              rq->__sector = (sector_t) -1;
              INIT_HLIST_NODE(&rq->hash);
              RB_CLEAR_NODE(&rq->rb_node);
              rq->cmd = rq->__cmd;
              rq->cmd_len = BLK_MAX_CDB;
              rq->tag = -1;
              rq->start_time = jiffies;
              set_start_time_ns(rq);
              rq->part = NULL;
      }
      EXPORT_SYMBOL(blk_rq_init);
      
      static void req_bio_endio(struct request *rq, struct bio *bio,
                                unsigned int nbytes, int error)
      {
    1         if (error)
    1                 bio->bi_error = error;
      
    1         if (unlikely(rq->cmd_flags & REQ_QUIET))
                      bio_set_flag(bio, BIO_QUIET);
      
    1         bio_advance(bio, nbytes);
      
              /* don't actually finish bio if it's part of flush sequence */
    1         if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
    1                 bio_endio(bio);
      }
      
      void blk_dump_rq_flags(struct request *rq, char *msg)
      {
              int bit;
      
              printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,
                      rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
                      (unsigned long long) rq->cmd_flags);
      
              printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
                     (unsigned long long)blk_rq_pos(rq),
                     blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
              printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
                     rq->bio, rq->biotail, blk_rq_bytes(rq));
      
              if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
                      printk(KERN_INFO "  cdb: ");
                      for (bit = 0; bit < BLK_MAX_CDB; bit++)
                              printk("%02x ", rq->cmd[bit]);
                      printk("\n");
              }
      }
      EXPORT_SYMBOL(blk_dump_rq_flags);
      
      static void blk_delay_work(struct work_struct *work)
      {
              struct request_queue *q;
      
              q = container_of(work, struct request_queue, delay_work.work);
              spin_lock_irq(q->queue_lock);
              __blk_run_queue(q);
              spin_unlock_irq(q->queue_lock);
      }
      
      /**
       * blk_delay_queue - restart queueing after defined interval
       * @q:                The &struct request_queue in question
       * @msecs:        Delay in msecs
       *
       * Description:
       *   Sometimes queueing needs to be postponed for a little while, to allow
       *   resources to come back. This function will make sure that queueing is
       *   restarted around the specified time. Queue lock must be held.
       */
      void blk_delay_queue(struct request_queue *q, unsigned long msecs)
      {
              if (likely(!blk_queue_dead(q)))
                      queue_delayed_work(kblockd_workqueue, &q->delay_work,
                                         msecs_to_jiffies(msecs));
      }
      EXPORT_SYMBOL(blk_delay_queue);
      
      /**
       * blk_start_queue_async - asynchronously restart a previously stopped queue
       * @q:    The &struct request_queue in question
       *
       * Description:
       *   blk_start_queue_async() will clear the stop flag on the queue, and
       *   ensure that the request_fn for the queue is run from an async
       *   context.
       **/
      void blk_start_queue_async(struct request_queue *q)
      {
              queue_flag_clear(QUEUE_FLAG_STOPPED, q);
              blk_run_queue_async(q);
      }
      EXPORT_SYMBOL(blk_start_queue_async);
      
      /**
       * blk_start_queue - restart a previously stopped queue
       * @q:    The &struct request_queue in question
       *
       * Description:
       *   blk_start_queue() will clear the stop flag on the queue, and call
       *   the request_fn for the queue if it was in a stopped state when
       *   entered. Also see blk_stop_queue(). Queue lock must be held.
       **/
      void blk_start_queue(struct request_queue *q)
      {
              WARN_ON(!in_interrupt() && !irqs_disabled());
      
              queue_flag_clear(QUEUE_FLAG_STOPPED, q);
              __blk_run_queue(q);
      }
      EXPORT_SYMBOL(blk_start_queue);
      
      /**
       * blk_stop_queue - stop a queue
       * @q:    The &struct request_queue in question
       *
       * Description:
       *   The Linux block layer assumes that a block driver will consume all
       *   entries on the request queue when the request_fn strategy is called.
       *   Often this will not happen, because of hardware limitations (queue
       *   depth settings). If a device driver gets a 'queue full' response,
       *   or if it simply chooses not to queue more I/O at one point, it can
       *   call this function to prevent the request_fn from being called until
       *   the driver has signalled it's ready to go again. This happens by calling
       *   blk_start_queue() to restart queue operations. Queue lock must be held.
       **/
      void blk_stop_queue(struct request_queue *q)
      {
              cancel_delayed_work(&q->delay_work);
              queue_flag_set(QUEUE_FLAG_STOPPED, q);
      }
      EXPORT_SYMBOL(blk_stop_queue);
      
      /**
       * blk_sync_queue - cancel any pending callbacks on a queue
       * @q: the queue
       *
       * Description:
       *     The block layer may perform asynchronous callback activity
       *     on a queue, such as calling the unplug function after a timeout.
       *     A block device may call blk_sync_queue to ensure that any
       *     such activity is cancelled, thus allowing it to release resources
       *     that the callbacks might use. The caller must already have made sure
       *     that its ->make_request_fn will not re-add plugging prior to calling
       *     this function.
       *
       *     This function does not cancel any asynchronous activity arising
       *     out of elevator or throttling code. That would require elevator_exit()
       *     and blkcg_exit_queue() to be called with queue lock initialized.
       *
       */
   32 void blk_sync_queue(struct request_queue *q)
      {
   32         del_timer_sync(&q->timeout);
      
              if (q->mq_ops) {
                      struct blk_mq_hw_ctx *hctx;
                      int i;
      
   32                 queue_for_each_hw_ctx(q, hctx, i) {
                              cancel_delayed_work_sync(&hctx->run_work);
                              cancel_delayed_work_sync(&hctx->delay_work);
                      }
              } else {
                      cancel_delayed_work_sync(&q->delay_work);
              }
   32 }
      EXPORT_SYMBOL(blk_sync_queue);
      
      /**
       * __blk_run_queue_uncond - run a queue whether or not it has been stopped
       * @q:        The queue to run
       *
       * Description:
       *    Invoke request handling on a queue if there are any pending requests.
       *    May be used to restart request handling after a request has completed.
       *    This variant runs the queue whether or not the queue has been
       *    stopped. Must be called with the queue lock held and interrupts
       *    disabled. See also @blk_run_queue.
       */
      inline void __blk_run_queue_uncond(struct request_queue *q)
      {
  650         if (unlikely(blk_queue_dead(q)))
                      return;
      
              /*
               * Some request_fn implementations, e.g. scsi_request_fn(), unlock
               * the queue lock internally. As a result multiple threads may be
               * running such a request function concurrently. Keep track of the
               * number of active request_fn invocations such that blk_drain_queue()
               * can wait until all these request_fn calls have finished.
               */
  650         q->request_fn_active++;
              q->request_fn(q);
  650         q->request_fn_active--;
      }
      EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
      
      /**
       * __blk_run_queue - run a single device queue
       * @q:        The queue to run
       *
       * Description:
       *    See @blk_run_queue. This variant must be called with the queue lock
       *    held and interrupts disabled.
       */
      void __blk_run_queue(struct request_queue *q)
      {
  650         if (unlikely(blk_queue_stopped(q)))
                      return;
      
  650         __blk_run_queue_uncond(q);
      }
      EXPORT_SYMBOL(__blk_run_queue);
      
      /**
       * blk_run_queue_async - run a single device queue in workqueue context
       * @q:        The queue to run
       *
       * Description:
       *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
       *    of us. The caller must hold the queue lock.
       */
      void blk_run_queue_async(struct request_queue *q)
      {
  433         if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
  433                 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
  433 }
      EXPORT_SYMBOL(blk_run_queue_async);
      
      /**
       * blk_run_queue - run a single device queue
       * @q: The queue to run
       *
       * Description:
       *    Invoke request handling on this queue, if it has pending work to do.
       *    May be used to restart queueing when a request has completed.
       */
      void blk_run_queue(struct request_queue *q)
      {
              unsigned long flags;
      
              spin_lock_irqsave(q->queue_lock, flags);
              __blk_run_queue(q);
              spin_unlock_irqrestore(q->queue_lock, flags);
      }
      EXPORT_SYMBOL(blk_run_queue);
      
      void blk_put_queue(struct request_queue *q)
      {
   32         kobject_put(&q->kobj);
      }
      EXPORT_SYMBOL(blk_put_queue);
      
      /**
       * __blk_drain_queue - drain requests from request_queue
       * @q: queue to drain
       * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
       *
       * Drain requests from @q.  If @drain_all is set, all requests are drained.
       * If not, only ELVPRIV requests are drained.  The caller is responsible
       * for ensuring that no new requests which need to be drained are queued.
       */
      static void __blk_drain_queue(struct request_queue *q, bool drain_all)
              __releases(q->queue_lock)
              __acquires(q->queue_lock)
      {
              int i;
      
              lockdep_assert_held(q->queue_lock);
      
              while (true) {
                      bool drain = false;
      
                      /*
                       * The caller might be trying to drain @q before its
                       * elevator is initialized.
                       */
                      if (q->elevator)
                              elv_drain_elevator(q);
      
                      blkcg_drain_queue(q);
      
                      /*
                       * This function might be called on a queue which failed
                       * driver init after queue creation or is not yet fully
                        * active.  Some drivers (e.g. fd and loop) get unhappy
                       * in such cases.  Kick queue iff dispatch queue has
                       * something on it and @q has request_fn set.
                       */
                      if (!list_empty(&q->queue_head) && q->request_fn)
                              __blk_run_queue(q);
      
                      drain |= q->nr_rqs_elvpriv;
                      drain |= q->request_fn_active;
      
                      /*
                       * Unfortunately, requests are queued at and tracked from
                       * multiple places and there's no single counter which can
                       * be drained.  Check all the queues and counters.
                       */
                      if (drain_all) {
                              struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
                              drain |= !list_empty(&q->queue_head);
                              for (i = 0; i < 2; i++) {
                                      drain |= q->nr_rqs[i];
                                      drain |= q->in_flight[i];
                                      if (fq)
                                          drain |= !list_empty(&fq->flush_queue[i]);
                              }
                      }
      
                      if (!drain)
                              break;
      
                      spin_unlock_irq(q->queue_lock);
      
                      msleep(10);
      
                      spin_lock_irq(q->queue_lock);
              }
      
              /*
               * With queue marked dead, any woken up waiter will fail the
               * allocation path, so the wakeup chaining is lost and we're
               * left with hung waiters. We need to wake up those waiters.
               */
              if (q->request_fn) {
                      struct request_list *rl;
      
                      blk_queue_for_each_rl(rl, q)
                              for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
                                      wake_up_all(&rl->wait[i]);
              }
      }
      
      /**
       * blk_queue_bypass_start - enter queue bypass mode
       * @q: queue of interest
       *
       * In bypass mode, only the dispatch FIFO queue of @q is used.  This
       * function makes @q enter bypass mode and drains all requests which were
       * throttled or issued before.  On return, it's guaranteed that no request
        * is being throttled or has ELVPRIV set and blk_queue_bypass() returns
        * %true inside queue or RCU read lock.
       */
      void blk_queue_bypass_start(struct request_queue *q)
      {
              spin_lock_irq(q->queue_lock);
              q->bypass_depth++;
              queue_flag_set(QUEUE_FLAG_BYPASS, q);
              spin_unlock_irq(q->queue_lock);
      
              /*
               * Queues start drained.  Skip actual draining till init is
                * complete.  This avoids lengthy delays during queue init which
               * can happen many times during boot.
               */
              if (blk_queue_init_done(q)) {
                      spin_lock_irq(q->queue_lock);
                      __blk_drain_queue(q, false);
                      spin_unlock_irq(q->queue_lock);
      
                      /* ensure blk_queue_bypass() is %true inside RCU read lock */
                      synchronize_rcu();
              }
      }
      EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
      
      /**
       * blk_queue_bypass_end - leave queue bypass mode
       * @q: queue of interest
       *
       * Leave bypass mode and restore the normal queueing behavior.
       */
      void blk_queue_bypass_end(struct request_queue *q)
      {
   23         spin_lock_irq(q->queue_lock);
              if (!--q->bypass_depth)
   23                 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
   23         WARN_ON_ONCE(q->bypass_depth < 0);
   23         spin_unlock_irq(q->queue_lock);
      }
      EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
      
      void blk_set_queue_dying(struct request_queue *q)
      {
   32         spin_lock_irq(q->queue_lock);
   32         queue_flag_set(QUEUE_FLAG_DYING, q);
              spin_unlock_irq(q->queue_lock);
      
              if (q->mq_ops)
   32                 blk_mq_wake_waiters(q);
              else {
                      struct request_list *rl;
      
                      blk_queue_for_each_rl(rl, q) {
                              if (rl->rq_pool) {
                                      wake_up_all(&rl->wait[BLK_RW_SYNC]);
                                      wake_up_all(&rl->wait[BLK_RW_ASYNC]);
                              }
                      }
              }
   32 }
      EXPORT_SYMBOL_GPL(blk_set_queue_dying);
      
      /**
       * blk_cleanup_queue - shutdown a request queue
       * @q: request queue to shutdown
       *
       * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
       * put it.  All future requests will be failed immediately with -ENODEV.
       */
      void blk_cleanup_queue(struct request_queue *q)
      {
   32         spinlock_t *lock = q->queue_lock;
      
              /* mark @q DYING, no new request or merges will be allowed afterwards */
              mutex_lock(&q->sysfs_lock);
              blk_set_queue_dying(q);
              spin_lock_irq(lock);
      
              /*
               * A dying queue is permanently in bypass mode till released.  Note
               * that, unlike blk_queue_bypass_start(), we aren't performing
               * synchronize_rcu() after entering bypass mode to avoid the delay
               * as some drivers create and destroy a lot of queues while
               * probing.  This is still safe because blk_release_queue() will be
               * called only after the queue refcnt drops to zero and nothing,
               * RCU or not, would be traversing the queue by then.
               */
              q->bypass_depth++;
   32         queue_flag_set(QUEUE_FLAG_BYPASS, q);
      
   32         queue_flag_set(QUEUE_FLAG_NOMERGES, q);
   32         queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
   32         queue_flag_set(QUEUE_FLAG_DYING, q);
              spin_unlock_irq(lock);
              mutex_unlock(&q->sysfs_lock);
      
              /*
                * Drain all requests queued before DYING marking. Set the DEAD flag
                * to prevent q->request_fn() from being invoked after draining has
                * finished.
               */
              blk_freeze_queue(q);
              spin_lock_irq(lock);
              if (!q->mq_ops)
                      __blk_drain_queue(q, true);
   32         queue_flag_set(QUEUE_FLAG_DEAD, q);
              spin_unlock_irq(lock);
      
              /* for synchronous bio-based driver finish in-flight integrity i/o */
              blk_flush_integrity();
      
              /* @q won't process any more request, flush async actions */
              del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
              blk_sync_queue(q);
      
              if (q->mq_ops)
   32                 blk_mq_free_queue(q);
   32         percpu_ref_exit(&q->q_usage_counter);
      
              spin_lock_irq(lock);
              if (q->queue_lock != &q->__queue_lock)
                      q->queue_lock = &q->__queue_lock;
   32         spin_unlock_irq(lock);
      
              bdi_unregister(&q->backing_dev_info);
      
              /* @q is and will stay empty, shutdown and put */
              blk_put_queue(q);
      }
      EXPORT_SYMBOL(blk_cleanup_queue);
      
      /* Allocate memory local to the request queue */
      static void *alloc_request_struct(gfp_t gfp_mask, void *data)
      {
              int nid = (int)(long)data;
  694         return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
      }
      
      static void free_request_struct(void *element, void *unused)
      {
  162         kmem_cache_free(request_cachep, element);
      }
      
      int blk_init_rl(struct request_list *rl, struct request_queue *q,
                      gfp_t gfp_mask)
      {
              if (unlikely(rl->rq_pool))
                      return 0;
      
              rl->q = q;
              rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
              rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
              init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
              init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
      
              rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
                                                free_request_struct,
                                                (void *)(long)q->node, gfp_mask,
                                                q->node);
              if (!rl->rq_pool)
                      return -ENOMEM;
      
              return 0;
      }
      
      void blk_exit_rl(struct request_list *rl)
      {
   32         if (rl->rq_pool)
                      mempool_destroy(rl->rq_pool);
   32 }
      
      struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
      {
              return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
      }
      EXPORT_SYMBOL(blk_alloc_queue);
      
      int blk_queue_enter(struct request_queue *q, gfp_t gfp)
  762 {
              while (true) {
  762                 if (percpu_ref_tryget_live(&q->q_usage_counter))
  761                         return 0;
      
    7                 if (!gfpflags_allow_blocking(gfp))
                              return -EBUSY;
      
    7                 wait_event(q->mq_freeze_wq,
                                 !atomic_read(&q->mq_freeze_depth) ||
                                 blk_queue_dying(q));
    7                 if (blk_queue_dying(q))
                              return -ENODEV;
              }
      }
      
      void blk_queue_exit(struct request_queue *q)
      {
  761         percpu_ref_put(&q->q_usage_counter);
  761 }
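
       /*
        * Usage sketch (illustrative only; sketch_issue() is a hypothetical
        * caller): work against a queue is bracketed by blk_queue_enter() and
        * blk_queue_exit(), so a frozen or dying queue is refused rather than
        * raced with.
        */
       static int sketch_issue(struct request_queue *q, gfp_t gfp)
       {
               int ret;

               ret = blk_queue_enter(q, gfp);  /* may sleep if @gfp allows blocking */
               if (ret)
                       return ret;             /* -EBUSY or -ENODEV */

               /* ... submit work against @q here ... */

               blk_queue_exit(q);
               return 0;
       }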
      
      static void blk_queue_usage_counter_release(struct percpu_ref *ref)
      {
              struct request_queue *q =
                      container_of(ref, struct request_queue, q_usage_counter);
      
              wake_up_all(&q->mq_freeze_wq);
      }
      
      struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
      {
              struct request_queue *q;
              int err;
      
   23         q = kmem_cache_alloc_node(blk_requestq_cachep,
                                      gfp_mask | __GFP_ZERO, node_id);
              if (!q)
                      return NULL;
      
   23         q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
              if (q->id < 0)
                      goto fail_q;
      
   23         q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
              if (!q->bio_split)
                      goto fail_id;
      
   23         q->backing_dev_info.ra_pages =
                              (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
              q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
              q->backing_dev_info.name = "block";
              q->node = node_id;
      
              err = bdi_init(&q->backing_dev_info);
              if (err)
                      goto fail_split;
      
   23         setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                          laptop_mode_timer_fn, (unsigned long) q);
              setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
              INIT_LIST_HEAD(&q->queue_head);
              INIT_LIST_HEAD(&q->timeout_list);
              INIT_LIST_HEAD(&q->icq_list);
      #ifdef CONFIG_BLK_CGROUP
              INIT_LIST_HEAD(&q->blkg_list);
      #endif
              INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
      
              kobject_init(&q->kobj, &blk_queue_ktype);
      
              mutex_init(&q->sysfs_lock);
              spin_lock_init(&q->__queue_lock);
      
              /*
               * By default initialize queue_lock to internal lock and driver can
               * override it later if need be.
               */
              q->queue_lock = &q->__queue_lock;
      
              /*
               * A queue starts its life with bypass turned on to avoid
               * unnecessary bypass on/off overhead and nasty surprises during
               * init.  The initial bypass will be finished when the queue is
               * registered by blk_register_queue().
               */
              q->bypass_depth = 1;
              __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
      
              init_waitqueue_head(&q->mq_freeze_wq);
      
              /*
               * Init percpu_ref in atomic mode so that it's faster to shutdown.
               * See blk_register_queue() for details.
               */
   23         if (percpu_ref_init(&q->q_usage_counter,
                                      blk_queue_usage_counter_release,
                                      PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                      goto fail_bdi;
      
              if (blkcg_init_queue(q))
                      goto fail_ref;
      
              return q;
      
      fail_ref:
              percpu_ref_exit(&q->q_usage_counter);
      fail_bdi:
              bdi_destroy(&q->backing_dev_info);
      fail_split:
              bioset_free(q->bio_split);
      fail_id:
              ida_simple_remove(&blk_queue_ida, q->id);
      fail_q:
              kmem_cache_free(blk_requestq_cachep, q);
              return NULL;
      }
      EXPORT_SYMBOL(blk_alloc_queue_node);
      
      /**
       * blk_init_queue  - prepare a request queue for use with a block device
       * @rfn:  The function to be called to process requests that have been
       *        placed on the queue.
       * @lock: Request queue spin lock
       *
       * Description:
       *    If a block device wishes to use the standard request handling procedures,
       *    which sorts requests and coalesces adjacent requests, then it must
       *    call blk_init_queue().  The function @rfn will be called when there
       *    are requests on the queue that need to be processed.  If the device
       *    supports plugging, then @rfn may not be called immediately when requests
       *    are available on the queue, but may be called at some time later instead.
       *    Plugged queues are generally unplugged when a buffer belonging to one
       *    of the requests on the queue is needed, or due to memory pressure.
       *
       *    @rfn is not required, or even expected, to remove all requests off the
       *    queue, but only as many as it can handle at a time.  If it does leave
       *    requests on the queue, it is responsible for arranging that the requests
       *    get dealt with eventually.
       *
       *    The queue spin lock must be held while manipulating the requests on the
       *    request queue; this lock will be taken also from interrupt context, so irq
       *    disabling is needed for it.
       *
       *    Function returns a pointer to the initialized request queue, or %NULL if
       *    it didn't succeed.
       *
       * Note:
       *    blk_init_queue() must be paired with a blk_cleanup_queue() call
       *    when the block device is deactivated (such as at module unload).
       **/
      
      struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
      {
              return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
      }
      EXPORT_SYMBOL(blk_init_queue);
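
       /*
        * Usage sketch (illustrative; all sketch_* names are hypothetical): a
        * minimal single-queue driver pairing blk_init_queue() with
        * blk_cleanup_queue() as the Note above requires.  The request_fn runs
        * with sketch_lock held and completes whatever it fetches.
        */
       static DEFINE_SPINLOCK(sketch_lock);

       static void sketch_request_fn(struct request_queue *q)
       {
               struct request *rq;

               while ((rq = blk_fetch_request(q)) != NULL)
                       __blk_end_request_all(rq, 0);   /* complete immediately */
       }

       static struct request_queue *sketch_create_queue(void)
       {
               return blk_init_queue(sketch_request_fn, &sketch_lock);
       }

       static void sketch_destroy_queue(struct request_queue *q)
       {
               blk_cleanup_queue(q);   /* paired with blk_init_queue() */
       }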
      
      struct request_queue *
      blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
      {
              struct request_queue *uninit_q, *q;
      
              uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
              if (!uninit_q)
                      return NULL;
      
              q = blk_init_allocated_queue(uninit_q, rfn, lock);
              if (!q)
                      blk_cleanup_queue(uninit_q);
      
              return q;
      }
      EXPORT_SYMBOL(blk_init_queue_node);
      
      static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
      
      struct request_queue *
      blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
                               spinlock_t *lock)
      {
              if (!q)
                      return NULL;
      
              q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
              if (!q->fq)
                      return NULL;
      
              if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                      goto fail;
      
              q->request_fn                = rfn;
              q->prep_rq_fn                = NULL;
              q->unprep_rq_fn                = NULL;
              q->queue_flags                |= QUEUE_FLAG_DEFAULT;
      
              /* Override internal queue lock with supplied lock pointer */
              if (lock)
                      q->queue_lock                = lock;
      
              /*
               * This also sets hw/phys segments, boundary and size
               */
              blk_queue_make_request(q, blk_queue_bio);
      
              q->sg_reserved_size = INT_MAX;
      
              /* Protect q->elevator from elevator_change */
              mutex_lock(&q->sysfs_lock);
      
              /* init elevator */
              if (elevator_init(q, NULL)) {
                      mutex_unlock(&q->sysfs_lock);
                      goto fail;
              }
      
              mutex_unlock(&q->sysfs_lock);
      
              return q;
      
      fail:
              blk_free_flush_queue(q->fq);
              q->fq = NULL;
              return NULL;
      }
      EXPORT_SYMBOL(blk_init_allocated_queue);
      
      bool blk_get_queue(struct request_queue *q)
   23 {
   23         if (likely(!blk_queue_dying(q))) {
   23                 __blk_get_queue(q);
                      return true;
              }
      
              return false;
      }
      EXPORT_SYMBOL(blk_get_queue);
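
       /*
        * Usage sketch (illustrative; sketch_use_queue() is hypothetical): pin a
        * queue before using it from a context that does not otherwise hold a
        * reference, and drop the reference with blk_put_queue() when done.
        */
       static int sketch_use_queue(struct request_queue *q)
       {
               if (!blk_get_queue(q))
                       return -ENXIO;          /* queue is dying, refuse */

               /* ... @q may be used safely here ... */

               blk_put_queue(q);
               return 0;
       }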
      
      static inline void blk_free_request(struct request_list *rl, struct request *rq)
  162 {
  162         if (rq->cmd_flags & REQ_ELVPRIV) {
                      elv_put_request(rl->q, rq);
  162                 if (rq->elv.icq)
                              put_io_context(rq->elv.icq->ioc);
              }
  162 
              mempool_free(rq, rl->rq_pool);
      }
      
      /*
       * ioc_batching returns true if the ioc is a valid batching request and
       * should be given priority access to a request.
       */
      static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
  694 {
              if (!ioc)
                      return 0;
      
              /*
               * Make sure the process is able to allocate at least 1 request
               * even if the batch times out, otherwise we could theoretically
               * lose wakeups.
  694          */
              return ioc->nr_batch_requests == q->nr_batching ||
    1                 (ioc->nr_batch_requests > 0
                      && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
      }
      
      /*
       * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
       * will cause the process to be a "batcher" on all queues in the system. This
       * is the behaviour we want though - once it gets a wakeup it should be given
       * a nice run.
       */
      static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
    1 {
              if (!ioc || ioc_batching(q, ioc))
                      return;
    1 
              ioc->nr_batch_requests = q->nr_batching;
              ioc->last_waited = jiffies;
      }
  162 
      static void __freed_request(struct request_list *rl, int sync)
  162 {
              struct request_queue *q = rl->q;
      
  162         if (rl->count[sync] < queue_congestion_off_threshold(q))
                      blk_clear_congested(rl, sync);
  162 
  162         if (rl->count[sync] + 1 <= q->nr_requests) {
                      if (waitqueue_active(&rl->wait[sync]))
                              wake_up(&rl->wait[sync]);
  162 
                      blk_clear_rl_full(rl, sync);
  162         }
      }
      
      /*
       * A request has just been released.  Account for it, update the full and
       * congestion status, wake up any waiters.   Called under q->queue_lock.
       */
      static void freed_request(struct request_list *rl, unsigned int flags)
  162 {
              struct request_queue *q = rl->q;
              int sync = rw_is_sync(flags);
      
              q->nr_rqs[sync]--;
              rl->count[sync]--;
  162         if (flags & REQ_ELVPRIV)
                      q->nr_rqs_elvpriv--;
  162 
              __freed_request(rl, sync);
      
              if (unlikely(rl->starved[sync ^ 1]))
  162                 __freed_request(rl, sync ^ 1);
      }
      
      int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
      {
              struct request_list *rl;
              int on_thresh, off_thresh;
      
              spin_lock_irq(q->queue_lock);
              q->nr_requests = nr;
              blk_queue_congestion_threshold(q);
              on_thresh = queue_congestion_on_threshold(q);
              off_thresh = queue_congestion_off_threshold(q);
      
              blk_queue_for_each_rl(rl, q) {
                      if (rl->count[BLK_RW_SYNC] >= on_thresh)
                              blk_set_congested(rl, BLK_RW_SYNC);
                      else if (rl->count[BLK_RW_SYNC] < off_thresh)
                              blk_clear_congested(rl, BLK_RW_SYNC);
      
                      if (rl->count[BLK_RW_ASYNC] >= on_thresh)
                              blk_set_congested(rl, BLK_RW_ASYNC);
                      else if (rl->count[BLK_RW_ASYNC] < off_thresh)
                              blk_clear_congested(rl, BLK_RW_ASYNC);
      
                      if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
                              blk_set_rl_full(rl, BLK_RW_SYNC);
                      } else {
                              blk_clear_rl_full(rl, BLK_RW_SYNC);
                              wake_up(&rl->wait[BLK_RW_SYNC]);
                      }
      
                      if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
                              blk_set_rl_full(rl, BLK_RW_ASYNC);
                      } else {
                              blk_clear_rl_full(rl, BLK_RW_ASYNC);
                              wake_up(&rl->wait[BLK_RW_ASYNC]);
                      }
              }
      
              spin_unlock_irq(q->queue_lock);
              return 0;
      }
      
      /*
       * Determine if elevator data should be initialized when allocating the
       * request associated with @bio.
       */
      static bool blk_rq_should_init_elevator(struct bio *bio)
      {
              if (!bio)
                      return true;
      
              /*
               * Flush requests do not use the elevator so skip initialization.
               * This allows a request to share the flush and elevator data.
  694          */
              if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
                      return false;
      
              return true;
      }
      
      /**
       * rq_ioc - determine io_context for request allocation
       * @bio: request being allocated is for this bio (can be %NULL)
       *
       * Determine io_context to use for request allocation for @bio.  May return
       * %NULL if %current->io_context doesn't exist.
       */
      static struct io_context *rq_ioc(struct bio *bio)
      {
      #ifdef CONFIG_BLK_CGROUP
              if (bio && bio->bi_ioc)
                      return bio->bi_ioc;
      #endif
              return current->io_context;
      }
      
      /**
       * __get_request - get a free request
       * @rl: request list to allocate from
       * @rw_flags: RW and SYNC flags
       * @bio: bio to allocate request for (can be %NULL)
       * @gfp_mask: allocation mask
       *
       * Get a free request from @q.  This function may fail under memory
       * pressure or if @q is dead.
       *
       * Must be called with @q->queue_lock held and,
       * Returns ERR_PTR on failure, with @q->queue_lock held.
       * Returns request pointer on success, with @q->queue_lock *not held*.
       */
      static struct request *__get_request(struct request_list *rl, int rw_flags,
                                           struct bio *bio, gfp_t gfp_mask)
  694 {
              struct request_queue *q = rl->q;
              struct request *rq;
              struct elevator_type *et = q->elevator->type;
              struct io_context *ioc = rq_ioc(bio);
              struct io_cq *icq = NULL;
              const bool is_sync = rw_is_sync(rw_flags) != 0;
              int may_queue;
      
              if (unlikely(blk_queue_dying(q)))
                      return ERR_PTR(-ENODEV);
  694 
              may_queue = elv_may_queue(q, rw_flags);
              if (may_queue == ELV_MQUEUE_NO)
                      goto rq_starved;
  694 
    1         if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
                      if (rl->count[is_sync]+1 >= q->nr_requests) {
                              /*
                               * The queue will fill after this allocation, so set
                               * it as full, and mark this process as "batching".
                               * This process will be allowed to complete a batch of
                               * requests, others will be blocked.
    1                          */
    1                         if (!blk_rl_full(rl, is_sync)) {
    1                                 ioc_set_batching(q, ioc);
                                      blk_set_rl_full(rl, is_sync);
    1                         } else {
    1                                 if (may_queue != ELV_MQUEUE_MUST
                                                      && !ioc_batching(q, ioc)) {
                                              /*
                                               * The queue is full and the allocating
                                               * process is not a "batcher", and not
                                               * exempted by the IO scheduler
                                               */
                                              return ERR_PTR(-ENOMEM);
                                      }
                              }
    1                 }
                      blk_set_congested(rl, is_sync);
              }
      
              /*
               * Only allow batching queuers to allocate up to 50% over the defined
               * limit of requests, otherwise we could have thousands of requests
               * allocated with any setting of ->nr_requests
  694          */
              if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
                      return ERR_PTR(-ENOMEM);
  694 
              q->nr_rqs[is_sync]++;
              rl->count[is_sync]++;
              rl->starved[is_sync] = 0;
      
              /*
               * Decide whether the new request will be managed by elevator.  If
               * so, mark @rw_flags and increment elvpriv.  Non-zero elvpriv will
               * prevent the current elevator from being destroyed until the new
               * request is freed.  This guarantees icq's won't be destroyed and
               * makes creating new ones safe.
               *
               * Also, lookup icq while holding queue_lock.  If it doesn't exist,
               * it will be created after releasing queue_lock.
  694          */
  694         if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
                      rw_flags |= REQ_ELVPRIV;
  694                 q->nr_rqs_elvpriv++;
  694                 if (et->icq_cache && ioc)
                              icq = ioc_lookup_icq(ioc, q);
              }
  694 
  694         if (blk_queue_io_stat(q))
  694                 rw_flags |= REQ_IO_STAT;
              spin_unlock_irq(q->queue_lock);
      
              /* allocate and init request */
              rq = mempool_alloc(rl->rq_pool, gfp_mask);
              if (!rq)
                      goto fail_alloc;
  694 
              blk_rq_init(q, rq);
              blk_rq_set_rl(rq, rl);
              rq->cmd_flags = rw_flags | REQ_ALLOCED;
      
              /* init elvpriv */
  694         if (rw_flags & REQ_ELVPRIV) {
  652                 if (unlikely(et->icq_cache && !icq)) {
  652                         if (ioc)
                                      icq = ioc_create_icq(ioc, q, gfp_mask);
                              if (!icq)
                                      goto fail_elvpriv;
                      }
  694 
                      rq->elv.icq = icq;
                      if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
                              goto fail_elvpriv;
      
                      /* @rq->elv.icq holds io_context until @rq is freed */
  694                 if (icq)
                              get_io_context(icq->ioc);
              }
      out:
              /*
               * ioc may be NULL here, and ioc_batching will be false. That's
                * OK: if the queue is under the request limit then requests need
               * not count toward the nr_batch_requests limit. There will always
               * be some limit enforced by BLK_BATCH_TIME.
  694          */
    1         if (ioc_batching(q, ioc))
                      ioc->nr_batch_requests--;
  694 
              trace_block_getrq(q, bio, rw_flags & 1);
              return rq;
      
      fail_elvpriv:
              /*
               * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
               * and may fail indefinitely under memory pressure and thus
               * shouldn't stall IO.  Treat this request as !elvpriv.  This will
                * disturb iosched and blkcg but weird is better than dead.
               */
              printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
                                 __func__, dev_name(q->backing_dev_info.dev));
      
              rq->cmd_flags &= ~REQ_ELVPRIV;
              rq->elv.icq = NULL;
      
              spin_lock_irq(q->queue_lock);
              q->nr_rqs_elvpriv--;
              spin_unlock_irq(q->queue_lock);
              goto out;
      
      fail_alloc:
              /*
               * Allocation failed presumably due to memory. Undo anything we
               * might have messed up.
               *
               * Allocating task should really be put onto the front of the wait
               * queue, but this is pretty rare.
               */
              spin_lock_irq(q->queue_lock);
              freed_request(rl, rw_flags);
      
              /*
                * In the very unlikely event that allocation failed and no
                * requests for this direction were pending, mark us starved so that
                * freeing of a request in the other direction will notice
                * us. Another possible fix would be to split the rq mempool into
                * READ and WRITE.
               */
      rq_starved:
              if (unlikely(rl->count[is_sync] == 0))
                      rl->starved[is_sync] = 1;
              return ERR_PTR(-ENOMEM);
      }
      
      /**
       * get_request - get a free request
       * @q: request_queue to allocate request from
       * @rw_flags: RW and SYNC flags
       * @bio: bio to allocate request for (can be %NULL)
       * @gfp_mask: allocation mask
       *
       * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
       * this function keeps retrying under memory pressure and fails iff @q is dead.
       *
       * Must be called with @q->queue_lock held and,
       * Returns ERR_PTR on failure, with @q->queue_lock held.
       * Returns request pointer on success, with @q->queue_lock *not held*.
       */
      static struct request *get_request(struct request_queue *q, int rw_flags,
                                         struct bio *bio, gfp_t gfp_mask)
  694 {
              const bool is_sync = rw_is_sync(rw_flags) != 0;
              DEFINE_WAIT(wait);
              struct request_list *rl;
              struct request *rq;
      
              rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
  694 retry:
  694         rq = __get_request(rl, rw_flags, bio, gfp_mask);
              if (!IS_ERR(rq))
                      return rq;
      
              if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
                      blk_put_rl(rl);
                      return rq;
              }
      
              /* wait on @rl and retry */
              prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
                                        TASK_UNINTERRUPTIBLE);
      
              trace_block_sleeprq(q, bio, rw_flags & 1);
      
              spin_unlock_irq(q->queue_lock);
              io_schedule();
      
              /*
               * After sleeping, we become a "batching" process and will be able
               * to allocate at least one request, and up to a big batch of them
               * for a small period time.  See ioc_batching, ioc_set_batching
     1          * for a small period of time.  See ioc_batching, ioc_set_batching
              ioc_set_batching(q, current->io_context);
    1 
              spin_lock_irq(q->queue_lock);
              finish_wait(&rl->wait[is_sync], &wait);
      
              goto retry;
      }
      
      static struct request *blk_old_get_request(struct request_queue *q, int rw,
                      gfp_t gfp_mask)
      {
              struct request *rq;
      
              BUG_ON(rw != READ && rw != WRITE);
      
              /* create ioc upfront */
              create_io_context(gfp_mask, q->node);
      
              spin_lock_irq(q->queue_lock);
              rq = get_request(q, rw, NULL, gfp_mask);
              if (IS_ERR(rq))
                      spin_unlock_irq(q->queue_lock);
              /* q->queue_lock is unlocked at this point */
      
              return rq;
      }
      
      struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
      {
              if (q->mq_ops)
                      return blk_mq_alloc_request(q, rw, gfp_mask, false);
              else
                      return blk_old_get_request(q, rw, gfp_mask);
      }
      EXPORT_SYMBOL(blk_get_request);
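
       /*
        * Usage sketch (illustrative; sketch_alloc_rq() is hypothetical): allocate
        * a request for later pass-through style use and release it again with
        * blk_put_request().  Filling in the command itself is omitted.
        */
       static int sketch_alloc_rq(struct request_queue *q)
       {
               struct request *rq;

               rq = blk_get_request(q, READ, GFP_KERNEL);
               if (IS_ERR(rq))
                       return PTR_ERR(rq);

               /* ... initialize and issue the request here ... */

               blk_put_request(rq);
               return 0;
       }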
      
      /**
       * blk_make_request - given a bio, allocate a corresponding struct request.
       * @q: target request queue
       * @bio:  The bio describing the memory mappings that will be submitted for IO.
       *        It may be a chained-bio properly constructed by block/bio layer.
       * @gfp_mask: gfp flags to be used for memory allocation
       *
        * blk_make_request is the parallel of generic_make_request for BLOCK_PC
        * type commands, where the struct request needs to be further initialized
        * by the caller. It is passed a &struct bio, which describes the memory
        * layout of the I/O transfer.
       *
        * The caller of blk_make_request must make sure that bi_io_vec
        * is set to describe the memory buffers, and that bio_data_dir() returns
        * the intended direction of the request (and that all bios in the passed
        * bio chain are set up accordingly).
       *
        * If called under non-sleepable conditions, the mapped bio buffers must not
        * need bouncing; allocate them with the appropriate masked or flagged
        * allocator suitable for the target device, otherwise the call to
        * blk_queue_bounce() will BUG.
       *
       * WARNING: When allocating/cloning a bio-chain, careful consideration should be
       * given to how you allocate bios. In particular, you cannot use
       * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
       * you risk waiting for IO completion of a bio that hasn't been submitted yet,
       * thus resulting in a deadlock. Alternatively bios should be allocated using
       * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
       * If possible a big IO should be split into smaller parts when allocation
       * fails. Partial allocation should not be an error, or you risk a live-lock.
       */
      struct request *blk_make_request(struct request_queue *q, struct bio *bio,
                                       gfp_t gfp_mask)
      {
              struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
      
              if (IS_ERR(rq))
                      return rq;
      
              blk_rq_set_block_pc(rq);
      
              for_each_bio(bio) {
                      struct bio *bounce_bio = bio;
                      int ret;
      
                      blk_queue_bounce(q, &bounce_bio);
                      ret = blk_rq_append_bio(q, rq, bounce_bio);
                      if (unlikely(ret)) {
                              blk_put_request(rq);
                              return ERR_PTR(ret);
                      }
              }
      
              return rq;
      }
      EXPORT_SYMBOL(blk_make_request);
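
/*
 * Illustrative sketch (not part of the original source): a caller that has
 * already built a bio chain describing the data buffers can wrap it in a
 * BLOCK_PC request and fire it off asynchronously.  my_cdb[] and my_done()
 * are hypothetical.
 *
 *        struct request *rq;
 *
 *        rq = blk_make_request(q, bio, GFP_KERNEL);
 *        if (IS_ERR(rq))
 *                return PTR_ERR(rq);
 *        memcpy(rq->cmd, my_cdb, sizeof(my_cdb));
 *        rq->cmd_len = sizeof(my_cdb);
 *        rq->timeout = 30 * HZ;
 *        blk_execute_rq_nowait(q, NULL, rq, 0, my_done);
 */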
      
      /**
       * blk_rq_set_block_pc - initialize a request to type BLOCK_PC
       * @rq:                request to be initialized
       *
       */
      void blk_rq_set_block_pc(struct request *rq)
      {
              rq->cmd_type = REQ_TYPE_BLOCK_PC;
              rq->__data_len = 0;
              rq->__sector = (sector_t) -1;
              rq->bio = rq->biotail = NULL;
              memset(rq->__cmd, 0, sizeof(rq->__cmd));
      }
      EXPORT_SYMBOL(blk_rq_set_block_pc);
      
      /**
       * blk_requeue_request - put a request back on queue
       * @q:                request queue where request should be inserted
       * @rq:                request to be inserted
       *
       * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more; when that condition happens we need to put the request back
 *    on the queue.  Must be called with the queue lock held.
       */
      void blk_requeue_request(struct request_queue *q, struct request *rq)
      {
              blk_delete_timer(rq);
              blk_clear_rq_complete(rq);
              trace_block_rq_requeue(q, rq);
      
              if (rq->cmd_flags & REQ_QUEUED)
                      blk_queue_end_tag(q, rq);
      
              BUG_ON(blk_queued_rq(rq));
      
              elv_requeue_request(q, rq);
      }
      EXPORT_SYMBOL(blk_requeue_request);
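
/*
 * Illustrative sketch (not part of the original source): a legacy
 * request_fn driver typically requeues when its hardware cannot accept
 * more work and stops the queue until the hardware drains.  my_hw_busy()
 * is a hypothetical helper; the queue lock is held inside request_fn.
 *
 *        static void my_request_fn(struct request_queue *q)
 *        {
 *                struct request *rq;
 *
 *                while ((rq = blk_fetch_request(q)) != NULL) {
 *                        if (my_hw_busy()) {
 *                                blk_requeue_request(q, rq);
 *                                blk_stop_queue(q);
 *                                break;
 *                        }
 *                        ... hand rq to the hardware ...
 *                }
 *        }
 */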
      
      static void add_acct_request(struct request_queue *q, struct request *rq,
                                   int where)
      {
              blk_account_io_start(rq, true);
              __elv_add_request(q, rq, where);
      }
      
      static void part_round_stats_single(int cpu, struct hd_struct *part,
                                          unsigned long now)
      {
              int inflight;
  761 
              if (now == part->stamp)
                      return;
  729 
              inflight = part_in_flight(part);
  497         if (inflight) {
                      __part_stat_add(cpu, part, time_in_queue,
                                      inflight * (now - part->stamp));
                      __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
  761         }
              part->stamp = now;
      }
      
      /**
       * part_round_stats() - Round off the performance stats on a struct disk_stats.
       * @cpu: cpu number for stats access
       * @part: target partition
       *
       * The average IO queue length and utilisation statistics are maintained
       * by observing the current state of the queue length and the amount of
 * time it has been in that state.
       *
       * Normally, that accounting is done on IO completion, but that can result
       * in more than a second's worth of IO being accounted for within any one
       * second, leading to >100% utilisation.  To deal with that, we call this
       * function to do a round-off before returning the results when reading
       * /proc/diskstats.  This accounts immediately for all queue usage up to
 * the current jiffies and restarts the counters.
       */
      void part_round_stats(int cpu, struct hd_struct *part)
  761 {
              unsigned long now = jiffies;
      
  694         if (part->partno)
  761                 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
              part_round_stats_single(cpu, part, now);
      }
      EXPORT_SYMBOL_GPL(part_round_stats);
      
      #ifdef CONFIG_PM
      static void blk_pm_put_request(struct request *rq)
  162 {
              if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
                      pm_runtime_mark_last_busy(rq->q->dev);
      }
      #else
      static inline void blk_pm_put_request(struct request *rq) {}
      #endif
      
      /*
       * queue lock must be held
  162  */
      void __blk_put_request(struct request_queue *q, struct request *req)
  162 {
              if (unlikely(!q))
                      return;
  162 
              if (q->mq_ops) {
                      blk_mq_free_request(req);
                      return;
              }
  162 
              blk_pm_put_request(req);
  162 
              elv_completed_request(q, req);
      
              /* this is a bio leak */
              WARN_ON(req->bio != NULL);
      
              /*
         * Request may not have originated from ll_rw_blk.  If not,
         * it didn't come out of our reserved rq pools.
  162          */
  162         if (req->cmd_flags & REQ_ALLOCED) {
                      unsigned int flags = req->cmd_flags;
                      struct request_list *rl = blk_rq_rl(req);
      
  162                 BUG_ON(!list_empty(&req->queuelist));
                      BUG_ON(ELV_ON_HASH(req));
  162 
                      blk_free_request(rl, req);
  162                 freed_request(rl, flags);
                      blk_put_rl(rl);
              }
      }
      EXPORT_SYMBOL_GPL(__blk_put_request);
      
      void blk_put_request(struct request *req)
      {
              struct request_queue *q = req->q;
      
              if (q->mq_ops)
                      blk_mq_free_request(req);
              else {
                      unsigned long flags;
      
                      spin_lock_irqsave(q->queue_lock, flags);
                      __blk_put_request(q, req);
                      spin_unlock_irqrestore(q->queue_lock, flags);
              }
      }
      EXPORT_SYMBOL(blk_put_request);
      
      /**
       * blk_add_request_payload - add a payload to a request
       * @rq: request to update
       * @page: page backing the payload
       * @len: length of the payload.
       *
 * This allows a block driver to later add a payload to an already
 * submitted request.  The driver needs to take care of freeing the payload
       * itself.
       *
       * Note that this is a quite horrible hack and nothing but handling of
       * discard requests should ever use it.
       */
      void blk_add_request_payload(struct request *rq, struct page *page,
                      unsigned int len)
      {
              struct bio *bio = rq->bio;
      
              bio->bi_io_vec->bv_page = page;
              bio->bi_io_vec->bv_offset = 0;
              bio->bi_io_vec->bv_len = len;
      
              bio->bi_iter.bi_size = len;
              bio->bi_vcnt = 1;
              bio->bi_phys_segments = 1;
      
              rq->__data_len = rq->resid_len = len;
              rq->nr_phys_segments = 1;
      }
      EXPORT_SYMBOL_GPL(blk_add_request_payload);
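
/*
 * Illustrative sketch (not part of the original source): a driver prepping
 * a discard command might attach a single zeroed page as the payload,
 * much like the SCSI disk driver does for UNMAP.  The 24-byte length and
 * the BLKPREP_DEFER fallback are only for illustration; the driver must
 * free the page itself on completion.
 *
 *        struct page *page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
 *
 *        if (!page)
 *                return BLKPREP_DEFER;
 *        ... fill the payload buffer through kmap_atomic(page) ...
 *        blk_add_request_payload(rq, page, 24);
 */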
      
      bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
                                  struct bio *bio)
  440 {
              const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
      
              if (!ll_back_merge_fn(q, req, bio))
                      return false;
  345 
              trace_block_bio_backmerge(q, req, bio);
  345 
              if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
                      blk_rq_set_mixed_merge(req);
  345 
              req->biotail->bi_next = bio;
              req->biotail = bio;
              req->__data_len += bio->bi_iter.bi_size;
              req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
      
  440         blk_account_io_start(req, false);
              return true;
      }
      
      bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
                                   struct bio *bio)
   22 {
              const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
      
              if (!ll_front_merge_fn(q, req, bio))
                      return false;
   21 
              trace_block_bio_frontmerge(q, req, bio);
   21 
              if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
                      blk_rq_set_mixed_merge(req);
   21 
              bio->bi_next = req->bio;
              req->bio = bio;
      
              req->__sector = bio->bi_iter.bi_sector;
              req->__data_len += bio->bi_iter.bi_size;
              req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
      
   22         blk_account_io_start(req, false);
              return true;
      }
      
      /**
       * blk_attempt_plug_merge - try to merge with %current's plugged list
       * @q: request_queue new bio is being queued at
       * @bio: new bio being queued
       * @request_count: out parameter for number of traversed plugged requests
       * @same_queue_rq: pointer to &struct request that gets filled in when
       * another request associated with @q is found on the plug list
       * (optional, may be %NULL)
       *
       * Determine whether @bio being queued on @q can be merged with a request
       * on %current's plugged list.  Returns %true if merge was successful,
       * otherwise %false.
       *
       * Plugging coalesces IOs from the same issuer for the same purpose without
       * going through @q->queue_lock.  As such it's more of an issuing mechanism
 * than scheduling, and the request, while it may have elvpriv data, is not
 * added to the elevator at this point.  In addition, we don't have
       * reliable access to the elevator outside queue lock.  Only check basic
       * merging parameters without querying the elevator.
       *
       * Caller must ensure !blk_queue_nomerges(q) beforehand.
       */
      bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                                  unsigned int *request_count,
                                  struct request **same_queue_rq)
      {
              struct blk_plug *plug;
              struct request *rq;
              bool ret = false;
              struct list_head *plug_list;
  694 
              plug = current->plug;
              if (!plug)
  652                 goto out;
              *request_count = 0;
      
              if (q->mq_ops)
                      plug_list = &plug->mq_list;
  652         else
                      plug_list = &plug->list;
  652 
              list_for_each_entry_reverse(rq, plug_list, queuelist) {
                      int el_ret;
  524 
  524                 if (rq->q == q) {
                              (*request_count)++;
                              /*
                         * Only the blk-mq multiple hardware queues case checks
                         * the rq in the same queue; there should be only one
                         * such rq in a queue.
                         */
                              if (same_queue_rq)
                                      *same_queue_rq = rq;
                      }
  524 
                      if (rq->q != q || !blk_rq_merge_ok(rq, bio))
                              continue;
  523 
                      el_ret = blk_try_merge(rq, bio);
  410                 if (el_ret == ELEVATOR_BACK_MERGE) {
                              ret = bio_attempt_back_merge(q, rq, bio);
                              if (ret)
  485                                 break;
    3                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
                              ret = bio_attempt_front_merge(q, rq, bio);
                              if (ret)
                                      break;
                      }
              }
  694 out:
              return ret;
      }
      
      unsigned int blk_plug_queued_count(struct request_queue *q)
      {
              struct blk_plug *plug;
              struct request *rq;
              struct list_head *plug_list;
              unsigned int ret = 0;
   67 
              plug = current->plug;
              if (!plug)
                      goto out;
   66 
   66         if (q->mq_ops)
                      plug_list = &plug->mq_list;
              else
                      plug_list = &plug->list;
   66 
   63         list_for_each_entry(rq, plug_list, queuelist) {
   63                 if (rq->q == q)
                              ret++;
              }
   67 out:
              return ret;
      }
      
      void init_request_from_bio(struct request *req, struct bio *bio)
  761 {
              req->cmd_type = REQ_TYPE_FS;
      
              req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
              if (bio->bi_rw & REQ_RAHEAD)
                      req->cmd_flags |= REQ_FAILFAST_MASK;
  761 
              req->errors = 0;
              req->__sector = bio->bi_iter.bi_sector;
              req->ioprio = bio_prio(bio);
              blk_rq_bio_prep(req->q, req, bio);
      }
      
      static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
  694 {
              const bool sync = !!(bio->bi_rw & REQ_SYNC);
              struct blk_plug *plug;
              int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
              struct request *req;
              unsigned int request_count = 0;
      
              /*
         * The low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (i.e. for highmem, or even
         * ISA DMA in theory).
               */
              blk_queue_bounce(q, &bio);
      
              blk_queue_split(q, &bio, q->bio_split);
      
              if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
                      bio->bi_error = -EIO;
                      bio_endio(bio);
                      return BLK_QC_T_NONE;
              }
      
  379         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
                      spin_lock_irq(q->queue_lock);
                      where = ELEVATOR_INSERT_FLUSH;
                      goto get_rq;
              }
      
              /*
               * Check if we can merge with the plugged list before grabbing
               * any locks.
  694          */
  694         if (!blk_queue_nomerges(q)) {
                      if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
                              return BLK_QC_T_NONE;
              } else
                      request_count = blk_plug_queued_count(q);
  694 
              spin_lock_irq(q->queue_lock);
      
              el_ret = elv_merge(q, &req, bio);
   90         if (el_ret == ELEVATOR_BACK_MERGE) {
   90                 if (bio_attempt_back_merge(q, req, bio)) {
                              elv_bio_merged(q, req, bio);
   90                         if (!attempt_back_merge(q, req))
                                      elv_merged_request(q, req, el_ret);
                              goto out_unlock;
  694                 }
   19         } else if (el_ret == ELEVATOR_FRONT_MERGE) {
   19                 if (bio_attempt_front_merge(q, req, bio)) {
                              elv_bio_merged(q, req, bio);
   18                         if (!attempt_front_merge(q, req))
                                      elv_merged_request(q, req, el_ret);
                              goto out_unlock;
                      }
              }
      
      get_rq:
              /*
               * This sync check and mask will be re-done in init_request_from_bio(),
               * but we need to set it earlier to expose the sync flag to the
               * rq allocator and io schedulers.
  694          */
              rw_flags = bio_data_dir(bio);
  672         if (sync)
                      rw_flags |= REQ_SYNC;
      
              /*
         * Grab a free request.  This might sleep but cannot fail.
               * Returns with the queue unlocked.
  694          */
              req = get_request(q, rw_flags, bio, GFP_NOIO);
              if (IS_ERR(req)) {
                      bio->bi_error = PTR_ERR(req);
                      bio_endio(bio);
                      goto out_unlock;
              }
      
              /*
               * After dropping the lock and possibly sleeping here, our request
               * may now be mergeable after it had proven unmergeable (above).
               * We don't worry about that case for efficiency. It won't happen
               * often, and the elevators are able to handle it.
  694          */
              init_request_from_bio(req, bio);
      
  694         if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
                      req->cpu = raw_smp_processor_id();
  694 
              plug = current->plug;
              if (plug) {
                      /*
                 * If this is the first request added after a plug, fire
                 * off a plug trace.
  652                  */
  648                 if (!request_count)
                              trace_block_plug(q);
  494                 else {
   21                         if (request_count >= BLK_MAX_REQUEST_COUNT) {
  652                                 blk_flush_plug_list(plug, false);
                                      trace_block_plug(q);
                              }
  652                 }
  694                 list_add_tail(&req->queuelist, &plug->list);
                      blk_account_io_start(req, true);
  555         } else {
                      spin_lock_irq(q->queue_lock);
                      add_acct_request(q, req, where);
                      __blk_run_queue(q);
  557 out_unlock:
                      spin_unlock_irq(q->queue_lock);
              }
      
              return BLK_QC_T_NONE;
      }
      
      /*
 * If bio->bi_bdev is a partition, remap the location.
       */
      static inline void blk_partition_remap(struct bio *bio)
  762 {
              struct block_device *bdev = bio->bi_bdev;
  763 
  695         if (bio_sectors(bio) && bdev != bdev->bd_contains) {
                      struct hd_struct *p = bdev->bd_part;
      
                      bio->bi_iter.bi_sector += p->start_sect;
                      bio->bi_bdev = bdev->bd_contains;
  695 
                      trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
                                            bdev->bd_dev,
                                            bio->bi_iter.bi_sector - p->start_sect);
              }
      }
      
      static void handle_bad_sector(struct bio *bio)
      {
              char b[BDEVNAME_SIZE];
      
              printk(KERN_INFO "attempt to access beyond end of device\n");
              printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
                              bdevname(bio->bi_bdev, b),
                              bio->bi_rw,
                              (unsigned long long)bio_end_sector(bio),
                              (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
      }
      
      #ifdef CONFIG_FAIL_MAKE_REQUEST
      
      static DECLARE_FAULT_ATTR(fail_make_request);
      
      static int __init setup_fail_make_request(char *str)
      {
              return setup_fault_attr(&fail_make_request, str);
      }
      __setup("fail_make_request=", setup_fail_make_request);
      
      static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
      {
              return part->make_it_fail && should_fail(&fail_make_request, bytes);
      }
      
      static int __init fail_make_request_debugfs(void)
      {
              struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
                                                      NULL, &fail_make_request);
      
              return PTR_ERR_OR_ZERO(dir);
      }
      
      late_initcall(fail_make_request_debugfs);
      
      #else /* CONFIG_FAIL_MAKE_REQUEST */
      
      static inline bool should_fail_request(struct hd_struct *part,
                                              unsigned int bytes)
      {
              return false;
      }
      
      #endif /* CONFIG_FAIL_MAKE_REQUEST */
      
      /*
       * Check whether this bio extends beyond the end of the device.
       */
      static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
      {
              sector_t maxsector;
  763 
              if (!nr_sectors)
                      return 0;
      
  762         /* Test device or partition size, when known. */
              maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
  762         if (maxsector) {
                      sector_t sector = bio->bi_iter.bi_sector;
  762 
                      if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
                              /*
                               * This may well happen - the kernel calls bread()
                               * without checking the size of the device, e.g., when
                               * mounting a device.
                               */
                              handle_bad_sector(bio);
                              return 1;
                      }
              }
      
              return 0;
      }
      
      static noinline_for_stack bool
      generic_make_request_checks(struct bio *bio)
      {
  763         struct request_queue *q;
              int nr_sectors = bio_sectors(bio);
              int err = -EIO;
              char b[BDEVNAME_SIZE];
              struct hd_struct *part;
      
              might_sleep();
  762 
              if (bio_check_eod(bio, nr_sectors))
                      goto end_io;
  763 
              q = bdev_get_queue(bio->bi_bdev);
              if (unlikely(!q)) {
                      printk(KERN_ERR
                             "generic_make_request: Trying to access "
                              "nonexistent block-device %s (%Lu)\n",
                              bdevname(bio->bi_bdev, b),
                              (long long) bio->bi_iter.bi_sector);
                      goto end_io;
              }
  763 
              part = bio->bi_bdev->bd_part;
  763         if (should_fail_request(part, bio->bi_iter.bi_size) ||
                  should_fail_request(&part_to_disk(part)->part0,
                                      bio->bi_iter.bi_size))
                      goto end_io;
      
              /*
               * If this device has partitions, remap block n
               * of partition p to block n+start(p) of the disk.
  763          */
              blk_partition_remap(bio);
  763 
              if (bio_check_eod(bio, nr_sectors))
                      goto end_io;
      
              /*
         * Filter flush bios early so that make_request-based
               * drivers without flush support don't have to worry
               * about them.
  763          */
    1         if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
                      bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
                      if (!nr_sectors) {
                              err = 0;
                              goto end_io;
                      }
              }
  762 
    1         if ((bio->bi_rw & REQ_DISCARD) &&
    1             (!blk_queue_discard(q) ||
                   ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
                      err = -EOPNOTSUPP;
                      goto end_io;
              }
  762 
              if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
                      err = -EOPNOTSUPP;
                      goto end_io;
              }
      
              /*
               * Various block parts want %current->io_context and lazy ioc
               * allocation ends up trading a lot of pain for a small amount of
         * memory.  Just allocate it upfront.  This may fail and the block
         * layer knows how to live with it.
  762          */
              create_io_context(GFP_ATOMIC, q->node);
      
              if (!blkcg_bio_issue_check(q, bio))
                      return false;
  762 
  436         trace_block_bio_queue(q, bio);
              return true;
      
    1 end_io:
              bio->bi_error = err;
  763         bio_endio(bio);
              return false;
      }
      
      /**
       * generic_make_request - hand a buffer to its device driver for I/O
       * @bio:  The bio describing the location in memory and on the device.
       *
       * generic_make_request() is used to make I/O requests of block
       * devices. It is passed a &struct bio, which describes the I/O that needs
       * to be done.
       *
 * generic_make_request() does not return the I/O status.  The
 * success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the bio->bi_end_io
 * function described (one day) elsewhere.
       *
 * The caller of generic_make_request must make sure that bi_io_vec
 * is set to describe the memory buffer, that bi_bdev and bi_iter.bi_sector
 * are set to describe the device address, and that bi_end_io and
 * optionally bi_private are set to describe how completion notification
 * should be signaled.
       *
       * generic_make_request and the drivers it calls may use bi_next if this
       * bio happens to be merged with someone else, and may resubmit the bio to
       * a lower device by calling into generic_make_request recursively, which
       * means the bio should NOT be touched after the call to ->make_request_fn.
  762  */
      blk_qc_t generic_make_request(struct bio *bio)
      {
              /*
               * bio_list_on_stack[0] contains bios submitted by the current
               * make_request_fn.
               * bio_list_on_stack[1] contains bios that were submitted before
               * the current make_request_fn, but that haven't been processed
               * yet.
               */
              struct bio_list bio_list_on_stack[2];
              blk_qc_t ret = BLK_QC_T_NONE;
  763 
              if (!generic_make_request_checks(bio))
                      goto out;
      
              /*
               * We only want one ->make_request_fn to be active at a time, else
               * stack usage with stacked devices could be a problem.  So use
         * current->bio_list to keep a list of requests submitted by a
               * make_request_fn function.  current->bio_list is also used as a
               * flag to say if generic_make_request is currently active in this
               * task or not.  If it is NULL, then no make_request is active.  If
               * it is non-NULL, then a make_request is active, and new requests
               * should be added at the tail
  762          */
  139         if (current->bio_list) {
                      bio_list_add(&current->bio_list[0], bio);
                      goto out;
              }
      
              /* following loop may be a bit non-obvious, and so deserves some
               * explanation.
               * Before entering the loop, bio->bi_next is NULL (as all callers
               * ensure that) so we have a list with a single bio.
               * We pretend that we have just taken it off a longer list, so
               * we assign bio_list to a pointer to the bio_list_on_stack,
               * thus initialising the bio_list of new bios to be
               * added.  ->make_request() may indeed add some more bios
               * through a recursive call to generic_make_request.  If it
               * did, we find a non-NULL value in bio_list and re-enter the loop
               * from the top.  In this case we really did just take the bio
         * off the top of the list (no pretending) and so remove it from
               * bio_list, and call into ->make_request() again.
  762          */
  762         BUG_ON(bio->bi_next);
              bio_list_init(&bio_list_on_stack[0]);
              current->bio_list = bio_list_on_stack;
  762         do {
                      struct request_queue *q = bdev_get_queue(bio->bi_bdev);
      
                      if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
                              struct bio_list lower, same;
      
  761                         /* Create a fresh bio_list for all subordinate requests */
                              bio_list_on_stack[1] = bio_list_on_stack[0];
                              bio_list_init(&bio_list_on_stack[0]);
      
                              ret = q->make_request_fn(q, bio);
      
                              blk_queue_exit(q);
                              /* sort new bios into those for a lower level
                               * and those for the same level
                               */
                              bio_list_init(&lower);
  139                         bio_list_init(&same);
                              while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
  139                                 if (q == bdev_get_queue(bio->bi_bdev))
                                              bio_list_add(&same, bio);
                                      else
                                              bio_list_add(&lower, bio);
  139                         /* now assemble so we handle the lowest level first */
  139                         bio_list_merge(&bio_list_on_stack[0], &lower);
  761                         bio_list_merge(&bio_list_on_stack[0], &same);
                              bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
                      } else {
                              bio_io_error(bio);
  761                 }
                      bio = bio_list_pop(&bio_list_on_stack[0]);
  760         } while (bio);
              current->bio_list = NULL; /* deactivate */
      
  762 out:
              return ret;
      }
      EXPORT_SYMBOL(generic_make_request);
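
/*
 * Illustrative sketch (not part of the original source): a bio-based
 * stacking driver (md/dm style) remaps a bio and resubmits it to the
 * lower device; the recursion is flattened by the loop above.  lower_bdev
 * and my_offset are hypothetical.
 *
 *        bio->bi_bdev = lower_bdev;
 *        bio->bi_iter.bi_sector += my_offset;
 *        generic_make_request(bio);
 */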
      
      /**
       * submit_bio - submit a bio to the block device layer for I/O
       * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
       * @bio: The &struct bio which describes the I/O
       *
       * submit_bio() is very similar in purpose to generic_make_request(), and
       * uses that function to do most of the work. Both are fairly rough
 * interfaces; @bio must be set up and ready for I/O.
       *
       */
      blk_qc_t submit_bio(int rw, struct bio *bio)
  763 {
              bio->bi_rw |= rw;
      
              /*
               * If it's a regular read/write or a barrier with data attached,
               * go through the normal accounting stuff before submission.
  763          */
              if (bio_has_data(bio)) {
                      unsigned int count;
  761 
    7                 if (unlikely(rw & REQ_WRITE_SAME))
                              count = bdev_logical_block_size(bio->bi_bdev) >> 9;
  761                 else
                              count = bio_sectors(bio);
  761 
                      if (rw & WRITE) {
                              count_vm_events(PGPGOUT, count);
  386                 } else {
                              task_io_account_read(bio->bi_iter.bi_size);
                              count_vm_events(PGPGIN, count);
                      }
  750 
                      if (unlikely(block_dump)) {
                              char b[BDEVNAME_SIZE];
                              printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                              current->comm, task_pid_nr(current),
                                      (rw & WRITE) ? "WRITE" : "READ",
                                      (unsigned long long)bio->bi_iter.bi_sector,
                                      bdevname(bio->bi_bdev, b),
                                      count);
                      }
              }
  763 
              return generic_make_request(bio);
      }
      EXPORT_SYMBOL(submit_bio);
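
/*
 * Illustrative sketch (not part of the original source): reading a single
 * page from a block device.  my_end_io() is a hypothetical bi_end_io
 * callback that wakes up whoever is waiting on bi_private.
 *
 *        struct bio *bio = bio_alloc(GFP_NOIO, 1);
 *
 *        bio->bi_bdev = bdev;
 *        bio->bi_iter.bi_sector = sector;
 *        bio_add_page(bio, page, PAGE_SIZE, 0);
 *        bio->bi_end_io = my_end_io;
 *        bio->bi_private = &done;
 *        submit_bio(READ, bio);
 */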
      
      /**
 * blk_cloned_rq_check_limits - Helper function to check a cloned request
 *                              against the new queue limits
       * @q:  the queue
       * @rq: the request being checked
       *
       * Description:
       *    @rq may have been made based on weaker limitations of upper-level queues
       *    in request stacking drivers, and it may violate the limitation of @q.
       *    Since the block layer and the underlying device driver trust @rq
       *    after it is inserted to @q, it should be checked against @q before
       *    the insertion using this generic function.
       *
       *    Request stacking drivers like request-based dm may change the queue
       *    limits when retrying requests on other queues. Those requests need
       *    to be checked against the new queue limits again during dispatch.
       */
      static int blk_cloned_rq_check_limits(struct request_queue *q,
                                            struct request *rq)
      {
              if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
                      printk(KERN_ERR "%s: over max size limit.\n", __func__);
                      return -EIO;
              }
      
              /*
               * queue's settings related to segment counting like q->bounce_pfn
         * may differ from those of other stacking queues.
         * Recalculate it to check the request correctly against this
         * queue's limits.
               */
              blk_recalc_rq_segments(rq);
              if (rq->nr_phys_segments > queue_max_segments(q)) {
                      printk(KERN_ERR "%s: over max segments limit.\n", __func__);
                      return -EIO;
              }
      
              return 0;
      }
      
      /**
       * blk_insert_cloned_request - Helper for stacking drivers to submit a request
       * @q:  the queue to submit the request
       * @rq: the request being queued
       */
      int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
      {
              unsigned long flags;
              int where = ELEVATOR_INSERT_BACK;
      
              if (blk_cloned_rq_check_limits(q, rq))
                      return -EIO;
      
              if (rq->rq_disk &&
                  should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
                      return -EIO;
      
              if (q->mq_ops) {
                      if (blk_queue_io_stat(q))
                              blk_account_io_start(rq, true);
                      blk_mq_insert_request(rq, false, true, false);
                      return 0;
              }
      
              spin_lock_irqsave(q->queue_lock, flags);
              if (unlikely(blk_queue_dying(q))) {
                      spin_unlock_irqrestore(q->queue_lock, flags);
                      return -ENODEV;
              }
      
              /*
         * The submitted request must be dequeued before calling this
         * function because it will be linked to another request_queue.
               */
              BUG_ON(blk_queued_rq(rq));
      
              if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
                      where = ELEVATOR_INSERT_FLUSH;
      
              add_acct_request(q, rq, where);
              if (where == ELEVATOR_INSERT_FLUSH)
                      __blk_run_queue(q);
              spin_unlock_irqrestore(q->queue_lock, flags);
      
              return 0;
      }
      EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
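
/*
 * Illustrative sketch (not part of the original source): a request-based
 * stacking driver dispatching a prepared clone to the underlying queue.
 * lower_q and clone are hypothetical names.
 *
 *        int ret = blk_insert_cloned_request(lower_q, clone);
 *
 *        if (ret == -EIO)
 *                ... fail the original request ...
 *        else if (ret)
 *                ... requeue the original and retry later ...
 */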
      
      /**
       * blk_rq_err_bytes - determine number of bytes till the next failure boundary
       * @rq: request to examine
       *
       * Description:
 *     A request could be a merge of IOs which require different failure
 *     handling.  This function determines the number of bytes which
 *     can be failed from the beginning of the request without
 *     crossing into an area which needs to be retried further.
       *
       * Return:
       *     The number of bytes to fail.
       *
       * Context:
       *     queue_lock must be held.
       */
      unsigned int blk_rq_err_bytes(const struct request *rq)
      {
              unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
              unsigned int bytes = 0;
              struct bio *bio;
      
              if (!(rq->cmd_flags & REQ_MIXED_MERGE))
                      return blk_rq_bytes(rq);
      
              /*
               * Currently the only 'mixing' which can happen is between
         * different failfast types.  We can safely fail portions
               * which have all the failfast bits that the first one has -
               * the ones which are at least as eager to fail as the first
               * one.
               */
              for (bio = rq->bio; bio; bio = bio->bi_next) {
                      if ((bio->bi_rw & ff) != ff)
                              break;
                      bytes += bio->bi_iter.bi_size;
              }
      
              /* this could lead to infinite loop */
              BUG_ON(blk_rq_bytes(rq) && !bytes);
              return bytes;
      }
      EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
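
/*
 * Illustrative sketch (not part of the original source): failing only the
 * leading portion of a mixed-merge request that shares the first bio's
 * failfast policy, leaving the rest to be retried (the same shape as the
 * blk_end_request_err() helper).  Called with the queue lock held.
 *
 *        unsigned int bytes = blk_rq_err_bytes(rq);
 *
 *        if (__blk_end_request(rq, -EIO, bytes))
 *                ... the remainder of rq is still pending and will be retried ...
 */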
      
      void blk_account_io_completion(struct request *req, unsigned int bytes)
    1 {
    1         if (blk_do_io_stat(req)) {
                      const int rw = rq_data_dir(req);
                      struct hd_struct *part;
                      int cpu;
    1 
                      cpu = part_stat_lock();
                      part = req->part;
    1                 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                      part_stat_unlock();
    1         }
      }
      
      void blk_account_io_done(struct request *req)
      {
              /*
               * Account IO completion.  flush_rq isn't accounted as a
         * normal IO on queueing or completion.  Accounting the
               * containing request is enough.
    1          */
    1         if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
                      unsigned long duration = jiffies - req->start_time;
                      const int rw = rq_data_dir(req);
                      struct hd_struct *part;
                      int cpu;
    1 
                      cpu = part_stat_lock();
                      part = req->part;
      
    1                 part_stat_inc(cpu, part, ios[rw]);
    1                 part_stat_add(cpu, part, ticks[rw], duration);
                      part_round_stats(cpu, part);
                      part_dec_in_flight(part, rw);
    1 
    1                 hd_struct_put(part);
                      part_stat_unlock();
    1         }
      }
      
      #ifdef CONFIG_PM
      /*
       * Don't process normal requests when queue is suspended
       * or in the process of suspending/resuming
       */
      static struct request *blk_pm_peek_request(struct request_queue *q,
                                                 struct request *rq)
  645 {
              if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
                  (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
                      return NULL;
              else
                      return rq;
      }
      #else
      static inline struct request *blk_pm_peek_request(struct request_queue *q,
                                                        struct request *rq)
      {
              return rq;
      }
      #endif
      
      void blk_account_io_start(struct request *rq, bool new_io)
      {
  761         struct hd_struct *part;
              int rw = rq_data_dir(rq);
              int cpu;
  761 
              if (!blk_do_io_stat(rq))
                      return;
  761 
              cpu = part_stat_lock();
      
  348         if (!new_io) {
  348                 part = rq->part;
                      part_stat_inc(cpu, part, merges[rw]);
  761         } else {
  761                 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
                      if (!hd_struct_try_get(part)) {
                              /*
                               * The partition is already being removed,
                               * the request will be accounted on the disk only
                               *
                               * We take a reference on disk->part0 although that
                               * partition will never be deleted, so we can treat
                               * it as any other partition.
                               */
                              part = &rq->rq_disk->part0;
                              hd_struct_get(part);
  761                 }
  694                 part_round_stats(cpu, part);
  761                 part_inc_in_flight(part, rw);
                      rq->part = part;
              }
  761 
              part_stat_unlock();
      }
      
      /**
       * blk_peek_request - peek at the top of a request queue
       * @q: request queue to peek at
       *
       * Description:
       *     Return the request at the top of @q.  The returned request
       *     should be started using blk_start_request() before LLD starts
       *     processing it.
       *
       * Return:
       *     Pointer to the request at the top of @q if available.  Null
       *     otherwise.
       *
       * Context:
       *     queue_lock must be held.
       */
  650 struct request *blk_peek_request(struct request_queue *q)
      {
              struct request *rq;
              int ret;
  650 
              while ((rq = __elv_next_request(q)) != NULL) {
  645 
                      rq = blk_pm_peek_request(q, rq);
                      if (!rq)
                              break;
  645 
                      if (!(rq->cmd_flags & REQ_STARTED)) {
                              /*
                               * This is the first time the device driver
                               * sees this request (possibly after
                               * requeueing).  Notify IO scheduler.
  645                          */
  645                         if (rq->cmd_flags & REQ_SORTED)
                                      elv_activate_rq(q, rq);
      
                              /*
                        * Just mark it as started even if we don't start
                        * it; a request that has been delayed should
                        * not be passed by new incoming requests.
  645                          */
  645                         rq->cmd_flags |= REQ_STARTED;
                              trace_block_rq_issue(q, rq);
                      }
  645 
  645                 if (!q->boundary_rq || q->boundary_rq == rq) {
                              q->end_sector = rq_end_sector(rq);
                              q->boundary_rq = NULL;
                      }
  645 
                      if (rq->cmd_flags & REQ_DONTPREP)
                              break;
  645 
                      if (q->dma_drain_size && blk_rq_bytes(rq)) {
                              /*
                                * Make sure space for the drain appears; we
                                * know we can do this because max_hw_segments
                                * has been adjusted to be one fewer than the
                                * device can handle.
                               */
                              rq->nr_phys_segments++;
                      }
  645 
                      if (!q->prep_rq_fn)
                              break;
  645 
                      ret = q->prep_rq_fn(q, rq);
                      if (ret == BLKPREP_OK) {
                              break;
                      } else if (ret == BLKPREP_DEFER) {
                              /*
                               * the request may have been (partially) prepped.
                                * We need to keep this request at the front to
                               * avoid resource deadlock.  REQ_STARTED will
                               * prevent other fs requests from passing this one.
                               */
                              if (q->dma_drain_size && blk_rq_bytes(rq) &&
                                  !(rq->cmd_flags & REQ_DONTPREP)) {
                                      /*
                                       * remove the space for the drain we added
                                       * so that we don't add it again
                                       */
                                      --rq->nr_phys_segments;
                              }
      
                              rq = NULL;
                              break;
                      } else if (ret == BLKPREP_KILL) {
                              rq->cmd_flags |= REQ_QUIET;
                              /*
                               * Mark this request as started so we don't trigger
                               * any debug logic in the end I/O path.
                               */
                              blk_start_request(rq);
                              __blk_end_request_all(rq, -EIO);
                      } else {
                              printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
                              break;
                      }
              }
  650 
              return rq;
      }
      EXPORT_SYMBOL(blk_peek_request);
      
      void blk_dequeue_request(struct request *rq)
  645 {
              struct request_queue *q = rq->q;
  645 
  645         BUG_ON(list_empty(&rq->queuelist));
              BUG_ON(ELV_ON_HASH(rq));
  645 
              list_del_init(&rq->queuelist);
      
              /*
         * the time frame between a request being removed from the lists
         * and when it is freed is accounted as IO that is in progress on
         * the driver side.
  645          */
  645         if (blk_account_rq(rq)) {
                      q->in_flight[rq_is_sync(rq)]++;
                      set_io_start_time_ns(rq);
  645         }
      }
      
      /**
       * blk_start_request - start request processing on the driver
       * @req: request to dequeue
       *
       * Description:
       *     Dequeue @req and start timeout timer on it.  This hands off the
       *     request to the driver.
       *
       *     Block internal functions which don't want to start timer should
       *     call blk_dequeue_request().
       *
       * Context:
       *     queue_lock must be held.
       */
      void blk_start_request(struct request *req)
  645 {
              blk_dequeue_request(req);
      
              /*
         * We are now handing the request to the hardware; initialize
         * resid_len to the full count and add the timeout handler.
               */
              req->resid_len = blk_rq_bytes(req);
              if (unlikely(blk_bidi_rq(req)))
                      req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
  645 
  645         BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
              blk_add_timer(req);
      }
      EXPORT_SYMBOL(blk_start_request);
      
      /**
       * blk_fetch_request - fetch a request from a request queue
       * @q: request queue to fetch a request from
       *
       * Description:
       *     Return the request at the top of @q.  The request is started on
       *     return and LLD can start processing it immediately.
       *
       * Return:
       *     Pointer to the request at the top of @q if available.  Null
       *     otherwise.
       *
       * Context:
       *     queue_lock must be held.
       */
      struct request *blk_fetch_request(struct request_queue *q)
      {
              struct request *rq;
      
              rq = blk_peek_request(q);
              if (rq)
                      blk_start_request(rq);
              return rq;
      }
      EXPORT_SYMBOL(blk_fetch_request);
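
/*
 * Illustrative sketch (not part of the original source): a driver that
 * wants to inspect the next request before committing to it can open-code
 * the peek/start pair instead of using blk_fetch_request().
 * my_queue_full() is hypothetical.
 *
 *        rq = blk_peek_request(q);
 *        if (!rq || my_queue_full())
 *                return;
 *        blk_start_request(rq);
 *        ... hand rq to the hardware ...
 */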
      
      /**
       * blk_update_request - Special helper function for request stacking drivers
       * @req:      the request being processed
       * @error:    %0 for success, < %0 for error
       * @nr_bytes: number of bytes to complete @req
       *
       * Description:
       *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req has no data left over.
 *     If @req has data left over, it is set up for the next range of segments.
       *
       *     This special helper function is only for request stacking drivers
       *     (e.g. request-based dm) so that they can handle partial completion.
       *     Actual device drivers should use blk_end_request instead.
       *
       *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
       *     %false return from this function.
       *
       * Return:
       *     %false - this request doesn't have any more data
       *     %true  - this request has more data
       **/
      bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
      {
              int total_bytes;
    1 
              trace_block_rq_complete(req->q, req, nr_bytes);
    1 
              if (!req->bio)
                      return false;
      
              /*
         * For fs requests, rq is just a carrier of independent bios
               * and each partial completion should be handled separately.
               * Reset per-request error on each partial completion.
               *
               * TODO: tj: This is too subtle.  It would be better to let
               * low level drivers do what they see fit.
    1          */
    1         if (req->cmd_type == REQ_TYPE_FS)
                      req->errors = 0;
      
    1         if (error && req->cmd_type == REQ_TYPE_FS &&
                  !(req->cmd_flags & REQ_QUIET)) {
                      char *error_type;
    1 
                      switch (error) {
                      case -ENOLINK:
                              error_type = "recoverable transport";
                              break;
                      case -EREMOTEIO:
                              error_type = "critical target";
                              break;
                      case -EBADE:
                              error_type = "critical nexus";
                              break;
                      case -ETIMEDOUT:
                              error_type = "timeout";
                              break;
                      case -ENOSPC:
                              error_type = "critical space allocation";
                              break;
                      case -ENODATA:
                              error_type = "critical medium";
                              break;
                      case -EIO:
                      default:
                              error_type = "I/O";
                              break;
    1                 }
                      printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
                                         __func__, error_type, req->rq_disk ?
                                         req->rq_disk->disk_name : "?",
                                         (unsigned long long)blk_rq_pos(req));
      
              }
    1 
              blk_account_io_completion(req, nr_bytes);
      
    1         total_bytes = 0;
              while (req->bio) {
    1                 struct bio *bio = req->bio;
                      unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
      
    1                 if (bio_bytes == bio->bi_iter.bi_size)
                              req->bio = bio->bi_next;
    1 
                      req_bio_endio(req, bio, bio_bytes, error);
    1 
                      total_bytes += bio_bytes;
                      nr_bytes -= bio_bytes;
      
                      if (!nr_bytes)
                              break;
              }
      
              /*
               * completely done
    1          */
              if (!req->bio) {
                      /*
                       * Reset counters so that the request stacking driver
                       * can find how many bytes remain in the request
                       * later.
    1                  */
                      req->__data_len = 0;
                      return false;
              }
      
              req->__data_len -= total_bytes;
      
              /* update sector only for requests with clear definition of sector */
              if (req->cmd_type == REQ_TYPE_FS)
                      req->__sector += total_bytes >> 9;
      
              /* mixed attributes always follow the first bio */
              if (req->cmd_flags & REQ_MIXED_MERGE) {
                      req->cmd_flags &= ~REQ_FAILFAST_MASK;
                      req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
              }
      
              /*
               * If total number of sectors is less than the first segment
               * size, something has gone terribly wrong.
               */
              if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
                      blk_dump_rq_flags(req, "request botched");
                      req->__data_len = blk_rq_cur_bytes(req);
              }
      
              /* recalculate the number of segments */
              blk_recalc_rq_segments(req);
    1 
              return true;
      }
      EXPORT_SYMBOL_GPL(blk_update_request);
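
/*
 * Illustrative sketch (not part of the original source): the usual shape
 * of a partial completion on the legacy path.  Only when
 * blk_update_request() returns false is the request fully transferred and
 * ready to be finished (under the queue lock).
 *
 *        if (blk_update_request(rq, error, nr_bytes))
 *                return;                ... rq still has data pending ...
 *
 *        spin_lock_irqsave(q->queue_lock, flags);
 *        blk_finish_request(rq, error);
 *        spin_unlock_irqrestore(q->queue_lock, flags);
 */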
      
      static bool blk_update_bidi_request(struct request *rq, int error,
                                          unsigned int nr_bytes,
                                          unsigned int bidi_bytes)
      {
              if (blk_update_request(rq, error, nr_bytes))
                      return true;
      
              /* Bidi request must be completed as a whole */
              if (unlikely(blk_bidi_rq(rq)) &&
                  blk_update_request(rq->next_rq, error, bidi_bytes))
                      return true;
      
              if (blk_queue_add_random(rq->q))
                      add_disk_randomness(rq->rq_disk);
      
              return false;
      }
      
      /**
       * blk_unprep_request - unprepare a request
       * @req:        the request
       *
       * This function makes a request ready for complete resubmission (or
       * completion).  It happens only after all error handling is complete,
 * so it represents the appropriate moment to deallocate any resources
       * that were allocated to the request in the prep_rq_fn.  The queue
       * lock is held when calling this.
       */
      void blk_unprep_request(struct request *req)
      {
              struct request_queue *q = req->q;
      
              req->cmd_flags &= ~REQ_DONTPREP;
              if (q->unprep_rq_fn)
                      q->unprep_rq_fn(q, req);
      }
      EXPORT_SYMBOL_GPL(blk_unprep_request);
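
/*
 * Illustrative sketch (not part of the original source): a queue that
 * allocates a per-request resource in its prep_rq_fn marks the request
 * REQ_DONTPREP so it is prepped only once; the matching unprep_rq_fn
 * releases the resource.  my_alloc_buf()/my_free_buf() are hypothetical.
 *
 *        static int my_prep(struct request_queue *q, struct request *rq)
 *        {
 *                rq->special = my_alloc_buf(rq);
 *                if (!rq->special)
 *                        return BLKPREP_DEFER;
 *                rq->cmd_flags |= REQ_DONTPREP;
 *                return BLKPREP_OK;
 *        }
 *
 *        static void my_unprep(struct request_queue *q, struct request *rq)
 *        {
 *                my_free_buf(rq->special);
 *        }
 */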
      
      /*
       * queue lock must be held
       */
      void blk_finish_request(struct request *req, int error)
      {
              if (req->cmd_flags & REQ_QUEUED)
                      blk_queue_end_tag(req->q, req);
      
              BUG_ON(blk_queued_rq(req));
      
              if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
                      laptop_io_completion(&req->q->backing_dev_info);
      
              blk_delete_timer(req);
      
              if (req->cmd_flags & REQ_DONTPREP)
                      blk_unprep_request(req);
      
              blk_account_io_done(req);
      
              if (req->end_io)
                      req->end_io(req, error);
              else {
                      if (blk_bidi_rq(req))
                              __blk_put_request(req->next_rq->q, req->next_rq);
      
                      __blk_put_request(req->q, req);
              }
      }
      EXPORT_SYMBOL(blk_finish_request);
      
      /**
       * blk_end_bidi_request - Complete a bidi request
       * @rq:         the request to complete
       * @error:      %0 for success, < %0 for error
       * @nr_bytes:   number of bytes to complete @rq
       * @bidi_bytes: number of bytes to complete @rq->next_rq
       *
       * Description:
       *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
 *     Drivers that support bidi can safely call this function for any
 *     type of request, bidi or uni.  In the latter case @bidi_bytes is
       *     just ignored.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       **/
      static bool blk_end_bidi_request(struct request *rq, int error,
                                       unsigned int nr_bytes, unsigned int bidi_bytes)
      {
              struct request_queue *q = rq->q;
              unsigned long flags;
      
              if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
                      return true;
      
              spin_lock_irqsave(q->queue_lock, flags);
              blk_finish_request(rq, error);
              spin_unlock_irqrestore(q->queue_lock, flags);
      
              return false;
      }
      
      /**
       * __blk_end_bidi_request - Complete a bidi request with queue lock held
       * @rq:         the request to complete
       * @error:      %0 for success, < %0 for error
       * @nr_bytes:   number of bytes to complete @rq
       * @bidi_bytes: number of bytes to complete @rq->next_rq
       *
       * Description:
       *     Identical to blk_end_bidi_request() except that queue lock is
       *     assumed to be locked on entry and remains so on return.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       **/
      bool __blk_end_bidi_request(struct request *rq, int error,
                                         unsigned int nr_bytes, unsigned int bidi_bytes)
      {
              if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
                      return true;
      
              blk_finish_request(rq, error);
      
              return false;
      }
      
      /**
       * blk_end_request - Helper function for drivers to complete the request.
       * @rq:       the request being processed
       * @error:    %0 for success, < %0 for error
       * @nr_bytes: number of bytes to complete
       *
       * Description:
       *     Ends I/O on a number of bytes attached to @rq.
       *     If @rq has leftover, sets it up for the next range of segments.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       **/
      bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
      {
              return blk_end_bidi_request(rq, error, nr_bytes, 0);
      }
      EXPORT_SYMBOL(blk_end_request);
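
/*
 * A minimal sketch (hypothetical mydrv_* naming) of the common completion
 * pattern from a driver's interrupt handler: blk_end_request() takes the
 * queue lock itself, and returns true while the request still has bytes
 * outstanding.
 *
 *      static void mydrv_complete_bytes(struct request *rq, int error,
 *                                       unsigned int done_bytes)
 *      {
 *              if (!blk_end_request(rq, error, done_bytes))
 *                      return; // fully completed and already released
 *              // otherwise rq stays current; issue the next part of it
 *      }
 */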
      
      /**
 * blk_end_request_all - Helper function for drivers to finish the request.
       * @rq: the request to finish
       * @error: %0 for success, < %0 for error
       *
       * Description:
       *     Completely finish @rq.
       */
      void blk_end_request_all(struct request *rq, int error)
      {
              bool pending;
              unsigned int bidi_bytes = 0;
      
              if (unlikely(blk_bidi_rq(rq)))
                      bidi_bytes = blk_rq_bytes(rq->next_rq);
      
              pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
              BUG_ON(pending);
      }
      EXPORT_SYMBOL(blk_end_request_all);
      
      /**
       * blk_end_request_cur - Helper function to finish the current request chunk.
       * @rq: the request to finish the current chunk for
       * @error: %0 for success, < %0 for error
       *
       * Description:
       *     Complete the current consecutively mapped chunk from @rq.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       */
      bool blk_end_request_cur(struct request *rq, int error)
      {
              return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
      }
      EXPORT_SYMBOL(blk_end_request_cur);
      
      /**
       * blk_end_request_err - Finish a request till the next failure boundary.
       * @rq: the request to finish till the next failure boundary for
       * @error: must be negative errno
       *
       * Description:
       *     Complete @rq till the next failure boundary.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       */
      bool blk_end_request_err(struct request *rq, int error)
      {
              WARN_ON(error >= 0);
              return blk_end_request(rq, error, blk_rq_err_bytes(rq));
      }
      EXPORT_SYMBOL_GPL(blk_end_request_err);
      
      /**
       * __blk_end_request - Helper function for drivers to complete the request.
       * @rq:       the request being processed
       * @error:    %0 for success, < %0 for error
       * @nr_bytes: number of bytes to complete
       *
       * Description:
       *     Must be called with queue lock held unlike blk_end_request().
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       **/
      bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
      {
              return __blk_end_bidi_request(rq, error, nr_bytes, 0);
      }
      EXPORT_SYMBOL(__blk_end_request);
      
      /**
 * __blk_end_request_all - Helper function for drivers to finish the request.
       * @rq: the request to finish
       * @error: %0 for success, < %0 for error
       *
       * Description:
       *     Completely finish @rq.  Must be called with queue lock held.
       */
      void __blk_end_request_all(struct request *rq, int error)
      {
              bool pending;
              unsigned int bidi_bytes = 0;
      
              if (unlikely(blk_bidi_rq(rq)))
                      bidi_bytes = blk_rq_bytes(rq->next_rq);
      
              pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
              BUG_ON(pending);
      }
      EXPORT_SYMBOL(__blk_end_request_all);
      
      /**
       * __blk_end_request_cur - Helper function to finish the current request chunk.
       * @rq: the request to finish the current chunk for
       * @error: %0 for success, < %0 for error
       *
       * Description:
       *     Complete the current consecutively mapped chunk from @rq.  Must
       *     be called with queue lock held.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       */
      bool __blk_end_request_cur(struct request *rq, int error)
      {
              return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
      }
      EXPORT_SYMBOL(__blk_end_request_cur);
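
/*
 * A minimal sketch of the classic single-chunk driver loop that the *_cur
 * helpers are meant for; mydrv_do_chunk() is hypothetical. The request_fn
 * is entered with the queue lock held, hence the __blk_* variant.
 *
 *      static void mydrv_request_fn(struct request_queue *q)
 *      {
 *              struct request *rq = blk_fetch_request(q);
 *
 *              while (rq) {
 *                      int err = mydrv_do_chunk(rq);
 *
 *                      if (!__blk_end_request_cur(rq, err))
 *                              rq = blk_fetch_request(q);      // rq done, next one
 *                      // else: rq still has chunks left, transfer the next one
 *              }
 *      }
 */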
      
      /**
       * __blk_end_request_err - Finish a request till the next failure boundary.
       * @rq: the request to finish till the next failure boundary for
       * @error: must be negative errno
       *
       * Description:
       *     Complete @rq till the next failure boundary.  Must be called
       *     with queue lock held.
       *
       * Return:
       *     %false - we are done with this request
       *     %true  - still buffers pending for this request
       */
      bool __blk_end_request_err(struct request *rq, int error)
      {
              WARN_ON(error >= 0);
              return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
      }
      EXPORT_SYMBOL_GPL(__blk_end_request_err);
      
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                     struct bio *bio)
{
        /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
        rq->cmd_flags |= bio->bi_rw & REQ_WRITE;

        if (bio_has_data(bio))
                rq->nr_phys_segments = bio_phys_segments(q, bio);

        rq->__data_len = bio->bi_iter.bi_size;
        rq->bio = rq->biotail = bio;

        if (bio->bi_bdev)
                rq->rq_disk = bio->bi_bdev->bd_disk;
}
      
      #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
      /**
       * rq_flush_dcache_pages - Helper function to flush all pages in a request
       * @rq: the request to be flushed
       *
       * Description:
       *     Flush all pages in @rq.
       */
      void rq_flush_dcache_pages(struct request *rq)
      {
              struct req_iterator iter;
              struct bio_vec bvec;
      
              rq_for_each_segment(bvec, rq, iter)
                      flush_dcache_page(bvec.bv_page);
      }
      EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
      #endif
      
      /**
       * blk_lld_busy - Check if underlying low-level drivers of a device are busy
       * @q : the queue of the device being checked
       *
       * Description:
       *    Check if underlying low-level drivers of a device are busy.
       *    If the drivers want to export their busy state, they must set own
       *    exporting function using blk_queue_lld_busy() first.
       *
       *    Basically, this function is used only by request stacking drivers
       *    to stop dispatching requests to underlying devices when underlying
       *    devices are busy.  This behavior helps more I/O merging on the queue
       *    of the request stacking driver and prevents I/O throughput regression
       *    on burst I/O load.
       *
       * Return:
       *    0 - Not busy (The request stacking driver should dispatch request)
       *    1 - Busy (The request stacking driver should stop dispatching request)
       */
      int blk_lld_busy(struct request_queue *q)
      {
              if (q->lld_busy_fn)
                      return q->lld_busy_fn(q);
      
              return 0;
      }
      EXPORT_SYMBOL_GPL(blk_lld_busy);
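
/*
 * A minimal sketch of how a request stacking driver (dm-multipath style;
 * the mystk_* name is hypothetical) might consult a lower device before
 * dispatching to it.
 *
 *      static int mystk_path_busy(struct block_device *lower_bdev)
 *      {
 *              // non-zero only if the lower driver registered an lld_busy_fn
 *              // via blk_queue_lld_busy() and currently reports busy
 *              return blk_lld_busy(bdev_get_queue(lower_bdev));
 *      }
 */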
      
      /**
       * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
       * @rq: the clone request to be cleaned up
       *
       * Description:
       *     Free all bios in @rq for a cloned request.
       */
      void blk_rq_unprep_clone(struct request *rq)
      {
              struct bio *bio;
      
              while ((bio = rq->bio) != NULL) {
                      rq->bio = bio->bi_next;
      
                      bio_put(bio);
              }
      }
      EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
      
      /*
       * Copy attributes of the original request to the clone request.
       * The actual data parts (e.g. ->cmd, ->sense) are not copied.
       */
      static void __blk_rq_prep_clone(struct request *dst, struct request *src)
      {
              dst->cpu = src->cpu;
              dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
              dst->cmd_type = src->cmd_type;
              dst->__sector = blk_rq_pos(src);
              dst->__data_len = blk_rq_bytes(src);
              dst->nr_phys_segments = src->nr_phys_segments;
              dst->ioprio = src->ioprio;
              dst->extra_len = src->extra_len;
      }
      
      /**
       * blk_rq_prep_clone - Helper function to setup clone request
       * @rq: the request to be setup
       * @rq_src: original request to be cloned
       * @bs: bio_set that bios for clone are allocated from
       * @gfp_mask: memory allocation mask for bio
       * @bio_ctr: setup function to be called for each clone bio.
       *           Returns %0 for success, non %0 for failure.
       * @data: private data to be passed to @bio_ctr
       *
       * Description:
       *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
       *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
       *     are not copied, and copying such parts is the caller's responsibility.
 *     Also, the pages which the original bios point to are not copied;
 *     the cloned bios just point to the same pages.
 *     So the cloned bios must be completed before the original bios, which
 *     means the caller must complete @rq before @rq_src.
       */
      int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                            struct bio_set *bs, gfp_t gfp_mask,
                            int (*bio_ctr)(struct bio *, struct bio *, void *),
                            void *data)
      {
              struct bio *bio, *bio_src;
      
              if (!bs)
                      bs = fs_bio_set;
      
              __rq_for_each_bio(bio_src, rq_src) {
                      bio = bio_clone_fast(bio_src, gfp_mask, bs);
                      if (!bio)
                              goto free_and_out;
      
                      if (bio_ctr && bio_ctr(bio, bio_src, data))
                              goto free_and_out;
      
                      if (rq->bio) {
                              rq->biotail->bi_next = bio;
                              rq->biotail = bio;
                      } else
                              rq->bio = rq->biotail = bio;
              }
      
              __blk_rq_prep_clone(rq, rq_src);
      
              return 0;
      
      free_and_out:
              if (bio)
                      bio_put(bio);
              blk_rq_unprep_clone(rq);
      
              return -ENOMEM;
      }
      EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
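
/*
 * A minimal sketch (hypothetical mystk_* names) of the usual caller: a
 * request-based stacking driver clones the original request before
 * dispatching it to a lower queue, and hooks its own completion.
 *
 *      static int mystk_setup_clone(struct request *clone, struct request *orig,
 *                                   struct bio_set *bs, void *priv)
 *      {
 *              if (blk_rq_prep_clone(clone, orig, bs, GFP_ATOMIC, NULL, NULL))
 *                      return -ENOMEM;
 *
 *              clone->end_io = mystk_clone_endio;      // hypothetical completion hook
 *              clone->end_io_data = priv;
 *              return 0;
 *      }
 *
 * Any later failure is undone with blk_rq_unprep_clone(clone).
 */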
      
int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}
      EXPORT_SYMBOL(kblockd_schedule_work);
      
      int kblockd_schedule_delayed_work(struct delayed_work *dwork,
                                        unsigned long delay)
      {
              return queue_delayed_work(kblockd_workqueue, dwork, delay);
      }
      EXPORT_SYMBOL(kblockd_schedule_delayed_work);
      
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                     unsigned long delay)
{
        return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
      EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
      
      /**
       * blk_start_plug - initialize blk_plug and track it inside the task_struct
       * @plug:        The &struct blk_plug that needs to be initialized
       *
       * Description:
       *   Tracking blk_plug inside the task_struct will help with auto-flushing the
       *   pending I/O should the task end up blocking between blk_start_plug() and
       *   blk_finish_plug(). This is important from a performance perspective, but
       *   also ensures that we don't deadlock. For instance, if the task is blocking
       *   for a memory allocation, memory reclaim could end up wanting to free a
       *   page belonging to that request that is currently residing in our private
       *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
       *   this kind of deadlock.
       */
void blk_start_plug(struct blk_plug *plug)
{
        struct task_struct *tsk = current;

        /*
         * If this is a nested plug, don't actually assign it.
         */
        if (tsk->plug)
                return;

        INIT_LIST_HEAD(&plug->list);
        INIT_LIST_HEAD(&plug->mq_list);
        INIT_LIST_HEAD(&plug->cb_list);
        /*
         * Store ordering should not be needed here, since a potential
         * preempt will imply a full memory barrier
         */
        tsk->plug = plug;
}
      EXPORT_SYMBOL(blk_start_plug);
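
/*
 * A minimal sketch of the intended usage pattern from a submitter's point
 * of view; the bios[]/nr_bios batch is purely illustrative.
 *
 *      struct blk_plug plug;
 *      int i;
 *
 *      blk_start_plug(&plug);
 *      for (i = 0; i < nr_bios; i++)
 *              generic_make_request(bios[i]);
 *      blk_finish_plug(&plug); // hands the whole batch to the driver(s)
 *
 * If the task blocks in between, the scheduler flushes the plug on its
 * behalf (blk_flush_plug_list() with from_schedule == true).
 */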
      
static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct request *rqa = container_of(a, struct request, queuelist);
        struct request *rqb = container_of(b, struct request, queuelist);

        return !(rqa->q < rqb->q ||
                (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}
      
/*
 * If 'from_schedule' is true, then postpone the dispatch of requests
 * until a safe kblockd context. We do this to avoid accidentally adding
 * large stack usage in driver dispatch, in places where the original
 * plugger did not intend it.
 */
static void queue_unplugged(struct request_queue *q, unsigned int depth,
                            bool from_schedule)
        __releases(q->queue_lock)
{
        trace_block_unplug(q, depth, !from_schedule);

        if (from_schedule)
                blk_run_queue_async(q);
        else
                __blk_run_queue(q);
        spin_unlock(q->queue_lock);
}
      
      static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
      {
        LIST_HEAD(callbacks);

        while (!list_empty(&plug->cb_list)) {
                      list_splice_init(&plug->cb_list, &callbacks);
      
                      while (!list_empty(&callbacks)) {
                              struct blk_plug_cb *cb = list_first_entry(&callbacks,
                                                                struct blk_plug_cb,
                                                                list);
                              list_del(&cb->list);
                              cb->callback(cb, from_schedule);
                      }
              }
      }
      
      struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
                                            int size)
      {
              struct blk_plug *plug = current->plug;
              struct blk_plug_cb *cb;
      
              if (!plug)
                      return NULL;
      
              list_for_each_entry(cb, &plug->cb_list, list)
                      if (cb->callback == unplug && cb->data == data)
                              return cb;
      
              /* Not currently on the callback list */
              BUG_ON(size < sizeof(*cb));
              cb = kzalloc(size, GFP_ATOMIC);
              if (cb) {
                      cb->data = data;
                      cb->callback = unplug;
                      list_add(&cb->list, &plug->cb_list);
              }
              return cb;
      }
      EXPORT_SYMBOL(blk_check_plugged);
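
/*
 * A minimal sketch of how a driver piggybacks private state on the current
 * plug, in the style of md/raid; the myraid_* names are hypothetical.
 *
 *      struct myraid_plug_cb {
 *              struct blk_plug_cb cb;          // embedded callback
 *              struct bio_list pending;        // driver-private batch
 *      };
 *
 *      static void myraid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 *      {
 *              struct myraid_plug_cb *mcb =
 *                      container_of(cb, struct myraid_plug_cb, cb);
 *
 *              // dispatch everything on mcb->pending here, then release it
 *              kfree(mcb);
 *      }
 *
 *      // in the submission path:
 *      cb = blk_check_plugged(myraid_unplug, priv, sizeof(struct myraid_plug_cb));
 *      if (cb) {
 *              struct myraid_plug_cb *mcb =
 *                      container_of(cb, struct myraid_plug_cb, cb);
 *              bio_list_add(&mcb->pending, bio);       // defer instead of dispatching
 *      }
 */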
      
      void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
      {
        struct request_queue *q;
        unsigned long flags;
        struct request *rq;
        LIST_HEAD(list);
        unsigned int depth;

        flush_plug_callbacks(plug, from_schedule);

        if (!list_empty(&plug->mq_list))
                blk_mq_flush_plug_list(plug, from_schedule);

        if (list_empty(&plug->list))
                return;

        list_splice_init(&plug->list, &list);

        list_sort(NULL, &list, plug_rq_cmp);

        q = NULL;
        depth = 0;

        /*
         * Save and disable interrupts here, to avoid doing it for every
         * queue lock we have to take.
         */
        local_irq_save(flags);
        while (!list_empty(&list)) {
                rq = list_entry_rq(list.next);
                list_del_init(&rq->queuelist);
                BUG_ON(!rq->q);
                if (rq->q != q) {
                        /*
                         * This drops the queue lock
                         */
                        if (q)
                                queue_unplugged(q, depth, from_schedule);
                        q = rq->q;
                        depth = 0;
                        spin_lock(q->queue_lock);
                }

                /*
                 * Short-circuit if @q is dead
                 */
                if (unlikely(blk_queue_dying(q))) {
                        __blk_end_request_all(rq, -ENODEV);
                        continue;
                }

                /*
                 * rq is already accounted, so use raw insert
                 */
                if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
                        __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
                else
                        __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);

                depth++;
        }

        /*
         * This drops the queue lock
         */
        if (q)
                queue_unplugged(q, depth, from_schedule);

        local_irq_restore(flags);
}

void blk_finish_plug(struct blk_plug *plug)
{
        if (plug != current->plug)
                return;
        blk_flush_plug_list(plug, false);

        current->plug = NULL;
}
      EXPORT_SYMBOL(blk_finish_plug);
      
      bool blk_poll(struct request_queue *q, blk_qc_t cookie)
      {
        struct blk_plug *plug;
        long state;

        if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
            !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return false;
      
              plug = current->plug;
              if (plug)
                      blk_flush_plug_list(plug, false);
      
              state = current->state;
              while (!need_resched()) {
                      unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
                      struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
                      int ret;
      
                      hctx->poll_invoked++;
      
                      ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
                      if (ret > 0) {
                              hctx->poll_success++;
                              set_current_state(TASK_RUNNING);
                              return true;
                      }
      
                      if (signal_pending_state(state, current))
                              set_current_state(TASK_RUNNING);
      
                      if (current->state == TASK_RUNNING)
                              return true;
                      if (ret < 0)
                              break;
                      cpu_relax();
              }
      
              return false;
      }
      
      #ifdef CONFIG_PM
      /**
       * blk_pm_runtime_init - Block layer runtime PM initialization routine
       * @q: the queue of the device
       * @dev: the device the queue belongs to
       *
       * Description:
       *    Initialize runtime-PM-related fields for @q and start auto suspend for
       *    @dev. Drivers that want to take advantage of request-based runtime PM
 *    should call this function after @dev has been initialized and its
 *    request queue @q has been allocated, but while runtime PM cannot yet
 *    happen (either because it is disabled/forbidden or because its
 *    usage_count > 0). In most cases, the driver should call this function
 *    before any I/O has taken place.
 *
 *    This function sets up autosuspend for the device; the autosuspend delay
 *    is set to -1 to make runtime suspend impossible until an updated value
 *    is set either by the user or by the driver. Drivers do not need to touch
 *    any other autosuspend settings.
 *
 *    The block layer runtime PM is request based, so it only works for drivers
 *    that use requests as their I/O unit, not for those that use bios directly.
       */
      void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
      {
              q->dev = dev;
              q->rpm_status = RPM_ACTIVE;
              pm_runtime_set_autosuspend_delay(q->dev, -1);
              pm_runtime_use_autosuspend(q->dev);
      }
      EXPORT_SYMBOL(blk_pm_runtime_init);
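
/*
 * A minimal sketch of where a driver wires this up (the sd driver does the
 * equivalent from its probe path); mydrv_probe()/mydrv_alloc() and the
 * 5 second delay are hypothetical.
 *
 *      static int mydrv_probe(struct device *dev)
 *      {
 *              struct mydrv *md = mydrv_alloc(dev);
 *
 *              // the queue exists and no I/O has been issued yet
 *              blk_pm_runtime_init(md->queue, dev);
 *              pm_runtime_set_autosuspend_delay(dev, 5000);
 *              pm_runtime_allow(dev);
 *              return 0;
 *      }
 */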
      
      /**
       * blk_pre_runtime_suspend - Pre runtime suspend check
       * @q: the queue of the device
       *
       * Description:
       *    This function will check if runtime suspend is allowed for the device
       *    by examining if there are any requests pending in the queue. If there
       *    are requests pending, the device can not be runtime suspended; otherwise,
       *    the queue's status will be updated to SUSPENDING and the driver can
       *    proceed to suspend the device.
       *
 *    If suspend is not allowed, we mark the device's last-busy time so that
 *    the runtime PM core will try to autosuspend it again later.
       *
       *    This function should be called near the start of the device's
       *    runtime_suspend callback.
       *
       * Return:
       *    0                - OK to runtime suspend the device
       *    -EBUSY        - Device should not be runtime suspended
       */
      int blk_pre_runtime_suspend(struct request_queue *q)
      {
              int ret = 0;
      
              if (!q->dev)
                      return ret;
      
              spin_lock_irq(q->queue_lock);
              if (q->nr_pending) {
                      ret = -EBUSY;
                      pm_runtime_mark_last_busy(q->dev);
              } else {
                      q->rpm_status = RPM_SUSPENDING;
              }
              spin_unlock_irq(q->queue_lock);
              return ret;
      }
      EXPORT_SYMBOL(blk_pre_runtime_suspend);
      
      /**
       * blk_post_runtime_suspend - Post runtime suspend processing
       * @q: the queue of the device
       * @err: return value of the device's runtime_suspend function
       *
       * Description:
       *    Update the queue's runtime status according to the return value of the
       *    device's runtime suspend function and mark last busy for the device so
       *    that PM core will try to auto suspend the device at a later time.
       *
       *    This function should be called near the end of the device's
       *    runtime_suspend callback.
       */
      void blk_post_runtime_suspend(struct request_queue *q, int err)
      {
              if (!q->dev)
                      return;
      
              spin_lock_irq(q->queue_lock);
              if (!err) {
                      q->rpm_status = RPM_SUSPENDED;
              } else {
                      q->rpm_status = RPM_ACTIVE;
                      pm_runtime_mark_last_busy(q->dev);
              }
              spin_unlock_irq(q->queue_lock);
      }
      EXPORT_SYMBOL(blk_post_runtime_suspend);
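
/*
 * A minimal sketch of a runtime_suspend callback bracketed by the two
 * helpers above; mydrv_quiesce_hw() is hypothetical.
 *
 *      static int mydrv_runtime_suspend(struct device *dev)
 *      {
 *              struct mydrv *md = dev_get_drvdata(dev);
 *              int err;
 *
 *              err = blk_pre_runtime_suspend(md->queue);
 *              if (err)
 *                      return err;     // -EBUSY: requests are still pending
 *              err = mydrv_quiesce_hw(md);
 *              blk_post_runtime_suspend(md->queue, err);
 *              return err;
 *      }
 */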
      
      /**
       * blk_pre_runtime_resume - Pre runtime resume processing
       * @q: the queue of the device
       *
       * Description:
       *    Update the queue's runtime status to RESUMING in preparation for the
       *    runtime resume of the device.
       *
       *    This function should be called near the start of the device's
       *    runtime_resume callback.
       */
      void blk_pre_runtime_resume(struct request_queue *q)
      {
              if (!q->dev)
                      return;
      
              spin_lock_irq(q->queue_lock);
              q->rpm_status = RPM_RESUMING;
              spin_unlock_irq(q->queue_lock);
      }
      EXPORT_SYMBOL(blk_pre_runtime_resume);
      
      /**
       * blk_post_runtime_resume - Post runtime resume processing
       * @q: the queue of the device
       * @err: return value of the device's runtime_resume function
       *
       * Description:
       *    Update the queue's runtime status according to the return value of the
       *    device's runtime_resume function. If it is successfully resumed, process
       *    the requests that are queued into the device's queue when it is resuming
       *    and then mark last busy and initiate autosuspend for it.
       *
       *    This function should be called near the end of the device's
       *    runtime_resume callback.
       */
      void blk_post_runtime_resume(struct request_queue *q, int err)
      {
              if (!q->dev)
                      return;
      
              spin_lock_irq(q->queue_lock);
              if (!err) {
                      q->rpm_status = RPM_ACTIVE;
                      __blk_run_queue(q);
                      pm_runtime_mark_last_busy(q->dev);
                      pm_request_autosuspend(q->dev);
              } else {
                      q->rpm_status = RPM_SUSPENDED;
              }
              spin_unlock_irq(q->queue_lock);
      }
      EXPORT_SYMBOL(blk_post_runtime_resume);
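
/*
 * The matching runtime_resume side of the sketch above, again with a
 * hypothetical mydrv_wake_hw():
 *
 *      static int mydrv_runtime_resume(struct device *dev)
 *      {
 *              struct mydrv *md = dev_get_drvdata(dev);
 *              int err;
 *
 *              blk_pre_runtime_resume(md->queue);
 *              err = mydrv_wake_hw(md);
 *              blk_post_runtime_resume(md->queue, err);
 *              return err;
 *      }
 */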
      #endif
      
      int __init blk_dev_init(void)
      {
              BUILD_BUG_ON(__REQ_NR_BITS > 8 *
                              FIELD_SIZEOF(struct request, cmd_flags));
      
              /* used for unplugging and affects IO latency/throughput - HIGHPRI */
              kblockd_workqueue = alloc_workqueue("kblockd",
                                                  WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
              if (!kblockd_workqueue)
                      panic("Failed to create kblockd\n");
      
              request_cachep = kmem_cache_create("blkdev_requests",
                              sizeof(struct request), 0, SLAB_PANIC, NULL);
      
              blk_requestq_cachep = kmem_cache_create("blkdev_queue",
                              sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
      
              return 0;
      }
      
/*
 * Block I/O latency support. We want this to be as cheap as possible, so we
 * do it locklessly (and avoid atomics); a few off-by-a-few errors in this
 * code are not harmful, and we don't want to do anything that is
 * perf-impactful.
 * TODO: If necessary, we can make the histograms per-cpu and aggregate
 * them when printing them out.
 */
      ssize_t
      blk_latency_hist_show(char* name, struct io_latency_state *s, char *buf,
                      int buf_size)
      {
              int i;
              int bytes_written = 0;
              u_int64_t num_elem, elem;
              int pct;
              u_int64_t average;
      
        num_elem = s->latency_elems;
        if (num_elem > 0) {
                average = div64_u64(s->latency_sum, s->latency_elems);
                bytes_written += scnprintf(buf + bytes_written,
                                buf_size - bytes_written,
                                "IO svc_time %s Latency Histogram (n = %llu,"
                                " average = %llu):\n", name, num_elem, average);
                for (i = 0;
                     i < ARRAY_SIZE(latency_x_axis_us);
                     i++) {
                        elem = s->latency_y_axis[i];
                        pct = div64_u64(elem * 100, num_elem);
                        bytes_written += scnprintf(buf + bytes_written,
                                        buf_size - bytes_written,
                                        "\t< %6lluus%15llu%15d%%\n",
                                        latency_x_axis_us[i],
                                        elem, pct);
                }
                /* Last element in y-axis table is overflow */
                elem = s->latency_y_axis[i];
                pct = div64_u64(elem * 100, num_elem);
                bytes_written += scnprintf(buf + bytes_written,
                                buf_size - bytes_written,
                                "\t>=%6lluus%15llu%15d%%\n",
                                latency_x_axis_us[i - 1], elem, pct);
              }
      
              return bytes_written;
      }
      EXPORT_SYMBOL(blk_latency_hist_show);
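
/*
 * A minimal sketch of a sysfs show method feeding this helper; the mydrv
 * structure and its read_lat_hist field are hypothetical.
 *
 *      static ssize_t latency_hist_show(struct device *dev,
 *                                       struct device_attribute *attr, char *buf)
 *      {
 *              struct mydrv *md = dev_get_drvdata(dev);
 *
 *              return blk_latency_hist_show("Read", &md->read_lat_hist,
 *                                           buf, PAGE_SIZE);
 *      }
 */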
      /*
       * fs/dcache.c
       *
       * Complete reimplementation
       * (C) 1997 Thomas Schoebel-Theuer,
       * with heavy changes by Linus Torvalds
       */
      
      /*
       * Notes on the allocation strategy:
       *
       * The dcache is a master of the icache - whenever a dcache entry
       * exists, the inode will always exist. "iput()" is done either when
       * the dcache entry is deleted or garbage collected.
       */
      
      #include <linux/syscalls.h>
      #include <linux/string.h>
      #include <linux/mm.h>
      #include <linux/fs.h>
      #include <linux/fsnotify.h>
      #include <linux/slab.h>
      #include <linux/init.h>
      #include <linux/hash.h>
      #include <linux/cache.h>
      #include <linux/export.h>
      #include <linux/mount.h>
      #include <linux/file.h>
      #include <asm/uaccess.h>
      #include <linux/security.h>
      #include <linux/seqlock.h>
      #include <linux/swap.h>
      #include <linux/bootmem.h>
      #include <linux/fs_struct.h>
      #include <linux/hardirq.h>
      #include <linux/bit_spinlock.h>
      #include <linux/rculist_bl.h>
      #include <linux/prefetch.h>
      #include <linux/ratelimit.h>
      #include <linux/list_lru.h>
      #include <linux/kasan.h>
      
      #include "internal.h"
      #include "mount.h"
      
      /*
       * Usage:
       * dcache->d_inode->i_lock protects:
       *   - i_dentry, d_u.d_alias, d_inode of aliases
       * dcache_hash_bucket lock protects:
       *   - the dcache hash table
       * s_anon bl list spinlock protects:
       *   - the s_anon list (see __d_drop)
       * dentry->d_sb->s_dentry_lru_lock protects:
       *   - the dcache lru lists and counters
       * d_lock protects:
       *   - d_flags
       *   - d_name
       *   - d_lru
       *   - d_count
       *   - d_unhashed()
       *   - d_parent and d_subdirs
 *   - children's d_child and d_parent
       *   - d_u.d_alias, d_inode
       *
       * Ordering:
       * dentry->d_inode->i_lock
       *   dentry->d_lock
       *     dentry->d_sb->s_dentry_lru_lock
       *     dcache_hash_bucket lock
       *     s_anon lock
       *
       * If there is an ancestor relationship:
       * dentry->d_parent->...->d_parent->d_lock
       *   ...
       *     dentry->d_parent->d_lock
       *       dentry->d_lock
       *
       * If no ancestor relationship:
       * if (dentry1 < dentry2)
       *   dentry1->d_lock
       *     dentry2->d_lock
       */
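
/*
 * For example, taking d_lock on two dentries with no ancestor relationship
 * while respecting the address ordering above (a sketch, not a helper
 * defined in this file):
 *
 *      static void lock_two_dentries(struct dentry *d1, struct dentry *d2)
 *      {
 *              if (d1 > d2)
 *                      swap(d1, d2);
 *              spin_lock(&d1->d_lock);
 *              spin_lock_nested(&d2->d_lock, DENTRY_D_LOCK_NESTED);
 *      }
 */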
      int sysctl_vfs_cache_pressure __read_mostly = 100;
      EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
      
      __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
      
      EXPORT_SYMBOL(rename_lock);
      
      static struct kmem_cache *dentry_cache __read_mostly;
      
      /*
       * This is the single most critical data structure when it comes
       * to the dcache: the hashtable for lookups. Somebody should try
       * to make this good - I've just made it work.
       *
       * This hash-function tries to avoid losing too many bits of hash
       * information, yet avoid using a prime hash-size or similar.
       */
      
      static unsigned int d_hash_mask __read_mostly;
      static unsigned int d_hash_shift __read_mostly;
      
      static struct hlist_bl_head *dentry_hashtable __read_mostly;
      
      static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
                                              unsigned int hash)
      {
              hash += (unsigned long) parent / L1_CACHE_BYTES;
              return dentry_hashtable + hash_32(hash, d_hash_shift);
      }
      
      /* Statistics gathering. */
      struct dentry_stat_t dentry_stat = {
              .age_limit = 45,
      };
      
      static DEFINE_PER_CPU(long, nr_dentry);
      static DEFINE_PER_CPU(long, nr_dentry_unused);
      
      #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
      
      /*
       * Here we resort to our own counters instead of using generic per-cpu counters
       * for consistency with what the vfs inode code does. We are expected to harvest
       * better code and performance by having our own specialized counters.
       *
       * Please note that the loop is done over all possible CPUs, not over all online
       * CPUs. The reason for this is that we don't want to play games with CPUs going
       * on and off. If one of them goes off, we will just keep their counters.
       *
       * glommer: See cffbc8a for details, and if you ever intend to change this,
       * please update all vfs counters to match.
       */
      static long get_nr_dentry(void)
      {
              int i;
              long sum = 0;
              for_each_possible_cpu(i)
                      sum += per_cpu(nr_dentry, i);
              return sum < 0 ? 0 : sum;
      }
      
      static long get_nr_dentry_unused(void)
      {
              int i;
              long sum = 0;
              for_each_possible_cpu(i)
                      sum += per_cpu(nr_dentry_unused, i);
              return sum < 0 ? 0 : sum;
      }
      
      int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
                         size_t *lenp, loff_t *ppos)
      {
              dentry_stat.nr_dentry = get_nr_dentry();
              dentry_stat.nr_unused = get_nr_dentry_unused();
              return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
      }
      #endif
      
      /*
       * Compare 2 name strings, return 0 if they match, otherwise non-zero.
       * The strings are both count bytes long, and count is non-zero.
       */
      #ifdef CONFIG_DCACHE_WORD_ACCESS
      
      #include <asm/word-at-a-time.h>
      /*
 * NOTE! 'cs' and 'scount' come from a dentry, so it has an
 * aligned allocation for this particular component. We don't
       * strictly need the load_unaligned_zeropad() safety, but it
       * doesn't hurt either.
       *
       * In contrast, 'ct' and 'tcount' can be from a pathname, and do
       * need the careful unaligned handling.
       */
      static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
      {
              unsigned long a,b,mask;
      
        for (;;) {
                a = *(unsigned long *)cs;
                b = load_unaligned_zeropad(ct);
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                if (!tcount)
                        return 0;
        }
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
      }
      
      #else
      
      static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
      {
              do {
                      if (*cs != *ct)
                              return 1;
                      cs++;
                      ct++;
                      tcount--;
              } while (tcount);
              return 0;
      }
      
      #endif
      
      static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
      {
              const unsigned char *cs;
              /*
               * Be careful about RCU walk racing with rename:
               * use ACCESS_ONCE to fetch the name pointer.
               *
               * NOTE! Even if a rename will mean that the length
               * was not loaded atomically, we don't care. The
               * RCU walk will check the sequence count eventually,
               * and catch it. And we won't overrun the buffer,
               * because we're reading the name pointer atomically,
               * and a dentry name is guaranteed to be properly
               * terminated with a NUL byte.
               *
               * End result: even if 'len' is wrong, we'll exit
               * early because the data cannot match (there can
               * be no NUL in the ct/tcount data)
               */
        cs = ACCESS_ONCE(dentry->d_name.name);
        smp_read_barrier_depends();
        return dentry_string_cmp(cs, ct, tcount);
      }
      
      struct external_name {
              union {
                      atomic_t count;
                      struct rcu_head head;
              } u;
              unsigned char name[];
      };
      
static inline struct external_name *external_name(struct dentry *dentry)
{
        return container_of(dentry->d_name.name, struct external_name, name[0]);
}
      
static void __d_free(struct rcu_head *head)
{
        struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);

        kmem_cache_free(dentry_cache, dentry);
}
      
      static void __d_free_external(struct rcu_head *head)
      {
              struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
              kfree(external_name(dentry));
              kmem_cache_free(dentry_cache, dentry); 
      }
      
static inline int dname_external(const struct dentry *dentry)
{
        return dentry->d_name.name != dentry->d_iname;
}
      
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        if (unlikely(dname_external(dentry))) {
                struct external_name *p = external_name(dentry);
                atomic_inc(&p->u.count);
                spin_unlock(&dentry->d_lock);
                name->name = p->name;
        } else {
                memcpy(name->inline_name, dentry->d_iname,
                       dentry->d_name.len + 1);
                spin_unlock(&dentry->d_lock);
                name->name = name->inline_name;
        }
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

void release_dentry_name_snapshot(struct name_snapshot *name)
{
        if (unlikely(name->name != name->inline_name)) {
                struct external_name *p;
                p = container_of(name->name, struct external_name, name[0]);
                if (unlikely(atomic_dec_and_test(&p->u.count)))
                        kfree_rcu(p, u.head);
        }
}
      EXPORT_SYMBOL(release_dentry_name_snapshot);
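
/*
 * A minimal sketch of the intended pairing: take a stable copy of the name
 * before an operation during which the dentry could be renamed, use it,
 * then release it (fsnotify does this around event generation).
 *
 *      struct name_snapshot snap;
 *
 *      take_dentry_name_snapshot(&snap, dentry);
 *      pr_debug("working on %s\n", snap.name);
 *      release_dentry_name_snapshot(&snap);
 */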
      
      static inline void __d_set_inode_and_type(struct dentry *dentry,
                                                struct inode *inode,
                                                unsigned type_flags)
      {
              unsigned flags;
      
              dentry->d_inode = inode;
              flags = READ_ONCE(dentry->d_flags);
              flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
              flags |= type_flags;
              WRITE_ONCE(dentry->d_flags, flags);
      }
      
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        unsigned flags = READ_ONCE(dentry->d_flags);

        flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
        WRITE_ONCE(dentry->d_flags, flags);
        dentry->d_inode = NULL;
}
      
static void dentry_free(struct dentry *dentry)
{
        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
        if (unlikely(dname_external(dentry))) {
                struct external_name *p = external_name(dentry);
                if (likely(atomic_dec_and_test(&p->u.count))) {
                        call_rcu(&dentry->d_u.d_rcu, __d_free_external);
                        return;
                }
        }
        /* if dentry was never visible to RCU, immediate free is OK */
        if (!(dentry->d_flags & DCACHE_RCUACCESS))
                __d_free(&dentry->d_u.d_rcu);
        else
                call_rcu(&dentry->d_u.d_rcu, __d_free);
}
      
      /**
       * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
       * @dentry: the target dentry
       * After this call, in-progress rcu-walk path lookup will fail. This
       * should be called after unhashing, and after changing d_inode (if
       * the dentry has not already been unhashed).
       */
static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
{
        lockdep_assert_held(&dentry->d_lock);
        /* Go through an invalidation barrier */
        write_seqcount_invalidate(&dentry->d_seq);
}
      
      /*
       * Release the dentry's inode, using the filesystem
       * d_iput() operation if defined. Dentry has no refcount
       * and is unhashed.
       */
static void dentry_iput(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        struct inode *inode = dentry->d_inode;
        if (inode) {
                __d_clear_type_and_inode(dentry);
                hlist_del_init(&dentry->d_u.d_alias);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
                if (!inode->i_nlink)
                        fsnotify_inoderemove(inode);
                if (dentry->d_op && dentry->d_op->d_iput)
                        dentry->d_op->d_iput(dentry, inode);
                else
                        iput(inode);
        } else {
                spin_unlock(&dentry->d_lock);
        }
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined. dentry remains in-use.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        struct inode *inode = dentry->d_inode;

        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}
      
      /*
       * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
       * is in use - which includes both the "real" per-superblock
       * LRU list _and_ the DCACHE_SHRINK_LIST use.
       *
       * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
       * on the shrink list (ie not on the superblock LRU list).
       *
       * The per-cpu "nr_dentry_unused" counters are updated with
       * the DCACHE_LRU_LIST bit.
       *
       * These helper functions make sure we always follow the
       * rules. d_lock must be held by the caller.
       */
      #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_lru_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_shrink_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        this_cpu_dec(nr_dentry_unused);
}

static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        D_FLAG_VERIFY(dentry, 0);
        list_add(&dentry->d_lru, list);
        dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
}
      
      /*
       * These can only be called under the global LRU lock, ie during the
       * callback for freeing the LRU list. "isolate" removes it from the
       * LRU lists entirely, while shrink_move moves it to the indicated
       * private list.
       */
      static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
      {
              D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
              dentry->d_flags &= ~DCACHE_LRU_LIST;
              this_cpu_dec(nr_dentry_unused);
              list_lru_isolate(lru, &dentry->d_lru);
      }
      
      static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                                    struct list_head *list)
      {
              D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
              dentry->d_flags |= DCACHE_SHRINK_LIST;
              list_lru_isolate_move(lru, &dentry->d_lru, list);
      }
      
/*
 * dentry_lru_(add|del) must be called with d_lock held.
 */
static void dentry_lru_add(struct dentry *dentry)
{
        if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
                d_lru_add(dentry);
}
      
      /**
       * d_drop - drop a dentry
       * @dentry: dentry to drop
       *
       * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
       * be found through a VFS lookup any more. Note that this is different from
       * deleting the dentry - d_delete will try to mark the dentry negative if
       * possible, giving a successful _negative_ lookup, while d_drop will
       * just make the cache lookup fail.
       *
       * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
       * reason (NFS timeouts or autofs deletes).
       *
       * __d_drop requires dentry->d_lock.
       */
void __d_drop(struct dentry *dentry)
{
        if (!d_unhashed(dentry)) {
                struct hlist_bl_head *b;
                /*
                 * Hashed dentries are normally on the dentry hashtable,
                 * with the exception of those newly allocated by
                 * d_obtain_alias, which are always IS_ROOT:
                 */
                if (unlikely(IS_ROOT(dentry)))
                        b = &dentry->d_sb->s_anon;
                else
                        b = d_hash(dentry->d_parent, dentry->d_name.hash);

                hlist_bl_lock(b);
                __hlist_bl_del(&dentry->d_hash);
                dentry->d_hash.pprev = NULL;
                hlist_bl_unlock(b);
                dentry_rcuwalk_invalidate(dentry);
        }
}
EXPORT_SYMBOL(__d_drop);

void d_drop(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
      EXPORT_SYMBOL(d_drop);
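
/*
 * A minimal sketch of a typical caller: a network filesystem's
 * ->d_revalidate() deciding a cached name can no longer be trusted
 * (myfs_still_valid() is hypothetical).
 *
 *      static int myfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 *      {
 *              if (flags & LOOKUP_RCU)
 *                      return -ECHILD;
 *              if (myfs_still_valid(dentry))
 *                      return 1;
 *              d_drop(dentry); // unhash; the next lookup goes back to the server
 *              return 0;
 *      }
 */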
      
static void __dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;
        if (!IS_ROOT(dentry))
                parent = dentry->d_parent;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        __list_del_entry(&dentry->d_child);
        /*
         * Inform d_walk() that we are no longer attached to the
         * dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        if (parent)
                spin_unlock(&parent->d_lock);
        dentry_iput(dentry);
        /*
         * dentry_iput drops the locks, at which point nobody (except
         * transient RCU lookups) can reach this dentry.
         */
        BUG_ON(dentry->d_lockref.count > 0);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        spin_lock(&dentry->d_lock);
        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                dentry->d_flags |= DCACHE_MAY_FREE;
                can_free = false;
        }
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
}
      
/*
 * Finish off a dentry we've decided to kill.
 * dentry->d_lock must be held, returns with it unlocked.
 * Returns dentry requiring refcount drop, or NULL if we're done.
 */
static struct dentry *dentry_kill(struct dentry *dentry)
        __releases(dentry->d_lock)
{
        struct inode *inode = dentry->d_inode;
        struct dentry *parent = NULL;

        if (inode && unlikely(!spin_trylock(&inode->i_lock)))
                goto failed;

        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                if (unlikely(!spin_trylock(&parent->d_lock))) {
                        if (inode)
                                spin_unlock(&inode->i_lock);
                        goto failed;
                }
        }

        __dentry_kill(dentry);
        return parent;

failed:
        spin_unlock(&dentry->d_lock);
        return dentry; /* try again with same dentry */
}
      
static inline struct dentry *lock_parent(struct dentry *dentry)
{
        struct dentry *parent = dentry->d_parent;
        if (IS_ROOT(dentry))
                return NULL;
        if (unlikely(dentry->d_lockref.count < 0))
                return NULL;
        if (likely(spin_trylock(&parent->d_lock)))
                      return parent;
              rcu_read_lock();
              spin_unlock(&dentry->d_lock);
      again:
              parent = ACCESS_ONCE(dentry->d_parent);
              spin_lock(&parent->d_lock);
              /*
               * We can't blindly lock dentry until we are sure
               * that we won't violate the locking order.
               * Any changes of dentry->d_parent must have
               * been done with parent->d_lock held, so
               * spin_lock() above is enough of a barrier
               * for checking if it's still our child.
               */
              if (unlikely(parent != dentry->d_parent)) {
                      spin_unlock(&parent->d_lock);
                      goto again;
              }
              if (parent != dentry) {
                      spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                      if (unlikely(dentry->d_lockref.count < 0)) {
                              spin_unlock(&parent->d_lock);
                              parent = NULL;
                      }
              } else {
                      parent = NULL;
              }
              rcu_read_unlock();
              return parent;
      }
      
      /*
       * Try to do a lockless dput(), and return whether that was successful.
       *
       * If unsuccessful, we return false, having already taken the dentry lock.
       *
       * The caller needs to hold the RCU read lock, so that the dentry is
       * guaranteed to stay around even if the refcount goes down to zero!
       */
      static inline bool fast_dput(struct dentry *dentry)
      {
              int ret;
              unsigned int d_flags;
      
        /*
         * If we have a d_op->d_delete() operation, we should not
         * let the dentry count go to zero, so use "put_or_lock".
         */
        if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
                return lockref_put_or_lock(&dentry->d_lockref);

        /*
         * .. otherwise, we can try to just decrement the
         * lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                if (dentry->d_lockref.count > 1) {
                        dentry->d_lockref.count--;
                              spin_unlock(&dentry->d_lock);
                              return 1;
                      }
                      return 0;
              }
      
              /*
               * If we weren't the last ref, we're done.
               */
              if (ret)
                      return 1;
      
              /*
               * Careful, careful. The reference count went down
               * to zero, but we don't hold the dentry lock, so
               * somebody else could get it again, and do another
               * dput(), and we need to not race with that.
               *
               * However, there is a very special and common case
               * where we don't care, because there is nothing to
               * do: the dentry is still hashed, it does not have
               * a 'delete' op, and it's referenced and already on
               * the LRU list.
               *
               * NOTE! Since we aren't locked, these values are
               * not "stable". However, it is sufficient that at
               * some point after we dropped the reference the
               * dentry was hashed and the flags had the proper
               * value. Other dentry users may have re-gotten
               * a reference to the dentry and change that, but
               * our work is done - we can leave the dentry
               * around with a zero refcount.
               */
              smp_rmb();
              d_flags = ACCESS_ONCE(dentry->d_flags);
              d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
      
              /* Nothing to do? Dropping the reference was all we needed? */
              if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
                      return 1;
      
              /*
               * Not the fast normal case? Get the lock. We've already decremented
               * the refcount, but we'll need to re-check the situation after
               * getting the lock.
               */
              spin_lock(&dentry->d_lock);
      
              /*
               * Did somebody else grab a reference to it in the meantime, and
               * we're no longer the last user after all? Alternatively, somebody
               * else could have killed it and marked it dead. Either way, we
               * don't need to do anything else.
               */
              if (dentry->d_lockref.count) {
                      spin_unlock(&dentry->d_lock);
                      return 1;
              }
      
              /*
               * Re-get the reference we optimistically dropped. We hold the
               * lock, and we just tested that it was zero, so we can just
               * set it to 1.
               */
              dentry->d_lockref.count = 1;
              return 0;
      }
      
      
      /* 
       * This is dput
       *
       * This is complicated by the fact that we do not want to put
       * dentries that are no longer on any hash chain on the unused
       * list: we'd much rather just get rid of them immediately.
       *
       * However, that implies that we have to traverse the dentry
       * tree upwards to the parents which might _also_ now be
       * scheduled for deletion (it may have been only waiting for
       * its last child to go away).
       *
       * This tail recursion is done by hand as we don't want to depend
       * on the compiler to always get this right (gcc generally doesn't).
       * Real recursion would eat up our stack space.
       */
      
      /*
       * dput - release a dentry
       * @dentry: dentry to release 
       *
       * Release a dentry. This will drop the usage count and if appropriate
       * call the dentry unlink method as well as removing it from the queues and
       * releasing its resources. If the parent dentries were scheduled for release
       * they too may now get deleted.
       */
 2852 void dput(struct dentry *dentry)
      {
 2869         if (unlikely(!dentry))
                      return;
      
      repeat:
 2852         might_sleep();
      
 2852         rcu_read_lock();
 2852         if (likely(fast_dput(dentry))) {
 2558                 rcu_read_unlock();
 2852                 return;
              }
      
              /* Slow case: now with the dentry lock held */
 1013         rcu_read_unlock();
      
              /* Unreachable? Get rid of it */
              if (unlikely(d_unhashed(dentry)))
                      goto kill_it;
      
  441         if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
                      goto kill_it;
      
  441         if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
  353                 if (dentry->d_op->d_delete(dentry))
                              goto kill_it;
              }
      
  283         if (!(dentry->d_flags & DCACHE_REFERENCED))
  215                 dentry->d_flags |= DCACHE_REFERENCED;
  283         dentry_lru_add(dentry);
      
  283         dentry->d_lockref.count--;
              spin_unlock(&dentry->d_lock);
              return;
      
      kill_it:
  831         dentry = dentry_kill(dentry);
  350         if (dentry) {
  350                 cond_resched();
                      goto repeat;
              }
      }
      EXPORT_SYMBOL(dput);
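
       /*
        * Usage sketch (illustrative, not part of this file): the usual
        * dget()/dput() pairing.  "foofs_peek_name" is a hypothetical helper;
        * it pins the dentry so the name cannot go away under it, prints it,
        * and drops the extra reference again.  Assumes <linux/dcache.h> and
        * <linux/printk.h>.
        */
       static void foofs_peek_name(struct dentry *dentry)
       {
               dget(dentry);                   /* take an extra reference */
               pr_info("foofs: looking at %pd\n", dentry);
               dput(dentry);                   /* drop it again */
       }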
      
      
      /* This must be called with d_lock held */
      static inline void __dget_dlock(struct dentry *dentry)
      {
   68         dentry->d_lockref.count++;
      }
      
      static inline void __dget(struct dentry *dentry)
      {
              lockref_get(&dentry->d_lockref);
      }
      
      struct dentry *dget_parent(struct dentry *dentry)
      {
              int gotref;
              struct dentry *ret;
      
              /*
               * Do optimistic parent lookup without any
               * locking.
               */
   66         rcu_read_lock();
   66         ret = ACCESS_ONCE(dentry->d_parent);
              gotref = lockref_get_not_zero(&ret->d_lockref);
   66         rcu_read_unlock();
              if (likely(gotref)) {
   66                 if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
                              return ret;
                      dput(ret);
              }
      
      repeat:
              /*
               * Don't need rcu_dereference because we re-check it was correct under
               * the lock.
               */
              rcu_read_lock();
              ret = dentry->d_parent;
              spin_lock(&ret->d_lock);
              if (unlikely(ret != dentry->d_parent)) {
                      spin_unlock(&ret->d_lock);
                      rcu_read_unlock();
                      goto repeat;
              }
              rcu_read_unlock();
              BUG_ON(!ret->d_lockref.count);
              ret->d_lockref.count++;
              spin_unlock(&ret->d_lock);
              return ret;
      }
      EXPORT_SYMBOL(dget_parent);
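
       /*
        * Usage sketch (illustrative, not part of this file): dget_parent()
        * is the safe way to reach a stable, pinned parent when only a
        * reference on the child is held.  "foofs_parent_ino" is a
        * hypothetical helper.
        */
       static unsigned long foofs_parent_ino(struct dentry *dentry)
       {
               struct dentry *parent = dget_parent(dentry);    /* pinned parent */
               unsigned long ino = parent->d_inode ? parent->d_inode->i_ino : 0;

               dput(parent);
               return ino;
       }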
      
      /**
       * d_find_alias - grab a hashed alias of inode
       * @inode: inode in question
       *
       * If inode has a hashed alias, or is a directory and has any alias,
       * acquire the reference to alias and return it. Otherwise return NULL.
       * Notice that if inode is a directory there can be only one alias and
       * it can be unhashed only if it has no children, or if it is the root
       * of a filesystem, or if the directory was renamed and d_revalidate
       * was the first vfs operation to notice.
       *
       * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
       * any other hashed alias over that one.
       */
      static struct dentry *__d_find_alias(struct inode *inode)
      {
              struct dentry *alias, *discon_alias;
      
      again:
              discon_alias = NULL;
   68         hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
   68                 spin_lock(&alias->d_lock);
   68                  if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
   68                         if (IS_ROOT(alias) &&
                                  (alias->d_flags & DCACHE_DISCONNECTED)) {
                                      discon_alias = alias;
                              } else {
   68                                 __dget_dlock(alias);
                                      spin_unlock(&alias->d_lock);
                                      return alias;
                              }
                      }
                      spin_unlock(&alias->d_lock);
              }
              if (discon_alias) {
                      alias = discon_alias;
                      spin_lock(&alias->d_lock);
                      if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
                              __dget_dlock(alias);
                              spin_unlock(&alias->d_lock);
                              return alias;
                      }
                      spin_unlock(&alias->d_lock);
                      goto again;
              }
              return NULL;
      }
      
      struct dentry *d_find_alias(struct inode *inode)
      {
              struct dentry *de = NULL;
      
   68         if (!hlist_empty(&inode->i_dentry)) {
   68                 spin_lock(&inode->i_lock);
   68                 de = __d_find_alias(inode);
   68                 spin_unlock(&inode->i_lock);
              }
   68         return de;
      }
      EXPORT_SYMBOL(d_find_alias);
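
       /*
        * Usage sketch (illustrative, not part of this file): code that only
        * has an inode in hand can borrow one of its hashed dentries via
        * d_find_alias() to print a human-readable name.  The reference it
        * returns must be dropped with dput().  "foofs_log_inode" is a
        * hypothetical helper.
        */
       static void foofs_log_inode(struct inode *inode)
       {
               struct dentry *alias = d_find_alias(inode);

               if (alias) {
                       pr_info("foofs: inode %lu is reachable as %pd\n",
                               inode->i_ino, alias);
                       dput(alias);
               }
       }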
      
      /*
       *        Try to kill dentries associated with this inode.
       * WARNING: you must own a reference to inode.
       */
      void d_prune_aliases(struct inode *inode)
      {
              struct dentry *dentry;
      restart:
              spin_lock(&inode->i_lock);
              hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                      spin_lock(&dentry->d_lock);
                      if (!dentry->d_lockref.count) {
                              struct dentry *parent = lock_parent(dentry);
                              if (likely(!dentry->d_lockref.count)) {
                                      __dentry_kill(dentry);
                                      dput(parent);
                                      goto restart;
                              }
                              if (parent)
                                      spin_unlock(&parent->d_lock);
                      }
                      spin_unlock(&dentry->d_lock);
              }
              spin_unlock(&inode->i_lock);
      }
      EXPORT_SYMBOL(d_prune_aliases);
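
       /*
        * Usage sketch (illustrative, not part of this file): a filesystem
        * that learns an inode has gone stale (e.g. revoked on the server)
        * can ask the dcache to kill any unused dentries pointing at it.
        * Per the warning above, the caller must hold its own inode
        * reference.  "foofs_inode_revoked" is a hypothetical callback.
        */
       static void foofs_inode_revoked(struct inode *inode)
       {
               /* unreferenced aliases are killed; busy ones are left alone */
               d_prune_aliases(inode);
       }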
      
      static void shrink_dentry_list(struct list_head *list)
      {
              struct dentry *dentry, *parent;
      
    7         while (!list_empty(list)) {
                      struct inode *inode;
    7                 dentry = list_entry(list->prev, struct dentry, d_lru);
                      spin_lock(&dentry->d_lock);
    7                 parent = lock_parent(dentry);
      
                      /*
                       * The dispose list is isolated and dentries are not accounted
                       * to the LRU here, so we can simply remove it from the list
                       * here regardless of whether it is referenced or not.
                       */
    7                 d_shrink_del(dentry);
      
                      /*
                       * We found an inuse dentry which was not removed from
                       * the LRU because of laziness during lookup. Do not free it.
                       */
                      if (dentry->d_lockref.count > 0) {
                              spin_unlock(&dentry->d_lock);
                              if (parent)
                                      spin_unlock(&parent->d_lock);
                              continue;
                      }
      
      
    7                 if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
                              bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
                              spin_unlock(&dentry->d_lock);
                              if (parent)
                                      spin_unlock(&parent->d_lock);
                              if (can_free)
                                      dentry_free(dentry);
                              continue;
                      }
      
    7                 inode = dentry->d_inode;
    7                 if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
                              d_shrink_add(dentry, list);
                              spin_unlock(&dentry->d_lock);
                              if (parent)
                                      spin_unlock(&parent->d_lock);
                              continue;
                      }
      
    7                 __dentry_kill(dentry);
      
                      /*
                       * We need to prune ancestors too. This is necessary to prevent
                       * quadratic behavior of shrink_dcache_parent(), but is also
                       * expected to be beneficial in reducing dentry cache
                       * fragmentation.
                       */
                      dentry = parent;
    7                 while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
    1                         parent = lock_parent(dentry);
    1                         if (dentry->d_lockref.count != 1) {
                                      dentry->d_lockref.count--;
                                      spin_unlock(&dentry->d_lock);
                                      if (parent)
                                              spin_unlock(&parent->d_lock);
                                      break;
                              }
    1                         inode = dentry->d_inode;        /* can't be NULL */
                              if (unlikely(!spin_trylock(&inode->i_lock))) {
                                      spin_unlock(&dentry->d_lock);
                                      if (parent)
                                              spin_unlock(&parent->d_lock);
                                      cpu_relax();
                                      continue;
                              }
    1                         __dentry_kill(dentry);
                              dentry = parent;
                      }
              }
    7 }
      
      static enum lru_status dentry_lru_isolate(struct list_head *item,
                      struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
      {
              struct list_head *freeable = arg;
              struct dentry        *dentry = container_of(item, struct dentry, d_lru);
      
      
              /*
               * we are inverting the lru lock/dentry->d_lock here,
               * so use a trylock. If we fail to get the lock, just skip
               * it
               */
              if (!spin_trylock(&dentry->d_lock))
                      return LRU_SKIP;
      
              /*
               * Referenced dentries are still in use. If they have active
               * counts, just remove them from the LRU. Otherwise give them
               * another pass through the LRU.
               */
              if (dentry->d_lockref.count) {
                      d_lru_isolate(lru, dentry);
                      spin_unlock(&dentry->d_lock);
                      return LRU_REMOVED;
              }
      
              if (dentry->d_flags & DCACHE_REFERENCED) {
                      dentry->d_flags &= ~DCACHE_REFERENCED;
                      spin_unlock(&dentry->d_lock);
      
                      /*
                       * The list move itself will be made by the common LRU code. At
                       * this point, we've dropped the dentry->d_lock but keep the
                       * lru lock. This is safe to do, since every list movement is
                       * protected by the lru lock even if both locks are held.
                       *
                       * This is guaranteed by the fact that all LRU management
                       * functions are intermediated by the LRU API calls like
                 * list_lru_add and list_lru_del. List movement in this file
                 * only ever occurs through these functions or through callbacks
                 * like this one that are called from the LRU API.
                       *
                       * The only exceptions to this are functions like
                       * shrink_dentry_list, and code that first checks for the
                       * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
                 * operating only with stack-provided lists after they are
                 * properly isolated from the main list.  It is thus always a
                 * local access.
                       */
                      return LRU_ROTATE;
              }
      
              d_lru_shrink_move(lru, dentry, freeable);
              spin_unlock(&dentry->d_lock);
      
              return LRU_REMOVED;
      }
      
      /**
       * prune_dcache_sb - shrink the dcache
       * @sb: superblock
       * @sc: shrink control, passed to list_lru_shrink_walk()
       *
       * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
       * is done when we need more memory and called from the superblock shrinker
       * function.
       *
       * This function may fail to free any resources if all the dentries are in
       * use.
       */
      long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
      {
              LIST_HEAD(dispose);
              long freed;
      
              freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                           dentry_lru_isolate, &dispose);
              shrink_dentry_list(&dispose);
              return freed;
      }
      
      static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                      struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
      {
              struct list_head *freeable = arg;
              struct dentry        *dentry = container_of(item, struct dentry, d_lru);
      
              /*
               * we are inverting the lru lock/dentry->d_lock here,
               * so use a trylock. If we fail to get the lock, just skip
               * it
               */
              if (!spin_trylock(&dentry->d_lock))
                      return LRU_SKIP;
      
              d_lru_shrink_move(lru, dentry, freeable);
              spin_unlock(&dentry->d_lock);
      
              return LRU_REMOVED;
      }
      
      
      /**
       * shrink_dcache_sb - shrink dcache for a superblock
       * @sb: superblock
       *
       * Shrink the dcache for the specified super block. This is used to free
       * the dcache before unmounting a file system.
       */
      void shrink_dcache_sb(struct super_block *sb)
      {
              do {
                      LIST_HEAD(dispose);
      
                      list_lru_walk(&sb->s_dentry_lru,
                              dentry_lru_isolate_shrink, &dispose, 1024);
                      shrink_dentry_list(&dispose);
                      cond_resched();
              } while (list_lru_count(&sb->s_dentry_lru) > 0);
      }
      EXPORT_SYMBOL(shrink_dcache_sb);
      
       /**
        * enum d_walk_ret - action to take during tree walk
        * @D_WALK_CONTINUE:        continue walk
        * @D_WALK_QUIT:        quit walk
        * @D_WALK_NORETRY:        quit when retry is needed
        * @D_WALK_SKIP:        skip this dentry and its children
        */
      enum d_walk_ret {
              D_WALK_CONTINUE,
              D_WALK_QUIT,
              D_WALK_NORETRY,
              D_WALK_SKIP,
      };
      
      /**
       * d_walk - walk the dentry tree
       * @parent:        start of walk
       * @data:        data passed to @enter() and @finish()
       * @enter:        callback when first entering the dentry
       * @finish:        callback when successfully finished the walk
       *
       * The @enter() and @finish() callbacks are called with d_lock held.
       */
      static void d_walk(struct dentry *parent, void *data,
                         enum d_walk_ret (*enter)(void *, struct dentry *),
                         void (*finish)(void *))
      {
              struct dentry *this_parent;
              struct list_head *next;
              unsigned seq = 0;
              enum d_walk_ret ret;
              bool retry = true;
      
      again:
   63         read_seqbegin_or_lock(&rename_lock, &seq);
              this_parent = parent;
   63         spin_lock(&this_parent->d_lock);
      
              ret = enter(data, this_parent);
   63         switch (ret) {
              case D_WALK_CONTINUE:
                      break;
              case D_WALK_QUIT:
              case D_WALK_SKIP:
                      goto out_unlock;
              case D_WALK_NORETRY:
                      retry = false;
                      break;
              }
      repeat:
   63         next = this_parent->d_subdirs.next;
      resume:
   63         while (next != &this_parent->d_subdirs) {
                      struct list_head *tmp = next;
   15                 struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
                      next = tmp->next;
      
                      spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
      
                      ret = enter(data, dentry);
                      switch (ret) {
                      case D_WALK_CONTINUE:
                              break;
                      case D_WALK_QUIT:
                              spin_unlock(&dentry->d_lock);
                              goto out_unlock;
                      case D_WALK_NORETRY:
                              retry = false;
                              break;
                      case D_WALK_SKIP:
                              spin_unlock(&dentry->d_lock);
                              continue;
                      }
      
   15                 if (!list_empty(&dentry->d_subdirs)) {
    3                         spin_unlock(&this_parent->d_lock);
                              spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
                              this_parent = dentry;
                              spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                              goto repeat;
                      }
   15                 spin_unlock(&dentry->d_lock);
              }
              /*
               * All done at this level ... ascend and resume the search.
               */
   63         rcu_read_lock();
      ascend:
   63         if (this_parent != parent) {
                      struct dentry *child = this_parent;
    3                 this_parent = child->d_parent;
      
                      spin_unlock(&child->d_lock);
                      spin_lock(&this_parent->d_lock);
      
                      /* might go back up the wrong parent if we have had a rename. */
    3                 if (need_seqretry(&rename_lock, seq))
                              goto rename_retry;
                      /* go into the first sibling still alive */
                      do {
    3                         next = child->d_child.next;
                              if (next == &this_parent->d_subdirs)
                                      goto ascend;
    2                         child = list_entry(next, struct dentry, d_child);
                      } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
    2                 rcu_read_unlock();
                      goto resume;
              }
   63         if (need_seqretry(&rename_lock, seq))
                      goto rename_retry;
   63         rcu_read_unlock();
              if (finish)
   17                 finish(data);
      
      out_unlock:
   63         spin_unlock(&this_parent->d_lock);
   63         done_seqretry(&rename_lock, seq);
              return;
      
      rename_retry:
              spin_unlock(&this_parent->d_lock);
              rcu_read_unlock();
              BUG_ON(seq & 1);
              if (!retry)
                      return;
              seq = 1;
              goto again;
      }
      
      /*
       * Search for at least 1 mount point in the dentry's subdirs.
       * We descend to the next level whenever the d_subdirs
       * list is non-empty and continue searching.
       */
      
      static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
      {
              int *ret = data;
              if (d_mountpoint(dentry)) {
                      *ret = 1;
                      return D_WALK_QUIT;
              }
              return D_WALK_CONTINUE;
      }
      
      /**
       * have_submounts - check for mounts over a dentry
       * @parent: dentry to check.
       *
       * Return true if the parent or its subdirectories contain
       * a mount point
       */
      int have_submounts(struct dentry *parent)
      {
              int ret = 0;
      
              d_walk(parent, &ret, check_mount, NULL);
      
              return ret;
      }
      EXPORT_SYMBOL(have_submounts);
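
       /*
        * Usage sketch (illustrative, not part of this file): a revalidation
        * path may refuse to drop a directory dentry while something is
        * mounted underneath it.  "foofs_dir_is_busy" is a hypothetical
        * helper.
        */
       static bool foofs_dir_is_busy(struct dentry *dentry)
       {
               /* non-zero if this dentry or any descendant is a mountpoint */
               return have_submounts(dentry);
       }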
      
      /*
       * Called by mount code to set a mountpoint and check if the mountpoint is
       * reachable (e.g. NFS can unhash a directory dentry and then the complete
       * subtree can become unreachable).
       *
        * Only one of d_invalidate() and d_set_mounted() may succeed.  For
        * this reason take rename_lock and d_lock on the dentry and its ancestors.
       */
      int d_set_mounted(struct dentry *dentry)
      {
              struct dentry *p;
              int ret = -ENOENT;
  153         write_seqlock(&rename_lock);
              for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
                      /* Need exclusion wrt. d_invalidate() */
  133                 spin_lock(&p->d_lock);
                      if (unlikely(d_unhashed(p))) {
                              spin_unlock(&p->d_lock);
                              goto out;
                      }
  133                 spin_unlock(&p->d_lock);
              }
  153         spin_lock(&dentry->d_lock);
  117         if (!d_unlinked(dentry)) {
                      ret = -EBUSY;
  153                 if (!d_mountpoint(dentry)) {
  153                         dentry->d_flags |= DCACHE_MOUNTED;
                              ret = 0;
                      }
              }
  153          spin_unlock(&dentry->d_lock);
      out:
  153         write_sequnlock(&rename_lock);
              return ret;
      }
      
       /*
        * Search the dentry child list of the specified parent,
        * and move any unused dentries to the dispose list for
        * shrink_dentry_list(). We descend to the next level
        * whenever the d_subdirs list is non-empty and continue
        * searching.
        *
        * data->found is zero iff there are no unused children;
        * otherwise it is the number of unused children seen.
        * This may not be the total number of unused children,
        * because the walk may return early due to latency
        * constraints.
        */
      
      struct select_data {
              struct dentry *start;
              struct list_head dispose;
              int found;
      };
      
    7 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
      {
              struct select_data *data = _data;
              enum d_walk_ret ret = D_WALK_CONTINUE;
      
   63         if (data->start == dentry)
                      goto out;
      
   15         if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                      data->found++;
              } else {
   15                 if (dentry->d_flags & DCACHE_LRU_LIST)
    6                         d_lru_del(dentry);
   15                 if (!dentry->d_lockref.count) {
    7                         d_shrink_add(dentry, &data->dispose);
                              data->found++;
                      }
              }
              /*
               * We can return to the caller if we have found some (this
               * ensures forward progress). We'll be coming back to find
               * the rest.
               */
   15         if (!list_empty(&data->dispose))
    7                 ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
      out:
   63         return ret;
      }
      
      /**
       * shrink_dcache_parent - prune dcache
       * @parent: parent of entries to prune
       *
       * Prune the dcache to remove unused children of the parent dentry.
       */
      void shrink_dcache_parent(struct dentry *parent)
      {
              for (;;) {
                      struct select_data data;
      
   46                 INIT_LIST_HEAD(&data.dispose);
                      data.start = parent;
                      data.found = 0;
      
                      d_walk(parent, &data, select_collect, NULL);
   46                 if (!data.found)
                              break;
      
    6                 shrink_dentry_list(&data.dispose);
                      cond_resched();
              }
      }
      EXPORT_SYMBOL(shrink_dcache_parent);
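
       /*
        * Usage sketch (illustrative, not part of this file): before tearing
        * down a subtree (for example when a directory is about to be
        * removed), a caller can trim all unused descendants and then unhash
        * the directory itself.  "foofs_forget_subtree" is a hypothetical
        * helper.
        */
       static void foofs_forget_subtree(struct dentry *dir)
       {
               shrink_dcache_parent(dir);      /* free unused descendants */
               d_drop(dir);                    /* unhash the directory itself */
       }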
      
      static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
      {
               /* it has busy descendants; complain about those instead */
   24         if (!list_empty(&dentry->d_subdirs))
                      return D_WALK_CONTINUE;
      
              /* root with refcount 1 is fine */
   24         if (dentry == _data && dentry->d_lockref.count == 1)
                      return D_WALK_CONTINUE;
      
              printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
                              " still in use (%d) [unmount of %s %s]\n",
                             dentry,
                             dentry->d_inode ?
                             dentry->d_inode->i_ino : 0UL,
                             dentry,
                             dentry->d_lockref.count,
                             dentry->d_sb->s_type->name,
                             dentry->d_sb->s_id);
              WARN_ON(1);
              return D_WALK_CONTINUE;
      }
      
      static void do_one_tree(struct dentry *dentry)
      {
   24         shrink_dcache_parent(dentry);
              d_walk(dentry, dentry, umount_check, NULL);
              d_drop(dentry);
   24         dput(dentry);
   24 }
      
      /*
       * destroy the dentries attached to a superblock on unmounting
       */
      void shrink_dcache_for_umount(struct super_block *sb)
      {
              struct dentry *dentry;
      
   24         WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
      
   24         dentry = sb->s_root;
              sb->s_root = NULL;
              do_one_tree(dentry);
      
              while (!hlist_bl_empty(&sb->s_anon)) {
                      dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
                      do_one_tree(dentry);
              }
   24 }
      
      struct detach_data {
              struct select_data select;
              struct dentry *mountpoint;
      };
      static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
      {
              struct detach_data *data = _data;
      
   17         if (d_mountpoint(dentry)) {
                      __dget_dlock(dentry);
                      data->mountpoint = dentry;
                      return D_WALK_QUIT;
              }
      
   17         return select_collect(&data->select, dentry);
      }
      
   17 static void check_and_drop(void *_data)
      {
              struct detach_data *data = _data;
      
   17         if (!data->mountpoint && list_empty(&data->select.dispose))
   17                 __d_drop(data->select.start);
   17 }
      
      /**
       * d_invalidate - detach submounts, prune dcache, and drop
       * @dentry: dentry to invalidate (aka detach, prune and drop)
       *
        * Called with no dcache locks held.
       *
       * The final d_drop is done as an atomic operation relative to
       * rename_lock ensuring there are no races with d_set_mounted.  This
       * ensures there are no unhashed dentries on the path to a mountpoint.
       */
   17 void d_invalidate(struct dentry *dentry)
      {
              /*
               * If it's already been dropped, return OK.
               */
   17         spin_lock(&dentry->d_lock);
              if (d_unhashed(dentry)) {
                      spin_unlock(&dentry->d_lock);
                      return;
              }
   17         spin_unlock(&dentry->d_lock);
      
              /* Negative dentries can be dropped without further checks */
              if (!dentry->d_inode) {
                      d_drop(dentry);
                      return;
              }
      
              for (;;) {
                      struct detach_data data;
      
   17                 data.mountpoint = NULL;
                      INIT_LIST_HEAD(&data.select.dispose);
                      data.select.start = dentry;
                      data.select.found = 0;
      
                      d_walk(dentry, &data, detach_and_collect, check_and_drop);
      
                      if (!list_empty(&data.select.dispose))
    1                         shrink_dentry_list(&data.select.dispose);
                      else if (!data.mountpoint)
   17                         return;
      
                      if (data.mountpoint) {
                              detach_mounts(data.mountpoint);
                              dput(data.mountpoint);
   17                 }
                      cond_resched();
              }
    1 }
      EXPORT_SYMBOL(d_invalidate);
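
       /*
        * Usage sketch (illustrative, not part of this file): when a lookup
        * or revalidation discovers that a cached dentry no longer matches
        * reality, d_invalidate() forces it (and any submounts below it) out
        * of the cache.  "foofs_handle_stale" is a hypothetical helper.
        */
       static void foofs_handle_stale(struct dentry *dentry)
       {
               /* detaches submounts, prunes children and unhashes the dentry */
               d_invalidate(dentry);
       }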
      
      /**
       * __d_alloc        -        allocate a dcache entry
       * @sb: filesystem it will belong to
       * @name: qstr of the name
       *
       * Allocates a dentry. It returns %NULL if there is insufficient memory
        * available. On a success the dentry is returned. The name passed in is
        * copied; the caller's copy may be reused after this call.
       */
       
      struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
      {
              struct dentry *dentry;
              char *dname;
      
              dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
              if (!dentry)
 2076                 return NULL;
      
              /*
               * We guarantee that the inline name is always NUL-terminated.
               * This way the memcpy() done by the name switching in rename
               * will still always have a NUL at the end, even if we might
               * be overwriting an internal NUL character
               */
              dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
              if (name->len > DNAME_INLINE_LEN-1) {
 2076                 size_t size = offsetof(struct external_name, name[1]);
                      struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
                      if (!p) {
   20                         kmem_cache_free(dentry_cache, dentry); 
                              return NULL;
                      }
                      atomic_set(&p->u.count, 1);
                      dname = p->name;
   20                 if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
                              kasan_unpoison_shadow(dname,
                                      round_up(name->len + 1,        sizeof(unsigned long)));
              } else  {
                      dname = dentry->d_iname;
              }        
 2063 
              dentry->d_name.len = name->len;
              dentry->d_name.hash = name->hash;
 2076         memcpy(dname, name->name, name->len);
              dname[name->len] = 0;
      
              /* Make sure we always see the terminating NUL character */
              smp_wmb();
              dentry->d_name.name = dname;
      
              dentry->d_lockref.count = 1;
              dentry->d_flags = 0;
              spin_lock_init(&dentry->d_lock);
              seqcount_init(&dentry->d_seq);
              dentry->d_inode = NULL;
              dentry->d_parent = dentry;
              dentry->d_sb = sb;
              dentry->d_op = NULL;
              dentry->d_fsdata = NULL;
              INIT_HLIST_BL_NODE(&dentry->d_hash);
              INIT_LIST_HEAD(&dentry->d_lru);
              INIT_LIST_HEAD(&dentry->d_subdirs);
              INIT_HLIST_NODE(&dentry->d_u.d_alias);
              INIT_LIST_HEAD(&dentry->d_child);
              d_set_d_op(dentry, dentry->d_sb->s_d_op);
      
              this_cpu_inc(nr_dentry);
      
              return dentry;
      }
 2076 
      /**
       * d_alloc        -        allocate a dcache entry
       * @parent: parent of entry to allocate
       * @name: qstr of the name
       *
       * Allocates a dentry. It returns %NULL if there is insufficient memory
        * available. On a success the dentry is returned. The name passed in is
        * copied; the caller's copy may be reused after this call.
       */
      struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
      {
              struct dentry *dentry = __d_alloc(parent->d_sb, name);
              if (!dentry)
 1140                 return NULL;
              dentry->d_flags |= DCACHE_RCUACCESS;
              spin_lock(&parent->d_lock);
 1140         /*
               * don't need child lock because it is not subject
               * to concurrency here
               */
              __dget_dlock(parent);
              dentry->d_parent = parent;
              list_add(&dentry->d_child, &parent->d_subdirs);
              spin_unlock(&parent->d_lock);
 1140 
 1140         return dentry;
      }
 1140 EXPORT_SYMBOL(d_alloc);
      
      /**
       * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
       * @sb: the superblock
       * @name: qstr of the name
       *
       * For a filesystem that just pins its dentries in memory and never
       * performs lookups at all, return an unhashed IS_ROOT dentry.
       */
      struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
      {
              return __d_alloc(sb, name);
      }
 1101 EXPORT_SYMBOL(d_alloc_pseudo);
      
      struct dentry *d_alloc_name(struct dentry *parent, const char *name)
      {
              struct qstr q;
      
              q.name = name;
              q.len = strlen(name);
  149         q.hash = full_name_hash(q.name, q.len);
              return d_alloc(parent, &q);
      }
      EXPORT_SYMBOL(d_alloc_name);
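
       /*
        * Usage sketch (illustrative, not part of this file): pseudo
        * filesystems typically build their tree by allocating a named child
        * with d_alloc_name() and instantiating it with a fresh inode.
        * "foofs_new_node" and foofs_make_inode() are hypothetical and the
        * error handling is kept minimal.
        */
       static struct dentry *foofs_new_node(struct dentry *parent,
                                            const char *name, umode_t mode)
       {
               struct inode *inode;
               struct dentry *dentry = d_alloc_name(parent, name);

               if (!dentry)
                       return NULL;

               inode = foofs_make_inode(parent->d_sb, mode);   /* hypothetical */
               if (!inode) {
                       dput(dentry);
                       return NULL;
               }
               d_add(dentry, inode);   /* instantiate and hash in one go */
               return dentry;
       }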
      
      void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
      {
              WARN_ON_ONCE(dentry->d_op);
              WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH        |
 2076                                 DCACHE_OP_COMPARE        |
 2076                                 DCACHE_OP_REVALIDATE        |
                                      DCACHE_OP_WEAK_REVALIDATE        |
                                      DCACHE_OP_DELETE        |
                                      DCACHE_OP_SELECT_INODE        |
                                      DCACHE_OP_REAL));
              dentry->d_op = op;
              if (!op)
 2076                 return;
              if (op->d_hash)
                      dentry->d_flags |= DCACHE_OP_HASH;
 1815         if (op->d_compare)
                      dentry->d_flags |= DCACHE_OP_COMPARE;
 1815         if (op->d_revalidate)
                      dentry->d_flags |= DCACHE_OP_REVALIDATE;
 1815         if (op->d_weak_revalidate)
  499                 dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
 1815         if (op->d_delete)
                      dentry->d_flags |= DCACHE_OP_DELETE;
 1815         if (op->d_prune)
  898                 dentry->d_flags |= DCACHE_OP_PRUNE;
 1815         if (op->d_select_inode)
   15                 dentry->d_flags |= DCACHE_OP_SELECT_INODE;
 1815         if (op->d_real)
                      dentry->d_flags |= DCACHE_OP_REAL;
 1815 
 2076 }
      EXPORT_SYMBOL(d_set_d_op);
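
       /*
        * Usage sketch (illustrative, not part of this file): the DCACHE_OP_*
        * flags above are normally set by providing a dentry_operations
        * table.  Most filesystems assign sb->s_d_op once at mount time, so
        * __d_alloc() ends up calling d_set_d_op() for every new dentry.
        * "foofs_d_delete" and "foofs_dentry_ops" are hypothetical.
        */
       static int foofs_d_delete(const struct dentry *dentry)
       {
               /* returning 1 asks dput() to kill the dentry instead of caching it */
               return 1;
       }

       static const struct dentry_operations foofs_dentry_ops = {
               .d_delete       = foofs_d_delete,
       };

       /* in a hypothetical foofs_fill_super(): sb->s_d_op = &foofs_dentry_ops; */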
      
      
      /*
       * d_set_fallthru - Mark a dentry as falling through to a lower layer
       * @dentry - The dentry to mark
       *
       * Mark a dentry as falling through to the lower layer (as set with
       * d_pin_lower()).  This flag may be recorded on the medium.
       */
      void d_set_fallthru(struct dentry *dentry)
      {
              spin_lock(&dentry->d_lock);
              dentry->d_flags |= DCACHE_FALLTHRU;
              spin_unlock(&dentry->d_lock);
      }
      EXPORT_SYMBOL(d_set_fallthru);
      
      static unsigned d_flags_for_inode(struct inode *inode)
      {
              unsigned add_flags = DCACHE_REGULAR_TYPE;
      
              if (!inode)
                      return DCACHE_MISS_TYPE;
 2024 
              if (S_ISDIR(inode->i_mode)) {
                      add_flags = DCACHE_DIRECTORY_TYPE;
 1952                 if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                              if (unlikely(!inode->i_op->lookup))
  628                                 add_flags = DCACHE_AUTODIR_TYPE;
  628                         else
                                      inode->i_opflags |= IOP_LOOKUP;
                      }
  628                 goto type_determined;
              }
      
              if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                      if (unlikely(inode->i_op->follow_link)) {
 1788                         add_flags = DCACHE_SYMLINK_TYPE;
 1758                         goto type_determined;
                      }
                      inode->i_opflags |= IOP_NOFOLLOW;
              }
 1644 
              if (unlikely(!S_ISREG(inode->i_mode)))
                      add_flags = DCACHE_SPECIAL_TYPE;
 1676 
      type_determined:
              if (unlikely(IS_AUTOMOUNT(inode)))
                      add_flags |= DCACHE_NEED_AUTOMOUNT;
 2024         return add_flags;
      }
      
      static void __d_instantiate(struct dentry *dentry, struct inode *inode)
      {
              unsigned add_flags = d_flags_for_inode(inode);
      
 2024         spin_lock(&dentry->d_lock);
              if (inode)
                      hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
              raw_write_seqcount_begin(&dentry->d_seq);
 1952         __d_set_inode_and_type(dentry, inode, add_flags);
 2024         raw_write_seqcount_end(&dentry->d_seq);
              spin_unlock(&dentry->d_lock);
              fsnotify_d_instantiate(dentry, inode);
      }
 1952 
 2024 /**
       * d_instantiate - fill in inode information for a dentry
       * @entry: dentry to complete
       * @inode: inode to attach to this dentry
       *
       * Fill in inode information in the entry.
       *
       * This turns negative dentries into productive full members
       * of society.
       *
       * NOTE! This assumes that the inode count has been incremented
       * (or otherwise set) by the caller to indicate that it is now
       * in use by the dcache.
       */
       
      void d_instantiate(struct dentry *entry, struct inode * inode)
      {
              BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
              if (inode)
 1986                 spin_lock(&inode->i_lock);
 1986         __d_instantiate(entry, inode);
 1917         if (inode)
  432                 spin_unlock(&inode->i_lock);
              security_d_instantiate(entry, inode);
      }
 1986 EXPORT_SYMBOL(d_instantiate);
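
       /*
        * Usage sketch (illustrative, not part of this file): a minimal
        * ramfs-style ->create() path.  A new inode is allocated and its
        * reference handed to the dcache with d_instantiate(), turning the
        * negative dentry positive.  "foofs_create" and foofs_make_inode()
        * are hypothetical.
        */
       static int foofs_create(struct inode *dir, struct dentry *dentry,
                               umode_t mode, bool excl)
       {
               struct inode *inode = foofs_make_inode(dir->i_sb, mode); /* hypothetical */

               if (!inode)
                       return -ENOMEM;
               d_instantiate(dentry, inode);   /* inode reference now owned by the dcache */
               dget(dentry);                   /* extra count - pin the dentry in core */
               return 0;
       }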
      
      /**
       * d_instantiate_unique - instantiate a non-aliased dentry
       * @entry: dentry to instantiate
       * @inode: inode to attach to this dentry
       *
       * Fill in inode information in the entry. On success, it returns NULL.
       * If an unhashed alias of "entry" already exists, then we return the
       * aliased dentry instead and drop one reference to inode.
       *
       * Note that in order to avoid conflicts with rename() etc, the caller
       * had better be holding the parent directory semaphore.
       *
       * This also assumes that the inode count has been incremented
       * (or otherwise set) by the caller to indicate that it is now
       * in use by the dcache.
       */
      static struct dentry *__d_instantiate_unique(struct dentry *entry,
                                                   struct inode *inode)
      {
              struct dentry *alias;
              int len = entry->d_name.len;
              const char *name = entry->d_name.name;
              unsigned int hash = entry->d_name.hash;
      
              if (!inode) {
                      __d_instantiate(entry, NULL);
                      return NULL;
              }
      
              hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                      /*
                       * Don't need alias->d_lock here, because aliases with
                       * d_parent == entry->d_parent are not subject to name or
                       * parent changes, because the parent inode i_mutex is held.
                       */
                      if (alias->d_name.hash != hash)
                              continue;
                      if (alias->d_parent != entry->d_parent)
                              continue;
                      if (alias->d_name.len != len)
                              continue;
                      if (dentry_cmp(alias, name, len))
                              continue;
                      __dget(alias);
                      return alias;
              }
      
              __d_instantiate(entry, inode);
              return NULL;
      }
      
      struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
      {
              struct dentry *result;
      
              BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
      
              if (inode)
                      spin_lock(&inode->i_lock);
              result = __d_instantiate_unique(entry, inode);
              if (inode)
                      spin_unlock(&inode->i_lock);
      
              if (!result) {
                      security_d_instantiate(entry, inode);
                      return NULL;
              }
      
              BUG_ON(!d_unhashed(result));
              iput(inode);
              return result;
      }
      
      EXPORT_SYMBOL(d_instantiate_unique);
      
      /*
       * This should be equivalent to d_instantiate() + unlock_new_inode(),
       * with lockdep-related part of unlock_new_inode() done before
       * anything else.  Use that instead of open-coding d_instantiate()/
       * unlock_new_inode() combinations.
       */
      void d_instantiate_new(struct dentry *entry, struct inode *inode)
      {
              BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
              BUG_ON(!inode);
   47         lockdep_annotate_inode_mutex_key(inode);
   47         spin_lock(&inode->i_lock);
   47         __d_instantiate(entry, inode);
              WARN_ON(!(inode->i_state & I_NEW));
              inode->i_state &= ~I_NEW;
              smp_mb();
              wake_up_bit(&inode->i_state, __I_NEW);
   47         spin_unlock(&inode->i_lock);
              security_d_instantiate(entry, inode);
      }
      EXPORT_SYMBOL(d_instantiate_new);
      
      /**
       * d_instantiate_no_diralias - instantiate a non-aliased dentry
       * @entry: dentry to complete
       * @inode: inode to attach to this dentry
       *
       * Fill in inode information in the entry.  If a directory alias is found, then
       * return an error (and drop inode).  Together with d_materialise_unique() this
       * guarantees that a directory inode may never have more than one alias.
       */
      int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
      {
              BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
      
              spin_lock(&inode->i_lock);
              if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
                      spin_unlock(&inode->i_lock);
                      iput(inode);
                      return -EBUSY;
              }
              __d_instantiate(entry, inode);
              spin_unlock(&inode->i_lock);
              security_d_instantiate(entry, inode);
      
              return 0;
      }
      EXPORT_SYMBOL(d_instantiate_no_diralias);
      
      struct dentry *d_make_root(struct inode *root_inode)
      {
              struct dentry *res = NULL;
      
              if (root_inode) {
                      static const struct qstr name = QSTR_INIT("/", 1);
  162 
                      res = __d_alloc(root_inode->i_sb, &name);
                      if (res) {
  162                         res->d_flags |= DCACHE_RCUACCESS;
                              d_instantiate(res, root_inode);
  162                 } else {
                              iput(root_inode);
                      }
              }
              return res;
      }
  162 EXPORT_SYMBOL(d_make_root);
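
       /*
        * Usage sketch (illustrative, not part of this file): the tail of a
        * typical ->fill_super().  d_make_root() consumes the inode reference
        * whether it succeeds or fails, so no iput() is needed on the error
        * path.  "foofs_fill_super" and foofs_make_inode() are hypothetical.
        */
       static int foofs_fill_super(struct super_block *sb, void *data, int silent)
       {
               struct inode *root = foofs_make_inode(sb, S_IFDIR | 0755); /* hypothetical */

               sb->s_root = d_make_root(root);
               if (!sb->s_root)
                       return -ENOMEM;
               return 0;
       }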
      
      static struct dentry * __d_find_any_alias(struct inode *inode)
      {
              struct dentry *alias;
      
              if (hlist_empty(&inode->i_dentry))
                      return NULL;
    6         alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
              __dget(alias);
   26         return alias;
      }
      
      /**
       * d_find_any_alias - find any alias for a given inode
       * @inode: inode to find an alias for
       *
       * If any aliases exist for the given inode, take and return a
       * reference for one of them.  If no aliases exist, return %NULL.
       */
      struct dentry *d_find_any_alias(struct inode *inode)
      {
              struct dentry *de;
      
              spin_lock(&inode->i_lock);
              de = __d_find_any_alias(inode);
   26         spin_unlock(&inode->i_lock);
   26         return de;
   26 }
      EXPORT_SYMBOL(d_find_any_alias);
      
      static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
      {
              static const struct qstr anonstring = QSTR_INIT("/", 1);
              struct dentry *tmp;
              struct dentry *res;
              unsigned add_flags;
      
              if (!inode)
                      return ERR_PTR(-ESTALE);
              if (IS_ERR(inode))
                      return ERR_CAST(inode);
      
              res = d_find_any_alias(inode);
              if (res)
                      goto out_iput;
      
              tmp = __d_alloc(inode->i_sb, &anonstring);
              if (!tmp) {
                      res = ERR_PTR(-ENOMEM);
                      goto out_iput;
              }
      
              spin_lock(&inode->i_lock);
              res = __d_find_any_alias(inode);
              if (res) {
                      spin_unlock(&inode->i_lock);
                      dput(tmp);
                      goto out_iput;
              }
      
              /* attach a disconnected dentry */
              add_flags = d_flags_for_inode(inode);
      
              if (disconnected)
                      add_flags |= DCACHE_DISCONNECTED;
      
              spin_lock(&tmp->d_lock);
              __d_set_inode_and_type(tmp, inode, add_flags);
              hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
              hlist_bl_lock(&tmp->d_sb->s_anon);
              hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
              hlist_bl_unlock(&tmp->d_sb->s_anon);
              spin_unlock(&tmp->d_lock);
              spin_unlock(&inode->i_lock);
              security_d_instantiate(tmp, inode);
      
              return tmp;
      
       out_iput:
              if (res && !IS_ERR(res))
                      security_d_instantiate(res, inode);
              iput(inode);
              return res;
      }
      
      /**
       * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
       * @inode: inode to allocate the dentry for
       *
       * Obtain a dentry for an inode resulting from NFS filehandle conversion or
       * similar open by handle operations.  The returned dentry may be anonymous,
       * or may have a full name (if the inode was already in the cache).
       *
       * When called on a directory inode, we must ensure that the inode only ever
       * has one dentry.  If a dentry is found, that is returned instead of
       * allocating a new one.
       *
       * On successful return, the reference to the inode has been transferred
       * to the dentry.  In case of an error the reference on the inode is released.
       * To make it easier to use in export operations a %NULL or IS_ERR inode may
       * be passed in and the error will be propagated to the return value,
       * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
       */
      struct dentry *d_obtain_alias(struct inode *inode)
      {
              return __d_obtain_alias(inode, 1);
      }
      EXPORT_SYMBOL(d_obtain_alias);
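
       /*
        * Usage sketch (illustrative, not part of this file): the classic
        * consumer is an export_operations ->fh_to_dentry() method, which has
        * only an inode and lets d_obtain_alias() find or manufacture a
        * dentry for it.  Assumes <linux/exportfs.h>; "foofs_fh_to_dentry"
        * and foofs_iget() are hypothetical.
        */
       static struct dentry *foofs_fh_to_dentry(struct super_block *sb,
                                                struct fid *fid, int fh_len,
                                                int fh_type)
       {
               /* foofs_iget() is a hypothetical filehandle -> inode lookup */
               struct inode *inode = foofs_iget(sb, fid->i32.ino, fid->i32.gen);

               /* d_obtain_alias() copes with a NULL or IS_ERR inode itself */
               return d_obtain_alias(inode);
       }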
      
      /**
       * d_obtain_root - find or allocate a dentry for a given inode
       * @inode: inode to allocate the dentry for
       *
       * Obtain an IS_ROOT dentry for the root of a filesystem.
       *
       * We must ensure that directory inodes only ever have one dentry.  If a
       * dentry is found, that is returned instead of allocating a new one.
       *
       * On successful return, the reference to the inode has been transferred
       * to the dentry.  In case of an error the reference on the inode is
        * released.  A %NULL or IS_ERR inode may be passed in and the error
        * will be propagated to the return value, with a %NULL @inode
        * replaced by ERR_PTR(-ESTALE).
       */
      struct dentry *d_obtain_root(struct inode *inode)
      {
              return __d_obtain_alias(inode, 0);
      }
      EXPORT_SYMBOL(d_obtain_root);
      
      /**
       * d_add_ci - lookup or allocate new dentry with case-exact name
       * @inode:  the inode case-insensitive lookup has found
       * @dentry: the negative dentry that was passed to the parent's lookup func
       * @name:   the case-exact name to be associated with the returned dentry
       *
        * This is to avoid filling the dcache with case-insensitive names to the
        * same inode; only the actual correct case is stored in the dcache for
        * case-insensitive filesystems.
        *
        * For a case-insensitive lookup match, if the case-exact dentry
        * already exists in the dcache, use it and return it.
       *
       * If no entry exists with the exact case name, allocate new dentry with
       * the exact case, and return the spliced entry.
       */
      struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                              struct qstr *name)
      {
              struct dentry *found;
              struct dentry *new;
      
              /*
               * First check if a dentry matching the name already exists,
               * if not go ahead and create it now.
               */
              found = d_hash_and_lookup(dentry->d_parent, name);
              if (!found) {
                      new = d_alloc(dentry->d_parent, name);
                      if (!new) {
                              found = ERR_PTR(-ENOMEM);
                      } else {
                              found = d_splice_alias(inode, new);
                              if (found) {
                                      dput(new);
                                      return found;
                              }
                              return new;
                      }
              }
              iput(inode);
              return found;
      }
      EXPORT_SYMBOL(d_add_ci);
      
      /*
       * Do the slow-case of the dentry name compare.
       *
       * Unlike the dentry_cmp() function, we need to atomically
       * load the name and length information, so that the
       * filesystem can rely on them, and can use the 'name' and
       * 'len' information without worrying about walking off the
       * end of memory etc.
       *
       * Thus the read_seqcount_retry() and the "duplicate" info
       * in arguments (the low-level filesystem should not look
       * at the dentry inode or name contents directly, since
       * rename can change them while we're in RCU mode).
       */
      enum slow_d_compare {
              D_COMP_OK,
              D_COMP_NOMATCH,
              D_COMP_SEQRETRY,
      };
      
      static noinline enum slow_d_compare slow_dentry_cmp(
                      const struct dentry *parent,
                      struct dentry *dentry,
                      unsigned int seq,
                      const struct qstr *name)
      {
              int tlen = dentry->d_name.len;
              const char *tname = dentry->d_name.name;
      
              if (read_seqcount_retry(&dentry->d_seq, seq)) {
                      cpu_relax();
                      return D_COMP_SEQRETRY;
              }
              if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
                      return D_COMP_NOMATCH;
              return D_COMP_OK;
      }
      
      /**
       * __d_lookup_rcu - search for a dentry (racy, store-free)
       * @parent: parent dentry
       * @name: qstr of name we wish to find
       * @seqp: returns d_seq value at the point where the dentry was found
       * Returns: dentry, or NULL
       *
       * __d_lookup_rcu is the dcache lookup function for rcu-walk name
       * resolution (store-free path walking) design described in
       * Documentation/filesystems/path-lookup.txt.
       *
       * This is not to be used outside core vfs.
       *
       * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
       * held, and rcu_read_lock held. The returned dentry must not be stored into
       * without taking d_lock and checking d_seq sequence count against @seq
       * returned here.
       *
       * A refcount may be taken on the found dentry with the d_rcu_to_refcount
       * function.
       *
       * Alternatively, __d_lookup_rcu may be called again to look up the child of
       * the returned dentry, so long as its parent's seqlock is checked after the
       * child is looked up. Thus, an interlocking stepping of sequence lock checks
       * is formed, giving integrity down the path walk.
       *
       * NOTE! The caller *has* to check the resulting dentry against the sequence
       * number we've returned before using any of the resulting dentry state!
       */
      struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                      const struct qstr *name,
                                      unsigned *seqp)
      {
              u64 hashlen = name->hash_len;
              const unsigned char *str = name->name;
 1685         struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
              struct hlist_bl_node *node;
              struct dentry *dentry;
      
              /*
                * Note: There is significant duplication with __d_lookup which is
               * required to prevent single threaded performance regressions
               * especially on architectures where smp_rmb (in seqcounts) are costly.
               * Keep the two functions in sync.
               */
      
              /*
               * The hash list is protected using RCU.
               *
               * Carefully use d_seq when comparing a candidate dentry, to avoid
               * races with d_move().
               *
               * It is possible that concurrent renames can mess up our list
               * walk here and result in missing our dentry, resulting in the
               * false-negative result. d_lookup() protects against concurrent
               * renames using rename_lock seqlock.
               *
               * See Documentation/filesystems/path-lookup.txt for more details.
               */
              hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                      unsigned seq;
 1685 
      seqretry:
                      /*
                       * The dentry sequence count protects us from concurrent
                       * renames, and thus protects parent and name fields.
                       *
                       * The caller must perform a seqcount check in order
                       * to do anything useful with the returned dentry.
                       *
                       * NOTE! We do a "raw" seqcount_begin here. That means that
                       * we don't wait for the sequence count to stabilize if it
                       * is in the middle of a sequence change. If we do the slow
                       * dentry compare, we will do seqretries until it is stable,
                       * and if we end up with a successful lookup, we actually
                       * want to exit RCU lookup anyway.
                       */
                      seq = raw_seqcount_begin(&dentry->d_seq);
                      if (dentry->d_parent != parent)
 1607                         continue;
                      if (d_unhashed(dentry))
                              continue;
 1604 
                      if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
                              if (dentry->d_name.hash != hashlen_hash(hashlen))
                                      continue;
                              *seqp = seq;
                              switch (slow_dentry_cmp(parent, dentry, seq, name)) {
                              case D_COMP_OK:
                                      return dentry;
                              case D_COMP_NOMATCH:
                                      continue;
                              default:
                                      goto seqretry;
                              }
                      }
      
                      if (dentry->d_name.hash_len != hashlen)
                              continue;
 1604                 *seqp = seq;
                      if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
 1603                         return dentry;
 1607         }
              return NULL;
      }
 1685 
      /**
       * d_lookup - search for a dentry
       * @parent: parent dentry
       * @name: qstr of name we wish to find
       * Returns: dentry, or NULL
       *
       * d_lookup searches the children of the parent dentry for the name in
       * question. If the dentry is found its reference count is incremented and the
       * dentry is returned. The caller must use dput to free the entry when it has
       * finished using it. %NULL is returned if the dentry does not exist.
       */
      struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
      {
              struct dentry *dentry;
              unsigned seq;
      
              do {
                      seq = read_seqbegin(&rename_lock);
                      dentry = __d_lookup(parent, name);
 1544                 if (dentry)
                              break;
              } while (read_seqretry(&rename_lock, seq));
              return dentry;
 1032 }
 1544 EXPORT_SYMBOL(d_lookup);
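
       /*
        * Example (editorial sketch): probing the dcache for a child that may
        * already be hashed.  d_lookup() expects @name->hash to be precomputed;
        * this sketch assumes the parent has no fs-specific ->d_hash() (otherwise
        * use d_hash_and_lookup() below, which handles that case).
        */
       static struct dentry *example_find_child(struct dentry *parent,
                                                const char *name, unsigned int len)
       {
               struct qstr q = QSTR_INIT(name, len);

               q.hash = full_name_hash(q.name, q.len);
               /* a non-NULL result holds a reference; the caller must dput() it */
               return d_lookup(parent, &q);
       }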
      
      /**
       * __d_lookup - search for a dentry (racy)
       * @parent: parent dentry
       * @name: qstr of name we wish to find
       * Returns: dentry, or NULL
       *
       * __d_lookup is like d_lookup, however it may (rarely) return a
       * false-negative result due to unrelated rename activity.
       *
       * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
       * however it must be used carefully, eg. with a following d_lookup in
       * the case of failure.
       *
       * __d_lookup callers must be commented.
       */
      struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
      {
              unsigned int len = name->len;
              unsigned int hash = name->hash;
 1646         const unsigned char *str = name->name;
              struct hlist_bl_head *b = d_hash(parent, hash);
              struct hlist_bl_node *node;
              struct dentry *found = NULL;
              struct dentry *dentry;
      
              /*
               * Note: There is significant duplication with __d_lookup_rcu which is
               * required to prevent single threaded performance regressions
               * especially on architectures where smp_rmb (in seqcounts) are costly.
               * Keep the two functions in sync.
               */
      
              /*
               * The hash list is protected using RCU.
               *
               * Take d_lock when comparing a candidate dentry, to avoid races
               * with d_move().
               *
               * It is possible that concurrent renames can mess up our list
               * walk here and result in missing our dentry, resulting in the
               * false-negative result. d_lookup() protects against concurrent
               * renames using rename_lock seqlock.
               *
               * See Documentation/filesystems/path-lookup.txt for more details.
               */
              rcu_read_lock();
              
 1646         hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
      
 1646                 if (dentry->d_name.hash != hash)
                              continue;
 1245 
                      spin_lock(&dentry->d_lock);
                      if (dentry->d_parent != parent)
 1231                         goto next;
                      if (d_unhashed(dentry))
                              goto next;
 1231 
                      /*
                       * It is safe to compare names since d_move() cannot
                       * change the qstr (protected by d_lock).
                       */
                      if (parent->d_flags & DCACHE_OP_COMPARE) {
                              int tlen = dentry->d_name.len;
                              const char *tname = dentry->d_name.name;
                              if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
                                      goto next;
                      } else {
                              if (dentry->d_name.len != len)
                                      goto next;
 1231                         if (dentry_cmp(dentry, str, len))
                                      goto next;
 1230                 }
      
                      dentry->d_lockref.count++;
                      found = dentry;
 1229                 spin_unlock(&dentry->d_lock);
                      break;
      next:
                      spin_unlock(&dentry->d_lock);
               }
    2          rcu_read_unlock();
      
 1646          return found;
      }
      
      /**
       * d_hash_and_lookup - hash the qstr then search for a dentry
       * @dir: Directory to search in
       * @name: qstr of name we wish to find
       *
       * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
       */
      struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
      {
              /*
               * Check for a fs-specific hash function. Note that we must
               * calculate the standard hash first, as the d_op->d_hash()
               * routine may choose to leave the hash value unchanged.
               */
              name->hash = full_name_hash(name->name, name->len);
              if (dir->d_flags & DCACHE_OP_HASH) {
   67                 int err = dir->d_op->d_hash(dir, name);
                      if (unlikely(err < 0))
                              return ERR_PTR(err);
              }
              return d_lookup(dir, name);
      }
   67 EXPORT_SYMBOL(d_hash_and_lookup);
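
       /*
        * Example (editorial sketch): the usual calling pattern when the parent may
        * have a fs-specific ->d_hash().  A %NULL return means the name is not in
        * the dcache; an ERR_PTR means the name itself was rejected by ->d_hash().
        */
       static struct dentry *example_probe_dcache(struct dentry *dir,
                                                  const char *name)
       {
               struct qstr q = QSTR_INIT(name, strlen(name));
               struct dentry *found = d_hash_and_lookup(dir, &q);

               if (IS_ERR(found))
                       return NULL;
               return found;   /* caller must dput() a non-NULL result */
       }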
      
      /*
       * When a file is deleted, we have two options:
       * - turn this dentry into a negative dentry
       * - unhash this dentry and free it.
       *
       * Usually, we want to just turn this into
       * a negative dentry, but if anybody else is
       * currently using the dentry or the inode
       * we can't do that and we fall back on removing
       * it from the hash queues and waiting for
       * it to be deleted later when it has no users
       */
       
      /**
       * d_delete - delete a dentry
       * @dentry: The dentry to delete
       *
       * Turn the dentry into a negative dentry if possible, otherwise
       * remove it from the hash queues so it can be deleted later
       */
       
      void d_delete(struct dentry * dentry)
      {
              struct inode *inode;
  129         int isdir = 0;
              /*
               * Are we the only user?
               */
      again:
              spin_lock(&dentry->d_lock);
              inode = dentry->d_inode;
  129         isdir = S_ISDIR(inode->i_mode);
              if (dentry->d_lockref.count == 1) {
                      if (!spin_trylock(&inode->i_lock)) {
                              spin_unlock(&dentry->d_lock);
   70                         cpu_relax();
                              goto again;
                      }
                      dentry->d_flags &= ~DCACHE_CANT_MOUNT;
                      dentry_unlink_inode(dentry);
   70                 fsnotify_nameremove(dentry, isdir);
   70                 return;
   70         }
      
              if (!d_unhashed(dentry))
                      __d_drop(dentry);
   62 
   62         spin_unlock(&dentry->d_lock);
      
   62         fsnotify_nameremove(dentry, isdir);
      }
  129 EXPORT_SYMBOL(d_delete);
      
      static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
      {
 1082         BUG_ON(!d_unhashed(entry));
              hlist_bl_lock(b);
 1082         hlist_bl_add_head_rcu(&entry->d_hash, b);
 1082         hlist_bl_unlock(b);
 1082 }
 1082 
 1082 static void _d_rehash(struct dentry * entry)
      {
              __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
      }
      
      /**
       * d_rehash        - add an entry back to the hash
       * @entry: dentry to add to the hash
       *
       * Adds a dentry to the hash according to its name.
       */
       
      void d_rehash(struct dentry * entry)
      {
              spin_lock(&entry->d_lock);
              _d_rehash(entry);
 1068         spin_unlock(&entry->d_lock);
      }
      EXPORT_SYMBOL(d_rehash);
      
      /**
       * dentry_update_name_case - update case insensitive dentry with a new name
       * @dentry: dentry to be updated
       * @name: new name
       *
       * Update a case insensitive dentry with new case of name.
       *
       * dentry must have been returned by d_lookup with name @name. Old and new
       * name lengths must match (ie. no d_compare which allows mismatched name
       * lengths).
       *
       * Parent inode i_mutex must be held over d_lookup and into this call (to
       * keep renames and concurrent inserts, and readdir(2) away).
       */
      void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
      {
              BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
              BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
      
              spin_lock(&dentry->d_lock);
              write_seqcount_begin(&dentry->d_seq);
              memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
              write_seqcount_end(&dentry->d_seq);
              spin_unlock(&dentry->d_lock);
      }
      EXPORT_SYMBOL(dentry_update_name_case);
      
      static void swap_names(struct dentry *dentry, struct dentry *target)
      {
              if (unlikely(dname_external(target))) {
                      if (unlikely(dname_external(dentry))) {
                              /*
    2                          * Both external: swap the pointers
                               */
                              swap(target->d_name.name, dentry->d_name.name);
                      } else {
    1                         /*
                               * dentry:internal, target:external.  Steal target's
                               * storage and make target internal.
                               */
                              memcpy(target->d_iname, dentry->d_name.name,
                                              dentry->d_name.len + 1);
                              dentry->d_name.name = target->d_name.name;
    1                         target->d_name.name = target->d_iname;
                      }
              } else {
                      if (unlikely(dname_external(dentry))) {
                              /*
    6                          * dentry:external, target:internal.  Give dentry's
                               * storage to target and make dentry internal
                               */
                              memcpy(dentry->d_iname, target->d_name.name,
                                              target->d_name.len + 1);
                              target->d_name.name = dentry->d_name.name;
    2                         dentry->d_name.name = dentry->d_iname;
                      } else {
                              /*
                               * Both are internal.
                               */
                              unsigned int i;
                              BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
                              kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
                              kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
                              for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                                      swap(((long *) &dentry->d_iname)[i],
                                           ((long *) &target->d_iname)[i]);
    4                         }
                      }
              }
              swap(dentry->d_name.hash_len, target->d_name.hash_len);
      }
    8 
      static void copy_name(struct dentry *dentry, struct dentry *target)
      {
              struct external_name *old_name = NULL;
              if (unlikely(dname_external(dentry)))
                      old_name = external_name(dentry);
   21         if (unlikely(dname_external(target))) {
    4                 atomic_inc(&external_name(target)->u.count);
   21                 dentry->d_name = target->d_name;
    3         } else {
                      memcpy(dentry->d_iname, target->d_name.name,
                                      target->d_name.len + 1);
                      dentry->d_name.name = dentry->d_iname;
   18                 dentry->d_name.hash_len = target->d_name.hash_len;
              }
              if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
                      kfree_rcu(old_name, u.head);
   21 }
      
      static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
      {
              /*
               * XXXX: do we really need to take target->d_lock?
               */
              if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
                      spin_lock(&target->d_parent->d_lock);
   29         else {
   19                 if (d_ancestor(dentry->d_parent, target->d_parent)) {
                              spin_lock(&dentry->d_parent->d_lock);
   10                         spin_lock_nested(&target->d_parent->d_lock,
    6                                                 DENTRY_D_LOCK_NESTED);
                      } else {
                              spin_lock(&target->d_parent->d_lock);
                              spin_lock_nested(&dentry->d_parent->d_lock,
    4                                                 DENTRY_D_LOCK_NESTED);
                      }
              }
              if (target < dentry) {
                      spin_lock_nested(&target->d_lock, 2);
   29                 spin_lock_nested(&dentry->d_lock, 3);
   24         } else {
                      spin_lock_nested(&dentry->d_lock, 2);
                      spin_lock_nested(&target->d_lock, 3);
   24         }
      }
      
      static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
      {
              if (target->d_parent != dentry->d_parent)
                      spin_unlock(&dentry->d_parent->d_lock);
              if (target->d_parent != target)
   10                 spin_unlock(&target->d_parent->d_lock);
   29         spin_unlock(&target->d_lock);
   29         spin_unlock(&dentry->d_lock);
   29 }
      
      /*
       * When switching names, the actual string doesn't strictly have to
       * be preserved in the target - because we're dropping the target
       * anyway. As such, we can just do a simple memcpy() to copy over
       * the new name before we switch, unless we are going to rehash
       * it.  Note that if we *do* unhash the target, we are not allowed
       * to rehash it without giving it a new name/hash key - whether
        * we swap or overwrite the names here, the resulting name won't match
        * the reality in the filesystem; it's only there for d_path() purposes.
        * Note that all of this is happening under rename_lock, so
        * any hash lookup seeing it in the middle of manipulations will
       * be discarded anyway.  So we do not care what happens to the hash
       * key in that case.
       */
      /*
       * __d_move - move a dentry
       * @dentry: entry to move
       * @target: new dentry
       * @exchange: exchange the two dentries
       *
       * Update the dcache to reflect the move of a file name. Negative
       * dcache entries should not be moved in this way. Caller must hold
       * rename_lock, the i_mutex of the source and target directories,
       * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
       */
      static void __d_move(struct dentry *dentry, struct dentry *target,
                           bool exchange)
   29 {
              if (!dentry->d_inode)
                      printk(KERN_WARNING "VFS: moving negative dcache entry\n");
   29 
              BUG_ON(d_ancestor(dentry, target));
              BUG_ON(d_ancestor(target, dentry));
   29 
   29         dentry_lock_for_move(dentry, target);
      
   29         write_seqcount_begin(&dentry->d_seq);
              write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
   29 
              /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
      
              /*
               * Move the dentry to the target hash queue. Don't bother checking
               * for the same hash queue because of how unlikely it is.
               */
              __d_drop(dentry);
              __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
      
              /*
               * Unhash the target (d_delete() is not usable here).  If exchanging
               * the two dentries, then rehash onto the other's hash queue.
               */
              __d_drop(target);
              if (exchange) {
                      __d_rehash(target,
                                 d_hash(dentry->d_parent, dentry->d_name.hash));
    8         }
      
              /* Switch the names.. */
              if (exchange)
                      swap_names(dentry, target);
              else
    8                 copy_name(dentry, target);
      
   21         /* ... and switch them in the tree */
              if (IS_ROOT(dentry)) {
                      /* splicing a tree */
   29                 dentry->d_flags |= DCACHE_RCUACCESS;
                      dentry->d_parent = target->d_parent;
                      target->d_parent = target;
                      list_del_init(&target->d_child);
                      list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
              } else {
                      /* swapping two dentries */
                      swap(dentry->d_parent, target->d_parent);
                      list_move(&target->d_child, &target->d_parent->d_subdirs);
   29                 list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
   29                 if (exchange)
   29                         fsnotify_d_move(target);
   29                 fsnotify_d_move(dentry);
    8         }
   29 
              write_seqcount_end(&target->d_seq);
              write_seqcount_end(&dentry->d_seq);
   29 
              dentry_unlock_for_move(dentry, target);
      }
   29 
      /*
       * d_move - move a dentry
       * @dentry: entry to move
       * @target: new dentry
       *
       * Update the dcache to reflect the move of a file name. Negative
       * dcache entries should not be moved in this way. See the locking
       * requirements for __d_move.
       */
      void d_move(struct dentry *dentry, struct dentry *target)
      {
              write_seqlock(&rename_lock);
              __d_move(dentry, target, false);
   21         write_sequnlock(&rename_lock);
      }
      EXPORT_SYMBOL(d_move);
      
      /*
       * d_exchange - exchange two dentries
       * @dentry1: first dentry
       * @dentry2: second dentry
       */
      void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
      {
              write_seqlock(&rename_lock);
      
    8         WARN_ON(!dentry1->d_inode);
              WARN_ON(!dentry2->d_inode);
              WARN_ON(IS_ROOT(dentry1));
    8         WARN_ON(IS_ROOT(dentry2));
    8 
    8         __d_move(dentry1, dentry2, true);
      
    8         write_sequnlock(&rename_lock);
      }
      
      /**
       * d_ancestor - search for an ancestor
       * @p1: ancestor dentry
       * @p2: child dentry
       *
       * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
       * an ancestor of p2, else NULL.
       */
      struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
      {
              struct dentry *p;
      
              for (p = p2; !IS_ROOT(p); p = p->d_parent) {
                      if (p->d_parent == p1)
   64                         return p;
   64         }
              return NULL;
      }
   14 
      /*
       * This helper attempts to cope with remotely renamed directories
       *
       * It assumes that the caller is already holding
       * dentry->d_parent->d_inode->i_mutex, and rename_lock
       *
       * Note: If ever the locking in lock_rename() changes, then please
       * remember to update this too...
       */
      static int __d_unalias(struct inode *inode,
                      struct dentry *dentry, struct dentry *alias)
      {
              struct mutex *m1 = NULL, *m2 = NULL;
              int ret = -ESTALE;
      
              /* If alias and dentry share a parent, then no extra locks required */
              if (alias->d_parent == dentry->d_parent)
                      goto out_unalias;
      
              /* See lock_rename() */
              if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                      goto out_err;
              m1 = &dentry->d_sb->s_vfs_rename_mutex;
              if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
                      goto out_err;
              m2 = &alias->d_parent->d_inode->i_mutex;
      out_unalias:
              __d_move(alias, dentry, false);
              ret = 0;
      out_err:
              if (m2)
                      mutex_unlock(m2);
              if (m1)
                      mutex_unlock(m1);
              return ret;
      }
      
      /**
       * d_splice_alias - splice a disconnected dentry into the tree if one exists
       * @inode:  the inode which may have a disconnected dentry
       * @dentry: a negative dentry which we want to point to the inode.
       *
       * If inode is a directory and has an IS_ROOT alias, then d_move that in
       * place of the given dentry and return it, else simply d_add the inode
       * to the dentry and return NULL.
       *
       * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
       * we should error out: directories can't have multiple aliases.
       *
       * This is needed in the lookup routine of any filesystem that is exportable
       * (via knfsd) so that we can build dcache paths to directories effectively.
       *
       * If a dentry was found and moved, then it is returned.  Otherwise NULL
       * is returned.  This matches the expected return value of ->lookup.
       *
       * Cluster filesystems may call this function with a negative, hashed dentry.
       * In that case, we know that the inode will be a regular file, and also this
       * will only occur during atomic_open. So we need to check for the dentry
       * being already hashed only in the final case.
       */
      struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
      {
              if (IS_ERR(inode))
                      return ERR_CAST(inode);
   52 
              BUG_ON(!d_unhashed(dentry));
      
   52         if (!inode) {
                      __d_instantiate(dentry, NULL);
   52                 goto out;
   46         }
              spin_lock(&inode->i_lock);
              if (S_ISDIR(inode->i_mode)) {
    6                 struct dentry *new = __d_find_any_alias(inode);
                      if (unlikely(new)) {
    6                         /* The reference to new ensures it remains an alias */
                              spin_unlock(&inode->i_lock);
                              write_seqlock(&rename_lock);
                              if (unlikely(d_ancestor(new, dentry))) {
                                      write_sequnlock(&rename_lock);
                                      dput(new);
                                      new = ERR_PTR(-ELOOP);
                                      pr_warn_ratelimited(
                                              "VFS: Lookup of '%s' in %s %s"
                                              " would have caused loop\n",
                                              dentry->d_name.name,
                                              inode->i_sb->s_type->name,
                                              inode->i_sb->s_id);
                              } else if (!IS_ROOT(new)) {
                                      int err = __d_unalias(inode, dentry, new);
                                      write_sequnlock(&rename_lock);
                                      if (err) {
                                              dput(new);
                                              new = ERR_PTR(err);
                                      }
                              } else {
                                      __d_move(new, dentry, false);
                                      write_sequnlock(&rename_lock);
                                      security_d_instantiate(new, inode);
                              }
                              iput(inode);
                              return new;
                      }
              }
              /* already taking inode->i_lock, so d_add() by hand */
              __d_instantiate(dentry, inode);
              spin_unlock(&inode->i_lock);
    6 out:
              security_d_instantiate(dentry, inode);
              d_rehash(dentry);
   52         return NULL;
      }
   52 EXPORT_SYMBOL(d_splice_alias);
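
       /*
        * Example (editorial sketch): the canonical use of d_splice_alias() is the
        * tail of a filesystem ->lookup() method.  "myfs_find_entry" and "myfs_iget"
        * are hypothetical helpers; a %NULL inode (name not found) simply produces a
        * negative dentry, and an IS_ERR inode is propagated as an error.
        */
       static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
                                         unsigned int flags)
       {
               struct inode *inode = NULL;
               u64 ino = myfs_find_entry(dir, &dentry->d_name);

               if (ino)
                       inode = myfs_iget(dir->i_sb, ino);
               return d_splice_alias(inode, dentry);
       }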
      
      static int prepend(char **buffer, int *buflen, const char *str, int namelen)
      {
              *buflen -= namelen;
              if (*buflen < 0)
  388                 return -ENAMETOOLONG;
              *buffer -= namelen;
              memcpy(*buffer, str, namelen);
   93         return 0;
      }
      
      /**
       * prepend_name - prepend a pathname in front of current buffer pointer
       * @buffer: buffer pointer
       * @buflen: allocated length of the buffer
       * @name:   name string and length qstr structure
       *
       * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
       * make sure that either the old or the new name pointer and length are
        * fetched. However, there may be a mismatch between the length and the
        * pointer. Since the length cannot be trusted, the name is copied
        * byte by byte until the length is reached or a NUL byte is found. A "/"
        * is also prepended at the beginning of the name. The sequence number
        * check at the caller will retry when a d_move() does happen, so any
        * garbage in the buffer due to a mismatched pointer and length is
        * discarded.
       *
       * Data dependency barrier is needed to make sure that we see that terminating
       * NUL.  Alpha strikes again, film at 11...
       */
      static int prepend_name(char **buffer, int *buflen, struct qstr *name)
      {
              const char *dname = ACCESS_ONCE(name->name);
              u32 dlen = ACCESS_ONCE(name->len);
  347         char *p;
      
              smp_read_barrier_depends();
      
              *buflen -= dlen + 1;
              if (*buflen < 0)
                      return -ENAMETOOLONG;
              p = *buffer -= dlen + 1;
              *p++ = '/';
  347         while (dlen--) {
                      char c = *dname++;
                      if (!c)
  347                         break;
                      *p++ = c;
              }
  347         return 0;
      }
  347 
      /**
       * prepend_path - Prepend path string to a buffer
       * @path: the dentry/vfsmount to report
       * @root: root vfsmnt/dentry
       * @buffer: pointer to the end of the buffer
       * @buflen: pointer to buffer length
       *
       * The function will first try to write out the pathname without taking any
       * lock other than the RCU read lock to make sure that dentries won't go away.
       * It only checks the sequence number of the global rename_lock as any change
       * in the dentry's d_seq will be preceded by changes in the rename_lock
        * sequence number. If the sequence number has changed, it restarts the
        * whole pathname back-tracing sequence again, this time taking rename_lock.
       * In this case, there is no need to take the RCU read lock as the recursive
       * parent pointer references will keep the dentry chain alive as long as no
       * rename operation is performed.
       */
      static int prepend_path(const struct path *path,
                              const struct path *root,
                              char **buffer, int *buflen)
      {
              struct dentry *dentry;
              struct vfsmount *vfsmnt;
              struct mount *mnt;
              int error = 0;
              unsigned seq, m_seq = 0;
              char *bptr;
              int blen;
      
              rcu_read_lock();
      restart_mnt:
   96         read_seqbegin_or_lock(&mount_lock, &m_seq);
              seq = 0;
   96         rcu_read_lock();
      restart:
   96         bptr = *buffer;
              blen = *buflen;
   96         error = 0;
              dentry = path->dentry;
              vfsmnt = path->mnt;
              mnt = real_mount(vfsmnt);
              read_seqbegin_or_lock(&rename_lock, &seq);
              while (dentry != root->dentry || vfsmnt != root->mnt) {
   96                 struct dentry * parent;
   96 
                      if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
                              struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
   96                         /* Escaped? */
   93                         if (dentry != vfsmnt->mnt_root) {
                                      bptr = *buffer;
                                      blen = *buflen;
                                      error = 3;
                                      break;
                              }
                              /* Global root? */
                              if (mnt != parent) {
                                      dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
                                      mnt = parent;
   92                                 vfsmnt = &mnt->mnt;
                                      continue;
                              }
                              if (!error)
                                      error = is_mounted(vfsmnt) ? 1 : 2;
                              break;
   90                 }
                      parent = dentry->d_parent;
                      prefetch(parent);
                      error = prepend_name(&bptr, &blen, &dentry->d_name);
   95                 if (error)
                              break;
   95 
                      dentry = parent;
              }
              if (!(seq & 1))
                      rcu_read_unlock();
   96         if (need_seqretry(&rename_lock, seq)) {
   96                 seq = 1;
   96                 goto restart;
              }
              done_seqretry(&rename_lock, seq);
      
              if (!(m_seq & 1))
                      rcu_read_unlock();
   96         if (need_seqretry(&mount_lock, m_seq)) {
   96                 m_seq = 1;
   96                 goto restart_mnt;
              }
              done_seqretry(&mount_lock, m_seq);
      
              if (error >= 0 && bptr == *buffer) {
                      if (--blen < 0)
   96                         error = -ENAMETOOLONG;
   48                 else
                              *--bptr = '/';
              }
   48         *buffer = bptr;
              *buflen = blen;
   96         return error;
      }
      
      /**
       * __d_path - return the path of a dentry
       * @path: the dentry/vfsmount to report
       * @root: root vfsmnt/dentry
       * @buf: buffer to return value in
       * @buflen: buffer length
       *
       * Convert a dentry into an ASCII path name.
       *
       * Returns a pointer into the buffer or an error code if the
       * path was too long.
       *
       * "buflen" should be positive.
       *
       * If the path is not reachable from the supplied root, return %NULL.
       */
      char *__d_path(const struct path *path,
                     const struct path *root,
                     char *buf, int buflen)
      {
              char *res = buf + buflen;
              int error;
   47 
              prepend(&res, &buflen, "\0", 1);
              error = prepend_path(path, root, &res, &buflen);
   47 
   47         if (error < 0)
                      return ERR_PTR(error);
              if (error > 0)
    1                 return NULL;
   47         return res;
      }
   47 
      char *d_absolute_path(const struct path *path,
                     char *buf, int buflen)
      {
              struct path root = {};
              char *res = buf + buflen;
              int error;
      
              prepend(&res, &buflen, "\0", 1);
              error = prepend_path(path, &root, &res, &buflen);
      
              if (error > 1)
                      error = -EINVAL;
              if (error < 0)
                      return ERR_PTR(error);
              return res;
      }
      EXPORT_SYMBOL(d_absolute_path);
      
      /*
       * same as __d_path but appends "(deleted)" for unlinked files.
       */
      static int path_with_deleted(const struct path *path,
                                   const struct path *root,
                                   char **buf, int *buflen)
      {
              prepend(buf, buflen, "\0", 1);
              if (d_unlinked(path->dentry)) {
   45                 int error = prepend(buf, buflen, " (deleted)", 10);
   45                 if (error)
    1                         return error;
              }
      
              return prepend_path(path, root, buf, buflen);
      }
   45 
      static int prepend_unreachable(char **buffer, int *buflen)
      {
              return prepend(buffer, buflen, "(unreachable)", 13);
      }
    1 
      static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
      {
              unsigned seq;
      
              do {
                      seq = read_seqcount_begin(&fs->seq);
                      *root = fs->root;
   45         } while (read_seqcount_retry(&fs->seq, seq));
      }
      
      /**
       * d_path - return the path of a dentry
       * @path: path to report
       * @buf: buffer to return value in
       * @buflen: buffer length
       *
       * Convert a dentry into an ASCII path name. If the entry has been deleted
       * the string " (deleted)" is appended. Note that this is ambiguous.
       *
       * Returns a pointer into the buffer or an error code if the path was
       * too long. Note: Callers should use the returned pointer, not the passed
       * in buffer, to use the name! The implementation often starts at an offset
       * into the buffer, and may leave 0 bytes at the start.
       *
       * "buflen" should be positive.
       */
      char *d_path(const struct path *path, char *buf, int buflen)
      {
              char *res = buf + buflen;
              struct path root;
   53         int error;
      
              /*
               * We have various synthetic filesystems that never get mounted.  On
               * these filesystems dentries are never used for lookup purposes, and
               * thus don't need to be hashed.  They also don't need a name until a
               * user wants to identify the object in /proc/pid/fd/.  The little hack
               * below allows us to generate a name for these objects on demand:
               *
               * Some pseudo inodes are mountable.  When they are mounted
               * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
               * and instead have d_path return the mounted path.
               */
              if (path->dentry->d_op && path->dentry->d_op->d_dname &&
                  (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
   42                 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
   14 
   53         rcu_read_lock();
              get_fs_root_rcu(current->fs, &root);
   45         error = path_with_deleted(path, &root, &res, &buflen);
   45         rcu_read_unlock();
   45 
   45         if (error < 0)
                      res = ERR_PTR(error);
   45         return res;
    1 }
      EXPORT_SYMBOL(d_path);
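
       /*
        * Example (editorial sketch): printing the path of an open file.  Note that
        * the returned pointer, not the start of the buffer, is used, as required
        * by the comment above.
        */
       static void example_log_path(struct file *file)
       {
               char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
               char *p;

               if (!buf)
                       return;
               p = d_path(&file->f_path, buf, PATH_MAX);
               if (!IS_ERR(p))
                       pr_info("opened: %s\n", p);
               kfree(buf);
       }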
      
      /*
       * Helper function for dentry_operations.d_dname() members
       */
      char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
                              const char *fmt, ...)
      {
              va_list args;
              char temp[64];
              int sz;
      
              va_start(args, fmt);
              sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
    8         va_end(args);
      
              if (sz > sizeof(temp) || sz > buflen)
                      return ERR_PTR(-ENAMETOOLONG);
    8 
    8         buffer += buflen - sz;
              return memcpy(buffer, temp, sz);
    8 }
      
      char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
      {
              char *end = buffer + buflen;
              /* these dentries are never renamed, so d_lock is not needed */
    6         if (prepend(&end, &buflen, " (deleted)", 11) ||
                  prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
    6             prepend(&end, &buflen, "/", 1))  
    6                 end = ERR_PTR(-ENAMETOOLONG);
    6         return end;
      }
    6 EXPORT_SYMBOL(simple_dname);
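
       /*
        * Example (editorial sketch): how a pseudo filesystem implements ->d_dname
        * so that d_path() above can synthesize a name on demand; pipefs does
        * exactly this via dynamic_dname().  The "myfs" prefix is a placeholder.
        */
       static char *myfs_dname(struct dentry *dentry, char *buffer, int buflen)
       {
               return dynamic_dname(dentry, buffer, buflen, "myfs:[%lu]",
                                    dentry->d_inode->i_ino);
       }

       static const struct dentry_operations myfs_dentry_ops = {
               .d_dname        = myfs_dname,
       };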
      
      /*
       * Write full pathname from the root of the filesystem into the buffer.
       */
      static char *__dentry_path(struct dentry *d, char *buf, int buflen)
      {
              struct dentry *dentry;
              char *end, *retval;
              int len, seq = 0;
              int error = 0;
      
              if (buflen < 2)
                      goto Elong;
  339 
              rcu_read_lock();
      restart:
  339         dentry = d;
              end = buf + buflen;
              len = buflen;
              prepend(&end, &len, "\0", 1);
              /* Get '/' right */
  339         retval = end-1;
              *retval = '/';
              read_seqbegin_or_lock(&rename_lock, &seq);
              while (!IS_ROOT(dentry)) {
  339                 struct dentry *parent = dentry->d_parent;
      
                      prefetch(parent);
                      error = prepend_name(&end, &len, &dentry->d_name);
  269                 if (error)
                              break;
      
                      retval = end;
                      dentry = parent;
  269         }
              if (!(seq & 1))
                      rcu_read_unlock();
  339         if (need_seqretry(&rename_lock, seq)) {
  339                 seq = 1;
                      goto restart;
              }
              done_seqretry(&rename_lock, seq);
              if (error)
                      goto Elong;
  339         return retval;
      Elong:
              return ERR_PTR(-ENAMETOOLONG);
      }
  339 
      char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
      {
              return __dentry_path(dentry, buf, buflen);
      }
  322 EXPORT_SYMBOL(dentry_path_raw);
      
      char *dentry_path(struct dentry *dentry, char *buf, int buflen)
      {
              char *p = NULL;
              char *retval;
      
              if (d_unlinked(dentry)) {
                      p = buf + buflen;
   17                 if (prepend(&p, &buflen, "//deleted", 10) != 0)
    1                         goto Elong;
    1                 buflen++;
              }
              retval = __dentry_path(dentry, buf, buflen);
              if (!IS_ERR(retval) && p)
    17                 *p = '/';        /* restore '/' overridden with '\0' */
   17         return retval;
    1 Elong:
              return ERR_PTR(-ENAMETOOLONG);
      }
      
      static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
                                          struct path *pwd)
      {
              unsigned seq;
      
              do {
                      seq = read_seqcount_begin(&fs->seq);
                      *root = fs->root;
    5                 *pwd = fs->pwd;
              } while (read_seqcount_retry(&fs->seq, seq));
      }
      
      /*
       * NOTE! The user-level library version returns a
       * character pointer. The kernel system call just
       * returns the length of the buffer filled (which
       * includes the ending '\0' character), or a negative
       * error value. So libc would do something like
       *
       *        char *getcwd(char * buf, size_t size)
       *        {
       *                int retval;
       *
       *                retval = sys_getcwd(buf, size);
       *                if (retval >= 0)
       *                        return buf;
       *                errno = -retval;
       *                return NULL;
       *        }
       */
      SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
      {
    5         int error;
              struct path pwd, root;
              char *page = __getname();
      
    5         if (!page)
                      return -ENOMEM;
      
              rcu_read_lock();
              get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
    5 
    5         error = -ENOENT;
              if (!d_unlinked(pwd.dentry)) {
                      unsigned long len;
    5                 char *cwd = page + PATH_MAX;
                      int buflen = PATH_MAX;
      
                      prepend(&cwd, &buflen, "\0", 1);
                      error = prepend_path(&pwd, &root, &cwd, &buflen);
    4                 rcu_read_unlock();
      
    4                 if (error < 0)
                              goto out;
      
    4                 /* Unreachable from current root */
                      if (error > 0) {
                              error = prepend_unreachable(&cwd, &buflen);
    4                         if (error)
    1                                 goto out;
                      }
      
                      error = -ERANGE;
                      len = PATH_MAX + page - cwd;
                      if (len <= size) {
    4                         error = len;
                              if (copy_to_user(buf, cwd, len))
    2                                 error = -EFAULT;
    3                 }
              } else {
                      rcu_read_unlock();
              }
    1 
      out:
              __putname(page);
              return error;
    5 }
    5 
      /*
       * Test whether new_dentry is a subdirectory of old_dentry.
       *
       * Trivially implemented using the dcache structure
       */
      
      /**
       * is_subdir - is new dentry a subdirectory of old_dentry
       * @new_dentry: new dentry
       * @old_dentry: old dentry
       *
       * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
       * Returns 0 otherwise.
       * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
       */
        
      int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
      {
              int result;
              unsigned seq;
      
              if (new_dentry == old_dentry)
                      return 1;
   66 
              do {
                      /* for restarting inner loop in case of seq retry */
                      seq = read_seqbegin(&rename_lock);
                      /*
    31                  * Need rcu_read_lock() to protect against d_parent changing
                        * under us due to d_move()
                       */
                      rcu_read_lock();
                      if (d_ancestor(old_dentry, new_dentry))
   31                         result = 1;
   31                 else
                              result = 0;
                      rcu_read_unlock();
              } while (read_seqretry(&rename_lock, seq));
   31 
   66         return result;
      }
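
       /*
        * Example (editorial sketch): a typical is_subdir() check, refusing to act
        * on a dentry that does not live below a given root.
        */
       static int example_check_below(struct dentry *victim, struct dentry *root)
       {
               if (!is_subdir(victim, root))
                       return -EXDEV;
               return 0;
       }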
      
      static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
      {
   21         struct dentry *root = data;
              if (dentry != root) {
                      if (d_unhashed(dentry) || !dentry->d_inode)
   21                         return D_WALK_SKIP;
    2 
                      if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
                              dentry->d_flags |= DCACHE_GENOCIDE;
   21                         dentry->d_lockref.count--;
    2                 }
              }
              return D_WALK_CONTINUE;
      }
      
      void d_genocide(struct dentry *parent)
      {
              d_walk(parent, parent, d_genocide_kill, NULL);
      }
   21 
      void d_tmpfile(struct dentry *dentry, struct inode *inode)
      {
              inode_dec_link_count(inode);
              BUG_ON(dentry->d_name.name != dentry->d_iname ||
    3                 !hlist_unhashed(&dentry->d_u.d_alias) ||
    3                 !d_unlinked(dentry));
              spin_lock(&dentry->d_parent->d_lock);
              spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
    3         dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
                                      (unsigned long long)inode->i_ino);
              spin_unlock(&dentry->d_lock);
              spin_unlock(&dentry->d_parent->d_lock);
              d_instantiate(dentry, inode);
      }
      EXPORT_SYMBOL(d_tmpfile);
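
       /*
        * Example (editorial sketch): a filesystem ->tmpfile() method of this
        * kernel's vintage, int (*tmpfile)(struct inode *, struct dentry *, umode_t).
        * "myfs_new_inode" is a hypothetical helper that returns a fully set up
        * inode with i_nlink == 1; d_tmpfile() then drops the link count and gives
        * the dentry its "#<ino>" name.
        */
       static int myfs_tmpfile(struct inode *dir, struct dentry *dentry,
                               umode_t mode)
       {
               struct inode *inode = myfs_new_inode(dir, mode);

               if (IS_ERR(inode))
                       return PTR_ERR(inode);
               d_tmpfile(dentry, inode);
               return 0;
       }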
      
      static __initdata unsigned long dhash_entries;
      static int __init set_dhash_entries(char *str)
      {
              if (!str)
                      return 0;
              dhash_entries = simple_strtoul(str, &str, 0);
              return 1;
      }
      __setup("dhash_entries=", set_dhash_entries);
      
      static void __init dcache_init_early(void)
      {
              unsigned int loop;
      
              /* If hashes are distributed across NUMA nodes, defer
               * hash allocation until vmalloc space is available.
               */
              if (hashdist)
                      return;
      
              dentry_hashtable =
                      alloc_large_system_hash("Dentry cache",
                                              sizeof(struct hlist_bl_head),
                                              dhash_entries,
                                              13,
                                              HASH_EARLY,
                                              &d_hash_shift,
                                              &d_hash_mask,
                                              0,
                                              0);
      
              for (loop = 0; loop < (1U << d_hash_shift); loop++)
                      INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
      }
      
      static void __init dcache_init(void)
      {
              unsigned int loop;
      
              /* 
               * A constructor could be added for stable state like the lists,
               * but it is probably not worth it because of the cache nature
               * of the dcache. 
               */
              dentry_cache = KMEM_CACHE(dentry,
                      SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
      
              /* Hash may have been set up in dcache_init_early */
              if (!hashdist)
                      return;
      
              dentry_hashtable =
                      alloc_large_system_hash("Dentry cache",
                                              sizeof(struct hlist_bl_head),
                                              dhash_entries,
                                              13,
                                              0,
                                              &d_hash_shift,
                                              &d_hash_mask,
                                              0,
                                              0);
      
              for (loop = 0; loop < (1U << d_hash_shift); loop++)
                      INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
      }
      
      /* SLAB cache for __getname() consumers */
      struct kmem_cache *names_cachep __read_mostly;
      EXPORT_SYMBOL(names_cachep);
      
      EXPORT_SYMBOL(d_genocide);
      
      void __init vfs_caches_init_early(void)
      {
              dcache_init_early();
              inode_init_early();
      }
      
      void __init vfs_caches_init(void)
      {
              names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
                              SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
      
              dcache_init();
              inode_init();
              files_init();
              files_maxfiles_init();
              mnt_init();
              bdev_cache_init();
              chrdev_init();
      }
      /*
       * Rusty Russell (C)2000 -- This code is GPL.
       * Patrick McHardy (c) 2006-2012
       */
      
      #include <linux/kernel.h>
      #include <linux/slab.h>
      #include <linux/init.h>
      #include <linux/module.h>
      #include <linux/proc_fs.h>
      #include <linux/skbuff.h>
      #include <linux/netfilter.h>
      #include <linux/netfilter_bridge.h>
      #include <linux/seq_file.h>
      #include <linux/rcupdate.h>
      #include <net/protocol.h>
      #include <net/netfilter/nf_queue.h>
      #include <net/dst.h>
      
      #include "nf_internals.h"
      
      /*
       * Hook for nfnetlink_queue to register its queue handler.
       * We do this so that most of the NFQUEUE code can be modular.
       *
       * Once the queue is registered it must reinject all packets it
       * receives, no matter what.
       */
      
       /* Only one queue handler may be registered per network namespace;
        * registering another handler triggers a warning and replaces the
        * existing pointer. */
      void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
      {
              /* should never happen, we only have one queueing backend in kernel */
   30         WARN_ON(rcu_access_pointer(net->nf.queue_handler));
   30         rcu_assign_pointer(net->nf.queue_handler, qh);
      }
      EXPORT_SYMBOL(nf_register_queue_handler);
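
       /*
        * Example (editorial sketch): what the single in-kernel queueing backend
        * (nfnetlink_queue) conceptually provides.  The handler layout matches the
        * ->outfn()/->nf_hook_drop() calls made elsewhere in this file; the bodies
        * here are placeholders only.
        */
       static int example_outfn(struct nf_queue_entry *entry, unsigned int queuenum)
       {
               /* hand the packet to userspace here; it must be reinjected later */
               return 0;
       }

       static void example_hook_drop(struct net *net, struct nf_hook_ops *ops)
       {
               /* flush any entries that were queued via @ops */
       }

       static const struct nf_queue_handler example_qh = {
               .outfn          = example_outfn,
               .nf_hook_drop   = example_hook_drop,
       };

       /* registered once per net namespace, e.g. from a pernet ->init() hook:
        *         nf_register_queue_handler(net, &example_qh);
        */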
      
      /* The caller must flush their queue before this */
      void nf_unregister_queue_handler(struct net *net)
      {
              RCU_INIT_POINTER(net->nf.queue_handler, NULL);
      }
      EXPORT_SYMBOL(nf_unregister_queue_handler);
      
      void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
      {
              struct nf_hook_state *state = &entry->state;
      
              /* Release those devices we held, or Alexey will kill me. */
              if (state->in)
                      dev_put(state->in);
              if (state->out)
                      dev_put(state->out);
              if (state->sk)
                      sock_put(state->sk);
      #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
              if (entry->skb->nf_bridge) {
                      struct net_device *physdev;
      
                      physdev = nf_bridge_get_physindev(entry->skb);
                      if (physdev)
                              dev_put(physdev);
                      physdev = nf_bridge_get_physoutdev(entry->skb);
                      if (physdev)
                              dev_put(physdev);
              }
      #endif
      }
      EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
      
      /* Bump dev refs so they don't vanish while packet is out */
      void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
      {
              struct nf_hook_state *state = &entry->state;
      
              if (state->in)
                      dev_hold(state->in);
              if (state->out)
                      dev_hold(state->out);
              if (state->sk)
                      sock_hold(state->sk);
      #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
              if (entry->skb->nf_bridge) {
                      struct net_device *physdev;
      
                      physdev = nf_bridge_get_physindev(entry->skb);
                      if (physdev)
                              dev_hold(physdev);
                      physdev = nf_bridge_get_physoutdev(entry->skb);
                      if (physdev)
                              dev_hold(physdev);
              }
      #endif
      }
      EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
      
      void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops)
      {
              const struct nf_queue_handler *qh;
      
              rcu_read_lock();
              qh = rcu_dereference(net->nf.queue_handler);
              if (qh)
                      qh->nf_hook_drop(net, ops);
              rcu_read_unlock();
      }
      
      /*
       * Any packet that leaves via this function must come back
       * through nf_reinject().
       */
      int nf_queue(struct sk_buff *skb,
                   struct nf_hook_ops *elem,
                   struct nf_hook_state *state,
                   unsigned int queuenum)
      {
              int status = -ENOENT;
              struct nf_queue_entry *entry = NULL;
              const struct nf_afinfo *afinfo;
              const struct nf_queue_handler *qh;
              struct net *net = state->net;
      
              /* QUEUE == DROP if no one is waiting, to be safe. */
              qh = rcu_dereference(net->nf.queue_handler);
              if (!qh) {
                      status = -ESRCH;
                      goto err;
              }
      
              afinfo = nf_get_afinfo(state->pf);
              if (!afinfo)
                      goto err;
      
              entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
              if (!entry) {
                      status = -ENOMEM;
                      goto err;
              }
      
              *entry = (struct nf_queue_entry) {
                      .skb        = skb,
                      .elem        = elem,
                      .state        = *state,
                      .size        = sizeof(*entry) + afinfo->route_key_size,
              };
      
              nf_queue_entry_get_refs(entry);
              skb_dst_force(skb);
              afinfo->saveroute(skb, entry);
              status = qh->outfn(entry, queuenum);
      
              if (status < 0) {
                      nf_queue_entry_release_refs(entry);
                      goto err;
              }
      
              return 0;
      
      err:
              kfree(entry);
              return status;
      }
      
      void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
      {
              struct sk_buff *skb = entry->skb;
              struct nf_hook_ops *elem = entry->elem;
              const struct nf_afinfo *afinfo;
              int err;
      
              nf_queue_entry_release_refs(entry);
      
              /* Continue traversal iff userspace said ok... */
              if (verdict == NF_REPEAT)
                      verdict = elem->hook(elem->priv, skb, &entry->state);
      
              if (verdict == NF_ACCEPT) {
                      afinfo = nf_get_afinfo(entry->state.pf);
                      if (!afinfo || afinfo->reroute(entry->state.net, skb, entry) < 0)
                              verdict = NF_DROP;
              }
      
              entry->state.thresh = INT_MIN;
      
              if (verdict == NF_ACCEPT) {
              next_hook:
                      verdict = nf_iterate(entry->state.hook_list,
                                           skb, &entry->state, &elem);
              }
      
              switch (verdict & NF_VERDICT_MASK) {
              case NF_ACCEPT:
              case NF_STOP:
                      local_bh_disable();
                      entry->state.okfn(entry->state.net, entry->state.sk, skb);
                      local_bh_enable();
                      break;
              case NF_QUEUE:
                      err = nf_queue(skb, elem, &entry->state,
                                     verdict >> NF_VERDICT_QBITS);
                      if (err < 0) {
                              if (err == -ESRCH &&
                                 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
                                      goto next_hook;
                              kfree_skb(skb);
                      }
                      break;
              case NF_STOLEN:
                      break;
              default:
                      kfree_skb(skb);
              }
      
              kfree(entry);
      }
      EXPORT_SYMBOL(nf_reinject);
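
       /*
        * Illustrative sketch, not part of the original source: the queue-bypass
        * handling in nf_queue()/nf_reinject() above reacts to a verdict like the
        * one below, where a (hypothetical) hook asks for its packets to go to
        * queue 7 but lets traversal continue if no handler is listening (the
        * -ESRCH case).
        */
       static unsigned int example_queue_hook(void *priv, struct sk_buff *skb,
                                              const struct nf_hook_state *state)
       {
               return NF_QUEUE_NR(7) | NF_VERDICT_FLAG_QUEUE_BYPASS;
       }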
      /*
       * include/linux/pagevec.h
       *
       * In many places it is efficient to batch an operation up against multiple
       * pages.  A pagevec is a multipage container which is used for that.
       */
      
      #ifndef _LINUX_PAGEVEC_H
      #define _LINUX_PAGEVEC_H
      
       /* 14 pointers + two longs align the pagevec structure to a power of two */
      #define PAGEVEC_SIZE        14
      
      struct page;
      struct address_space;
      
      struct pagevec {
              unsigned long nr;
              unsigned long cold;
              struct page *pages[PAGEVEC_SIZE];
      };
      
      void __pagevec_release(struct pagevec *pvec);
      void __pagevec_lru_add(struct pagevec *pvec);
      unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                      struct address_space *mapping,
                                      pgoff_t start, unsigned nr_entries,
                                      pgoff_t *indices);
      void pagevec_remove_exceptionals(struct pagevec *pvec);
      unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
                      pgoff_t start, unsigned nr_pages);
      unsigned pagevec_lookup_tag(struct pagevec *pvec,
                      struct address_space *mapping, pgoff_t *index, int tag,
                      unsigned nr_pages);
      
      static inline void pagevec_init(struct pagevec *pvec, int cold)
      {
              pvec->nr = 0;
              pvec->cold = cold;
      }
      
      static inline void pagevec_reinit(struct pagevec *pvec)
      {
              pvec->nr = 0;
      }
  929 
   14 static inline unsigned pagevec_count(struct pagevec *pvec)
      {
              return pvec->nr;
      }
      
      static inline unsigned pagevec_space(struct pagevec *pvec)
      {
              return PAGEVEC_SIZE - pvec->nr;
      }
      
 2260 /*
       * Add a page to a pagevec.  Returns the number of slots still available.
       */
      static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
      {
 3106         pvec->pages[pvec->nr++] = page;
              return pagevec_space(pvec);
      }
      
      static inline void pagevec_release(struct pagevec *pvec)
      {
              if (pagevec_count(pvec))
                      __pagevec_release(pvec);
 3152 }
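
       /*
        * Illustrative sketch, not part of the original header: the usual pattern
        * is to keep adding pages until pagevec_add() reports no free slots, then
        * drain the whole batch at once.  release_example_pages() is hypothetical;
        * it drops one reference on each page, batching the expensive part.
        */
       static inline void release_example_pages(struct page **pages, unsigned int nr)
       {
               struct pagevec pvec;
               unsigned int i;

               pagevec_init(&pvec, 0);         /* 0 == not a cold-page batch */
               for (i = 0; i < nr; i++) {
                       /* drain once the PAGEVEC_SIZE-slot batch fills up */
                       if (!pagevec_add(&pvec, pages[i]))
                               pagevec_release(&pvec);
               }
               pagevec_release(&pvec);         /* drain the remainder */
       }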
      
      #endif /* _LINUX_PAGEVEC_H */
      /*
       * NET                An implementation of the SOCKET network access protocol.
       *
       * Version:        @(#)socket.c        1.1.93        18/02/95
       *
       * Authors:        Orest Zborowski, <obz@Kodak.COM>
       *                Ross Biro
       *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
       *
       * Fixes:
       *                Anonymous        :        NOTSOCK/BADF cleanup. Error fix in
       *                                        shutdown()
       *                Alan Cox        :        verify_area() fixes
       *                Alan Cox        :        Removed DDI
       *                Jonathan Kamens        :        SOCK_DGRAM reconnect bug
       *                Alan Cox        :        Moved a load of checks to the very
       *                                        top level.
       *                Alan Cox        :        Move address structures to/from user
       *                                        mode above the protocol layers.
       *                Rob Janssen        :        Allow 0 length sends.
       *                Alan Cox        :        Asynchronous I/O support (cribbed from the
       *                                        tty drivers).
       *                Niibe Yutaka        :        Asynchronous I/O for writes (4.4BSD style)
       *                Jeff Uphoff        :        Made max number of sockets command-line
       *                                        configurable.
       *                Matti Aarnio        :        Made the number of sockets dynamic,
       *                                        to be allocated when needed, and mr.
       *                                        Uphoff's max is used as max to be
       *                                        allowed to allocate.
       *                Linus                :        Argh. removed all the socket allocation
       *                                        altogether: it's in the inode now.
       *                Alan Cox        :        Made sock_alloc()/sock_release() public
       *                                        for NetROM and future kernel nfsd type
       *                                        stuff.
       *                Alan Cox        :        sendmsg/recvmsg basics.
       *                Tom Dyas        :        Export net symbols.
       *                Marcin Dalecki        :        Fixed problems with CONFIG_NET="n".
       *                Alan Cox        :        Added thread locking to sys_* calls
       *                                        for sockets. May have errors at the
       *                                        moment.
       *                Kevin Buhr        :        Fixed the dumb errors in the above.
       *                Andi Kleen        :        Some small cleanups, optimizations,
       *                                        and fixed a copy_from_user() bug.
       *                Tigran Aivazian        :        sys_send(args) calls sys_sendto(args, NULL, 0)
       *                Tigran Aivazian        :        Made listen(2) backlog sanity checks
       *                                        protocol-independent
       *
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       *
       *
       *        This module is effectively the top level interface to the BSD socket
       *        paradigm.
       *
       *        Based upon Swansea University Computer Society NET3.039
       */
      
      #include <linux/mm.h>
      #include <linux/socket.h>
      #include <linux/file.h>
      #include <linux/net.h>
      #include <linux/interrupt.h>
      #include <linux/thread_info.h>
      #include <linux/rcupdate.h>
      #include <linux/netdevice.h>
      #include <linux/proc_fs.h>
      #include <linux/seq_file.h>
      #include <linux/mutex.h>
      #include <linux/if_bridge.h>
      #include <linux/if_frad.h>
      #include <linux/if_vlan.h>
      #include <linux/ptp_classify.h>
      #include <linux/init.h>
      #include <linux/poll.h>
      #include <linux/cache.h>
      #include <linux/module.h>
      #include <linux/highmem.h>
      #include <linux/mount.h>
      #include <linux/security.h>
      #include <linux/syscalls.h>
      #include <linux/compat.h>
      #include <linux/kmod.h>
      #include <linux/audit.h>
      #include <linux/wireless.h>
      #include <linux/nsproxy.h>
      #include <linux/magic.h>
      #include <linux/slab.h>
      #include <linux/xattr.h>
      #include <linux/nospec.h>
      
      #include <asm/uaccess.h>
      #include <asm/unistd.h>
      
      #include <net/compat.h>
      #include <net/wext.h>
      #include <net/cls_cgroup.h>
      
      #include <net/sock.h>
      #include <linux/netfilter.h>
      
      #include <linux/if_tun.h>
      #include <linux/ipv6_route.h>
      #include <linux/route.h>
      #include <linux/sockios.h>
      #include <linux/atalk.h>
      #include <net/busy_poll.h>
      #include <linux/errqueue.h>
      
      #ifdef CONFIG_NET_RX_BUSY_POLL
      unsigned int sysctl_net_busy_read __read_mostly;
      unsigned int sysctl_net_busy_poll __read_mostly;
      #endif
      
      static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
      static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
      static int sock_mmap(struct file *file, struct vm_area_struct *vma);
      
      static int sock_close(struct inode *inode, struct file *file);
      static unsigned int sock_poll(struct file *file,
                                    struct poll_table_struct *wait);
      static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
      #ifdef CONFIG_COMPAT
      static long compat_sock_ioctl(struct file *file,
                                    unsigned int cmd, unsigned long arg);
      #endif
      static int sock_fasync(int fd, struct file *filp, int on);
      static ssize_t sock_sendpage(struct file *file, struct page *page,
                                   int offset, size_t size, loff_t *ppos, int more);
      static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                      struct pipe_inode_info *pipe, size_t len,
                                      unsigned int flags);
      
      /*
        *        Socket files have a set of 'special' operations as well as the
        *        generic file ones. These don't appear in the operation structures
        *        but are done directly via the socketcall() multiplexor.
       */
      
      static const struct file_operations socket_file_ops = {
              .owner =        THIS_MODULE,
              .llseek =        no_llseek,
              .read_iter =        sock_read_iter,
              .write_iter =        sock_write_iter,
              .poll =                sock_poll,
              .unlocked_ioctl = sock_ioctl,
      #ifdef CONFIG_COMPAT
              .compat_ioctl = compat_sock_ioctl,
      #endif
              .mmap =                sock_mmap,
              .release =        sock_close,
              .fasync =        sock_fasync,
              .sendpage =        sock_sendpage,
              .splice_write = generic_splice_sendpage,
              .splice_read =        sock_splice_read,
      };
      
      /*
       *        The protocol list. Each protocol is registered in here.
       */
      
      static DEFINE_SPINLOCK(net_family_lock);
      static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
      
      /*
       *        Statistics counters of the socket lists
       */
      
      static DEFINE_PER_CPU(int, sockets_in_use);
      
      /*
       * Support routines.
       * Move socket addresses back and forth across the kernel/user
       * divide and look after the messy bits.
       */
      
      /**
       *        move_addr_to_kernel        -        copy a socket address into kernel space
       *        @uaddr: Address in user space
       *        @kaddr: Address in kernel space
       *        @ulen: Length in user space
       *
        *        The address is copied into kernel space. If the provided address is
        *        too long, -EINVAL is returned. If the copy faults, -EFAULT is
        *        returned. On success, 0 is returned.
       */
      
      int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
      {
 1310         if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
                      return -EINVAL;
  422         if (ulen == 0)
                      return 0;
 1299         if (copy_from_user(kaddr, uaddr, ulen))
                      return -EFAULT;
 1299         return audit_sockaddr(ulen, kaddr);
      }
      
      /**
       *        move_addr_to_user        -        copy an address to user space
       *        @kaddr: kernel space address
       *        @klen: length of address in kernel
       *        @uaddr: user space address
       *        @ulen: pointer to user length field
       *
        *        On entry, the value pointed to by ulen is the available buffer
        *        length; it is overwritten with the buffer space used. -EINVAL is
        *        returned if a negative or overlong buffer length is specified.
        *        -EFAULT is returned if either the buffer or the length field is
        *        not accessible.
        *        After copying the data up to the limit the user specified, the
        *        true length of the data is written over that limit. Zero is
        *        returned on success.
       */
      
      static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
                                   void __user *uaddr, int __user *ulen)
      {
              int err;
              int len;
      
  129         BUG_ON(klen > sizeof(struct sockaddr_storage));
  129         err = get_user(len, ulen);
              if (err)
                      return err;
  125         if (len > klen)
                      len = klen;
              if (len < 0)
                      return -EINVAL;
  124         if (len) {
   64                 if (audit_sockaddr(klen, kaddr))
                              return -ENOMEM;
   64                 if (copy_to_user(uaddr, kaddr, len))
                              return -EFAULT;
              }
              /*
               *      "fromlen shall refer to the value before truncation.."
               *                      1003.1g
               */
  129         return __put_user(klen, ulen);
      }
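
       /*
        * Illustrative sketch, not part of the original source: a getsockname()-
        * style path pairs the two copy helpers above.  example_get_name() and
        * its arguments are hypothetical, and the ->getname() convention with a
        * separate length pointer and peer flag is assumed for this kernel era.
        */
       static int example_get_name(struct socket *sock, void __user *uaddr,
                                   int __user *ulen)
       {
               struct sockaddr_storage address;
               int len, err;

               err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
               if (err)
                       return err;
               /* copy at most *ulen bytes out, then report the true length */
               return move_addr_to_user(&address, len, uaddr, ulen);
       }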
      
      static struct kmem_cache *sock_inode_cachep __read_mostly;
      
      static struct inode *sock_alloc_inode(struct super_block *sb)
      {
              struct socket_alloc *ei;
              struct socket_wq *wq;
      
  936         ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
              if (!ei)
                      return NULL;
  936         wq = kmalloc(sizeof(*wq), GFP_KERNEL);
              if (!wq) {
                      kmem_cache_free(sock_inode_cachep, ei);
                      return NULL;
              }
  936         init_waitqueue_head(&wq->wait);
              wq->fasync_list = NULL;
              wq->flags = 0;
              RCU_INIT_POINTER(ei->socket.wq, wq);
      
              ei->socket.state = SS_UNCONNECTED;
              ei->socket.flags = 0;
              ei->socket.ops = NULL;
              ei->socket.sk = NULL;
              ei->socket.file = NULL;
      
  936         return &ei->vfs_inode;
      }
      
      static void sock_destroy_inode(struct inode *inode)
      {
              struct socket_alloc *ei;
              struct socket_wq *wq;
      
  496         ei = container_of(inode, struct socket_alloc, vfs_inode);
              wq = rcu_dereference_protected(ei->socket.wq, 1);
              kfree_rcu(wq, rcu);
              kmem_cache_free(sock_inode_cachep, ei);
      }
      
      static void init_once(void *foo)
      {
              struct socket_alloc *ei = (struct socket_alloc *)foo;
      
   28         inode_init_once(&ei->vfs_inode);
      }
      
      static int init_inodecache(void)
      {
              sock_inode_cachep = kmem_cache_create("sock_inode_cache",
                                                    sizeof(struct socket_alloc),
                                                    0,
                                                    (SLAB_HWCACHE_ALIGN |
                                                     SLAB_RECLAIM_ACCOUNT |
                                                     SLAB_MEM_SPREAD),
                                                    init_once);
              if (sock_inode_cachep == NULL)
                      return -ENOMEM;
              return 0;
      }
      
      static const struct super_operations sockfs_ops = {
              .alloc_inode        = sock_alloc_inode,
              .destroy_inode        = sock_destroy_inode,
              .statfs                = simple_statfs,
      };
      
      /*
       * sockfs_dname() is called from d_path().
       */
      static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
      {
              return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
    4                                 d_inode(dentry)->i_ino);
      }
      
      static const struct dentry_operations sockfs_dentry_operations = {
              .d_dname  = sockfs_dname,
      };
      
      static struct dentry *sockfs_mount(struct file_system_type *fs_type,
                               int flags, const char *dev_name, void *data)
      {
              return mount_pseudo(fs_type, "socket:", &sockfs_ops,
                      &sockfs_dentry_operations, SOCKFS_MAGIC);
      }
      
      static struct vfsmount *sock_mnt __read_mostly;
      
      static struct file_system_type sock_fs_type = {
              .name =                "sockfs",
              .mount =        sockfs_mount,
              .kill_sb =        kill_anon_super,
      };
      
      /*
        *        Obtain the first available file descriptor and set it up for use.
        *
        *        These functions create file structures and map them into the fd
        *        space of the current process. On success they return the file
        *        descriptor, with the file struct implicitly stored in sock->file.
        *        Note that another thread may close the file descriptor before we
        *        return from this function. We rely on the fact that we do not
        *        refer to the socket after mapping; should that ever change, this
        *        function would have to take an extra reference on the file.
        *
        *        In any case the returned fd may already be invalid: with shared
        *        fd tables this race is unavoidable and cannot be solved inside
        *        the kernel, but internal coherence is maintained nonetheless.
       */
      
      struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
      {
  833         struct qstr name = { .name = "" };
              struct path path;
              struct file *file;
      
              if (dname) {
   35                 name.name = dname;
                      name.len = strlen(name.name);
  803         } else if (sock->sk) {
  803                 name.name = sock->sk->sk_prot_creator->name;
                      name.len = strlen(name.name);
              }
  833         path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
              if (unlikely(!path.dentry))
                      return ERR_PTR(-ENOMEM);
  833         path.mnt = mntget(sock_mnt);
      
              d_instantiate(path.dentry, SOCK_INODE(sock));
      
              file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
                        &socket_file_ops);
              if (IS_ERR(file)) {
                      /* drop dentry, keep inode */
                      ihold(d_inode(path.dentry));
                      path_put(&path);
                      return file;
              }
      
  833         sock->file = file;
              file->f_flags = O_RDWR | (flags & O_NONBLOCK);
              file->private_data = sock;
  833         return file;
      }
      EXPORT_SYMBOL(sock_alloc_file);
      
      static int sock_map_fd(struct socket *sock, int flags)
      {
              struct file *newfile;
              int fd = get_unused_fd_flags(flags);
              if (unlikely(fd < 0))
                      return fd;
      
  697         newfile = sock_alloc_file(sock, flags, NULL);
              if (likely(!IS_ERR(newfile))) {
  697                 fd_install(fd, newfile);
                      return fd;
              }
      
              put_unused_fd(fd);
              return PTR_ERR(newfile);
      }
      
      struct socket *sock_from_file(struct file *file, int *err)
      {
 3720         if (file->f_op == &socket_file_ops)
  3713                 return file->private_data;        /* set in sock_alloc_file() */
      
   12         *err = -ENOTSOCK;
              return NULL;
      }
      EXPORT_SYMBOL(sock_from_file);
      
      /**
       *        sockfd_lookup - Go from a file number to its socket slot
       *        @fd: file handle
       *        @err: pointer to an error code return
       *
        *        The file handle passed in is locked and the socket it is bound
        *        to is returned. If an error occurs the err pointer is overwritten
       *        with a negative errno code and NULL is returned. The function checks
       *        for both invalid handles and passing a handle which is not a socket.
       *
       *        On a success the socket object pointer is returned.
       */
      
      struct socket *sockfd_lookup(int fd, int *err)
      {
              struct file *file;
              struct socket *sock;
      
 1240         file = fget(fd);
              if (!file) {
   22                 *err = -EBADF;
                      return NULL;
              }
      
 1233         sock = sock_from_file(file, err);
 1231         if (!sock)
   24                 fput(file);
              return sock;
      }
      EXPORT_SYMBOL(sockfd_lookup);
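
       /*
        * Illustrative sketch, not part of the original source: callers outside
        * this file normally bracket sockfd_lookup() with sockfd_put() (from
        * <linux/net.h>) to drop the reference taken by fget().  The helper name
        * is hypothetical.
        */
       static int example_socket_type_of_fd(int fd)
       {
               struct socket *sock;
               int err;

               sock = sockfd_lookup(fd, &err);
               if (!sock)
                       return err;             /* -EBADF or -ENOTSOCK */
               err = sock->type;
               sockfd_put(sock);
               return err;
       }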
      
      static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
      {
 2877         struct fd f = fdget(fd);
              struct socket *sock;
      
              *err = -EBADF;
              if (f.file) {
 2870                 sock = sock_from_file(f.file, err);
                      if (likely(sock)) {
 2865                         *fput_needed = f.flags;
                              return sock;
                      }
   10                 fdput(f);
              }
 2877         return NULL;
      }
      
      #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
      #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
      #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
      static ssize_t sockfs_getxattr(struct dentry *dentry,
                                     const char *name, void *value, size_t size)
      {
              if (!strcmp(name, XATTR_NAME_SOCKPROTONAME)) {
                      if (value) {
                              if (dentry->d_name.len + 1 > size)
                                      return -ERANGE;
                              memcpy(value, dentry->d_name.name, dentry->d_name.len + 1);
    6                 }
    3                 return dentry->d_name.len + 1;
              }
              return -EOPNOTSUPP;
      }
      
    2 static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
                                      size_t size)
      {
    1         ssize_t len;
              ssize_t used = 0;
    2 
              len = security_inode_listsecurity(d_inode(dentry), buffer, size);
              if (len < 0)
                      return len;
    6         used += len;
              if (buffer) {
                      if (size < used)
                              return -ERANGE;
                      buffer += len;
              }
      
              len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
              used += len;
    6         if (buffer) {
                      if (size < used)
                              return -ERANGE;
                      memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len);
    6                 buffer += len;
    5         }
      
    4         return used;
      }
      
      static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
    1 {
    1         int err = simple_setattr(dentry, iattr);
    4 
               if (!err && (iattr->ia_valid & ATTR_UID)) {
                       struct socket *sock = SOCKET_I(d_inode(dentry));

                       /* the socket may already have lost its sk */
                       if (sock->sk)
                               sock->sk->sk_uid = iattr->ia_uid;
                       else
                               err = -ENOENT;
               }
      
              return err;
      }
    1 
      static const struct inode_operations sockfs_inode_ops = {
    3         .getxattr = sockfs_getxattr,
              .listxattr = sockfs_listxattr,
    3         .setattr = sockfs_setattr,
    1 };
      
      /**
    1  *        sock_alloc        -        allocate a socket
       *
       *        Allocate a new inode and socket object. The two are bound together
       *        and initialised. The socket is then returned. If we are out of inodes
       *        NULL is returned.
    3  */
      
      static struct socket *sock_alloc(void)
      {
              struct inode *inode;
              struct socket *sock;
      
              inode = new_inode_pseudo(sock_mnt->mnt_sb);
              if (!inode)
                      return NULL;
      
              sock = SOCKET_I(inode);
      
              kmemcheck_annotate_bitfield(sock, type);
              inode->i_ino = get_next_ino();
              inode->i_mode = S_IFSOCK | S_IRWXUGO;
              inode->i_uid = current_fsuid();
              inode->i_gid = current_fsgid();
              inode->i_op = &sockfs_inode_ops;
      
              this_cpu_add(sockets_in_use, 1);
              return sock;
  936 }
      
      /**
       *        sock_release        -        close a socket
  936  *        @sock: socket to close
       *
       *        The socket is released from the protocol stack if it has a release
       *        callback, and the inode is then released if the socket is bound to
       *        an inode not a file.
       */
      
      void sock_release(struct socket *sock)
      {
              if (sock->ops) {
  936                 struct module *owner = sock->ops->owner;
      
                      sock->ops->release(sock);
                      sock->ops = NULL;
                      module_put(owner);
              }
      
              if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
                      pr_err("%s: fasync list not empty!\n", __func__);
      
              this_cpu_sub(sockets_in_use, 1);
              if (!sock->file) {
                      iput(SOCK_INODE(sock));
                      return;
  495         }
  414         sock->file = NULL;
      }
      EXPORT_SYMBOL(sock_release);
  405 
   10 void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags)
      {
              u8 flags = *tx_flags;
  415 
              if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
                      flags |= SKBTX_HW_TSTAMP;
      
  496         if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
                      flags |= SKBTX_SW_TSTAMP;
      
  496         if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)
                      flags |= SKBTX_SCHED_TSTAMP;
   96 
              if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)
                      flags |= SKBTX_ACK_TSTAMP;
  496 
              *tx_flags = flags;
      }
      EXPORT_SYMBOL(__sock_tx_timestamp);
      
   86 static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
      {
              int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
              BUG_ON(ret == -EIOCBQUEUED);
              return ret;
      }
  101 
      int sock_sendmsg(struct socket *sock, struct msghdr *msg)
      {
   13         int err = security_socket_sendmsg(sock, msg,
                                                msg_data_left(msg));
  101 
   18         return err ?: sock_sendmsg_nosec(sock, msg);
      }
  101 EXPORT_SYMBOL(sock_sendmsg);
   96 
      int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
  101                    struct kvec *vec, size_t num, size_t size)
   12 {
              iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size);
  101         return sock_sendmsg(sock, msg);
      }
      EXPORT_SYMBOL(kernel_sendmsg);
      
      /*
       * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 2874  */
      void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
              struct sk_buff *skb)
      {
              int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
              struct scm_timestamping tss;
              int empty = 1;
              struct skb_shared_hwtstamps *shhwtstamps =
 2878                 skb_hwtstamps(skb);
      
 2875         /* Race occurred between timestamp enabling and packet
                 receiving.  Fill in the current time for now. */
              if (need_software_tstamp && skb->tstamp.tv64 == 0)
                      __net_timestamp(skb);
      
              if (need_software_tstamp) {
                      if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
  227                         struct timeval tv;
                              skb_get_timestamp(skb, &tv);
                              put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
                                       sizeof(tv), &tv);
                      } else {
                              struct timespec ts;
                              skb_get_timestampns(skb, &ts);
                              put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
                                       sizeof(ts), &ts);
                      }
              }
   36 
              memset(&tss, 0, sizeof(tss));
              if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
                  ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
                      empty = 0;
              if (shhwtstamps &&
                  (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
                  ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
    9                 empty = 0;
    3         if (!empty)
                      put_cmsg(msg, SOL_SOCKET,
                               SCM_TIMESTAMPING, sizeof(tss), &tss);
    9 }
      EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
    4 
      void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
              struct sk_buff *skb)
      {
              int ack;
    5 
              if (!sock_flag(sk, SOCK_WIFI_STATUS))
                      return;
              if (!skb->wifi_acked_valid)
                      return;
      
   36         ack = skb->wifi_acked;
   27 
   25         put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
      }
   11 EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
   25 
      static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
                                         struct sk_buff *skb)
   36 {
   25         if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount)
                      put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
                              sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
      }
      
      void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
              struct sk_buff *skb)
      {
              sock_recv_timestamp(msg, sk, skb);
              sock_recv_drops(msg, sk, skb);
      }
      EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
      
      static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                           size_t size, int flags)
      {
              return sock->ops->recvmsg(sock, msg, size, flags);
      }
      
      int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                       int flags)
      {
              int err = security_socket_recvmsg(sock, msg, size, flags);
      
   14         return err ?: sock_recvmsg_nosec(sock, msg, size, flags);
      }
      EXPORT_SYMBOL(sock_recvmsg);
      
      /**
       * kernel_recvmsg - Receive a message from a socket (kernel space)
       * @sock:       The socket to receive the message from
       * @msg:        Received message
   14  * @vec:        Input s/g array for message data
   14  * @num:        Size of input s/g array
   14  * @size:       Number of bytes to read
       * @flags:      Message flags (MSG_DONTWAIT, etc...)
       *
       * On return the msg structure contains the scatter/gather array passed in the
       * vec argument. The array is modified so that it consists of the unfilled
       * portion of the original array.
  747  *
       * The returned value is the total number of bytes received, or an error.
       */
  747 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                         struct kvec *vec, size_t num, size_t size, int flags)
      {
  747         mm_segment_t oldfs = get_fs();
              int result;
  749 
              iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
              set_fs(KERNEL_DS);
              result = sock_recvmsg(sock, msg, size, flags);
              set_fs(oldfs);
              return result;
      }
      EXPORT_SYMBOL(kernel_recvmsg);
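
       /*
        * Illustrative sketch, not part of the original source: a kernel-space
        * caller pairing kernel_sendmsg() and kernel_recvmsg() over a single
        * kvec element.  example_echo_once() is hypothetical; with msg_flags
        * left at zero the receive side blocks until data arrives.
        */
       static int example_echo_once(struct socket *sock, void *buf, size_t len)
       {
               struct kvec iov = { .iov_base = buf, .iov_len = len };
               struct msghdr msg = { };
               int ret;

               /* both helpers return bytes transferred or a negative errno */
               ret = kernel_sendmsg(sock, &msg, &iov, 1, len);
               if (ret < 0)
                       return ret;

               memset(&msg, 0, sizeof(msg));
               return kernel_recvmsg(sock, &msg, &iov, 1, len, 0);
       }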
      
      static ssize_t sock_sendpage(struct file *file, struct page *page,
                                   int offset, size_t size, loff_t *ppos, int more)
      {
              struct socket *sock;
              int flags;
      
              sock = file->private_data;
      
              flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
              /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
              flags |= more;
      
              return kernel_sendpage(sock, page, offset, size, flags);
      }
      
      static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                      struct pipe_inode_info *pipe, size_t len,
                                      unsigned int flags)
      {
              struct socket *sock = file->private_data;
      
              if (unlikely(!sock->ops->splice_read))
                      return -EINVAL;
      
              return sock->ops->splice_read(sock, ppos, pipe, len, flags);
      }
      
      static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
      {
              struct file *file = iocb->ki_filp;
  398         struct socket *sock = file->private_data;
              struct msghdr msg = {.msg_iter = *to,
                                   .msg_iocb = iocb};
              ssize_t res;
      
              if (file->f_flags & O_NONBLOCK)
                      msg.msg_flags = MSG_DONTWAIT;
      
              if (iocb->ki_pos != 0)
                      return -ESPIPE;
      
              if (!iov_iter_count(to))        /* Match SYS5 behaviour */
                      return 0;
   20 
              res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags);
              *to = msg.msg_iter;
              return res;
      }
   20 
      static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
      {
              struct file *file = iocb->ki_filp;
              struct socket *sock = file->private_data;
   62         struct msghdr msg = {.msg_iter = *from,
                                   .msg_iocb = iocb};
              ssize_t res;
      
              if (iocb->ki_pos != 0)
                      return -ESPIPE;
      
    7         if (file->f_flags & O_NONBLOCK)
                      msg.msg_flags = MSG_DONTWAIT;
   62 
              if (sock->type == SOCK_SEQPACKET)
                      msg.msg_flags |= MSG_EOR;
   62 
              res = sock_sendmsg(sock, &msg);
              *from = msg.msg_iter;
   50         return res;
      }
      
      /*
       * Atomic setting of ioctl hooks to avoid race
       * with module unload.
       */
  706 
      static DEFINE_MUTEX(br_ioctl_mutex);
      static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
      
      void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
      {
              mutex_lock(&br_ioctl_mutex);
              br_ioctl_hook = hook;
              mutex_unlock(&br_ioctl_mutex);
  705 }
   78 EXPORT_SYMBOL(brioctl_set);
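
       /*
        * Illustrative sketch, not part of the original source: how the bridge
        * module would install and remove its hook around module init/exit.
        * The handler below is a hypothetical stand-in for the real one in
        * net/bridge/.
        */
       static int example_br_ioctl(struct net *net, unsigned int cmd,
                                   void __user *uarg)
       {
               return -EOPNOTSUPP;
       }

       static void example_bridge_ioctl_init(void)
       {
               brioctl_set(example_br_ioctl);
       }

       static void example_bridge_ioctl_exit(void)
       {
               brioctl_set(NULL);
       }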
      
  705 static DEFINE_MUTEX(vlan_ioctl_mutex);
   16 static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
      
  705 void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
      {
  601         mutex_lock(&vlan_ioctl_mutex);
              vlan_ioctl_hook = hook;
              mutex_unlock(&vlan_ioctl_mutex);
      }
      EXPORT_SYMBOL(vlan_ioctl_set);
      
      static DEFINE_MUTEX(dlci_ioctl_mutex);
      static int (*dlci_ioctl_hook) (unsigned int, void __user *);
      
      void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
      {
              mutex_lock(&dlci_ioctl_mutex);
              dlci_ioctl_hook = hook;
              mutex_unlock(&dlci_ioctl_mutex);
      }
      EXPORT_SYMBOL(dlci_ioctl_set);
      
      static long sock_do_ioctl(struct net *net, struct socket *sock,
                                       unsigned int cmd, unsigned long arg)
      {
              int err;
              void __user *argp = (void __user *)arg;
      
              err = sock->ops->ioctl(sock, cmd, arg);
      
              /*
               * If this ioctl is unknown try to hand it down
               * to the NIC driver.
               */
              if (err == -ENOIOCTLCMD)
                      err = dev_ioctl(net, cmd, argp);
      
              return err;
      }
      
      /*
       *        With an ioctl, arg may well be a user mode pointer, but we don't know
       *        what to do with it - that's up to the protocol still.
       */
      
      static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
      {
              struct socket *sock;
              struct sock *sk;
              void __user *argp = (void __user *)arg;
  101         int pid, err;
              struct net *net;
  335 
              sock = file->private_data;
              sk = sock->sk;
              net = sock_net(sk);
              if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
                      err = dev_ioctl(net, cmd, argp);
              } else
      #ifdef CONFIG_WEXT_CORE
              if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
  317                 err = dev_ioctl(net, cmd, argp);
              } else
      #endif
                      switch (cmd) {
                      case FIOSETOWN:
                      case SIOCSPGRP:
                              err = -EFAULT;
                              if (get_user(pid, (int __user *)argp))
                                      break;
                              f_setown(sock->file, pid, 1);
                              err = 0;
                              break;
   58                 case FIOGETOWN:
                      case SIOCGPGRP:
                              err = put_user(f_getown(sock->file),
                                             (int __user *)argp);
                              break;
                      case SIOCGIFBR:
                      case SIOCSIFBR:
                      case SIOCBRADDBR:
                      case SIOCBRDELBR:
                              err = -ENOPKG;
                              if (!br_ioctl_hook)
                                      request_module("bridge");
      
                              mutex_lock(&br_ioctl_mutex);
                              if (br_ioctl_hook)
   58                                 err = br_ioctl_hook(net, cmd, argp);
                              mutex_unlock(&br_ioctl_mutex);
                              break;
                      case SIOCGIFVLAN:
    3                 case SIOCSIFVLAN:
                              err = -ENOPKG;
    3                         if (!vlan_ioctl_hook)
                                      request_module("8021q");
      
                              mutex_lock(&vlan_ioctl_mutex);
                              if (vlan_ioctl_hook)
    7                                 err = vlan_ioctl_hook(net, argp);
                              mutex_unlock(&vlan_ioctl_mutex);
                              break;
                      case SIOCADDDLCI:
                      case SIOCDELDLCI:
                              err = -ENOPKG;
                              if (!dlci_ioctl_hook)
                                      request_module("dlci");
    3 
    3                         mutex_lock(&dlci_ioctl_mutex);
                              if (dlci_ioctl_hook)
    3                                 err = dlci_ioctl_hook(cmd, argp);
                              mutex_unlock(&dlci_ioctl_mutex);
                              break;
    3                 default:
                              err = sock_do_ioctl(net, sock, cmd, arg);
                              break;
                      }
              return err;
    3 }
    3 
      int sock_create_lite(int family, int type, int protocol, struct socket **res)
    4 {
              int err;
              struct socket *sock = NULL;
    4 
              err = security_socket_create(family, type, protocol, 1);
              if (err)
                      goto out;
      
    3         sock = sock_alloc();
    3         if (!sock) {
                      err = -ENOMEM;
    3                 goto out;
              }
      
    3         sock->type = type;
              err = security_socket_post_create(sock, family, type, protocol, 1);
              if (err)
   39                 goto out_release;
      
      out:
   59         *res = sock;
              return err;
      out_release:
              sock_release(sock);
              sock = NULL;
              goto out;
      }
      EXPORT_SYMBOL(sock_create_lite);
   30 
      /* No kernel lock held - perfect */
      static unsigned int sock_poll(struct file *file, poll_table *wait)
      {
   30         unsigned int busy_flag = 0;
              struct socket *sock;
      
              /*
               *      We can't return errors to poll, so it's either yes or no.
               */
   30         sock = file->private_data;
      
              if (sk_can_busy_loop(sock->sk)) {
                      /* this socket can poll_ll so tell the system call */
                      busy_flag = POLL_BUSY_LOOP;
      
   30                 /* once, only if requested by syscall */
                      if (wait && (wait->_key & POLL_BUSY_LOOP))
                              sk_busy_loop(sock->sk, 1);
              }
      
              return busy_flag | sock->ops->poll(file, sock, wait);
      }
      
      static int sock_mmap(struct file *file, struct vm_area_struct *vma)
      {
              struct socket *sock = file->private_data;
      
              return sock->ops->mmap(file, sock, vma);
      }
      
      static int sock_close(struct inode *inode, struct file *filp)
      {
              sock_release(SOCKET_I(inode));
  136         return 0;
      }
      
      /*
       *        Update the socket async list
       *
       *        Fasync_list locking strategy.
       *
       *        1. fasync_list is modified only under process context socket lock
       *           i.e. under semaphore.
       *        2. fasync_list is used under read_lock(&sk->sk_callback_lock)
  136  *           or under socket lock
       */
      
      static int sock_fasync(int fd, struct file *filp, int on)
      {
   10         struct socket *sock = filp->private_data;
              struct sock *sk = sock->sk;
              struct socket_wq *wq;
      
              if (sk == NULL)
                      return -EINVAL;
      
  405         lock_sock(sk);
              wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
              fasync_helper(fd, filp, on, &wq->fasync_list);
      
              if (!wq->fasync_list)
                      sock_reset_flag(sk, SOCK_FASYNC);
              else
                      sock_set_flag(sk, SOCK_FASYNC);
      
              release_sock(sk);
              return 0;
      }
      
       /* This function may be called only under rcu_read_lock() */
      
      int sock_wake_async(struct socket_wq *wq, int how, int band)
      {
   14         if (!wq || !wq->fasync_list)
                      return -1;
      
              switch (how) {
              case SOCK_WAKE_WAITD:
                      if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
                              break;
   14                 goto call_kill;
   14         case SOCK_WAKE_SPACE:
                      if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
                              break;
                      /* fall through */
    1         case SOCK_WAKE_IO:
      call_kill:
   13                 kill_fasync(&wq->fasync_list, SIGIO, band);
                      break;
   14         case SOCK_WAKE_URG:
   14                 kill_fasync(&wq->fasync_list, SIGURG, band);
              }
      
              return 0;
      }
      EXPORT_SYMBOL(sock_wake_async);
      
   97 int __sock_create(struct net *net, int family, int type, int protocol,
                               struct socket **res, int kern)
      {
   96         int err;
              struct socket *sock;
   41         const struct net_proto_family *pf;
      
              /*
               *      Check protocol is in range
   54          */
              if (family < 0 || family >= NPROTO)
                      return -EAFNOSUPPORT;
              if (type < 0 || type >= SOCK_MAX)
                      return -EINVAL;
   70 
              /* Compatibility.
      
    1            This uglymoron is moved from INET layer to here to avoid
                 deadlock in module load.
               */
   97         if (family == PF_INET && type == SOCK_PACKET) {
                      static int warned;
                      if (!warned) {
                              warned = 1;
                              pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                                      current->comm);
                      }
                      family = PF_PACKET;
              }
      
              err = security_socket_create(family, type, protocol, kern);
              if (err)
                      return err;
      
  967         /*
               *        Allocate the socket and allow the family to set things up. if
  967          *        the protocol is 0, the family is instructed to select an appropriate
               *        default.
               */
              sock = sock_alloc();
              if (!sock) {
                      net_warn_ratelimited("socket: no more sockets\n");
                       return -ENFILE;        /* Not exactly a match, but it's the
                                          closest POSIX thing */
  966         }
      
    2         sock->type = type;
      
      #ifdef CONFIG_MODULES
              /* Attempt to load a protocol module if the find failed.
               *
                * 12/09/1996 Marcin: But! this really only makes sense if the user
                * requested real, full-featured networking support upon configuration.
               * Otherwise module support will break!
  966          */
  967         if (rcu_access_pointer(net_families[family]) == NULL)
                      request_module("net-pf-%d", family);
      #endif
      
              rcu_read_lock();
              pf = rcu_dereference(net_families[family]);
              err = -EAFNOSUPPORT;
              if (!pf)
  906                 goto out_release;
      
              /*
               * We will call the ->create function, that possibly is in a loadable
               * module, so we have to bump that loadable module refcnt first.
               */
              if (!try_module_get(pf->owner))
  906                 goto out_release;
      
              /* Now protected by module ref count */
              rcu_read_unlock();
      
              err = pf->create(net, sock, protocol, kern);
              if (err < 0)
                      goto out_module_put;
      
              /*
   10          * Now to bump the refcnt of the [loadable] module that owns this
               * socket at sock_release time we decrement its refcnt.
               */
  906         if (!try_module_get(sock->ops->owner))
  906                 goto out_module_busy;
      
  906         /*
               * Now that we're done with the ->create function, the [loadable]
               * module can have its refcnt decremented
               */
              module_put(pf->owner);
              err = security_socket_post_create(sock, family, type, protocol, kern);
              if (err)
                      goto out_sock_release;
              *res = sock;
      
              return 0;
  897 
      out_module_busy:
              err = -EAFNOSUPPORT;
      out_module_put:
              sock->ops = NULL;
              module_put(pf->owner);
      out_sock_release:
              sock_release(sock);
              return err;
      
  840 out_release:
              rcu_read_unlock();
              goto out_sock_release;
      }
      EXPORT_SYMBOL(__sock_create);
      
      int sock_create(int family, int type, int protocol, struct socket **res)
  840 {
              return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
      }
      EXPORT_SYMBOL(sock_create);
  840 
      int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
      {
              return __sock_create(net, family, type, protocol, res, 1);
      }
      EXPORT_SYMBOL(sock_create_kern);
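
       /*
        * Illustrative sketch, not part of the original source: creating a
        * kernel-owned TCP socket in the init namespace with the helper above.
        * example_make_kernel_tcp_socket() is hypothetical, and the usual
        * PF_INET/SOCK_STREAM/IPPROTO_TCP constants are assumed to be visible.
        * The caller is expected to drop the socket with sock_release().
        */
       static int example_make_kernel_tcp_socket(struct socket **res)
       {
               int err;

               err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM,
                                      IPPROTO_TCP, res);
               if (err < 0)
                       return err;

               /* use (*res)->ops->connect(), kernel_sendmsg(), etc. from here */
               return 0;
       }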
      
   76 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
      {
              int retval;
   86         struct socket *sock;
              int flags;
      
              /* Check the SOCK_* constants for consistency.  */
   10         BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
              BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
              BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
              BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
      
              flags = type & ~SOCK_TYPE_MASK;
              if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
  132                 return -EINVAL;
              type &= SOCK_TYPE_MASK;
      
              if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                      flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
      
   30         retval = sock_create(family, type, protocol, &sock);
              if (retval < 0)
                      goto out;
      
  828         retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
              if (retval < 0)
                      goto out_release;
      
      out:
               /* It may already be another descriptor 8) Not a kernel problem. */
              return retval;
      
      out_release:
              sock_release(sock);
              return retval;
      }
      
      /*
       *        Create a pair of connected sockets.
  821  */
      
      SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
                      int __user *, usockvec)
      {
              struct socket *sock1, *sock2;
              int fd1, fd2, err;
              struct file *newfile1, *newfile2;
              int flags;
  699 
              flags = type & ~SOCK_TYPE_MASK;
              if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                      return -EINVAL;
              type &= SOCK_TYPE_MASK;
      
  822         if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                      flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
      
    2         /*
               * Obtain the first socket and check if the underlying protocol
               * supports the socketpair call.
               */
      
              err = sock_create(family, type, protocol, &sock1);
              if (err < 0)
                      goto out;
  144 
              err = sock_create(family, type, protocol, &sock2);
              if (err < 0)
                      goto out_release_1;
      
              err = sock1->ops->socketpair(sock1, sock2);
              if (err < 0)
                      goto out_release_both;
  125 
              fd1 = get_unused_fd_flags(flags);
              if (unlikely(fd1 < 0)) {
  141                 err = fd1;
                      goto out_release_both;
              }
      
              fd2 = get_unused_fd_flags(flags);
              if (unlikely(fd2 < 0)) {
                      err = fd2;
                      goto out_put_unused_1;
              }
      
              newfile1 = sock_alloc_file(sock1, flags, NULL);
              if (IS_ERR(newfile1)) {
                      err = PTR_ERR(newfile1);
                      goto out_put_unused_both;
  131         }
      
              newfile2 = sock_alloc_file(sock2, flags, NULL);
              if (IS_ERR(newfile2)) {
  131                 err = PTR_ERR(newfile2);
                      goto out_fput_1;
              }
      
              err = put_user(fd1, &usockvec[0]);
              if (err)
                      goto out_fput_both;
      
              err = put_user(fd2, &usockvec[1]);
              if (err)
  125                 goto out_fput_both;
      
              audit_fd_pair(fd1, fd2);
      
              fd_install(fd1, newfile1);
              fd_install(fd2, newfile2);
               /* fd1 and fd2 may already refer to other descriptors.
                * Not a kernel problem.
                */
      
              return 0;
      
  125 out_fput_both:
              fput(newfile2);
              fput(newfile1);
              put_unused_fd(fd2);
              put_unused_fd(fd1);
              goto out;
      
      out_fput_1:
              fput(newfile1);
              put_unused_fd(fd2);
               put_unused_fd(fd1);
               sock_release(sock2);
               goto out;

       out_put_unused_both:
               put_unused_fd(fd2);
       out_put_unused_1:
              put_unused_fd(fd1);
      out_release_both:
              sock_release(sock2);
      out_release_1:
              sock_release(sock1);
      out:
              return err;
      }
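
       /*
        * Illustrative userspace sketch (not part of this file): the usual way
        * to drive the socketpair() path above, using AF_UNIX, which is the
        * family that typically implements ->socketpair. Error handling trimmed.
        *
        *   #include <sys/socket.h>
        *   #include <unistd.h>
        *
        *   int sv[2];
        *   if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sv) == 0) {
        *           char buf[4];
        *           write(sv[0], "ping", 4);
        *           read(sv[1], buf, sizeof(buf));
        *           close(sv[0]);
        *           close(sv[1]);
        *   }
        */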
   40 
      /*
       *        Bind a name to a socket. Nothing much to do here since it's
       *        the protocol's responsibility to handle the local address.
       *
       *        We move the socket address to kernel space before we call
       *        the protocol layer (having also checked the address is ok).
       */
      
      SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
      {
              struct socket *sock;
              struct sockaddr_storage address;
              int err, fput_needed;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (sock) {
                      err = move_addr_to_kernel(umyaddr, addrlen, &address);
                       if (err >= 0) {
                               err = security_socket_bind(sock,
                                                          (struct sockaddr *)&address,
                                                          addrlen);
                               if (!err)
                                      err = sock->ops->bind(sock,
                                                            (struct sockaddr *)
                                                            &address, addrlen);
                      }
                      fput_light(sock->file, fput_needed);
              }
              return err;
      }
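
       /*
        * Illustrative userspace sketch (not part of this file): binding an
        * IPv4 TCP socket to a wildcard address, which ends up in the
        * move_addr_to_kernel()/->bind path above. Port 7777 is arbitrary.
        *
        *   #include <netinet/in.h>
        *   #include <stdio.h>
        *   #include <string.h>
        *   #include <sys/socket.h>
        *
        *   int fd = socket(AF_INET, SOCK_STREAM, 0);
        *   struct sockaddr_in sa;
        *   memset(&sa, 0, sizeof(sa));
        *   sa.sin_family = AF_INET;
        *   sa.sin_addr.s_addr = htonl(INADDR_ANY);
        *   sa.sin_port = htons(7777);
        *   if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
        *           perror("bind");
        */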
      
      /*
        *        Perform a listen. Basically, we allow the protocol to do anything
       *        necessary for a listen, and if that works, we mark the socket as
       *        ready for listening.
       */
      
      SYSCALL_DEFINE2(listen, int, fd, int, backlog)
      {
              struct socket *sock;
               int err, fput_needed;
               int somaxconn;

               sock = sockfd_lookup_light(fd, &err, &fput_needed);
               if (sock) {
                       somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
                       if ((unsigned int)backlog > somaxconn)
                               backlog = somaxconn;

                       err = security_socket_listen(sock, backlog);
                       if (!err)
                               err = sock->ops->listen(sock, backlog);

                      fput_light(sock->file, fput_needed);
              }
              return err;
      }
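
       /*
        * Illustrative userspace sketch (not part of this file): a backlog
        * larger than net.core.somaxconn is silently clamped by the code above,
        * so passing a big value is harmless. Assumes 'fd' is a bound stream
        * socket as in the bind() sketch.
        *
        *   if (listen(fd, 4096) < 0)     // kernel clamps backlog to somaxconn
        *           perror("listen");
        */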
      
      /*
       *        For accept, we attempt to create a new socket, set up the link
       *        with the client, wake up the client, then return the new
        *        connected fd. We collect the address of the connector in kernel
        *        space and move it to user space at the very end. This is unclean
        *        because we open the socket and then return an error.
        *
        *        1003.1g adds the ability to query connection-pending status via
        *        recvmsg(). We need to add that support in a way that's clean when
        *        we restructure accept also.
        */

      SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
                      int __user *, upeer_addrlen, int, flags)
      {
               struct socket *sock, *newsock;
               struct file *newfile;
               int err, len, newfd, fput_needed;
               struct sockaddr_storage address;

               if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                       return -EINVAL;
      
              if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                      flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      goto out;
      
              err = -ENFILE;
              newsock = sock_alloc();
              if (!newsock)
                      goto out_put;
      
              newsock->type = sock->type;
   42         newsock->ops = sock->ops;
      
              /*
               * We don't need try_module_get here, as the listening socket (sock)
               * has the protocol module (sock->ops->owner) held.
               */
              __module_get(newsock->ops->owner);
      
              newfd = get_unused_fd_flags(flags);
              if (unlikely(newfd < 0)) {
                      err = newfd;
                      sock_release(newsock);
                      goto out_put;
              }
   39         newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
              if (IS_ERR(newfile)) {
                      err = PTR_ERR(newfile);
                      put_unused_fd(newfd);
   36                 sock_release(newsock);
                      goto out_put;
              }
      
              err = security_socket_accept(sock, newsock);
   36         if (err)
                      goto out_fd;
      
              err = sock->ops->accept(sock, newsock, sock->file->f_flags);
              if (err < 0)
                      goto out_fd;
      
              if (upeer_sockaddr) {
                      if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
                                                &len, 2) < 0) {
                              err = -ECONNABORTED;
    1                         goto out_fd;
                      }
                      err = move_addr_to_user(&address,
                                              len, upeer_sockaddr, upeer_addrlen);
   35                 if (err < 0)
                              goto out_fd;
              }
      
               /* File flags are not inherited via accept(), unlike on other OSes. */
      
              fd_install(newfd, newfile);
              err = newfd;

       out_put:
               fput_light(sock->file, fput_needed);
       out:
               return err;
       out_fd:
               fput(newfile);
               put_unused_fd(newfd);
               goto out_put;
       }
      
      SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                      int __user *, upeer_addrlen)
      {
    4         return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
      }
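
       /*
        * Illustrative userspace sketch (not part of this file): accept4() with
        * SOCK_CLOEXEC avoids a separate fcntl() race, matching the flag
        * handling above. Assumes 'lfd' is a listening socket.
        *
        *   #include <sys/socket.h>
        *
        *   struct sockaddr_storage peer;
        *   socklen_t plen = sizeof(peer);
        *   int cfd = accept4(lfd, (struct sockaddr *)&peer, &plen,
        *                     SOCK_CLOEXEC | SOCK_NONBLOCK);
        *   if (cfd < 0)
        *           perror("accept4");
        */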
      
      /*
       *        Attempt to connect to a socket with the server address.  The address
       *        is in user space so we verify it is OK and move it to kernel space.
       *
       *        For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
        *        break bindings
        *
        *        NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
        *        other SEQPACKET protocols that take time to connect() as it doesn't
        *        include the -EINPROGRESS status for such sockets.
        */

       SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
                       int, addrlen)
      {
              struct socket *sock;
              struct sockaddr_storage address;
              int err, fput_needed;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      goto out;
              err = move_addr_to_kernel(uservaddr, addrlen, &address);
              if (err < 0)
                      goto out_put;
      
              err =
                  security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
              if (err)
                      goto out_put;
      
              err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
                                       sock->file->f_flags);
      out_put:
              fput_light(sock->file, fput_needed);
      out:
  472         return err;
      }
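
       /*
        * Illustrative userspace sketch (not part of this file): the
        * -EINPROGRESS case mentioned in the comment above, as seen from a
        * non-blocking client. 'sa'/'salen' are assumed to hold a prepared
        * destination address.
        *
        *   #include <errno.h>
        *   #include <poll.h>
        *   #include <sys/socket.h>
        *
        *   int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
        *   if (connect(fd, (struct sockaddr *)&sa, salen) < 0 &&
        *       errno == EINPROGRESS) {
        *           struct pollfd pfd = { .fd = fd, .events = POLLOUT };
        *           poll(&pfd, 1, 5000);
        *           int soerr; socklen_t len = sizeof(soerr);
        *           getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &len);
        *           // soerr == 0 means the connect completed successfully
        *   }
        */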
      
      /*
       *        Get the local address ('name') of a socket object. Move the obtained
       *        name to user space.
       */
      
      SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
                      int __user *, usockaddr_len)
       {
               struct socket *sock;
               struct sockaddr_storage address;
               int len, err, fput_needed;

               sock = sockfd_lookup_light(fd, &err, &fput_needed);
               if (!sock)
                       goto out;

               err = security_socket_getsockname(sock);
               if (err)
                       goto out_put;

               err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
               if (err)
                      goto out_put;
              err = move_addr_to_user(&address, len, usockaddr, usockaddr_len);
      
      out_put:
              fput_light(sock->file, fput_needed);
      out:
              return err;
   19 }
      
      /*
       *        Get the remote address ('name') of a socket object. Move the obtained
       *        name to user space.
       */
      
      SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
                      int __user *, usockaddr_len)
      {
              struct socket *sock;
               struct sockaddr_storage address;
               int len, err, fput_needed;

               sock = sockfd_lookup_light(fd, &err, &fput_needed);
               if (sock != NULL) {
                       err = security_socket_getpeername(sock);
                       if (err) {
                               fput_light(sock->file, fput_needed);
                               return err;
                       }

                       err =
                           sock->ops->getname(sock, (struct sockaddr *)&address, &len,
                                             1);
                      if (!err)
                              err = move_addr_to_user(&address, len, usockaddr,
                                                      usockaddr_len);
                      fput_light(sock->file, fput_needed);
              }
              return err;
   13 }
      
      /*
       *        Send a datagram to a given address. We move the address into kernel
       *        space and check the user space data area is readable before invoking
       *        the protocol.
       */
      
      SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
   11                 unsigned int, flags, struct sockaddr __user *, addr,
                      int, addr_len)
      {
              struct socket *sock;
              struct sockaddr_storage address;
              int err;
              struct msghdr msg;
               struct iovec iov;
               int fput_needed;

               err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
               if (unlikely(err))
                       return err;
               sock = sockfd_lookup_light(fd, &err, &fput_needed);
               if (!sock)
                      goto out;
      
              msg.msg_name = NULL;
              msg.msg_control = NULL;
              msg.msg_controllen = 0;
              msg.msg_namelen = 0;
              if (addr) {
                      err = move_addr_to_kernel(addr, addr_len, &address);
  488                 if (err < 0)
                              goto out_put;
                      msg.msg_name = (struct sockaddr *)&address;
                      msg.msg_namelen = addr_len;
              }
              if (sock->file->f_flags & O_NONBLOCK)
                      flags |= MSG_DONTWAIT;
              msg.msg_flags = flags;
              err = sock_sendmsg(sock, &msg);
      
      out_put:
              fput_light(sock->file, fput_needed);
      out:
              return err;
  487 }
      
      /*
       *        Send a datagram down a socket.
  481  */
      
      SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
                      unsigned int, flags)
      {
  314         return sys_sendto(fd, buff, len, flags, NULL, 0);
      }
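
       /*
        * Illustrative userspace sketch (not part of this file): sendto() on a
        * UDP socket, which supplies the optional address that the code above
        * copies in with move_addr_to_kernel(). 'dst' is assumed to be a filled
        * struct sockaddr_in.
        *
        *   int fd = socket(AF_INET, SOCK_DGRAM, 0);
        *   const char msg[] = "hello";
        *   ssize_t n = sendto(fd, msg, sizeof(msg) - 1, 0,
        *                      (struct sockaddr *)&dst, sizeof(dst));
        *   if (n < 0)
        *           perror("sendto");
        */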
      
       /*
        *        Receive a frame from the socket and optionally record the address of the
        *        sender. We verify the buffers are writable and if needed move the
        *        sender address from kernel to user space.
        */

       SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
                       unsigned int, flags, struct sockaddr __user *, addr,
                       int __user *, addr_len)
       {
               struct socket *sock;
               struct iovec iov;
              struct msghdr msg;
              struct sockaddr_storage address;
              int err, err2;
              int fput_needed;
      
              err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter);
              if (unlikely(err))
                      return err;
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      goto out;
      
              msg.msg_control = NULL;
              msg.msg_controllen = 0;
              /* Save some cycles and don't copy the address if not needed */
              msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
              /* We assume all kernel code knows the size of sockaddr_storage */
              msg.msg_namelen = 0;
  293         msg.msg_iocb = NULL;
              msg.msg_flags = 0;
              if (sock->file->f_flags & O_NONBLOCK)
                      flags |= MSG_DONTWAIT;
              err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
      
              if (err >= 0 && addr != NULL) {
                      err2 = move_addr_to_user(&address,
                                               msg.msg_namelen, addr, addr_len);
                      if (err2 < 0)
                              err = err2;
              }
      
              fput_light(sock->file, fput_needed);
  293 out:
              return err;
      }
      
  292 /*
       *        Receive a datagram from a socket.
       */
      
      SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
                      unsigned int, flags)
      {
              return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
      }
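
       /*
        * Illustrative userspace sketch (not part of this file): recvfrom() on
        * a UDP socket, retrieving the sender address that the code above moves
        * back out with move_addr_to_user(). Assumes 'fd' is a bound datagram
        * socket.
        *
        *   char buf[1500];
        *   struct sockaddr_storage src;
        *   socklen_t srclen = sizeof(src);
        *   ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
        *                        (struct sockaddr *)&src, &srclen);
        *   if (n < 0)
        *           perror("recvfrom");
        */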

       /*
        *        Set a socket option. Because we don't know the option lengths we have
        *        to pass the user mode parameter for the protocols to sort out.
        */

       SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
                       char __user *, optval, int, optlen)
       {
               int err, fput_needed;
               struct socket *sock;

               if (optlen < 0)
                       return -EINVAL;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (sock != NULL) {
                      err = security_socket_setsockopt(sock, level, optname);
                      if (err)
  158                         goto out_put;
      
                      if (level == SOL_SOCKET)
                              err =
                                  sock_setsockopt(sock, level, optname, optval,
                                                  optlen);
                      else
                              err =
                                  sock->ops->setsockopt(sock, level, optname, optval,
                                                        optlen);
      out_put:
                      fput_light(sock->file, fput_needed);
              }
              return err;
      }
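
       /*
        * Illustrative userspace sketch (not part of this file): a SOL_SOCKET
        * option goes through sock_setsockopt() above, while protocol-level
        * options (e.g. IPPROTO_TCP) are passed to ->setsockopt(). Assumes a
        * TCP listening socket 'fd'.
        *
        *   int one = 1;
        *   if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
        *           perror("setsockopt");
        */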
      
      /*
       *        Get a socket option. Because we don't know the option lengths we have
       *        to pass a user mode parameter for the protocols to sort out.
       */
      
      SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
                      char __user *, optval, int __user *, optlen)
      {
              int err, fput_needed;
              struct socket *sock;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (sock != NULL) {
                      err = security_socket_getsockopt(sock, level, optname);
                      if (err)
                              goto out_put;
      
                      if (level == SOL_SOCKET)
                              err =
                                  sock_getsockopt(sock, level, optname, optval,
                                                  optlen);
                      else
                              err =
                                  sock->ops->getsockopt(sock, level, optname, optval,
                                                        optlen);
      out_put:
                      fput_light(sock->file, fput_needed);
              }
              return err;
      }
      
      /*
       *        Shutdown a socket.
       */
      
      SYSCALL_DEFINE2(shutdown, int, fd, int, how)
      {
              int err, fput_needed;
              struct socket *sock;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (sock != NULL) {
                      err = security_socket_shutdown(sock, how);
                      if (!err)
                              err = sock->ops->shutdown(sock, how);
                      fput_light(sock->file, fput_needed);
              }
              return err;
      }
      
      /* A couple of helpful macros for getting the address of the 32/64 bit
       * fields which are the same type (int / unsigned) on our platforms.
       */
      #define COMPAT_MSG(msg, member)        ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
      #define COMPAT_NAMELEN(msg)        COMPAT_MSG(msg, msg_namelen)
      #define COMPAT_FLAGS(msg)        COMPAT_MSG(msg, msg_flags)
      
      struct used_address {
              struct sockaddr_storage name;
   50         unsigned int name_len;
      };
      
      static int copy_msghdr_from_user(struct msghdr *kmsg,
                                       struct user_msghdr __user *umsg,
                                       struct sockaddr __user **save_addr,
                                       struct iovec **iov)
      {
              struct sockaddr __user *uaddr;
               struct iovec __user *uiov;
               size_t nr_segs;
               ssize_t err;

              if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
                  __get_user(uaddr, &umsg->msg_name) ||
                  __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
                  __get_user(uiov, &umsg->msg_iov) ||
                  __get_user(nr_segs, &umsg->msg_iovlen) ||
                  __get_user(kmsg->msg_control, &umsg->msg_control) ||
                  __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
                  __get_user(kmsg->msg_flags, &umsg->msg_flags))
                      return -EFAULT;
      
              if (!uaddr)
                      kmsg->msg_namelen = 0;
      
              if (kmsg->msg_namelen < 0)
                      return -EINVAL;
      
              if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
                      kmsg->msg_namelen = sizeof(struct sockaddr_storage);
      
              if (save_addr)
                      *save_addr = uaddr;
      
              if (uaddr && kmsg->msg_namelen) {
                      if (!save_addr) {
                              err = move_addr_to_kernel(uaddr, kmsg->msg_namelen,
                                                        kmsg->msg_name);
                              if (err < 0)
                                      return err;
                      }
              } else {
                      kmsg->msg_name = NULL;
                      kmsg->msg_namelen = 0;
              }
      
              if (nr_segs > UIO_MAXIOV)
                      return -EMSGSIZE;
      
              kmsg->msg_iocb = NULL;
      
              return import_iovec(save_addr ? READ : WRITE, uiov, nr_segs,
                                  UIO_FASTIOV, iov, &kmsg->msg_iter);
      }
      
      static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
                               struct msghdr *msg_sys, unsigned int flags,
                               struct used_address *used_address)
      {
              struct compat_msghdr __user *msg_compat =
                  (struct compat_msghdr __user *)msg;
              struct sockaddr_storage address;
              struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
              unsigned char ctl[sizeof(struct cmsghdr) + 20]
                  __attribute__ ((aligned(sizeof(__kernel_size_t))));
              /* 20 is size of ipv6_pktinfo */
              unsigned char *ctl_buf = ctl;
              int ctl_len;
              ssize_t err;
      
              msg_sys->msg_name = &address;
      
              if (MSG_CMSG_COMPAT & flags)
                      err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
              else
                      err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
              if (err < 0)
                      return err;
      
              err = -ENOBUFS;
      
              if (msg_sys->msg_controllen > INT_MAX)
                      goto out_freeiov;
              ctl_len = msg_sys->msg_controllen;
              if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
                      err =
 1697                     cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
                                                           sizeof(ctl));
                      if (err)
                              goto out_freeiov;
                      ctl_buf = msg_sys->msg_control;
                      ctl_len = msg_sys->msg_controllen;
              } else if (ctl_len) {
                      if (ctl_len > sizeof(ctl)) {
                              ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
                              if (ctl_buf == NULL)
                                      goto out_freeiov;
                       }
                       err = -EFAULT;
                       /*
                        * Careful! Before this, msg_sys->msg_control contains a user pointer.
                        * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
                        * checking falls down on this.
                        */
                       if (copy_from_user(ctl_buf,
                                          (void __user __force *)msg_sys->msg_control,
                                          ctl_len))
                               goto out_freectl;
                       msg_sys->msg_control = ctl_buf;
               }
               msg_sys->msg_flags = flags;

               if (sock->file->f_flags & O_NONBLOCK)
                       msg_sys->msg_flags |= MSG_DONTWAIT;
               /*
               * If this is sendmmsg() and current destination address is same as
               * previously succeeded address, omit asking LSM's decision.
               * used_address->name_len is initialized to UINT_MAX so that the first
               * destination address never matches.
               */
              if (used_address && msg_sys->msg_name &&
                  used_address->name_len == msg_sys->msg_namelen &&
                  !memcmp(&used_address->name, msg_sys->msg_name,
                          used_address->name_len)) {
                      err = sock_sendmsg_nosec(sock, msg_sys);
                      goto out_freectl;
              }
              err = sock_sendmsg(sock, msg_sys);
              /*
               * If this is sendmmsg() and sending to current destination address was
               * successful, remember it.
               */
              if (used_address && err >= 0) {
                      used_address->name_len = msg_sys->msg_namelen;
 1694                 if (msg_sys->msg_name)
                              memcpy(&used_address->name, msg_sys->msg_name,
                                     used_address->name_len);
   16         }
      
      out_freectl:
              if (ctl_buf != ctl)
                      sock_kfree_s(sock->sk, ctl_buf, ctl_len);
      out_freeiov:
              kfree(iov);
               return err;
       }

       /*
        *        BSD sendmsg interface
        */

       long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
      {
              int fput_needed, err;
              struct msghdr msg_sys;
              struct socket *sock;
      
               sock = sockfd_lookup_light(fd, &err, &fput_needed);
               if (!sock)
                       goto out;

               err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);

               fput_light(sock->file, fput_needed);
       out:
               return err;
       }

       SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
      {
              if (flags & MSG_CMSG_COMPAT)
                      return -EINVAL;
              return __sys_sendmsg(fd, msg, flags);
      }
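
       /*
        * Illustrative userspace sketch (not part of this file): a minimal
        * sendmsg() call with a two-element iovec and no control data, i.e. the
        * ctl_len == 0 branch of ___sys_sendmsg() above. 'fd' is assumed to be
        * a connected socket.
        *
        *   #include <sys/socket.h>
        *   #include <sys/uio.h>
        *
        *   struct iovec iov[2] = {
        *           { .iov_base = "hello ", .iov_len = 6 },
        *           { .iov_base = "world",  .iov_len = 5 },
        *   };
        *   struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };
        *   if (sendmsg(fd, &msg, 0) < 0)
        *           perror("sendmsg");
        */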
      
      /*
       *        Linux sendmmsg interface
       */
      
      int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                         unsigned int flags)
       {
               int fput_needed, err, datagrams;
               struct socket *sock;
               struct mmsghdr __user *entry;
               struct compat_mmsghdr __user *compat_entry;
               struct msghdr msg_sys;
               struct used_address used_address;

               if (vlen > UIO_MAXIOV)
                      vlen = UIO_MAXIOV;
      
              datagrams = 0;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      return err;
      
              used_address.name_len = UINT_MAX;
              entry = mmsg;
              compat_entry = (struct compat_mmsghdr __user *)mmsg;
              err = 0;
      
              while (datagrams < vlen) {
                      if (MSG_CMSG_COMPAT & flags) {
                              err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
                                                   &msg_sys, flags, &used_address);
                              if (err < 0)
                                      break;
                              err = __put_user(err, &compat_entry->msg_len);
                              ++compat_entry;
                      } else {
                              err = ___sys_sendmsg(sock,
 1074                                              (struct user_msghdr __user *)entry,
                                                   &msg_sys, flags, &used_address);
                              if (err < 0)
                                      break;
                              err = put_user(err, &entry->msg_len);
                              ++entry;
                      }
      
                      if (err)
 1074                         break;
                      ++datagrams;
                      if (msg_data_left(&msg_sys))
                              break;
              }

               fput_light(sock->file, fput_needed);

               /* We only return an error if no datagrams were able to be sent */
               if (datagrams != 0)
                       return datagrams;

              return err;
      }
      
      SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
                      unsigned int, vlen, unsigned int, flags)
      {
              if (flags & MSG_CMSG_COMPAT)
                      return -EINVAL;
              return __sys_sendmmsg(fd, mmsg, vlen, flags);
      }
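
       /*
        * Illustrative userspace sketch (not part of this file): sendmmsg()
        * batches several datagrams per syscall; on return each mmsghdr.msg_len
        * holds the bytes sent for that entry, which is what the put_user()
        * calls above fill in. 'fd' is assumed to be a connected UDP socket.
        *
        *   struct iovec iov[2] = {
        *           { .iov_base = "one", .iov_len = 3 },
        *           { .iov_base = "two", .iov_len = 3 },
        *   };
        *   struct mmsghdr mm[2] = {
        *           { .msg_hdr = { .msg_iov = &iov[0], .msg_iovlen = 1 } },
        *           { .msg_hdr = { .msg_iov = &iov[1], .msg_iovlen = 1 } },
        *   };
        *   int sent = sendmmsg(fd, mm, 2, 0);
        *   if (sent < 0)
        *           perror("sendmmsg");
        */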
      
 1058 static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
                               struct msghdr *msg_sys, unsigned int flags, int nosec)
 1058 {
              struct compat_msghdr __user *msg_compat =
                  (struct compat_msghdr __user *)msg;
              struct iovec iovstack[UIO_FASTIOV];
              struct iovec *iov = iovstack;
  402         unsigned long cmsg_ptr;
              int total_len, len;
              ssize_t err;
  402 
              /* kernel mode address */
              struct sockaddr_storage addr;
  403 
              /* user mode address pointers */
              struct sockaddr __user *uaddr;
              int __user *uaddr_len = COMPAT_NAMELEN(msg);
      
              msg_sys->msg_name = &addr;
      
              if (MSG_CMSG_COMPAT & flags)
                      err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
              else
                      err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
              if (err < 0)
                      return err;
              total_len = iov_iter_count(&msg_sys->msg_iter);
      
              cmsg_ptr = (unsigned long)msg_sys->msg_control;
              msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
  424 
              /* We assume all kernel code knows the size of sockaddr_storage */
              msg_sys->msg_namelen = 0;
      
              if (sock->file->f_flags & O_NONBLOCK)
                      flags |= MSG_DONTWAIT;
              err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
                                                                total_len, flags);
              if (err < 0)
                      goto out_freeiov;
  424         len = err;
      
              if (uaddr != NULL) {
                      err = move_addr_to_user(&addr,
                                              msg_sys->msg_namelen, uaddr,
                                              uaddr_len);
                      if (err < 0)
                              goto out_freeiov;
  424         }
    5         err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
  423                          COMPAT_FLAGS(msg));
              if (err)
  304                 goto out_freeiov;
              if (MSG_CMSG_COMPAT & flags)
                      err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                       &msg_compat->msg_controllen);
              else
                      err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                       &msg->msg_controllen);
   17         if (err)
  423                 goto out_freeiov;
              err = len;
      
      out_freeiov:
              kfree(iov);
              return err;
  304 }
  101 
      /*
       *        BSD recvmsg interface
       */
      
      long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
  304 {
              int fput_needed, err;
              struct msghdr msg_sys;
              struct socket *sock;
      
  304         sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      goto out;
      
              err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
  304 
              fput_light(sock->file, fput_needed);
      out:
              return err;
      }
  313 
  314 SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
                      unsigned int, flags)
      {
              if (flags & MSG_CMSG_COMPAT)
                      return -EINVAL;
              return __sys_recvmsg(fd, msg, flags);
      }
      
      /*
       *     Linux recvmmsg interface
       */
      
      int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                          unsigned int flags, struct timespec *timeout)
       {
               int fput_needed, err, datagrams;
               struct socket *sock;
               struct mmsghdr __user *entry;
               struct compat_mmsghdr __user *compat_entry;
               struct msghdr msg_sys;
               struct timespec end_time;

              if (timeout &&
                  poll_select_set_timeout(&end_time, timeout->tv_sec,
                                          timeout->tv_nsec))
                      return -EINVAL;
      
              datagrams = 0;
      
              sock = sockfd_lookup_light(fd, &err, &fput_needed);
              if (!sock)
                      return err;
      
              err = sock_error(sock->sk);
              if (err) {
                      datagrams = err;
                      goto out_put;
              }
      
              entry = mmsg;
              compat_entry = (struct compat_mmsghdr __user *)mmsg;
      
              while (datagrams < vlen) {
                      /*
                       * No need to ask LSM for more than the first datagram.
                       */
                       if (MSG_CMSG_COMPAT & flags) {
                               err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
                                                    &msg_sys, flags & ~MSG_WAITFORONE,
                                                    datagrams);
                               if (err < 0)
                                       break;
                               err = __put_user(err, &compat_entry->msg_len);
                               ++compat_entry;
                       } else {
                               err = ___sys_recvmsg(sock,
                                                    (struct user_msghdr __user *)entry,
                                                    &msg_sys, flags & ~MSG_WAITFORONE,
                                                   datagrams);
                              if (err < 0)
                                      break;
                              err = put_user(err, &entry->msg_len);
                              ++entry;
                      }
      
                      if (err)
  241                         break;
                      ++datagrams;
      
                      /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
  239                 if (flags & MSG_WAITFORONE)
  239                         flags |= MSG_DONTWAIT;
      
                      if (timeout) {
                              ktime_get_ts(timeout);
                              *timeout = timespec_sub(end_time, *timeout);
  232                         if (timeout->tv_sec < 0) {
                                      timeout->tv_sec = timeout->tv_nsec = 0;
                                      break;
                              }
      
                              /* Timeout, return less than vlen datagrams */
                              if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
                                      break;
                      }
      
                      /* Out of band data, return right away */
                      if (msg_sys.msg_flags & MSG_OOB)
                              break;
               }

               if (err == 0)
                       goto out_put;

               if (datagrams == 0) {
                       datagrams = err;
                       goto out_put;
               }

               /*
                * We may return fewer entries than requested (vlen) if the
                * socket is nonblocking and there aren't enough datagrams...
                */
               if (err != -EAGAIN) {
                       /*
                        * ... or if recvmsg returns an error after we
                        * received some datagrams, where we record the
                        * error to return on the next call or if the
                        * app asks about it using getsockopt(SO_ERROR).
                        */
                       sock->sk->sk_err = -err;
               }
      out_put:
              fput_light(sock->file, fput_needed);
      
  103         return datagrams;
      }
      
   59 SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
                      unsigned int, vlen, unsigned int, flags,
                      struct timespec __user *, timeout)
      {
              int datagrams;
              struct timespec timeout_sys;
      
              if (flags & MSG_CMSG_COMPAT)
                      return -EINVAL;
   54 
              if (!timeout)
                      return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
      
              if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
                      return -EFAULT;
      
   36         datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
      
              if (datagrams > 0 &&
  161             copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
                      datagrams = -EFAULT;
      
              return datagrams;
      }
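
       /*
        * Illustrative userspace sketch (not part of this file): recvmmsg()
        * with MSG_WAITFORONE and the timeout that the wrapper above copies in
        * and back out. 'fd' is assumed to be a bound UDP socket; buffers are
        * kept deliberately small.
        *
        *   #define VLEN 4
        *   char bufs[VLEN][1500];
        *   struct iovec iov[VLEN];
        *   struct mmsghdr mm[VLEN];
        *   for (int i = 0; i < VLEN; i++) {
        *           iov[i] = (struct iovec){ bufs[i], sizeof(bufs[i]) };
        *           mm[i] = (struct mmsghdr){ .msg_hdr = { .msg_iov = &iov[i],
        *                                                  .msg_iovlen = 1 } };
        *   }
        *   struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        *   int got = recvmmsg(fd, mm, VLEN, MSG_WAITFORONE, &ts);
        *   // on success, 'got' datagrams arrived; mm[i].msg_len holds each length
        */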
      
      #ifdef __ARCH_WANT_SYS_SOCKETCALL
      /* Argument list sizes for sys_socketcall */
      #define AL(x) ((x) * sizeof(unsigned long))
      static const unsigned char nargs[21] = {
              AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
              AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
              AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
              AL(4), AL(5), AL(4)
      };
      
      #undef AL
      
      /*
       *        System call vectors.
       *
       *        Argument checking cleaned up. Saved 20% in size.
       *  This function doesn't need to set the kernel lock because
       *  it is set by the callees.
       */
      
      SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
      {
              unsigned long a[AUDITSC_ARGS];
              unsigned long a0, a1;
              int err;
              unsigned int len;
      
              if (call < 1 || call > SYS_SENDMMSG)
                      return -EINVAL;
              call = array_index_nospec(call, SYS_SENDMMSG + 1);
      
              len = nargs[call];
              if (len > sizeof(a))
                      return -EINVAL;
      
              /* copy_from_user should be SMP safe. */
              if (copy_from_user(a, args, len))
                      return -EFAULT;
      
              err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
              if (err)
                      return err;
      
              a0 = a[0];
              a1 = a[1];
      
              switch (call) {
              case SYS_SOCKET:
                      err = sys_socket(a0, a1, a[2]);
                      break;
              case SYS_BIND:
                      err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
                      break;
              case SYS_CONNECT:
                      err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
                      break;
              case SYS_LISTEN:
                      err = sys_listen(a0, a1);
                      break;
              case SYS_ACCEPT:
                      err = sys_accept4(a0, (struct sockaddr __user *)a1,
                                        (int __user *)a[2], 0);
                      break;
              case SYS_GETSOCKNAME:
                      err =
                          sys_getsockname(a0, (struct sockaddr __user *)a1,
                                          (int __user *)a[2]);
                      break;
              case SYS_GETPEERNAME:
                      err =
                          sys_getpeername(a0, (struct sockaddr __user *)a1,
                                          (int __user *)a[2]);
                      break;
              case SYS_SOCKETPAIR:
                      err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
                      break;
              case SYS_SEND:
                      err = sys_send(a0, (void __user *)a1, a[2], a[3]);
                      break;
              case SYS_SENDTO:
                      err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                       (struct sockaddr __user *)a[4], a[5]);
                      break;
              case SYS_RECV:
                      err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
                      break;
              case SYS_RECVFROM:
                      err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                         (struct sockaddr __user *)a[4],
                                         (int __user *)a[5]);
                      break;
              case SYS_SHUTDOWN:
                      err = sys_shutdown(a0, a1);
                      break;
              case SYS_SETSOCKOPT:
                      err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
                      break;
              case SYS_GETSOCKOPT:
                      err =
                          sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                                         (int __user *)a[4]);
                      break;
              case SYS_SENDMSG:
                      err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]);
                      break;
              case SYS_SENDMMSG:
                      err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
                      break;
              case SYS_RECVMSG:
                      err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]);
                      break;
              case SYS_RECVMMSG:
                      err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
                                         (struct timespec __user *)a[4]);
                      break;
              case SYS_ACCEPT4:
                      err = sys_accept4(a0, (struct sockaddr __user *)a1,
                                        (int __user *)a[2], a[3]);
                      break;
              default:
                      err = -EINVAL;
                      break;
              }
              return err;
      }
      
      #endif                                /* __ARCH_WANT_SYS_SOCKETCALL */
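
       /*
        * Illustrative userspace sketch (not part of this file): on
        * architectures that still multiplex through sys_socketcall (e.g.
        * 32-bit x86), libc marshals the arguments into an array exactly as the
        * nargs[] table above expects. Calling it directly looks roughly like
        * this; SYS_SOCKET comes from <linux/net.h>.
        *
        *   #include <linux/net.h>
        *   #include <sys/syscall.h>
        *   #include <unistd.h>
        *
        *   unsigned long args[3] = { AF_INET, SOCK_DGRAM, 0 };
        *   int fd = syscall(__NR_socketcall, SYS_SOCKET, args);
        */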
      
      /**
       *        sock_register - add a socket protocol handler
       *        @ops: description of protocol
       *
       *        This function is called by a protocol handler that wants to
       *        advertise its address family, and have it linked into the
       *        socket interface. The value ops->family corresponds to the
       *        socket system call protocol family.
       */
      int sock_register(const struct net_proto_family *ops)
      {
              int err;
      
              if (ops->family >= NPROTO) {
                      pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
                      return -ENOBUFS;
              }
      
              spin_lock(&net_family_lock);
              if (rcu_dereference_protected(net_families[ops->family],
                                            lockdep_is_held(&net_family_lock)))
                      err = -EEXIST;
              else {
                      rcu_assign_pointer(net_families[ops->family], ops);
                      err = 0;
              }
              spin_unlock(&net_family_lock);
      
              pr_info("NET: Registered protocol family %d\n", ops->family);
              return err;
      }
      EXPORT_SYMBOL(sock_register);
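
       /*
        * Illustrative sketch (not part of this file): the pattern a protocol
        * module typically follows when registering an address family. The
        * family constant MY_AF and the create callback are hypothetical
        * placeholders.
        *
        *   static int example_create(struct net *net, struct socket *sock,
        *                             int protocol, int kern)
        *   {
        *           return -EPROTONOSUPPORT;   // placeholder
        *   }
        *
        *   static const struct net_proto_family example_family_ops = {
        *           .family = MY_AF,           // hypothetical family value
        *           .create = example_create,
        *           .owner  = THIS_MODULE,
        *   };
        *
        *   // module init:  err = sock_register(&example_family_ops);
        *   // module exit:  sock_unregister(MY_AF);
        */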
      
      /**
       *        sock_unregister - remove a protocol handler
       *        @family: protocol family to remove
       *
       *        This function is called by a protocol handler that wants to
       *        remove its address family, and have it unlinked from the
       *        new socket creation.
       *
       *        If protocol handler is a module, then it can use module reference
       *        counts to protect against new references. If protocol handler is not
       *        a module then it needs to provide its own protection in
       *        the ops->create routine.
       */
      void sock_unregister(int family)
      {
              BUG_ON(family < 0 || family >= NPROTO);
      
              spin_lock(&net_family_lock);
              RCU_INIT_POINTER(net_families[family], NULL);
              spin_unlock(&net_family_lock);
      
              synchronize_rcu();
      
              pr_info("NET: Unregistered protocol family %d\n", family);
      }
      EXPORT_SYMBOL(sock_unregister);
      
      static int __init sock_init(void)
      {
              int err;
              /*
               *      Initialize the network sysctl infrastructure.
               */
              err = net_sysctl_init();
              if (err)
                      goto out;
      
              /*
               *      Initialize skbuff SLAB cache
               */
              skb_init();
      
              /*
               *      Initialize the protocols module.
               */
      
              init_inodecache();
      
              err = register_filesystem(&sock_fs_type);
              if (err)
                      goto out_fs;
              sock_mnt = kern_mount(&sock_fs_type);
              if (IS_ERR(sock_mnt)) {
                      err = PTR_ERR(sock_mnt);
                      goto out_mount;
              }
      
              /* The real protocol initialization is performed in later initcalls.
               */
      
      #ifdef CONFIG_NETFILTER
              err = netfilter_init();
              if (err)
                      goto out;
      #endif
      
              ptp_classifier_init();
      
      out:
              return err;
      
      out_mount:
              unregister_filesystem(&sock_fs_type);
      out_fs:
              goto out;
      }
      
      core_initcall(sock_init);        /* early initcall */
      
      static int __init jit_init(void)
      {
      #ifdef CONFIG_BPF_JIT_ALWAYS_ON
              bpf_jit_enable = 1;
      #endif
              return 0;
      }
      pure_initcall(jit_init);
      
      #ifdef CONFIG_PROC_FS
      void socket_seq_show(struct seq_file *seq)
      {
              int cpu;
              int counter = 0;
      
              for_each_possible_cpu(cpu)
                  counter += per_cpu(sockets_in_use, cpu);
      
              /* It can be negative, by the way. 8) */
              if (counter < 0)
                      counter = 0;
      
              seq_printf(seq, "sockets: used %d\n", counter);
      }
      #endif                                /* CONFIG_PROC_FS */
      
      #ifdef CONFIG_COMPAT
      static int do_siocgstamp(struct net *net, struct socket *sock,
                               unsigned int cmd, void __user *up)
      {
              mm_segment_t old_fs = get_fs();
              struct timeval ktv;
              int err;
      
              set_fs(KERNEL_DS);
               err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
               set_fs(old_fs);
               if (!err)
                       err = compat_put_timeval(&ktv, up);

               return err;
       }

       static int do_siocgstampns(struct net *net, struct socket *sock,
                                 unsigned int cmd, void __user *up)
      {
              mm_segment_t old_fs = get_fs();
              struct timespec kts;
              int err;
      
              set_fs(KERNEL_DS);
              err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
              set_fs(old_fs);
              if (!err)
    7                 err = compat_put_timespec(&kts, up);
      
              return err;
      }
      
      static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32)
      {
              struct ifreq __user *uifr;
    5         int err;
      
              uifr = compat_alloc_user_space(sizeof(struct ifreq));
              if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
                      return -EFAULT;
      
              err = dev_ioctl(net, SIOCGIFNAME, uifr);
              if (err)
    4                 return err;
      
              if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq)))
                      return -EFAULT;
      
              return 0;
      }
      
    3 static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
      {
              struct compat_ifconf ifc32;
              struct ifconf ifc;
              struct ifconf __user *uifc;
              struct compat_ifreq __user *ifr32;
              struct ifreq __user *ifr;
              unsigned int i, j;
              int err;
      
    2         if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
                      return -EFAULT;
      
              memset(&ifc, 0, sizeof(ifc));
    2         if (ifc32.ifcbuf == 0) {
                      ifc32.ifc_len = 0;
                      ifc.ifc_len = 0;
                      ifc.ifc_req = NULL;
    1                 uifc = compat_alloc_user_space(sizeof(struct ifconf));
              } else {
                      size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) *
                              sizeof(struct ifreq);
                      uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
                      ifc.ifc_len = len;
                      ifr = ifc.ifc_req = (void __user *)(uifc + 1);
                      ifr32 = compat_ptr(ifc32.ifcbuf);
                      for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) {
                              if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq)))
                                      return -EFAULT;
                              ifr++;
                              ifr32++;
                      }
              }
              if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
   14                 return -EFAULT;
      
              err = dev_ioctl(net, SIOCGIFCONF, uifc);
   10         if (err)
                      return err;
    1 
              if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
                      return -EFAULT;
      
              ifr = ifc.ifc_req;
    9         ifr32 = compat_ptr(ifc32.ifcbuf);
              for (i = 0, j = 0;
                   i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
                   i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) {
                      if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq)))
                              return -EFAULT;
                      ifr32++;
    9                 ifr++;
              }

               if (ifc32.ifcbuf == 0) {
                       /* Translate from 64-bit structure multiple to
                        * a 32-bit one.
                        */
                       i = ifc.ifc_len;
                       i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq));
                       ifc32.ifc_len = i;
              } else {
                      ifc32.ifc_len = i;
              }
    9         if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
                      return -EFAULT;
      
    9         return 0;
      }
      
    8 static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
      {
    8         struct compat_ethtool_rxnfc __user *compat_rxnfc;
              bool convert_in = false, convert_out = false;
    8         size_t buf_size = ALIGN(sizeof(struct ifreq), 8);
              struct ethtool_rxnfc __user *rxnfc;
              struct ifreq __user *ifr;
              u32 rule_cnt = 0, actual_rule_cnt;
    9         u32 ethcmd;
              u32 data;
              int ret;
      
    1         if (get_user(data, &ifr32->ifr_ifru.ifru_data))
    9                 return -EFAULT;
      
              compat_rxnfc = compat_ptr(data);
    8 
              if (get_user(ethcmd, &compat_rxnfc->cmd))
                      return -EFAULT;
      
              /* Most ethtool structures are defined without padding.
               * Unfortunately struct ethtool_rxnfc is an exception.
               */
              switch (ethcmd) {
              default:
                      break;
              case ETHTOOL_GRXCLSRLALL:
                      /* Buffer size is variable */
                      if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
                              return -EFAULT;
                      if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
                              return -ENOMEM;
                      buf_size += rule_cnt * sizeof(u32);
                      /* fall through */
              case ETHTOOL_GRXRINGS:
              case ETHTOOL_GRXCLSRLCNT:
  109         case ETHTOOL_GRXCLSRULE:
              case ETHTOOL_SRXCLSRLINS:
                      convert_out = true;
  109                 /* fall through */
              case ETHTOOL_SRXCLSRLDEL:
                      buf_size += sizeof(struct ethtool_rxnfc);
                      convert_in = true;
                      break;
              }
      
              ifr = compat_alloc_user_space(buf_size);
  108         rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8);
      
              if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
                      return -EFAULT;
      
    3         if (put_user(convert_in ? rxnfc : compat_ptr(data),
                           &ifr->ifr_ifru.ifru_data))
    3                 return -EFAULT;
      
    2         if (convert_in) {
                      /* We expect there to be holes between fs.m_ext and
                       * fs.ring_cookie and at the end of fs, but nowhere else.
                       */
                      BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
                                   sizeof(compat_rxnfc->fs.m_ext) !=
                                   offsetof(struct ethtool_rxnfc, fs.m_ext) +
                                   sizeof(rxnfc->fs.m_ext));
                      BUILD_BUG_ON(
                              offsetof(struct compat_ethtool_rxnfc, fs.location) -
                              offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
                              offsetof(struct ethtool_rxnfc, fs.location) -
                              offsetof(struct ethtool_rxnfc, fs.ring_cookie));
      
  107                 if (copy_in_user(rxnfc, compat_rxnfc,
    5                                  (void __user *)(&rxnfc->fs.m_ext + 1) -
                                       (void __user *)rxnfc) ||
                          copy_in_user(&rxnfc->fs.ring_cookie,
                                       &compat_rxnfc->fs.ring_cookie,
                                       (void __user *)(&rxnfc->fs.location + 1) -
  107                                  (void __user *)&rxnfc->fs.ring_cookie))
                              return -EFAULT;
                      if (ethcmd == ETHTOOL_GRXCLSRLALL) {
                              if (put_user(rule_cnt, &rxnfc->rule_cnt))
  107                                 return -EFAULT;
                      } else if (copy_in_user(&rxnfc->rule_cnt,
                                              &compat_rxnfc->rule_cnt,
                                              sizeof(rxnfc->rule_cnt)))
                              return -EFAULT;
              }
      
              ret = dev_ioctl(net, SIOCETHTOOL, ifr);
              if (ret)
                      return ret;
      
              if (convert_out) {
                      if (copy_in_user(compat_rxnfc, rxnfc,
                                       (const void __user *)(&rxnfc->fs.m_ext + 1) -
                                       (const void __user *)rxnfc) ||
    5                     copy_in_user(&compat_rxnfc->fs.ring_cookie,
                                       &rxnfc->fs.ring_cookie,
                                       (const void __user *)(&rxnfc->fs.location + 1) -
                                       (const void __user *)&rxnfc->fs.ring_cookie) ||
    5                     copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
                                       sizeof(rxnfc->rule_cnt)))
                              return -EFAULT;
    5 
    2                 if (ethcmd == ETHTOOL_GRXCLSRLALL) {
                              /* As an optimisation, we only copy the actual
                               * number of rules that the underlying
    3                          * function returned.  Since Mallory might
                               * change the rule count in user memory, we
                               * check that it is less than the rule count
                               * originally given (as the user buffer size),
                               * which has been range-checked.
  107                          */
                              if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
                                      return -EFAULT;
                              if (actual_rule_cnt < rule_cnt)
   36                                 rule_cnt = actual_rule_cnt;
                              if (copy_in_user(&compat_rxnfc->rule_locs[0],
                                               &rxnfc->rule_locs[0],
                                               rule_cnt * sizeof(u32)))
                                      return -EFAULT;
                      }
              }
      
              return 0;
      }
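
       /*
        * Illustrative userspace sketch (not part of this file): roughly the
        * ETHTOOL_GRXCLSRLCNT request that the compat path above has to
        * translate.  The helper name is an assumption for the example; fd is
        * any AF_INET socket.
        */
       #include <string.h>
       #include <sys/ioctl.h>
       #include <net/if.h>
       #include <linux/ethtool.h>
       #include <linux/sockios.h>

       static int example_rx_rule_count(int fd, const char *ifname, __u32 *cnt)
       {
               struct ethtool_rxnfc nfc;
               struct ifreq ifr;

               memset(&nfc, 0, sizeof(nfc));
               nfc.cmd = ETHTOOL_GRXCLSRLCNT;

               memset(&ifr, 0, sizeof(ifr));
               strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
               ifr.ifr_data = (void *)&nfc;    /* the pointer the kernel converts */

               if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
                       return -1;

               *cnt = nfc.rule_cnt;
               return 0;
       }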
      
      static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
      {
              void __user *uptr;
              compat_uptr_t uptr32;
              struct ifreq __user *uifr;
      
              uifr = compat_alloc_user_space(sizeof(*uifr));
              if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
                      return -EFAULT;
      
              if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
                      return -EFAULT;
      
              uptr = compat_ptr(uptr32);
      
              if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc))
                      return -EFAULT;
      
              return dev_ioctl(net, SIOCWANDEV, uifr);
      }
      
      static int bond_ioctl(struct net *net, unsigned int cmd,
                               struct compat_ifreq __user *ifr32)
      {
              struct ifreq kifr;
              mm_segment_t old_fs;
              int err;
      
              switch (cmd) {
              case SIOCBONDENSLAVE:
    1         case SIOCBONDRELEASE:
              case SIOCBONDSETHWADDR:
              case SIOCBONDCHANGEACTIVE:
                      if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq)))
    1                         return -EFAULT;
      
                      old_fs = get_fs();
    1                 set_fs(KERNEL_DS);
                      err = dev_ioctl(net, cmd,
                                      (struct ifreq __user __force *) &kifr);
                      set_fs(old_fs);
      
    1                 return err;
              default:
                      return -ENOIOCTLCMD;
              }
      }
      
      /* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
      static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
                                       struct compat_ifreq __user *u_ifreq32)
      {
    6         struct ifreq __user *u_ifreq64;
              char tmp_buf[IFNAMSIZ];
              void __user *data64;
              u32 data32;
      
    6         if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
                                 IFNAMSIZ))
                      return -EFAULT;
    6         if (get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
                      return -EFAULT;
              data64 = compat_ptr(data32);
      
              u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
      
   20         if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
                               IFNAMSIZ))
                      return -EFAULT;
              if (put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
                      return -EFAULT;
      
              return dev_ioctl(net, cmd, u_ifreq64);
      }
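
       /*
        * Illustrative userspace sketch (not part of this file): SIOCGHWTSTAMP
        * is one of the ifr_data-style requests routed through
        * compat_ifr_data_ioctl() above; only the ifreq layout differs between
        * the 32-bit and 64-bit ABIs, the structure behind ifr_data does not.
        * The helper name is an assumption for the example; fd is any AF_INET
        * socket.
        */
       #include <string.h>
       #include <sys/ioctl.h>
       #include <net/if.h>
       #include <linux/net_tstamp.h>
       #include <linux/sockios.h>

       static int example_get_hwtstamp(int fd, const char *ifname,
                                       struct hwtstamp_config *cfg)
       {
               struct ifreq ifr;

               memset(cfg, 0, sizeof(*cfg));
               memset(&ifr, 0, sizeof(ifr));
               strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
               ifr.ifr_data = (void *)cfg;

               return ioctl(fd, SIOCGHWTSTAMP, &ifr);
       }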
      
      static int dev_ifsioc(struct net *net, struct socket *sock,
                               unsigned int cmd, struct compat_ifreq __user *uifr32)
      {
              struct ifreq __user *uifr;
              int err;
      
   85         uifr = compat_alloc_user_space(sizeof(*uifr));
              if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
                      return -EFAULT;
   84 
              err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
   84 
              if (!err) {
                      switch (cmd) {
                      case SIOCGIFFLAGS:
                      case SIOCGIFMETRIC:
                      case SIOCGIFMTU:
                      case SIOCGIFMEM:
   84                 case SIOCGIFHWADDR:
                      case SIOCGIFINDEX:
                      case SIOCGIFADDR:
   85                 case SIOCGIFBRDADDR:
                      case SIOCGIFDSTADDR:
                      case SIOCGIFNETMASK:
                      case SIOCGIFPFLAGS:
                      case SIOCGIFTXQLEN:
                      case SIOCGMIIPHY:
                      case SIOCGMIIREG:
                              if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
                                      err = -EFAULT;
  228                         break;
                      }
              }
              return err;
  227 }
      
      static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
  168                         struct compat_ifreq __user *uifr32)
      {
              struct ifreq ifr;
              struct compat_ifmap __user *uifmap32;
              mm_segment_t old_fs;
              int err;
      
              uifmap32 = &uifr32->ifr_ifru.ifru_map;
              err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
              err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
              err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
              err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
              err |= get_user(ifr.ifr_map.irq, &uifmap32->irq);
              err |= get_user(ifr.ifr_map.dma, &uifmap32->dma);
              err |= get_user(ifr.ifr_map.port, &uifmap32->port);
   21         if (err)
                      return -EFAULT;
      
              old_fs = get_fs();
              set_fs(KERNEL_DS);
              err = dev_ioctl(net, cmd, (void  __user __force *)&ifr);
              set_fs(old_fs);
      
              if (cmd == SIOCGIFMAP && !err) {
                      err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
                      err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
                      err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
                      err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
                      err |= put_user(ifr.ifr_map.irq, &uifmap32->irq);
                      err |= put_user(ifr.ifr_map.dma, &uifmap32->dma);
                      err |= put_user(ifr.ifr_map.port, &uifmap32->port);
                      if (err)
    4                         err = -EFAULT;
              }
              return err;
      }
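
       /*
        * Illustrative userspace sketch (not part of this file): the native
        * SIOCGIFMAP call.  struct ifmap carries unsigned long members whose
        * width differs between 32-bit and 64-bit userland, which is why
        * compat_sioc_ifmap() above converts the structure field by field.
        * The helper name is an assumption for the example.
        */
       #include <string.h>
       #include <sys/ioctl.h>
       #include <net/if.h>

       static int example_get_ifmap(int fd, const char *ifname, struct ifmap *map)
       {
               struct ifreq ifr;

               memset(&ifr, 0, sizeof(ifr));
               strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
               if (ioctl(fd, SIOCGIFMAP, &ifr) < 0)
                       return -1;

               *map = ifr.ifr_map;
               return 0;
       }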
      
      struct rtentry32 {
              u32                rt_pad1;
              struct sockaddr rt_dst;         /* target address               */
              struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
              struct sockaddr rt_genmask;     /* target network mask (IP)     */
    3         unsigned short        rt_flags;
              short                rt_pad2;
              u32                rt_pad3;
              unsigned char        rt_tos;
              unsigned char        rt_class;
  131         short                rt_pad4;
    1         short                rt_metric;      /* +1 for binary compatibility! */
              /* char * */ u32 rt_dev;        /* forcing the device at add    */
              u32                rt_mtu;         /* per route MTU/Window         */
              u32                rt_window;      /* Window clamping              */
              unsigned short  rt_irtt;        /* Initial RTT                  */
      };
      
      struct in6_rtmsg32 {
              struct in6_addr                rtmsg_dst;
              struct in6_addr                rtmsg_src;
              struct in6_addr                rtmsg_gateway;
              u32                        rtmsg_type;
              u16                        rtmsg_dst_len;
              u16                        rtmsg_src_len;
              u32                        rtmsg_metric;
              u32                        rtmsg_info;
              u32                        rtmsg_flags;
              s32                        rtmsg_ifindex;
      };
      
      static int routing_ioctl(struct net *net, struct socket *sock,
                               unsigned int cmd, void __user *argp)
      {
              int ret;
              void *r = NULL;
              struct in6_rtmsg r6;
              struct rtentry r4;
              char devname[16];
              u32 rtdev;
              mm_segment_t old_fs = get_fs();
      
              if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */
                      struct in6_rtmsg32 __user *ur6 = argp;
                      ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
                              3 * sizeof(struct in6_addr));
                      ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
                      ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
                      ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
                      ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
                      ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
                      ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
                      ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
      
                      r = (void *) &r6;
              } else { /* ipv4 */
                      struct rtentry32 __user *ur4 = argp;
                      ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
                                              3 * sizeof(struct sockaddr));
                      ret |= get_user(r4.rt_flags, &(ur4->rt_flags));
                      ret |= get_user(r4.rt_metric, &(ur4->rt_metric));
                      ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu));
                      ret |= get_user(r4.rt_window, &(ur4->rt_window));
                      ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt));
   33                 ret |= get_user(rtdev, &(ur4->rt_dev));
                      if (rtdev) {
   33                         ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
                              r4.rt_dev = (char __user __force *)devname;
   11                         devname[15] = 0;
                      } else
                              r4.rt_dev = NULL;
      
                      r = (void *) &r4;
              }
      
              if (ret) {
                      ret = -EFAULT;
                      goto out;
              }
      
              set_fs(KERNEL_DS);
   22         ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
              set_fs(old_fs);
      
      out:
              return ret;
      }
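
       /*
        * Illustrative userspace sketch (not part of this file): the native
        * SIOCADDRT request that routing_ioctl() above rebuilds from struct
        * rtentry32, since struct rtentry embeds pointers and longs whose
        * sizes differ between ABIs.  The helper name is an assumption for the
        * example; fd is an AF_INET socket and the netmask is hard-coded to
        * /24.
        */
       #include <string.h>
       #include <sys/ioctl.h>
       #include <net/route.h>
       #include <netinet/in.h>
       #include <arpa/inet.h>

       static int example_add_route(int fd, const char *dst, const char *gw)
       {
               struct sockaddr_in *sin;
               struct rtentry rt;

               memset(&rt, 0, sizeof(rt));

               sin = (struct sockaddr_in *)&rt.rt_dst;
               sin->sin_family = AF_INET;
               inet_pton(AF_INET, dst, &sin->sin_addr);

               sin = (struct sockaddr_in *)&rt.rt_gateway;
               sin->sin_family = AF_INET;
               inet_pton(AF_INET, gw, &sin->sin_addr);

               sin = (struct sockaddr_in *)&rt.rt_genmask;
               sin->sin_family = AF_INET;
               sin->sin_addr.s_addr = htonl(0xffffff00);

               rt.rt_flags = RTF_UP | RTF_GATEWAY;

               return ioctl(fd, SIOCADDRT, &rt);
       }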
      
       /* Since old-style bridge ioctls end up using SIOCDEVPRIVATE
        * for some operations, this forces use of the newer bridge-utils,
        * which use compatible ioctls.
        */
      static int old_bridge_ioctl(compat_ulong_t __user *argp)
      {
   19         compat_ulong_t tmp;
      
              if (get_user(tmp, argp))
                      return -EFAULT;
              if (tmp == BRCTL_GET_VERSION)
   33                 return BRCTL_VERSION + 1;
              return -EINVAL;
      }
      
      static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
   30                          unsigned int cmd, unsigned long arg)
      {
              void __user *argp = compat_ptr(arg);
              struct sock *sk = sock->sk;
              struct net *net = sock_net(sk);
   33 
              if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
                      return compat_ifr_data_ioctl(net, cmd, argp);
      
              switch (cmd) {
              case SIOCSIFBR:
              case SIOCGIFBR:
                      return old_bridge_ioctl(argp);
              case SIOCGIFNAME:
                      return dev_ifname32(net, argp);
              case SIOCGIFCONF:
    3                 return dev_ifconf(net, argp);
              case SIOCETHTOOL:
    3                 return ethtool_ioctl(net, argp);
              case SIOCWANDEV:
                      return compat_siocwandev(net, argp);
              case SIOCGIFMAP:
              case SIOCSIFMAP:
                      return compat_sioc_ifmap(net, cmd, argp);
              case SIOCBONDENSLAVE:
              case SIOCBONDRELEASE:
  587         case SIOCBONDSETHWADDR:
              case SIOCBONDCHANGEACTIVE:
                      return bond_ioctl(net, cmd, argp);
              case SIOCADDRT:
              case SIOCDELRT:
                      return routing_ioctl(net, sock, cmd, argp);
              case SIOCGSTAMP:
  514                 return do_siocgstamp(net, sock, cmd, argp);
              case SIOCGSTAMPNS:
                      return do_siocgstampns(net, sock, cmd, argp);
    3         case SIOCBONDSLAVEINFOQUERY:
              case SIOCBONDINFOQUERY:
    2         case SIOCSHWTSTAMP:
              case SIOCGHWTSTAMP:
   14                 return compat_ifr_data_ioctl(net, cmd, argp);
      
  109         case FIOSETOWN:
              case SIOCSPGRP:
    1         case FIOGETOWN:
              case SIOCGPGRP:
              case SIOCBRADDBR:
  131         case SIOCBRDELBR:
              case SIOCGIFVLAN:
              case SIOCSIFVLAN:
              case SIOCADDDLCI:
              case SIOCDELDLCI:
   20                 return sock_ioctl(file, cmd, arg);
      
              case SIOCGIFFLAGS:
   33         case SIOCSIFFLAGS:
              case SIOCGIFMETRIC:
    7         case SIOCSIFMETRIC:
              case SIOCGIFMTU:
    4         case SIOCSIFMTU:
              case SIOCGIFMEM:
              case SIOCSIFMEM:
              case SIOCGIFHWADDR:
              case SIOCSIFHWADDR:
   85         case SIOCADDMULTI:
              case SIOCDELMULTI:
              case SIOCGIFINDEX:
              case SIOCGIFADDR:
              case SIOCSIFADDR:
              case SIOCSIFHWBROADCAST:
              case SIOCDIFADDR:
              case SIOCGIFBRDADDR:
              case SIOCSIFBRDADDR:
              case SIOCGIFDSTADDR:
              case SIOCSIFDSTADDR:
              case SIOCGIFNETMASK:
   19         case SIOCSIFNETMASK:
              case SIOCSIFPFLAGS:
              case SIOCGIFPFLAGS:
              case SIOCGIFTXQLEN:
              case SIOCSIFTXQLEN:
              case SIOCBRADDIF:
              case SIOCBRDELIF:
              case SIOCSIFNAME:
              case SIOCGMIIPHY:
              case SIOCGMIIREG:
              case SIOCSMIIREG:
                      return dev_ifsioc(net, sock, cmd, argp);
      
              case SIOCSARP:
              case SIOCGARP:
              case SIOCDARP:
              case SIOCATMARK:
                      return sock_do_ioctl(net, sock, cmd, arg);
              }
      
              return -ENOIOCTLCMD;
      }
      
      static long compat_sock_ioctl(struct file *file, unsigned int cmd,
                                    unsigned long arg)
      {
              struct socket *sock = file->private_data;
              int ret = -ENOIOCTLCMD;
              struct sock *sk;
              struct net *net;
      
              sk = sock->sk;
              net = sock_net(sk);
      
              if (sock->ops->compat_ioctl)
  228                 ret = sock->ops->compat_ioctl(sock, cmd, arg);
      
              if (ret == -ENOIOCTLCMD &&
                  (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST))
                      ret = compat_wext_handle_ioctl(net, cmd, arg);
      
   34         if (ret == -ENOIOCTLCMD)
                      ret = compat_sock_ioctl_trans(file, sock, cmd, arg);
      
              return ret;
      }
      #endif
      
      int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
      {
  588         return sock->ops->bind(sock, addr, addrlen);
      }
      EXPORT_SYMBOL(kernel_bind);
      
      int kernel_listen(struct socket *sock, int backlog)
      {
              return sock->ops->listen(sock, backlog);
      }
      EXPORT_SYMBOL(kernel_listen);
  216 
      int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
      {
  588         struct sock *sk = sock->sk;
              int err;
      
              err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
  588                                newsock);
              if (err < 0)
  570                 goto done;
      
              err = sock->ops->accept(sock, *newsock, flags);
              if (err < 0) {
                      sock_release(*newsock);
                      *newsock = NULL;
                      goto done;
              }
      
              (*newsock)->ops = sock->ops;
              __module_get((*newsock)->ops->owner);
      
      done:
              return err;
      }
      EXPORT_SYMBOL(kernel_accept);
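
       /*
        * Illustrative in-kernel sketch (not part of this file): how the
        * helpers above are typically strung together to accept a single TCP
        * connection.  Assumes the usual <linux/net.h> and <linux/in.h>
        * headers; the port number and the function name are placeholders,
        * and error handling is reduced to the minimum.
        */
       static int example_kernel_listener(struct socket **newsock)
       {
               struct sockaddr_in addr = {
                       .sin_family      = AF_INET,
                       .sin_addr.s_addr = htonl(INADDR_ANY),
                       .sin_port        = htons(12345),
               };
               struct socket *sock;
               int err;

               err = sock_create_kern(&init_net, AF_INET, SOCK_STREAM,
                                      IPPROTO_TCP, &sock);
               if (err)
                       return err;

               err = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr));
               if (!err)
                       err = kernel_listen(sock, 5);
               if (!err)
                       err = kernel_accept(sock, newsock, 0);

               sock_release(sock);     /* *newsock remains valid on success */
               return err;
       }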
      
      int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
                         int flags)
      {
              return sock->ops->connect(sock, addr, addrlen, flags);
      }
      EXPORT_SYMBOL(kernel_connect);
      
      int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
                               int *addrlen)
      {
              return sock->ops->getname(sock, addr, addrlen, 0);
      }
      EXPORT_SYMBOL(kernel_getsockname);
      
      int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
                               int *addrlen)
      {
              return sock->ops->getname(sock, addr, addrlen, 1);
      }
      EXPORT_SYMBOL(kernel_getpeername);
      
      int kernel_getsockopt(struct socket *sock, int level, int optname,
                              char *optval, int *optlen)
      {
              mm_segment_t oldfs = get_fs();
              char __user *uoptval;
              int __user *uoptlen;
              int err;
      
              uoptval = (char __user __force *) optval;
              uoptlen = (int __user __force *) optlen;
      
              set_fs(KERNEL_DS);
              if (level == SOL_SOCKET)
                      err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
              else
                      err = sock->ops->getsockopt(sock, level, optname, uoptval,
                                                  uoptlen);
              set_fs(oldfs);
              return err;
      }
    4 EXPORT_SYMBOL(kernel_getsockopt);
      
      int kernel_setsockopt(struct socket *sock, int level, int optname,
                              char *optval, unsigned int optlen)
      {
              mm_segment_t oldfs = get_fs();
              char __user *uoptval;
              int err;
      
              uoptval = (char __user __force *) optval;
      
              set_fs(KERNEL_DS);
              if (level == SOL_SOCKET)
                      err = sock_setsockopt(sock, level, optname, uoptval, optlen);
              else
                      err = sock->ops->setsockopt(sock, level, optname, uoptval,
                                                  optlen);
              set_fs(oldfs);
              return err;
      }
      EXPORT_SYMBOL(kernel_setsockopt);
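
       /*
        * Illustrative sketch (not part of this file): kernel_setsockopt()
        * takes kernel pointers, but the option value is still passed as
        * char *, so in-kernel callers cast an on-stack int, e.g. to disable
        * Nagle on a kernel-owned TCP socket.  The helper name is an
        * assumption for the example.
        */
       static int example_set_nodelay(struct socket *sock)
       {
               int one = 1;

               return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
                                        (char *)&one, sizeof(one));
       }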
      
      int kernel_sendpage(struct socket *sock, struct page *page, int offset,
                          size_t size, int flags)
      {
              if (sock->ops->sendpage)
                      return sock->ops->sendpage(sock, page, offset, size, flags);
      
              return sock_no_sendpage(sock, page, offset, size, flags);
      }
      EXPORT_SYMBOL(kernel_sendpage);
      
      int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
      {
              mm_segment_t oldfs = get_fs();
              int err;
      
              set_fs(KERNEL_DS);
              err = sock->ops->ioctl(sock, cmd, arg);
              set_fs(oldfs);
      
              return err;
      }
      EXPORT_SYMBOL(kernel_sock_ioctl);
      
      int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
      {
              return sock->ops->shutdown(sock, how);
      }
  398 EXPORT_SYMBOL(kernel_sock_shutdown);
      /*
       * xfrm4_policy.c
       *
       * Changes:
       *        Kazunori MIYAZAWA @USAGI
       *         YOSHIFUJI Hideaki @USAGI
       *                Split up af-specific portion
       *
       */
      
      #include <linux/err.h>
      #include <linux/kernel.h>
      #include <linux/inetdevice.h>
      #include <linux/if_tunnel.h>
      #include <net/dst.h>
      #include <net/xfrm.h>
      #include <net/ip.h>
      #include <net/l3mdev.h>
      
      static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
      
      static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
                                                  int tos, int oif,
                                                  const xfrm_address_t *saddr,
                                                  const xfrm_address_t *daddr,
                                                  u32 mark)
      {
              struct rtable *rt;
      
  239         memset(fl4, 0, sizeof(*fl4));
              fl4->daddr = daddr->a4;
              fl4->flowi4_tos = tos;
              fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
              fl4->flowi4_mark = mark;
              if (saddr)
                      fl4->saddr = saddr->a4;
      
              fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
      
              rt = __ip_route_output_key(net, fl4);
              if (!IS_ERR(rt))
  238                 return &rt->dst;
      
              return ERR_CAST(rt);
      }
      
      static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
                                                const xfrm_address_t *saddr,
                                                const xfrm_address_t *daddr,
                                                u32 mark)
      {
              struct flowi4 fl4;
      
              return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
      }
      
      static int xfrm4_get_saddr(struct net *net, int oif,
                                 xfrm_address_t *saddr, xfrm_address_t *daddr,
                                 u32 mark)
      {
              struct dst_entry *dst;
              struct flowi4 fl4;
      
  239         dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
              if (IS_ERR(dst))
                      return -EHOSTUNREACH;
      
  238         saddr->a4 = fl4.saddr;
              dst_release(dst);
  239         return 0;
      }
      
      static int xfrm4_get_tos(const struct flowi *fl)
      {
              return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
      }
      
      static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
                                 int nfheader_len)
      {
              return 0;
      }
      
      static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
                                const struct flowi *fl)
      {
              struct rtable *rt = (struct rtable *)xdst->route;
              const struct flowi4 *fl4 = &fl->u.ip4;
      
              xdst->u.rt.rt_iif = fl4->flowi4_iif;
      
              xdst->u.dst.dev = dev;
              dev_hold(dev);
      
               /* This used to be handled correctly; the logic was apparently
                * lost along the way, so this code still needs an audit. */
              xdst->u.rt.rt_is_input = rt->rt_is_input;
              xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
                                                    RTCF_LOCAL);
              xdst->u.rt.rt_type = rt->rt_type;
              xdst->u.rt.rt_gateway = rt->rt_gateway;
              xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
              xdst->u.rt.rt_pmtu = rt->rt_pmtu;
              xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
              xdst->u.rt.rt_table_id = rt->rt_table_id;
              INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
      
              return 0;
      }
      
      static void
      _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
      {
  152         const struct iphdr *iph = ip_hdr(skb);
              int ihl = iph->ihl;
              u8 *xprth = skb_network_header(skb) + ihl * 4;
              struct flowi4 *fl4 = &fl->u.ip4;
              int oif = 0;
  152 
  145         if (skb_dst(skb))
                      oif = l3mdev_fib_oif(skb_dst(skb)->dev);
  152 
              memset(fl4, 0, sizeof(struct flowi4));
  152         fl4->flowi4_mark = skb->mark;
              fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
      
  125         fl4->flowi4_proto = iph->protocol;
              fl4->daddr = reverse ? iph->saddr : iph->daddr;
              fl4->saddr = reverse ? iph->daddr : iph->saddr;
              fl4->flowi4_tos = iph->tos;
      
              if (!ip_is_fragment(iph)) {
  119                 switch (iph->protocol) {
  119                 case IPPROTO_UDP:
                      case IPPROTO_UDPLITE:
                      case IPPROTO_TCP:
  118                 case IPPROTO_SCTP:
                      case IPPROTO_DCCP:
                              if (xprth + 4 < skb->data ||
  118                             pskb_may_pull(skb, xprth + 4 - skb->data)) {
  118                                 __be16 *ports;
      
                                      xprth = skb_network_header(skb) + ihl * 4;
                                      ports = (__be16 *)xprth;
      
    1                                 fl4->fl4_sport = ports[!!reverse];
    1                                 fl4->fl4_dport = ports[!reverse];
                              }
                              break;
      
                      case IPPROTO_ICMP:
                              if (xprth + 2 < skb->data ||
                                  pskb_may_pull(skb, xprth + 2 - skb->data)) {
                                      u8 *icmp;
      
                                      xprth = skb_network_header(skb) + ihl * 4;
                                      icmp = xprth;
      
    1                                 fl4->fl4_icmp_type = icmp[0];
    1                                 fl4->fl4_icmp_code = icmp[1];
                              }
                              break;
    1 
                      case IPPROTO_ESP:
                              if (xprth + 4 < skb->data ||
                                  pskb_may_pull(skb, xprth + 4 - skb->data)) {
                                      __be32 *ehdr;
      
                                      xprth = skb_network_header(skb) + ihl * 4;
                                      ehdr = (__be32 *)xprth;
      
                                      fl4->fl4_ipsec_spi = ehdr[0];
                              }
                              break;
      
                      case IPPROTO_AH:
                              if (xprth + 8 < skb->data ||
                                  pskb_may_pull(skb, xprth + 8 - skb->data)) {
                                      __be32 *ah_hdr;
      
                                      xprth = skb_network_header(skb) + ihl * 4;
                                      ah_hdr = (__be32 *)xprth;
    1 
    1                                 fl4->fl4_ipsec_spi = ah_hdr[1];
                              }
                              break;
      
                      case IPPROTO_COMP:
                              if (xprth + 4 < skb->data ||
                                  pskb_may_pull(skb, xprth + 4 - skb->data)) {
                                      __be16 *ipcomp_hdr;
      
                                      xprth = skb_network_header(skb) + ihl * 4;
                                      ipcomp_hdr = (__be16 *)xprth;
    5 
    5                                 fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
                              }
                              break;
      
    3                 case IPPROTO_GRE:
                              if (xprth + 12 < skb->data ||
                                  pskb_may_pull(skb, xprth + 12 - skb->data)) {
                                      __be16 *greflags;
                                      __be32 *gre_hdr;
    2 
    1                                 xprth = skb_network_header(skb) + ihl * 4;
    2                                 greflags = (__be16 *)xprth;
                                      gre_hdr = (__be32 *)xprth;
      
                                      if (greflags[0] & GRE_KEY) {
                                              if (greflags[0] & GRE_CSUM)
                                                      gre_hdr++;
                                              fl4->fl4_gre_key = gre_hdr[1];
                                      }
                              }
                              break;
  152 
  152                 default:
  152                         fl4->fl4_ipsec_spi = 0;
                              break;
                      }
              }
      }
      
      static inline int xfrm4_garbage_collect(struct dst_ops *ops)
      {
              struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
      
              xfrm4_policy_afinfo.garbage_collect(net);
              return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
      }
      
      static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                    struct sk_buff *skb, u32 mtu)
      {
              struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
              struct dst_entry *path = xdst->route;
      
              path->ops->update_pmtu(path, sk, skb, mtu);
      }
      
      static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
                                 struct sk_buff *skb)
      {
              struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
              struct dst_entry *path = xdst->route;
      
              path->ops->redirect(path, sk, skb);
      }
      
      static void xfrm4_dst_destroy(struct dst_entry *dst)
      {
              struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
      
              dst_destroy_metrics_generic(dst);
      
              xfrm_dst_destroy(xdst);
      }
      
      static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                                   int unregister)
      {
              if (!unregister)
                      return;
      
              xfrm_dst_ifdown(dst, dev);
      }
      
      static struct dst_ops xfrm4_dst_ops_template = {
              .family =                AF_INET,
              .gc =                        xfrm4_garbage_collect,
              .update_pmtu =                xfrm4_update_pmtu,
              .redirect =                xfrm4_redirect,
              .cow_metrics =                dst_cow_metrics_generic,
              .destroy =                xfrm4_dst_destroy,
              .ifdown =                xfrm4_dst_ifdown,
              .local_out =                __ip_local_out,
              .gc_thresh =                INT_MAX,
      };
      
      static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
              .family =                 AF_INET,
              .dst_ops =                &xfrm4_dst_ops_template,
              .dst_lookup =                xfrm4_dst_lookup,
              .get_saddr =                xfrm4_get_saddr,
              .decode_session =        _decode_session4,
              .get_tos =                xfrm4_get_tos,
              .init_path =                xfrm4_init_path,
              .fill_dst =                xfrm4_fill_dst,
              .blackhole_route =        ipv4_blackhole_route,
      };
      
      #ifdef CONFIG_SYSCTL
      static struct ctl_table xfrm4_policy_table[] = {
              {
                      .procname       = "xfrm4_gc_thresh",
                      .data           = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
                      .maxlen         = sizeof(int),
                      .mode           = 0644,
                      .proc_handler   = proc_dointvec,
              },
              { }
      };
      
      static int __net_init xfrm4_net_sysctl_init(struct net *net)
      {
              struct ctl_table *table;
              struct ctl_table_header *hdr;
   30 
   30         table = xfrm4_policy_table;
              if (!net_eq(net, &init_net)) {
                      table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
                      if (!table)
   30                         goto err_alloc;
      
                      table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
              }
      
              hdr = register_net_sysctl(net, "net/ipv4", table);
              if (!hdr)
   30                 goto err_reg;
      
              net->ipv4.xfrm4_hdr = hdr;
              return 0;
      
      err_reg:
              if (!net_eq(net, &init_net))
                      kfree(table);
      err_alloc:
              return -ENOMEM;
      }
      
      static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
      {
              struct ctl_table *table;
      
              if (!net->ipv4.xfrm4_hdr)
                      return;
      
              table = net->ipv4.xfrm4_hdr->ctl_table_arg;
              unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
              if (!net_eq(net, &init_net))
                      kfree(table);
      }
      #else /* CONFIG_SYSCTL */
       static inline int xfrm4_net_sysctl_init(struct net *net)
      {
              return 0;
      }
      
       static inline void xfrm4_net_sysctl_exit(struct net *net)
      {
      }
      #endif
      
      static int __net_init xfrm4_net_init(struct net *net)
      {
   30         int ret;
      
              memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
   30                sizeof(xfrm4_dst_ops_template));
              ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
              if (ret)
   30                 return ret;
      
              ret = xfrm4_net_sysctl_init(net);
              if (ret)
                      dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
      
              return ret;
      }
      
      static void __net_exit xfrm4_net_exit(struct net *net)
      {
              xfrm4_net_sysctl_exit(net);
              dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
      }
      
      static struct pernet_operations __net_initdata xfrm4_net_ops = {
              .init        = xfrm4_net_init,
              .exit        = xfrm4_net_exit,
      };
      
      static void __init xfrm4_policy_init(void)
      {
              xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
      }
      
      void __init xfrm4_init(void)
      {
              xfrm4_state_init();
              xfrm4_policy_init();
              xfrm4_protocol_init();
              register_pernet_subsys(&xfrm4_net_ops);
      }
      
      #ifndef __LINUX_UACCESS_H__
      #define __LINUX_UACCESS_H__
      
      #include <linux/sched.h>
      
      #define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS)
      
      #include <asm/uaccess.h>
      
      static __always_inline void pagefault_disabled_inc(void)
      {
  102         current->pagefault_disabled++;
      }
      
      static __always_inline void pagefault_disabled_dec(void)
      {
              current->pagefault_disabled--;
   66         WARN_ON(current->pagefault_disabled < 0);
      }
      
      /*
       * These routines enable/disable the pagefault handler. If disabled, it will
       * not take any locks and go straight to the fixup table.
       *
       * User access methods will not sleep when called from a pagefault_disabled()
       * environment.
       */
      static inline void pagefault_disable(void)
      {
  102         pagefault_disabled_inc();
              /*
               * make sure to have issued the store before a pagefault
               * can hit.
               */
              barrier();
      }
      
      static inline void pagefault_enable(void)
      {
              /*
               * make sure to issue those last loads/stores before enabling
               * the pagefault handler again.
               */
 1822         barrier();
              pagefault_disabled_dec();
      }
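
       /*
        * Illustrative sketch (not part of this file): the canonical pairing
        * of pagefault_disable()/pagefault_enable() around an atomic user
        * access.  If the page is not resident the copy fails immediately
        * instead of sleeping, and the caller can fall back to a faulting
        * path.  Assumes the caller has already done access_ok(); the
        * function name is a placeholder.
        */
       static __always_inline int example_peek_user_word(const unsigned long __user *uaddr,
                                                         unsigned long *val)
       {
               unsigned long ret;

               pagefault_disable();
               ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
               pagefault_enable();

               return ret ? -EFAULT : 0;
       }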
      
      /*
       * Is the pagefault handler disabled? If so, user access methods will not sleep.
       */
      #define pagefault_disabled() (current->pagefault_disabled != 0)
      
      /*
       * The pagefault handler is in general disabled by pagefault_disable() or
       * when in irq context (via in_atomic()).
       *
       * This function should only be used by the fault handlers. Other users should
       * stick to pagefault_disabled().
        * Please NEVER use preempt_disable() to disable the fault handler. With
        * !CONFIG_PREEMPT_COUNT, this is like a NOP, so the handler won't be
        * reliably disabled. in_atomic() will report different values based on
        * !CONFIG_PREEMPT_COUNT.
       */
      #define faulthandler_disabled() (pagefault_disabled() || in_atomic())
      
      #ifndef ARCH_HAS_NOCACHE_UACCESS
      
      static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
                                      const void __user *from, unsigned long n)
      {
              return __copy_from_user_inatomic(to, from, n);
      }
      
      static inline unsigned long __copy_from_user_nocache(void *to,
                                      const void __user *from, unsigned long n)
      {
              return __copy_from_user(to, from, n);
      }
      
      #endif                /* ARCH_HAS_NOCACHE_UACCESS */
      
      /*
       * probe_kernel_read(): safely attempt to read from a location
       * @dst: pointer to the buffer that shall take the data
       * @src: address to read from
       * @size: size of the data chunk
       *
       * Safely read from address @src to the buffer at @dst.  If a kernel fault
       * happens, handle that and return -EFAULT.
       */
      extern long probe_kernel_read(void *dst, const void *src, size_t size);
      extern long __probe_kernel_read(void *dst, const void *src, size_t size);
      
      /*
       * probe_kernel_write(): safely attempt to write to a location
       * @dst: address to write to
       * @src: pointer to the data that shall be written
       * @size: size of the data chunk
       *
       * Safely write to address @dst from the buffer at @src.  If a kernel fault
       * happens, handle that and return -EFAULT.
       */
      extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
      extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
      
      extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
      
      /**
       * probe_kernel_address(): safely attempt to read from a location
       * @addr: address to read from
       * @retval: read into this variable
       *
       * Returns 0 on success, or -EFAULT.
       */
      #define probe_kernel_address(addr, retval)                \
              probe_kernel_read(&retval, addr, sizeof(retval))
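
       /*
        * Illustrative sketch (not part of this file): probe_kernel_address()
        * is typically used to peek at a possibly bogus kernel pointer
        * without risking an oops, e.g. when dumping nearby opcodes from a
        * fault handler.  Assumes the usual printk helpers are available; the
        * function name is a placeholder.
        */
       static inline void example_report_opcode(const unsigned char *ip)
       {
               unsigned char op;

               if (probe_kernel_address(ip, op))
                       pr_cont(" <bad address %p>", ip);
               else
                       pr_cont(" %02x", op);
       }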
      
      #ifndef user_access_begin
      #define user_access_begin() do { } while (0)
      #define user_access_end() do { } while (0)
      #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
      #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
      #endif
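
       /*
        * Illustrative sketch (not part of this file): the intended calling
        * pattern for the unsafe_*_user() helpers above.  A run of accesses
        * is bracketed by user_access_begin()/user_access_end() and branches
        * to a label on fault; access_ok() must be checked first.  The
        * function name is a placeholder.
        */
       static inline int example_get_user_pair(const u32 __user *uptr, u32 *a, u32 *b)
       {
               if (!access_ok(VERIFY_READ, uptr, 2 * sizeof(u32)))
                       return -EFAULT;

               user_access_begin();
               unsafe_get_user(*a, uptr, efault);
               unsafe_get_user(*b, uptr + 1, efault);
               user_access_end();
               return 0;

       efault:
               user_access_end();
               return -EFAULT;
       }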
      
      #endif                /* __LINUX_UACCESS_H__ */
      /*
       * 32bit compatibility wrappers for the input subsystem.
       *
       * Very heavily based on evdev.c - Copyright (c) 1999-2002 Vojtech Pavlik
       *
       * This program is free software; you can redistribute it and/or modify it
       * under the terms of the GNU General Public License version 2 as published by
       * the Free Software Foundation.
       */
      
      #include <linux/export.h>
      #include <asm/uaccess.h>
      #include "input-compat.h"
      
      #ifdef CONFIG_COMPAT
      
   81 int input_event_from_user(const char __user *buffer,
                                struct input_event *event)
      {
   81         if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
                      struct input_event_compat compat_event;
      
   81                 if (copy_from_user(&compat_event, buffer,
                                         sizeof(struct input_event_compat)))
    2                         return -EFAULT;
      
   81                 event->time.tv_sec = compat_event.time.tv_sec;
                      event->time.tv_usec = compat_event.time.tv_usec;
                      event->type = compat_event.type;
                      event->code = compat_event.code;
                      event->value = compat_event.value;
      
              } else {
                      if (copy_from_user(event, buffer, sizeof(struct input_event)))
   81                         return -EFAULT;
              }
      
              return 0;
      }
      
   10 int input_event_to_user(char __user *buffer,
                              const struct input_event *event)
      {
   10         if (INPUT_COMPAT_TEST && !COMPAT_USE_64BIT_TIME) {
                      struct input_event_compat compat_event;
      
   10                 compat_event.time.tv_sec = event->time.tv_sec;
                      compat_event.time.tv_usec = event->time.tv_usec;
                      compat_event.type = event->type;
                      compat_event.code = event->code;
                      compat_event.value = event->value;
      
   10                 if (copy_to_user(buffer, &compat_event,
                                       sizeof(struct input_event_compat)))
    2                         return -EFAULT;
      
              } else {
                      if (copy_to_user(buffer, event, sizeof(struct input_event)))
   10                         return -EFAULT;
              }
      
              return 0;
      }
      
    5 int input_ff_effect_from_user(const char __user *buffer, size_t size,
                                    struct ff_effect *effect)
      {
    5         if (INPUT_COMPAT_TEST) {
                      struct ff_effect_compat *compat_effect;
      
    5                 if (size != sizeof(struct ff_effect_compat))
                              return -EINVAL;
      
                      /*
                       * It so happens that the pointer which needs to be changed
                       * is the last field in the structure, so we can retrieve the
                       * whole thing and replace just the pointer.
                       */
                      compat_effect = (struct ff_effect_compat *)effect;
      
    4                 if (copy_from_user(compat_effect, buffer,
                                         sizeof(struct ff_effect_compat)))
                              return -EFAULT;
      
    3                 if (compat_effect->type == FF_PERIODIC &&
    2                     compat_effect->u.periodic.waveform == FF_CUSTOM)
    5                         effect->u.periodic.custom_data =
    1                                 compat_ptr(compat_effect->u.periodic.custom_data);
              } else {
                      if (size != sizeof(struct ff_effect))
                              return -EINVAL;
      
                      if (copy_from_user(effect, buffer, sizeof(struct ff_effect)))
                              return -EFAULT;
              }
      
              return 0;
      }
      
      #else
      
      int input_event_from_user(const char __user *buffer,
                               struct input_event *event)
      {
              if (copy_from_user(event, buffer, sizeof(struct input_event)))
                      return -EFAULT;
      
              return 0;
      }
      
      int input_event_to_user(char __user *buffer,
                              const struct input_event *event)
      {
              if (copy_to_user(buffer, event, sizeof(struct input_event)))
                      return -EFAULT;
      
              return 0;
      }
      
      int input_ff_effect_from_user(const char __user *buffer, size_t size,
                                    struct ff_effect *effect)
      {
              if (size != sizeof(struct ff_effect))
                      return -EINVAL;
      
              if (copy_from_user(effect, buffer, sizeof(struct ff_effect)))
                      return -EFAULT;
      
              return 0;
      }
      
      #endif /* CONFIG_COMPAT */
      
      EXPORT_SYMBOL_GPL(input_event_from_user);
      EXPORT_SYMBOL_GPL(input_event_to_user);
      EXPORT_SYMBOL_GPL(input_ff_effect_from_user);
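
       /*
        * Illustrative sketch (not part of this file): roughly how a character
        * device write() handler consumes events with input_event_from_user(),
        * which hides the 32-bit/64-bit layout difference handled above
        * (evdev.c contains the real thing).  input_event_size() comes from
        * input-compat.h; the other names are placeholders for the example.
        */
       static ssize_t example_event_write(struct file *file,
                                          const char __user *buffer,
                                          size_t count, loff_t *ppos)
       {
               struct input_event event;
               size_t consumed = 0;

               if (count != 0 && count < input_event_size())
                       return -EINVAL;

               while (consumed + input_event_size() <= count) {
                       if (input_event_from_user(buffer + consumed, &event))
                               return -EFAULT;
                       consumed += input_event_size();

                       /* hand 'event' to the device here */
               }

               return consumed;
       }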
      
      #include <linux/syscalls.h>
      #include <linux/compat.h>
      #include <linux/quotaops.h>
      
      /*
        * This code works only for 32-bit quota tools on a 64-bit OS (x86_64, ia64)
       * and is necessary due to alignment problems.
       */
      struct compat_if_dqblk {
              compat_u64 dqb_bhardlimit;
              compat_u64 dqb_bsoftlimit;
              compat_u64 dqb_curspace;
              compat_u64 dqb_ihardlimit;
              compat_u64 dqb_isoftlimit;
              compat_u64 dqb_curinodes;
              compat_u64 dqb_btime;
              compat_u64 dqb_itime;
              compat_uint_t dqb_valid;
      };
      
      /* XFS structures */
      struct compat_fs_qfilestat {
              compat_u64 dqb_bhardlimit;
              compat_u64 qfs_nblks;
              compat_uint_t qfs_nextents;
      };
      
      struct compat_fs_quota_stat {
              __s8                qs_version;
              __u16                qs_flags;
              __s8                qs_pad;
              struct compat_fs_qfilestat        qs_uquota;
              struct compat_fs_qfilestat        qs_gquota;
              compat_uint_t        qs_incoredqs;
              compat_int_t        qs_btimelimit;
              compat_int_t        qs_itimelimit;
              compat_int_t        qs_rtbtimelimit;
              __u16                qs_bwarnlimit;
              __u16                qs_iwarnlimit;
      };
      
      asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
                                                      qid_t id, void __user *addr)
      {
              unsigned int cmds;
              struct if_dqblk __user *dqblk;
              struct compat_if_dqblk __user *compat_dqblk;
              struct fs_quota_stat __user *fsqstat;
              struct compat_fs_quota_stat __user *compat_fsqstat;
              compat_uint_t data;
              u16 xdata;
              long ret;
      
   22         cmds = cmd >> SUBCMDSHIFT;
      
              switch (cmds) {
              case Q_GETQUOTA:
    1                 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
                      compat_dqblk = addr;
                      ret = sys_quotactl(cmd, special, id, dqblk);
                      if (ret)
                              break;
                      if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
                              get_user(data, &dqblk->dqb_valid) ||
                              put_user(data, &compat_dqblk->dqb_valid))
                              ret = -EFAULT;
                      break;
              case Q_SETQUOTA:
    2                 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
                      compat_dqblk = addr;
                      ret = -EFAULT;
                      if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
    1                         get_user(data, &compat_dqblk->dqb_valid) ||
    1                         put_user(data, &dqblk->dqb_valid))
                              break;
                      ret = sys_quotactl(cmd, special, id, dqblk);
                      break;
              case Q_XGETQSTAT:
                      fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
                      compat_fsqstat = addr;
                      ret = sys_quotactl(cmd, special, id, fsqstat);
                      if (ret)
                              break;
                      ret = -EFAULT;
                      /* Copying qs_version, qs_flags, qs_pad */
                      if (copy_in_user(compat_fsqstat, fsqstat,
                              offsetof(struct compat_fs_quota_stat, qs_uquota)))
                              break;
                      /* Copying qs_uquota */
                      if (copy_in_user(&compat_fsqstat->qs_uquota,
                              &fsqstat->qs_uquota,
                              sizeof(compat_fsqstat->qs_uquota)) ||
                              get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
                              put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
                              break;
                      /* Copying qs_gquota */
                      if (copy_in_user(&compat_fsqstat->qs_gquota,
                              &fsqstat->qs_gquota,
                              sizeof(compat_fsqstat->qs_gquota)) ||
                              get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
                              put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
                              break;
                      /* Copying the rest */
                      if (copy_in_user(&compat_fsqstat->qs_incoredqs,
                              &fsqstat->qs_incoredqs,
                              sizeof(struct compat_fs_quota_stat) -
                              offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
                              get_user(xdata, &fsqstat->qs_iwarnlimit) ||
                              put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
                              break;
                      ret = 0;
                      break;
              default:
   20                 ret = sys_quotactl(cmd, special, id, addr);
              }
   21         return ret;
      }
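
       /*
        * Illustrative userspace sketch (not part of this file): the
        * Q_GETQUOTA request whose answer the wrapper above repacks from the
        * native struct if_dqblk into struct compat_if_dqblk.  Assumes a libc
        * whose <sys/quota.h> exposes the kernel's struct if_dqblk; the
        * helper name is a placeholder.
        */
       #include <sys/quota.h>

       static int example_get_user_quota(const char *dev, int uid,
                                         struct if_dqblk *dq)
       {
               return quotactl(QCMD(Q_GETQUOTA, USRQUOTA), dev, uid, (caddr_t)dq);
       }
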
      /*
       *        Linux INET6 implementation
       *        Forwarding Information Database
       *
       *        Authors:
       *        Pedro Roque                <roque@di.fc.ul.pt>
       *
       *        This program is free software; you can redistribute it and/or
       *      modify it under the terms of the GNU General Public License
       *      as published by the Free Software Foundation; either version
       *      2 of the License, or (at your option) any later version.
       *
       *        Changes:
       *        Yuji SEKIYA @USAGI:        Support default route on router node;
       *                                remove ip6_null_entry from the top of
       *                                routing table.
       *        Ville Nuorvala:                Fixed routing subtrees.
       */
      
      #define pr_fmt(fmt) "IPv6: " fmt
      
      #include <linux/errno.h>
      #include <linux/types.h>
      #include <linux/net.h>
      #include <linux/route.h>
      #include <linux/netdevice.h>
      #include <linux/in6.h>
      #include <linux/init.h>
      #include <linux/list.h>
      #include <linux/slab.h>
      
      #include <net/ipv6.h>
      #include <net/ndisc.h>
      #include <net/addrconf.h>
      #include <net/lwtunnel.h>
      
      #include <net/ip6_fib.h>
      #include <net/ip6_route.h>
      
      #define RT6_DEBUG 2
      
      #if RT6_DEBUG >= 3
      #define RT6_TRACE(x...) pr_debug(x)
      #else
      #define RT6_TRACE(x...) do { ; } while (0)
      #endif
      
      static struct kmem_cache *fib6_node_kmem __read_mostly;
      
      struct fib6_cleaner {
              struct fib6_walker w;
              struct net *net;
              int (*func)(struct rt6_info *, void *arg);
              int sernum;
              void *arg;
      };
      
      static DEFINE_RWLOCK(fib6_walker_lock);
      
      #ifdef CONFIG_IPV6_SUBTREES
      #define FWS_INIT FWS_S
      #else
      #define FWS_INIT FWS_L
      #endif
      
      static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
      static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
      static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
      static int fib6_walk(struct fib6_walker *w);
      static int fib6_walk_continue(struct fib6_walker *w);
      
      /*
       *        A routing update causes an increase of the serial number on the
       *        affected subtree. This allows for cached routes to be asynchronously
       *        tested when modifications are made to the destination cache as a
       *        result of redirects, path MTU changes, etc.
       */
      
      static void fib6_gc_timer_cb(unsigned long arg);
      
      static LIST_HEAD(fib6_walkers);
      #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
      
      static void fib6_walker_link(struct fib6_walker *w)
      {
  235         write_lock_bh(&fib6_walker_lock);
  235         list_add(&w->lh, &fib6_walkers);
  235         write_unlock_bh(&fib6_walker_lock);
      }
      
      static void fib6_walker_unlink(struct fib6_walker *w)
      {
  235         write_lock_bh(&fib6_walker_lock);
  235         list_del(&w->lh);
              write_unlock_bh(&fib6_walker_lock);
      }
      
      static int fib6_new_sernum(struct net *net)
      {
              int new, old;
      
              do {
  148                 old = atomic_read(&net->ipv6.fib6_sernum);
  148                 new = old < INT_MAX ? old + 1 : 1;
  148         } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
                                      old, new) != old);
              return new;
      }
      
      enum {
              FIB6_NO_SERNUM_CHANGE = 0,
      };
      
      /*
       *        Auxiliary address test functions for the radix tree.
       *
        *        These assume a 32-bit processor (although they also work on
        *        64-bit processors).
       */
      
      /*
       *        test bit
       */
      #if defined(__LITTLE_ENDIAN)
      # define BITOP_BE32_SWIZZLE        (0x1F & ~7)
      #else
      # define BITOP_BE32_SWIZZLE        0
      #endif
      
      static __be32 addr_bit_set(const void *token, int fn_bit)
      {
              const __be32 *addr = token;
              /*
               * Here,
               *        1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
               * is optimized version of
               *        htonl(1 << ((~fn_bit)&0x1F))
               * See include/asm-generic/bitops/le.h.
               */
   20         return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
                     addr[fn_bit >> 5];
      }
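
       /*
        * Worked example (little endian): for fn_bit = 0, BITOP_BE32_SWIZZLE is
        * 0x18, so (~fn_bit ^ 0x18) & 0x1f = 7 and the mask is 1 << 7 = 0x80.
        * Stored as a little-endian 32-bit word, 0x80 sets bit 7 of byte 0,
        * i.e. the most significant bit of the first address byte -- exactly
        * what htonl(1 << 31) would have produced, but without the byte swap.
        */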
      
      static struct fib6_node *node_alloc(void)
      {
              struct fib6_node *fn;
      
   84         fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
      
              return fn;
      }
      
      static void node_free_immediate(struct fib6_node *fn)
      {
              kmem_cache_free(fib6_node_kmem, fn);
      }
      
      static void node_free_rcu(struct rcu_head *head)
      {
              struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
      
              kmem_cache_free(fib6_node_kmem, fn);
      }
      
      static void node_free(struct fib6_node *fn)
      {
              call_rcu(&fn->rcu, node_free_rcu);
      }
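
       /*
        * node_free_immediate() is only used for nodes that were never linked
        * into the tree (allocation-failure unwind paths), so no reader can
        * still see them.  node_free() defers the kmem_cache_free() through an
        * RCU grace period so that lockless readers traversing the tree can
        * finish with the node first.
        */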
      
      static void rt6_rcu_free(struct rt6_info *rt)
      {
   41         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
      }
      
      static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
      {
              int cpu;
      
   41         if (!non_pcpu_rt->rt6i_pcpu)
                      return;
      
   41         for_each_possible_cpu(cpu) {
                      struct rt6_info **ppcpu_rt;
                      struct rt6_info *pcpu_rt;
      
   41                 ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
                      pcpu_rt = *ppcpu_rt;
                      if (pcpu_rt) {
    5                         rt6_rcu_free(pcpu_rt);
                              *ppcpu_rt = NULL;
                      }
              }
      
   41         free_percpu(non_pcpu_rt->rt6i_pcpu);
              non_pcpu_rt->rt6i_pcpu = NULL;
      }
      
   41 static void rt6_release(struct rt6_info *rt)
      {
   42         if (atomic_dec_and_test(&rt->rt6i_ref)) {
   41                 rt6_free_pcpu(rt);
   41                 rt6_rcu_free(rt);
              }
   42 }
      
      static void fib6_free_table(struct fib6_table *table)
      {
              inetpeer_invalidate_tree(&table->tb6_peers);
              kfree(table);
      }
      
      static void fib6_link_table(struct net *net, struct fib6_table *tb)
      {
              unsigned int h;
      
              /*
                * Initialize the table lock at a single place to give lockdep a key;
                * tables aren't visible prior to being linked to the list.
               */
   29         rwlock_init(&tb->tb6_lock);
      
              h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
      
              /*
                * No protection necessary; this is the only list mutation
                * operation, and tables never disappear once they exist.
               */
              hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
   29 }
      
      #ifdef CONFIG_IPV6_MULTIPLE_TABLES
      
      static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
      {
              struct fib6_table *table;
      
    1         table = kzalloc(sizeof(*table), GFP_ATOMIC);
              if (table) {
    1                 table->tb6_id = id;
                      table->tb6_root.leaf = net->ipv6.ip6_null_entry;
                      table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
                      inet_peer_base_init(&table->tb6_peers);
              }
      
              return table;
      }
      
      struct fib6_table *fib6_new_table(struct net *net, u32 id)
      {
              struct fib6_table *tb;
      
   59         if (id == 0)
                      id = RT6_TABLE_MAIN;
   59         tb = fib6_get_table(net, id);
   59         if (tb)
                      return tb;
      
    1         tb = fib6_alloc_table(net, id);
              if (tb)
    1                 fib6_link_table(net, tb);
      
              return tb;
      }
      
      struct fib6_table *fib6_get_table(struct net *net, u32 id)
      {
              struct fib6_table *tb;
              struct hlist_head *head;
              unsigned int h;
      
  808         if (id == 0)
                      id = RT6_TABLE_MAIN;
              h = id & (FIB6_TABLE_HASHSZ - 1);
  808         rcu_read_lock();
  808         head = &net->ipv6.fib_table_hash[h];
  808         hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
  808                 if (tb->tb6_id == id) {
  807                         rcu_read_unlock();
                              return tb;
                      }
              }
    2         rcu_read_unlock();
      
  808         return NULL;
      }
      EXPORT_SYMBOL_GPL(fib6_get_table);
      
      static void __net_init fib6_tables_init(struct net *net)
      {
              fib6_link_table(net, net->ipv6.fib6_main_tbl);
   28         fib6_link_table(net, net->ipv6.fib6_local_tbl);
      }
      #else
      
      struct fib6_table *fib6_new_table(struct net *net, u32 id)
      {
              return fib6_get_table(net, id);
      }
      
      struct fib6_table *fib6_get_table(struct net *net, u32 id)
      {
                return net->ipv6.fib6_main_tbl;
      }
      
      struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                         int flags, pol_lookup_t lookup)
      {
              struct rt6_info *rt;
      
              rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
              if (rt->dst.error == -EAGAIN) {
                      ip6_rt_put(rt);
                      rt = net->ipv6.ip6_null_entry;
                      dst_hold(&rt->dst);
              }
      
              return &rt->dst;
      }
      
      static void __net_init fib6_tables_init(struct net *net)
      {
              fib6_link_table(net, net->ipv6.fib6_main_tbl);
      }
      
      #endif
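
       /*
        * Illustrative sketch (hypothetical caller, not a function in this
        * file): resolving a table by id and walking it under the per-table
        * lock looks roughly like this:
        *
        *	struct fib6_table *tb = fib6_get_table(net, RT6_TABLE_MAIN);
        *
        *	if (tb) {
        *		read_lock_bh(&tb->tb6_lock);
        *		... walk from tb->tb6_root ...
        *		read_unlock_bh(&tb->tb6_lock);
        *	}
        */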
      
      static int fib6_dump_node(struct fib6_walker *w)
      {
              int res;
              struct rt6_info *rt;
      
   10         for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
   10                 res = rt6_dump_route(rt, w->args);
                      if (res < 0) {
                              /* Frame is full, suspend walking */
    1                         w->leaf = rt;
                              return 1;
                      }
              }
   10         w->leaf = NULL;
   10         return 0;
      }
      
      static void fib6_dump_end(struct netlink_callback *cb)
      {
    9         struct fib6_walker *w = (void *)cb->args[2];
      
              if (w) {
                      if (cb->args[4]) {
                              cb->args[4] = 0;
                              fib6_walker_unlink(w);
                      }
    9                 cb->args[2] = 0;
                      kfree(w);
              }
    9         cb->done = (void *)cb->args[3];
              cb->args[1] = 3;
      }
      
      static int fib6_dump_done(struct netlink_callback *cb)
      {
              fib6_dump_end(cb);
              return cb->done ? cb->done(cb) : 0;
      }
      
      static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                                 struct netlink_callback *cb)
      {
              struct fib6_walker *w;
              int res;
      
   10         w = (void *)cb->args[2];
              w->root = &table->tb6_root;
      
              if (cb->args[4] == 0) {
   10                 w->count = 0;
                      w->skip = 0;
      
                      read_lock_bh(&table->tb6_lock);
                      res = fib6_walk(w);
                      read_unlock_bh(&table->tb6_lock);
                      if (res > 0) {
    1                         cb->args[4] = 1;
                              cb->args[5] = w->root->fn_sernum;
                      }
              } else {
                      if (cb->args[5] != w->root->fn_sernum) {
                              /* Begin at the root if the tree changed */
                              cb->args[5] = w->root->fn_sernum;
                              w->state = FWS_INIT;
                              w->node = w->root;
                              w->skip = w->count;
                      } else
                              w->skip = 0;
      
                      read_lock_bh(&table->tb6_lock);
                      res = fib6_walk_continue(w);
                      read_unlock_bh(&table->tb6_lock);
                      if (res <= 0) {
                              fib6_walker_unlink(w);
                              cb->args[4] = 0;
                      }
              }
      
              return res;
      }
      
      static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
      {
   10         struct net *net = sock_net(skb->sk);
              unsigned int h, s_h;
              unsigned int e = 0, s_e;
              struct rt6_rtnl_dump_arg arg;
              struct fib6_walker *w;
              struct fib6_table *tb;
              struct hlist_head *head;
              int res = 0;
      
              s_h = cb->args[0];
              s_e = cb->args[1];
      
    8         w = (void *)cb->args[2];
              if (!w) {
                      /* New dump:
                       *
                       * 1. hook callback destructor.
                       */
   10                 cb->args[3] = (long)cb->done;
                      cb->done = fib6_dump_done;
      
                      /*
                       * 2. allocate and initialize walker.
                       */
                      w = kzalloc(sizeof(*w), GFP_ATOMIC);
                      if (!w)
                              return -ENOMEM;
   10                 w->func = fib6_dump_node;
                      cb->args[2] = (long)w;
              }
      
   10         arg.skb = skb;
              arg.cb = cb;
              arg.net = net;
              w->args = &arg;
      
   10         rcu_read_lock();
   10         for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
                      e = 0;
   10                 head = &net->ipv6.fib_table_hash[h];
                      hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
   10                         if (e < s_e)
                                      goto next;
   10                         res = fib6_dump_table(tb, skb, cb);
   10                         if (res != 0)
                                      goto out;
      next:
   10                         e++;
                      }
              }
      out:
   10         rcu_read_unlock();
              cb->args[1] = e;
              cb->args[0] = h;
      
   10         res = res < 0 ? res : skb->len;
   10         if (res <= 0)
    9                 fib6_dump_end(cb);
              return res;
      }
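
       /*
        * For reference, the netlink callback scratch slots used by the dump
        * code above are:
        *   args[0] - current hash bucket in fib_table_hash
        *   args[1] - index of the current table within that bucket
        *   args[2] - pointer to the allocated fib6_walker
        *   args[3] - the original cb->done callback, restored by fib6_dump_end()
        *   args[4] - non-zero while a walk is suspended mid-table
        *   args[5] - fn_sernum of the root when the walk was suspended
        */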
      
      /*
       *        Routing Table
       *
       *        return the appropriate node for a routing tree "add" operation
       *        by either creating and inserting or by returning an existing
       *        node.
       */
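
       /*
        * Example: after inserting 2001:db8:1::/48 and then 2001:db8:2::/48,
        * the two prefixes first differ at bit 46, so fib6_add_1() creates an
        * intermediate split node with fn_bit = 46 (RTN_RTINFO off) whose left
        * child carries the 2001:db8:1::/48 leaf and whose right child carries
        * the 2001:db8:2::/48 leaf.
        */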
      
      static struct fib6_node *fib6_add_1(struct fib6_node *root,
                                           struct in6_addr *addr, int plen,
                                           int offset, int allow_create,
                                           int replace_required, int sernum)
      {
              struct fib6_node *fn, *in, *ln;
              struct fib6_node *pn = NULL;
              struct rt6key *key;
              int        bit;
              __be32        dir = 0;
      
              RT6_TRACE("fib6_add_1\n");
      
              /* insert node in tree */
      
              fn = root;
      
              do {
  127                 key = (struct rt6key *)((u8 *)fn->leaf + offset);
      
                      /*
                       *        Prefix match
                       */
                      if (plen < fn->fn_bit ||
  127                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
   83                         if (!allow_create) {
    1                                 if (replace_required) {
    2                                         pr_warn("Can't replace route, no match found\n");
                                              return ERR_PTR(-ENOENT);
                                      }
                                      pr_warn("NLM_F_CREATE should be set when creating new route\n");
                              }
                              goto insert_above;
                      }
      
                      /*
                       *        Exact match ?
                       */
      
  127                 if (plen == fn->fn_bit) {
                              /* clean up an intermediate node */
   57                         if (!(fn->fn_flags & RTN_RTINFO)) {
    1                                 rt6_release(fn->leaf);
                                      fn->leaf = NULL;
                              }
      
   57                         fn->fn_sernum = sernum;
      
                              return fn;
                      }
      
                      /*
                       *        We have more bits to go
                       */
      
                      /* Try to walk down on tree. */
  123                 fn->fn_sernum = sernum;
                      dir = addr_bit_set(addr, fn->fn_bit);
                      pn = fn;
                      fn = dir ? fn->right : fn->left;
  123         } while (fn);
      
   54         if (!allow_create) {
                /* We should not create a new node because
                 * NLM_F_REPLACE was specified without NLM_F_CREATE.
                 * I assume it is safe to require NLM_F_CREATE when the
                 * REPLACE flag is used!  Later we may want to remove the
                 * check for replace_required, because according to the
                 * netlink specification, NLM_F_CREATE MUST be specified
                 * when a new route is created.
                 * That would keep IPv6 consistent with IPv4.
                       */
    1                 if (replace_required) {
                              pr_warn("Can't replace route, no match found\n");
                              return ERR_PTR(-ENOENT);
                      }
                      pr_warn("NLM_F_CREATE should be set when creating new route\n");
              }
              /*
               *        We walked to the bottom of tree.
               *        Create new leaf node without children.
               */
      
   53         ln = node_alloc();
      
              if (!ln)
                      return ERR_PTR(-ENOMEM);
   53         ln->fn_bit = plen;
      
              ln->parent = pn;
              ln->fn_sernum = sernum;
      
              if (dir)
    1                 pn->right = ln;
              else
   52                 pn->left  = ln;
      
              return ln;
      
      
      insert_above:
              /*
         * split, since we no longer have a common prefix or we have a
         * less specific route.
         * We have to insert an intermediate node in the tree; this new
         * node will point to the node we need to create and to the
         * current one.
               */
      
   82         pn = fn->parent;
      
        /* Find the first bit of difference between the two addresses.

           See the comment in __ipv6_addr_diff: bit may be an invalid value,
           but if it is >= plen, the value is ignored in any case.
               */
      
   82         bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
      
              /*
               *                (intermediate)[in]
               *                  /           \
               *        (new leaf node)[ln] (old node)[fn]
               */
   82         if (plen > bit) {
   74                 in = node_alloc();
                      ln = node_alloc();
      
   74                 if (!in || !ln) {
                              if (in)
                                      node_free_immediate(in);
                              if (ln)
                                      node_free_immediate(ln);
                              return ERR_PTR(-ENOMEM);
                      }
      
                      /*
                 * New intermediate node.
                 * RTN_RTINFO will be off, since an address that chooses
                 * one of the branches would not match less specific
                 * routes in the other branch.
                       */
      
   74                 in->fn_bit = bit;
      
                      in->parent = pn;
                      in->leaf = fn->leaf;
                      atomic_inc(&in->leaf->rt6i_ref);
      
                      in->fn_sernum = sernum;
      
                      /* update parent pointer */
                      if (dir)
   59                         pn->right = in;
                      else
   69                         pn->left  = in;
      
   74                 ln->fn_bit = plen;
      
                      ln->parent = in;
                      fn->parent = in;
      
                      ln->fn_sernum = sernum;
      
                      if (addr_bit_set(addr, bit)) {
   61                         in->right = ln;
                              in->left  = fn;
                      } else {
   67                         in->left  = ln;
                              in->right = fn;
                      }
              } else { /* plen <= bit */
      
                      /*
                       *                (new leaf node)[ln]
                       *                  /           \
                       *             (old node)[fn] NULL
                       */
      
    8                 ln = node_alloc();
      
                      if (!ln)
                              return ERR_PTR(-ENOMEM);
      
    8                 ln->fn_bit = plen;
      
                      ln->parent = pn;
      
                      ln->fn_sernum = sernum;
      
                      if (dir)
    2                         pn->right = ln;
                      else
    6                         pn->left  = ln;
      
    8                 if (addr_bit_set(&key->addr, plen))
                              ln->right = fn;
                      else
    8                         ln->left  = fn;
      
    8                 fn->parent = ln;
              }
              return ln;
      }
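
       /*
        * Only routes that have RTF_GATEWAY set and neither RTF_ADDRCONF nor
        * RTF_DYNAMIC (i.e. static gateway routes) qualify for ECMP grouping.
        */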
      
      static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
      {
              return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
                     RTF_GATEWAY;
      }
      
      static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
      {
              int i;
      
    1         for (i = 0; i < RTAX_MAX; i++) {
    1                 if (test_bit(i, mxc->mx_valid))
    1                         mp[i] = mxc->mx[i];
              }
      }
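
       /*
        * If the dst has private (DST_HOST) metrics, copy the configured values
        * into them; otherwise install mxc->mx itself as the (shared) metrics
        * array and clear mxc->mx so the caller won't free it.
        */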
      
      static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
      {
  115         if (!mxc->mx)
                      return 0;
      
    5         if (dst->flags & DST_HOST) {
    1                 u32 *mp = dst_metrics_write_ptr(dst);
      
    1                 if (unlikely(!mp))
                              return -ENOMEM;
      
    1                 fib6_copy_metrics(mp, mxc);
              } else {
    4                 dst_init_metrics(dst, mxc->mx, false);
      
                      /* We've stolen mx now. */
                      mxc->mx = NULL;
              }
      
    1         return 0;
      }
      
      static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
                                struct net *net)
      {
   41         if (atomic_read(&rt->rt6i_ref) != 1) {
                /* This route is used as a dummy address holder in some split
                 * nodes.  It is not leaked, but it still holds other resources,
                 * which must be released in time.  So, scan the ancestor nodes
                 * and replace dummy references to this route with references
                 * to still-alive ones.
                       */
    3                 while (fn) {
    3                         if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
    3                                 fn->leaf = fib6_find_prefix(net, fn);
                                      atomic_inc(&fn->leaf->rt6i_ref);
                                      rt6_release(rt);
                              }
    3                         fn = fn->parent;
                      }
                      /* No more references are possible at this point. */
    3                 BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
              }
   41 }
      
      /*
       *        Insert routing information in a node.
       */
      
      static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
                                  struct nl_info *info, struct mx6_config *mxc)
      {
              struct rt6_info *iter = NULL;
              struct rt6_info **ins;
              struct rt6_info **fallback_ins = NULL;
  125         int replace = (info->nlh &&
   13                        (info->nlh->nlmsg_flags & NLM_F_REPLACE));
              int add = (!info->nlh ||
                         (info->nlh->nlmsg_flags & NLM_F_CREATE));
              int found = 0;
  125         bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
              int err;
      
              ins = &fn->leaf;
      
              for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
                      /*
                       *        Search for duplicates
                       */
      
   56                 if (iter->rt6i_metric == rt->rt6i_metric) {
                              /*
                               *        Same priority level
                               */
   52                         if (info->nlh &&
    9                             (info->nlh->nlmsg_flags & NLM_F_EXCL))
                                      return -EEXIST;
   51                         if (replace) {
    5                                 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
                                              found++;
                                              break;
                                      }
    1                                 if (rt_can_ecmp)
    1                                         fallback_ins = fallback_ins ?: ins;
                                      goto next_iter;
                              }
      
   46                         if (rt6_duplicate_nexthop(iter, rt)) {
   13                                 if (rt->rt6i_nsiblings)
                                              rt->rt6i_nsiblings = 0;
   13                                 if (!(iter->rt6i_flags & RTF_EXPIRES))
                                              return -EEXIST;
                                      if (!(rt->rt6i_flags & RTF_EXPIRES))
                                              rt6_clean_expires(iter);
                                      else
                                              rt6_set_expires(iter, rt->dst.expires);
                                      iter->rt6i_pmtu = rt->rt6i_pmtu;
                                      return -EEXIST;
                              }
                        /* If we have the same destination and the same metric,
                         * but not the same gateway, then the route we are trying
                         * to add is a sibling of this route; increment our
                         * sibling counter, and later we will add our route to
                         * the list.
                         * Only static routes (which don't have the RTF_EXPIRES
                         * flag) are used for ECMPv6.
                         *
                         * To keep the list short, we only add siblings if the
                         * route has a gateway.
                               */
   36                         if (rt_can_ecmp &&
                                  rt6_qualify_for_ecmp(iter))
                                      rt->rt6i_nsiblings++;
                      }
      
    9                 if (iter->rt6i_metric > rt->rt6i_metric)
                              break;
      
      next_iter:
   40                 ins = &iter->dst.rt6_next;
              }
      
   40         if (fallback_ins && !found) {
                      /* No ECMP-able route found, replace first non-ECMP one */
                      ins = fallback_ins;
    1                 iter = *ins;
                      found++;
              }
      
              /* Reset round-robin state, if necessary */
   44         if (ins == &fn->leaf)
   93                 fn->rr_ptr = NULL;
      
              /* Link this route to others same route. */
  119         if (rt->rt6i_nsiblings) {
                      unsigned int rt6i_nsiblings;
                      struct rt6_info *sibling, *temp_sibling;
      
                /* Find the first route that has the same metric */
                      sibling = fn->leaf;
                      while (sibling) {
                              if (sibling->rt6i_metric == rt->rt6i_metric &&
                                  rt6_qualify_for_ecmp(sibling)) {
                                      list_add_tail(&rt->rt6i_siblings,
                                                    &sibling->rt6i_siblings);
                                      break;
                              }
                              sibling = sibling->dst.rt6_next;
                      }
                /* For each sibling in the list, increment the counter of
                 * siblings.  BUG() if the counters do not match; the list
                 * of siblings is broken!
                       */
                      rt6i_nsiblings = 0;
                      list_for_each_entry_safe(sibling, temp_sibling,
                                               &rt->rt6i_siblings, rt6i_siblings) {
                              sibling->rt6i_nsiblings++;
                              BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
                              rt6i_nsiblings++;
                      }
                      BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
              }
      
              /*
               *        insert node
               */
  119         if (!replace) {
  110                 if (!add)
                              pr_warn("NLM_F_CREATE should be set when creating new route\n");
      
      add:
  111                 err = fib6_commit_metrics(&rt->dst, mxc);
                      if (err)
                              return err;
      
  111                 rt->dst.rt6_next = iter;
                      *ins = rt;
                      rcu_assign_pointer(rt->rt6i_node, fn);
                      atomic_inc(&rt->rt6i_ref);
                      inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);
                      info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
      
                      if (!(fn->fn_flags & RTN_RTINFO)) {
   84                         info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                              fn->fn_flags |= RTN_RTINFO;
                      }
      
              } else {
                      int nsiblings;
      
    9                 if (!found) {
    5                         if (add)
                                      goto add;
    4                         pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
                              return -ENOENT;
                      }
      
    5                 err = fib6_commit_metrics(&rt->dst, mxc);
                      if (err)
                              return err;
      
    5                 *ins = rt;
                      rcu_assign_pointer(rt->rt6i_node, fn);
                      rt->dst.rt6_next = iter->dst.rt6_next;
                      atomic_inc(&rt->rt6i_ref);
                      inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                      if (!(fn->fn_flags & RTN_RTINFO)) {
                              info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                              fn->fn_flags |= RTN_RTINFO;
                      }
    5                 nsiblings = iter->rt6i_nsiblings;
                      iter->rt6i_node = NULL;
                      fib6_purge_rt(iter, fn, info->nl_net);
                      if (fn->rr_ptr == iter)
                              fn->rr_ptr = NULL;
    5                 rt6_release(iter);
      
                      if (nsiblings) {
                              /* Replacing an ECMP route, remove all siblings */
                              ins = &rt->dst.rt6_next;
                              iter = *ins;
                              while (iter) {
                                      if (iter->rt6i_metric > rt->rt6i_metric)
                                              break;
                                      if (rt6_qualify_for_ecmp(iter)) {
                                              *ins = iter->dst.rt6_next;
                                              iter->rt6i_node = NULL;
                                              fib6_purge_rt(iter, fn, info->nl_net);
                                              if (fn->rr_ptr == iter)
                                                      fn->rr_ptr = NULL;
                                              rt6_release(iter);
                                              nsiblings--;
                                      } else {
                                              ins = &iter->dst.rt6_next;
                                      }
                                      iter = *ins;
                              }
                              WARN_ON(nsiblings != 0);
                      }
              }
      
              return 0;
      }
      
      static void fib6_start_gc(struct net *net, struct rt6_info *rt)
      {
              if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
  100             (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
                      mod_timer(&net->ipv6.ip6_fib_timer,
                                jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
      }
      
      void fib6_force_start_gc(struct net *net)
      {
   21         if (!timer_pending(&net->ipv6.ip6_fib_timer))
                      mod_timer(&net->ipv6.ip6_fib_timer,
   12                           jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
   21 }
      
      /*
       *        Add routing information to the routing tree.
       *        <destination addr>/<source addr>
       *        with source addr info in sub-trees
       */
      
      int fib6_add(struct fib6_node *root, struct rt6_info *rt,
                   struct nl_info *info, struct mx6_config *mxc)
      {
              struct fib6_node *fn, *pn = NULL;
              int err = -ENOMEM;
              int allow_create = 1;
              int replace_required = 0;
  127         int sernum = fib6_new_sernum(info->nl_net);
      
  127         if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) &&
                               !atomic_read(&rt->dst.__refcnt)))
                      return -EINVAL;
      
  127         if (info->nlh) {
   15                 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                              allow_create = 0;
                      if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                              replace_required = 1;
              }
    3         if (!allow_create && !replace_required)
    2                 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
      
  127         fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
                              offsetof(struct rt6_info, rt6i_dst), allow_create,
                              replace_required, sernum);
  125         if (IS_ERR(fn)) {
                      err = PTR_ERR(fn);
                      fn = NULL;
                      goto out;
              }
      
              pn = fn;
      
      #ifdef CONFIG_IPV6_SUBTREES
              if (rt->rt6i_src.plen) {
                      struct fib6_node *sn;
      
                      if (!fn->subtree) {
                              struct fib6_node *sfn;
      
                              /*
                               * Create subtree.
                               *
                               *                fn[main tree]
                               *                |
                               *                sfn[subtree root]
                               *                   \
                               *                    sn[new leaf node]
                               */
      
                              /* Create subtree root node */
                              sfn = node_alloc();
                              if (!sfn)
                                      goto failure;
      
                              sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
                              atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
                              sfn->fn_flags = RTN_ROOT;
                              sfn->fn_sernum = sernum;
      
                              /* Now add the first leaf node to new subtree */
      
                              sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
                                              rt->rt6i_src.plen,
                                              offsetof(struct rt6_info, rt6i_src),
                                              allow_create, replace_required, sernum);
      
                              if (IS_ERR(sn)) {
                              /* If it failed, discard the just-allocated
                                 subtree root, and then (at the failure label)
                                 the stale node in the main tree.
                                       */
                                      node_free_immediate(sfn);
                                      err = PTR_ERR(sn);
                                      goto failure;
                              }
      
                              /* Now link new subtree to main tree */
                              sfn->parent = fn;
                              fn->subtree = sfn;
                      } else {
                              sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
                                              rt->rt6i_src.plen,
                                              offsetof(struct rt6_info, rt6i_src),
                                              allow_create, replace_required, sernum);
      
                              if (IS_ERR(sn)) {
                                      err = PTR_ERR(sn);
                                      goto failure;
                              }
                      }
      
                      if (!fn->leaf) {
                              fn->leaf = rt;
                              atomic_inc(&rt->rt6i_ref);
                      }
                      fn = sn;
              }
      #endif
      
  125         err = fib6_add_rt2node(fn, rt, info, mxc);
              if (!err) {
  115                 fib6_start_gc(info->nl_net, rt);
  115                 if (!(rt->rt6i_flags & RTF_CACHE))
  115                         fib6_prune_clones(info->nl_net, pn);
  115                 rt->dst.flags &= ~DST_NOCACHE;
              }
      
      out:
              if (err) {
      #ifdef CONFIG_IPV6_SUBTREES
                      /*
                       * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node, we have to find a new one for it.
                       */
                      if (pn != fn && pn->leaf == rt) {
                              pn->leaf = NULL;
                              atomic_dec(&rt->rt6i_ref);
                      }
                      if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
                              pn->leaf = fib6_find_prefix(info->nl_net, pn);
      #if RT6_DEBUG >= 2
                              if (!pn->leaf) {
                                      WARN_ON(pn->leaf == NULL);
                                      pn->leaf = info->nl_net->ipv6.ip6_null_entry;
                              }
      #endif
                              atomic_inc(&pn->leaf->rt6i_ref);
                      }
      #endif
                      goto failure;
              }
              return err;
      
      failure:
        /* fn->leaf could be NULL if fn is an intermediate node and we
         * failed to add the new route to it, both in the subtree-creation
         * failure case and in the fib6_add_rt2node() failure case.
               * In both cases, fib6_repair_tree() should be called to fix
               * fn->leaf.
               */
   18         if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
    1                 fib6_repair_tree(info->nl_net, fn);
   20         if (!(rt->dst.flags & DST_NOCACHE))
   20                 dst_free(&rt->dst);
              return err;
      }
      
      /*
       *        Routing tree lookup
       *
       */
      
      struct lookup_args {
              int                        offset;                /* key offset on rt6_info        */
              const struct in6_addr        *addr;                /* search key                        */
      };
      
      static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
                                             struct lookup_args *args)
      {
              struct fib6_node *fn;
              __be32 dir;
      
              if (unlikely(args->offset == 0))
                      return NULL;
      
              /*
               *        Descend on a tree
               */
      
              fn = root;
      
              for (;;) {
                      struct fib6_node *next;
      
  686                 dir = addr_bit_set(args->addr, fn->fn_bit);
      
                      next = dir ? fn->right : fn->left;
      
  686                 if (next) {
                              fn = next;
                              continue;
                      }
                      break;
              }
      
  686         while (fn) {
  686                 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
                              struct rt6key *key;
      
  686                         key = (struct rt6key *) ((u8 *) fn->leaf +
                                                       args->offset);
      
  686                         if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
      #ifdef CONFIG_IPV6_SUBTREES
                                      if (fn->subtree) {
                                              struct fib6_node *sfn;
                                              sfn = fib6_lookup_1(fn->subtree,
                                                                  args + 1);
                                              if (!sfn)
                                                      goto backtrack;
                                              fn = sfn;
                                      }
      #endif
                                      if (fn->fn_flags & RTN_RTINFO)
                                              return fn;
                              }
                      }
      #ifdef CONFIG_IPV6_SUBTREES
      backtrack:
      #endif
  207                 if (fn->fn_flags & RTN_ROOT)
                              break;
      
  207                 fn = fn->parent;
              }
      
              return NULL;
      }
      
      struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
                                    const struct in6_addr *saddr)
      {
              struct fib6_node *fn;
              struct lookup_args args[] = {
                      {
                              .offset = offsetof(struct rt6_info, rt6i_dst),
                              .addr = daddr,
                      },
      #ifdef CONFIG_IPV6_SUBTREES
                      {
                              .offset = offsetof(struct rt6_info, rt6i_src),
                              .addr = saddr,
                      },
      #endif
                      {
                              .offset = 0,        /* sentinel */
                      }
              };
      
  686         fn = fib6_lookup_1(root, daddr ? args : args + 1);
  686         if (!fn || fn->fn_flags & RTN_TL_ROOT)
                      fn = root;
      
  686         return fn;
      }
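
       /*
        * Illustrative sketch (hypothetical caller): a longest-prefix lookup
        * under the table lock.  fib6_lookup() never returns NULL -- it falls
        * back to the root node -- so the caller only has to scan the leaf
        * chain of the returned node:
        *
        *	read_lock_bh(&table->tb6_lock);
        *	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        *	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next)
        *		... pick a route, e.g. by metric ...
        *	read_unlock_bh(&table->tb6_lock);
        */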
      
      /*
       *        Get node with specified destination prefix (and source prefix,
       *        if subtrees are used)
       */
      
      
      static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                             const struct in6_addr *addr,
                                             int plen, int offset)
      {
              struct fib6_node *fn;
      
   15         for (fn = root; fn ; ) {
   15                 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
      
                      /*
                       *        Prefix match
                       */
                      if (plen < fn->fn_bit ||
   15                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
                              return NULL;
      
   15                 if (plen == fn->fn_bit)
                              return fn;
      
                      /*
                       *        We have more bits to go
                       */
   13                 if (addr_bit_set(addr, fn->fn_bit))
                              fn = fn->right;
                      else
                              fn = fn->left;
              }
              return NULL;
      }
      
      struct fib6_node *fib6_locate(struct fib6_node *root,
                                    const struct in6_addr *daddr, int dst_len,
                                    const struct in6_addr *saddr, int src_len)
      {
              struct fib6_node *fn;
      
   15         fn = fib6_locate_1(root, daddr, dst_len,
                                 offsetof(struct rt6_info, rt6i_dst));
      
      #ifdef CONFIG_IPV6_SUBTREES
              if (src_len) {
                      WARN_ON(saddr == NULL);
                      if (fn && fn->subtree)
                              fn = fib6_locate_1(fn->subtree, saddr, src_len,
                                                 offsetof(struct rt6_info, rt6i_src));
              }
      #endif
      
   10         if (fn && fn->fn_flags & RTN_RTINFO)
                      return fn;
      
   15         return NULL;
      }
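
       /*
        * Unlike fib6_lookup(), fib6_locate() is an exact-prefix match: it only
        * returns a node whose fn_bit equals the requested prefix length and
        * which actually carries route information (RTN_RTINFO); otherwise it
        * returns NULL.
        */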
      
      
      /*
       *        Deletion
       *
       */
      
      static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
      {
   19         if (fn->fn_flags & RTN_ROOT)
                      return net->ipv6.ip6_null_entry;
      
   19         while (fn) {
    3                 if (fn->left)
   19                         return fn->left->leaf;
                      if (fn->right)
                              return fn->right->leaf;
      
                      fn = FIB6_SUBTREE(fn);
              }
              return NULL;
      }
      
      /*
       *        Called to trim the tree of intermediate nodes when possible. "fn"
       *        is the node we want to try and remove.
       */
      
      static struct fib6_node *fib6_repair_tree(struct net *net,
                                                 struct fib6_node *fn)
      {
              int children;
              int nstate;
              struct fib6_node *child, *pn;
              struct fib6_walker *w;
              int iter = 0;
      
              for (;;) {
                      RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                      iter++;
      
   29                 WARN_ON(fn->fn_flags & RTN_RTINFO);
   29                 WARN_ON(fn->fn_flags & RTN_TL_ROOT);
   29                 WARN_ON(fn->leaf);
      
                      children = 0;
                      child = NULL;
   29                 if (fn->right)
                              child = fn->right, children |= 1;
   29                 if (fn->left)
                              child = fn->left, children |= 2;
      
                      if (children == 3 || FIB6_SUBTREE(fn)
      #ifdef CONFIG_IPV6_SUBTREES
                          /* Subtree root (i.e. fn) may have one child */
                          || (children && fn->fn_flags & RTN_ROOT)
      #endif
                          ) {
   19                         fn->leaf = fib6_find_prefix(net, fn);
      #if RT6_DEBUG >= 2
                              if (!fn->leaf) {
                                      WARN_ON(!fn->leaf);
                                      fn->leaf = net->ipv6.ip6_null_entry;
                              }
      #endif
   19                         atomic_inc(&fn->leaf->rt6i_ref);
   29                         return fn->parent;
                      }
      
   28                 pn = fn->parent;
      #ifdef CONFIG_IPV6_SUBTREES
                      if (FIB6_SUBTREE(pn) == fn) {
                              WARN_ON(!(fn->fn_flags & RTN_ROOT));
                              FIB6_SUBTREE(pn) = NULL;
                              nstate = FWS_L;
                      } else {
                              WARN_ON(fn->fn_flags & RTN_ROOT);
      #endif
                              if (pn->right == fn)
   23                                 pn->right = child;
   21                         else if (pn->left == fn)
   21                                 pn->left = child;
      #if RT6_DEBUG >= 2
                              else
                                      WARN_ON(1);
      #endif
   28                         if (child)
   28                                 child->parent = pn;
                              nstate = FWS_R;
      #ifdef CONFIG_IPV6_SUBTREES
                      }
      #endif
      
   28                 read_lock(&fib6_walker_lock);
    6                 FOR_WALKERS(w) {
    6                         if (!child) {
    3                                 if (w->root == fn) {
                                              w->root = w->node = NULL;
                                              RT6_TRACE("W %p adjusted by delroot 1\n", w);
    3                                 } else if (w->node == fn) {
                                              RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
    3                                         w->node = pn;
                                              w->state = nstate;
                                      }
                              } else {
    6                                 if (w->root == fn) {
                                              w->root = child;
                                              RT6_TRACE("W %p adjusted by delroot 2\n", w);
                                      }
    6                                 if (w->node == fn) {
    6                                         w->node = child;
                                              if (children&2) {
                                                      RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
    3                                                 w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
                                              } else {
                                                      RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
    3                                                 w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
                                              }
                                      }
                              }
                      }
   28                 read_unlock(&fib6_walker_lock);
      
                      node_free(fn);
                      if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                              return pn;
      
   24                 rt6_release(pn->leaf);
                      pn->leaf = NULL;
                      fn = pn;
              }
      }
      
      static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
                                 struct nl_info *info)
      {
              struct fib6_walker *w;
              struct rt6_info *rt = *rtp;
   37         struct net *net = info->nl_net;
      
              RT6_TRACE("fib6_del_route\n");
      
              /* Unlink it */
              *rtp = rt->dst.rt6_next;
              rt->rt6i_node = NULL;
              net->ipv6.rt6_stats->fib_rt_entries--;
              net->ipv6.rt6_stats->fib_discarded_routes++;
      
              /* Reset round-robin state, if necessary */
              if (fn->rr_ptr == rt)
    5                 fn->rr_ptr = NULL;
      
              /* Remove this entry from other siblings */
   37         if (rt->rt6i_nsiblings) {
                      struct rt6_info *sibling, *next_sibling;
      
                      list_for_each_entry_safe(sibling, next_sibling,
                                               &rt->rt6i_siblings, rt6i_siblings)
                              sibling->rt6i_nsiblings--;
                      rt->rt6i_nsiblings = 0;
                      list_del_init(&rt->rt6i_siblings);
              }
      
              /* Adjust walkers */
   37         read_lock(&fib6_walker_lock);
   19         FOR_WALKERS(w) {
   19                 if (w->state == FWS_C && w->leaf == rt) {
                              RT6_TRACE("walker %p adjusted by delroute\n", w);
   19                         w->leaf = rt->dst.rt6_next;
                              if (!w->leaf)
   18                                 w->state = FWS_U;
                      }
              }
   37         read_unlock(&fib6_walker_lock);
      
              rt->dst.rt6_next = NULL;
      
               /* If it was the last route, expunge its radix tree node */
              if (!fn->leaf) {
   28                 fn->fn_flags &= ~RTN_RTINFO;
                      net->ipv6.rt6_stats->fib_route_nodes--;
                      fn = fib6_repair_tree(net, fn);
              }
      
   37         fib6_purge_rt(rt, fn, net);
      
              inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
              rt6_release(rt);
      }
      
      int fib6_del(struct rt6_info *rt, struct nl_info *info)
      {
   37         struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
                                          lockdep_is_held(&rt->rt6i_table->tb6_lock));
              struct net *net = info->nl_net;
              struct rt6_info **rtp;
      
      #if RT6_DEBUG >= 2
              if (rt->dst.obsolete > 0) {
                      WARN_ON(fn);
   37                 return -ENOENT;
              }
      #endif
   37         if (!fn || rt == net->ipv6.ip6_null_entry)
                      return -ENOENT;
      
   37         WARN_ON(!(fn->fn_flags & RTN_RTINFO));
      
   37         if (!(rt->rt6i_flags & RTF_CACHE)) {
                      struct fib6_node *pn = fn;
      #ifdef CONFIG_IPV6_SUBTREES
                      /* clones of this route might be in another subtree */
                      if (rt->rt6i_src.plen) {
                              while (!(pn->fn_flags & RTN_ROOT))
                                      pn = pn->parent;
                              pn = pn->parent;
                      }
      #endif
   35                 fib6_prune_clones(info->nl_net, pn);
              }
      
              /*
        *        Walk the leaf entries looking for ourselves
               */
      
   37         for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
   37                 if (*rtp == rt) {
   37                         fib6_del_route(fn, rtp, info);
                              return 0;
                      }
              }
              return -ENOENT;
      }
      
      /*
       *        Tree traversal function.
       *
        *        It is certainly not interrupt safe.
        *        However, it is internally reentrant with respect to itself and to
        *        fib6_add/fib6_del.  This means that we can modify the tree while
        *        walking it, and use this function for garbage collection, clone
        *        pruning, cleaning the tree when a device goes down, etc.
       *
       *        It guarantees that every node will be traversed,
       *        and that it will be traversed only once.
       *
       *        Callback function w->func may return:
       *        0 -> continue walking.
        *        positive value -> walking is suspended (used by tree dumps,
        *        and possibly by gc, if it is ever split into several slices)
       *        negative value -> terminate walking.
       *
       *        The function itself returns:
       *        0   -> walk is complete.
       *        >0  -> walk is incomplete (i.e. suspended)
       *        <0  -> walk is terminated by an error.
       */
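
       /*
        * Walker states used below:
        *   FWS_S - descend into the node's source subtree (CONFIG_IPV6_SUBTREES)
        *   FWS_L - try the left child
        *   FWS_R - try the right child
        *   FWS_C - run the callback on the node's own leaf chain
        *   FWS_U - go back up to the parent
        */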
      
      static int fib6_walk_continue(struct fib6_walker *w)
  235 {
              struct fib6_node *fn, *pn;
      
              for (;;) {
                      fn = w->node;
  235                 if (!fn)
                              return 0;
      
  235                 if (w->prune && fn != w->root &&
   48                     fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
   48                         w->state = FWS_C;
                              w->leaf = fn->leaf;
                      }
  235                 switch (w->state) {
      #ifdef CONFIG_IPV6_SUBTREES
                      case FWS_S:
                              if (FIB6_SUBTREE(fn)) {
                                      w->node = FIB6_SUBTREE(fn);
                                      continue;
                              }
                              w->state = FWS_L;
      #endif
                      case FWS_L:
  235                         if (fn->left) {
  157                                 w->node = fn->left;
                                      w->state = FWS_INIT;
                                      continue;
                              }
  227                         w->state = FWS_R;
                      case FWS_R:
  235                         if (fn->right) {
  132                                 w->node = fn->right;
                                      w->state = FWS_INIT;
                                      continue;
                              }
  233                         w->state = FWS_C;
                              w->leaf = fn->leaf;
                      case FWS_C:
  235                         if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                      int err;
      
  235                                 if (w->skip) {
                                              w->skip--;
                                              goto skip;
                                      }
      
  235                                 err = w->func(w);
                                      if (err)
                                              return err;
      
  235                                 w->count++;
                                      continue;
                              }
      skip:
  235                         w->state = FWS_U;
                      case FWS_U:
  235                         if (fn == w->root)
                                      return 0;
  157                         pn = fn->parent;
                              w->node = pn;
      #ifdef CONFIG_IPV6_SUBTREES
                              if (FIB6_SUBTREE(pn) == fn) {
                                      WARN_ON(!(fn->fn_flags & RTN_ROOT));
                                      w->state = FWS_L;
                                      continue;
                              }
      #endif
                              if (pn->left == fn) {
  157                                 w->state = FWS_R;
                                      continue;
                              }
  132                         if (pn->right == fn) {
  132                                 w->state = FWS_C;
                                      w->leaf = w->node->leaf;
                                      continue;
                              }
      #if RT6_DEBUG >= 2
                              WARN_ON(1);
      #endif
                      }
              }
      }
      
      static int fib6_walk(struct fib6_walker *w)
      {
              int res;
      
  230         w->state = FWS_INIT;
              w->node = w->root;
      
              fib6_walker_link(w);
              res = fib6_walk_continue(w);
              if (res <= 0)
  230                 fib6_walker_unlink(w);
  230         return res;
      }
      
      static int fib6_clean_node(struct fib6_walker *w)
      {
              int res;
              struct rt6_info *rt;
              struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
  220         struct nl_info info = {
                      .nl_net = c->net,
              };
      
              if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
   34             w->node->fn_sernum != c->sernum)
   34                 w->node->fn_sernum = c->sernum;
      
  210         if (!c->func) {
   34                 WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
  220                 w->leaf = NULL;
  220                 return 0;
              }
      
  210         for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
  210                 res = c->func(rt, c->arg);
                      if (res < 0) {
   19                         w->leaf = rt;
                              res = fib6_del(rt, &info);
                              if (res) {
      #if RT6_DEBUG >= 2
                                      pr_debug("%s: del failed: rt=%p@%p err=%d\n",
                                               __func__, rt,
                                               rcu_access_pointer(rt->rt6i_node),
                                               res);
      #endif
                                      continue;
                              }
                              return 0;
                      }
  210                 WARN_ON(res != 0);
              }
              w->leaf = rt;
              return 0;
      }
      
      /*
       *        Convenient frontend to tree walker.
       *
       *        func is called on each route.
       *                It may return -1 -> delete this route.
       *                              0  -> continue walking
       *
        *        prune==1 -> only the immediate children of the node (pure
        *        split nodes are ignored) will be scanned.
       */
      
      static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                                  int (*func)(struct rt6_info *, void *arg),
                                  bool prune, int sernum, void *arg)
      {
              struct fib6_cleaner c;
      
  220         c.w.root = root;
              c.w.func = fib6_clean_node;
              c.w.prune = prune;
              c.w.count = 0;
              c.w.skip = 0;
              c.func = func;
              c.sernum = sernum;
              c.arg = arg;
              c.net = net;
      
              fib6_walk(&c.w);
      }
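
       /*
        * Hypothetical illustration only (not part of this file): a cleaner
        * callback in the style described above.  Returning -1 asks
        * fib6_clean_node() to delete the route, 0 keeps walking;
        * fib6_prune_clone() and fib6_age() below are the in-tree users of
        * the same pattern.  The callback name and the device argument are
        * assumptions; it would be passed as
        * fib6_clean_all(net, demo_clean_rt_dev, dev).
        */
       static int demo_clean_rt_dev(struct rt6_info *rt, void *arg)
       {
               const struct net_device *dev = arg;

               if (rt->dst.dev == dev)
                       return -1;        /* ask the walker to delete this route */

               return 0;                /* keep walking */
       }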
      
      static void __fib6_clean_all(struct net *net,
                                   int (*func)(struct rt6_info *, void *),
                                   int sernum, void *arg)
      {
              struct fib6_table *table;
              struct hlist_head *head;
              unsigned int h;
      
  114         rcu_read_lock();
  114         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
  114                 head = &net->ipv6.fib_table_hash[h];
  114                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
  114                         write_lock_bh(&table->tb6_lock);
                              fib6_clean_tree(net, &table->tb6_root,
                                              func, false, sernum, arg);
                              write_unlock_bh(&table->tb6_lock);
                      }
              }
  114         rcu_read_unlock();
      }
      
      void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
                          void *arg)
      {
   91         __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
      }
      
      static int fib6_prune_clone(struct rt6_info *rt, void *arg)
      {
  133         if (rt->rt6i_flags & RTF_CACHE) {
                      RT6_TRACE("pruning clone %p\n", rt);
                      return -1;
              }
      
              return 0;
      }
      
      static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
      {
              fib6_clean_tree(net, fn, fib6_prune_clone, true,
                              FIB6_NO_SERNUM_CHANGE, NULL);
      }
      
      static void fib6_flush_trees(struct net *net)
      {
   34         int new_sernum = fib6_new_sernum(net);
      
   34         __fib6_clean_all(net, NULL, new_sernum, NULL);
      }
      
      /*
       *        Garbage collection
       */
      
      static struct fib6_gc_args
      {
              int                        timeout;
              int                        more;
      } gc_args;
      
      static int fib6_age(struct rt6_info *rt, void *arg)
      {
   13         unsigned long now = jiffies;
      
              /*
               *        check addrconf expiration here.
               *        Routes are expired even if they are in use.
               *
        *        Also age clones. Note that clones are aged out
               *        only if they are not in use now.
               */
      
              if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
                      if (time_after(now, rt->dst.expires)) {
                              RT6_TRACE("expiring %p\n", rt);
                              return -1;
                      }
                      gc_args.more++;
   13         } else if (rt->rt6i_flags & RTF_CACHE) {
    1                 if (atomic_read(&rt->dst.__refcnt) == 0 &&
    1                     time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
                              RT6_TRACE("aging clone %p\n", rt);
                              return -1;
    1                 } else if (rt->rt6i_flags & RTF_GATEWAY) {
                              struct neighbour *neigh;
                              __u8 neigh_flags = 0;
      
    1                         neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
    1                         if (neigh) {
    1                                 neigh_flags = neigh->flags;
    1                                 neigh_release(neigh);
                              }
    1                         if (!(neigh_flags & NTF_ROUTER)) {
                                      RT6_TRACE("purging route %p via non-router but gateway\n",
                                                rt);
                                      return -1;
                              }
                      }
   13                 gc_args.more++;
              }
      
              return 0;
      }
      
      static DEFINE_SPINLOCK(fib6_gc_lock);
      
      void fib6_run_gc(unsigned long expires, struct net *net, bool force)
   13 {
              unsigned long now;
      
              if (force) {
                      spin_lock_bh(&fib6_gc_lock);
   13         } else if (!spin_trylock_bh(&fib6_gc_lock)) {
                      mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
                      return;
              }
   13         gc_args.timeout = expires ? (int)expires :
                                net->ipv6.sysctl.ip6_rt_gc_interval;
      
              gc_args.more = icmp6_dst_gc();
      
              fib6_clean_all(net, fib6_age, NULL);
              now = jiffies;
              net->ipv6.ip6_rt_last_gc = now;
      
              if (gc_args.more)
                      mod_timer(&net->ipv6.ip6_fib_timer,
                                round_jiffies(now
                                              + net->ipv6.sysctl.ip6_rt_gc_interval));
              else
   13                 del_timer(&net->ipv6.ip6_fib_timer);
   13         spin_unlock_bh(&fib6_gc_lock);
      }
      
      static void fib6_gc_timer_cb(unsigned long arg)
      {
              fib6_run_gc(0, (struct net *)arg, true);
      }
      
      static int __net_init fib6_net_init(struct net *net)
      {
              size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
      
   28         setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
      
              net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
              if (!net->ipv6.rt6_stats)
                      goto out_timer;
      
               /* Avoid false sharing: use at least a full cache line */
              size = max_t(size_t, size, L1_CACHE_BYTES);
      
   28         net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
              if (!net->ipv6.fib_table_hash)
                      goto out_rt6_stats;
      
   28         net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
                                                GFP_KERNEL);
              if (!net->ipv6.fib6_main_tbl)
                      goto out_fib_table_hash;
      
   28         net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
              net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
              net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                      RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
              inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
      
      #ifdef CONFIG_IPV6_MULTIPLE_TABLES
              net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
                                                 GFP_KERNEL);
              if (!net->ipv6.fib6_local_tbl)
                      goto out_fib6_main_tbl;
   28         net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
              net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
              net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                      RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
              inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
      #endif
   28         fib6_tables_init(net);
      
              return 0;
      
      #ifdef CONFIG_IPV6_MULTIPLE_TABLES
      out_fib6_main_tbl:
              kfree(net->ipv6.fib6_main_tbl);
      #endif
      out_fib_table_hash:
              kfree(net->ipv6.fib_table_hash);
      out_rt6_stats:
              kfree(net->ipv6.rt6_stats);
      out_timer:
              return -ENOMEM;
      }
      
      static void fib6_net_exit(struct net *net)
      {
              unsigned int i;
      
              rt6_ifdown(net, NULL);
              del_timer_sync(&net->ipv6.ip6_fib_timer);
      
              for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
                      struct hlist_head *head = &net->ipv6.fib_table_hash[i];
                      struct hlist_node *tmp;
                      struct fib6_table *tb;
      
                      hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
                              hlist_del(&tb->tb6_hlist);
                              fib6_free_table(tb);
                      }
              }
      
              kfree(net->ipv6.fib_table_hash);
              kfree(net->ipv6.rt6_stats);
      }
      
      static struct pernet_operations fib6_net_ops = {
              .init = fib6_net_init,
              .exit = fib6_net_exit,
      };
      
      int __init fib6_init(void)
      {
              int ret = -ENOMEM;
      
              fib6_node_kmem = kmem_cache_create("fib6_nodes",
                                                 sizeof(struct fib6_node),
                                                 0, SLAB_HWCACHE_ALIGN,
                                                 NULL);
              if (!fib6_node_kmem)
                      goto out;
      
              ret = register_pernet_subsys(&fib6_net_ops);
              if (ret)
                      goto out_kmem_cache_create;
      
              ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
                                    NULL);
              if (ret)
                      goto out_unregister_subsys;
      
              __fib6_flush_trees = fib6_flush_trees;
      out:
              return ret;
      
      out_unregister_subsys:
              unregister_pernet_subsys(&fib6_net_ops);
      out_kmem_cache_create:
              kmem_cache_destroy(fib6_node_kmem);
              goto out;
      }
      
      void fib6_gc_cleanup(void)
      {
              unregister_pernet_subsys(&fib6_net_ops);
              kmem_cache_destroy(fib6_node_kmem);
      }
      
      #ifdef CONFIG_PROC_FS
      
      struct ipv6_route_iter {
              struct seq_net_private p;
              struct fib6_walker w;
              loff_t skip;
              struct fib6_table *tbl;
              int sernum;
      };
      
      static int ipv6_route_seq_show(struct seq_file *seq, void *v)
      {
              struct rt6_info *rt = v;
    5         struct ipv6_route_iter *iter = seq->private;
      
              seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
      
      #ifdef CONFIG_IPV6_SUBTREES
              seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
      #else
              seq_puts(seq, "00000000000000000000000000000000 00 ");
      #endif
              if (rt->rt6i_flags & RTF_GATEWAY)
                      seq_printf(seq, "%pi6", &rt->rt6i_gateway);
              else
    5                 seq_puts(seq, "00000000000000000000000000000000");
      
    5         seq_printf(seq, " %08x %08x %08x %08x %8s\n",
                         rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
                         rt->dst.__use, rt->rt6i_flags,
    5                    rt->dst.dev ? rt->dst.dev->name : "");
              iter->w.leaf = NULL;
              return 0;
      }
      
      static int ipv6_route_yield(struct fib6_walker *w)
      {
    5         struct ipv6_route_iter *iter = w->args;
      
    5         if (!iter->skip)
                      return 1;
      
              do {
    5                 iter->w.leaf = iter->w.leaf->dst.rt6_next;
                      iter->skip--;
    5                 if (!iter->skip && iter->w.leaf)
                              return 1;
    5         } while (iter->w.leaf);
      
              return 0;
      }
      
      static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
      {
    5         memset(&iter->w, 0, sizeof(iter->w));
              iter->w.func = ipv6_route_yield;
              iter->w.root = &iter->tbl->tb6_root;
              iter->w.state = FWS_INIT;
              iter->w.node = iter->w.root;
              iter->w.args = iter;
              iter->sernum = iter->w.root->fn_sernum;
              INIT_LIST_HEAD(&iter->w.lh);
              fib6_walker_link(&iter->w);
      }
      
      static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
                                                          struct net *net)
      {
              unsigned int h;
              struct hlist_node *node;
      
    5         if (tbl) {
    5                 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
    5                 node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
              } else {
                      h = 0;
                      node = NULL;
              }
      
    5         while (!node && h < FIB6_TABLE_HASHSZ) {
    5                 node = rcu_dereference_bh(
                              hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
              }
    5         return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
      }
      
      static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
      {
    5         if (iter->sernum != iter->w.root->fn_sernum) {
                      iter->sernum = iter->w.root->fn_sernum;
                      iter->w.state = FWS_INIT;
                      iter->w.node = iter->w.root;
                      WARN_ON(iter->w.skip);
                      iter->w.skip = iter->w.count;
              }
      }
      
      static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
      {
              int r;
              struct rt6_info *n;
    5         struct net *net = seq_file_net(seq);
              struct ipv6_route_iter *iter = seq->private;
      
              if (!v)
                      goto iter_table;
      
    5         n = ((struct rt6_info *)v)->dst.rt6_next;
              if (n) {
    5                 ++*pos;
    5                 return n;
              }
      
      iter_table:
    5         ipv6_route_check_sernum(iter);
    5         read_lock(&iter->tbl->tb6_lock);
              r = fib6_walk_continue(&iter->w);
              read_unlock(&iter->tbl->tb6_lock);
              if (r > 0) {
    5                 if (v)
    5                         ++*pos;
    5                 return iter->w.leaf;
    5         } else if (r < 0) {
                      fib6_walker_unlink(&iter->w);
                      return NULL;
              }
    5         fib6_walker_unlink(&iter->w);
      
              iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
              if (!iter->tbl)
                      return NULL;
      
    5         ipv6_route_seq_setup_walk(iter);
              goto iter_table;
    5 }
      
      static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
              __acquires(RCU_BH)
      {
    5         struct net *net = seq_file_net(seq);
              struct ipv6_route_iter *iter = seq->private;
      
    5         rcu_read_lock_bh();
    5         iter->tbl = ipv6_route_seq_next_table(NULL, net);
              iter->skip = *pos;
      
              if (iter->tbl) {
    5                 ipv6_route_seq_setup_walk(iter);
    5                 return ipv6_route_seq_next(seq, NULL, pos);
              } else {
                      return NULL;
              }
      }
      
      static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
      {
              struct fib6_walker *w = &iter->w;
    5         return w->node && !(w->state == FWS_U && w->node == w->root);
      }
      
      static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
              __releases(RCU_BH)
      {
    5         struct ipv6_route_iter *iter = seq->private;
      
    5         if (ipv6_route_iter_active(iter))
    3                 fib6_walker_unlink(&iter->w);
      
    5         rcu_read_unlock_bh();
      }
      
      static const struct seq_operations ipv6_route_seq_ops = {
              .start        = ipv6_route_seq_start,
              .next        = ipv6_route_seq_next,
              .stop        = ipv6_route_seq_stop,
              .show        = ipv6_route_seq_show
      };
      
      int ipv6_route_open(struct inode *inode, struct file *file)
      {
    8         return seq_open_net(inode, file, &ipv6_route_seq_ops,
                                  sizeof(struct ipv6_route_iter));
      }
      
      #endif /* CONFIG_PROC_FS */
      /*
       * Input Multitouch Library
       *
       * Copyright (c) 2008-2010 Henrik Rydberg
       *
       * This program is free software; you can redistribute it and/or modify it
       * under the terms of the GNU General Public License version 2 as published by
       * the Free Software Foundation.
       */
      
      #include <linux/input/mt.h>
      #include <linux/export.h>
      #include <linux/slab.h>
      
      #define TRKID_SGN        ((TRKID_MAX + 1) >> 1)
      
      static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src)
      {
              if (dev->absinfo && test_bit(src, dev->absbit)) {
                      dev->absinfo[dst] = dev->absinfo[src];
                      dev->absinfo[dst].fuzz = 0;
                      dev->absbit[BIT_WORD(dst)] |= BIT_MASK(dst);
              }
      }
      
      /**
       * input_mt_init_slots() - initialize MT input slots
       * @dev: input device supporting MT events and finger tracking
       * @num_slots: number of slots used by the device
       * @flags: mt tasks to handle in core
       *
       * This function allocates all necessary memory for MT slot handling
       * in the input device, prepares the ABS_MT_SLOT and
       * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers.
       * Depending on the flags set, it also performs pointer emulation and
       * frame synchronization.
       *
       * May be called repeatedly. Returns -EINVAL if attempting to
       * reinitialize with a different number of slots.
       */
    3 int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots,
                              unsigned int flags)
      {
    2         struct input_mt *mt = dev->mt;
              int i;
      
    3         if (!num_slots)
                      return 0;
              if (mt)
    1                 return mt->num_slots != num_slots ? -EINVAL : 0;
      
    2         mt = kzalloc(sizeof(*mt) + num_slots * sizeof(*mt->slots), GFP_KERNEL);
              if (!mt)
                      goto err_mem;
      
    2         mt->num_slots = num_slots;
              mt->flags = flags;
              input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0);
              input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0);
      
              if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) {
                      __set_bit(EV_KEY, dev->evbit);
                      __set_bit(BTN_TOUCH, dev->keybit);
      
                      copy_abs(dev, ABS_X, ABS_MT_POSITION_X);
                      copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y);
                      copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE);
              }
    2         if (flags & INPUT_MT_POINTER) {
                      __set_bit(BTN_TOOL_FINGER, dev->keybit);
                      __set_bit(BTN_TOOL_DOUBLETAP, dev->keybit);
                      if (num_slots >= 3)
                              __set_bit(BTN_TOOL_TRIPLETAP, dev->keybit);
                      if (num_slots >= 4)
                              __set_bit(BTN_TOOL_QUADTAP, dev->keybit);
                      if (num_slots >= 5)
                              __set_bit(BTN_TOOL_QUINTTAP, dev->keybit);
                      __set_bit(INPUT_PROP_POINTER, dev->propbit);
              }
    2         if (flags & INPUT_MT_DIRECT)
                      __set_bit(INPUT_PROP_DIRECT, dev->propbit);
    2         if (flags & INPUT_MT_SEMI_MT)
                      __set_bit(INPUT_PROP_SEMI_MT, dev->propbit);
    2         if (flags & INPUT_MT_TRACK) {
                      unsigned int n2 = num_slots * num_slots;
                      mt->red = kcalloc(n2, sizeof(*mt->red), GFP_KERNEL);
                      if (!mt->red)
                              goto err_mem;
              }
      
              /* Mark slots as 'inactive' */
              for (i = 0; i < num_slots; i++)
    2                 input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1);
      
              /* Mark slots as 'unused' */
    2         mt->frame = 1;
      
              dev->mt = mt;
    3         return 0;
      err_mem:
              kfree(mt);
              return -ENOMEM;
      }
      EXPORT_SYMBOL(input_mt_init_slots);
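
       /*
        * Hypothetical setup sketch (not part of this file): a slot-based
        * touchscreen driver typically declares its position axes and then
        * asks the MT core to allocate the slots and handle the per-frame
        * bookkeeping.  demo_ts_setup(), the axis ranges and the slot count
        * are illustrative assumptions.
        */
       static int demo_ts_setup(struct input_dev *dev)
       {
               input_set_abs_params(dev, ABS_MT_POSITION_X, 0, 4095, 0, 0);
               input_set_abs_params(dev, ABS_MT_POSITION_Y, 0, 4095, 0, 0);

               /* five contacts; let the core drop unused slots every frame */
               return input_mt_init_slots(dev, 5,
                                          INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
       }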
      
      /**
       * input_mt_destroy_slots() - frees the MT slots of the input device
       * @dev: input device with allocated MT slots
       *
        * This function is only needed in the error path, as the input core will
       * automatically free the MT slots when the device is destroyed.
       */
      void input_mt_destroy_slots(struct input_dev *dev)
      {
   18         if (dev->mt) {
    1                 kfree(dev->mt->red);
                      kfree(dev->mt);
              }
   18         dev->mt = NULL;
      }
      EXPORT_SYMBOL(input_mt_destroy_slots);
      
      /**
       * input_mt_report_slot_state() - report contact state
       * @dev: input device with allocated MT slots
       * @tool_type: the tool type to use in this slot
       * @active: true if contact is active, false otherwise
       *
       * Reports a contact via ABS_MT_TRACKING_ID, and optionally
       * ABS_MT_TOOL_TYPE. If active is true and the slot is currently
       * inactive, or if the tool type is changed, a new tracking id is
       * assigned to the slot. The tool type is only reported if the
       * corresponding absbit field is set.
       */
      void input_mt_report_slot_state(struct input_dev *dev,
                                      unsigned int tool_type, bool active)
      {
              struct input_mt *mt = dev->mt;
              struct input_mt_slot *slot;
              int id;
      
              if (!mt)
                      return;
      
              slot = &mt->slots[mt->slot];
              slot->frame = mt->frame;
      
              if (!active) {
                      input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
                      return;
              }
      
              id = input_mt_get_value(slot, ABS_MT_TRACKING_ID);
              if (id < 0 || input_mt_get_value(slot, ABS_MT_TOOL_TYPE) != tool_type)
                      id = input_mt_new_trkid(mt);
      
              input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id);
              input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type);
      }
      EXPORT_SYMBOL(input_mt_report_slot_state);
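
       /*
        * Hypothetical per-contact sketch (not part of this file): for each
        * contact reported by the hardware, a slot-based driver selects the
        * slot and lets input_mt_report_slot_state() manage the tracking id.
        * The function name and its parameters are illustrative assumptions.
        */
       static void demo_report_contact(struct input_dev *dev, int slot,
                                       bool active, int x, int y)
       {
               input_mt_slot(dev, slot);
               input_mt_report_slot_state(dev, MT_TOOL_FINGER, active);
               if (active) {
                       input_report_abs(dev, ABS_MT_POSITION_X, x);
                       input_report_abs(dev, ABS_MT_POSITION_Y, y);
               }
       }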
      
      /**
       * input_mt_report_finger_count() - report contact count
       * @dev: input device with allocated MT slots
       * @count: the number of contacts
       *
       * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP,
        * BTN_TOOL_TRIPLETAP, BTN_TOOL_QUADTAP and BTN_TOOL_QUINTTAP.
       *
        * The input core ensures only the KEY events already set up for
       * this device will produce output.
       */
      void input_mt_report_finger_count(struct input_dev *dev, int count)
      {
              input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1);
              input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2);
              input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3);
              input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4);
              input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5);
      }
      EXPORT_SYMBOL(input_mt_report_finger_count);
      
      /**
       * input_mt_report_pointer_emulation() - common pointer emulation
       * @dev: input device with allocated MT slots
       * @use_count: report number of active contacts as finger count
       *
       * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and
       * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true.
       *
        * The input core ensures only the KEY and ABS axes already set up for
       * this device will produce output.
       */
      void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count)
      {
              struct input_mt *mt = dev->mt;
              struct input_mt_slot *oldest;
              int oldid, count, i;
      
              if (!mt)
                      return;
      
              oldest = NULL;
              oldid = mt->trkid;
              count = 0;
      
              for (i = 0; i < mt->num_slots; ++i) {
                      struct input_mt_slot *ps = &mt->slots[i];
                      int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID);
      
                      if (id < 0)
                              continue;
                      if ((id - oldid) & TRKID_SGN) {
                              oldest = ps;
                              oldid = id;
                      }
                      count++;
              }
      
              input_event(dev, EV_KEY, BTN_TOUCH, count > 0);
              if (use_count)
                      input_mt_report_finger_count(dev, count);
      
              if (oldest) {
                      int x = input_mt_get_value(oldest, ABS_MT_POSITION_X);
                      int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y);
      
                      input_event(dev, EV_ABS, ABS_X, x);
                      input_event(dev, EV_ABS, ABS_Y, y);
      
                      if (test_bit(ABS_MT_PRESSURE, dev->absbit)) {
                              int p = input_mt_get_value(oldest, ABS_MT_PRESSURE);
                              input_event(dev, EV_ABS, ABS_PRESSURE, p);
                      }
              } else {
                      if (test_bit(ABS_MT_PRESSURE, dev->absbit))
                              input_event(dev, EV_ABS, ABS_PRESSURE, 0);
              }
      }
      EXPORT_SYMBOL(input_mt_report_pointer_emulation);
      
      static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt)
      {
              int i;
      
              for (i = 0; i < mt->num_slots; i++) {
                      if (!input_mt_is_used(mt, &mt->slots[i])) {
                              input_mt_slot(dev, i);
                              input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
                      }
              }
      }
      
      /**
       * input_mt_drop_unused() - Inactivate slots not seen in this frame
       * @dev: input device with allocated MT slots
       *
       * Lift all slots not seen since the last call to this function.
       */
      void input_mt_drop_unused(struct input_dev *dev)
      {
              struct input_mt *mt = dev->mt;
      
              if (mt) {
                      __input_mt_drop_unused(dev, mt);
                      mt->frame++;
              }
      }
      EXPORT_SYMBOL(input_mt_drop_unused);
      
      /**
       * input_mt_sync_frame() - synchronize mt frame
       * @dev: input device with allocated MT slots
       *
       * Close the frame and prepare the internal state for a new one.
       * Depending on the flags, marks unused slots as inactive and performs
       * pointer emulation.
       */
      void input_mt_sync_frame(struct input_dev *dev)
      {
              struct input_mt *mt = dev->mt;
              bool use_count = false;
      
              if (!mt)
                      return;
      
              if (mt->flags & INPUT_MT_DROP_UNUSED)
                      __input_mt_drop_unused(dev, mt);
      
              if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT))
                      use_count = true;
      
              input_mt_report_pointer_emulation(dev, use_count);
      
              mt->frame++;
      }
      EXPORT_SYMBOL(input_mt_sync_frame);
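
       /*
        * Hypothetical end-of-frame sketch (not part of this file): once all
        * contacts of a hardware frame have been reported, the driver closes
        * the MT frame and then the input event frame.  demo_end_of_frame()
        * is an illustrative name.
        */
       static void demo_end_of_frame(struct input_dev *dev)
       {
               input_mt_sync_frame(dev);        /* unused slots, pointer emulation */
               input_sync(dev);                 /* emit EV_SYN/SYN_REPORT */
       }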
      
      static int adjust_dual(int *begin, int step, int *end, int eq, int mu)
      {
              int f, *p, s, c;
      
              if (begin == end)
                      return 0;
      
              f = *begin;
              p = begin + step;
              s = p == end ? f + 1 : *p;
      
              for (; p != end; p += step)
                      if (*p < f)
                              s = f, f = *p;
                      else if (*p < s)
                              s = *p;
      
              c = (f + s + 1) / 2;
              if (c == 0 || (c > mu && (!eq || mu > 0)))
                      return 0;
              /* Improve convergence for positive matrices by penalizing overcovers */
              if (s < 0 && mu <= 0)
                      c *= 2;
      
              for (p = begin; p != end; p += step)
                      *p -= c;
      
              return (c < s && s <= 0) || (f >= 0 && f < c);
      }
      
      static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu)
      {
              int i, k, sum;
      
              for (k = 0; k < nrc; k++) {
                      for (i = 0; i < nr; i++)
                              adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu);
                      sum = 0;
                      for (i = 0; i < nrc; i += nr)
                              sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu);
                      if (!sum)
                              break;
              }
      }
      
      static int input_mt_set_matrix(struct input_mt *mt,
                                     const struct input_mt_pos *pos, int num_pos,
                                     int mu)
      {
              const struct input_mt_pos *p;
              struct input_mt_slot *s;
              int *w = mt->red;
              int x, y;
      
              for (s = mt->slots; s != mt->slots + mt->num_slots; s++) {
                      if (!input_mt_is_active(s))
                              continue;
                      x = input_mt_get_value(s, ABS_MT_POSITION_X);
                      y = input_mt_get_value(s, ABS_MT_POSITION_Y);
                      for (p = pos; p != pos + num_pos; p++) {
                              int dx = x - p->x, dy = y - p->y;
                              *w++ = dx * dx + dy * dy - mu;
                      }
              }
      
              return w - mt->red;
      }
      
      static void input_mt_set_slots(struct input_mt *mt,
                                     int *slots, int num_pos)
      {
              struct input_mt_slot *s;
              int *w = mt->red, j;
      
              for (j = 0; j != num_pos; j++)
                      slots[j] = -1;
      
              for (s = mt->slots; s != mt->slots + mt->num_slots; s++) {
                      if (!input_mt_is_active(s))
                              continue;
      
                      for (j = 0; j != num_pos; j++) {
                              if (w[j] < 0) {
                                      slots[j] = s - mt->slots;
                                      break;
                              }
                      }
      
                      w += num_pos;
              }
      
              for (s = mt->slots; s != mt->slots + mt->num_slots; s++) {
                      if (input_mt_is_active(s))
                              continue;
      
                      for (j = 0; j != num_pos; j++) {
                              if (slots[j] < 0) {
                                      slots[j] = s - mt->slots;
                                      break;
                              }
                      }
              }
      }
      
      /**
       * input_mt_assign_slots() - perform a best-match assignment
       * @dev: input device with allocated MT slots
       * @slots: the slot assignment to be filled
       * @pos: the position array to match
       * @num_pos: number of positions
       * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite)
       *
       * Performs a best match against the current contacts and returns
       * the slot assignment list. New contacts are assigned to unused
       * slots.
       *
       * The assignments are balanced so that all coordinate displacements are
        * below the Euclidean distance dmax. If no such assignment can be found,
       * some contacts are assigned to unused slots.
       *
       * Returns zero on success, or negative error in case of failure.
       */
      int input_mt_assign_slots(struct input_dev *dev, int *slots,
                                const struct input_mt_pos *pos, int num_pos,
                                int dmax)
      {
              struct input_mt *mt = dev->mt;
              int mu = 2 * dmax * dmax;
              int nrc;
      
              if (!mt || !mt->red)
                      return -ENXIO;
              if (num_pos > mt->num_slots)
                      return -EINVAL;
              if (num_pos < 1)
                      return 0;
      
              nrc = input_mt_set_matrix(mt, pos, num_pos, mu);
              find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu);
              input_mt_set_slots(mt, slots, num_pos);
      
              return 0;
      }
      EXPORT_SYMBOL(input_mt_assign_slots);
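
       /*
        * Hypothetical frame handler sketch (not part of this file) for
        * hardware that reports unordered positions without tracking ids:
        * the matcher picks the slots, then the driver reports each contact.
        * It assumes the device was set up with INPUT_MT_TRACK (so mt->red
        * exists) and at most five slots; demo_report_frame(), the slots[]
        * size and dmax == 0 are illustrative assumptions.
        */
       static void demo_report_frame(struct input_dev *dev,
                                     const struct input_mt_pos *pos, int count)
       {
               int slots[5];
               int i;

               if (input_mt_assign_slots(dev, slots, pos, count, 0) < 0)
                       return;

               for (i = 0; i < count; i++) {
                       input_mt_slot(dev, slots[i]);
                       input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
                       input_report_abs(dev, ABS_MT_POSITION_X, pos[i].x);
                       input_report_abs(dev, ABS_MT_POSITION_Y, pos[i].y);
               }

               input_mt_sync_frame(dev);
               input_sync(dev);
       }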
      
      /**
       * input_mt_get_slot_by_key() - return slot matching key
       * @dev: input device with allocated MT slots
       * @key: the key of the sought slot
       *
        * Returns the slot of the given key, if it exists; otherwise the
        * key is set on the first unused slot and that slot is returned.
       *
       * If no available slot can be found, -1 is returned.
       * Note that for this function to work properly, input_mt_sync_frame() has
       * to be called at each frame.
       */
      int input_mt_get_slot_by_key(struct input_dev *dev, int key)
      {
              struct input_mt *mt = dev->mt;
              struct input_mt_slot *s;
      
              if (!mt)
                      return -1;
      
              for (s = mt->slots; s != mt->slots + mt->num_slots; s++)
                      if (input_mt_is_active(s) && s->key == key)
                              return s - mt->slots;
      
              for (s = mt->slots; s != mt->slots + mt->num_slots; s++)
                      if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) {
                              s->key = key;
                              return s - mt->slots;
                      }
      
              return -1;
      }
      EXPORT_SYMBOL(input_mt_get_slot_by_key);
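
       /*
        * Hypothetical sketch (not part of this file) for hardware that
        * reports a stable per-contact key (e.g. a firmware contact id)
        * rather than a slot index.  demo_report_keyed_contact() is an
        * illustrative name.
        */
       static void demo_report_keyed_contact(struct input_dev *dev, int key,
                                             int x, int y)
       {
               int slot = input_mt_get_slot_by_key(dev, key);

               if (slot < 0)
                       return;        /* no matching or free slot this frame */

               input_mt_slot(dev, slot);
               input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
               input_report_abs(dev, ABS_MT_POSITION_X, x);
               input_report_abs(dev, ABS_MT_POSITION_Y, y);
       }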
      #ifndef _ASM_WORD_AT_A_TIME_H
      #define _ASM_WORD_AT_A_TIME_H
      
      #include <linux/kernel.h>
      
      /*
       * This is largely generic for little-endian machines, but the
       * optimal byte mask counting is probably going to be something
       * that is architecture-specific. If you have a reliably fast
       * bit count instruction, that might be better than the multiply
       * and shift, for example.
       */
      struct word_at_a_time {
              const unsigned long one_bits, high_bits;
      };
      
      #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
      
      #ifdef CONFIG_64BIT
      
      /*
       * Jan Achrenius on G+: microoptimized version of
       * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
       * that works for the bytemasks without having to
       * mask them first.
       */
      static inline long count_masked_bytes(unsigned long mask)
      {
              return mask*0x0001020304050608ul >> 56;
      }
      
      #else        /* 32-bit case */
      
      /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
      static inline long count_masked_bytes(long mask)
      {
              /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
              long a = (0x0ff0001+mask) >> 23;
              /* Fix the 1 for 00 case */
              return a & mask;
      }
      
      #endif
      
      /* Return nonzero if it has a zero */
      static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
      {
  402         unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
              *bits = mask;
              return mask;
      }
      
      static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
      {
              return bits;
      }
      
      static inline unsigned long create_zero_mask(unsigned long bits)
      {
 2466         bits = (bits - 1) & ~bits;
              return bits >> 7;
      }
      
      /* The mask we created is directly usable as a bytemask */
      #define zero_bytemask(mask) (mask)
      
      static inline unsigned long find_zero(unsigned long mask)
      {
              return count_masked_bytes(mask);
      }
      
      /*
       * Load an unaligned word from kernel space.
       *
       * In the (very unlikely) case of the word being a page-crosser
       * and the next page not being mapped, take the exception and
       * return zeroes in the non-existing part.
       */
      static inline unsigned long load_unaligned_zeropad(const void *addr)
      {
              unsigned long ret, dummy;
      
  957         asm(
                      "1:\tmov %2,%0\n"
                      "2:\n"
                      ".section .fixup,\"ax\"\n"
                      "3:\t"
                      "lea %2,%1\n\t"
                      "and %3,%1\n\t"
                      "mov (%1),%0\n\t"
                      "leal %2,%%ecx\n\t"
                      "andl %4,%%ecx\n\t"
                      "shll $3,%%ecx\n\t"
                      "shr %%cl,%0\n\t"
                      "jmp 2b\n"
                      ".previous\n"
                      _ASM_EXTABLE(1b, 3b)
                      :"=&r" (ret),"=&c" (dummy)
                      :"m" (*(unsigned long *)addr),
                       "i" (-sizeof(unsigned long)),
                       "i" (sizeof(unsigned long)-1));
              return ret;
      }
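
       /*
        * Hypothetical usage sketch (not part of this header): the intended
        * calling pattern scans one word per iteration until a NUL byte is
        * found, mirroring the in-tree users of these helpers.  demo_strlen()
        * is an illustrative name; reading past the NUL up to a word boundary
        * is what load_unaligned_zeropad() above makes safe.
        */
       static inline size_t demo_strlen(const char *s)
       {
               const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
               unsigned long a, mask;
               size_t len = 0;

               for (;;) {
                       a = load_unaligned_zeropad(s + len);
                       if (has_zero(a, &mask, &constants)) {
                               mask = prep_zero_mask(a, mask, &constants);
                               mask = create_zero_mask(mask);
                               return len + find_zero(mask);
                       }
                       len += sizeof(unsigned long);
               }
       }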
      
      #endif /* _ASM_WORD_AT_A_TIME_H */
      #ifndef __NET_GENERIC_NETLINK_H
      #define __NET_GENERIC_NETLINK_H
      
      #include <linux/genetlink.h>
      #include <net/netlink.h>
      #include <net/net_namespace.h>
      
      #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN)
      
      /**
       * struct genl_multicast_group - generic netlink multicast group
       * @name: name of the multicast group, names are per-family
       */
      struct genl_multicast_group {
              char                        name[GENL_NAMSIZ];
      };
      
      struct genl_ops;
      struct genl_info;
      
      /**
       * struct genl_family - generic netlink family
        * @id: protocol family identifier
       * @hdrsize: length of user specific header in bytes
       * @name: name of family
       * @version: protocol version
       * @maxattr: maximum number of attributes supported
       * @netnsok: set to true if the family can handle network
       *        namespaces and should be presented in all of them
       * @parallel_ops: operations can be called in parallel and aren't
       *        synchronized by the core genetlink code
       * @pre_doit: called before an operation's doit callback, it may
       *        do additional, common, filtering and return an error
       * @post_doit: called after an operation's doit callback, it may
       *        undo operations done by pre_doit, for example release locks
       * @mcast_bind: a socket bound to the given multicast group (which
       *        is given as the offset into the groups array)
       * @mcast_unbind: a socket was unbound from the given multicast group.
       *        Note that unbind() will not be called symmetrically if the
       *        generic netlink family is removed while there are still open
       *        sockets.
       * @attrbuf: buffer to store parsed attributes
       * @family_list: family list
       * @mcgrps: multicast groups used by this family (private)
       * @n_mcgrps: number of multicast groups (private)
       * @mcgrp_offset: starting number of multicast group IDs in this family
       * @ops: the operations supported by this family (private)
       * @n_ops: number of operations supported by this family (private)
       */
      struct genl_family {
              unsigned int                id;
              unsigned int                hdrsize;
              char                        name[GENL_NAMSIZ];
              unsigned int                version;
              unsigned int                maxattr;
              bool                        netnsok;
              bool                        parallel_ops;
              int                        (*pre_doit)(const struct genl_ops *ops,
                                                  struct sk_buff *skb,
                                                  struct genl_info *info);
              void                        (*post_doit)(const struct genl_ops *ops,
                                                   struct sk_buff *skb,
                                                   struct genl_info *info);
              int                        (*mcast_bind)(struct net *net, int group);
              void                        (*mcast_unbind)(struct net *net, int group);
              struct nlattr **        attrbuf;        /* private */
              const struct genl_ops *        ops;                /* private */
              const struct genl_multicast_group *mcgrps; /* private */
              unsigned int                n_ops;                /* private */
              unsigned int                n_mcgrps;        /* private */
              unsigned int                mcgrp_offset;        /* private */
              struct list_head        family_list;        /* private */
              struct module                *module;
      };
      
      /**
       * struct genl_info - receiving information
       * @snd_seq: sending sequence number
       * @snd_portid: netlink portid of sender
       * @nlhdr: netlink message header
       * @genlhdr: generic netlink message header
       * @userhdr: user specific header
       * @attrs: netlink attributes
       * @_net: network namespace
       * @user_ptr: user pointers
       * @dst_sk: destination socket
       */
      struct genl_info {
              u32                        snd_seq;
              u32                        snd_portid;
              struct nlmsghdr *        nlhdr;
              struct genlmsghdr *        genlhdr;
              void *                        userhdr;
              struct nlattr **        attrs;
              possible_net_t                _net;
              void *                        user_ptr[2];
              struct sock *                dst_sk;
      };
      
      static inline struct net *genl_info_net(struct genl_info *info)
      {
    5         return read_pnet(&info->_net);
      }
      
      static inline void genl_info_net_set(struct genl_info *info, struct net *net)
      {
              write_pnet(&info->_net, net);
      }
      
      /**
       * struct genl_ops - generic netlink operations
       * @cmd: command identifier
       * @internal_flags: flags used by the family
       * @flags: flags
       * @policy: attribute validation policy
       * @doit: standard command callback
       * @start: start callback for dumps
       * @dumpit: callback for dumpers
       * @done: completion callback for dumps
       * @ops_list: operations list
       */
      struct genl_ops {
              const struct nla_policy        *policy;
              int                       (*doit)(struct sk_buff *skb,
                                             struct genl_info *info);
              int                       (*start)(struct netlink_callback *cb);
              int                       (*dumpit)(struct sk_buff *skb,
                                               struct netlink_callback *cb);
              int                       (*done)(struct netlink_callback *cb);
              u8                        cmd;
              u8                        internal_flags;
              u8                        flags;
      };
      
      int __genl_register_family(struct genl_family *family);
      
      static inline int genl_register_family(struct genl_family *family)
      {
              family->module = THIS_MODULE;
              return __genl_register_family(family);
      }
      
      /**
       * genl_register_family_with_ops - register a generic netlink family with ops
       * @family: generic netlink family
       * @ops: operations to be registered
       * @n_ops: number of elements to register
       *
       * Registers the specified family and operations from the specified table.
       * Only one family may be registered with the same family name or identifier.
       *
        * The family id may equal GENL_ID_GENERATE, causing a unique id to
       * be automatically generated and assigned.
       *
       * Either a doit or dumpit callback must be specified for every registered
       * operation or the function will fail. Only one operation structure per
       * command identifier may be registered.
       *
        * See include/net/genetlink.h for more documentation on the operations
       * structure.
       *
       * Return 0 on success or a negative error code.
       */
      static inline int
      _genl_register_family_with_ops_grps(struct genl_family *family,
                                          const struct genl_ops *ops, size_t n_ops,
                                          const struct genl_multicast_group *mcgrps,
                                          size_t n_mcgrps)
      {
              family->module = THIS_MODULE;
              family->ops = ops;
              family->n_ops = n_ops;
              family->mcgrps = mcgrps;
              family->n_mcgrps = n_mcgrps;
              return __genl_register_family(family);
      }
      
      #define genl_register_family_with_ops(family, ops)                        \
              _genl_register_family_with_ops_grps((family),                        \
                                                  (ops), ARRAY_SIZE(ops),        \
                                                  NULL, 0)
      #define genl_register_family_with_ops_groups(family, ops, grps)        \
              _genl_register_family_with_ops_grps((family),                        \
                                                  (ops), ARRAY_SIZE(ops),        \
                                                  (grps), ARRAY_SIZE(grps))
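
       /*
        * Hypothetical registration sketch (not part of this header): a module
        * defines its attribute policy, operations and family, then registers
        * everything with one call.  All DEMO_* identifiers, the "demo" family
        * name and demo_doit() (sketched after genlmsg_new() below) are
        * illustrative assumptions.
        */
       enum {
               DEMO_ATTR_UNSPEC,
               DEMO_ATTR_VALUE,
               __DEMO_ATTR_MAX,
       };
       #define DEMO_ATTR_MAX (__DEMO_ATTR_MAX - 1)

       enum {
               DEMO_CMD_UNSPEC,
               DEMO_CMD_GET,
       };

       static const struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
               [DEMO_ATTR_VALUE] = { .type = NLA_U32 },
       };

       static int demo_doit(struct sk_buff *skb, struct genl_info *info);

       static const struct genl_ops demo_ops[] = {
               {
                       .cmd    = DEMO_CMD_GET,
                       .doit   = demo_doit,
                       .policy = demo_policy,
               },
       };

       static struct genl_family demo_family = {
               .id      = GENL_ID_GENERATE,
               .name    = "demo",
               .version = 1,
               .maxattr = DEMO_ATTR_MAX,
       };

       static int __init demo_init(void)
       {
               return genl_register_family_with_ops(&demo_family, demo_ops);
       }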
      
      int genl_unregister_family(struct genl_family *family);
      void genl_notify(struct genl_family *family, struct sk_buff *skb,
                       struct genl_info *info, u32 group, gfp_t flags);
      
      struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
                                          gfp_t flags);
      void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                        struct genl_family *family, int flags, u8 cmd);
      
      /**
       * genlmsg_nlhdr - Obtain netlink header from user specified header
       * @user_hdr: user header as returned from genlmsg_put()
       * @family: generic netlink family
       *
       * Returns pointer to netlink header.
       */
      static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr,
                                                   struct genl_family *family)
      {
              return (struct nlmsghdr *)((char *)user_hdr -
                                         family->hdrsize -
                                         GENL_HDRLEN -
                                         NLMSG_HDRLEN);
      }
      
      /**
       * genlmsg_parse - parse attributes of a genetlink message
       * @nlh: netlink message header
       * @family: genetlink message family
       * @tb: destination array with maxtype+1 elements
       * @maxtype: maximum attribute type to be expected
       * @policy: validation policy
        */
      static inline int genlmsg_parse(const struct nlmsghdr *nlh,
                                      const struct genl_family *family,
                                      struct nlattr *tb[], int maxtype,
                                      const struct nla_policy *policy)
      {
              return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
                                 policy);
      }
      
      /**
       * genl_dump_check_consistent - check if sequence is consistent and advertise if not
       * @cb: netlink callback structure that stores the sequence number
       * @user_hdr: user header as returned from genlmsg_put()
       * @family: generic netlink family
       *
       * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it
       * simpler to use with generic netlink.
       */
      static inline void genl_dump_check_consistent(struct netlink_callback *cb,
                                                    void *user_hdr,
                                                    struct genl_family *family)
      {
              nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family));
      }
      
      /**
       * genlmsg_put_reply - Add generic netlink header to a reply message
       * @skb: socket buffer holding the message
       * @info: receiver info
       * @family: generic netlink family
       * @flags: netlink message flags
       * @cmd: generic netlink command
       *
       * Returns pointer to user specific header
       */
      static inline void *genlmsg_put_reply(struct sk_buff *skb,
                                            struct genl_info *info,
                                            struct genl_family *family,
                                            int flags, u8 cmd)
      {
              return genlmsg_put(skb, info->snd_portid, info->snd_seq, family,
                                 flags, cmd);
      }
      
      /**
       * genlmsg_end - Finalize a generic netlink message
       * @skb: socket buffer the message is stored in
       * @hdr: user specific header
       */
      static inline void genlmsg_end(struct sk_buff *skb, void *hdr)
      {
    9         nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
      }
      
      /**
       * genlmsg_cancel - Cancel construction of a generic netlink message
       * @skb: socket buffer the message is stored in
       * @hdr: generic netlink message header
       */
      static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
      {
              if (hdr)
    2                 nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
      }
      
      /**
       * genlmsg_multicast_netns - multicast a netlink message to a specific netns
       * @family: the generic netlink family
       * @net: the net namespace
       * @skb: netlink message as socket buffer
       * @portid: own netlink portid to avoid sending to yourself
       * @group: offset of multicast group in groups array
       * @flags: allocation flags
       */
      static inline int genlmsg_multicast_netns(struct genl_family *family,
                                                struct net *net, struct sk_buff *skb,
                                                u32 portid, unsigned int group, gfp_t flags)
      {
              if (WARN_ON_ONCE(group >= family->n_mcgrps))
                      return -EINVAL;
              group = family->mcgrp_offset + group;
              return nlmsg_multicast(net->genl_sock, skb, portid, group, flags);
      }
      
      /**
       * genlmsg_multicast - multicast a netlink message to the default netns
       * @family: the generic netlink family
       * @skb: netlink message as socket buffer
       * @portid: own netlink portid to avoid sending to yourself
       * @group: offset of multicast group in groups array
       * @flags: allocation flags
       */
      static inline int genlmsg_multicast(struct genl_family *family,
                                          struct sk_buff *skb, u32 portid,
                                          unsigned int group, gfp_t flags)
      {
              return genlmsg_multicast_netns(family, &init_net, skb,
                                             portid, group, flags);
      }
      
      /**
       * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
       * @family: the generic netlink family
       * @skb: netlink message as socket buffer
       * @portid: own netlink portid to avoid sending to yourself
       * @group: offset of multicast group in groups array
       * @flags: allocation flags
       *
       * This function must hold the RTNL or rcu_read_lock().
       */
      int genlmsg_multicast_allns(struct genl_family *family,
                                  struct sk_buff *skb, u32 portid,
                                  unsigned int group, gfp_t flags);
      
      /**
       * genlmsg_unicast - unicast a netlink message
       * @skb: netlink message as socket buffer
       * @portid: netlink portid of the destination socket
       */
      static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid)
      {
  163         return nlmsg_unicast(net->genl_sock, skb, portid);
      }
      
      /**
       * genlmsg_reply - reply to a request
       * @skb: netlink message to be sent back
       * @info: receiver information
       */
      static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
      {
  163         return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid);
      }
      
      /**
        * genlmsg_data - head of message payload
       * @gnlh: genetlink message header
       */
      static inline void *genlmsg_data(const struct genlmsghdr *gnlh)
      {
              return ((unsigned char *) gnlh + GENL_HDRLEN);
      }
      
      /**
       * genlmsg_len - length of message payload
       * @gnlh: genetlink message header
       */
      static inline int genlmsg_len(const struct genlmsghdr *gnlh)
      {
              struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh -
                                                              NLMSG_HDRLEN);
              return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN);
      }
      
      /**
       * genlmsg_msg_size - length of genetlink message not including padding
       * @payload: length of message payload
       */
      static inline int genlmsg_msg_size(int payload)
      {
              return GENL_HDRLEN + payload;
      }
      
      /**
       * genlmsg_total_size - length of genetlink message including padding
       * @payload: length of message payload
       */
      static inline int genlmsg_total_size(int payload)
      {
              return NLMSG_ALIGN(genlmsg_msg_size(payload));
      }
      
      /**
       * genlmsg_new - Allocate a new generic netlink message
       * @payload: size of the message payload
       * @flags: the type of memory to allocate.
       */
      static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
      {
              return nlmsg_new(genlmsg_total_size(payload), flags);
      }
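
       /*
        * Example (illustrative sketch, not part of this header): the typical
        * notification path built from the helpers above.  example_family,
        * EXAMPLE_CMD_NOTIFY, EXAMPLE_ATTR_VALUE and EXAMPLE_MCGRP are
        * hypothetical names standing in for a real family's definitions.
        */
       #if 0
       static int example_notify(u32 value)
       {
               struct sk_buff *skb;
               void *hdr;
       
               /* room for the genetlink header plus one u32 attribute */
               skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
               if (!skb)
                       return -ENOMEM;
       
               hdr = genlmsg_put(skb, 0, 0, &example_family, 0, EXAMPLE_CMD_NOTIFY);
               if (!hdr)
                       goto err_free;
       
               if (nla_put_u32(skb, EXAMPLE_ATTR_VALUE, value))
                       goto err_free;
       
               genlmsg_end(skb, hdr);
               /* EXAMPLE_MCGRP is the offset into example_family's mcgrps[] array */
               return genlmsg_multicast(&example_family, skb, 0, EXAMPLE_MCGRP,
                                        GFP_KERNEL);
       
       err_free:
               nlmsg_free(skb);
               return -EMSGSIZE;
       }
       #endif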
      
      /**
       * genl_set_err - report error to genetlink broadcast listeners
       * @family: the generic netlink family
       * @net: the network namespace to report the error to
       * @portid: the PORTID of a process that we want to skip (if any)
       * @group: the broadcast group that will notice the error
       *         (this is the offset of the multicast group in the groups array)
       * @code: error code, must be negative (as usual in kernelspace)
       *
       * This function returns the number of broadcast listeners that have set the
       * NETLINK_RECV_NO_ENOBUFS socket option.
       */
      static inline int genl_set_err(struct genl_family *family, struct net *net,
                                     u32 portid, u32 group, int code)
      {
              if (WARN_ON_ONCE(group >= family->n_mcgrps))
                      return -EINVAL;
              group = family->mcgrp_offset + group;
              return netlink_set_err(net->genl_sock, portid, group, code);
      }
      
      static inline int genl_has_listeners(struct genl_family *family,
                                           struct net *net, unsigned int group)
      {
              if (WARN_ON_ONCE(group >= family->n_mcgrps))
                      return -EINVAL;
              group = family->mcgrp_offset + group;
              return netlink_has_listeners(net->genl_sock, group);
      }
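
       /*
        * Example (sketch): callers commonly gate expensive message construction
        * on genl_has_listeners().  example_family and EXAMPLE_MCGRP are
        * hypothetical; substitute the real family and group offset.
        */
       #if 0
       static bool example_should_notify(struct net *net)
       {
               /* > 0 when at least one socket is bound to the multicast group */
               return genl_has_listeners(&example_family, net, EXAMPLE_MCGRP) > 0;
       }
       #endif
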
      #endif        /* __NET_GENERIC_NETLINK_H */
      #ifndef __NET_RTNETLINK_H
      #define __NET_RTNETLINK_H
      
      #include <linux/rtnetlink.h>
      #include <net/netlink.h>
      
      typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *);
      typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
      typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *);
      
      int __rtnl_register(int protocol, int msgtype,
                          rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func);
      void rtnl_register(int protocol, int msgtype,
                         rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func);
      int rtnl_unregister(int protocol, int msgtype);
      void rtnl_unregister_all(int protocol);
      
      static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
      {
              if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg))
    8                 return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family;
              else
                      return AF_UNSPEC;
      }
      
      /**
       *        struct rtnl_link_ops - rtnetlink link operations
       *
       *        @list: Used internally
       *        @kind: Identifier
       *        @maxtype: Highest device specific netlink attribute number
       *        @policy: Netlink policy for device specific attribute validation
       *        @validate: Optional validation function for netlink/changelink parameters
       *        @priv_size: sizeof net_device private space
       *        @setup: net_device setup function
       *        @newlink: Function for configuring and registering a new device
       *        @changelink: Function for changing parameters of an existing device
       *        @dellink: Function to remove a device
       *        @get_size: Function to calculate required room for dumping device
       *                   specific netlink attributes
       *        @fill_info: Function to dump device specific netlink attributes
       *        @get_xstats_size: Function to calculate required room for dumping device
       *                          specific statistics
       *        @fill_xstats: Function to dump device specific statistics
       *        @get_num_tx_queues: Function to determine number of transmit queues
       *                            to create when creating a new device.
       *        @get_num_rx_queues: Function to determine number of receive queues
       *                            to create when creating a new device.
       *        @get_link_net: Function to get the i/o netns of the device
       */
      struct rtnl_link_ops {
              struct list_head        list;
      
              const char                *kind;
      
              size_t                        priv_size;
              void                        (*setup)(struct net_device *dev);
      
              int                        maxtype;
              const struct nla_policy        *policy;
              int                        (*validate)(struct nlattr *tb[],
                                                  struct nlattr *data[]);
      
              int                        (*newlink)(struct net *src_net,
                                                 struct net_device *dev,
                                                 struct nlattr *tb[],
                                                 struct nlattr *data[]);
              int                        (*changelink)(struct net_device *dev,
                                                    struct nlattr *tb[],
                                                    struct nlattr *data[]);
              void                        (*dellink)(struct net_device *dev,
                                                 struct list_head *head);
      
              size_t                        (*get_size)(const struct net_device *dev);
              int                        (*fill_info)(struct sk_buff *skb,
                                                   const struct net_device *dev);
      
              size_t                        (*get_xstats_size)(const struct net_device *dev);
              int                        (*fill_xstats)(struct sk_buff *skb,
                                                     const struct net_device *dev);
              unsigned int                (*get_num_tx_queues)(void);
              unsigned int                (*get_num_rx_queues)(void);
      
              int                        slave_maxtype;
              const struct nla_policy        *slave_policy;
              int                        (*slave_validate)(struct nlattr *tb[],
                                                        struct nlattr *data[]);
              int                        (*slave_changelink)(struct net_device *dev,
                                                          struct net_device *slave_dev,
                                                          struct nlattr *tb[],
                                                          struct nlattr *data[]);
              size_t                        (*get_slave_size)(const struct net_device *dev,
                                                        const struct net_device *slave_dev);
              int                        (*fill_slave_info)(struct sk_buff *skb,
                                                         const struct net_device *dev,
                                                         const struct net_device *slave_dev);
              struct net                *(*get_link_net)(const struct net_device *dev);
      };
      
      int __rtnl_link_register(struct rtnl_link_ops *ops);
      void __rtnl_link_unregister(struct rtnl_link_ops *ops);
      
      int rtnl_link_register(struct rtnl_link_ops *ops);
      void rtnl_link_unregister(struct rtnl_link_ops *ops);
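
       /*
        * Example (illustrative sketch): a minimal rtnl_link_ops registration in
        * the style of drivers/net/dummy.c.  example_setup is a hypothetical
        * net_device setup routine supplied by the driver.
        */
       #if 0
       static struct rtnl_link_ops example_link_ops __read_mostly = {
               .kind  = "example",
               .setup = example_setup,
       };
       
       static int __init example_init(void)
       {
               /* makes "ip link add type example" resolve to this driver */
               return rtnl_link_register(&example_link_ops);
       }
       
       static void __exit example_exit(void)
       {
               rtnl_link_unregister(&example_link_ops);
       }
       
       module_init(example_init);
       module_exit(example_exit);
       MODULE_ALIAS_RTNL_LINK("example");
       #endif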
      
      /**
        *        struct rtnl_af_ops - rtnetlink address family operations
        *
        *        @list: Used internally
        *        @family: Address family
        *        @fill_link_af: Function to fill IFLA_AF_SPEC with address family
        *                       specific netlink attributes.
        *        @get_link_af_size: Function to calculate size of address family
        *                           specific netlink attributes.
        *        @validate_link_af: Validate an IFLA_AF_SPEC attribute; must check
        *                           attr for invalid configuration settings.
        *        @set_link_af: Function to parse an IFLA_AF_SPEC attribute and
        *                      modify the net_device accordingly.
       */
      struct rtnl_af_ops {
              struct list_head        list;
              int                        family;
      
              int                        (*fill_link_af)(struct sk_buff *skb,
                                                      const struct net_device *dev,
                                                      u32 ext_filter_mask);
              size_t                        (*get_link_af_size)(const struct net_device *dev,
                                                          u32 ext_filter_mask);
      
              int                        (*validate_link_af)(const struct net_device *dev,
                                                          const struct nlattr *attr);
              int                        (*set_link_af)(struct net_device *dev,
                                                     const struct nlattr *attr);
      };
      
      void __rtnl_af_unregister(struct rtnl_af_ops *ops);
      
      void rtnl_af_register(struct rtnl_af_ops *ops);
      void rtnl_af_unregister(struct rtnl_af_ops *ops);
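
       /*
        * Example (sketch): registering per-address-family IFLA_AF_SPEC handlers.
        * The fill/size callbacks are hypothetical functions provided by the
        * address family's own code.
        */
       #if 0
       static struct rtnl_af_ops example_af_ops __read_mostly = {
               .family           = AF_INET,
               .fill_link_af     = example_fill_link_af,
               .get_link_af_size = example_get_link_af_size,
       };
       
       static int __init example_af_init(void)
       {
               rtnl_af_register(&example_af_ops);
               return 0;
       }
       #endif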
      
      struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
      struct net_device *rtnl_create_link(struct net *net, const char *ifname,
                                          unsigned char name_assign_type,
                                          const struct rtnl_link_ops *ops,
                                          struct nlattr *tb[]);
      int rtnl_delete_link(struct net_device *dev);
      int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
      
      int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len);
      
      #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
      
      #endif
      /*
       *  linux/kernel/time.c
       *
       *  Copyright (C) 1991, 1992  Linus Torvalds
       *
       *  This file contains the interface functions for the various
       *  time related system calls: time, stime, gettimeofday, settimeofday,
       *                               adjtime
       */
      /*
       * Modification history kernel/time.c
       *
       * 1993-09-02    Philip Gladstone
       *      Created file with time related functions from sched/core.c and adjtimex()
       * 1993-10-08    Torsten Duwe
       *      adjtime interface update and CMOS clock write code
       * 1995-08-13    Torsten Duwe
       *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
       * 1999-01-16    Ulrich Windl
       *        Introduced error checking for many cases in adjtimex().
       *        Updated NTP code according to technical memorandum Jan '96
       *        "A Kernel Model for Precision Timekeeping" by Dave Mills
       *        Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
       *        (Even though the technical memorandum forbids it)
       * 2004-07-14         Christoph Lameter
       *        Added getnstimeofday to allow the posix timer functions to return
       *        with nanosecond accuracy
       */
      
      #include <linux/export.h>
      #include <linux/kernel.h>
      #include <linux/timex.h>
      #include <linux/capability.h>
      #include <linux/timekeeper_internal.h>
      #include <linux/errno.h>
      #include <linux/syscalls.h>
      #include <linux/security.h>
      #include <linux/fs.h>
      #include <linux/math64.h>
      #include <linux/ptrace.h>
      
      #include <asm/uaccess.h>
      #include <asm/unistd.h>
      
      #include <generated/timeconst.h>
      #include "timekeeping.h"
      
      /*
       * The timezone where the local system is located.  Used as a default by some
       * programs who obtain this value by using gettimeofday.
       */
      struct timezone sys_tz;
      
      EXPORT_SYMBOL(sys_tz);
      
      #ifdef __ARCH_WANT_SYS_TIME
      
      /*
       * sys_time() can be implemented in user-level using
       * sys_gettimeofday().  Is this for backwards compatibility?  If so,
       * why not move it into the appropriate arch directory (for those
       * architectures that need it).
       */
      SYSCALL_DEFINE1(time, time_t __user *, tloc)
      {
              time_t i = get_seconds();
      
              if (tloc) {
                      if (put_user(i,tloc))
                              return -EFAULT;
              }
              force_successful_syscall_return();
              return i;
      }
      
      /*
       * sys_stime() can be implemented in user-level using
       * sys_settimeofday().  Is this for backwards compatibility?  If so,
       * why not move it into the appropriate arch directory (for those
       * architectures that need it).
       */
      
      SYSCALL_DEFINE1(stime, time_t __user *, tptr)
      {
              struct timespec tv;
              int err;
      
              if (get_user(tv.tv_sec, tptr))
                      return -EFAULT;
      
              tv.tv_nsec = 0;
      
              err = security_settime(&tv, NULL);
              if (err)
                      return err;
      
              do_settimeofday(&tv);
              return 0;
      }
      
      #endif /* __ARCH_WANT_SYS_TIME */
      
      SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
                      struct timezone __user *, tz)
      {
              if (likely(tv != NULL)) {
                      struct timeval ktv;
                      do_gettimeofday(&ktv);
                      if (copy_to_user(tv, &ktv, sizeof(ktv)))
                              return -EFAULT;
              }
              if (unlikely(tz != NULL)) {
                      if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                              return -EFAULT;
              }
              return 0;
      }
      
      /*
       * Indicates if there is an offset between the system clock and the hardware
       * clock/persistent clock/rtc.
       */
      int persistent_clock_is_local;
      
      /*
       * Adjust the time obtained from the CMOS to be UTC time instead of
       * local time.
       *
       * This is ugly, but preferable to the alternatives.  Otherwise we
       * would either need to write a program to do it in /etc/rc (and risk
       * confusion if the program gets run more than once; it would also be
       * hard to make the program warp the clock precisely n hours)  or
       * compile in the timezone information into the kernel.  Bad, bad....
       *
       *                                                - TYT, 1992-01-01
       *
       * The best thing to do is to keep the CMOS clock in universal time (UTC)
       * as real UNIX machines always do it. This avoids all headaches about
       * daylight saving times and warping kernel clocks.
       */
      static inline void warp_clock(void)
      {
              if (sys_tz.tz_minuteswest != 0) {
                      struct timespec adjust;
      
                      persistent_clock_is_local = 1;
                      adjust.tv_sec = sys_tz.tz_minuteswest * 60;
                      adjust.tv_nsec = 0;
                      timekeeping_inject_offset(&adjust);
              }
      }
      
      /*
       * In case for some reason the CMOS clock has not already been running
       * in UTC, but in some local time: The first time we set the timezone,
       * we will warp the clock so that it is ticking UTC time instead of
       * local time. Presumably, if someone is setting the timezone then we
       * are running in an environment where the programs understand about
       * timezones. This should be done at boot time in the /etc/rc script,
       * as soon as possible, so that the clock can be set right. Otherwise,
       * various programs will get confused when the clock gets warped.
       */
      
    3 int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
      {
              static int firsttime = 1;
              int error = 0;
      
    3         if (tv && !timespec_valid(tv))
                      return -EINVAL;
      
    1         error = security_settime(tv, tz);
              if (error)
                      return error;
      
              if (tz) {
                       /* Verify we're within the +-15 hrs range */
                      if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
                              return -EINVAL;
      
                      sys_tz = *tz;
                      update_vsyscall_tz();
                      if (firsttime) {
                              firsttime = 0;
                              if (!tv)
                                      warp_clock();
                      }
              }
              if (tv)
    3                 return do_settimeofday(tv);
              return 0;
      }
      
      SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
                      struct timezone __user *, tz)
      {
              struct timeval user_tv;
              struct timespec        new_ts;
              struct timezone new_tz;
      
              if (tv) {
                      if (copy_from_user(&user_tv, tv, sizeof(*tv)))
                              return -EFAULT;
      
                      if (!timeval_valid(&user_tv))
                              return -EINVAL;
      
                      new_ts.tv_sec = user_tv.tv_sec;
                      new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
              }
              if (tz) {
                      if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                              return -EFAULT;
              }
      
              return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
      }
      
      SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
      {
              struct timex txc;                /* Local copy of parameter */
              int ret;
      
              /* Copy the user data space into the kernel copy
               * structure. But bear in mind that the structures
               * may change
               */
              if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
                      return -EFAULT;
              ret = do_adjtimex(&txc);
              return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
      }
      
      /**
       * current_fs_time - Return FS time
       * @sb: Superblock.
       *
       * Return the current time truncated to the time granularity supported by
       * the fs.
       */
      struct timespec current_fs_time(struct super_block *sb)
      {
 2308         struct timespec now = current_kernel_time();
              return timespec_trunc(now, sb->s_time_gran);
      }
      EXPORT_SYMBOL(current_fs_time);
      
      /*
       * Convert jiffies to milliseconds and back.
       *
       * Avoid unnecessary multiplications/divisions in the
       * two most common HZ cases:
       */
      unsigned int jiffies_to_msecs(const unsigned long j)
      {
      #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
  494         return (MSEC_PER_SEC / HZ) * j;
      #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
              return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
      #else
      # if BITS_PER_LONG == 32
              return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
                     HZ_TO_MSEC_SHR32;
      # else
              return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
      # endif
      #endif
      }
      EXPORT_SYMBOL(jiffies_to_msecs);
      
      unsigned int jiffies_to_usecs(const unsigned long j)
      {
              /*
                * HZ usually doesn't go much higher than MSEC_PER_SEC.
               * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
               */
              BUILD_BUG_ON(HZ > USEC_PER_SEC);
      
      #if !(USEC_PER_SEC % HZ)
  533         return (USEC_PER_SEC / HZ) * j;
      #else
      # if BITS_PER_LONG == 32
              return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
      # else
              return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
      # endif
      #endif
      }
      EXPORT_SYMBOL(jiffies_to_usecs);
      
      /**
       * timespec_trunc - Truncate timespec to a granularity
       * @t: Timespec
       * @gran: Granularity in ns.
       *
       * Truncate a timespec to a granularity. Always rounds down. gran must
       * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
       */
 2308 struct timespec timespec_trunc(struct timespec t, unsigned gran)
      {
              /* Avoid division in the common cases 1 ns and 1 s. */
              if (gran == 1) {
                      /* nothing */
              } else if (gran == NSEC_PER_SEC) {
                      t.tv_nsec = 0;
              } else if (gran > 1 && gran < NSEC_PER_SEC) {
                      t.tv_nsec -= t.tv_nsec % gran;
              } else {
                      WARN(1, "illegal file time granularity: %u", gran);
              }
 2308         return t;
      }
      EXPORT_SYMBOL(timespec_trunc);
      
      /*
       * mktime64 - Converts date to seconds.
       * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
       * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
       * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
       *
       * [For the Julian calendar (which was used in Russia before 1917,
       * Britain & colonies before 1752, anywhere else before 1582,
       * and is still in use by some communities) leave out the
       * -year/100+year/400 terms, and add 10.]
       *
       * This algorithm was first published by Gauss (I think).
       */
      time64_t mktime64(const unsigned int year0, const unsigned int mon0,
                      const unsigned int day, const unsigned int hour,
                      const unsigned int min, const unsigned int sec)
      {
              unsigned int mon = mon0, year = year0;
      
              /* 1..12 -> 11,12,1..10 */
   12         if (0 >= (int) (mon -= 2)) {
    2                 mon += 12;        /* Puts Feb last since it has leap day */
                      year -= 1;
              }
      
              return ((((time64_t)
   12                   (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                        year*365 - 719499
                  )*24 + hour /* now have hours */
                )*60 + min /* now have minutes */
              )*60 + sec; /* finally seconds */
      }
      EXPORT_SYMBOL(mktime64);
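
       /*
        * Worked example: 2000-01-01 00:00:00 UTC.  mon0 == 1 is shifted to
        * mon == 11 of year 1999, the day count comes out to 10957, and
        * 10957 * 86400 == 946684800, the familiar Y2K epoch offset.
        */
       #if 0
       WARN_ON(mktime64(2000, 1, 1, 0, 0, 0) != 946684800);
       #endif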
      
      /**
       * set_normalized_timespec - set timespec sec and nsec parts and normalize
       *
       * @ts:                pointer to timespec variable to be set
       * @sec:        seconds to set
       * @nsec:        nanoseconds to set
       *
       * Set seconds and nanoseconds field of a timespec variable and
       * normalize to the timespec storage format
       *
       * Note: The tv_nsec part is always in the range of
       *        0 <= tv_nsec < NSEC_PER_SEC
       * For negative values only the tv_sec field is negative !
       */
      void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
      {
  215         while (nsec >= NSEC_PER_SEC) {
                      /*
                       * The following asm() prevents the compiler from
                       * optimising this loop into a modulo operation. See
                       * also __iter_div_u64_rem() in include/linux/time.h
                       */
   98                 asm("" : "+rm"(nsec));
                      nsec -= NSEC_PER_SEC;
                      ++sec;
              }
  220         while (nsec < 0) {
  175                 asm("" : "+rm"(nsec));
                      nsec += NSEC_PER_SEC;
                      --sec;
              }
  215         ts->tv_sec = sec;
              ts->tv_nsec = nsec;
      }
      EXPORT_SYMBOL(set_normalized_timespec);
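
       /*
        * Example: the nanosecond part is always folded into [0, NSEC_PER_SEC),
        * with any excess or deficit carried into tv_sec.
        */
       #if 0
       struct timespec ts;
       
       set_normalized_timespec(&ts, 5, -1);                 /* -> 4 s, 999999999 ns */
       set_normalized_timespec(&ts, 1, NSEC_PER_SEC + 1);   /* -> 2 s, 1 ns */
       #endif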
      
      /**
       * ns_to_timespec - Convert nanoseconds to timespec
       * @nsec:       the nanoseconds value to be converted
       *
       * Returns the timespec representation of the nsec parameter.
       */
  176 struct timespec ns_to_timespec(const s64 nsec)
      {
              struct timespec ts;
              s32 rem;
      
  179         if (!nsec)
   36                 return (struct timespec) {0, 0};
      
  156         ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
              if (unlikely(rem < 0)) {
                      ts.tv_sec--;
    2                 rem += NSEC_PER_SEC;
              }
  176         ts.tv_nsec = rem;
      
              return ts;
      }
      EXPORT_SYMBOL(ns_to_timespec);
      
      /**
       * ns_to_timeval - Convert nanoseconds to timeval
       * @nsec:       the nanoseconds value to be converted
       *
       * Returns the timeval representation of the nsec parameter.
       */
      struct timeval ns_to_timeval(const s64 nsec)
      {
   36         struct timespec ts = ns_to_timespec(nsec);
              struct timeval tv;
      
              tv.tv_sec = ts.tv_sec;
              tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
      
   36         return tv;
      }
      EXPORT_SYMBOL(ns_to_timeval);
      
      #if BITS_PER_LONG == 32
      /**
        * set_normalized_timespec64 - set timespec64 sec and nsec parts and normalize
        *
        * @ts:                pointer to timespec64 variable to be set
        * @sec:        seconds to set
        * @nsec:        nanoseconds to set
        *
        * Set seconds and nanoseconds field of a timespec64 variable and
        * normalize to the timespec64 storage format
       *
       * Note: The tv_nsec part is always in the range of
       *        0 <= tv_nsec < NSEC_PER_SEC
       * For negative values only the tv_sec field is negative !
       */
      void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
      {
              while (nsec >= NSEC_PER_SEC) {
                      /*
                       * The following asm() prevents the compiler from
                       * optimising this loop into a modulo operation. See
                       * also __iter_div_u64_rem() in include/linux/time.h
                       */
                      asm("" : "+rm"(nsec));
                      nsec -= NSEC_PER_SEC;
                      ++sec;
              }
              while (nsec < 0) {
                      asm("" : "+rm"(nsec));
                      nsec += NSEC_PER_SEC;
                      --sec;
              }
              ts->tv_sec = sec;
              ts->tv_nsec = nsec;
      }
      EXPORT_SYMBOL(set_normalized_timespec64);
      
      /**
       * ns_to_timespec64 - Convert nanoseconds to timespec64
       * @nsec:       the nanoseconds value to be converted
       *
       * Returns the timespec64 representation of the nsec parameter.
       */
      struct timespec64 ns_to_timespec64(const s64 nsec)
      {
              struct timespec64 ts;
              s32 rem;
      
              if (!nsec)
                      return (struct timespec64) {0, 0};
      
              ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
              if (unlikely(rem < 0)) {
                      ts.tv_sec--;
                      rem += NSEC_PER_SEC;
              }
              ts.tv_nsec = rem;
      
              return ts;
      }
      EXPORT_SYMBOL(ns_to_timespec64);
      #endif
      /**
       * msecs_to_jiffies: - convert milliseconds to jiffies
       * @m:        time in milliseconds
       *
       * conversion is done as follows:
       *
       * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
       *
       * - 'too large' values [that would result in larger than
       *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
       *
       * - all other values are converted to jiffies by either multiplying
       *   the input value by a factor or dividing it with a factor and
       *   handling any 32-bit overflows.
       *   for the details see __msecs_to_jiffies()
       *
        * msecs_to_jiffies() checks for the passed-in value being a constant
        * via __builtin_constant_p(), allowing gcc to eliminate most of the
        * code.  __msecs_to_jiffies() is called if the value passed does not
        * allow constant folding and the actual conversion must be done at
        * runtime.
        * The _msecs_to_jiffies() helpers are the HZ-dependent conversion
        * routines found in include/linux/jiffies.h
       */
      unsigned long __msecs_to_jiffies(const unsigned int m)
      {
              /*
                * A negative value means infinite timeout:
               */
  828         if ((int)m < 0)
                      return MAX_JIFFY_OFFSET;
  828         return _msecs_to_jiffies(m);
      }
      EXPORT_SYMBOL(__msecs_to_jiffies);
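
       /*
        * Example (sketch): the usual consumer of these helpers - turning a
        * millisecond timeout into jiffies for the timer/wait APIs.  With
        * HZ == 1000 the result is 100 jiffies; with HZ == 250 it is 25.
        */
       #if 0
       unsigned long timeout = msecs_to_jiffies(100);
       
       schedule_timeout_interruptible(timeout);
       #endif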
      
      unsigned long __usecs_to_jiffies(const unsigned int u)
      {
  411         if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
                      return MAX_JIFFY_OFFSET;
  411         return _usecs_to_jiffies(u);
      }
      EXPORT_SYMBOL(__usecs_to_jiffies);
      
      /*
       * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
       * that a remainder subtract here would not do the right thing as the
        * resolution values don't fall on second boundaries.  I.e. the line:
       * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
       * Note that due to the small error in the multiplier here, this
       * rounding is incorrect for sufficiently large values of tv_nsec, but
       * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
       * OK.
       *
       * Rather, we just shift the bits off the right.
       *
       * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
       * value to a scaled second value.
       */
      static unsigned long
      __timespec64_to_jiffies(u64 sec, long nsec)
      {
   10         nsec = nsec + TICK_NSEC - 1;
      
              if (sec >= MAX_SEC_IN_JIFFIES){
                      sec = MAX_SEC_IN_JIFFIES;
                      nsec = 0;
              }
              return ((sec * SEC_CONVERSION) +
                      (((u64)nsec * NSEC_CONVERSION) >>
                       (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
      
      }
      
      static unsigned long
      __timespec_to_jiffies(unsigned long sec, long nsec)
      {
              return __timespec64_to_jiffies((u64)sec, nsec);
      }
      
      unsigned long
      timespec64_to_jiffies(const struct timespec64 *value)
      {
   10         return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
      }
      EXPORT_SYMBOL(timespec64_to_jiffies);
      
      void
      jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
      {
              /*
               * Convert jiffies to nanoseconds and separate with
               * one divide.
               */
              u32 rem;
   15         value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                          NSEC_PER_SEC, &rem);
              value->tv_nsec = rem;
      }
      EXPORT_SYMBOL(jiffies_to_timespec64);
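
       /*
        * Example: with HZ == 100 (TICK_NSEC == 10000000), a 1.5 s timespec64
        * converts to exactly 150 jiffies, and converting back recovers
        * { 1, 500000000 } since 1.5 s is a whole number of ticks.
        */
       #if 0
       struct timespec64 ts = { .tv_sec = 1, .tv_nsec = 500000000 };
       unsigned long j = timespec64_to_jiffies(&ts);
       
       jiffies_to_timespec64(j, &ts);
       #endif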
      
      /*
       * We could use a similar algorithm to timespec_to_jiffies (with a
       * different multiplier for usec instead of nsec). But this has a
       * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
       * usec value, since it's not necessarily integral.
       *
       * We could instead round in the intermediate scaled representation
       * (i.e. in units of 1/2^(large scale) jiffies) but that's also
       * perilous: the scaling introduces a small positive error, which
       * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
       * units to the intermediate before shifting) leads to accidental
       * overflow and overestimates.
       *
       * At the cost of one additional multiplication by a constant, just
       * use the timespec implementation.
       */
      unsigned long
      timeval_to_jiffies(const struct timeval *value)
      {
    7         return __timespec_to_jiffies(value->tv_sec,
    7                                      value->tv_usec * NSEC_PER_USEC);
      }
      EXPORT_SYMBOL(timeval_to_jiffies);
      
      void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
      {
              /*
               * Convert jiffies to nanoseconds and separate with
               * one divide.
               */
              u32 rem;
      
   24         value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                          NSEC_PER_SEC, &rem);
              value->tv_usec = rem / NSEC_PER_USEC;
      }
      EXPORT_SYMBOL(jiffies_to_timeval);
      
      /*
       * Convert jiffies/jiffies_64 to clock_t and back.
       */
      clock_t jiffies_to_clock_t(unsigned long x)
      {
      #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
      # if HZ < USER_HZ
              return x * (USER_HZ / HZ);
      # else
  667         return x / (HZ / USER_HZ);
      # endif
      #else
              return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
      #endif
      }
      EXPORT_SYMBOL(jiffies_to_clock_t);
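
       /*
        * Example: with HZ == 1000 and USER_HZ == 100 (the common configuration),
        * jiffies_to_clock_t(250) == 25 and clock_t_to_jiffies(25) == 250.
        */
       #if 0
       clock_t c = jiffies_to_clock_t(250);
       unsigned long j = clock_t_to_jiffies(25);
       #endif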
      
      unsigned long clock_t_to_jiffies(unsigned long x)
    4 {
      #if (HZ % USER_HZ)==0
              if (x >= ~0UL / (HZ / USER_HZ))
                      return ~0UL;
              return x * (HZ / USER_HZ);
      #else
              /* Don't worry about loss of precision here .. */
              if (x >= ~0UL / HZ * USER_HZ)
                      return ~0UL;
      
              /* .. but do try to contain it here */
              return div_u64((u64)x * HZ, USER_HZ);
      #endif
      }
      EXPORT_SYMBOL(clock_t_to_jiffies);
      
      u64 jiffies_64_to_clock_t(u64 x)
      {
      #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
      # if HZ < USER_HZ
              x = div_u64(x * USER_HZ, HZ);
      # elif HZ > USER_HZ
              x = div_u64(x, HZ / USER_HZ);
      # else
              /* Nothing to do */
      # endif
      #else
              /*
               * There are better ways that don't overflow early,
               * but even this doesn't overflow in hundreds of years
               * in 64 bits, so..
               */
              x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
      #endif
    2         return x;
      }
      EXPORT_SYMBOL(jiffies_64_to_clock_t);
      
      u64 nsec_to_clock_t(u64 x)
      {
      #if (NSEC_PER_SEC % USER_HZ) == 0
   10         return div_u64(x, NSEC_PER_SEC / USER_HZ);
      #elif (USER_HZ % 512) == 0
              return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
      #else
              /*
               * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
               * overflow after 64.99 years.
               * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
               */
              return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
      #endif
      }
      
      /**
       * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
       *
       * @n:        nsecs in u64
       *
        * Unlike {m,u}secs_to_jiffies(), the input type is u64 rather than
        * unsigned int.  This doesn't return MAX_JIFFY_OFFSET since the function
        * is designed for the scheduler, not for device drivers calculating a
        * timeout value.
       *
       * note:
       *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
       *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
       */
      u64 nsecs_to_jiffies64(u64 n)
      {
      #if (NSEC_PER_SEC % HZ) == 0
              /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
   60         return div_u64(n, NSEC_PER_SEC / HZ);
      #elif (HZ % 512) == 0
              /* overflow after 292 years if HZ = 1024 */
              return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
      #else
              /*
               * Generic case - optimized for cases where HZ is a multiple of 3.
               * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
               */
              return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
      #endif
      }
      EXPORT_SYMBOL(nsecs_to_jiffies64);
      
      /**
       * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
       *
       * @n:        nsecs in u64
       *
        * Unlike {m,u}secs_to_jiffies(), the input type is u64 rather than
        * unsigned int.  This doesn't return MAX_JIFFY_OFFSET since the function
        * is designed for the scheduler, not for device drivers calculating a
        * timeout value.
       *
       * note:
       *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
       *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
       */
      unsigned long nsecs_to_jiffies(u64 n)
      {
   60         return (unsigned long)nsecs_to_jiffies64(n);
      }
      EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
      
      /*
       * Add two timespec values and do a safety check for overflow.
       * It's assumed that both values are valid (>= 0)
       */
      struct timespec timespec_add_safe(const struct timespec lhs,
  212                                   const struct timespec rhs)
      {
              struct timespec res;
      
  212         set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
                                      lhs.tv_nsec + rhs.tv_nsec);
      
  212         if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
                      res.tv_sec = TIME_T_MAX;
      
  212         return res;
      }
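
       /*
        * Example: adding one second to a timespec already at TIME_T_MAX wraps
        * tv_sec negative, which the check above detects and clamps back to
        * TIME_T_MAX instead of returning a bogus time far in the past.
        */
       #if 0
       struct timespec a = { .tv_sec = TIME_T_MAX, .tv_nsec = 0 };
       struct timespec b = { .tv_sec = 1, .tv_nsec = 0 };
       struct timespec sum = timespec_add_safe(a, b);  /* sum.tv_sec == TIME_T_MAX */
       #endif
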
      /*
       * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
       *
       * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
       * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
       * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
       * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
       *
       * These routines maintain argument size conversion between 32bit and 64bit
       * ioctls.
       */
      
      #include <linux/joystick.h>
      
      #include <linux/types.h>
      #include <linux/compat.h>
      #include <linux/kernel.h>
      #include <linux/capability.h>
      #include <linux/compiler.h>
      #include <linux/sched.h>
      #include <linux/smp.h>
      #include <linux/ioctl.h>
      #include <linux/if.h>
      #include <linux/if_bridge.h>
      #include <linux/raid/md_u.h>
      #include <linux/kd.h>
      #include <linux/route.h>
      #include <linux/in6.h>
      #include <linux/ipv6_route.h>
      #include <linux/skbuff.h>
      #include <linux/netlink.h>
      #include <linux/vt.h>
      #include <linux/falloc.h>
      #include <linux/fs.h>
      #include <linux/file.h>
      #include <linux/ppp_defs.h>
      #include <linux/ppp-ioctl.h>
      #include <linux/if_pppox.h>
      #include <linux/mtio.h>
      #include <linux/auto_fs.h>
      #include <linux/auto_fs4.h>
      #include <linux/tty.h>
      #include <linux/vt_kern.h>
      #include <linux/fb.h>
      #include <linux/videodev2.h>
      #include <linux/netdevice.h>
      #include <linux/raw.h>
      #include <linux/blkdev.h>
      #include <linux/elevator.h>
      #include <linux/rtc.h>
      #include <linux/pci.h>
      #include <linux/serial.h>
      #include <linux/if_tun.h>
      #include <linux/ctype.h>
      #include <linux/syscalls.h>
      #include <linux/i2c.h>
      #include <linux/i2c-dev.h>
      #include <linux/atalk.h>
      #include <linux/gfp.h>
      
      #include <net/bluetooth/bluetooth.h>
      #include <net/bluetooth/hci_sock.h>
      #include <net/bluetooth/rfcomm.h>
      
      #include <linux/capi.h>
      #include <linux/gigaset_dev.h>
      
      #ifdef CONFIG_BLOCK
      #include <linux/cdrom.h>
      #include <linux/fd.h>
      #include <scsi/scsi.h>
      #include <scsi/scsi_ioctl.h>
      #include <scsi/sg.h>
      #endif
      
      #include <asm/uaccess.h>
      #include <linux/ethtool.h>
      #include <linux/mii.h>
      #include <linux/if_bonding.h>
      #include <linux/watchdog.h>
      
      #include <linux/soundcard.h>
      #include <linux/lp.h>
      #include <linux/ppdev.h>
      
      #include <linux/atm.h>
      #include <linux/atmarp.h>
      #include <linux/atmclip.h>
      #include <linux/atmdev.h>
      #include <linux/atmioc.h>
      #include <linux/atmlec.h>
      #include <linux/atmmpc.h>
      #include <linux/atmsvc.h>
      #include <linux/atm_tcp.h>
      #include <linux/sonet.h>
      #include <linux/atm_suni.h>
      
      #include <linux/usb.h>
      #include <linux/usbdevice_fs.h>
      #include <linux/nbd.h>
      #include <linux/random.h>
      #include <linux/filter.h>
      
      #include <linux/hiddev.h>
      
      #define __DVB_CORE__
      #include <linux/dvb/audio.h>
      #include <linux/dvb/dmx.h>
      #include <linux/dvb/frontend.h>
      #include <linux/dvb/video.h>
      
      #include <linux/sort.h>
      
      #ifdef CONFIG_SPARC
      #include <asm/fbio.h>
      #endif
      
      static int w_long(unsigned int fd, unsigned int cmd,
                      compat_ulong_t __user *argp)
      {
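               /*
                * Run the native ioctl against a kernel-resident long: widening
                * the address limit with set_fs(KERNEL_DS) lets sys_ioctl()
                * accept &val, and the result is then copied out to the 32-bit
                * user pointer with put_user().
                */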
    1         mm_segment_t old_fs = get_fs();
              int err;
              unsigned long val;
      
              set_fs (KERNEL_DS);
              err = sys_ioctl(fd, cmd, (unsigned long)&val);
              set_fs (old_fs);
    3         if (!err && put_user(val, argp))
                      return -EFAULT;
              return err;
      }
      
      struct compat_video_event {
              int32_t                type;
              compat_time_t        timestamp;
              union {
                      video_size_t size;
                      unsigned int frame_rate;
              } u;
      };
      
      static int do_video_get_event(unsigned int fd, unsigned int cmd,
                      struct compat_video_event __user *up)
      {
              struct video_event kevent;
              mm_segment_t old_fs = get_fs();
              int err;
      
              set_fs(KERNEL_DS);
              err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
              set_fs(old_fs);
      
              if (!err) {
                      err  = put_user(kevent.type, &up->type);
                      err |= put_user(kevent.timestamp, &up->timestamp);
                      err |= put_user(kevent.u.size.w, &up->u.size.w);
                      err |= put_user(kevent.u.size.h, &up->u.size.h);
                      err |= put_user(kevent.u.size.aspect_ratio,
                                      &up->u.size.aspect_ratio);
                      if (err)
                              err = -EFAULT;
              }
      
    8         return err;
      }
      
      struct compat_video_still_picture {
              compat_uptr_t iFrame;
              int32_t size;
      };
      
      static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
              struct compat_video_still_picture __user *up)
      {
              struct video_still_picture __user *up_native;
              compat_uptr_t fp;
              int32_t size;
              int err;
      
    3         err  = get_user(fp, &up->iFrame);
              err |= get_user(size, &up->size);
              if (err)
                      return -EFAULT;
      
              up_native =
    2                 compat_alloc_user_space(sizeof(struct video_still_picture));
      
              err =  put_user(compat_ptr(fp), &up_native->iFrame);
              err |= put_user(size, &up_native->size);
              if (err)
                      return -EFAULT;
      
    2         err = sys_ioctl(fd, cmd, (unsigned long) up_native);
      
              return err;
      }
      
      struct compat_video_spu_palette {
              int length;
              compat_uptr_t palette;
      };
      
      static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
                      struct compat_video_spu_palette __user *up)
      {
              struct video_spu_palette __user *up_native;
              compat_uptr_t palp;
              int length, err;
      
              err  = get_user(palp, &up->palette);
              err |= get_user(length, &up->length);
              if (err)
                      return -EFAULT;
      
              up_native = compat_alloc_user_space(sizeof(struct video_spu_palette));
              err  = put_user(compat_ptr(palp), &up_native->palette);
              err |= put_user(length, &up_native->length);
              if (err)
                      return -EFAULT;
      
              err = sys_ioctl(fd, cmd, (unsigned long) up_native);
      
              return err;
      }
      
      #ifdef CONFIG_BLOCK
      typedef struct sg_io_hdr32 {
              compat_int_t interface_id;        /* [i] 'S' for SCSI generic (required) */
              compat_int_t dxfer_direction;        /* [i] data transfer direction  */
              unsigned char cmd_len;                /* [i] SCSI command length ( <= 16 bytes) */
              unsigned char mx_sb_len;                /* [i] max length to write to sbp */
              unsigned short iovec_count;        /* [i] 0 implies no scatter gather */
              compat_uint_t dxfer_len;                /* [i] byte count of data transfer */
              compat_uint_t dxferp;                /* [i], [*io] points to data transfer memory
                                                    or scatter gather list */
              compat_uptr_t cmdp;                /* [i], [*i] points to command to perform */
              compat_uptr_t sbp;                /* [i], [*o] points to sense_buffer memory */
              compat_uint_t timeout;                /* [i] MAX_UINT->no timeout (unit: millisec) */
              compat_uint_t flags;                /* [i] 0 -> default, see SG_FLAG... */
              compat_int_t pack_id;                /* [i->o] unused internally (normally) */
              compat_uptr_t usr_ptr;                /* [i->o] unused internally */
              unsigned char status;                /* [o] scsi status */
              unsigned char masked_status;        /* [o] shifted, masked scsi status */
              unsigned char msg_status;                /* [o] messaging level data (optional) */
              unsigned char sb_len_wr;                /* [o] byte count actually written to sbp */
              unsigned short host_status;        /* [o] errors from host adapter */
              unsigned short driver_status;        /* [o] errors from software driver */
              compat_int_t resid;                /* [o] dxfer_len - actual_transferred */
              compat_uint_t duration;                /* [o] time taken by cmd (unit: millisec) */
              compat_uint_t info;                /* [o] auxiliary information */
      } sg_io_hdr32_t;  /* 64 bytes long (on sparc32) */
      
      typedef struct sg_iovec32 {
              compat_uint_t iov_base;
              compat_uint_t iov_len;
      } sg_iovec32_t;
      
      static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
      {
              sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
              sg_iovec32_t __user *iov32 = dxferp;
              int i;
      
              for (i = 0; i < iovec_count; i++) {
                      u32 base, len;
      
    1                 if (get_user(base, &iov32[i].iov_base) ||
                          get_user(len, &iov32[i].iov_len) ||
                          put_user(compat_ptr(base), &iov[i].iov_base) ||
                          put_user(len, &iov[i].iov_len))
                              return -EFAULT;
              }
      
              if (put_user(iov, &sgio->dxferp))
                      return -EFAULT;
              return 0;
      }
      
      static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
                              sg_io_hdr32_t __user *sgio32)
      {
              sg_io_hdr_t __user *sgio;
              u16 iovec_count;
              u32 data;
              void __user *dxferp;
              int err;
              int interface_id;
      
    4         if (get_user(interface_id, &sgio32->interface_id))
                      return -EFAULT;
    4         if (interface_id != 'S')
    1                 return sys_ioctl(fd, cmd, (unsigned long)sgio32);
      
    3         if (get_user(iovec_count, &sgio32->iovec_count))
                      return -EFAULT;
      
              {
    3                 void __user *top = compat_alloc_user_space(0);
                      void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
                                             (iovec_count * sizeof(sg_iovec_t)));
                      if (new > top)
                              return -EINVAL;
      
                      sgio = new;
              }
      
              /* Ok, now construct.  */
    3         if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
                               (2 * sizeof(int)) +
                               (2 * sizeof(unsigned char)) +
                               (1 * sizeof(unsigned short)) +
                               (1 * sizeof(unsigned int))))
                      return -EFAULT;
      
    2         if (get_user(data, &sgio32->dxferp))
                      return -EFAULT;
    2         dxferp = compat_ptr(data);
              if (iovec_count) {
    1                 if (sg_build_iovec(sgio, dxferp, iovec_count))
                              return -EFAULT;
              } else {
    1                 if (put_user(dxferp, &sgio->dxferp))
                              return -EFAULT;
              }
      
              {
                      unsigned char __user *cmdp;
                      unsigned char __user *sbp;
      
    1                 if (get_user(data, &sgio32->cmdp))
                              return -EFAULT;
    1                 cmdp = compat_ptr(data);
      
                      if (get_user(data, &sgio32->sbp))
                              return -EFAULT;
    1                 sbp = compat_ptr(data);
      
                      if (put_user(cmdp, &sgio->cmdp) ||
    1                     put_user(sbp, &sgio->sbp))
                              return -EFAULT;
              }
      
    1         if (copy_in_user(&sgio->timeout, &sgio32->timeout,
                               3 * sizeof(int)))
                      return -EFAULT;
      
    1         if (get_user(data, &sgio32->usr_ptr))
                      return -EFAULT;
    1         if (put_user(compat_ptr(data), &sgio->usr_ptr))
                      return -EFAULT;
      
    1         err = sys_ioctl(fd, cmd, (unsigned long) sgio);
      
              if (err >= 0) {
                      void __user *datap;
      
                      if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
                                       sizeof(int)) ||
                          get_user(datap, &sgio->usr_ptr) ||
                          put_user((u32)(unsigned long)datap,
                                   &sgio32->usr_ptr) ||
                          copy_in_user(&sgio32->status, &sgio->status,
                                       (4 * sizeof(unsigned char)) +
                                       (2 * sizeof(unsigned short)) +
                                       (3 * sizeof(int))))
                              err = -EFAULT;
              }
      
              return err;
      }
      
      struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
              char req_state;
              char orphan;
              char sg_io_owned;
              char problem;
              int pack_id;
              compat_uptr_t usr_ptr;
              unsigned int duration;
              int unused;
      };
      
      static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
                              compat_sg_req_info __user *o)
      {
              int err, i;
              sg_req_info_t __user *r;
              r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
              err = sys_ioctl(fd,cmd,(unsigned long)r);
              if (err < 0)
                      return err;
              for (i = 0; i < SG_MAX_QUEUE; i++) {
                      void __user *ptr;
                      int d;
      
                      if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) ||
                          get_user(ptr, &r[i].usr_ptr) ||
                          get_user(d, &r[i].duration) ||
                          put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) ||
                          put_user(d, &o[i].duration))
                              return -EFAULT;
              }
              return err;
      }
      #endif /* CONFIG_BLOCK */
      
      struct sock_fprog32 {
              unsigned short        len;
              compat_caddr_t        filter;
      };
      
      #define PPPIOCSPASS32        _IOW('t', 71, struct sock_fprog32)
      #define PPPIOCSACTIVE32        _IOW('t', 70, struct sock_fprog32)
      
      static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
                              struct sock_fprog32 __user *u_fprog32)
      {
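               /*
                * struct sock_fprog differs between 32- and 64-bit only in the
                * width of the filter pointer, so rebuild a native copy on the
                * compat user stack (compat_alloc_user_space), widen the pointer
                * with compat_ptr(), and forward to the 64-bit ioctl.
                */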
    8         struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
              void __user *fptr64;
              u32 fptr32;
              u16 flen;
      
              if (get_user(flen, &u_fprog32->len) ||
    8             get_user(fptr32, &u_fprog32->filter))
                      return -EFAULT;
      
    8         fptr64 = compat_ptr(fptr32);
      
              if (put_user(flen, &u_fprog64->len) ||
    8             put_user(fptr64, &u_fprog64->filter))
                      return -EFAULT;
      
    8         if (cmd == PPPIOCSPASS32)
                      cmd = PPPIOCSPASS;
              else
                      cmd = PPPIOCSACTIVE;
      
    8         return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
      }
      
      struct ppp_option_data32 {
              compat_caddr_t        ptr;
              u32                        length;
              compat_int_t                transmit;
      };
      #define PPPIOCSCOMPRESS32        _IOW('t', 77, struct ppp_option_data32)
      
      struct ppp_idle32 {
              compat_time_t xmit_idle;
              compat_time_t recv_idle;
      };
      #define PPPIOCGIDLE32                _IOR('t', 63, struct ppp_idle32)
      
      static int ppp_gidle(unsigned int fd, unsigned int cmd,
                      struct ppp_idle32 __user *idle32)
      {
              struct ppp_idle __user *idle;
              __kernel_time_t xmit, recv;
              int err;
      
    1         idle = compat_alloc_user_space(sizeof(*idle));
      
              err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
      
    1         if (!err) {
                      if (get_user(xmit, &idle->xmit_idle) ||
                          get_user(recv, &idle->recv_idle) ||
                          put_user(xmit, &idle32->xmit_idle) ||
                          put_user(recv, &idle32->recv_idle))
                              err = -EFAULT;
              }
              return err;
      }
      
      static int ppp_scompress(unsigned int fd, unsigned int cmd,
              struct ppp_option_data32 __user *odata32)
      {
              struct ppp_option_data __user *odata;
              __u32 data;
              void __user *datap;
      
    2         odata = compat_alloc_user_space(sizeof(*odata));
      
              if (get_user(data, &odata32->ptr))
                      return -EFAULT;
      
    2         datap = compat_ptr(data);
              if (put_user(datap, &odata->ptr))
                      return -EFAULT;
      
    2         if (copy_in_user(&odata->length, &odata32->length,
                               sizeof(__u32) + sizeof(int)))
                      return -EFAULT;
      
    2         return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
      }
      
      #ifdef CONFIG_BLOCK
      struct mtget32 {
              compat_long_t        mt_type;
              compat_long_t        mt_resid;
              compat_long_t        mt_dsreg;
              compat_long_t        mt_gstat;
              compat_long_t        mt_erreg;
              compat_daddr_t        mt_fileno;
              compat_daddr_t        mt_blkno;
      };
      #define MTIOCGET32        _IOR('m', 2, struct mtget32)
      
      struct mtpos32 {
              compat_long_t        mt_blkno;
      };
      #define MTIOCPOS32        _IOR('m', 3, struct mtpos32)
      
      static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
      {
    2         mm_segment_t old_fs = get_fs();
              struct mtget get;
              struct mtget32 __user *umget32;
              struct mtpos pos;
              struct mtpos32 __user *upos32;
              unsigned long kcmd;
              void *karg;
              int err = 0;
      
              switch(cmd) {
              case MTIOCPOS32:
                      kcmd = MTIOCPOS;
                      karg = &pos;
                      break;
              default:        /* MTIOCGET32 */
                      kcmd = MTIOCGET;
                      karg = &get;
                      break;
              }
    2         set_fs (KERNEL_DS);
              err = sys_ioctl (fd, kcmd, (unsigned long)karg);
              set_fs (old_fs);
              if (err)
                      return err;
              switch (cmd) {
              case MTIOCPOS32:
                      upos32 = argp;
                      err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
                      break;
              case MTIOCGET32:
                      umget32 = argp;
                      err = __put_user(get.mt_type, &umget32->mt_type);
                      err |= __put_user(get.mt_resid, &umget32->mt_resid);
                      err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
                      err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
                      err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
                      err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
                      err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
                      break;
              }
              return err ? -EFAULT: 0;
      }
      
      #endif /* CONFIG_BLOCK */
      
      /* Bluetooth ioctls */
      #define HCIUARTSETPROTO                _IOW('U', 200, int)
      #define HCIUARTGETPROTO                _IOR('U', 201, int)
      #define HCIUARTGETDEVICE        _IOR('U', 202, int)
      #define HCIUARTSETFLAGS                _IOW('U', 203, int)
      #define HCIUARTGETFLAGS                _IOR('U', 204, int)
      
      #define BNEPCONNADD        _IOW('B', 200, int)
      #define BNEPCONNDEL        _IOW('B', 201, int)
      #define BNEPGETCONNLIST        _IOR('B', 210, int)
      #define BNEPGETCONNINFO        _IOR('B', 211, int)
      #define BNEPGETSUPPFEAT        _IOR('B', 212, int)
      
      #define CMTPCONNADD        _IOW('C', 200, int)
      #define CMTPCONNDEL        _IOW('C', 201, int)
      #define CMTPGETCONNLIST        _IOR('C', 210, int)
      #define CMTPGETCONNINFO        _IOR('C', 211, int)
      
      #define HIDPCONNADD        _IOW('H', 200, int)
      #define HIDPCONNDEL        _IOW('H', 201, int)
      #define HIDPGETCONNLIST        _IOR('H', 210, int)
      #define HIDPGETCONNINFO        _IOR('H', 211, int)
      
      
      struct serial_struct32 {
              compat_int_t    type;
              compat_int_t    line;
              compat_uint_t   port;
              compat_int_t    irq;
              compat_int_t    flags;
              compat_int_t    xmit_fifo_size;
              compat_int_t    custom_divisor;
              compat_int_t    baud_base;
              unsigned short  close_delay;
              char    io_type;
              char    reserved_char[1];
              compat_int_t    hub6;
              unsigned short  closing_wait; /* time to wait before closing */
              unsigned short  closing_wait2; /* no longer used... */
              compat_uint_t   iomem_base;
              unsigned short  iomem_reg_shift;
              unsigned int    port_high;
           /* compat_ulong_t  iomap_base FIXME */
              compat_int_t    reserved[1];
      };
      
      static int serial_struct_ioctl(unsigned fd, unsigned cmd,
                              struct serial_struct32 __user *ss32)
      {
              typedef struct serial_struct32 SS32;
              int err;
              struct serial_struct ss;
    9         mm_segment_t oldseg = get_fs();
              __u32 udata;
              unsigned int base;
      
              if (cmd == TIOCSSERIAL) {
    8                 if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
                              return -EFAULT;
    8                 if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
                              return -EFAULT;
    7                 if (__get_user(udata, &ss32->iomem_base))
                              return -EFAULT;
    7                 ss.iomem_base = compat_ptr(udata);
                      if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
    7                     __get_user(ss.port_high, &ss32->port_high))
                              return -EFAULT;
    7                 ss.iomap_base = 0UL;
              }
    8         set_fs(KERNEL_DS);
               err = sys_ioctl(fd, cmd, (unsigned long)&ss);
              set_fs(oldseg);
    1         if (cmd == TIOCGSERIAL && err >= 0) {
                      if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
                              return -EFAULT;
                      if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
                              return -EFAULT;
                      base = (unsigned long)ss.iomem_base  >> 32 ?
                              0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
                      if (__put_user(base, &ss32->iomem_base) ||
                          __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
                          __put_user(ss.port_high, &ss32->port_high))
                              return -EFAULT;
              }
              return err;
      }
      
      /*
       * I2C layer ioctls
       */
      
      struct i2c_msg32 {
              u16 addr;
              u16 flags;
              u16 len;
              compat_caddr_t buf;
      };
      
      struct i2c_rdwr_ioctl_data32 {
              compat_caddr_t msgs; /* struct i2c_msg __user *msgs */
              u32 nmsgs;
      };
      
      struct i2c_smbus_ioctl_data32 {
              u8 read_write;
              u8 command;
              u32 size;
              compat_caddr_t data; /* union i2c_smbus_data *data */
      };
      
      struct i2c_rdwr_aligned {
              struct i2c_rdwr_ioctl_data cmd;
              struct i2c_msg msgs[0];
      };
      
      static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
                              struct i2c_rdwr_ioctl_data32    __user *udata)
      {
              struct i2c_rdwr_aligned                __user *tdata;
              struct i2c_msg                        __user *tmsgs;
              struct i2c_msg32                __user *umsgs;
              compat_caddr_t                        datap;
              u32                                nmsgs;
              int                                i;
      
    6         if (get_user(nmsgs, &udata->nmsgs))
                      return -EFAULT;
    5         if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
                      return -EINVAL;
      
    4         if (get_user(datap, &udata->msgs))
                      return -EFAULT;
              umsgs = compat_ptr(datap);
      
              tdata = compat_alloc_user_space(sizeof(*tdata) +
                                            nmsgs * sizeof(struct i2c_msg));
              tmsgs = &tdata->msgs[0];
      
              if (put_user(nmsgs, &tdata->cmd.nmsgs) ||
    4             put_user(tmsgs, &tdata->cmd.msgs))
                      return -EFAULT;
      
    4         for (i = 0; i < nmsgs; i++) {
    3                 if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16)))
                              return -EFAULT;
    2                 if (get_user(datap, &umsgs[i].buf) ||
    2                     put_user(compat_ptr(datap), &tmsgs[i].buf))
                              return -EFAULT;
              }
    3         return sys_ioctl(fd, cmd, (unsigned long)tdata);
      }
      
      static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
                              struct i2c_smbus_ioctl_data32   __user *udata)
      {
              struct i2c_smbus_ioctl_data        __user *tdata;
              compat_caddr_t                        datap;
      
    2         tdata = compat_alloc_user_space(sizeof(*tdata));
              if (tdata == NULL)
                      return -ENOMEM;
    2         if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata)))
                      return -EFAULT;
      
    2         if (!access_ok(VERIFY_READ, udata, sizeof(*udata)))
                      return -EFAULT;
      
    2         if (__copy_in_user(&tdata->read_write, &udata->read_write, 2 * sizeof(u8)))
                      return -EFAULT;
    2         if (__copy_in_user(&tdata->size, &udata->size, 2 * sizeof(u32)))
                      return -EFAULT;
    2         if (__get_user(datap, &udata->data) ||
    2             __put_user(compat_ptr(datap), &tdata->data))
                      return -EFAULT;
      
    2         return sys_ioctl(fd, cmd, (unsigned long)tdata);
      }
      
      #define RTC_IRQP_READ32                _IOR('p', 0x0b, compat_ulong_t)
      #define RTC_IRQP_SET32                _IOW('p', 0x0c, compat_ulong_t)
      #define RTC_EPOCH_READ32        _IOR('p', 0x0d, compat_ulong_t)
      #define RTC_EPOCH_SET32                _IOW('p', 0x0e, compat_ulong_t)
      
      static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
      {
    5         mm_segment_t oldfs = get_fs();
              compat_ulong_t val32;
              unsigned long kval;
              int ret;
      
              switch (cmd) {
              case RTC_IRQP_READ32:
              case RTC_EPOCH_READ32:
                      set_fs(KERNEL_DS);
    2                 ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
                                              RTC_IRQP_READ : RTC_EPOCH_READ,
                                              (unsigned long)&kval);
                      set_fs(oldfs);
    1                 if (ret)
                              return ret;
    1                 val32 = kval;
                      return put_user(val32, (unsigned int __user *)argp);
              case RTC_IRQP_SET32:
    2                 return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
              case RTC_EPOCH_SET32:
    1                 return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
              }
      
              return -ENOIOCTLCMD;
      }
      
      /* on ia32 l_start is on a 32-bit boundary */
      #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
      struct space_resv_32 {
              __s16                l_type;
              __s16                l_whence;
              __s64                l_start        __attribute__((packed));
                              /* len == 0 means until end of file */
              __s64                l_len __attribute__((packed));
              __s32                l_sysid;
              __u32                l_pid;
              __s32                l_pad[4];        /* reserve area */
      };
      
      #define FS_IOC_RESVSP_32                _IOW ('X', 40, struct space_resv_32)
      #define FS_IOC_RESVSP64_32        _IOW ('X', 42, struct space_resv_32)
      
      /* just account for different alignment */
      static int compat_ioctl_preallocate(struct file *file,
                              struct space_resv_32    __user *p32)
      {
              struct space_resv        __user *p = compat_alloc_user_space(sizeof(*p));
      
              if (copy_in_user(&p->l_type,        &p32->l_type,        sizeof(s16)) ||
  180             copy_in_user(&p->l_whence,        &p32->l_whence, sizeof(s16)) ||
  180             copy_in_user(&p->l_start,        &p32->l_start,        sizeof(s64)) ||
  180             copy_in_user(&p->l_len,        &p32->l_len,        sizeof(s64)) ||
  180             copy_in_user(&p->l_sysid,        &p32->l_sysid,        sizeof(s32)) ||
  180             copy_in_user(&p->l_pid,        &p32->l_pid,        sizeof(u32)) ||
  180             copy_in_user(&p->l_pad,        &p32->l_pad,        4*sizeof(u32)))
                      return -EFAULT;
      
  181         return ioctl_preallocate(file, p);
      }
      #endif
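       /*
        * An illustrative sketch, not taken from this file: the reason the
        * fields above are copied one at a time is that l_start sits at a
        * different offset in the two layouts.  In the packed 32-bit struct
        * it follows the two __s16 fields directly, while the native
        * struct space_resv aligns it to 8 bytes.  The offsets asserted
        * below are what those assumptions imply; the block is guarded out
        * and only meant as a demonstration.
        */
       #if 0	/* illustrative sketch, not built */
       static void space_resv_layout_example(void)
       {
               BUILD_BUG_ON(offsetof(struct space_resv_32, l_start) != 4);
               BUILD_BUG_ON(offsetof(struct space_resv, l_start) != 8);
       }
       #endif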
      
      /*
       * simple reversible transform to make our table more evenly
       * distributed after sorting.
       */
      #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
      
      #define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
       /* The ioctl should not be warned about even if it's not implemented.
          Valid reasons to use this:
          - It is implemented with ->compat_ioctl on some device, but programs
            call it on others too.
          - The ioctl is not implemented in the native kernel, but programs
            call it commonly anyway.
          Most other reasons are not valid. */
      #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
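       /*
        * An illustrative sketch, not taken from this file: entries in the
        * table below are stored as XFORM(cmd) and sorted once at boot (see
        * init_sys32_ioctl further down), so a lookup transforms the incoming
        * command the same way and searches the sorted array.  The three-entry
        * table and helper below are hypothetical, and a linear scan is used
        * for brevity where compat_ioctl_check_table() interpolates.
        */
       #if 0	/* illustrative sketch, not built */
       static const unsigned int example_table[] = {
               XFORM((u32)TCGETS), XFORM((u32)FIONREAD), XFORM((u32)FIGETBSZ),
       };

       static bool example_is_compatible(unsigned int cmd)
       {
               unsigned int xcmd = XFORM(cmd);
               int i;

               for (i = 0; i < ARRAY_SIZE(example_table); i++)
                       if (example_table[i] == xcmd)
                               return true;
               return false;
       }
       #endif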
      
      static unsigned int ioctl_pointer[] = {
      /* compatible ioctls first */
      COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
      COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
      
      /* Big T */
      COMPATIBLE_IOCTL(TCGETA)
      COMPATIBLE_IOCTL(TCSETA)
      COMPATIBLE_IOCTL(TCSETAW)
      COMPATIBLE_IOCTL(TCSETAF)
      COMPATIBLE_IOCTL(TCSBRK)
      COMPATIBLE_IOCTL(TCXONC)
      COMPATIBLE_IOCTL(TCFLSH)
      COMPATIBLE_IOCTL(TCGETS)
      COMPATIBLE_IOCTL(TCSETS)
      COMPATIBLE_IOCTL(TCSETSW)
      COMPATIBLE_IOCTL(TCSETSF)
      COMPATIBLE_IOCTL(TIOCLINUX)
      COMPATIBLE_IOCTL(TIOCSBRK)
      COMPATIBLE_IOCTL(TIOCGDEV)
      COMPATIBLE_IOCTL(TIOCCBRK)
      COMPATIBLE_IOCTL(TIOCGSID)
      COMPATIBLE_IOCTL(TIOCGICOUNT)
      COMPATIBLE_IOCTL(TIOCGPKT)
      COMPATIBLE_IOCTL(TIOCGPTLCK)
      COMPATIBLE_IOCTL(TIOCGEXCL)
      /* Little t */
      COMPATIBLE_IOCTL(TIOCGETD)
      COMPATIBLE_IOCTL(TIOCSETD)
      COMPATIBLE_IOCTL(TIOCEXCL)
      COMPATIBLE_IOCTL(TIOCNXCL)
      COMPATIBLE_IOCTL(TIOCCONS)
      COMPATIBLE_IOCTL(TIOCGSOFTCAR)
      COMPATIBLE_IOCTL(TIOCSSOFTCAR)
      COMPATIBLE_IOCTL(TIOCSWINSZ)
      COMPATIBLE_IOCTL(TIOCGWINSZ)
      COMPATIBLE_IOCTL(TIOCMGET)
      COMPATIBLE_IOCTL(TIOCMBIC)
      COMPATIBLE_IOCTL(TIOCMBIS)
      COMPATIBLE_IOCTL(TIOCMSET)
      COMPATIBLE_IOCTL(TIOCPKT)
      COMPATIBLE_IOCTL(TIOCNOTTY)
      COMPATIBLE_IOCTL(TIOCSTI)
      COMPATIBLE_IOCTL(TIOCOUTQ)
      COMPATIBLE_IOCTL(TIOCSPGRP)
      COMPATIBLE_IOCTL(TIOCGPGRP)
      COMPATIBLE_IOCTL(TIOCGPTN)
      COMPATIBLE_IOCTL(TIOCSPTLCK)
      COMPATIBLE_IOCTL(TIOCSERGETLSR)
      COMPATIBLE_IOCTL(TIOCSIG)
      #ifdef TIOCSRS485
      COMPATIBLE_IOCTL(TIOCSRS485)
      #endif
      #ifdef TIOCGRS485
      COMPATIBLE_IOCTL(TIOCGRS485)
      #endif
      #ifdef TCGETS2
      COMPATIBLE_IOCTL(TCGETS2)
      COMPATIBLE_IOCTL(TCSETS2)
      COMPATIBLE_IOCTL(TCSETSW2)
      COMPATIBLE_IOCTL(TCSETSF2)
      #endif
      /* Little f */
      COMPATIBLE_IOCTL(FIOCLEX)
      COMPATIBLE_IOCTL(FIONCLEX)
      COMPATIBLE_IOCTL(FIOASYNC)
      COMPATIBLE_IOCTL(FIONBIO)
      COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
      COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
      /* 0x00 */
      COMPATIBLE_IOCTL(FIBMAP)
      COMPATIBLE_IOCTL(FIGETBSZ)
      /* 'X' - originally XFS but some now in the VFS */
      COMPATIBLE_IOCTL(FIFREEZE)
      COMPATIBLE_IOCTL(FITHAW)
      COMPATIBLE_IOCTL(FITRIM)
      COMPATIBLE_IOCTL(KDGETKEYCODE)
      COMPATIBLE_IOCTL(KDSETKEYCODE)
      COMPATIBLE_IOCTL(KDGKBTYPE)
      COMPATIBLE_IOCTL(KDGETMODE)
      COMPATIBLE_IOCTL(KDGKBMODE)
      COMPATIBLE_IOCTL(KDGKBMETA)
      COMPATIBLE_IOCTL(KDGKBENT)
      COMPATIBLE_IOCTL(KDSKBENT)
      COMPATIBLE_IOCTL(KDGKBSENT)
      COMPATIBLE_IOCTL(KDSKBSENT)
      COMPATIBLE_IOCTL(KDGKBDIACR)
      COMPATIBLE_IOCTL(KDSKBDIACR)
      COMPATIBLE_IOCTL(KDGKBDIACRUC)
      COMPATIBLE_IOCTL(KDSKBDIACRUC)
      COMPATIBLE_IOCTL(KDKBDREP)
      COMPATIBLE_IOCTL(KDGKBLED)
      COMPATIBLE_IOCTL(KDGETLED)
      #ifdef CONFIG_BLOCK
      /* Big S */
      COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
      COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
      COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK)
      COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY)
      COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
      COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
      COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
      COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
      #endif
      /* Big V (don't complain on serial console) */
      IGNORE_IOCTL(VT_OPENQRY)
      IGNORE_IOCTL(VT_GETMODE)
      /* Little p (/dev/rtc, /dev/envctrl, etc.) */
      COMPATIBLE_IOCTL(RTC_AIE_ON)
      COMPATIBLE_IOCTL(RTC_AIE_OFF)
      COMPATIBLE_IOCTL(RTC_UIE_ON)
      COMPATIBLE_IOCTL(RTC_UIE_OFF)
      COMPATIBLE_IOCTL(RTC_PIE_ON)
      COMPATIBLE_IOCTL(RTC_PIE_OFF)
      COMPATIBLE_IOCTL(RTC_WIE_ON)
      COMPATIBLE_IOCTL(RTC_WIE_OFF)
      COMPATIBLE_IOCTL(RTC_ALM_SET)
      COMPATIBLE_IOCTL(RTC_ALM_READ)
      COMPATIBLE_IOCTL(RTC_RD_TIME)
      COMPATIBLE_IOCTL(RTC_SET_TIME)
      COMPATIBLE_IOCTL(RTC_WKALM_SET)
      COMPATIBLE_IOCTL(RTC_WKALM_RD)
      /*
       * These two are only for the sbus rtc driver, but
       * hwclock tries them on every rtc device first when
       * running on sparc.  On other architectures the entries
       * are useless but harmless.
       */
      COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
      COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
      /* Little m */
      COMPATIBLE_IOCTL(MTIOCTOP)
      /* Socket level stuff */
      COMPATIBLE_IOCTL(FIOQSIZE)
      #ifdef CONFIG_BLOCK
      /* md calls this on random blockdevs */
      IGNORE_IOCTL(RAID_VERSION)
      /* qemu/qemu-img might call these two on plain files for probing */
      IGNORE_IOCTL(CDROM_DRIVE_STATUS)
      IGNORE_IOCTL(FDGETPRM32)
      /* SG stuff */
      COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
      COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
      COMPATIBLE_IOCTL(SG_EMULATED_HOST)
      COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
      COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
      COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
      COMPATIBLE_IOCTL(SG_GET_SCSI_ID)
      COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA)
      COMPATIBLE_IOCTL(SG_GET_LOW_DMA)
      COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID)
      COMPATIBLE_IOCTL(SG_GET_PACK_ID)
      COMPATIBLE_IOCTL(SG_GET_NUM_WAITING)
      COMPATIBLE_IOCTL(SG_SET_DEBUG)
      COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE)
      COMPATIBLE_IOCTL(SG_GET_COMMAND_Q)
      COMPATIBLE_IOCTL(SG_SET_COMMAND_Q)
      COMPATIBLE_IOCTL(SG_GET_VERSION_NUM)
      COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN)
      COMPATIBLE_IOCTL(SG_SCSI_RESET)
      COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
      COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
      COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
      #endif
      /* PPP stuff */
      COMPATIBLE_IOCTL(PPPIOCGFLAGS)
      COMPATIBLE_IOCTL(PPPIOCSFLAGS)
      COMPATIBLE_IOCTL(PPPIOCGASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCSASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCGUNIT)
      COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCGMRU)
      COMPATIBLE_IOCTL(PPPIOCSMRU)
      COMPATIBLE_IOCTL(PPPIOCSMAXCID)
      COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP)
      COMPATIBLE_IOCTL(PPPIOCXFERUNIT)
      /* PPPIOCSCOMPRESS is translated */
      COMPATIBLE_IOCTL(PPPIOCGNPMODE)
      COMPATIBLE_IOCTL(PPPIOCSNPMODE)
      COMPATIBLE_IOCTL(PPPIOCGDEBUG)
      COMPATIBLE_IOCTL(PPPIOCSDEBUG)
      /* PPPIOCSPASS is translated */
      /* PPPIOCSACTIVE is translated */
      /* PPPIOCGIDLE is translated */
      COMPATIBLE_IOCTL(PPPIOCNEWUNIT)
      COMPATIBLE_IOCTL(PPPIOCATTACH)
      COMPATIBLE_IOCTL(PPPIOCDETACH)
      COMPATIBLE_IOCTL(PPPIOCSMRRU)
      COMPATIBLE_IOCTL(PPPIOCCONNECT)
      COMPATIBLE_IOCTL(PPPIOCDISCONN)
      COMPATIBLE_IOCTL(PPPIOCATTCHAN)
      COMPATIBLE_IOCTL(PPPIOCGCHAN)
      COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
      /* ppdev */
      COMPATIBLE_IOCTL(PPSETMODE)
      COMPATIBLE_IOCTL(PPRSTATUS)
      COMPATIBLE_IOCTL(PPRCONTROL)
      COMPATIBLE_IOCTL(PPWCONTROL)
      COMPATIBLE_IOCTL(PPFCONTROL)
      COMPATIBLE_IOCTL(PPRDATA)
      COMPATIBLE_IOCTL(PPWDATA)
      COMPATIBLE_IOCTL(PPCLAIM)
      COMPATIBLE_IOCTL(PPRELEASE)
      COMPATIBLE_IOCTL(PPYIELD)
      COMPATIBLE_IOCTL(PPEXCL)
      COMPATIBLE_IOCTL(PPDATADIR)
      COMPATIBLE_IOCTL(PPNEGOT)
      COMPATIBLE_IOCTL(PPWCTLONIRQ)
      COMPATIBLE_IOCTL(PPCLRIRQ)
      COMPATIBLE_IOCTL(PPSETPHASE)
      COMPATIBLE_IOCTL(PPGETMODES)
      COMPATIBLE_IOCTL(PPGETMODE)
      COMPATIBLE_IOCTL(PPGETPHASE)
      COMPATIBLE_IOCTL(PPGETFLAGS)
      COMPATIBLE_IOCTL(PPSETFLAGS)
      /* Big A */
      /* sparc only */
      /* Big Q for sound/OSS */
      COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC)
      COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE)
      COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS)
      COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD)
      COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL)
      COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND)
      COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME)
      COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID)
      COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL)
      COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE)
      /* Big T for sound/OSS */
      COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE)
      COMPATIBLE_IOCTL(SNDCTL_TMR_START)
      COMPATIBLE_IOCTL(SNDCTL_TMR_STOP)
      COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE)
      COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO)
      COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE)
      COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME)
      COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT)
      /* Little m for sound/OSS */
      COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME)
      COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE)
      COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD)
      /* Big P for sound/OSS */
      COMPATIBLE_IOCTL(SNDCTL_DSP_RESET)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED)
      COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE)
      COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS)
      COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER)
      COMPATIBLE_IOCTL(SNDCTL_DSP_POST)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE)
      COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR)
      /* SNDCTL_DSP_MAPINBUF,  XXX needs translation */
      /* SNDCTL_DSP_MAPOUTBUF,  XXX needs translation */
      COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO)
      COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX)
      COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY)
      COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE)
      COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE)
      COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS)
      COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS)
      COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER)
      /* Big C for sound/OSS */
      COMPATIBLE_IOCTL(SNDCTL_COPR_RESET)
      COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD)
      COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA)
      COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE)
      COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA)
      COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE)
      COMPATIBLE_IOCTL(SNDCTL_COPR_RUN)
      COMPATIBLE_IOCTL(SNDCTL_COPR_HALT)
      COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG)
      COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG)
      /* Big M for sound/OSS */
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
      COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
      /* SOUND_MIXER_READ_ENHANCE,  same value as READ_MUTE */
      /* SOUND_MIXER_READ_LOUD,  same value as READ_MUTE */
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS)
      COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
      COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
      /* SOUND_MIXER_WRITE_ENHANCE,  same value as WRITE_MUTE */
      /* SOUND_MIXER_WRITE_LOUD,  same value as WRITE_MUTE */
      COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC)
      COMPATIBLE_IOCTL(SOUND_MIXER_INFO)
      COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO)
      COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS)
      COMPATIBLE_IOCTL(SOUND_MIXER_AGC)
      COMPATIBLE_IOCTL(SOUND_MIXER_3DSE)
      COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1)
      COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2)
      COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3)
      COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4)
      COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
      COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
      COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
      COMPATIBLE_IOCTL(OSS_GETVERSION)
      /* Raw devices */
      COMPATIBLE_IOCTL(RAW_SETBIND)
      COMPATIBLE_IOCTL(RAW_GETBIND)
      /* Watchdog */
      COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
      COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
      COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
      COMPATIBLE_IOCTL(WDIOC_GETTEMP)
      COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
      COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
      COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
      COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
      /* Big R */
      COMPATIBLE_IOCTL(RNDGETENTCNT)
      COMPATIBLE_IOCTL(RNDADDTOENTCNT)
      COMPATIBLE_IOCTL(RNDGETPOOL)
      COMPATIBLE_IOCTL(RNDADDENTROPY)
      COMPATIBLE_IOCTL(RNDZAPENTCNT)
      COMPATIBLE_IOCTL(RNDCLEARPOOL)
      /* Bluetooth */
      COMPATIBLE_IOCTL(HCIDEVUP)
      COMPATIBLE_IOCTL(HCIDEVDOWN)
      COMPATIBLE_IOCTL(HCIDEVRESET)
      COMPATIBLE_IOCTL(HCIDEVRESTAT)
      COMPATIBLE_IOCTL(HCIGETDEVLIST)
      COMPATIBLE_IOCTL(HCIGETDEVINFO)
      COMPATIBLE_IOCTL(HCIGETCONNLIST)
      COMPATIBLE_IOCTL(HCIGETCONNINFO)
      COMPATIBLE_IOCTL(HCIGETAUTHINFO)
      COMPATIBLE_IOCTL(HCISETRAW)
      COMPATIBLE_IOCTL(HCISETSCAN)
      COMPATIBLE_IOCTL(HCISETAUTH)
      COMPATIBLE_IOCTL(HCISETENCRYPT)
      COMPATIBLE_IOCTL(HCISETPTYPE)
      COMPATIBLE_IOCTL(HCISETLINKPOL)
      COMPATIBLE_IOCTL(HCISETLINKMODE)
      COMPATIBLE_IOCTL(HCISETACLMTU)
      COMPATIBLE_IOCTL(HCISETSCOMTU)
      COMPATIBLE_IOCTL(HCIBLOCKADDR)
      COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
      COMPATIBLE_IOCTL(HCIINQUIRY)
      COMPATIBLE_IOCTL(HCIUARTSETPROTO)
      COMPATIBLE_IOCTL(HCIUARTGETPROTO)
      COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
      COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
      COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
      COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
      COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
      COMPATIBLE_IOCTL(BNEPCONNADD)
      COMPATIBLE_IOCTL(BNEPCONNDEL)
      COMPATIBLE_IOCTL(BNEPGETCONNLIST)
      COMPATIBLE_IOCTL(BNEPGETCONNINFO)
      COMPATIBLE_IOCTL(BNEPGETSUPPFEAT)
      COMPATIBLE_IOCTL(CMTPCONNADD)
      COMPATIBLE_IOCTL(CMTPCONNDEL)
      COMPATIBLE_IOCTL(CMTPGETCONNLIST)
      COMPATIBLE_IOCTL(CMTPGETCONNINFO)
      COMPATIBLE_IOCTL(HIDPCONNADD)
      COMPATIBLE_IOCTL(HIDPCONNDEL)
      COMPATIBLE_IOCTL(HIDPGETCONNLIST)
      COMPATIBLE_IOCTL(HIDPGETCONNINFO)
      /* CAPI */
      COMPATIBLE_IOCTL(CAPI_REGISTER)
      COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
      COMPATIBLE_IOCTL(CAPI_GET_VERSION)
      COMPATIBLE_IOCTL(CAPI_GET_SERIAL)
      COMPATIBLE_IOCTL(CAPI_GET_PROFILE)
      COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD)
      COMPATIBLE_IOCTL(CAPI_GET_ERRCODE)
      COMPATIBLE_IOCTL(CAPI_INSTALLED)
      COMPATIBLE_IOCTL(CAPI_GET_FLAGS)
      COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
      COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
      COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
      COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
      /* Siemens Gigaset */
      COMPATIBLE_IOCTL(GIGASET_REDIR)
      COMPATIBLE_IOCTL(GIGASET_CONFIG)
      COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
      COMPATIBLE_IOCTL(GIGASET_VERSION)
      /* Misc. */
      COMPATIBLE_IOCTL(0x41545900)                /* ATYIO_CLKR */
      COMPATIBLE_IOCTL(0x41545901)                /* ATYIO_CLKW */
      COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
      COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
      COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
      COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
      /* NBD */
      COMPATIBLE_IOCTL(NBD_DO_IT)
      COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
      COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
      COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
      COMPATIBLE_IOCTL(NBD_DISCONNECT)
      /* i2c */
      COMPATIBLE_IOCTL(I2C_SLAVE)
      COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
      COMPATIBLE_IOCTL(I2C_TENBIT)
      COMPATIBLE_IOCTL(I2C_PEC)
      COMPATIBLE_IOCTL(I2C_RETRIES)
      COMPATIBLE_IOCTL(I2C_TIMEOUT)
      /* hiddev */
      COMPATIBLE_IOCTL(HIDIOCGVERSION)
      COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
      COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
      COMPATIBLE_IOCTL(HIDIOCGSTRING)
      COMPATIBLE_IOCTL(HIDIOCINITREPORT)
      COMPATIBLE_IOCTL(HIDIOCGREPORT)
      COMPATIBLE_IOCTL(HIDIOCSREPORT)
      COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
      COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
      COMPATIBLE_IOCTL(HIDIOCGUSAGE)
      COMPATIBLE_IOCTL(HIDIOCSUSAGE)
      COMPATIBLE_IOCTL(HIDIOCGUCODE)
      COMPATIBLE_IOCTL(HIDIOCGFLAG)
      COMPATIBLE_IOCTL(HIDIOCSFLAG)
      COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
      COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
      /* dvb */
      COMPATIBLE_IOCTL(AUDIO_STOP)
      COMPATIBLE_IOCTL(AUDIO_PLAY)
      COMPATIBLE_IOCTL(AUDIO_PAUSE)
      COMPATIBLE_IOCTL(AUDIO_CONTINUE)
      COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
      COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
      COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
      COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
      COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
      COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
      COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
      COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
      COMPATIBLE_IOCTL(AUDIO_SET_ID)
      COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
      COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
      COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID)
      COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES)
      COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE)
      COMPATIBLE_IOCTL(DMX_START)
      COMPATIBLE_IOCTL(DMX_STOP)
      COMPATIBLE_IOCTL(DMX_SET_FILTER)
      COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
      COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
      COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
      COMPATIBLE_IOCTL(DMX_GET_CAPS)
      COMPATIBLE_IOCTL(DMX_SET_SOURCE)
      COMPATIBLE_IOCTL(DMX_GET_STC)
      COMPATIBLE_IOCTL(FE_GET_INFO)
      COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD)
      COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD)
      COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY)
      COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST)
      COMPATIBLE_IOCTL(FE_SET_TONE)
      COMPATIBLE_IOCTL(FE_SET_VOLTAGE)
      COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE)
      COMPATIBLE_IOCTL(FE_READ_STATUS)
      COMPATIBLE_IOCTL(FE_READ_BER)
      COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH)
      COMPATIBLE_IOCTL(FE_READ_SNR)
      COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS)
      COMPATIBLE_IOCTL(FE_SET_FRONTEND)
      COMPATIBLE_IOCTL(FE_GET_FRONTEND)
      COMPATIBLE_IOCTL(FE_GET_EVENT)
      COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD)
      COMPATIBLE_IOCTL(VIDEO_STOP)
      COMPATIBLE_IOCTL(VIDEO_PLAY)
      COMPATIBLE_IOCTL(VIDEO_FREEZE)
      COMPATIBLE_IOCTL(VIDEO_CONTINUE)
      COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
      COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
      COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
      COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
      COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
      COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
      COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
      COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
      COMPATIBLE_IOCTL(VIDEO_SET_ID)
      COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
      COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
      COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM)
      COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT)
      COMPATIBLE_IOCTL(VIDEO_SET_SPU)
      COMPATIBLE_IOCTL(VIDEO_GET_NAVI)
      COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
      COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
      COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
      
      /* joystick */
      COMPATIBLE_IOCTL(JSIOCGVERSION)
      COMPATIBLE_IOCTL(JSIOCGAXES)
      COMPATIBLE_IOCTL(JSIOCGBUTTONS)
      COMPATIBLE_IOCTL(JSIOCGNAME(0))
      
      #ifdef TIOCGLTC
      COMPATIBLE_IOCTL(TIOCGLTC)
      COMPATIBLE_IOCTL(TIOCSLTC)
      #endif
      #ifdef TIOCSTART
      /*
       * For these two we have definitions in ioctls.h and/or termios.h on
        * some architectures but no actual implementation.  Some applications
       * like bash call them if they are defined in the headers, so we provide
       * entries here to avoid syslog message spew.
       */
      COMPATIBLE_IOCTL(TIOCSTART)
      COMPATIBLE_IOCTL(TIOCSTOP)
      #endif
      
      /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
         but we don't want warnings on other file systems. So declare
         them as compatible here. */
      #define VFAT_IOCTL_READDIR_BOTH32       _IOR('r', 1, struct compat_dirent[2])
      #define VFAT_IOCTL_READDIR_SHORT32      _IOR('r', 2, struct compat_dirent[2])
      
      IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
      IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
      
      #ifdef CONFIG_SPARC
      /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
      IGNORE_IOCTL(FBIOGTYPE)
      IGNORE_IOCTL(FBIOSATTR)
      IGNORE_IOCTL(FBIOGATTR)
      IGNORE_IOCTL(FBIOSVIDEO)
      IGNORE_IOCTL(FBIOGVIDEO)
      IGNORE_IOCTL(FBIOSCURPOS)
      IGNORE_IOCTL(FBIOGCURPOS)
      IGNORE_IOCTL(FBIOGCURMAX)
      IGNORE_IOCTL(FBIOPUTCMAP32)
      IGNORE_IOCTL(FBIOGETCMAP32)
      IGNORE_IOCTL(FBIOSCURSOR32)
      IGNORE_IOCTL(FBIOGCURSOR32)
      #endif
      };
      
      /*
       * Convert common ioctl arguments based on their command number
       *
       * Please do not add any code in here. Instead, implement
        * a compat_ioctl operation in the place that handles the
       * ioctl for the native case.
       */
      static long do_ioctl_trans(int fd, unsigned int cmd,
                       unsigned long arg, struct file *file)
      {
              void __user *argp = compat_ptr(arg);
      
              switch (cmd) {
   82         case PPPIOCGIDLE32:
                      return ppp_gidle(fd, cmd, argp);
              case PPPIOCSCOMPRESS32:
                      return ppp_scompress(fd, cmd, argp);
    1         case PPPIOCSPASS32:
              case PPPIOCSACTIVE32:
    2                 return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
      #ifdef CONFIG_BLOCK
              case SG_IO:
    8                 return sg_ioctl_trans(fd, cmd, argp);
              case SG_GET_REQUEST_TABLE:
                      return sg_grt_trans(fd, cmd, argp);
    4         case MTIOCGET32:
              case MTIOCPOS32:
                      return mt_ioctl_trans(fd, cmd, argp);
      #endif
              /* Serial */
    2         case TIOCGSERIAL:
              case TIOCSSERIAL:
                      return serial_struct_ioctl(fd, cmd, argp);
              /* i2c */
              case I2C_FUNCS:
    9                 return w_long(fd, cmd, argp);
              case I2C_RDWR:
                      return do_i2c_rdwr_ioctl(fd, cmd, argp);
    3         case I2C_SMBUS:
                      return do_i2c_smbus_ioctl(fd, cmd, argp);
    6         /* Not implemented in the native kernel */
              case RTC_IRQP_READ32:
   24         case RTC_IRQP_SET32:
              case RTC_EPOCH_READ32:
              case RTC_EPOCH_SET32:
                      return rtc_ioctl(fd, cmd, argp);
      
              /* dvb */
    5         case VIDEO_GET_EVENT:
                      return do_video_get_event(fd, cmd, argp);
              case VIDEO_STILLPICTURE:
                      return do_video_stillpicture(fd, cmd, argp);
    8         case VIDEO_SET_SPU_PALETTE:
                      return do_video_set_spu_palette(fd, cmd, argp);
    3         }
      
              /*
               * These take an integer instead of a pointer as 'arg',
               * so we must not do a compat_ptr() translation.
               */
              switch (cmd) {
              /* Big T */
              case TCSBRKP:
   39         case TIOCMIWAIT:
              case TIOCSCTTY:
              /* RAID */
              case HOT_REMOVE_DISK:
              case HOT_ADD_DISK:
              case SET_DISK_FAULTY:
              case SET_BITMAP_FILE:
              /* Big K */
              case KDSIGACCEPT:
              case KIOCSOUND:
              case KDMKTONE:
              case KDSETMODE:
              case KDSKBMODE:
              case KDSKBMETA:
              case KDSKBLED:
              case KDSETLED:
              /* NBD */
              case NBD_SET_SOCK:
              case NBD_SET_BLKSIZE:
              case NBD_SET_SIZE:
              case NBD_SET_SIZE_BLOCKS:
                      return do_vfs_ioctl(file, fd, cmd, arg);
              }
      
    5         return -ENOIOCTLCMD;
      }
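       /*
        * An illustrative sketch, not taken from this file, of the approach
        * the comment above asks for: rather than adding a translation to
        * do_ioctl_trans(), a driver supplies its own ->compat_ioctl in its
        * file_operations.  The names example_ioctl, example_compat_ioctl and
        * example_fops are hypothetical.
        */
       #if 0	/* illustrative sketch, not built */
       static long example_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
       {
               /* the driver's native 64-bit handler */
               return -ENOTTY;
       }

       static long example_compat_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg)
       {
               /* many handlers only need the pointer argument converted */
               return example_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
       }

       static const struct file_operations example_fops = {
               .owner          = THIS_MODULE,
               .unlocked_ioctl = example_ioctl,
               .compat_ioctl   = example_compat_ioctl,
       };
       #endif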
      
      static int compat_ioctl_check_table(unsigned int xcmd)
      {
              int i;
              const int max = ARRAY_SIZE(ioctl_pointer) - 1;
      
              BUILD_BUG_ON(max >= (1 << 16));
      
              /* guess initial offset into table, assuming a
                 normalized distribution */
              i = ((xcmd >> 16) * max) >> 16;
      
              /* do linear search up first, until greater or equal */
              while (ioctl_pointer[i] < xcmd && i < max)
                      i++;
      
  224         /* then do linear search down */
  226         while (ioctl_pointer[i] > xcmd && i > 0)
                      i--;
      
  395         return ioctl_pointer[i] == xcmd;
  200 }
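       /*
        * A worked example of the initial-guess arithmetic above, assuming a
        * hypothetical table of 1024 entries (max = 1023): a transformed
        * command whose top 16 bits are 0x8000 starts the scan near the
        * middle of the table.
        */
       #if 0	/* illustrative sketch, not built */
       static void example_initial_guess(void)
       {
               const int max = 1023;
               unsigned int xcmd = 0x80001234;
               int i = ((xcmd >> 16) * max) >> 16;	/* (0x8000 * 1023) >> 16 == 511 */

               (void)i;
       }
       #endif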
      
      COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                             compat_ulong_t, arg32)
      {
 2058         unsigned long arg = arg32;
              struct fd f = fdget(fd);
              int error = -EBADF;
              if (!f.file)
 1948                 goto out;
      
              /* RED-PEN how should LSM module know it's handling 32bit? */
              error = security_file_ioctl(f.file, cmd, arg);
              if (error)
                      goto out_fput;
      
              /*
                * To allow the compat_ioctl handlers to be self-contained,
               * we need to check the common ioctls here first.
               * Just handle them with the standard handlers below.
               */
              switch (cmd) {
              case FIOCLEX:
              case FIONCLEX:
 2046         case FIONBIO:
              case FIOASYNC:
              case FIOQSIZE:
                      break;
      
      #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
              case FS_IOC_RESVSP_32:
              case FS_IOC_RESVSP64_32:
                      error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
                      goto out_fput;
      #else
  182         case FS_IOC_RESVSP:
              case FS_IOC_RESVSP64:
                      error = ioctl_preallocate(f.file, compat_ptr(arg));
                      goto out_fput;
      #endif
      
              case FIBMAP:
              case FIGETBSZ:
              case FIONREAD:
                      if (S_ISREG(file_inode(f.file)->i_mode))
                              break;
                      /*FALL THROUGH*/
   48 
              default:
                      if (f.file->f_op->compat_ioctl) {
                              error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
                              if (error != -ENOIOCTLCMD)
 1870                                 goto out_fput;
 1775                 }
 1472 
                      if (!f.file->f_op->unlocked_ioctl)
                              goto do_ioctl;
                      break;
  387         }
      
              if (compat_ioctl_check_table(XFORM(cmd)))
                      goto found_handler;
      
  395         error = do_ioctl_trans(fd, cmd, arg, f.file);
              if (error == -ENOIOCTLCMD)
                      error = -ENOTTY;
   82 
   47         goto out_fput;
      
       found_handler:
              arg = (unsigned long)compat_ptr(arg);
       do_ioctl:
              error = do_vfs_ioctl(f.file, fd, cmd, arg);
       out_fput:
              fdput(f);
  348  out:
              return error;
 1943 }
      
      static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
      {
              unsigned int a, b;
              a = *(unsigned int *)p;
              b = *(unsigned int *)q;
              if (a > b)
                      return 1;
              if (a < b)
                      return -1;
              return 0;
      }
      
      static int __init init_sys32_ioctl(void)
      {
              sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
                      init_sys32_ioctl_cmp, NULL);
              return 0;
      }
      __initcall(init_sys32_ioctl);
      /*
       * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
       * Written by Alex Tomas <alex@clusterfs.com>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       *
        * You should have received a copy of the GNU General Public License
       * along with this program; if not, write to the Free Software
       * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
       */
      
      
      /*
       * mballoc.c contains the multiblocks allocation routines
       */
      
      #include "ext4_jbd2.h"
      #include "mballoc.h"
      #include <linux/log2.h>
      #include <linux/module.h>
      #include <linux/slab.h>
      #include <linux/nospec.h>
      #include <linux/backing-dev.h>
      #include <trace/events/ext4.h>
      
      #ifdef CONFIG_EXT4_DEBUG
      ushort ext4_mballoc_debug __read_mostly;
      
      module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
      MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
      #endif
      
      /*
       * MUSTDO:
       *   - test ext4_ext_search_left() and ext4_ext_search_right()
       *   - search for metadata in few groups
       *
       * TODO v4:
       *   - normalization should take into account whether file is still open
       *   - discard preallocations if no free space left (policy?)
       *   - don't normalize tails
       *   - quota
       *   - reservation for superuser
       *
       * TODO v3:
       *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
       *   - track min/max extents in each group for better group selection
       *   - mb_mark_used() may allocate chunk right after splitting buddy
       *   - tree of groups sorted by number of free blocks
       *   - error handling
       */
      
      /*
        * An allocation request asks for multiple blocks near the specified
        * goal (block) value.
        *
        * During the initialization phase of the allocator we decide whether
        * to use group preallocation or inode preallocation, depending on the
        * size of the file. The size of the file could be the resulting file
        * size we would have after allocation, or the current file size,
        * whichever is larger. If the size is less than
        * sbi->s_mb_stream_request we use group preallocation. The default
        * value of s_mb_stream_request is 16 blocks. This can also be tuned
        * via /sys/fs/ext4/<partition>/mb_stream_req. The value is expressed
        * in number of blocks.
        *
        * The main motivation for having small files use group preallocation
        * is to keep small files close together on the disk.
       *
        * In the first stage the allocator looks at the inode prealloc list,
        * ext4_inode_info->i_prealloc_list, which contains the list of
        * prealloc spaces for this particular inode. An inode prealloc space
        * is represented as:
        *
        * pa_lstart -> the logical start block for this prealloc space
        * pa_pstart -> the physical start block for this prealloc space
        * pa_len    -> length of this prealloc space (in clusters)
        * pa_free   -> free space available in this prealloc space (in clusters)
        *
        * The inode preallocation space is selected by looking at the
        * _logical_ start block. Only if the logical file block falls within
        * the range of a prealloc space do we consume that prealloc space.
        * This makes sure that we have contiguous physical blocks
        * representing the file blocks.
        *
        * The important thing to note for inode prealloc space is that we
        * don't modify any of its values except pa_free.
       *
        * If we are not able to find blocks in the inode prealloc space and
        * the group allocation flag is set, then we look at the locality
        * group prealloc space. These are per-CPU prealloc lists represented
        * as
        *
        * ext4_sb_info.s_locality_groups[smp_processor_id()]
        *
        * The reason for having a per-CPU locality group is to reduce
        * contention between CPUs. It is possible to get scheduled at this
        * point.
       *
        * The locality group prealloc space is used by checking whether we
        * have enough free space (pa_free) within the prealloc space.
        *
        * If we can't allocate blocks via inode and/or locality group
        * prealloc, then we look at the buddy cache. The buddy cache is
        * represented by ext4_sb_info.s_buddy_cache (struct inode) whose file
        * offsets get mapped to the buddy and bitmap information for the
        * different groups. The buddy information is attached to the buddy
        * cache inode so that we can access it through the page cache. The
        * information for each group is loaded via ext4_mb_load_buddy; it
        * consists of the block bitmap and the buddy information, stored in
        * the inode as:
        *
        *  {                        page                        }
        *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
        *
        * one block each for bitmap and buddy information. So for each group
        * we take up 2 blocks. A page can contain blocks_per_page
        * (PAGE_CACHE_SIZE / blocksize) blocks, so it can hold information
        * for groups_per_page groups, which is blocks_per_page/2.
       *
       * The buddy cache inode is not stored on disk. The inode is thrown
       * away when the filesystem is unmounted.
       *
        * We look for count number of blocks in the buddy cache. If we are
        * able to locate that many free blocks we return with additional
        * information about the rest of the contiguous physical blocks
        * available.
        *
        * Before allocating blocks via the buddy cache we normalize the
        * request. This ensures we ask for more blocks than we need. The
        * extra blocks that we get after allocation are added to the
        * respective prealloc list. For inode preallocation we follow a set
        * of heuristics based on file size; see ext4_mb_normalize_request.
        * For a group prealloc we try to normalize the request to
        * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc
        * depends on the cluster size; for non-bigalloc file systems it is
        * 512 blocks. This can be tuned via
        * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is expressed
        * in number of blocks. If the file system is mounted with the -O
        * stripe=<value> option, the group prealloc request is normalized to
        * the smallest multiple of the stripe value (sbi->s_stripe) which is
        * greater than the default mb_group_prealloc.
       *
       * The regular allocator (using the buddy cache) supports a few tunables.
       *
       * /sys/fs/ext4/<partition>/mb_min_to_scan
       * /sys/fs/ext4/<partition>/mb_max_to_scan
       * /sys/fs/ext4/<partition>/mb_order2_req
       *
        * The regular allocator uses the buddy scan only if the request
        * length is a power of 2 blocks and the order of the allocation is
        * >= sbi->s_mb_order2_reqs. The value of s_mb_order2_reqs can be
        * tuned via /sys/fs/ext4/<partition>/mb_order2_req.  If the request
        * length is equal to the stripe size (sbi->s_stripe), we search for
        * contiguous blocks of the stripe size. This should result in better
        * allocation on RAID setups. Otherwise, we search the specific group
        * using the bitmap for the best extents. The tunables min_to_scan and
        * max_to_scan control the behaviour here. min_to_scan indicates how
        * long mballoc __must__ look for a best extent and max_to_scan
        * indicates how long mballoc __can__ look for a best extent among the
        * found extents. The search starts with the group specified as the
        * goal value in the allocation context via ac_g_ex. Each group is
        * first checked against the criteria for whether it can be used for
        * allocation; ext4_mb_good_group explains how the groups are checked.
        *
        * Both prealloc spaces are populated as described above. So the first
        * request will hit the buddy cache, which will result in the prealloc
        * space getting filled. The prealloc space is then used for
        * subsequent requests.
       */
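       /*
        * An illustrative sketch, not taken from mballoc itself, of the page
        * layout described above: each group needs one bitmap block and one
        * buddy block, so group g's bitmap lives at "block" 2*g of the buddy
        * cache inode.  With a 4K page and a 1K blocksize, blocks_per_page is
        * 4 and groups_per_page is 2, so group 5's bitmap sits on page 2 at
        * offset 2.  The helper name is hypothetical.
        */
       #if 0	/* illustrative sketch, not built */
       static void example_buddy_cache_layout(unsigned int group,
                                              unsigned int blocksize)
       {
               unsigned int blocks_per_page = PAGE_CACHE_SIZE / blocksize;
               unsigned int block = group * 2;                 /* bitmap block  */
               unsigned int pnum = block / blocks_per_page;    /* page index    */
               unsigned int poff = block % blocks_per_page;    /* block in page */

               (void)pnum;
               (void)poff;
       }
       #endif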
      
      /*
       * mballoc operates on the following data:
       *  - on-disk bitmap
       *  - in-core buddy (actually includes buddy and bitmap)
       *  - preallocation descriptors (PAs)
       *
        * there are two types of preallocations:
        *  - inode
        *    assigned to a specific inode and can be used for this inode only.
        *    it describes part of the inode's space preallocated to specific
        *    physical blocks. any block from that preallocation can be used
        *    independently. the descriptor just tracks the number of blocks
        *    left unused. so, before taking some block from the descriptor,
        *    one must make sure the corresponding logical block isn't
        *    allocated yet. this also means that freeing any block within the
        *    descriptor's range must discard all preallocated blocks.
        *  - locality group
        *    assigned to a specific locality group which does not translate
        *    to a permanent set of inodes: an inode can join and leave the
        *    group. space from this type of preallocation can be used for any
        *    inode. thus it's consumed from the beginning to the end.
        *
        * the relation between them can be expressed as:
        *    in-core buddy = on-disk bitmap + preallocation descriptors
        *
        * this means the blocks mballoc considers used are:
       *  - allocated blocks (persistent)
       *  - preallocated blocks (non-persistent)
       *
       * consistency in mballoc world means that at any time a block is either
       * free or used in ALL structures. notice: "any time" should not be read
       * literally -- time is discrete and delimited by locks.
       *
        *  to keep it simple, we don't use block numbers; instead we count
        *  the number of blocks: how many blocks are marked used/free in the
        *  on-disk bitmap, buddy and PA.
       *
       * all operations can be expressed as:
       *  - init buddy:                        buddy = on-disk + PAs
       *  - new PA:                                buddy += N; PA = N
       *  - use inode PA:                        on-disk += N; PA -= N
       *  - discard inode PA                        buddy -= on-disk - PA; PA = 0
       *  - use locality group PA                on-disk += N; PA -= N
       *  - discard locality group PA                buddy -= PA; PA = 0
       *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
       *        is used in real operation because we can't know actual used
       *        bits from PA, only from on-disk bitmap
       *
 * if we follow this strict logic, then all the operations above should be
 * atomic. given that some of them can block, we'd have to use something like
 * semaphores, killing performance on high-end SMP hardware. let's try to
 * relax this using the following knowledge:
 *  1) if a buddy is referenced, it's already initialized
 *  2) while a block is used in the buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 *     bitmap has a bit set and a PA claims the same block, that's OK. IOW,
 *     one can set a bit in the on-disk bitmap if the buddy has the same bit
 *     set and/or a PA covers the corresponding block
       *
       * so, now we're building a concurrency table:
       *  - init buddy vs.
       *    - new PA
 *      blocks for the PA are allocated in the buddy; the buddy must be
 *      referenced until the PA is linked to its allocation group, to avoid
 *      a concurrent buddy init
       *    - use inode PA
 *      we need to make sure that either the on-disk bitmap or the PA has
 *      up-to-date data. given (3), we only care that the PA -= N operation
 *      doesn't interfere with init
       *    - discard inode PA
       *      the simplest way would be to have buddy initialized by the discard
       *    - use locality group PA
       *      again PA-=N must be serialized with init
       *    - discard locality group PA
       *      the simplest way would be to have buddy initialized by the discard
       *  - new PA vs.
       *    - use inode PA
       *      i_data_sem serializes them
       *    - discard inode PA
       *      discard process must wait until PA isn't used by another process
       *    - use locality group PA
       *      some mutex should serialize them
       *    - discard locality group PA
       *      discard process must wait until PA isn't used by another process
       *  - use inode PA
       *    - use inode PA
 *      i_data_sem or another mutex should serialize them
       *    - discard inode PA
       *      discard process must wait until PA isn't used by another process
       *    - use locality group PA
       *      nothing wrong here -- they're different PAs covering different blocks
       *    - discard locality group PA
       *      discard process must wait until PA isn't used by another process
       *
 * now we're ready to draw a few conclusions:
 *  - while a PA is referenced, no discard of it is possible
 *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
 *  - the PA is changed only after the on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard, or they're serialized somehow
 *  - buddy init as the sum of the on-disk bitmap and PAs is done atomically
 *
 * a special case is when we've used a PA to emptiness. there's no need to
 * modify the buddy in this case, but we still have to care about
 * concurrent init
       *
       */
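/*
 * Illustrative sketch (not part of mballoc): the bookkeeping rules above
 * can be modelled with three aggregate counters of used blocks.  The
 * struct and helpers below are hypothetical and only demonstrate that
 * each listed operation preserves the invariant
 *
 *	buddy == on_disk + pa
 *
 * ("discard inode PA" is omitted because, as noted above, it has to
 * consult the on-disk bitmap to learn which bits were actually used).
 */
struct mb_sketch_counts {
	long buddy;	/* blocks marked used in the in-core buddy */
	long on_disk;	/* blocks marked used in the on-disk bitmap */
	long pa;	/* preallocated blocks not yet used */
};

static void mb_sketch_init_buddy(struct mb_sketch_counts *c)
{
	c->buddy = c->on_disk + c->pa;	/* init buddy: buddy = on-disk + PAs */
}

static void mb_sketch_new_pa(struct mb_sketch_counts *c, long n)
{
	c->buddy += n;			/* new PA: buddy += N; PA = N */
	c->pa += n;
}

static void mb_sketch_use_pa(struct mb_sketch_counts *c, long n)
{
	c->on_disk += n;		/* use PA: on-disk += N; PA -= N */
	c->pa -= n;
}

static void mb_sketch_discard_group_pa(struct mb_sketch_counts *c)
{
	c->buddy -= c->pa;		/* discard locality group PA */
	c->pa = 0;
}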
      
       /*
       * Logic in few words:
       *
       *  - allocation:
       *    load group
       *    find blocks
       *    mark bits in on-disk bitmap
       *    release group
       *
       *  - use preallocation:
       *    find proper PA (per-inode or group)
       *    load group
       *    mark bits in on-disk bitmap
       *    release group
       *    release PA
       *
       *  - free:
       *    load group
       *    mark bits in on-disk bitmap
       *    release group
       *
       *  - discard preallocations in group:
       *    mark PAs deleted
       *    move them onto local list
       *    load on-disk bitmap
       *    load group
       *    remove PA from object (inode or locality group)
       *    mark free blocks in-core
       *
       *  - discard inode's preallocations:
       */
      
      /*
       * Locking rules
       *
       * Locks:
       *  - bitlock on a group        (group)
       *  - object (inode/locality)        (object)
       *  - per-pa lock                (pa)
       *
       * Paths:
       *  - new pa
       *    object
       *    group
       *
       *  - find and use pa:
       *    pa
       *
       *  - release consumed pa:
       *    pa
       *    group
       *    object
       *
       *  - generate in-core bitmap:
       *    group
       *        pa
       *
       *  - discard all for given object (inode, locality group):
       *    object
       *        pa
       *    group
       *
       *  - discard all for given group:
       *    group
       *        pa
       *    group
       *        object
       *
       */
      static struct kmem_cache *ext4_pspace_cachep;
      static struct kmem_cache *ext4_ac_cachep;
      static struct kmem_cache *ext4_free_data_cachep;
      
      /* We create slab caches for groupinfo data structures based on the
       * superblock block size.  There will be one per mounted filesystem for
       * each unique s_blocksize_bits */
      #define NR_GRPINFO_CACHES 8
      static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
      
      static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
              "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
              "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
              "ext4_groupinfo_64k", "ext4_groupinfo_128k"
      };
      
      static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                              ext4_group_t group);
      static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                      ext4_group_t group);
      static void ext4_free_data_callback(struct super_block *sb,
                                      struct ext4_journal_cb_entry *jce, int rc);
      
      static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
      {
      #if BITS_PER_LONG == 64
  827         *bit += ((unsigned long) addr & 7UL) << 3;
              addr = (void *) ((unsigned long) addr & ~7UL);
      #elif BITS_PER_LONG == 32
              *bit += ((unsigned long) addr & 3UL) << 3;
              addr = (void *) ((unsigned long) addr & ~3UL);
      #else
      #error "how many bits you are?!"
      #endif
              return addr;
      }
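/*
 * Usage example (hypothetical, not part of ext4): on a BITS_PER_LONG == 64
 * build, a misaligned (address, bit) pair such as (base + 5, 2) is folded
 * into the aligned pair (base, 5 * 8 + 2) == (base, 42), so the bit
 * helpers below always operate on an unsigned-long-aligned address.
 */
static int mb_sketch_align_example(void)
{
	static char base[16] __attribute__((aligned(8)));
	void *addr = base + 5;
	int bit = 2;

	addr = mb_correct_addr_and_bit(&bit, addr);
	/* now addr == base and bit == 42 (assuming 64-bit longs) */
	return bit;
}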
      
      static inline int mb_test_bit(int bit, void *addr)
      {
	/*
	 * ext4_test_bit on architectures like powerpc
	 * needs an unsigned-long-aligned address
	 */
  746         addr = mb_correct_addr_and_bit(&bit, addr);
              return ext4_test_bit(bit, addr);
      }
      
      static inline void mb_set_bit(int bit, void *addr)
      {
  711         addr = mb_correct_addr_and_bit(&bit, addr);
  355         ext4_set_bit(bit, addr);
      }
      
      static inline void mb_clear_bit(int bit, void *addr)
      {
  420         addr = mb_correct_addr_and_bit(&bit, addr);
              ext4_clear_bit(bit, addr);
      }
      
      static inline int mb_test_and_clear_bit(int bit, void *addr)
      {
  421         addr = mb_correct_addr_and_bit(&bit, addr);
              return ext4_test_and_clear_bit(bit, addr);
      }
      
      static inline int mb_find_next_zero_bit(void *addr, int max, int start)
      {
              int fix = 0, ret, tmpmax;
              addr = mb_correct_addr_and_bit(&fix, addr);
              tmpmax = max + fix;
    5         start += fix;
      
              ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
              if (ret > max)
                      return max;
              return ret;
      }
      
      static inline int mb_find_next_bit(void *addr, int max, int start)
      {
              int fix = 0, ret, tmpmax;
              addr = mb_correct_addr_and_bit(&fix, addr);
              tmpmax = max + fix;
              start += fix;
      
              ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
              if (ret > max)
                      return max;
              return ret;
      }
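/*
 * Worked example (hypothetical numbers) for the two helpers above: if the
 * caller passes addr == base + 2 on a 64-bit build, mb_correct_addr_and_bit
 * sets fix = 16 and rewinds addr to base, so a search over [start, max) in
 * the caller's frame becomes a search over [start + 16, max + 16) in the
 * aligned frame; subtracting fix from the result and clamping to max maps
 * it back to the caller's frame.
 */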
      
      static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
      {
              char *bb;
      
  783         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
              BUG_ON(max == NULL);
      
  783         if (order > e4b->bd_blkbits + 1) {
   43                 *max = 0;
                      return NULL;
              }
      
              /* at order 0 we see each particular block */
  783         if (order == 0) {
  572                 *max = 1 << (e4b->bd_blkbits + 3);
                      return e4b->bd_bitmap;
              }
      
  769         bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
              *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
      
  783         return bb;
      }
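/*
 * Worked example (hypothetical values): with a 4k block size,
 * bd_blkbits == 12, so at order 0 the "buddy" is bd_bitmap itself with
 * 1 << 15 == 32768 bits.  Each higher order effectively halves the number
 * of bits (one order-n bit covers two order-(n-1) bits), and the per-order
 * bitmaps live inside the buddy block at the precomputed byte offsets in
 * s_mb_offsets[], with their sizes in s_mb_maxs[].
 */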
      
      #ifdef DOUBLE_CHECK
      static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                                 int first, int count)
      {
              int i;
              struct super_block *sb = e4b->bd_sb;
      
              if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                      return;
              assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
              for (i = 0; i < count; i++) {
                      if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
                              ext4_fsblk_t blocknr;
      
                              blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                              blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
                              ext4_grp_locked_error(sb, e4b->bd_group,
                                                    inode ? inode->i_ino : 0,
                                                    blocknr,
                                                    "freeing block already freed "
                                                    "(bit %u)",
                                                    first + i);
                      }
                      mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
              }
      }
      
      static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
      {
              int i;
      
              if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                      return;
              assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
              for (i = 0; i < count; i++) {
                      BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
                      mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
              }
      }
      
      static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
      {
              if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
                      unsigned char *b1, *b2;
                      int i;
                      b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
                      b2 = (unsigned char *) bitmap;
                      for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                              if (b1[i] != b2[i]) {
                                      ext4_msg(e4b->bd_sb, KERN_ERR,
                                               "corruption in group %u "
                                               "at byte %u(%u): %x in copy != %x "
                                               "on disk/prealloc",
                                               e4b->bd_group, i, i * 8, b1[i], b2[i]);
                                      BUG();
                              }
                      }
              }
      }
      
      #else
      static inline void mb_free_blocks_double(struct inode *inode,
                                      struct ext4_buddy *e4b, int first, int count)
      {
              return;
      }
      static inline void mb_mark_used_double(struct ext4_buddy *e4b,
                                                      int first, int count)
      {
              return;
      }
      static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
      {
              return;
      }
      #endif
      
      #ifdef AGGRESSIVE_CHECK
      
      #define MB_CHECK_ASSERT(assert)                                                \
      do {                                                                        \
              if (!(assert)) {                                                \
                      printk(KERN_EMERG                                        \
                              "Assertion failure in %s() at %s:%d: \"%s\"\n",        \
                              function, file, line, # assert);                \
                      BUG();                                                        \
              }                                                                \
      } while (0)
      
      static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
                                      const char *function, int line)
      {
              struct super_block *sb = e4b->bd_sb;
              int order = e4b->bd_blkbits + 1;
              int max;
              int max2;
              int i;
              int j;
              int k;
              int count;
              struct ext4_group_info *grp;
              int fragments = 0;
              int fstart;
              struct list_head *cur;
              void *buddy;
              void *buddy2;
      
              {
                      static int mb_check_counter;
                      if (mb_check_counter++ % 100 != 0)
                              return 0;
              }
      
              while (order > 1) {
                      buddy = mb_find_buddy(e4b, order, &max);
                      MB_CHECK_ASSERT(buddy);
                      buddy2 = mb_find_buddy(e4b, order - 1, &max2);
                      MB_CHECK_ASSERT(buddy2);
                      MB_CHECK_ASSERT(buddy != buddy2);
                      MB_CHECK_ASSERT(max * 2 == max2);
      
                      count = 0;
                      for (i = 0; i < max; i++) {
      
                              if (mb_test_bit(i, buddy)) {
                                      /* only single bit in buddy2 may be 1 */
                                      if (!mb_test_bit(i << 1, buddy2)) {
                                              MB_CHECK_ASSERT(
                                                      mb_test_bit((i<<1)+1, buddy2));
                                      } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
                                              MB_CHECK_ASSERT(
                                                      mb_test_bit(i << 1, buddy2));
                                      }
                                      continue;
                              }
      
                              /* both bits in buddy2 must be 1 */
                              MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
                              MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
      
                              for (j = 0; j < (1 << order); j++) {
                                      k = (i * (1 << order)) + j;
                                      MB_CHECK_ASSERT(
                                              !mb_test_bit(k, e4b->bd_bitmap));
                              }
                              count++;
                      }
                      MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
                      order--;
              }
      
              fstart = -1;
              buddy = mb_find_buddy(e4b, 0, &max);
              for (i = 0; i < max; i++) {
                      if (!mb_test_bit(i, buddy)) {
                              MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
                              if (fstart == -1) {
                                      fragments++;
                                      fstart = i;
                              }
                              continue;
                      }
                      fstart = -1;
                      /* check used bits only */
                      for (j = 0; j < e4b->bd_blkbits + 1; j++) {
                              buddy2 = mb_find_buddy(e4b, j, &max2);
                              k = i >> j;
                              MB_CHECK_ASSERT(k < max2);
                              MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
                      }
              }
              MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
              MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
      
              grp = ext4_get_group_info(sb, e4b->bd_group);
              list_for_each(cur, &grp->bb_prealloc_list) {
                      ext4_group_t groupnr;
                      struct ext4_prealloc_space *pa;
                      pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                      ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
                      MB_CHECK_ASSERT(groupnr == e4b->bd_group);
                      for (i = 0; i < pa->pa_len; i++)
                              MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
              }
              return 0;
      }
      #undef MB_CHECK_ASSERT
      #define mb_check_buddy(e4b) __mb_check_buddy(e4b,        \
                                              __FILE__, __func__, __LINE__)
      #else
      #define mb_check_buddy(e4b)
      #endif
      
/*
 * Divide the blocks starting at @first with length @len into
 * smaller chunks, each a power of 2 in size.
 * Clear the bits in the buddy bitmap that the chunk(s) cover,
 * then increase bb_counters[] for the corresponding chunk size.
 */
      static void ext4_mb_mark_free_simple(struct super_block *sb,
                                      void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
                                              struct ext4_group_info *grp)
      {
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              ext4_grpblk_t min;
              ext4_grpblk_t max;
              ext4_grpblk_t chunk;
              unsigned int border;
      
              BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
      
   10         border = 2 << sb->s_blocksize_bits;
      
              while (len > 0) {
		/* find how many blocks can be covered starting at this position */
   10                 max = ffs(first | border) - 1;
      
                      /* find how many blocks of power 2 we need to mark */
                      min = fls(len) - 1;
      
                      if (max < min)
                              min = max;
                      chunk = 1 << min;
      
                      /* mark multiblock chunks only */
                      grp->bb_counters[min]++;
                      if (min > 0)
                              mb_clear_bit(first >> min,
   10                                      buddy + sbi->s_mb_offsets[min]);
      
   10                 len -= chunk;
                      first += chunk;
              }
      }
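/*
 * Worked example (hypothetical numbers): first = 5, len = 13 is carved
 * into naturally aligned power-of-2 chunks [5], [6..7], [8..15], [16..17]
 * of orders 0, 1, 3 and 1.  bb_counters[0] is bumped once, bb_counters[1]
 * twice and bb_counters[3] once; only the multi-block chunks (order > 0)
 * clear a bit in the corresponding per-order buddy bitmap.
 */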
      
      /*
       * Cache the order of the largest free extent we have available in this block
       * group.
       */
      static void
      mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
      {
              int i;
              int bits;
      
   10         grp->bb_largest_free_order = -1; /* uninit */
      
              bits = sb->s_blocksize_bits + 1;
  719         for (i = bits; i >= 0; i--) {
  795                 if (grp->bb_counters[i] > 0) {
  793                         grp->bb_largest_free_order = i;
                              break;
                      }
              }
      }
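/*
 * Worked example (hypothetical values): for a group whose bb_counters[]
 * is { 3, 1, 0, 2, 0, ... }, the downward scan above stops at index 3, so
 * bb_largest_free_order becomes 3; if every counter is zero (the group is
 * completely allocated), it stays at -1.
 */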
      
      static noinline_for_stack
      void ext4_mb_generate_buddy(struct super_block *sb,
                                      void *buddy, void *bitmap, ext4_group_t group)
      {
   10         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
              ext4_grpblk_t i = 0;
              ext4_grpblk_t first;
              ext4_grpblk_t len;
              unsigned free = 0;
              unsigned fragments = 0;
              unsigned long long period = get_cycles();
      
	/* initialize the buddy from the bitmap, which is an aggregation
	 * of the on-disk bitmap and preallocations */
              i = mb_find_next_zero_bit(bitmap, max, 0);
              grp->bb_first_free = i;
              while (i < max) {
   10                 fragments++;
                      first = i;
                      i = mb_find_next_bit(bitmap, max, i);
                      len = i - first;
                      free += len;
                      if (len > 1)
   10                         ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
                      else
    3                         grp->bb_counters[0]++;
   10                 if (i < max)
    5                         i = mb_find_next_zero_bit(bitmap, max, i);
              }
   10         grp->bb_fragments = fragments;
      
              if (free != grp->bb_free) {
    4                 ext4_grp_locked_error(sb, group, 0, 0,
                                            "block bitmap and bg descriptor "
                                            "inconsistent: %u vs %u free clusters",
                                            free, grp->bb_free);
                      /*
                       * If we intend to continue, we consider group descriptor
                       * corrupt and update bb_free using bitmap value
                       */
                      grp->bb_free = free;
                      if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                              percpu_counter_sub(&sbi->s_freeclusters_counter,
                                                 grp->bb_free);
    4                 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
              }
   10         mb_set_largest_free_order(sb, grp);
      
   10         clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
      
              period = get_cycles() - period;
              spin_lock(&EXT4_SB(sb)->s_bal_lock);
              EXT4_SB(sb)->s_mb_buddies_generated++;
              EXT4_SB(sb)->s_mb_generation_time += period;
              spin_unlock(&EXT4_SB(sb)->s_bal_lock);
      }
      
      static void mb_regenerate_buddy(struct ext4_buddy *e4b)
      {
              int count;
              int order = 1;
              void *buddy;
      
    4         while ((buddy = mb_find_buddy(e4b, order++, &count))) {
    4                 ext4_set_bits(buddy, 0, count);
              }
    4         e4b->bd_info->bb_fragments = 0;
              memset(e4b->bd_info->bb_counters, 0,
                      sizeof(*e4b->bd_info->bb_counters) *
                      (e4b->bd_sb->s_blocksize_bits + 2));
      
              ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
                      e4b->bd_bitmap, e4b->bd_group);
      }
      
/* The buddy information is attached to the buddy cache inode
 * for convenience. The information regarding each group
 * is loaded via ext4_mb_load_buddy. It consists of the
 * block bitmap and the buddy information, stored in the
 * inode as:
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for the bitmap and the buddy information.
 * So each group takes up 2 blocks. A page can
 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
 * So it can hold information for groups_per_page, which
 * is blocks_per_page/2.
       *
       * Locking note:  This routine takes the block group lock of all groups
       * for this page; do not hold this lock when calling this routine!
       */
      
      static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
      {
              ext4_group_t ngroups;
              int blocksize;
              int blocks_per_page;
              int groups_per_page;
              int err = 0;
              int i;
              ext4_group_t first_group, group;
              int first_block;
              struct super_block *sb;
              struct buffer_head *bhs;
              struct buffer_head **bh = NULL;
              struct inode *inode;
              char *data;
              char *bitmap;
              struct ext4_group_info *grinfo;
      
              mb_debug(1, "init page %lu\n", page->index);
      
    7         inode = page->mapping->host;
              sb = inode->i_sb;
              ngroups = ext4_get_groups_count(sb);
              blocksize = 1 << inode->i_blkbits;
              blocks_per_page = PAGE_CACHE_SIZE / blocksize;
      
              groups_per_page = blocks_per_page >> 1;
              if (groups_per_page == 0)
                      groups_per_page = 1;
      
              /* allocate buffer_heads to read bitmaps */
              if (groups_per_page > 1) {
                      i = sizeof(struct buffer_head *) * groups_per_page;
                      bh = kzalloc(i, gfp);
                      if (bh == NULL) {
                              err = -ENOMEM;
                              goto out;
                      }
              } else
                      bh = &bhs;
      
    7         first_group = page->index * blocks_per_page / 2;
      
              /* read all groups the page covers into the cache */
    6         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
    7                 if (group >= ngroups)
                              break;
      
    7                 grinfo = ext4_get_group_info(sb, group);
                      /*
                       * If page is uptodate then we came here after online resize
                       * which added some new uninitialized group info structs, so
                       * we must skip all initialized uptodate buddies on the page,
                       * which may be currently in use by an allocating task.
                       */
    7                 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
                              bh[i] = NULL;
                              continue;
                      }
    7                 bh[i] = ext4_read_block_bitmap_nowait(sb, group);
                      if (IS_ERR(bh[i])) {
    1                         err = PTR_ERR(bh[i]);
                              bh[i] = NULL;
                              goto out;
                      }
                      mb_debug(1, "read bitmap for group %u\n", group);
              }
      
              /* wait for I/O completion */
    6         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                      int err2;
      
    6                 if (!bh[i])
                              continue;
    6                 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
                      if (!err)
                              err = err2;
              }
      
    6         first_block = page->index * blocks_per_page;
    6         for (i = 0; i < blocks_per_page; i++) {
    6                 group = (first_block + i) >> 1;
                      if (group >= ngroups)
                              break;
      
    6                 if (!bh[group - first_group])
                              /* skip initialized uptodate buddy */
                              continue;
      
    6                 if (!buffer_verified(bh[group - first_group]))
                              /* Skip faulty bitmaps */
                              continue;
                      err = 0;
      
		/*
		 * data carries the information regarding this
		 * particular group in the format specified
		 * above
		 */
    6                 data = page_address(page) + (i * blocksize);
    6                 bitmap = bh[group - first_group]->b_data;
      
                      /*
                       * We place the buddy block and bitmap block
                       * close together
                       */
                      if ((first_block + i) & 1) {
                              /* this is block of buddy */
    6                         BUG_ON(incore == NULL);
                              mb_debug(1, "put buddy for group %u in page %lu/%x\n",
                                      group, page->index, i * blocksize);
    6                         trace_ext4_mb_buddy_bitmap_load(sb, group);
    6                         grinfo = ext4_get_group_info(sb, group);
                              grinfo->bb_fragments = 0;
                              memset(grinfo->bb_counters, 0,
                                     sizeof(*grinfo->bb_counters) *
                                      (sb->s_blocksize_bits+2));
                              /*
                               * incore got set to the group block bitmap below
                               */
    6                         ext4_lock_group(sb, group);
                              /* init the buddy */
    6                         memset(data, 0xff, blocksize);
                              ext4_mb_generate_buddy(sb, data, incore, group);
                              ext4_unlock_group(sb, group);
                              incore = NULL;
                      } else {
                              /* this is block of bitmap */
    6                         BUG_ON(incore != NULL);
                              mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
                                      group, page->index, i * blocksize);
    6                         trace_ext4_mb_bitmap_load(sb, group);
      
                              /* see comments in ext4_mb_put_pa() */
    6                         ext4_lock_group(sb, group);
    6                         memcpy(data, bitmap, blocksize);
      
                              /* mark all preallocated blks used in in-core bitmap */
                              ext4_mb_generate_from_pa(sb, data, group);
    6                         ext4_mb_generate_from_freelist(sb, data, group);
    6                         ext4_unlock_group(sb, group);
      
                              /* set incore so that the buddy information can be
                               * generated using this
                               */
                              incore = data;
                      }
              }
    6         SetPageUptodate(page);
      
      out:
              if (bh) {
    7                 for (i = 0; i < groups_per_page; i++)
    7                         brelse(bh[i]);
    7                 if (bh != &bhs)
                              kfree(bh);
              }
    7         return err;
      }
      
/*
 * Lock the buddy and bitmap pages. This makes sure that a parallel
 * init_group on the same buddy page cannot happen while we hold the buddy
 * page lock. The locked buddy and bitmap pages are returned via the e4b
 * struct. If the buddy and bitmap are on the same page, e4b->bd_buddy_page
 * is NULL and the return value is 0.
 */
      static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
                      ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
      {
              struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
              int block, pnum, poff;
              int blocks_per_page;
              struct page *page;
      
              e4b->bd_buddy_page = NULL;
              e4b->bd_bitmap_page = NULL;
      
              blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
              /*
               * the buddy cache inode stores the block bitmap
               * and buddy information in consecutive blocks.
               * So for each group we need two blocks.
               */
              block = group * 2;
              pnum = block / blocks_per_page;
    6         poff = block % blocks_per_page;
              page = find_or_create_page(inode->i_mapping, pnum, gfp);
              if (!page)
                      return -ENOMEM;
    6         BUG_ON(page->mapping != inode->i_mapping);
              e4b->bd_bitmap_page = page;
    6         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
      
              if (blocks_per_page >= 2) {
                      /* buddy and bitmap are on the same page */
                      return 0;
              }
      
    6         block++;
              pnum = block / blocks_per_page;
              page = find_or_create_page(inode->i_mapping, pnum, gfp);
              if (!page)
                      return -ENOMEM;
    6         BUG_ON(page->mapping != inode->i_mapping);
              e4b->bd_buddy_page = page;
              return 0;
      }
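/*
 * Worked example (hypothetical geometry): group g's bitmap lives in
 * logical block 2*g of the buddy cache inode and its buddy in block
 * 2*g + 1.  With 1k blocks and 4k pages, blocks_per_page == 4, so group 5
 * maps to block 10, i.e. pnum == 2 and poff == 2, and groups 4 and 5
 * share that page; with 4k blocks and 4k pages, blocks_per_page == 1 and
 * the bitmap and buddy of each group end up on two consecutive pages.
 */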
      
      static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
      {
              if (e4b->bd_bitmap_page) {
    6                 unlock_page(e4b->bd_bitmap_page);
                      page_cache_release(e4b->bd_bitmap_page);
              }
    6         if (e4b->bd_buddy_page) {
    6                 unlock_page(e4b->bd_buddy_page);
                      page_cache_release(e4b->bd_buddy_page);
              }
      }
      
      /*
       * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
       * block group lock of all groups for this page; do not hold the BG lock when
       * calling this routine!
       */
      static noinline_for_stack
      int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
      {
      
              struct ext4_group_info *this_grp;
              struct ext4_buddy e4b;
              struct page *page;
              int ret = 0;
      
    6         might_sleep();
              mb_debug(1, "init group %u\n", group);
    6         this_grp = ext4_get_group_info(sb, group);
	/*
	 * This ensures that we don't reinit the buddy cache
	 * page which maps to the group from which we are already
	 * allocating. If we are looking at the buddy cache, we would
	 * have taken a reference using ext4_mb_load_buddy and that
	 * would have pinned the buddy page in the page cache.
	 * The call to ext4_mb_get_buddy_page_lock will mark the
	 * page accessed.
	 */
    6         ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
    6         if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                      /*
                       * somebody initialized the group
                       * return without doing anything
                       */
                      goto err;
              }
      
              page = e4b.bd_bitmap_page;
    6         ret = ext4_mb_init_cache(page, NULL, gfp);
              if (ret)
                      goto err;
    6         if (!PageUptodate(page)) {
                      ret = -EIO;
                      goto err;
              }
      
              if (e4b.bd_buddy_page == NULL) {
		/*
		 * If both the bitmap and buddy are on the same page,
		 * we don't need to force-init the buddy.
		 */
                      ret = 0;
                      goto err;
              }
              /* init buddy cache */
              page = e4b.bd_buddy_page;
              ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
              if (ret)
                      goto err;
    6         if (!PageUptodate(page)) {
                      ret = -EIO;
                      goto err;
              }
      err:
    6         ext4_mb_put_buddy_page_lock(&e4b);
    6         return ret;
      }
      
      /*
       * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
       * block group lock of all groups for this page; do not hold the BG lock when
       * calling this routine!
       */
      static noinline_for_stack int
      ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
                             struct ext4_buddy *e4b, gfp_t gfp)
      {
              int blocks_per_page;
              int block;
              int pnum;
              int poff;
              struct page *page;
              int ret;
              struct ext4_group_info *grp;
  795         struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct inode *inode = sbi->s_buddy_cache;
      
              might_sleep();
              mb_debug(1, "load group %u\n", group);
      
              blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
  795         grp = ext4_get_group_info(sb, group);
      
              e4b->bd_blkbits = sb->s_blocksize_bits;
              e4b->bd_info = grp;
              e4b->bd_sb = sb;
              e4b->bd_group = group;
              e4b->bd_buddy_page = NULL;
              e4b->bd_bitmap_page = NULL;
      
              if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                      /*
                       * we need full data about the group
                       * to make a good selection
                       */
    5                 ret = ext4_mb_init_group(sb, group, gfp);
                      if (ret)
                              return ret;
              }
      
              /*
               * the buddy cache inode stores the block bitmap
               * and buddy information in consecutive blocks.
               * So for each group we need two blocks.
               */
  795         block = group * 2;
              pnum = block / blocks_per_page;
              poff = block % blocks_per_page;
      
	/* we could use find_or_create_page(), but it locks the page,
	 * which we'd like to avoid in the fast path ... */
              page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
  795         if (page == NULL || !PageUptodate(page)) {
                      if (page)
			/*
			 * drop the page reference and try
			 * to get the page with the lock held. if we
			 * are not uptodate, that implies
			 * somebody just created the page but
			 * has yet to initialize it. so
			 * wait for it to initialize.
			 */
                              page_cache_release(page);
                      page = find_or_create_page(inode->i_mapping, pnum, gfp);
                      if (page) {
                              BUG_ON(page->mapping != inode->i_mapping);
                              if (!PageUptodate(page)) {
                                      ret = ext4_mb_init_cache(page, NULL, gfp);
                                      if (ret) {
                                              unlock_page(page);
                                              goto err;
                                      }
                                      mb_cmp_bitmaps(e4b, page_address(page) +
                                                     (poff * sb->s_blocksize));
                              }
                              unlock_page(page);
                      }
              }
              if (page == NULL) {
                      ret = -ENOMEM;
                      goto err;
              }
  795         if (!PageUptodate(page)) {
                      ret = -EIO;
                      goto err;
              }
      
              /* Pages marked accessed already */
              e4b->bd_bitmap_page = page;
              e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
      
              block++;
              pnum = block / blocks_per_page;
              poff = block % blocks_per_page;
      
              page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
  795         if (page == NULL || !PageUptodate(page)) {
                      if (page)
    1                         page_cache_release(page);
    1                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
                      if (page) {
    1                         BUG_ON(page->mapping != inode->i_mapping);
    1                         if (!PageUptodate(page)) {
    1                                 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
                                                               gfp);
                                      if (ret) {
    1                                         unlock_page(page);
                                              goto err;
                                      }
                              }
                              unlock_page(page);
                      }
              }
              if (page == NULL) {
                      ret = -ENOMEM;
                      goto err;
              }
  795         if (!PageUptodate(page)) {
                      ret = -EIO;
                      goto err;
              }
      
              /* Pages marked accessed already */
              e4b->bd_buddy_page = page;
              e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
      
              BUG_ON(e4b->bd_bitmap_page == NULL);
              BUG_ON(e4b->bd_buddy_page == NULL);
      
              return 0;
      
      err:
              if (page)
    1                 page_cache_release(page);
    1         if (e4b->bd_bitmap_page)
    1                 page_cache_release(e4b->bd_bitmap_page);
    1         if (e4b->bd_buddy_page)
                      page_cache_release(e4b->bd_buddy_page);
    1         e4b->bd_buddy = NULL;
              e4b->bd_bitmap = NULL;
  795         return ret;
      }
      
      static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                                    struct ext4_buddy *e4b)
      {
  660         return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
      }
      
      static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
      {
  795         if (e4b->bd_bitmap_page)
  795                 page_cache_release(e4b->bd_bitmap_page);
  795         if (e4b->bd_buddy_page)
  795                 page_cache_release(e4b->bd_buddy_page);
  795 }
      
      
      static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
      {
              int order = 1;
  660         int bb_incr = 1 << (e4b->bd_blkbits - 1);
              void *bb;
      
              BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
  660         BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
      
              bb = e4b->bd_buddy;
  660         while (order <= e4b->bd_blkbits + 1) {
  660                 block = block >> 1;
  660                 if (!mb_test_bit(block, bb)) {
                              /* this block is part of buddy of order 'order' */
                              return order;
                      }
  660                 bb += bb_incr;
                      bb_incr >>= 1;
                      order++;
              }
              return 0;
      }
      
      static void mb_clear_bits(void *bm, int cur, int len)
      {
              __u32 *addr;
      
  403         len = cur + len;
  403         while (cur < len) {
  403                 if ((cur & 31) == 0 && (len - cur) >= 32) {
                              /* fast path: clear whole word at once */
  269                         addr = bm + (cur >> 3);
                              *addr = 0;
                              cur += 32;
                              continue;
                      }
  403                 mb_clear_bit(cur, bm);
                      cur++;
              }
  403 }
      
/* clear bits in the given range
 * returns the first bit that was already zero, if any; -1 otherwise
 */
      static int mb_test_and_clear_bits(void *bm, int cur, int len)
      {
              __u32 *addr;
              int zero_bit = -1;
      
              len = cur + len;
  421         while (cur < len) {
  421                 if ((cur & 31) == 0 && (len - cur) >= 32) {
                              /* fast path: clear whole word at once */
  311                         addr = bm + (cur >> 3);
                              if (*addr != (__u32)(-1) && zero_bit == -1)
                                      zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
  311                         *addr = 0;
                              cur += 32;
                              continue;
                      }
  421                 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
                              zero_bit = cur;
  418                 cur++;
              }
      
              return zero_bit;
      }
      
      void ext4_set_bits(void *bm, int cur, int len)
      {
              __u32 *addr;
      
  711         len = cur + len;
  711         while (cur < len) {
  711                 if ((cur & 31) == 0 && (len - cur) >= 32) {
                              /* fast path: set whole word at once */
  556                         addr = bm + (cur >> 3);
                              *addr = 0xffffffff;
                              cur += 32;
                              continue;
                      }
  711                 mb_set_bit(cur, bm);
                      cur++;
              }
  711 }
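/*
 * Worked example (hypothetical numbers) for the three word-at-a-time
 * helpers above: ext4_set_bits(bm, 30, 70) sets bits 30 and 31
 * individually, takes the fast path for the two fully covered words
 * (bits 32..63 and 64..95), and then sets bits 96..99 individually.
 * mb_clear_bits() and mb_test_and_clear_bits() walk the range the same
 * way, clearing instead of setting.
 */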
      
      /*
       * _________________________________________________________________ */
      
      static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
      {
  400         if (mb_test_bit(*bit + side, bitmap)) {
  399                 mb_clear_bit(*bit, bitmap);
                      (*bit) -= side;
                      return 1;
              }
              else {
                      (*bit) += side;
  355                 mb_set_bit(*bit, bitmap);
                      return -1;
              }
      }
      
      static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
      {
              int max;
              int order = 1;
              void *buddy = mb_find_buddy(e4b, order, &max);
      
              while (buddy) {
                      void *buddy2;
      
		/* Bits in range [first; last] are known to be set since
		 * the corresponding blocks were allocated. Bits in range
		 * (first; last) will stay set because they form buddies on
		 * the upper layer. We just deal with the borders if they don't
		 * align with the upper layer and then go up.
		 * Releasing an entire group comes down to clearing a
		 * single bit of the highest-order buddy.
		 */
      
                      /* Example:
                       * ---------------------------------
                       * |   1   |   1   |   1   |   1   |
                       * ---------------------------------
                       * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
                       * ---------------------------------
                       *   0   1   2   3   4   5   6   7
                       *      \_____________________/
                       *
		 * Neither [1] nor [6] is aligned to the layer above.
		 * Left neighbour [0] is free, so mark it busy,
		 * decrease bb_counters and extend the range to
		 * [0; 6].
		 * Right neighbour [7] is busy. It can't be coalesced with [6],
		 * so mark [6] free, increase bb_counters and shrink the range
		 * to [0; 5].
		 * Then shift the range to [0; 2], go up and do the same.
                       */
      
      
  400                 if (first & 1)
  387                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
  400                 if (!(last & 1))
  379                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
  400                 if (first > last)
                              break;
  385                 order++;
      
  385                 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
   39                         mb_clear_bits(buddy, first, last - first + 1);
                              e4b->bd_info->bb_counters[order - 1] += last - first + 1;
  400                         break;
                      }
  385                 first >>= 1;
                      last >>= 1;
                      buddy = buddy2;
              }
      }
      
      static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                                 int first, int count)
      {
              int left_is_free = 0;
              int right_is_free = 0;
              int block;
  421         int last = first + count - 1;
              struct super_block *sb = e4b->bd_sb;
      
              if (WARN_ON(count == 0))
                      return;
  421         BUG_ON(last >= (sb->s_blocksize << 3));
  421         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
              /* Don't bother if the block group is corrupt. */
  421         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                      return;
      
              mb_check_buddy(e4b);
              mb_free_blocks_double(inode, e4b, first, count);
      
  421         e4b->bd_info->bb_free += count;
              if (first < e4b->bd_info->bb_first_free)
  421                 e4b->bd_info->bb_first_free = first;
      
              /* access memory sequentially: check left neighbour,
               * clear range and then check right neighbour
               */
  421         if (first != 0)
  421                 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
  421         block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
  421         if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
  421                 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
      
   67         if (unlikely(block != -1)) {
                      struct ext4_sb_info *sbi = EXT4_SB(sb);
                      ext4_fsblk_t blocknr;
      
    4                 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                      blocknr += EXT4_C2B(EXT4_SB(sb), block);
    4                 ext4_grp_locked_error(sb, e4b->bd_group,
                                            inode ? inode->i_ino : 0,
                                            blocknr,
                                            "freeing already freed block "
                                            "(bit %u); block bitmap corrupt.",
                                            block);
                      if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
                              percpu_counter_sub(&sbi->s_freeclusters_counter,
    4                                            e4b->bd_info->bb_free);
                      /* Mark the block group as corrupt. */
                      set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
    4                         &e4b->bd_info->bb_state);
    4                 mb_regenerate_buddy(e4b);
                      goto done;
              }
      
              /* let's maintain fragments counter */
  420         if (left_is_free && right_is_free)
  293                 e4b->bd_info->bb_fragments--;
  387         else if (!left_is_free && !right_is_free)
  339                 e4b->bd_info->bb_fragments++;
      
	/* buddy[0] == bd_bitmap is a special case, so handle
	 * it right away and let mb_buddy_mark_free stay free of
	 * zero-order checks.
	 * Check if the neighbours are to be coalesced,
	 * and adjust the bitmap, bb_counters and borders appropriately.
	 */
  377         if (first & 1) {
  334                 first += !left_is_free;
  347                 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
              }
  420         if (!(last & 1)) {
  320                 last -= !right_is_free;
  349                 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
              }
      
  420         if (first <= last)
  400                 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
      
      done:
  421         mb_set_largest_free_order(sb, e4b->bd_info);
  421         mb_check_buddy(e4b);
      }
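/*
 * Worked note on the fragment accounting above (illustrative): freeing a
 * range whose left and right neighbours are both free joins the two free
 * fragments around it into one, so bb_fragments is decremented; freeing a
 * range squeezed between two used neighbours creates a new free fragment,
 * so it is incremented; freeing next to exactly one free neighbour just
 * extends an existing fragment and leaves the count unchanged.
 */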
      
      static int mb_find_extent(struct ext4_buddy *e4b, int block,
                                      int needed, struct ext4_free_extent *ex)
      {
              int next = block;
              int max, order;
              void *buddy;
      
  572         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
              BUG_ON(ex == NULL);
      
  572         buddy = mb_find_buddy(e4b, 0, &max);
  246         BUG_ON(buddy == NULL);
  572         BUG_ON(block >= max);
  572         if (mb_test_bit(block, buddy)) {
  368                 ex->fe_len = 0;
                      ex->fe_start = 0;
                      ex->fe_group = 0;
  572                 return 0;
              }
      
              /* find actual order */
  550         order = mb_find_order_for_block(e4b, block);
              block = block >> order;
      
              ex->fe_len = 1 << order;
              ex->fe_start = block << order;
              ex->fe_group = e4b->bd_group;
      
              /* calc difference from given start */
              next = next - ex->fe_start;
              ex->fe_len -= next;
              ex->fe_start += next;
      
  262         while (needed > ex->fe_len &&
  275                mb_find_buddy(e4b, order, &max)) {
      
  275                 if (block + 1 >= max)
                              break;
      
  275                 next = (block + 1) * (1 << order);
                      if (mb_test_bit(next, e4b->bd_bitmap))
                              break;
      
  266                 order = mb_find_order_for_block(e4b, next);
      
                      block = next >> order;
                      ex->fe_len += 1 << order;
              }
      
  550         BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
              return ex->fe_len;
      }
      
      static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
      {
              int ord;
              int mlen = 0;
  660         int max = 0;
              int cur;
              int start = ex->fe_start;
              int len = ex->fe_len;
              unsigned ret = 0;
              int len0 = len;
              void *buddy;
      
              BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
  660         BUG_ON(e4b->bd_group != ex->fe_group);
  660         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
              mb_check_buddy(e4b);
              mb_mark_used_double(e4b, start, len);
      
  660         e4b->bd_info->bb_free -= len;
              if (e4b->bd_info->bb_first_free == start)
  276                 e4b->bd_info->bb_first_free += len;
      
              /* let's maintain fragments counter */
  660         if (start != 0)
  660                 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
  660         if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
  659                 max = !mb_test_bit(start + len, e4b->bd_bitmap);
  519         if (mlen && max)
  419                 e4b->bd_info->bb_fragments++;
  618         else if (!mlen && !max)
  447                 e4b->bd_info->bb_fragments--;
      
              /* let's maintain buddy itself */
  660         while (len) {
  660                 ord = mb_find_order_for_block(e4b, start);
      
  660                 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
                              /* the whole chunk may be allocated at once! */
                              mlen = 1 << ord;
  660                         buddy = mb_find_buddy(e4b, ord, &max);
                              BUG_ON((start >> ord) >= max);
  660                         mb_set_bit(start >> ord, buddy);
                              e4b->bd_info->bb_counters[ord]--;
                              start += mlen;
                              len -= mlen;
                              BUG_ON(len < 0);
                              continue;
                      }
      
                      /* store for history */
  420                 if (ret == 0)
  420                         ret = len | (ord << 16);
      
                      /* we have to split large buddy */
  420                 BUG_ON(ord <= 0);
  420                 buddy = mb_find_buddy(e4b, ord, &max);
                      mb_set_bit(start >> ord, buddy);
                      e4b->bd_info->bb_counters[ord]--;
      
                      ord--;
                      cur = (start >> ord) & ~1U;
                      buddy = mb_find_buddy(e4b, ord, &max);
                      mb_clear_bit(cur, buddy);
                      mb_clear_bit(cur + 1, buddy);
                      e4b->bd_info->bb_counters[ord]++;
                      e4b->bd_info->bb_counters[ord]++;
              }
  660         mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
      
  660         ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
              mb_check_buddy(e4b);
      
              return ret;
      }
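       /*
        * Illustrative sketch, not part of mballoc: a userspace-style model of
        * one iteration of the buddy-split loop in mb_mark_used() above.  The
        * demo_* name is hypothetical; 'counters' stands in for bb_counters[].
        * Returns the even member of the new lower-order buddy pair; the real
        * loop then keeps allocating from whichever half contains 'start'.
        */
       static unsigned int demo_split_buddy(unsigned int start, int ord,
                                            unsigned int *counters)
       {
               counters[ord]--;                /* the order-'ord' chunk is consumed */
               ord--;
               counters[ord] += 2;             /* both lower-order halves become free */
               return (start >> ord) & ~1U;    /* even member of the new buddy pair */
       }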
      
      /*
       * Must be called under group lock!
       */
      static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
                                              struct ext4_buddy *e4b)
      {
  660         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              int ret;
      
  660         BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
  660         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
      
              ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
              ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
              ret = mb_mark_used(e4b, &ac->ac_b_ex);
      
              /* preallocation can change ac_b_ex, thus we store actually
               * allocated blocks for history */
              ac->ac_f_ex = ac->ac_b_ex;
      
              ac->ac_status = AC_STATUS_FOUND;
              ac->ac_tail = ret & 0xffff;
              ac->ac_buddy = ret >> 16;
      
              /*
               * take the page reference. We want the page to be pinned
                * so that we don't get an ext4_mb_init_cache() call for this
               * group until we update the bitmap. That would mean we
               * double allocate blocks. The reference is dropped
               * in ext4_mb_release_context
               */
              ac->ac_bitmap_page = e4b->bd_bitmap_page;
  660         get_page(ac->ac_bitmap_page);
  660         ac->ac_buddy_page = e4b->bd_buddy_page;
  660         get_page(ac->ac_buddy_page);
              /* store last allocated for subsequent stream allocation */
  660         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
  631                 spin_lock(&sbi->s_md_lock);
                      sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
                      sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
                      spin_unlock(&sbi->s_md_lock);
              }
  660 }
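       /*
        * Illustrative sketch, not part of mballoc: mb_mark_used() packs the
        * order of the first split buddy and the remaining length into one
        * int ("len | (ord << 16)"), which ext4_mb_use_best_found() unpacks
        * into ac_buddy and ac_tail.  The demo_* helpers are hypothetical.
        */
       static inline unsigned int demo_pack_tail(unsigned int tail_len, unsigned int order)
       {
               return tail_len | (order << 16);
       }

       static inline void demo_unpack_tail(unsigned int packed,
                                           unsigned int *tail_len, unsigned int *order)
       {
               *tail_len = packed & 0xffff;    /* becomes ac->ac_tail */
               *order = packed >> 16;          /* becomes ac->ac_buddy */
       }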
      
      /*
       * regular allocator, for general purposes allocation
       */
      
      static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
                                              struct ext4_buddy *e4b,
                                              int finish_group)
      {
  424         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              struct ext4_free_extent *bex = &ac->ac_b_ex;
              struct ext4_free_extent *gex = &ac->ac_g_ex;
              struct ext4_free_extent ex;
              int max;
      
  502         if (ac->ac_status == AC_STATUS_FOUND)
                      return;
              /*
               * We don't want to scan for a whole year
               */
              if (ac->ac_found > sbi->s_mb_max_to_scan &&
   22                         !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
   22                 ac->ac_status = AC_STATUS_BREAK;
                      return;
              }
      
              /*
               * Haven't found good chunk so far, let's continue
               */
  424         if (bex->fe_len < gex->fe_len)
                      return;
      
  502         if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
  259                         && bex->fe_group == e4b->bd_group) {
                      /* recheck chunk's availability - we don't know
                       * when it was found (within this lock-unlock
                       * period or not) */
  259                 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
                      if (max >= gex->fe_len) {
  259                         ext4_mb_use_best_found(ac, e4b);
                              return;
                      }
              }
      }
      
      /*
        * The routine checks whether the found extent is good enough. If it is,
        * the extent is marked used and a flag is set in the context to stop
        * scanning. Otherwise, the extent is compared with the previously found
        * extent and, if the new one is better, it is stored in the context.
        * Later, the best found extent will be used if mballoc can't find a
        * good enough extent.
       *
       * FIXME: real allocation policy is to be designed yet!
       */
      static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
                                              struct ext4_free_extent *ex,
                                              struct ext4_buddy *e4b)
      {
              struct ext4_free_extent *bex = &ac->ac_b_ex;
              struct ext4_free_extent *gex = &ac->ac_g_ex;
      
              BUG_ON(ex->fe_len <= 0);
              BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
  502         BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
  502         BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
      
  502         ac->ac_found++;
      
              /*
               * The special case - take what you catch first
               */
              if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  439                 *bex = *ex;
                      ext4_mb_use_best_found(ac, e4b);
                      return;
              }
      
              /*
                * Let's check whether the chunk is good enough
               */
  502         if (ex->fe_len == gex->fe_len) {
                      *bex = *ex;
                      ext4_mb_use_best_found(ac, e4b);
                      return;
              }
      
              /*
                * If this is the first found extent, just store it in the context
               */
  435         if (bex->fe_len == 0) {
  435                 *bex = *ex;
                      return;
              }
      
              /*
               * If new found extent is better, store it in the context
               */
  422         if (bex->fe_len < gex->fe_len) {
                       /* if the request isn't satisfied, any found extent
                        * larger than the previous best one is better */
  213                 if (ex->fe_len > bex->fe_len)
  335                         *bex = *ex;
  386         } else if (ex->fe_len > gex->fe_len) {
                      /* if the request is satisfied, then we try to find
                        * an extent that still satisfies the request, but is
                        * smaller than the previous one */
  377                 if (ex->fe_len < bex->fe_len)
                              *bex = *ex;
              }
      
  422         ext4_mb_check_limits(ac, e4b, 0);
      }
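       /*
        * Illustrative sketch, not part of mballoc: the selection policy of
        * ext4_mb_measure_extent() above, reduced to extent lengths only.
        * demo_is_better() is a hypothetical helper; it returns nonzero when
        * 'cand' should replace 'best' for a goal of 'goal' clusters.
        */
       static int demo_is_better(unsigned int goal, unsigned int best, unsigned int cand)
       {
               if (cand == goal)
                       return 1;               /* exact fit is taken immediately */
               if (best == 0)
                       return 1;               /* first extent found is always kept */
               if (best < goal)
                       return cand > best;     /* goal not met yet: bigger is better */
               if (cand > goal)
                       return cand < best;     /* goal met: smallest sufficient one wins */
               return 0;                       /* don't trade a satisfying extent away */
       }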
      
      static noinline_for_stack
      int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                                              struct ext4_buddy *e4b)
      {
  183         struct ext4_free_extent ex = ac->ac_b_ex;
              ext4_group_t group = ex.fe_group;
              int max;
              int err;
      
              BUG_ON(ex.fe_len <= 0);
  183         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
  183         if (err)
                      return err;
      
  183         ext4_lock_group(ac->ac_sb, group);
  183         max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
      
              if (max > 0) {
  183                 ac->ac_b_ex = ex;
                      ext4_mb_use_best_found(ac, e4b);
              }
      
  183         ext4_unlock_group(ac->ac_sb, group);
              ext4_mb_unload_buddy(e4b);
      
              return 0;
      }
      
      static noinline_for_stack
      int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                                      struct ext4_buddy *e4b)
      {
  661         ext4_group_t group = ac->ac_g_ex.fe_group;
              int max;
              int err;
              struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  398         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
              struct ext4_free_extent ex;
      
  661         if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
                      return 0;
  661         if (grp->bb_free == 0)
                      return 0;
      
  397         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
              if (err)
                      return err;
      
  397         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
   32                 ext4_mb_unload_buddy(e4b);
                      return 0;
              }
      
  397         ext4_lock_group(ac->ac_sb, group);
  397         max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
                                   ac->ac_g_ex.fe_len, &ex);
              ex.fe_logical = 0xDEADFA11; /* debug value */
      
  201         if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
                      ext4_fsblk_t start;
      
                      start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
                              ex.fe_start;
                      /* use do_div to get remainder (would be 64-bit modulo) */
                      if (do_div(start, sbi->s_stripe) == 0) {
                              ac->ac_found++;
                              ac->ac_b_ex = ex;
                              ext4_mb_use_best_found(ac, e4b);
                      }
              } else if (max >= ac->ac_g_ex.fe_len) {
  201                 BUG_ON(ex.fe_len <= 0);
  201                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
  201                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
  201                 ac->ac_found++;
                      ac->ac_b_ex = ex;
                      ext4_mb_use_best_found(ac, e4b);
  377         } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
                       /* Sometimes, the caller may want to merge even a small
                        * number of blocks into an existing extent */
                      BUG_ON(ex.fe_len <= 0);
                      BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
                      BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
                      ac->ac_found++;
                      ac->ac_b_ex = ex;
                      ext4_mb_use_best_found(ac, e4b);
              }
  397         ext4_unlock_group(ac->ac_sb, group);
              ext4_mb_unload_buddy(e4b);
      
              return 0;
      }
      
      /*
        * The routine scans buddy structures (not the bitmap!) from the given
        * order up to the max order and tries to find a big enough chunk to
        * satisfy the request
       */
      static noinline_for_stack
      void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                                              struct ext4_buddy *e4b)
      {
  584         struct super_block *sb = ac->ac_sb;
              struct ext4_group_info *grp = e4b->bd_info;
              void *buddy;
              int i;
              int k;
              int max;
      
              BUG_ON(ac->ac_2order <= 0);
  584         for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
  584                 if (grp->bb_counters[i] == 0)
                              continue;
      
  584                 buddy = mb_find_buddy(e4b, i, &max);
                      BUG_ON(buddy == NULL);
      
  584                 k = mb_find_next_zero_bit(buddy, max, 0);
                      BUG_ON(k >= max);
      
  584                 ac->ac_found++;
      
                      ac->ac_b_ex.fe_len = 1 << i;
                      ac->ac_b_ex.fe_start = k << i;
                      ac->ac_b_ex.fe_group = e4b->bd_group;
      
                      ext4_mb_use_best_found(ac, e4b);
      
                      BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
      
  584                 if (EXT4_SB(sb)->s_mb_stats)
                              atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
      
                      break;
              }
      }
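       /*
        * Illustrative sketch, not part of mballoc: the order-by-order search
        * above, reduced to the per-order free-chunk counters.  demo_pick_order()
        * is hypothetical; it returns the smallest order >= want_order that
        * still has a free chunk (grp->bb_counters[i] != 0 in the kernel), or -1.
        */
       static int demo_pick_order(const unsigned int *counters, int max_order,
                                  int want_order)
       {
               int i;

               for (i = want_order; i <= max_order; i++)
                       if (counters[i])
                               return i;
               return -1;
       }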
      
      /*
       * The routine scans the group and measures all found extents.
        * In order to optimize scanning, the caller must pass the number of
        * free blocks in the group, so the routine can know the upper limit.
       */
      static noinline_for_stack
      void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                                              struct ext4_buddy *e4b)
      {
  502         struct super_block *sb = ac->ac_sb;
              void *bitmap = e4b->bd_bitmap;
              struct ext4_free_extent ex;
              int i;
              int free;
      
              free = e4b->bd_info->bb_free;
              BUG_ON(free <= 0);
      
  502         i = e4b->bd_info->bb_first_free;
      
  502         while (free && ac->ac_status == AC_STATUS_CONTINUE) {
                      i = mb_find_next_zero_bit(bitmap,
  502                                                 EXT4_CLUSTERS_PER_GROUP(sb), i);
                      if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
                              /*
                               * If we have a corrupt bitmap, we won't find any
                               * free blocks even though group info says we
                               * have free blocks
                               */
                              ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                              "%d free clusters as per "
                                              "group info. But bitmap says 0",
                                              free);
                              break;
                      }
      
  502                 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
                      BUG_ON(ex.fe_len <= 0);
  502                 if (free < ex.fe_len) {
                              ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                              "%d free clusters as per "
                                              "group info. But got %d blocks",
                                              free, ex.fe_len);
                              /*
                               * The number of free blocks differs. This mostly
                               * indicates that the bitmap is corrupt. So exit
                               * without claiming the space.
                               */
                              break;
                      }
  502                 ex.fe_logical = 0xDEADC0DE; /* debug value */
  502                 ext4_mb_measure_extent(ac, &ex, e4b);
      
  502                 i += ex.fe_len;
                      free -= ex.fe_len;
              }
      
  502         ext4_mb_check_limits(ac, e4b, 1);
      }
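       /*
        * Illustrative sketch, not part of mballoc: walking a byte-wise "used"
        * bitmap and counting free extents, roughly what the complex scan above
        * does with mb_find_next_zero_bit() and mb_find_extent().  The kernel
        * bitmap uses little-endian bit order within longs; this hypothetical
        * demo_* helper ignores that detail.
        */
       static unsigned int demo_count_free_extents(const unsigned char *used,
                                                   unsigned int nbits)
       {
               unsigned int i = 0, nextents = 0;

               while (i < nbits) {
                       if (used[i / 8] & (1u << (i % 8))) {
                               i++;            /* cluster in use, keep looking */
                               continue;
                       }
                       nextents++;             /* start of a free extent */
                       while (i < nbits && !(used[i / 8] & (1u << (i % 8))))
                               i++;            /* skip to the end of this extent */
               }
               return nextents;
       }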
      
      /*
        * This is a special case for storage like RAID5: we try to find
        * stripe-aligned chunks for stripe-size-multiple requests
       */
      static noinline_for_stack
      void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                                       struct ext4_buddy *e4b)
      {
              struct super_block *sb = ac->ac_sb;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              void *bitmap = e4b->bd_bitmap;
              struct ext4_free_extent ex;
              ext4_fsblk_t first_group_block;
              ext4_fsblk_t a;
              ext4_grpblk_t i;
              int max;
      
              BUG_ON(sbi->s_stripe == 0);
      
              /* find first stripe-aligned block in group */
              first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
      
              a = first_group_block + sbi->s_stripe - 1;
              do_div(a, sbi->s_stripe);
              i = (a * sbi->s_stripe) - first_group_block;
      
              while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
                      if (!mb_test_bit(i, bitmap)) {
                              max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
                              if (max >= sbi->s_stripe) {
                                      ac->ac_found++;
                                      ex.fe_logical = 0xDEADF00D; /* debug value */
                                      ac->ac_b_ex = ex;
                                      ext4_mb_use_best_found(ac, e4b);
                                      break;
                              }
                      }
                      i += sbi->s_stripe;
              }
      }
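       /*
        * Illustrative sketch, not part of mballoc: rounding a group's first
        * block up to the next stripe boundary, as done with do_div() at the
        * top of ext4_mb_scan_aligned().  demo_first_aligned() is hypothetical
        * and returns the group-relative offset of that boundary.
        */
       static unsigned long long demo_first_aligned(unsigned long long first_group_block,
                                                    unsigned int stripe)
       {
               unsigned long long a = first_group_block + stripe - 1;

               a -= a % stripe;                        /* floor to a stripe multiple */
               return a - first_group_block;           /* offset within the group */
       }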
      
      /*
       * This is now called BEFORE we load the buddy bitmap.
        * Returns 1 or 0, indicating whether the group is suitable for the
        * allocation. In addition it can also return a negative error code
        * when something goes wrong.
       */
      static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                                      ext4_group_t group, int cr)
      {
              unsigned free, fragments;
  658         int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
  658         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
      
              BUG_ON(cr < 0 || cr >= 4);
      
  658         free = grp->bb_free;
              if (free == 0)
  658                 return 0;
  658         if (cr <= 2 && free < ac->ac_g_ex.fe_len)
                      return 0;
      
  658         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                      return 0;
      
              /* We only do this if the grp has never been initialized */
              if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
    1                 int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
                      if (ret)
                              return ret;
              }
      
  657         fragments = grp->bb_fragments;
              if (fragments == 0)
                      return 0;
      
  657         switch (cr) {
              case 0:
  604                 BUG_ON(ac->ac_2order == 0);
      
                      /* Avoid using the first bg of a flexgroup for data files */
  604                 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
                          (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
                          ((group % flex_size) == 0))
                              return 0;
      
  604                 if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
  604                     (free / fragments) >= ac->ac_g_ex.fe_len)
                              return 1;
      
  277                 if (grp->bb_largest_free_order < ac->ac_2order)
                              return 0;
      
                      return 1;
              case 1:
  498                 if ((free / fragments) >= ac->ac_g_ex.fe_len)
                              return 1;
                      break;
              case 2:
  169                 if (free >= ac->ac_g_ex.fe_len)
                              return 1;
                      break;
              case 3:
                      return 1;
              default:
                      BUG();
              }
      
              return 0;
      }
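       /*
        * Illustrative sketch, not part of mballoc: the per-pass suitability
        * test above, keyed only by free space and fragmentation.  The demo_*
        * helper is hypothetical; 'order_ok' stands in for the
        * bb_largest_free_order >= ac_2order check of the cr == 0 pass.
        */
       static int demo_group_suitable(int cr, unsigned int free, unsigned int fragments,
                                      unsigned int goal_len, int order_ok)
       {
               if (free == 0 || fragments == 0)
                       return 0;
               if (cr <= 2 && free < goal_len)
                       return 0;

               switch (cr) {
               case 0: /* want one buddy chunk of the goal order */
                       return (free / fragments) >= goal_len || order_ok;
               case 1: /* average free extent must cover the goal */
                       return (free / fragments) >= goal_len;
               case 2: /* enough free clusters in total is good enough */
                       return free >= goal_len;
               default: /* cr == 3: take anything non-empty */
                       return 1;
               }
       }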
      
      static noinline_for_stack int
      ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
      {
              ext4_group_t ngroups, group, i;
              int cr;
              int err = 0, first_err = 0;
              struct ext4_sb_info *sbi;
              struct super_block *sb;
              struct ext4_buddy e4b;
      
  661         sb = ac->ac_sb;
              sbi = EXT4_SB(sb);
              ngroups = ext4_get_groups_count(sb);
              /* non-extent files are limited to low blocks/groups */
              if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
  239                 ngroups = sbi->s_blockfile_groups;
      
  661         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
      
              /* first, try the goal */
  661         err = ext4_mb_find_by_goal(ac, &e4b);
  661         if (err || ac->ac_status == AC_STATUS_FOUND)
                      goto out;
      
  658         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                      goto out;
      
              /*
               * ac->ac_2order is set only if the fe_len is a power of 2.
               * If ac_2order is set we also set the criteria to 0 so that we
               * try an exact allocation using the buddy data.
               */
  658         i = fls(ac->ac_g_ex.fe_len);
              ac->ac_2order = 0;
              /*
               * We search using buddy data only if the order of the request
               * is greater than or equal to sbi->s_mb_order2_reqs.
               * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
               * We also support searching for power-of-two requests only for
               * requests up to the maximum buddy size we have constructed.
               */
  632         if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
                      /*
                        * This should tell if fe_len is exactly a power of 2
                       */
  632                 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
  616                         ac->ac_2order = array_index_nospec(i - 1,
                                                                 sb->s_blocksize_bits + 2);
              }
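               /*
                * Worked example (illustrative): for fe_len == 16, fls() returns 5,
                * 16 & ~(1 << 4) == 0, so ac_2order becomes 4 and the cr == 0 pass
                * can look for a single order-4 buddy chunk.  For fe_len == 24,
                * 24 & ~(1 << 4) == 8, so ac_2order stays 0 and scanning starts
                * at cr == 1.
                */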
      
              /* if stream allocation is enabled, use global goal */
  658         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                      /* TBD: may be hot point */
  629                 spin_lock(&sbi->s_md_lock);
                      ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
                      ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
                      spin_unlock(&sbi->s_md_lock);
              }
      
               /* Let's just scan groups to find more or less suitable blocks */
  658         cr = ac->ac_2order ? 0 : 1;
              /*
                * cr == 0 tries to get an exact allocation,
                * cr == 3 tries to get anything
               */
      repeat:
  658         for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
  658                 ac->ac_criteria = cr;
                      /*
                       * searching for the right group start
                       * from the goal value specified
                       */
                      group = ac->ac_g_ex.fe_group;
      
  282                 for (i = 0; i < ngroups; group++, i++) {
                              int ret = 0;
  658                         cond_resched();
                              /*
                               * Artificially restricted ngroups for non-extent
                               * files makes group > ngroups possible on first loop.
                               */
                              if (group >= ngroups)
                                      group = 0;
      
                              /* This now checks without needing the buddy page */
  658                         ret = ext4_mb_good_group(ac, group, cr);
                              if (ret <= 0) {
  282                                 if (!first_err)
                                              first_err = ret;
                                      continue;
                              }
      
  657                         err = ext4_mb_load_buddy(sb, group, &e4b);
                              if (err)
                                      goto out;
      
  657                         ext4_lock_group(sb, group);
      
                              /*
                               * We need to check again after locking the
                               * block group
                               */
  657                         ret = ext4_mb_good_group(ac, group, cr);
                              if (ret <= 0) {
                                      ext4_unlock_group(sb, group);
                                      ext4_mb_unload_buddy(&e4b);
                                      if (!first_err)
                                              first_err = ret;
                                      continue;
                              }
      
  657                         ac->ac_groups_scanned++;
                              if (cr == 0)
  584                                 ext4_mb_simple_scan_group(ac, &e4b);
  502                         else if (cr == 1 && sbi->s_stripe &&
                                              !(ac->ac_g_ex.fe_len % sbi->s_stripe))
                                      ext4_mb_scan_aligned(ac, &e4b);
                              else
  502                                 ext4_mb_complex_scan_group(ac, &e4b);
      
  657                         ext4_unlock_group(sb, group);
                              ext4_mb_unload_buddy(&e4b);
      
                              if (ac->ac_status != AC_STATUS_CONTINUE)
                                      break;
                      }
              }
      
  658         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
  183             !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
                      /*
                       * We've been searching too long. Let's try to allocate
                       * the best chunk we've found so far
                       */
      
  183                 ext4_mb_try_best_found(ac, &e4b);
                      if (ac->ac_status != AC_STATUS_FOUND) {
                              /*
                        * Someone luckier has already allocated it.
                        * The only thing we can do is just take the first
                        * found block(s)
                              printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
                               */
   28                         ac->ac_b_ex.fe_group = 0;
                              ac->ac_b_ex.fe_start = 0;
                              ac->ac_b_ex.fe_len = 0;
                              ac->ac_status = AC_STATUS_CONTINUE;
                              ac->ac_flags |= EXT4_MB_HINT_FIRST;
  658                         cr = 3;
                              atomic_inc(&sbi->s_mb_lost_chunks);
                              goto repeat;
                      }
              }
      out:
   16         if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
                      err = first_err;
  661         return err;
      }
      
      static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
      {
              struct super_block *sb = seq->private;
              ext4_group_t group;
      
              if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                      return NULL;
              group = *pos + 1;
              return (void *) ((unsigned long) group);
      }
      
      static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
      {
              struct super_block *sb = seq->private;
              ext4_group_t group;
      
              ++*pos;
              if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                      return NULL;
              group = *pos + 1;
              return (void *) ((unsigned long) group);
      }
      
      static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
      {
              struct super_block *sb = seq->private;
              ext4_group_t group = (ext4_group_t) ((unsigned long) v);
              int i;
              int err, buddy_loaded = 0;
              struct ext4_buddy e4b;
              struct ext4_group_info *grinfo;
              struct sg {
                      struct ext4_group_info info;
                      ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
              } sg;
      
              group--;
              if (group == 0)
                      seq_puts(seq, "#group: free  frags first ["
                                    " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
                                    " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
      
              i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
                      sizeof(struct ext4_group_info);
              grinfo = ext4_get_group_info(sb, group);
              /* Load the group info in memory only if not already loaded. */
              if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
                      err = ext4_mb_load_buddy(sb, group, &e4b);
                      if (err) {
                              seq_printf(seq, "#%-5u: I/O error\n", group);
                              return 0;
                      }
                      buddy_loaded = 1;
              }
      
              memcpy(&sg, ext4_get_group_info(sb, group), i);
      
              if (buddy_loaded)
                      ext4_mb_unload_buddy(&e4b);
      
              seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
                              sg.info.bb_fragments, sg.info.bb_first_free);
              for (i = 0; i <= 13; i++)
                      seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
                                      sg.info.bb_counters[i] : 0);
              seq_printf(seq, " ]\n");
      
              return 0;
      }
      
      static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
      {
      }
      
      static const struct seq_operations ext4_mb_seq_groups_ops = {
              .start  = ext4_mb_seq_groups_start,
              .next   = ext4_mb_seq_groups_next,
              .stop   = ext4_mb_seq_groups_stop,
              .show   = ext4_mb_seq_groups_show,
      };
      
      static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
      {
              struct super_block *sb = PDE_DATA(inode);
              int rc;
      
              rc = seq_open(file, &ext4_mb_seq_groups_ops);
              if (rc == 0) {
                      struct seq_file *m = file->private_data;
                      m->private = sb;
              }
              return rc;
      
      }
      
      const struct file_operations ext4_seq_mb_groups_fops = {
              .owner                = THIS_MODULE,
              .open                = ext4_mb_seq_groups_open,
              .read                = seq_read,
              .llseek                = seq_lseek,
              .release        = seq_release,
      };
      
      static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
      {
              int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
              struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
      
              BUG_ON(!cachep);
              return cachep;
      }
      
      /*
       * Allocate the top-level s_group_info array for the specified number
       * of groups
       */
      int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
      {
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              unsigned size;
              struct ext4_group_info ***new_groupinfo;
      
              size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
                      EXT4_DESC_PER_BLOCK_BITS(sb);
              if (size <= sbi->s_group_info_size)
                      return 0;
      
              size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
              new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
              if (!new_groupinfo) {
                      ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
                      return -ENOMEM;
              }
              if (sbi->s_group_info) {
                      memcpy(new_groupinfo, sbi->s_group_info,
                             sbi->s_group_info_size * sizeof(*sbi->s_group_info));
                      kvfree(sbi->s_group_info);
              }
              sbi->s_group_info = new_groupinfo;
              sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
              ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
                         sbi->s_group_info_size);
              return 0;
      }
      
      /* Create and initialize ext4_group_info data for the given group. */
      int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                                struct ext4_group_desc *desc)
      {
              int i;
              int metalen = 0;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct ext4_group_info **meta_group_info;
              struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
      
              /*
               * First check if this group is the first of a reserved block.
               * If it's true, we have to allocate a new table of pointers
               * to ext4_group_info structures
               */
              if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
                      metalen = sizeof(*meta_group_info) <<
                              EXT4_DESC_PER_BLOCK_BITS(sb);
                      meta_group_info = kmalloc(metalen, GFP_NOFS);
                      if (meta_group_info == NULL) {
                              ext4_msg(sb, KERN_ERR, "can't allocate mem "
                                       "for a buddy group");
                              goto exit_meta_group_info;
                      }
                      sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
                              meta_group_info;
              }
      
              meta_group_info =
                      sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
              i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
      
              meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
              if (meta_group_info[i] == NULL) {
                      ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
                      goto exit_group_info;
              }
              set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                      &(meta_group_info[i]->bb_state));
      
              /*
               * initialize bb_free to be able to skip
               * empty groups without initialization
               */
              if (ext4_has_group_desc_csum(sb) &&
                  (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                      meta_group_info[i]->bb_free =
                              ext4_free_clusters_after_init(sb, group, desc);
              } else {
                      meta_group_info[i]->bb_free =
                              ext4_free_group_clusters(sb, desc);
              }
      
              INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
              init_rwsem(&meta_group_info[i]->alloc_sem);
              meta_group_info[i]->bb_free_root = RB_ROOT;
              meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
      
      #ifdef DOUBLE_CHECK
              {
                      struct buffer_head *bh;
                      meta_group_info[i]->bb_bitmap =
                              kmalloc(sb->s_blocksize, GFP_NOFS);
                      BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
                      bh = ext4_read_block_bitmap(sb, group);
                      BUG_ON(IS_ERR_OR_NULL(bh));
                      memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
                              sb->s_blocksize);
                      put_bh(bh);
              }
      #endif
      
              return 0;
      
      exit_group_info:
              /* If a meta_group_info table has been allocated, release it now */
              if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
                      kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
                      sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
              }
      exit_meta_group_info:
              return -ENOMEM;
      } /* ext4_mb_add_groupinfo */
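       /*
        * Illustrative sketch, not part of mballoc: the two-level lookup used
        * for sbi->s_group_info, split into an outer index (which block of
        * pointers) and an inner index (slot within that block).  The DEMO_*
        * constants and demo_* helper are hypothetical stand-ins for
        * EXT4_DESC_PER_BLOCK_BITS()/EXT4_DESC_PER_BLOCK().
        */
       #define DEMO_DESC_PER_BLOCK_BITS        7
       #define DEMO_DESC_PER_BLOCK             (1u << DEMO_DESC_PER_BLOCK_BITS)

       static void demo_groupinfo_index(unsigned int group,
                                        unsigned int *outer, unsigned int *inner)
       {
               *outer = group >> DEMO_DESC_PER_BLOCK_BITS;     /* which pointer block */
               *inner = group & (DEMO_DESC_PER_BLOCK - 1);     /* slot within it */
       }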
      
      static int ext4_mb_init_backend(struct super_block *sb)
      {
              ext4_group_t ngroups = ext4_get_groups_count(sb);
              ext4_group_t i;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              int err;
              struct ext4_group_desc *desc;
              struct kmem_cache *cachep;
      
              err = ext4_mb_alloc_groupinfo(sb, ngroups);
              if (err)
                      return err;
      
              sbi->s_buddy_cache = new_inode(sb);
              if (sbi->s_buddy_cache == NULL) {
                      ext4_msg(sb, KERN_ERR, "can't get new inode");
                      goto err_freesgi;
              }
               /* To avoid potentially colliding with a valid on-disk inode number,
               * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
               * not in the inode hash, so it should never be found by iget(), but
               * this will avoid confusion if it ever shows up during debugging. */
              sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
              EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
              for (i = 0; i < ngroups; i++) {
                      desc = ext4_get_group_desc(sb, i, NULL);
                      if (desc == NULL) {
                              ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
                              goto err_freebuddy;
                      }
                      if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
                              goto err_freebuddy;
              }
      
              return 0;
      
      err_freebuddy:
              cachep = get_groupinfo_cache(sb->s_blocksize_bits);
              while (i-- > 0)
                      kmem_cache_free(cachep, ext4_get_group_info(sb, i));
              i = sbi->s_group_info_size;
              while (i-- > 0)
                      kfree(sbi->s_group_info[i]);
              iput(sbi->s_buddy_cache);
      err_freesgi:
              kvfree(sbi->s_group_info);
              return -ENOMEM;
      }
      
      static void ext4_groupinfo_destroy_slabs(void)
      {
              int i;
      
              for (i = 0; i < NR_GRPINFO_CACHES; i++) {
                      if (ext4_groupinfo_caches[i])
                              kmem_cache_destroy(ext4_groupinfo_caches[i]);
                      ext4_groupinfo_caches[i] = NULL;
              }
      }
      
      static int ext4_groupinfo_create_slab(size_t size)
      {
              static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
              int slab_size;
              int blocksize_bits = order_base_2(size);
              int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
              struct kmem_cache *cachep;
      
              if (cache_index >= NR_GRPINFO_CACHES)
                      return -EINVAL;
      
              if (unlikely(cache_index < 0))
                      cache_index = 0;
      
              mutex_lock(&ext4_grpinfo_slab_create_mutex);
              if (ext4_groupinfo_caches[cache_index]) {
                      mutex_unlock(&ext4_grpinfo_slab_create_mutex);
                      return 0;        /* Already created */
              }
      
              slab_size = offsetof(struct ext4_group_info,
                                      bb_counters[blocksize_bits + 2]);
      
              cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
                                              slab_size, 0, SLAB_RECLAIM_ACCOUNT,
                                              NULL);
      
              ext4_groupinfo_caches[cache_index] = cachep;
      
              mutex_unlock(&ext4_grpinfo_slab_create_mutex);
              if (!cachep) {
                      printk(KERN_EMERG
                             "EXT4-fs: no memory for groupinfo slab cache\n");
                      return -ENOMEM;
              }
      
              return 0;
      }
      
      int ext4_mb_init(struct super_block *sb)
      {
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              unsigned i, j;
              unsigned offset, offset_incr;
              unsigned max;
              int ret;
      
              i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
      
              sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
              if (sbi->s_mb_offsets == NULL) {
                      ret = -ENOMEM;
                      goto out;
              }
      
              i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
              sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
              if (sbi->s_mb_maxs == NULL) {
                      ret = -ENOMEM;
                      goto out;
              }
      
              ret = ext4_groupinfo_create_slab(sb->s_blocksize);
              if (ret < 0)
                      goto out;
      
              /* order 0 is regular bitmap */
              sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
              sbi->s_mb_offsets[0] = 0;
      
              i = 1;
              offset = 0;
              offset_incr = 1 << (sb->s_blocksize_bits - 1);
              max = sb->s_blocksize << 2;
              do {
                      sbi->s_mb_offsets[i] = offset;
                      sbi->s_mb_maxs[i] = max;
                      offset += offset_incr;
                      offset_incr = offset_incr >> 1;
                      max = max >> 1;
                      i++;
              } while (i <= sb->s_blocksize_bits + 1);
      
              spin_lock_init(&sbi->s_md_lock);
              spin_lock_init(&sbi->s_bal_lock);
      
              sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
              sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
              sbi->s_mb_stats = MB_DEFAULT_STATS;
              sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
              sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
              /*
               * The default group preallocation is 512, which for 4k block
               * sizes translates to 2 megabytes.  However for bigalloc file
                * systems, this is probably too big (i.e., if the cluster size
               * is 1 megabyte, then group preallocation size becomes half a
               * gigabyte!).  As a default, we will keep a two megabyte
                * group prealloc size for cluster sizes up to 64k, and after
               * that, we will force a minimum group preallocation size of
               * 32 clusters.  This translates to 8 megs when the cluster
               * size is 256k, and 32 megs when the cluster size is 1 meg,
               * which seems reasonable as a default.
               */
              sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
                                             sbi->s_cluster_bits, 32);
              /*
               * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
               * to the lowest multiple of s_stripe which is bigger than
               * the s_mb_group_prealloc as determined above. We want
               * the preallocation size to be an exact multiple of the
               * RAID stripe size so that preallocations don't fragment
               * the stripes.
               */
              if (sbi->s_stripe > 1) {
                      sbi->s_mb_group_prealloc = roundup(
                              sbi->s_mb_group_prealloc, sbi->s_stripe);
              }
      
              sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
              if (sbi->s_locality_groups == NULL) {
                      ret = -ENOMEM;
                      goto out;
              }
              for_each_possible_cpu(i) {
                      struct ext4_locality_group *lg;
                      lg = per_cpu_ptr(sbi->s_locality_groups, i);
                      mutex_init(&lg->lg_mutex);
                      for (j = 0; j < PREALLOC_TB_SIZE; j++)
                              INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
                      spin_lock_init(&lg->lg_prealloc_lock);
              }
      
              /* init file for buddy data */
              ret = ext4_mb_init_backend(sb);
              if (ret != 0)
                      goto out_free_locality_groups;
      
              return 0;
      
      out_free_locality_groups:
              free_percpu(sbi->s_locality_groups);
              sbi->s_locality_groups = NULL;
      out:
              kfree(sbi->s_mb_offsets);
              sbi->s_mb_offsets = NULL;
              kfree(sbi->s_mb_maxs);
              sbi->s_mb_maxs = NULL;
              return ret;
      }
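       /*
        * Illustrative sketch, not part of mballoc: how the per-order
        * offset/max tables built in ext4_mb_init() above shrink by halves.
        * demo_fill_buddy_tables() is hypothetical; both arrays must have
        * blocksize_bits + 2 entries (orders 0..blocksize_bits + 1).
        */
       static void demo_fill_buddy_tables(unsigned int blocksize_bits,
                                          unsigned int *offsets, unsigned int *maxs)
       {
               unsigned int i = 1;
               unsigned int offset = 0;
               unsigned int offset_incr = 1u << (blocksize_bits - 1);
               unsigned int max = 1u << (blocksize_bits + 2);  /* blocksize << 2, in bits */

               offsets[0] = 0;                                 /* order 0 is the plain bitmap */
               maxs[0] = 1u << (blocksize_bits + 3);           /* blocksize << 3, in bits */
               do {
                       offsets[i] = offset;
                       maxs[i] = max;
                       offset += offset_incr;
                       offset_incr >>= 1;
                       max >>= 1;
                       i++;
               } while (i <= blocksize_bits + 1);
       }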
      
       /* needs to be called with the ext4 group lock held */
      static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
      {
              struct ext4_prealloc_space *pa;
              struct list_head *cur, *tmp;
              int count = 0;
      
              list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
                      pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                      list_del(&pa->pa_group_list);
                      count++;
                      kmem_cache_free(ext4_pspace_cachep, pa);
              }
              if (count)
                      mb_debug(1, "mballoc: %u PAs left\n", count);
      
      }
      
      int ext4_mb_release(struct super_block *sb)
      {
              ext4_group_t ngroups = ext4_get_groups_count(sb);
              ext4_group_t i;
              int num_meta_group_infos;
              struct ext4_group_info *grinfo;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
      
              if (sbi->s_group_info) {
                      for (i = 0; i < ngroups; i++) {
                              grinfo = ext4_get_group_info(sb, i);
      #ifdef DOUBLE_CHECK
                              kfree(grinfo->bb_bitmap);
      #endif
                              ext4_lock_group(sb, i);
                              ext4_mb_cleanup_pa(grinfo);
                              ext4_unlock_group(sb, i);
                              kmem_cache_free(cachep, grinfo);
                      }
                      num_meta_group_infos = (ngroups +
                                      EXT4_DESC_PER_BLOCK(sb) - 1) >>
                              EXT4_DESC_PER_BLOCK_BITS(sb);
                      for (i = 0; i < num_meta_group_infos; i++)
                              kfree(sbi->s_group_info[i]);
                      kvfree(sbi->s_group_info);
              }
              kfree(sbi->s_mb_offsets);
              kfree(sbi->s_mb_maxs);
              iput(sbi->s_buddy_cache);
              if (sbi->s_mb_stats) {
                      ext4_msg(sb, KERN_INFO,
                             "mballoc: %u blocks %u reqs (%u success)",
                                      atomic_read(&sbi->s_bal_allocated),
                                      atomic_read(&sbi->s_bal_reqs),
                                      atomic_read(&sbi->s_bal_success));
                      ext4_msg(sb, KERN_INFO,
                            "mballoc: %u extents scanned, %u goal hits, "
                                      "%u 2^N hits, %u breaks, %u lost",
                                      atomic_read(&sbi->s_bal_ex_scanned),
                                      atomic_read(&sbi->s_bal_goals),
                                      atomic_read(&sbi->s_bal_2orders),
                                      atomic_read(&sbi->s_bal_breaks),
                                      atomic_read(&sbi->s_mb_lost_chunks));
                      ext4_msg(sb, KERN_INFO,
                             "mballoc: %lu generated and it took %Lu",
                                      sbi->s_mb_buddies_generated,
                                      sbi->s_mb_generation_time);
                      ext4_msg(sb, KERN_INFO,
                             "mballoc: %u preallocated, %u discarded",
                                      atomic_read(&sbi->s_mb_preallocated),
                                      atomic_read(&sbi->s_mb_discarded));
              }
      
              free_percpu(sbi->s_locality_groups);
      
              return 0;
      }
      
      static inline int ext4_issue_discard(struct super_block *sb,
                      ext4_group_t block_group, ext4_grpblk_t cluster, int count,
                      unsigned long flags)
      {
              ext4_fsblk_t discard_block;
      
              discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
                               ext4_group_first_block_no(sb, block_group));
              count = EXT4_C2B(EXT4_SB(sb), count);
              trace_ext4_discard_blocks(sb,
                              (unsigned long long) discard_block, count);
              return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
      }
      
      /*
       * This function is called by the jbd2 layer once the commit has finished,
       * so we know we can free the blocks that were released with that commit.
       */
      static void ext4_free_data_callback(struct super_block *sb,
                                          struct ext4_journal_cb_entry *jce,
                                          int rc)
      {
              struct ext4_free_data *entry = (struct ext4_free_data *)jce;
              struct ext4_buddy e4b;
              struct ext4_group_info *db;
              int err, count = 0, count2 = 0;
      
              mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                       entry->efd_count, entry->efd_group, entry);
      
              if (test_opt(sb, DISCARD)) {
                      err = ext4_issue_discard(sb, entry->efd_group,
                                               entry->efd_start_cluster,
                                               entry->efd_count, 0);
                      if (err && err != -EOPNOTSUPP)
                              ext4_msg(sb, KERN_WARNING, "discard request in"
                                       " group:%d block:%d count:%d failed"
                                       " with %d", entry->efd_group,
                                       entry->efd_start_cluster,
                                       entry->efd_count, err);
              }
      
              err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
              /* we expect to find existing buddy because it's pinned */
              BUG_ON(err != 0);
      
      
              db = e4b.bd_info;
              /* there are blocks to put in buddy to make them really free */
              count += entry->efd_count;
              count2++;
              ext4_lock_group(sb, entry->efd_group);
              /* Take it out of per group rb tree */
              rb_erase(&entry->efd_node, &(db->bb_free_root));
              mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
      
              /*
               * Clear the trimmed flag for the group so that the next
               * ext4_trim_fs can trim it.
               * If the volume is mounted with -o discard, online discard
               * is supported and the free blocks will be trimmed online.
               */
              if (!test_opt(sb, DISCARD))
                      EXT4_MB_GRP_CLEAR_TRIMMED(db);
      
              if (!db->bb_free_root.rb_node) {
                      /* No more items in the per group rb tree
                       * balance refcounts from ext4_mb_free_metadata()
                       */
                      page_cache_release(e4b.bd_buddy_page);
                      page_cache_release(e4b.bd_bitmap_page);
              }
              ext4_unlock_group(sb, entry->efd_group);
              kmem_cache_free(ext4_free_data_cachep, entry);
              ext4_mb_unload_buddy(&e4b);
      
              mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
      }
      
      int __init ext4_init_mballoc(void)
      {
              ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
                                              SLAB_RECLAIM_ACCOUNT);
              if (ext4_pspace_cachep == NULL)
                      return -ENOMEM;
      
              ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
                                          SLAB_RECLAIM_ACCOUNT);
              if (ext4_ac_cachep == NULL) {
                      kmem_cache_destroy(ext4_pspace_cachep);
                      return -ENOMEM;
              }
      
              ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
                                                 SLAB_RECLAIM_ACCOUNT);
              if (ext4_free_data_cachep == NULL) {
                      kmem_cache_destroy(ext4_pspace_cachep);
                      kmem_cache_destroy(ext4_ac_cachep);
                      return -ENOMEM;
              }
              return 0;
      }
      
      void ext4_exit_mballoc(void)
      {
              /*
               * Wait for completion of call_rcu()'s on ext4_pspace_cachep
               * before destroying the slab cache.
               */
              rcu_barrier();
              kmem_cache_destroy(ext4_pspace_cachep);
              kmem_cache_destroy(ext4_ac_cachep);
              kmem_cache_destroy(ext4_free_data_cachep);
              ext4_groupinfo_destroy_slabs();
      }
      
      
      /*
       * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
        * Returns 0 on success or an error code
       */
      static noinline_for_stack int
      ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                                      handle_t *handle, unsigned int reserv_clstrs)
      {
              struct buffer_head *bitmap_bh = NULL;
              struct ext4_group_desc *gdp;
              struct buffer_head *gdp_bh;
              struct ext4_sb_info *sbi;
              struct super_block *sb;
              ext4_fsblk_t block;
              int err, len;
      
  709         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
  709         BUG_ON(ac->ac_b_ex.fe_len <= 0);
      
  709         sb = ac->ac_sb;
              sbi = EXT4_SB(sb);
      
              bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
              if (IS_ERR(bitmap_bh)) {
                      err = PTR_ERR(bitmap_bh);
                      bitmap_bh = NULL;
                      goto out_err;
              }
      
              BUFFER_TRACE(bitmap_bh, "getting write access");
  709         err = ext4_journal_get_write_access(handle, bitmap_bh);
              if (err)
                      goto out_err;
      
              err = -EIO;
  709         gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
              if (!gdp)
                      goto out_err;
      
  709         ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
                              ext4_free_group_clusters(sb, gdp));
      
              BUFFER_TRACE(gdp_bh, "get_write_access");
              err = ext4_journal_get_write_access(handle, gdp_bh);
              if (err)
                      goto out_err;
      
  709         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
      
              len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
              if (!ext4_data_block_valid(sbi, block, len)) {
                      ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                                 "fs metadata", block, block+len);
                       /* File system mounted not to panic on error.
                        * Fix the bitmap and return EFSCORRUPTED.
                        * We leak some of the blocks here.
                       */
                      ext4_lock_group(sb, ac->ac_b_ex.fe_group);
                      ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                                    ac->ac_b_ex.fe_len);
                      ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                      err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                      if (!err)
                              err = -EFSCORRUPTED;
                      goto out_err;
              }
      
  709         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
      #ifdef AGGRESSIVE_CHECK
              {
                      int i;
                      for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
                              BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
                                                      bitmap_bh->b_data));
                      }
              }
      #endif
  709         ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                            ac->ac_b_ex.fe_len);
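               /*
                * If the group descriptor is still flagged BLOCK_UNINIT, clear
                * the flag and initialize its free-cluster count now, before
                * this allocation is subtracted from it below.
                */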
              if (ext4_has_group_desc_csum(sb) &&
  709             (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
    6                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                      ext4_free_group_clusters_set(sb, gdp,
                                                   ext4_free_clusters_after_init(sb,
                                                      ac->ac_b_ex.fe_group, gdp));
              }
  709         len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
              ext4_free_group_clusters_set(sb, gdp, len);
              ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
              ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
      
              ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
              percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
              /*
               * Now reduce the dirty block count also. Should not go negative
               */
              if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
                      /* release all the reserved blocks if non delalloc */
  527                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                         reserv_clstrs);
      
  709         if (sbi->s_log_groups_per_flex) {
  709                 ext4_group_t flex_group = ext4_flex_group(sbi,
                                                                ac->ac_b_ex.fe_group);
                      atomic64_sub(ac->ac_b_ex.fe_len,
                                   &sbi->s_flex_groups[flex_group].free_clusters);
              }
      
  709         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
              if (err)
                      goto out_err;
  709         err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
      
      out_err:
  709         brelse(bitmap_bh);
  709         return err;
      }
      
      /*
        * Here we normalize the request for a locality group.
        * Group requests are normalized to s_mb_group_prealloc, which is set
        * from s_stripe if a stripe size is given via the mount option.
       * s_mb_group_prealloc can be configured via
       * /sys/fs/ext4/<partition>/mb_group_prealloc
       *
       * XXX: should we try to preallocate more than the group has now?
       */
      static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
      {
              struct super_block *sb = ac->ac_sb;
   39         struct ext4_locality_group *lg = ac->ac_lg;
      
              BUG_ON(lg == NULL);
   39         ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
              mb_debug(1, "#%u: goal %u blocks for locality group\n",
                      current->pid, ac->ac_g_ex.fe_len);
      }
      
      /*
        * Normalization means making the request better in terms of
        * size and alignment.
       */
      static noinline_for_stack void
      ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                                      struct ext4_allocation_request *ar)
      {
  635         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              int bsbits, max;
              ext4_lblk_t end;
              loff_t size, start_off;
              loff_t orig_size __maybe_unused;
              ext4_lblk_t start;
  631         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
              struct ext4_prealloc_space *pa;
      
               /* only normalize data requests; metadata requests
                * do not need preallocation */
  661         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                      return;
      
               /* sometimes the caller may want exact blocks */
              if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                      return;
      
              /* caller may indicate that preallocation isn't
               * required (it's a tail, for example) */
  636         if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
                      return;
      
              if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
   39                 ext4_mb_normalize_group_request(ac);
  661                 return ;
              }
      
              bsbits = ac->ac_sb->s_blocksize_bits;
      
               /* first, let's learn the actual file size
                * assuming the current request is allocated */
              size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
              size = size << bsbits;
              if (size < i_size_read(ac->ac_inode))
                      size = i_size_read(ac->ac_inode);
              orig_size = size;
      
              /* max size of free chunks */
  221         max = 2 << bsbits;
      
      #define NRL_CHECK_SIZE(req, size, max, chunk_size)        \
                      (req <= (size) || max <= (chunk_size))
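               /*
                * NRL_CHECK_SIZE() decides whether to stop at a table row: it is
                * true when the request already fits under that row's size, or
                * when the maximum free-chunk size (max) is at most chunk_size.
                */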
      
              /* first, try to predict filesize */
              /* XXX: should this table be tunable? */
              start_off = 0;
              if (size <= 16 * 1024) {
                      size = 16 * 1024;
  631         } else if (size <= 32 * 1024) {
                      size = 32 * 1024;
  631         } else if (size <= 64 * 1024) {
                      size = 64 * 1024;
  631         } else if (size <= 128 * 1024) {
                      size = 128 * 1024;
  458         } else if (size <= 256 * 1024) {
                      size = 256 * 1024;
  425         } else if (size <= 512 * 1024) {
                      size = 512 * 1024;
  394         } else if (size <= 1024 * 1024) {
                      size = 1024 * 1024;
  352         } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
                      start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
  160                                                 (21 - bsbits)) << 21;
                      size = 2 * 1024 * 1024;
  221         } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
                      start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
   20                                                         (22 - bsbits)) << 22;
                      size = 4 * 1024 * 1024;
  211         } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
                                              (8<<20)>>bsbits, max, 8 * 1024)) {
                      start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
  211                                                         (23 - bsbits)) << 23;
                      size = 8 * 1024 * 1024;
              } else {
                      start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
                      size          = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
                                                    ac->ac_o_ex.fe_len) << bsbits;
              }
  631         size = size >> bsbits;
              start = start_off >> bsbits;
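               /*
                * Illustrative example (assuming 4KiB blocks, bsbits == 12): a
                * projected size of 100KiB lands in the 128KiB row above, so
                * size becomes (128 * 1024) >> 12 = 32 blocks, and since
                * start_off stayed 0 for the small rows, start is 0 as well.
                */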
      
              /* don't cover already allocated blocks in selected range */
  544         if (ar->pleft && start <= ar->lleft) {
  348                 size -= ar->lleft + 1 - start;
                      start = ar->lleft + 1;
              }
  631         if (ar->pright && start + size - 1 >= ar->lright)
    8                 size -= start + size - ar->lright;
      
              /*
               * Trim allocation request for filesystems with artificially small
               * groups.
               */
  631         if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
                      size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
      
  631         end = start + size;
      
              /* check we don't cross already preallocated blocks */
  631         rcu_read_lock();
  631         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
                      ext4_lblk_t pa_end;
      
   89                 if (pa->pa_deleted)
                              continue;
   89                 spin_lock(&pa->pa_lock);
                      if (pa->pa_deleted) {
                              spin_unlock(&pa->pa_lock);
                              continue;
                      }
      
   89                 pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
                                                        pa->pa_len);
      
                      /* PA must not overlap original request */
   32                 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
                              ac->ac_o_ex.fe_logical < pa->pa_lstart));
      
                      /* skip PAs this normalized request doesn't overlap with */
   89                 if (pa->pa_lstart >= end || pa_end <= start) {
                              spin_unlock(&pa->pa_lock);
                              continue;
                      }
   60                 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
      
                      /* adjust start or end to be adjacent to this pa */
   60                 if (pa_end <= ac->ac_o_ex.fe_logical) {
                              BUG_ON(pa_end < start);
                              start = pa_end;
    4                 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
                              BUG_ON(pa->pa_lstart > end);
                              end = pa->pa_lstart;
                      }
   89                 spin_unlock(&pa->pa_lock);
              }
  631         rcu_read_unlock();
              size = end - start;
      
              /* XXX: extra loop to check we really don't overlap preallocations */
  631         rcu_read_lock();
  631         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
                      ext4_lblk_t pa_end;
      
   89                 spin_lock(&pa->pa_lock);
                      if (pa->pa_deleted == 0) {
   89                         pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
                                                                pa->pa_len);
   32                         BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
                      }
   89                 spin_unlock(&pa->pa_lock);
              }
  631         rcu_read_unlock();
      
              if (start + size <= ac->ac_o_ex.fe_logical &&
                              start > ac->ac_o_ex.fe_logical) {
                      ext4_msg(ac->ac_sb, KERN_ERR,
                               "start %lu, size %lu, fe_logical %lu",
                               (unsigned long) start, (unsigned long) size,
                               (unsigned long) ac->ac_o_ex.fe_logical);
                      BUG();
              }
  631         BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
      
              /* now prepare goal request */
      
               /* XXX: is it better to align blocks with respect to logical
                * placement or to satisfy the big request as is? */
  631         ac->ac_g_ex.fe_logical = start;
              ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
      
              /* define goal start in order to merge */
   58         if (ar->pright && (ar->lright == (start + size))) {
                      /* merge to the right */
    4                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
                                                      &ac->ac_f_ex.fe_group,
                                                      &ac->ac_f_ex.fe_start);
                      ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
              }
  631         if (ar->pleft && (ar->lleft + 1 == start)) {
                      /* merge to the left */
  398                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
                                                      &ac->ac_f_ex.fe_group,
                                                      &ac->ac_f_ex.fe_start);
                      ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
              }
      
              mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
                      (unsigned) orig_size, (unsigned) start);
      }
      
      static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
      {
  710         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
      
              if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
                      atomic_inc(&sbi->s_bal_reqs);
                      atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
                      if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
                              atomic_inc(&sbi->s_bal_success);
                      atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
                      if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
                                      ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
                              atomic_inc(&sbi->s_bal_goals);
                      if (ac->ac_found > sbi->s_mb_max_to_scan)
                              atomic_inc(&sbi->s_bal_breaks);
              }
      
  710         if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
  661                 trace_ext4_mballoc_alloc(ac);
              else
  521                 trace_ext4_mballoc_prealloc(ac);
      }
      
      /*
       * Called on failure; free up any blocks from the inode PA for this
       * context.  We don't need this for MB_GROUP_PA because we only change
       * pa_free in ext4_mb_release_context(), but on failure, we've already
       * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
       */
    1 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
      {
    1         struct ext4_prealloc_space *pa = ac->ac_pa;
              struct ext4_buddy e4b;
              int err;
      
              if (pa == NULL) {
    1                 if (ac->ac_f_ex.fe_len == 0)
    1                         return;
                      err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
                      if (err) {
                              /*
                               * This should never happen since we pin the
                               * pages in the ext4_allocation_context so
                               * ext4_mb_load_buddy() should never fail.
                               */
                              WARN(1, "mb_load_buddy failed (%d)", err);
                              return;
                      }
                      ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
                      mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
                                     ac->ac_f_ex.fe_len);
                      ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
                      ext4_mb_unload_buddy(&e4b);
                      return;
              }
    1         if (pa->pa_type == MB_INODE_PA)
                      pa->pa_free += ac->ac_b_ex.fe_len;
      }
      
      /*
       * use blocks preallocated to inode
       */
      static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
                                      struct ext4_prealloc_space *pa)
      {
  523         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              ext4_fsblk_t start;
              ext4_fsblk_t end;
              int len;
      
              /* found preallocated blocks, use them */
              start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
              end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
                        start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
              len = EXT4_NUM_B2C(sbi, end - start);
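               /*
                * start is the physical block backing the requested logical
                * offset within this PA, end is clamped to the end of the PA,
                * so len (in clusters) never exceeds what the PA can provide.
                */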
              ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
                                              &ac->ac_b_ex.fe_start);
              ac->ac_b_ex.fe_len = len;
              ac->ac_status = AC_STATUS_FOUND;
              ac->ac_pa = pa;
      
              BUG_ON(start < pa->pa_pstart);
  523         BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
  523         BUG_ON(pa->pa_free < len);
  523         pa->pa_free -= len;
      
              mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
      }
      
      /*
       * use blocks preallocated to locality group
       */
      static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
                                      struct ext4_prealloc_space *pa)
      {
  358         unsigned int len = ac->ac_o_ex.fe_len;
      
              ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
                                              &ac->ac_b_ex.fe_group,
                                              &ac->ac_b_ex.fe_start);
              ac->ac_b_ex.fe_len = len;
              ac->ac_status = AC_STATUS_FOUND;
              ac->ac_pa = pa;
      
               /* We don't correct pa_pstart or pa_plen here to avoid a
                * possible race when the group is being loaded concurrently;
                * instead we correct the pa later, after the blocks are marked
                * in the on-disk bitmap -- see ext4_mb_release_context().
                * Other CPUs are prevented from allocating from this pa by lg_mutex.
                */
              mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
      }
      
      /*
        * Return the prealloc space that has the minimal distance
        * from the goal block. @cpa is the prealloc
        * space with the currently known minimal distance
       * from the goal block.
       */
      static struct ext4_prealloc_space *
      ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
                              struct ext4_prealloc_space *pa,
                              struct ext4_prealloc_space *cpa)
      {
              ext4_fsblk_t cur_distance, new_distance;
      
  355         if (cpa == NULL) {
  355                 atomic_inc(&pa->pa_count);
                      return pa;
              }
   20         cur_distance = abs(goal_block - cpa->pa_pstart);
              new_distance = abs(goal_block - pa->pa_pstart);
      
              if (cur_distance <= new_distance)
                      return cpa;
      
              /* drop the previous reference */
   20         atomic_dec(&cpa->pa_count);
              atomic_inc(&pa->pa_count);
              return pa;
      }
      
      /*
       * search goal blocks in preallocated space
       */
      static noinline_for_stack int
      ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
      {
  699         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              int order, i;
              struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
              struct ext4_locality_group *lg;
              struct ext4_prealloc_space *pa, *cpa = NULL;
              ext4_fsblk_t goal_block;
      
              /* only data can be preallocated */
  710         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
  710                 return 0;
      
              /* first, try per-file preallocation */
  699         rcu_read_lock();
  699         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
      
                       /* none of the fields in this condition change,
                        * so we can skip locking for them */
  462                 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
                          ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
  447                                                EXT4_C2B(sbi, pa->pa_len)))
                              continue;
      
                      /* non-extent files can't have physical blocks past 2^32 */
  445                 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
  210                     (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
                           EXT4_MAX_BLOCK_FILE_PHYS))
                              continue;
      
                      /* found preallocated blocks, use them */
  445                 spin_lock(&pa->pa_lock);
  445                 if (pa->pa_deleted == 0 && pa->pa_free) {
  445                         atomic_inc(&pa->pa_count);
                              ext4_mb_use_inode_pa(ac, pa);
                              spin_unlock(&pa->pa_lock);
                              ac->ac_criteria = 10;
  445                         rcu_read_unlock();
                              return 1;
                      }
                      spin_unlock(&pa->pa_lock);
              }
  694         rcu_read_unlock();
      
              /* can we use group allocation? */
              if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
                      return 0;
      
              /* inode may have no locality group for some reason */
  359         lg = ac->ac_lg;
              if (lg == NULL)
                      return 0;
  359         order  = fls(ac->ac_o_ex.fe_len) - 1;
              if (order > PREALLOC_TB_SIZE - 1)
                      /* The max size of hash table is PREALLOC_TB_SIZE */
                      order = PREALLOC_TB_SIZE - 1;
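               /*
                * lg_prealloc_list is bucketed by the power-of-two order of the
                * request length; e.g. a 6-block request gives fls(6) - 1 == 2,
                * so buckets 2 .. PREALLOC_TB_SIZE - 1 are scanned below.
                */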
      
              goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
              /*
                * search for the prealloc space that has the
                * minimal distance from the goal block.
               */
              for (i = order; i < PREALLOC_TB_SIZE; i++) {
  359                 rcu_read_lock();
  359                 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
                                              pa_inode_list) {
  355                         spin_lock(&pa->pa_lock);
                              if (pa->pa_deleted == 0 &&
  355                                         pa->pa_free >= ac->ac_o_ex.fe_len) {
      
  355                                 cpa = ext4_mb_check_group_pa(goal_block,
                                                                      pa, cpa);
                              }
  355                         spin_unlock(&pa->pa_lock);
                      }
  359                 rcu_read_unlock();
              }
  359         if (cpa) {
  355                 ext4_mb_use_group_pa(ac, cpa);
                      ac->ac_criteria = 20;
                      return 1;
              }
              return 0;
      }
      
      /*
        * The function goes through all blocks freed in the group
        * but not yet committed and marks them as used in the in-core bitmap.
        * The buddy must be generated from this bitmap.
        * Needs to be called with the ext4 group lock held.
       */
      static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                      ext4_group_t group)
      {
              struct rb_node *n;
              struct ext4_group_info *grp;
              struct ext4_free_data *entry;
      
    6         grp = ext4_get_group_info(sb, group);
              n = rb_first(&(grp->bb_free_root));
      
              while (n) {
                      entry = rb_entry(n, struct ext4_free_data, efd_node);
                      ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
                      n = rb_next(n);
              }
              return;
      }
      
      /*
        * The function goes through all preallocations in this group and marks them
        * as used in the in-core bitmap. The buddy must be generated from this bitmap.
        * Needs to be called with the ext4 group lock held.
       */
      static noinline_for_stack
      void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                              ext4_group_t group)
      {
    6         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
              struct ext4_prealloc_space *pa;
              struct list_head *cur;
              ext4_group_t groupnr;
              ext4_grpblk_t start;
              int preallocated = 0;
              int len;
      
               /* All forms of preallocation discard first load the group,
                * so the only competing code is preallocation use.
                * We don't need any locking here.
                * Notice we do NOT ignore preallocations with pa_deleted set;
                * otherwise we could leave used blocks available for
                * allocation in the buddy when a concurrent ext4_mb_put_pa()
                * is dropping a preallocation.
                */
              list_for_each(cur, &grp->bb_prealloc_list) {
                      pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                      spin_lock(&pa->pa_lock);
                      ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                                   &groupnr, &start);
                      len = pa->pa_len;
                      spin_unlock(&pa->pa_lock);
                      if (unlikely(len == 0))
                              continue;
                      BUG_ON(groupnr != group);
                      ext4_set_bits(bitmap, start, len);
                      preallocated += len;
              }
              mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
      }
      
      static void ext4_mb_pa_callback(struct rcu_head *head)
      {
              struct ext4_prealloc_space *pa;
              pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
      
              BUG_ON(atomic_read(&pa->pa_count));
              BUG_ON(pa->pa_deleted == 0);
              kmem_cache_free(ext4_pspace_cachep, pa);
      }
      
      /*
        * Drops a reference to the preallocated space descriptor and frees it
        * if this was the last reference and the space is consumed.
       */
      static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                              struct super_block *sb, struct ext4_prealloc_space *pa)
      {
              ext4_group_t grp;
              ext4_fsblk_t grp_blk;
      
              /* in this short window concurrent discard can set pa_deleted */
  591         spin_lock(&pa->pa_lock);
  591         if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
                      spin_unlock(&pa->pa_lock);
                      return;
              }
      
  162         if (pa->pa_deleted == 1) {
  589                 spin_unlock(&pa->pa_lock);
                      return;
              }
      
  162         pa->pa_deleted = 1;
              spin_unlock(&pa->pa_lock);
      
              grp_blk = pa->pa_pstart;
              /*
               * If doing group-based preallocation, pa_pstart may be in the
               * next group when pa is used up
               */
              if (pa->pa_type == MB_GROUP_PA)
   18                 grp_blk--;
      
  162         grp = ext4_get_group_number(sb, grp_blk);
      
              /*
               * possible race:
               *
               *  P1 (buddy init)                        P2 (regular allocation)
               *                                        find block B in PA
               *  copy on-disk bitmap to buddy
               *                                          mark B in on-disk bitmap
               *                                        drop PA from group
               *  mark all PAs in buddy
               *
               * thus, P1 initializes buddy with B available. to prevent this
               * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
               * against that pair
               */
  162         ext4_lock_group(sb, grp);
  162         list_del(&pa->pa_group_list);
              ext4_unlock_group(sb, grp);
      
              spin_lock(pa->pa_obj_lock);
  162         list_del_rcu(&pa->pa_inode_list);
              spin_unlock(pa->pa_obj_lock);
      
              call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
      }
      
      /*
       * creates new preallocated space for given inode
       */
      static noinline_for_stack int
      ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
      {
  508         struct super_block *sb = ac->ac_sb;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct ext4_prealloc_space *pa;
              struct ext4_group_info *grp;
              struct ext4_inode_info *ei;
      
               /* preallocate only when the found space is larger than requested */
  508         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
  508         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
  508         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
      
              pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
              if (pa == NULL)
                      return -ENOMEM;
      
  508         if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
                      int winl;
                      int wins;
                      int win;
                      int offs;
      
                       /* we can't allocate as much as the normalizer wants,
                        * so the found space must get a proper lstart
                        * to cover the original request */
   94                 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
   94                 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
      
                       /* we're limited by the original request in that the
                        * logical block must be covered anyway;
                        * winl is the window we can move our chunk within */
   94                 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
      
                      /* also, we should cover whole original request */
                      wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
      
                      /* the smallest one defines real window */
                      win = min(winl, wins);
      
                      offs = ac->ac_o_ex.fe_logical %
                              EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
   80                 if (offs && offs < win)
                              win = offs;
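                       /* win is the furthest the chunk's logical start may be
                        * placed before the original logical block: bounded by
                        * the normalized goal start (winl), by still covering
                        * the tail of the original request (wins), and aligned
                        * down to a multiple of the chunk size when offs is
                        * smaller */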
      
   94                 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
   94                         EXT4_NUM_B2C(sbi, win);
                      BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
   94                 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
              }
      
              /* preallocation can change ac_b_ex, thus we store actually
               * allocated blocks for history */
  508         ac->ac_f_ex = ac->ac_b_ex;
      
              pa->pa_lstart = ac->ac_b_ex.fe_logical;
              pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
              pa->pa_len = ac->ac_b_ex.fe_len;
              pa->pa_free = pa->pa_len;
              atomic_set(&pa->pa_count, 1);
              spin_lock_init(&pa->pa_lock);
              INIT_LIST_HEAD(&pa->pa_inode_list);
              INIT_LIST_HEAD(&pa->pa_group_list);
              pa->pa_deleted = 0;
              pa->pa_type = MB_INODE_PA;
      
              mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
                              pa->pa_pstart, pa->pa_len, pa->pa_lstart);
  508         trace_ext4_mb_new_inode_pa(ac, pa);
      
  508         ext4_mb_use_inode_pa(ac, pa);
              atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
      
              ei = EXT4_I(ac->ac_inode);
  508         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
      
              pa->pa_obj_lock = &ei->i_prealloc_lock;
              pa->pa_inode = ac->ac_inode;
      
  508         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
  508         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
  508         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
      
              spin_lock(pa->pa_obj_lock);
  508         list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
  508         spin_unlock(pa->pa_obj_lock);
      
  508         return 0;
  464 }
      
      /*
        * creates new preallocated space for the locality group the inode belongs to
       */
      static noinline_for_stack int
      ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
      {
   38         struct super_block *sb = ac->ac_sb;
              struct ext4_locality_group *lg;
              struct ext4_prealloc_space *pa;
              struct ext4_group_info *grp;
      
               /* preallocate only when the found space is larger than requested */
   38         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
   38         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
   38         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
      
   38         BUG_ON(ext4_pspace_cachep == NULL);
              pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
              if (pa == NULL)
                      return -ENOMEM;
      
              /* preallocation can change ac_b_ex, thus we store actually
               * allocated blocks for history */
   38         ac->ac_f_ex = ac->ac_b_ex;
      
              pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
              pa->pa_lstart = pa->pa_pstart;
              pa->pa_len = ac->ac_b_ex.fe_len;
              pa->pa_free = pa->pa_len;
              atomic_set(&pa->pa_count, 1);
              spin_lock_init(&pa->pa_lock);
              INIT_LIST_HEAD(&pa->pa_inode_list);
              INIT_LIST_HEAD(&pa->pa_group_list);
              pa->pa_deleted = 0;
              pa->pa_type = MB_GROUP_PA;
      
              mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
                              pa->pa_pstart, pa->pa_len, pa->pa_lstart);
   38         trace_ext4_mb_new_group_pa(ac, pa);
      
   38         ext4_mb_use_group_pa(ac, pa);
              atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
      
   38         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
              lg = ac->ac_lg;
              BUG_ON(lg == NULL);
      
   38         pa->pa_obj_lock = &lg->lg_prealloc_lock;
              pa->pa_inode = NULL;
      
   38         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
   38         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
   38         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
      
              /*
               * We will later add the new pa to the right bucket
               * after updating the pa_free in ext4_mb_release_context
               */
   38         return 0;
      }
      
      static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
      {
              int err;
      
  514         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
   38                 err = ext4_mb_new_group_pa(ac);
              else
  508                 err = ext4_mb_new_inode_pa(ac);
              return err;
      }
      
      /*
       * finds all unused blocks in on-disk bitmap, frees them in
       * in-core bitmap and buddy.
       * @pa must be unlinked from inode and group lists, so that
       * nobody else can find/use it.
       * the caller MUST hold group/inode locks.
       * TODO: optimize the case when there are no in-core structures yet
       */
      static noinline_for_stack int
      ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                              struct ext4_prealloc_space *pa)
      {
  247         struct super_block *sb = e4b->bd_sb;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              unsigned int end;
              unsigned int next;
              ext4_group_t group;
              ext4_grpblk_t bit;
              unsigned long long grp_blk_start;
              int err = 0;
              int free = 0;
      
              BUG_ON(pa->pa_deleted == 0);
  247         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
              grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
  247         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
  247         end = bit + pa->pa_len;
      
              while (bit < end) {
  247                 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                      if (bit >= end)
                              break;
  247                 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
                      mb_debug(1, "    free preallocated %u/%u in group %u\n",
                               (unsigned) ext4_group_first_block_no(sb, group) + bit,
                               (unsigned) next - bit, (unsigned) group);
                      free += next - bit;
      
  247                 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
  247                 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
                                                          EXT4_C2B(sbi, bit)),
                                                     next - bit);
  247                 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                      bit = next + 1;
              }
  247         if (free != pa->pa_free) {
    4                 ext4_msg(e4b->bd_sb, KERN_CRIT,
                               "pa %p: logic %lu, phys. %lu, len %lu",
                               pa, (unsigned long) pa->pa_lstart,
                               (unsigned long) pa->pa_pstart,
                               (unsigned long) pa->pa_len);
                      ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
                                              free, pa->pa_free);
                      /*
                       * pa is already deleted so we use the value obtained
                       * from the bitmap and continue.
                       */
              }
  247         atomic_add(free, &sbi->s_mb_discarded);
      
              return err;
      }
      
      static noinline_for_stack int
      ext4_mb_release_group_pa(struct ext4_buddy *e4b,
                                      struct ext4_prealloc_space *pa)
      {
    8         struct super_block *sb = e4b->bd_sb;
              ext4_group_t group;
              ext4_grpblk_t bit;
      
    8         trace_ext4_mb_release_group_pa(sb, pa);
    8         BUG_ON(pa->pa_deleted == 0);
    8         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
    8         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
    8         mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
              atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
    8         trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
      
    8         return 0;
      }
      
      /*
       * releases all preallocations in given group
       *
       * first, we need to decide discard policy:
       * - when do we discard
       *   1) ENOSPC
       * - how many do we discard
       *   1) how many requested
       */
      static noinline_for_stack int
      ext4_mb_discard_group_preallocations(struct super_block *sb,
                                              ext4_group_t group, int needed)
      {
   16         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
              struct buffer_head *bitmap_bh = NULL;
              struct ext4_prealloc_space *pa, *tmp;
              struct list_head list;
              struct ext4_buddy e4b;
              int err;
              int busy = 0;
              int free = 0;
      
              mb_debug(1, "discard preallocation for group %u\n", group);
      
              if (list_empty(&grp->bb_prealloc_list))
                      return 0;
      
   10         bitmap_bh = ext4_read_block_bitmap(sb, group);
              if (IS_ERR(bitmap_bh)) {
                      err = PTR_ERR(bitmap_bh);
                      ext4_error(sb, "Error %d reading block bitmap for %u",
                                 err, group);
                      return 0;
              }
      
   10         err = ext4_mb_load_buddy(sb, group, &e4b);
              if (err) {
                      ext4_warning(sb, "Error %d loading buddy information for %u",
                                   err, group);
                      put_bh(bitmap_bh);
                      return 0;
              }
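               /* a "needed" of 0 means: discard every preallocation in this group */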
      
   10         if (needed == 0)
                      needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
      
   10         INIT_LIST_HEAD(&list);
      repeat:
   10         ext4_lock_group(sb, group);
   10         list_for_each_entry_safe(pa, tmp,
                                      &grp->bb_prealloc_list, pa_group_list) {
   10                 spin_lock(&pa->pa_lock);
                      if (atomic_read(&pa->pa_count)) {
                              spin_unlock(&pa->pa_lock);
                              busy = 1;
                              continue;
                      }
   10                 if (pa->pa_deleted) {
                              spin_unlock(&pa->pa_lock);
                              continue;
                      }
      
                      /* seems this one can be freed ... */
   10                 pa->pa_deleted = 1;
      
                      /* we can trust pa_free ... */
                      free += pa->pa_free;
      
                      spin_unlock(&pa->pa_lock);
      
   10                 list_del(&pa->pa_group_list);
   10                 list_add(&pa->u.pa_tmp_list, &list);
              }
      
              /* if we still need more blocks and some PAs were used, try again */
   10         if (free < needed && busy) {
                      busy = 0;
                      ext4_unlock_group(sb, group);
                      cond_resched();
                      goto repeat;
              }
      
              /* found anything to free? */
   10         if (list_empty(&list)) {
                      BUG_ON(free != 0);
                      goto out;
              }
      
              /* now free all selected PAs */
   10         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
      
                      /* remove from object (inode or locality group) */
   10                 spin_lock(pa->pa_obj_lock);
   10                 list_del_rcu(&pa->pa_inode_list);
                      spin_unlock(pa->pa_obj_lock);
      
                      if (pa->pa_type == MB_GROUP_PA)
    8                         ext4_mb_release_group_pa(&e4b, pa);
                      else
    8                         ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
      
   10                 list_del(&pa->u.pa_tmp_list);
                      call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
              }
      
      out:
   10         ext4_unlock_group(sb, group);
              ext4_mb_unload_buddy(&e4b);
              put_bh(bitmap_bh);
   16         return free;
      }
      
      /*
        * releases all unused preallocated blocks for the given inode
        *
        * It's important to discard preallocations under i_data_sem.
        * We don't want another block to be served from the prealloc
        * space when we are discarding the inode prealloc space.
       *
       * FIXME!! Make sure it is valid at all the call sites
       */
      void ext4_discard_preallocations(struct inode *inode)
      {
              struct ext4_inode_info *ei = EXT4_I(inode);
  559         struct super_block *sb = inode->i_sb;
              struct buffer_head *bitmap_bh = NULL;
              struct ext4_prealloc_space *pa, *tmp;
              ext4_group_t group = 0;
              struct list_head list;
              struct ext4_buddy e4b;
              int err;
      
  561         if (!S_ISREG(inode->i_mode)) {
                      /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
  561                 return;
              }
      
              mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
  559         trace_ext4_discard_preallocations(inode);
      
  559         INIT_LIST_HEAD(&list);
      
      repeat:
              /* first, collect all pa's in the inode */
  559         spin_lock(&ei->i_prealloc_lock);
  559         while (!list_empty(&ei->i_prealloc_list)) {
                      pa = list_entry(ei->i_prealloc_list.next,
                                      struct ext4_prealloc_space, pa_inode_list);
  244                 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
  244                 spin_lock(&pa->pa_lock);
                      if (atomic_read(&pa->pa_count)) {
                              /* this shouldn't happen often - nobody should
                               * use preallocation while we're discarding it */
                              spin_unlock(&pa->pa_lock);
                              spin_unlock(&ei->i_prealloc_lock);
                              ext4_msg(sb, KERN_ERR,
                                       "uh-oh! used pa while discarding");
                              WARN_ON(1);
                              schedule_timeout_uninterruptible(HZ);
                              goto repeat;
      
                      }
  244                 if (pa->pa_deleted == 0) {
  244                         pa->pa_deleted = 1;
                              spin_unlock(&pa->pa_lock);
  244                         list_del_rcu(&pa->pa_inode_list);
  244                         list_add(&pa->u.pa_tmp_list, &list);
                              continue;
                      }
      
                      /* someone is deleting pa right now */
                      spin_unlock(&pa->pa_lock);
                      spin_unlock(&ei->i_prealloc_lock);
      
                       /* we have to wait here because pa_deleted
                        * doesn't mean the pa is already unlinked from
                        * the list. As we might be called from
                        * ->clear_inode(), the inode will get freed
                        * and a concurrent thread which is unlinking
                        * the pa from the inode's list may access already
                        * freed memory, bad-bad-bad */
      
                      /* XXX: if this happens too often, we can
                       * add a flag to force wait only in case
                       * of ->clear_inode(), but not in case of
                       * regular truncate */
                      schedule_timeout_uninterruptible(HZ);
                      goto repeat;
              }
  559         spin_unlock(&ei->i_prealloc_lock);
      
  244         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
  244                 BUG_ON(pa->pa_type != MB_INODE_PA);
  244                 group = ext4_get_group_number(sb, pa->pa_pstart);
      
                      err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                                   GFP_NOFS|__GFP_NOFAIL);
                      if (err) {
                              ext4_error(sb, "Error %d loading buddy information for %u",
                                         err, group);
                              continue;
                      }
      
  244                 bitmap_bh = ext4_read_block_bitmap(sb, group);
                      if (IS_ERR(bitmap_bh)) {
                              err = PTR_ERR(bitmap_bh);
                              ext4_error(sb, "Error %d reading block bitmap for %u",
                                              err, group);
                              ext4_mb_unload_buddy(&e4b);
                              continue;
                      }
      
  244                 ext4_lock_group(sb, group);
  244                 list_del(&pa->pa_group_list);
                      ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                      ext4_unlock_group(sb, group);
      
                      ext4_mb_unload_buddy(&e4b);
                      put_bh(bitmap_bh);
      
  244                 list_del(&pa->u.pa_tmp_list);
                      call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
              }
      }
      
      #ifdef CONFIG_EXT4_DEBUG
      static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
      {
              struct super_block *sb = ac->ac_sb;
              ext4_group_t ngroups, i;
      
              if (!ext4_mballoc_debug ||
                  (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
                      return;
      
              ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
                              " Allocation context details:");
              ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
                              ac->ac_status, ac->ac_flags);
              ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
                               "goal %lu/%lu/%lu@%lu, "
                              "best %lu/%lu/%lu@%lu cr %d",
                              (unsigned long)ac->ac_o_ex.fe_group,
                              (unsigned long)ac->ac_o_ex.fe_start,
                              (unsigned long)ac->ac_o_ex.fe_len,
                              (unsigned long)ac->ac_o_ex.fe_logical,
                              (unsigned long)ac->ac_g_ex.fe_group,
                              (unsigned long)ac->ac_g_ex.fe_start,
                              (unsigned long)ac->ac_g_ex.fe_len,
                              (unsigned long)ac->ac_g_ex.fe_logical,
                              (unsigned long)ac->ac_b_ex.fe_group,
                              (unsigned long)ac->ac_b_ex.fe_start,
                              (unsigned long)ac->ac_b_ex.fe_len,
                              (unsigned long)ac->ac_b_ex.fe_logical,
                              (int)ac->ac_criteria);
              ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
              ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
              ngroups = ext4_get_groups_count(sb);
              for (i = 0; i < ngroups; i++) {
                      struct ext4_group_info *grp = ext4_get_group_info(sb, i);
                      struct ext4_prealloc_space *pa;
                      ext4_grpblk_t start;
                      struct list_head *cur;
                      ext4_lock_group(sb, i);
                      list_for_each(cur, &grp->bb_prealloc_list) {
                              pa = list_entry(cur, struct ext4_prealloc_space,
                                              pa_group_list);
                              spin_lock(&pa->pa_lock);
                              ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                                           NULL, &start);
                              spin_unlock(&pa->pa_lock);
                              printk(KERN_ERR "PA:%u:%d:%u \n", i,
                                     start, pa->pa_len);
                      }
                      ext4_unlock_group(sb, i);
      
                      if (grp->bb_free == 0)
                              continue;
                      printk(KERN_ERR "%u: %d/%d \n",
                             i, grp->bb_free, grp->bb_fragments);
              }
              printk(KERN_ERR "\n");
      }
      #else
      static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
      {
              return;
      }
      #endif
      
      /*
        * We use locality group preallocation for small files. The size of the
        * file is determined by the current size or the resulting size after
        * allocation, whichever is larger.
       *
       * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
       */
      static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
      {
  699         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
              int bsbits = ac->ac_sb->s_blocksize_bits;
              loff_t size, isize;
      
              if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                      return;
      
  699         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                      return;
      
              size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
              isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                      >> bsbits;
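               /*
                * Both size and isize are in filesystem blocks here: size is the
                * file size assuming this request is allocated, isize is the
                * current i_size rounded up to a whole block.
                */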
      
              if ((size == isize) &&
  217             !ext4_fs_is_busy(sbi) &&
  217             (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
    1                 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
                      return;
              }
      
  699         if (sbi->s_mb_group_prealloc <= 0) {
                      ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
                      return;
              }
      
              /* don't use group allocation for large files */
  699         size = max(size, isize);
              if (size > sbi->s_mb_stream_request) {
  645                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
                      return;
              }
      
  359         BUG_ON(ac->ac_lg != NULL);
              /*
                * Locality group prealloc space is per-CPU. The reason for having
                * a per-CPU locality group is to reduce contention between block
                * requests from multiple CPUs.
               */
  359         ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
      
              /* we're going to use group allocation */
              ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
      
              /* serialize all allocations in the group */
              mutex_lock(&ac->ac_lg->lg_mutex);
      }
      
      static noinline_for_stack int
      ext4_mb_initialize_context(struct ext4_allocation_context *ac,
                                      struct ext4_allocation_request *ar)
      {
  710         struct super_block *sb = ar->inode->i_sb;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct ext4_super_block *es = sbi->s_es;
              ext4_group_t group;
              unsigned int len;
              ext4_fsblk_t goal;
              ext4_grpblk_t block;
      
              /* we can't allocate > group size */
              len = ar->len;
      
              /* just a dirty hack to filter too big requests  */
              if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
                      len = EXT4_CLUSTERS_PER_GROUP(sb);
      
              /* start searching from the goal */
  710         goal = ar->goal;
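               /* fall back to the first data block if the goal is out of range */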
              if (goal < le32_to_cpu(es->s_first_data_block) ||
  710                         goal >= ext4_blocks_count(es))
                      goal = le32_to_cpu(es->s_first_data_block);
  710         ext4_get_group_no_and_offset(sb, goal, &group, &block);
      
              /* set up allocation goals */
              ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
              ac->ac_status = AC_STATUS_CONTINUE;
              ac->ac_sb = sb;
              ac->ac_inode = ar->inode;
              ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
              ac->ac_o_ex.fe_group = group;
              ac->ac_o_ex.fe_start = block;
              ac->ac_o_ex.fe_len = len;
              ac->ac_g_ex = ac->ac_o_ex;
              ac->ac_flags = ar->flags;
      
               /* we have to define the context: will we work with a file or a
                * locality group? This is a policy, actually */
  699         ext4_mb_group_or_file(ac);
      
  710         mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
                              "left: %u/%u, right %u/%u to %swritable\n",
                              (unsigned) ar->len, (unsigned) ar->logical,
                              (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
                              (unsigned) ar->lleft, (unsigned) ar->pleft,
                              (unsigned) ar->lright, (unsigned) ar->pright,
                              atomic_read(&ar->inode->i_writecount) ? "" : "non-");
              return 0;
      
      }
      
      static noinline_for_stack void
      ext4_mb_discard_lg_preallocations(struct super_block *sb,
                                              struct ext4_locality_group *lg,
                                              int order, int total_entries)
      {
              ext4_group_t group = 0;
              struct ext4_buddy e4b;
              struct list_head discard_list;
              struct ext4_prealloc_space *pa, *tmp;
      
              mb_debug(1, "discard locality group preallocation\n");
      
              INIT_LIST_HEAD(&discard_list);
      
              spin_lock(&lg->lg_prealloc_lock);
              list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
                                                      pa_inode_list) {
                      spin_lock(&pa->pa_lock);
                      if (atomic_read(&pa->pa_count)) {
                              /*
                               * This is the pa that we just used
                               * for block allocation. So don't
                               * free that
                               */
                              spin_unlock(&pa->pa_lock);
                              continue;
                      }
                      if (pa->pa_deleted) {
                              spin_unlock(&pa->pa_lock);
                              continue;
                      }
                      /* only lg prealloc space */
                      BUG_ON(pa->pa_type != MB_GROUP_PA);
      
                      /* seems this one can be freed ... */
                      pa->pa_deleted = 1;
                      spin_unlock(&pa->pa_lock);
      
                      list_del_rcu(&pa->pa_inode_list);
                      list_add(&pa->u.pa_tmp_list, &discard_list);
      
                      total_entries--;
                      if (total_entries <= 5) {
                               /*
                                * we want to keep only 5 entries,
                                * allowing the list to grow back to 8.
                                * This makes sure we don't call discard
                                * again too soon for this list.
                                */
                              break;
                      }
              }
              spin_unlock(&lg->lg_prealloc_lock);
      
              list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
                      int err;
      
                      group = ext4_get_group_number(sb, pa->pa_pstart);
                      err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                                   GFP_NOFS|__GFP_NOFAIL);
                      if (err) {
                              ext4_error(sb, "Error %d loading buddy information for %u",
                                         err, group);
                              continue;
                      }
                      ext4_lock_group(sb, group);
                      list_del(&pa->pa_group_list);
                      ext4_mb_release_group_pa(&e4b, pa);
                      ext4_unlock_group(sb, group);
      
                      ext4_mb_unload_buddy(&e4b);
                      list_del(&pa->u.pa_tmp_list);
                      call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
              }
      }
      
      /*
       * We have incremented pa_count. So it cannot be freed at this
       * point. Also we hold lg_mutex. So no parallel allocation is
       * possible from this lg. That means pa_free cannot be updated.
       *
        * A parallel ext4_mb_discard_group_preallocations is possible,
        * which can cause the lg_prealloc_list to be updated.
       */
      
      static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
      {
              int order, added = 0, lg_prealloc_count = 1;
              struct super_block *sb = ac->ac_sb;
              struct ext4_locality_group *lg = ac->ac_lg;
              struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
      
              order = fls(pa->pa_free) - 1;
              if (order > PREALLOC_TB_SIZE - 1)
                      /* The max size of hash table is PREALLOC_TB_SIZE */
                      order = PREALLOC_TB_SIZE - 1;
              /* Add the prealloc space to lg */
              spin_lock(&lg->lg_prealloc_lock);
    3         list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
                                                      pa_inode_list) {
    3                 spin_lock(&tmp_pa->pa_lock);
                      if (tmp_pa->pa_deleted) {
                              spin_unlock(&tmp_pa->pa_lock);
                              continue;
                      }
    3                 if (!added && pa->pa_free < tmp_pa->pa_free) {
                              /* Add to the tail of the previous entry */
                              list_add_tail_rcu(&pa->pa_inode_list,
                                                      &tmp_pa->pa_inode_list);
                              added = 1;
                              /*
                               * we want to count the total
                               * number of entries in the list
                               */
                      }
    3                 spin_unlock(&tmp_pa->pa_lock);
                      lg_prealloc_count++;
              }
    3         if (!added)
  358                 list_add_tail_rcu(&pa->pa_inode_list,
                                              &lg->lg_prealloc_list[order]);
  358         spin_unlock(&lg->lg_prealloc_lock);
      
               /* Now trim the list so that it has no more than 8 elements */
  358         if (lg_prealloc_count > 8) {
                      ext4_mb_discard_lg_preallocations(sb, lg,
                                                        order, lg_prealloc_count);
                      return;
              }
               return;
      }
      
      /*
        * release all resources we used in the allocation
       */
      static int ext4_mb_release_context(struct ext4_allocation_context *ac)
      {
  591         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  710         struct ext4_prealloc_space *pa = ac->ac_pa;
              if (pa) {
                      if (pa->pa_type == MB_GROUP_PA) {
                              /* see comment in ext4_mb_use_group_pa() */
                              spin_lock(&pa->pa_lock);
                              pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                              pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
                              pa->pa_free -= ac->ac_b_ex.fe_len;
                              pa->pa_len -= ac->ac_b_ex.fe_len;
                              spin_unlock(&pa->pa_lock);
                      }
              }
              if (pa) {
                      /*
                       * We want to add the pa to the right bucket.
                       * Remove it from the list and while adding
                       * make sure the list to which we are adding
                       * doesn't grow big.
                       */
  358                 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
  358                         spin_lock(pa->pa_obj_lock);
  358                         list_del_rcu(&pa->pa_inode_list);
                              spin_unlock(pa->pa_obj_lock);
  358                         ext4_mb_add_n_trim(ac);
                      }
  591                 ext4_mb_put_pa(ac, ac->ac_sb, pa);
              }
  710         if (ac->ac_bitmap_page)
  660                 page_cache_release(ac->ac_bitmap_page);
  710         if (ac->ac_buddy_page)
  660                 page_cache_release(ac->ac_buddy_page);
  710         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
  359                 mutex_unlock(&ac->ac_lg->lg_mutex);
  710         ext4_mb_collect_stats(ac);
              return 0;
      }
      
      static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
      {
              ext4_group_t i, ngroups = ext4_get_groups_count(sb);
              int ret;
              int freed = 0;
      
   16         trace_ext4_mb_discard_preallocations(sb, needed);
   16         for (i = 0; i < ngroups && needed > 0; i++) {
   16                 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                      freed += ret;
                      needed -= ret;
              }
      
              return freed;
      }
      
      /*
       * Main entry point into mballoc to allocate blocks
       * it tries to use preallocation first, then falls back
       * to usual allocation
       */
      ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                                      struct ext4_allocation_request *ar, int *errp)
      {
              int freed;
              struct ext4_allocation_context *ac = NULL;
              struct ext4_sb_info *sbi;
              struct super_block *sb;
              ext4_fsblk_t block = 0;
              unsigned int inquota = 0;
              unsigned int reserv_clstrs = 0;
      
  713         might_sleep();
              sb = ar->inode->i_sb;
              sbi = EXT4_SB(sb);
      
  713         trace_ext4_request_blocks(ar);
      
               /* Allow the quota file to use the superuser reservation */
  713         if (IS_NOQUOTA(ar->inode))
                      ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
      
  713         if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
               /* Without delayed allocation we need to verify
                * there are enough free blocks to do the block allocation
                * and verify that the allocation doesn't exceed the quota limits.
                */
  531                 while (ar->len &&
  531                         ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
      
                               /* let others free the space */
  152                         cond_resched();
                              ar->len = ar->len >> 1;
                      }
  528                 if (!ar->len) {
   91                         *errp = -ENOSPC;
                              return 0;
                      }
                      reserv_clstrs = ar->len;
  528                 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
                              dquot_alloc_block_nofail(ar->inode,
                                                       EXT4_C2B(sbi, ar->len));
                      } else {
                              while (ar->len &&
  528                                 dquot_alloc_block(ar->inode,
  528                                                   EXT4_C2B(sbi, ar->len))) {
      
                                      ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                                      ar->len--;
                              }
                      }
  528                 inquota = ar->len;
                      if (ar->len == 0) {
                              *errp = -EDQUOT;
                              goto out;
                      }
              }
      
  710         ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
              if (!ac) {
                      ar->len = 0;
                      *errp = -ENOMEM;
                      goto out;
              }
      
  710         *errp = ext4_mb_initialize_context(ac, ar);
              if (*errp) {
                      ar->len = 0;
                      goto out;
              }
      
  710         ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
  522         if (!ext4_mb_use_preallocated(ac)) {
  661                 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                      ext4_mb_normalize_request(ac, ar);
      repeat:
                      /* allocate space in core */
  661                 *errp = ext4_mb_regular_allocator(ac);
                      if (*errp)
                              goto discard_and_exit;
      
               /* as we've just preallocated more space than the
                * user originally requested, we store the allocated
                * space in a special descriptor */
  661                 if (ac->ac_status == AC_STATUS_FOUND &&
  660                     ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
  514                         *errp = ext4_mb_new_preallocation(ac);
  514                 if (*errp) {
                      discard_and_exit:
                              ext4_discard_allocated_blocks(ac);
                              goto errout;
                      }
              }
  710         if (likely(ac->ac_status == AC_STATUS_FOUND)) {
  709                 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
                      if (*errp) {
    1                         ext4_discard_allocated_blocks(ac);
                              goto errout;
                      } else {
  709                         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                              ar->len = ac->ac_b_ex.fe_len;
                      }
              } else {
   16                 freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
   16                 if (freed)
                              goto repeat;
    7                 *errp = -ENOSPC;
              }
      
      errout:
  709         if (*errp) {
    8                 ac->ac_b_ex.fe_len = 0;
                      ar->len = 0;
                      ext4_mb_show_ac(ac);
              }
  710         ext4_mb_release_context(ac);
      out:
              if (ac)
  710                 kmem_cache_free(ext4_ac_cachep, ac);
  710         if (inquota && ar->len < inquota)
  205                 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
  710         if (!ar->len) {
    8                 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
                              /* release all the reserved blocks if non delalloc */
    7                         percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                                      reserv_clstrs);
              }
      
  713         trace_ext4_allocate_blocks(ar, (unsigned long long)block);
      
              return block;
      }
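
       /*
        * Illustrative sketch, not part of the original file: how a caller such
        * as the extent mapping path might fill an ext4_allocation_request and
        * invoke ext4_mb_new_blocks().  The function name and the logical/goal
        * values are hypothetical; only fields referenced above are used.
        */
       static ext4_fsblk_t example_allocate(handle_t *handle, struct inode *inode,
                                            ext4_lblk_t logical, ext4_fsblk_t goal,
                                            unsigned int len, int *errp)
       {
               struct ext4_allocation_request ar = {
                       .inode   = inode,
                       .logical = logical,             /* logical block in the file */
                       .goal    = goal,                /* preferred physical block */
                       .len     = len,                 /* number of blocks wanted */
                       .flags   = EXT4_MB_HINT_DATA,   /* plain data allocation */
               };
               ext4_fsblk_t pblk;

               pblk = ext4_mb_new_blocks(handle, &ar, errp);
               /* on success, ar.len holds how many blocks were actually allocated */
               return pblk;
       }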
      
      /*
       * We can merge two free data extents only if the physical blocks
       * are contiguous, AND the extents were freed by the same transaction,
       * AND the blocks are associated with the same group.
       */
      static int can_merge(struct ext4_free_data *entry1,
                              struct ext4_free_data *entry2)
      {
              if ((entry1->efd_tid == entry2->efd_tid) &&
                  (entry1->efd_group == entry2->efd_group) &&
                  ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
                      return 1;
              return 0;
      }
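
       /*
        * Worked example (added illustration, not in the original source): two
        * to-be-freed extents that can_merge() accepts.  Both were freed in the
        * same transaction and live in the same group, and "a" ends exactly
        * where "b" begins:
        *
        *      a: efd_tid = 42, efd_group = 7, efd_start_cluster = 100, efd_count = 8
        *      b: efd_tid = 42, efd_group = 7, efd_start_cluster = 108, efd_count = 4
        *
        * can_merge(&a, &b) returns 1, so ext4_mb_free_metadata() below would
        * collapse them into a single entry covering clusters 100..111.
        */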
      
      static noinline_for_stack int
      ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                            struct ext4_free_data *new_entry)
      {
              ext4_group_t group = e4b->bd_group;
              ext4_grpblk_t cluster;
              struct ext4_free_data *entry;
              struct ext4_group_info *db = e4b->bd_info;
              struct super_block *sb = e4b->bd_sb;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct rb_node **n = &db->bb_free_root.rb_node, *node;
              struct rb_node *parent = NULL, *new_node;
      
              BUG_ON(!ext4_handle_valid(handle));
              BUG_ON(e4b->bd_bitmap_page == NULL);
              BUG_ON(e4b->bd_buddy_page == NULL);
      
              new_node = &new_entry->efd_node;
              cluster = new_entry->efd_start_cluster;
      
              if (!*n) {
                       /* first free block extent. We need to
                        * protect the buddy cache from being freed,
                        * otherwise we'll refresh it from the
                        * on-disk bitmap and lose not-yet-available
                        * blocks */
                      page_cache_get(e4b->bd_buddy_page);
                      page_cache_get(e4b->bd_bitmap_page);
              }
              while (*n) {
                      parent = *n;
                      entry = rb_entry(parent, struct ext4_free_data, efd_node);
                      if (cluster < entry->efd_start_cluster)
                              n = &(*n)->rb_left;
                      else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
                              n = &(*n)->rb_right;
                      else {
                              ext4_grp_locked_error(sb, group, 0,
                                      ext4_group_first_block_no(sb, group) +
                                      EXT4_C2B(sbi, cluster),
                                      "Block already on to-be-freed list");
                              return 0;
                      }
              }
      
              rb_link_node(new_node, parent, n);
              rb_insert_color(new_node, &db->bb_free_root);
      
               /* Now see whether the extent can be merged with its left and right neighbors */
              node = rb_prev(new_node);
              if (node) {
                      entry = rb_entry(node, struct ext4_free_data, efd_node);
                      if (can_merge(entry, new_entry) &&
                          ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
                              new_entry->efd_start_cluster = entry->efd_start_cluster;
                              new_entry->efd_count += entry->efd_count;
                              rb_erase(node, &(db->bb_free_root));
                              kmem_cache_free(ext4_free_data_cachep, entry);
                      }
              }
      
              node = rb_next(new_node);
              if (node) {
                      entry = rb_entry(node, struct ext4_free_data, efd_node);
                      if (can_merge(new_entry, entry) &&
                          ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
                              new_entry->efd_count += entry->efd_count;
                              rb_erase(node, &(db->bb_free_root));
                              kmem_cache_free(ext4_free_data_cachep, entry);
                      }
              }
              /* Add the extent to transaction's private list */
              ext4_journal_callback_add(handle, ext4_free_data_callback,
                                        &new_entry->efd_jce);
              return 0;
      }
      
       /**
        * ext4_free_blocks() -- Free given blocks and update quota
        * @handle:                handle for this transaction
        * @inode:                inode
        * @bh:                    optional buffer_head of the (single) block being freed
        * @block:                start physical block to free
        * @count:                number of blocks to free
        * @flags:                flags used by ext4_free_blocks
        */
      void ext4_free_blocks(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh, ext4_fsblk_t block,
                            unsigned long count, int flags)
      {
              struct buffer_head *bitmap_bh = NULL;
  405         struct super_block *sb = inode->i_sb;
              struct ext4_group_desc *gdp;
              unsigned int overflow;
              ext4_grpblk_t bit;
              struct buffer_head *gd_bh;
              ext4_group_t block_group;
              struct ext4_sb_info *sbi;
              struct ext4_buddy e4b;
              unsigned int count_clusters;
              int err = 0;
              int ret;
      
              might_sleep();
              if (bh) {
    1                 if (block)
                              BUG_ON(block != bh->b_blocknr);
                      else
                              block = bh->b_blocknr;
              }
      
  405         sbi = EXT4_SB(sb);
              if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
  382             !ext4_data_block_valid(sbi, block, count)) {
                      ext4_error(sb, "Freeing blocks not in datazone - "
                                 "block = %llu, count = %lu", block, count);
                      goto error_return;
              }
      
              ext4_debug("freeing block %llu\n", block);
  405         trace_ext4_free_blocks(inode, block, count, flags);
      
  405         if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
    1                 BUG_ON(count > 1);
      
    1                 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                  inode, bh, block);
              }
      
              /*
               * We need to make sure we don't reuse the freed block until
               * after the transaction is committed, which we can do by
               * treating the block as metadata, below.  We make an
               * exception if the inode is to be written in writeback mode
               * since writeback mode has weak data consistency guarantees.
               */
  405         if (!ext4_should_writeback_data(inode))
                      flags |= EXT4_FREE_BLOCKS_METADATA;
      
              /*
               * If the extent to be freed does not begin on a cluster
               * boundary, we need to deal with partial clusters at the
               * beginning and end of the extent.  Normally we will free
               * blocks at the beginning or the end unless we are explicitly
               * requested to avoid doing so.
               */
  405         overflow = EXT4_PBLK_COFF(sbi, block);
              if (overflow) {
                      if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
                              overflow = sbi->s_cluster_ratio - overflow;
                              block += overflow;
                              if (count > overflow)
                                      count -= overflow;
                              else
                                      return;
                      } else {
                              block -= overflow;
                              count += overflow;
                      }
              }
  405         overflow = EXT4_LBLK_COFF(sbi, count);
              if (overflow) {
                      if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
                              if (count > overflow)
                                      count -= overflow;
                              else
                                      return;
                      } else
                              count += sbi->s_cluster_ratio - overflow;
              }
      
  405         if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
                      int i;
      
  238                 for (i = 0; i < count; i++) {
  238                         cond_resched();
                              bh = sb_find_get_block(inode->i_sb, block + i);
                              if (!bh)
                                      continue;
  238                         ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                          inode, bh, block + i);
                      }
              }
      
      do_more:
              overflow = 0;
  405         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
      
  405         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
                              ext4_get_group_info(sb, block_group))))
                      return;
      
              /*
               * Check to see if we are freeing blocks across a group
               * boundary.
               */
  403         if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
   32                 overflow = EXT4_C2B(sbi, bit) + count -
                              EXT4_BLOCKS_PER_GROUP(sb);
                      count -= overflow;
              }
  403         count_clusters = EXT4_NUM_B2C(sbi, count);
              bitmap_bh = ext4_read_block_bitmap(sb, block_group);
              if (IS_ERR(bitmap_bh)) {
                      err = PTR_ERR(bitmap_bh);
                      bitmap_bh = NULL;
                      goto error_return;
              }
  403         gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
              if (!gdp) {
                      err = -EIO;
                      goto error_return;
              }
      
  403         if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
  403             in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
  403             in_range(block, ext4_inode_table(sb, gdp),
                           EXT4_SB(sb)->s_itb_per_group) ||
  403             in_range(block + count - 1, ext4_inode_table(sb, gdp),
                           EXT4_SB(sb)->s_itb_per_group)) {
      
                      ext4_error(sb, "Freeing blocks in system zone - "
                                 "Block = %llu, count = %lu", block, count);
                      /* err = 0. ext4_std_error should be a no op */
                      goto error_return;
              }
      
              BUFFER_TRACE(bitmap_bh, "getting write access");
  403         err = ext4_journal_get_write_access(handle, bitmap_bh);
              if (err)
                      goto error_return;
      
              /*
               * We are about to modify some metadata.  Call the journal APIs
               * to unshare ->b_data if a currently-committing transaction is
               * using it
               */
              BUFFER_TRACE(gd_bh, "get_write_access");
  403         err = ext4_journal_get_write_access(handle, gd_bh);
              if (err)
                      goto error_return;
      #ifdef AGGRESSIVE_CHECK
              {
                      int i;
                      for (i = 0; i < count_clusters; i++)
                              BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
              }
      #endif
  403         trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
      
              /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
  403         err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
                                           GFP_NOFS|__GFP_NOFAIL);
              if (err)
                      goto error_return;
      
  403         if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
                      struct ext4_free_data *new_entry;
                      /*
                       * blocks being freed are metadata. these blocks shouldn't
                       * be used until this transaction is committed
                       *
                       * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
                       * to fail.
                       */
                      new_entry = kmem_cache_alloc(ext4_free_data_cachep,
                                      GFP_NOFS|__GFP_NOFAIL);
                      new_entry->efd_start_cluster = bit;
                      new_entry->efd_group = block_group;
                      new_entry->efd_count = count_clusters;
                      new_entry->efd_tid = handle->h_transaction->t_tid;
      
                      ext4_lock_group(sb, block_group);
                      mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                      ext4_mb_free_metadata(handle, &e4b, new_entry);
              } else {
               /* need to update group_info->bb_free and the bitmap
                * with the group lock held. generate_buddy looks at
                * them with the group lock held
                */
  403                 if (test_opt(sb, DISCARD)) {
                              err = ext4_issue_discard(sb, block_group, bit, count,
                                                       0);
                              if (err && err != -EOPNOTSUPP)
                                      ext4_msg(sb, KERN_WARNING, "discard request in"
                                               " group:%d block:%d count:%lu failed"
                                               " with %d", block_group, bit, count,
                                               err);
                      } else
  403                         EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
      
  403                 ext4_lock_group(sb, block_group);
  403                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
                      mb_free_blocks(inode, &e4b, bit, count_clusters);
              }
      
  403         ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
              ext4_free_group_clusters_set(sb, gdp, ret);
              ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
              ext4_group_desc_csum_set(sb, block_group, gdp);
              ext4_unlock_group(sb, block_group);
      
              if (sbi->s_log_groups_per_flex) {
  403                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                      atomic64_add(count_clusters,
                                   &sbi->s_flex_groups[flex_group].free_clusters);
              }
      
  403         if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
  403                 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
  403         percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
      
              ext4_mb_unload_buddy(&e4b);
      
              /* We dirtied the bitmap block */
              BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
              err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
      
              /* And the group descriptor block */
              BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
              ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
              if (!err)
                      err = ret;
      
  403         if (overflow && !err) {
                      block += count;
   31                 count = overflow;
                      put_bh(bitmap_bh);
                      goto do_more;
              }
      error_return:
  403         brelse(bitmap_bh);
  405         ext4_std_error(sb, err);
              return;
      }
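
       /*
        * Illustrative sketch, not part of the original file: a hypothetical
        * caller releasing a run of data blocks.  "pblk" and "len" are
        * placeholders; flags == 0 lets ext4_free_blocks() validate the range
        * itself and decide, via the writeback check above, whether the blocks
        * must be held back as metadata until the transaction commits.
        */
       static void example_free_extent(handle_t *handle, struct inode *inode,
                                       ext4_fsblk_t pblk, unsigned long len)
       {
               /* no buffer_head for a multi-block data extent */
               ext4_free_blocks(handle, inode, NULL, pblk, len, 0);
       }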
      
      /**
       * ext4_group_add_blocks() -- Add given blocks to an existing group
       * @handle:                        handle to this transaction
       * @sb:                                super block
       * @block:                        start physical block to add to the block group
        * @count:                        number of blocks to add
       *
       * This marks the blocks as free in the bitmap and buddy.
       */
      int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                               ext4_fsblk_t block, unsigned long count)
      {
              struct buffer_head *bitmap_bh = NULL;
              struct buffer_head *gd_bh;
              ext4_group_t block_group;
              ext4_grpblk_t bit;
              unsigned int i;
              struct ext4_group_desc *desc;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct ext4_buddy e4b;
              int err = 0, ret, blk_free_count;
              ext4_grpblk_t blocks_freed;
      
              ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
      
              if (count == 0)
                      return 0;
      
              ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
              /*
               * Check to see if we are freeing blocks across a group
               * boundary.
               */
              if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
                      ext4_warning(sb, "too much blocks added to group %u\n",
                                   block_group);
                      err = -EINVAL;
                      goto error_return;
              }
      
              bitmap_bh = ext4_read_block_bitmap(sb, block_group);
              if (IS_ERR(bitmap_bh)) {
                      err = PTR_ERR(bitmap_bh);
                      bitmap_bh = NULL;
                      goto error_return;
              }
      
              desc = ext4_get_group_desc(sb, block_group, &gd_bh);
              if (!desc) {
                      err = -EIO;
                      goto error_return;
              }
      
              if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
                  in_range(ext4_inode_bitmap(sb, desc), block, count) ||
                  in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
                  in_range(block + count - 1, ext4_inode_table(sb, desc),
                           sbi->s_itb_per_group)) {
                      ext4_error(sb, "Adding blocks in system zones - "
                                 "Block = %llu, count = %lu",
                                 block, count);
                      err = -EINVAL;
                      goto error_return;
              }
      
              BUFFER_TRACE(bitmap_bh, "getting write access");
              err = ext4_journal_get_write_access(handle, bitmap_bh);
              if (err)
                      goto error_return;
      
              /*
               * We are about to modify some metadata.  Call the journal APIs
               * to unshare ->b_data if a currently-committing transaction is
               * using it
               */
              BUFFER_TRACE(gd_bh, "get_write_access");
              err = ext4_journal_get_write_access(handle, gd_bh);
              if (err)
                      goto error_return;
      
              for (i = 0, blocks_freed = 0; i < count; i++) {
                      BUFFER_TRACE(bitmap_bh, "clear bit");
                      if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
                              ext4_error(sb, "bit already cleared for block %llu",
                                         (ext4_fsblk_t)(block + i));
                              BUFFER_TRACE(bitmap_bh, "bit already cleared");
                      } else {
                              blocks_freed++;
                      }
              }
      
              err = ext4_mb_load_buddy(sb, block_group, &e4b);
              if (err)
                      goto error_return;
      
               /*
                * need to update group_info->bb_free and the bitmap
                * with the group lock held. generate_buddy looks at
                * them with the group lock held
                */
              ext4_lock_group(sb, block_group);
              mb_clear_bits(bitmap_bh->b_data, bit, count);
              mb_free_blocks(NULL, &e4b, bit, count);
              blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
              ext4_free_group_clusters_set(sb, desc, blk_free_count);
              ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
              ext4_group_desc_csum_set(sb, block_group, desc);
              ext4_unlock_group(sb, block_group);
              percpu_counter_add(&sbi->s_freeclusters_counter,
                                 EXT4_NUM_B2C(sbi, blocks_freed));
      
              if (sbi->s_log_groups_per_flex) {
                      ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                      atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
                                   &sbi->s_flex_groups[flex_group].free_clusters);
              }
      
              ext4_mb_unload_buddy(&e4b);
      
              /* We dirtied the bitmap block */
              BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
              err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
      
              /* And the group descriptor block */
              BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
              ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
              if (!err)
                      err = ret;
      
      error_return:
              brelse(bitmap_bh);
              ext4_std_error(sb, err);
              return err;
      }
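
       /*
        * Illustrative sketch, not part of the original file: how resize-style
        * code might hand a run of newly added blocks to the allocator.  The
        * function name and arguments are hypothetical; note that the range
        * must not cross a block group boundary, as checked above.
        */
       static int example_add_new_blocks(handle_t *handle, struct super_block *sb,
                                         ext4_fsblk_t first_new_block,
                                         unsigned long new_count)
       {
               /* marks the blocks free in the bitmap and buddy, updates counters */
               return ext4_group_add_blocks(handle, sb, first_new_block, new_count);
       }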
      
      /**
       * ext4_trim_extent -- function to TRIM one single free extent in the group
       * @sb:                super block for the file system
       * @start:        starting block of the free extent in the alloc. group
       * @count:        number of blocks to TRIM
       * @group:        alloc. group we are working with
       * @e4b:        ext4 buddy for the group
       * @blkdev_flags: flags for the block device
       *
       * Trim "count" blocks starting at "start" in the "group". To assure that no
       * one will allocate those blocks, mark it as used in buddy bitmap. This must
       * be called with under the group lock.
       */
      static int ext4_trim_extent(struct super_block *sb, int start, int count,
                                  ext4_group_t group, struct ext4_buddy *e4b,
                                  unsigned long blkdev_flags)
      __releases(bitlock)
      __acquires(bitlock)
      {
              struct ext4_free_extent ex;
              int ret = 0;
      
              trace_ext4_trim_extent(sb, group, start, count);
      
              assert_spin_locked(ext4_group_lock_ptr(sb, group));
      
              ex.fe_start = start;
              ex.fe_group = group;
              ex.fe_len = count;
      
              /*
               * Mark blocks used, so no one can reuse them while
               * being trimmed.
               */
              mb_mark_used(e4b, &ex);
              ext4_unlock_group(sb, group);
              ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
              ext4_lock_group(sb, group);
              mb_free_blocks(NULL, e4b, start, ex.fe_len);
              return ret;
      }
      
      /**
       * ext4_trim_all_free -- function to trim all free space in alloc. group
       * @sb:                        super block for file system
       * @group:                group to be trimmed
       * @start:                first group block to examine
       * @max:                last group block to examine
       * @minblocks:                minimum extent block count
       * @blkdev_flags:        flags for the block device
       *
        * ext4_trim_all_free walks through the group's buddy bitmap searching for
        * free extents. When a free extent is found, it is marked as used in the
        * group buddy bitmap, ext4_trim_extent is called to TRIM it, and the extent
        * is then freed again in the buddy bitmap. This is done until the whole
        * group is scanned.
       */
      static ext4_grpblk_t
      ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
                         ext4_grpblk_t start, ext4_grpblk_t max,
                         ext4_grpblk_t minblocks, unsigned long blkdev_flags)
      {
              void *bitmap;
              ext4_grpblk_t next, count = 0, free_count = 0;
              struct ext4_buddy e4b;
              int ret = 0;
      
              trace_ext4_trim_all_free(sb, group, start, max);
      
              ret = ext4_mb_load_buddy(sb, group, &e4b);
              if (ret) {
                      ext4_warning(sb, "Error %d loading buddy information for %u",
                                   ret, group);
                      return ret;
              }
              bitmap = e4b.bd_bitmap;
      
              ext4_lock_group(sb, group);
              if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
                  minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
                      goto out;
      
              start = (e4b.bd_info->bb_first_free > start) ?
                      e4b.bd_info->bb_first_free : start;
      
              while (start <= max) {
                      start = mb_find_next_zero_bit(bitmap, max + 1, start);
                      if (start > max)
                              break;
                      next = mb_find_next_bit(bitmap, max + 1, start);
      
                      if ((next - start) >= minblocks) {
                              ret = ext4_trim_extent(sb, start,
                                                     next - start, group, &e4b,
                                                     blkdev_flags);
                              if (ret && ret != -EOPNOTSUPP)
                                      break;
                              ret = 0;
                              count += next - start;
                      }
                      free_count += next - start;
                      start = next + 1;
      
                      if (fatal_signal_pending(current)) {
                              count = -ERESTARTSYS;
                              break;
                      }
      
                      if (need_resched()) {
                              ext4_unlock_group(sb, group);
                              cond_resched();
                              ext4_lock_group(sb, group);
                      }
      
                      if ((e4b.bd_info->bb_free - free_count) < minblocks)
                              break;
              }
      
              if (!ret) {
                      ret = count;
                      EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
              }
      out:
              ext4_unlock_group(sb, group);
              ext4_mb_unload_buddy(&e4b);
      
              ext4_debug("trimmed %d blocks in the group %d\n",
                      count, group);
      
              return ret;
      }
      
      /**
        * ext4_trim_fs() -- trim ioctl handler function
        * @sb:                        superblock for filesystem
        * @range:                fstrim_range structure
        * @blkdev_flags:        flags for the block device
        *
        * start:        first byte to trim
        * len:                number of bytes to trim from start
        * minlen:        minimum extent length in bytes
        * ext4_trim_fs goes through all allocation groups containing bytes from
        * start to start+len. For each such group the ext4_trim_all_free function
        * is invoked to trim all free space.
       */
      int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
                              unsigned long blkdev_flags)
      {
              struct ext4_group_info *grp;
              ext4_group_t group, first_group, last_group;
              ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
              uint64_t start, end, minlen, trimmed = 0;
              ext4_fsblk_t first_data_blk =
                              le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
              ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
              int ret = 0;
      
              start = range->start >> sb->s_blocksize_bits;
              end = start + (range->len >> sb->s_blocksize_bits) - 1;
              minlen = EXT4_NUM_B2C(EXT4_SB(sb),
                                    range->minlen >> sb->s_blocksize_bits);
      
              if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
                  start >= max_blks ||
                  range->len < sb->s_blocksize)
                      return -EINVAL;
              if (end >= max_blks)
                      end = max_blks - 1;
              if (end <= first_data_blk)
                      goto out;
              if (start < first_data_blk)
                      start = first_data_blk;
      
              /* Determine first and last group to examine based on start and end */
              ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
                                           &first_group, &first_cluster);
              ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
                                           &last_group, &last_cluster);
      
              /* end now represents the last cluster to discard in this group */
              end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
      
              for (group = first_group; group <= last_group; group++) {
                      grp = ext4_get_group_info(sb, group);
                      /* We only do this if the grp has never been initialized */
                      if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                              ret = ext4_mb_init_group(sb, group, GFP_NOFS);
                              if (ret)
                                      break;
                      }
      
                      /*
                       * For all the groups except the last one, last cluster will
                       * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
                * change it for the last group; note that last_cluster was
                * already computed earlier by ext4_get_group_no_and_offset()
                       */
                      if (group == last_group)
                              end = last_cluster;
      
                      if (grp->bb_free >= minlen) {
                              cnt = ext4_trim_all_free(sb, group, first_cluster,
                                                      end, minlen, blkdev_flags);
                              if (cnt < 0) {
                                      ret = cnt;
                                      break;
                              }
                              trimmed += cnt;
                      }
      
                      /*
                       * For every group except the first one, we are sure
                       * that the first cluster to discard will be cluster #0.
                       */
                      first_cluster = 0;
              }
      
              if (!ret)
                      atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
      
      out:
              range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
              return ret;
      }
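
       /*
        * Illustrative sketch, not part of the original file: how an FITRIM-style
        * ioctl handler might fill an fstrim_range and call ext4_trim_fs().  The
        * function name and the byte values are placeholders; blkdev_flags of 0
        * requests a plain (non-secure) discard.
        */
       static int example_trim_whole_fs(struct super_block *sb)
       {
               struct fstrim_range range = {
                       .start  = 0,            /* first byte to trim */
                       .len    = ULLONG_MAX,   /* trim to the end of the fs */
                       .minlen = 0,            /* no minimum extent length */
               };
               int ret;

               ret = ext4_trim_fs(sb, &range, 0);
               /* on return, range.len holds the number of bytes actually trimmed */
               return ret;
       }
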
      /*
       *        linux/kernel/resource.c
       *
       * Copyright (C) 1999        Linus Torvalds
       * Copyright (C) 1999        Martin Mares <mj@ucw.cz>
       *
       * Arbitrary resource management.
       */
      
      #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      
      #include <linux/export.h>
      #include <linux/errno.h>
      #include <linux/ioport.h>
      #include <linux/init.h>
      #include <linux/slab.h>
      #include <linux/spinlock.h>
      #include <linux/fs.h>
      #include <linux/proc_fs.h>
      #include <linux/sched.h>
      #include <linux/seq_file.h>
      #include <linux/device.h>
      #include <linux/pfn.h>
      #include <linux/mm.h>
      #include <linux/resource_ext.h>
      #include <asm/io.h>
      
      
      struct resource ioport_resource = {
              .name        = "PCI IO",
              .start        = 0,
              .end        = IO_SPACE_LIMIT,
              .flags        = IORESOURCE_IO,
      };
      EXPORT_SYMBOL(ioport_resource);
      
      struct resource iomem_resource = {
              .name        = "PCI mem",
              .start        = 0,
              .end        = -1,
              .flags        = IORESOURCE_MEM,
      };
      EXPORT_SYMBOL(iomem_resource);
      
      /* constraints to be met while allocating resources */
      struct resource_constraint {
              resource_size_t min, max, align;
              resource_size_t (*alignf)(void *, const struct resource *,
                              resource_size_t, resource_size_t);
              void *alignf_data;
      };
      
      static DEFINE_RWLOCK(resource_lock);
      
      /*
        * For memory hotplug, there is no way to free resource entries allocated
        * by boot memory after the system is up. So to reuse such an entry we
        * need to remember the resource on a free list.
       */
      static struct resource *bootmem_resource_free;
      static DEFINE_SPINLOCK(bootmem_resource_lock);
      
      static struct resource *next_resource(struct resource *p, bool sibling_only)
      {
              /* Caller wants to traverse through siblings only */
    6         if (sibling_only)
    6                 return p->sibling;
      
              if (p->child)
                      return p->child;
              while (!p->sibling && p->parent)
                      p = p->parent;
              return p->sibling;
      }
      
      static void *r_next(struct seq_file *m, void *v, loff_t *pos)
      {
              struct resource *p = v;
              (*pos)++;
              return (void *)next_resource(p, false);
      }
      
      #ifdef CONFIG_PROC_FS
      
      enum { MAX_IORES_LEVEL = 5 };
      
      static void *r_start(struct seq_file *m, loff_t *pos)
              __acquires(resource_lock)
      {
              struct resource *p = m->private;
              loff_t l = 0;
              read_lock(&resource_lock);
              for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
                      ;
              return p;
      }
      
      static void r_stop(struct seq_file *m, void *v)
              __releases(resource_lock)
      {
              read_unlock(&resource_lock);
      }
      
      static int r_show(struct seq_file *m, void *v)
      {
              struct resource *root = m->private;
              struct resource *r = v, *p;
              unsigned long long start, end;
              int width = root->end < 0x10000 ? 4 : 8;
              int depth;
      
              for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
                      if (p->parent == root)
                              break;
      
              if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
                      start = r->start;
                      end = r->end;
              } else {
                      start = end = 0;
              }
      
              seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
                              depth * 2, "",
                              width, start,
                              width, end,
                              r->name ? r->name : "<BAD>");
              return 0;
      }
      
      static const struct seq_operations resource_op = {
              .start        = r_start,
              .next        = r_next,
              .stop        = r_stop,
              .show        = r_show,
      };
      
      static int ioports_open(struct inode *inode, struct file *file)
      {
              int res = seq_open(file, &resource_op);
              if (!res) {
                      struct seq_file *m = file->private_data;
                      m->private = &ioport_resource;
              }
              return res;
      }
      
      static int iomem_open(struct inode *inode, struct file *file)
      {
              int res = seq_open(file, &resource_op);
              if (!res) {
                      struct seq_file *m = file->private_data;
                      m->private = &iomem_resource;
              }
              return res;
      }
      
      static const struct file_operations proc_ioports_operations = {
              .open                = ioports_open,
              .read                = seq_read,
              .llseek                = seq_lseek,
              .release        = seq_release,
      };
      
      static const struct file_operations proc_iomem_operations = {
              .open                = iomem_open,
              .read                = seq_read,
              .llseek                = seq_lseek,
              .release        = seq_release,
      };
      
      static int __init ioresources_init(void)
      {
              proc_create("ioports", 0, NULL, &proc_ioports_operations);
              proc_create("iomem", 0, NULL, &proc_iomem_operations);
              return 0;
      }
      __initcall(ioresources_init);
      
      #endif /* CONFIG_PROC_FS */
      
      static void free_resource(struct resource *res)
      {
              if (!res)
                      return;
      
              if (!PageSlab(virt_to_head_page(res))) {
                      spin_lock(&bootmem_resource_lock);
                      res->sibling = bootmem_resource_free;
                      bootmem_resource_free = res;
                      spin_unlock(&bootmem_resource_lock);
              } else {
                      kfree(res);
              }
      }
      
      static struct resource *alloc_resource(gfp_t flags)
      {
              struct resource *res = NULL;
      
              spin_lock(&bootmem_resource_lock);
              if (bootmem_resource_free) {
                      res = bootmem_resource_free;
                      bootmem_resource_free = res->sibling;
              }
              spin_unlock(&bootmem_resource_lock);
      
              if (res)
                      memset(res, 0, sizeof(struct resource));
              else
                      res = kzalloc(sizeof(struct resource), flags);
      
              return res;
      }
      
      /* Return the conflict entry if you can't request it */
      static struct resource * __request_resource(struct resource *root, struct resource *new)
      {
              resource_size_t start = new->start;
              resource_size_t end = new->end;
              struct resource *tmp, **p;
      
              if (end < start)
                      return root;
              if (start < root->start)
                      return root;
              if (end > root->end)
                      return root;
              p = &root->child;
              for (;;) {
                      tmp = *p;
                      if (!tmp || tmp->start > end) {
                              new->sibling = tmp;
                              *p = new;
                              new->parent = root;
                              return NULL;
                      }
                      p = &tmp->sibling;
                      if (tmp->end < start)
                              continue;
                      return tmp;
              }
      }
      
      static int __release_resource(struct resource *old)
      {
              struct resource *tmp, **p;
      
              p = &old->parent->child;
              for (;;) {
                      tmp = *p;
                      if (!tmp)
                              break;
                      if (tmp == old) {
                              *p = tmp->sibling;
                              old->parent = NULL;
                              return 0;
                      }
                      p = &tmp->sibling;
              }
              return -EINVAL;
      }
      
      static void __release_child_resources(struct resource *r)
      {
              struct resource *tmp, *p;
              resource_size_t size;
      
              p = r->child;
              r->child = NULL;
              while (p) {
                      tmp = p;
                      p = p->sibling;
      
                      tmp->parent = NULL;
                      tmp->sibling = NULL;
                      __release_child_resources(tmp);
      
                      printk(KERN_DEBUG "release child resource %pR\n", tmp);
                      /* need to restore size, and keep flags */
                      size = resource_size(tmp);
                      tmp->start = 0;
                      tmp->end = size - 1;
              }
      }
      
      void release_child_resources(struct resource *r)
      {
              write_lock(&resource_lock);
              __release_child_resources(r);
              write_unlock(&resource_lock);
      }
      
      /**
       * request_resource_conflict - request and reserve an I/O or memory resource
       * @root: root resource descriptor
       * @new: resource descriptor desired by caller
       *
       * Returns 0 for success, conflict resource on error.
       */
      struct resource *request_resource_conflict(struct resource *root, struct resource *new)
      {
              struct resource *conflict;
      
              write_lock(&resource_lock);
              conflict = __request_resource(root, new);
              write_unlock(&resource_lock);
              return conflict;
      }
      
      /**
       * request_resource - request and reserve an I/O or memory resource
       * @root: root resource descriptor
       * @new: resource descriptor desired by caller
       *
       * Returns 0 for success, negative error code on error.
       */
      int request_resource(struct resource *root, struct resource *new)
      {
              struct resource *conflict;
      
              conflict = request_resource_conflict(root, new);
              return conflict ? -EBUSY : 0;
      }
      
      EXPORT_SYMBOL(request_resource);
      
      /**
       * release_resource - release a previously reserved resource
       * @old: resource pointer
       */
      int release_resource(struct resource *old)
      {
              int retval;
      
              write_lock(&resource_lock);
              retval = __release_resource(old);
              write_unlock(&resource_lock);
              return retval;
      }
      
      EXPORT_SYMBOL(release_resource);
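
       /*
        * Illustrative sketch, not part of the original file: a driver reserving
        * a hypothetical I/O port window under ioport_resource and releasing it
        * again.  The port range and names are placeholders.
        */
       static struct resource example_ports = {
               .name   = "example-device",
               .start  = 0x0300,
               .end    = 0x031f,
               .flags  = IORESOURCE_IO,
       };

       static int example_claim_ports(void)
       {
               int ret = request_resource(&ioport_resource, &example_ports);

               if (ret)        /* -EBUSY: the range conflicts with an existing entry */
                       return ret;
               /* ... program the device ... */
               release_resource(&example_ports);
               return 0;
       }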
      
      /*
        * Finds the lowest iomem resource that exists within [res->start, res->end).
        * The caller must specify res->start, res->end, res->flags and "name".
        * If found, returns 0 and res is overwritten; if not found, returns -1.
        * This walks through the whole tree, not just the first level of children,
        * unless first_level_children_only is true.
       */
      static int find_next_iomem_res(struct resource *res, char *name,
                                     bool first_level_children_only)
      {
              resource_size_t start, end;
              struct resource *p;
              bool sibling_only = false;
      
              BUG_ON(!res);
      
               start = res->start;
               end = res->end;
               BUG_ON(start >= end);

               if (first_level_children_only)
                       sibling_only = true;

               read_lock(&resource_lock);

               for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
                       if (p->flags != res->flags)
                               continue;
                       if (name && strcmp(p->name, name))
                               continue;
                       if (p->start > end) {
                               p = NULL;
                               break;
                       }
                       if ((p->end >= start) && (p->start < end))
                               break;
               }

               read_unlock(&resource_lock);
               if (!p)
                       return -1;
               /* copy data */
               if (res->start < p->start)
                       res->start = p->start;
               if (res->end > p->end)
                      res->end = p->end;
              return 0;
      }
      
      /*
       * Walks through iomem resources and calls func() with matching resource
        * ranges. This walks the whole tree, not just the first-level children.
        * All memory ranges that overlap [start, end] and also match the given
        * flags and name are valid candidates.
        *
        * @name: name of resource
        * @flags: resource flags
        * @start: start addr
        * @end: end addr
        * @arg: function argument for the callback @func
        * @func: callback function that is called for each qualifying resource area
       */
      int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
                      void *arg, int (*func)(u64, u64, void *))
      {
              struct resource res;
              u64 orig_end;
              int ret = -1;
      
              res.start = start;
              res.end = end;
              res.flags = flags;
              orig_end = res.end;
              while ((res.start < res.end) &&
                      (!find_next_iomem_res(&res, name, false))) {
                      ret = (*func)(res.start, res.end, arg);
                      if (ret)
                              break;
                      res.start = res.end + 1;
                      res.end = orig_end;
              }
              return ret;
      }
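
       /*
        * Illustrative sketch (not part of the original file): summing how many
        * bytes of busy "Reserved" iomem ranges fall inside a window by walking
        * the tree with walk_iomem_res().  The resource name is an assumption.
        */
       static int example_sum_range(u64 start, u64 end, void *arg)
       {
               u64 *total = arg;

               *total += end - start + 1;
               return 0;               /* keep walking */
       }

       static u64 example_count_reserved(u64 win_start, u64 win_end)
       {
               u64 total = 0;

               walk_iomem_res("Reserved", IORESOURCE_MEM | IORESOURCE_BUSY,
                              win_start, win_end, &total, example_sum_range);
               return total;
       }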
      
      /*
        * This function calls the callback against all memory ranges of "System RAM"
        * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
        * For now, this function only handles "System RAM".  It deals with
        * full ranges and not pfns; if resources are not pfn aligned,
        * converting to pfns can truncate ranges.
       */
      int walk_system_ram_res(u64 start, u64 end, void *arg,
                                      int (*func)(u64, u64, void *))
      {
              struct resource res;
              u64 orig_end;
              int ret = -1;
      
              res.start = start;
              res.end = end;
              res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
              orig_end = res.end;
              while ((res.start < res.end) &&
                      (!find_next_iomem_res(&res, "System RAM", true))) {
                      ret = (*func)(res.start, res.end, arg);
                      if (ret)
                              break;
                      res.start = res.end + 1;
                      res.end = orig_end;
              }
              return ret;
      }
      
      #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
      
      /*
        * This function calls the callback against all memory ranges of "System RAM"
        * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
        * For now, this function only handles "System RAM".
       */
      int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                      void *arg, int (*func)(unsigned long, unsigned long, void *))
      {
              struct resource res;
              unsigned long pfn, end_pfn;
              u64 orig_end;
              int ret = -1;
      
               res.start = (u64) start_pfn << PAGE_SHIFT;
               res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
               res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
               orig_end = res.end;
               while ((res.start < res.end) &&
                       (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
                       pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
                       end_pfn = (res.end + 1) >> PAGE_SHIFT;
                       if (end_pfn > pfn)
                               ret = (*func)(pfn, end_pfn - pfn, arg);
                       if (ret)
                               break;
                       res.start = res.end + 1;
                       res.end = orig_end;
               }
               return ret;
      }
      
      #endif
      
      static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
      {
              return 1;
      }
      /*
        * This generic page_is_ram() returns true if the specified pfn is
        * registered as "System RAM" in the iomem_resource list.
       */
      int __weak page_is_ram(unsigned long pfn)
      {
              return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
      }
      EXPORT_SYMBOL_GPL(page_is_ram);
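
       /*
        * Illustrative sketch (not part of the original file): totalling the
        * number of "System RAM" pages in a pfn window via walk_system_ram_range().
        */
       static int example_count_pages(unsigned long pfn, unsigned long nr_pages,
                                      void *arg)
       {
               unsigned long *count = arg;

               *count += nr_pages;
               return 0;               /* 0 means: continue the walk */
       }

       static unsigned long example_ram_pages(unsigned long start_pfn,
                                              unsigned long nr_pages)
       {
               unsigned long count = 0;

               walk_system_ram_range(start_pfn, nr_pages, &count,
                                     example_count_pages);
               return count;
       }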
      
      /**
       * region_intersects() - determine intersection of region with known resources
       * @start: region start address
       * @size: size of region
       * @name: name of resource (in iomem_resource)
       *
       * Check if the specified region partially overlaps or fully eclipses a
       * resource identified by @name.  Return REGION_DISJOINT if the region
       * does not overlap @name, return REGION_MIXED if the region overlaps
        * @name and another resource, and return REGION_INTERSECTS if the
        * region overlaps @name and no other defined resource.  Note that
        * REGION_INTERSECTS is also returned when the specified
        * region overlaps RAM and undefined memory holes.
        *
        * region_intersects() is used by memory remapping functions to ensure
        * the user is not remapping RAM; it is a vast speed-up over walking
       * through the resource table page by page.
       */
      int region_intersects(resource_size_t start, size_t size, const char *name)
      {
              unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
              resource_size_t end = start + size - 1;
              int type = 0; int other = 0;
              struct resource *p;
      
              read_lock(&resource_lock);
              for (p = iomem_resource.child; p ; p = p->sibling) {
                      bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
      
                      if (start >= p->start && start <= p->end)
                              is_type ? type++ : other++;
                      if (end >= p->start && end <= p->end)
                              is_type ? type++ : other++;
                      if (p->start >= start && p->end <= end)
                              is_type ? type++ : other++;
              }
              read_unlock(&resource_lock);
      
              if (other == 0)
                      return type ? REGION_INTERSECTS : REGION_DISJOINT;
      
              if (type)
                      return REGION_MIXED;
      
              return REGION_DISJOINT;
      }
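
       /*
        * Illustrative sketch (not part of the original file): a remapping helper
        * could reject requests that touch "System RAM" before setting up a
        * mapping, which is the use case described in the comment above.
        */
       static bool example_safe_to_remap(resource_size_t offset, size_t size)
       {
               return region_intersects(offset, size, "System RAM") ==
                      REGION_DISJOINT;
       }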
      
      void __weak arch_remove_reservations(struct resource *avail)
      {
      }
      
      static resource_size_t simple_align_resource(void *data,
                                                   const struct resource *avail,
                                                   resource_size_t size,
                                                   resource_size_t align)
      {
              return avail->start;
      }
      
      static void resource_clip(struct resource *res, resource_size_t min,
                                resource_size_t max)
      {
              if (res->start < min)
                      res->start = min;
              if (res->end > max)
                      res->end = max;
      }
      
      /*
       * Find empty slot in the resource tree with the given range and
       * alignment constraints
       */
      static int __find_resource(struct resource *root, struct resource *old,
                               struct resource *new,
                               resource_size_t  size,
                               struct resource_constraint *constraint)
      {
              struct resource *this = root->child;
              struct resource tmp = *new, avail, alloc;
      
              tmp.start = root->start;
              /*
               * Skip past an allocated resource that starts at 0, since the assignment
               * of this->start - 1 to tmp->end below would cause an underflow.
               */
              if (this && this->start == root->start) {
                      tmp.start = (this == old) ? old->start : this->end + 1;
                      this = this->sibling;
              }
              for(;;) {
                      if (this)
                              tmp.end = (this == old) ?  this->end : this->start - 1;
                      else
                              tmp.end = root->end;
      
                      if (tmp.end < tmp.start)
                              goto next;
      
                      resource_clip(&tmp, constraint->min, constraint->max);
                      arch_remove_reservations(&tmp);
      
                      /* Check for overflow after ALIGN() */
                      avail.start = ALIGN(tmp.start, constraint->align);
                      avail.end = tmp.end;
                      avail.flags = new->flags & ~IORESOURCE_UNSET;
                      if (avail.start >= tmp.start) {
                              alloc.flags = avail.flags;
                              alloc.start = constraint->alignf(constraint->alignf_data, &avail,
                                              size, constraint->align);
                              alloc.end = alloc.start + size - 1;
                              if (alloc.start <= alloc.end &&
                                  resource_contains(&avail, &alloc)) {
                                      new->start = alloc.start;
                                      new->end = alloc.end;
                                      return 0;
                              }
                      }
      
      next:                if (!this || this->end == root->end)
                              break;
      
                      if (this != old)
                              tmp.start = this->end + 1;
                      this = this->sibling;
              }
              return -EBUSY;
      }
      
      /*
       * Find empty slot in the resource tree given range and alignment.
       */
      static int find_resource(struct resource *root, struct resource *new,
                              resource_size_t size,
                              struct resource_constraint  *constraint)
      {
              return  __find_resource(root, NULL, new, size, constraint);
      }
      
      /**
       * reallocate_resource - allocate a slot in the resource tree given range & alignment.
        *        The resource will be relocated if the new size cannot be
        *        accommodated at its current location.
       *
       * @root: root resource descriptor
       * @old:  resource descriptor desired by caller
       * @newsize: new size of the resource descriptor
       * @constraint: the size and alignment constraints to be met.
       */
      static int reallocate_resource(struct resource *root, struct resource *old,
                              resource_size_t newsize,
                              struct resource_constraint  *constraint)
      {
              int err=0;
              struct resource new = *old;
              struct resource *conflict;
      
              write_lock(&resource_lock);
      
              if ((err = __find_resource(root, old, &new, newsize, constraint)))
                      goto out;
      
              if (resource_contains(&new, old)) {
                      old->start = new.start;
                      old->end = new.end;
                      goto out;
              }
      
              if (old->child) {
                      err = -EBUSY;
                      goto out;
              }
      
              if (resource_contains(old, &new)) {
                      old->start = new.start;
                      old->end = new.end;
              } else {
                      __release_resource(old);
                      *old = new;
                      conflict = __request_resource(root, old);
                      BUG_ON(conflict);
              }
      out:
              write_unlock(&resource_lock);
              return err;
      }
      
      
      /**
       * allocate_resource - allocate empty slot in the resource tree given range & alignment.
        *         The resource will be reallocated with a new size if it was already allocated.
       * @root: root resource descriptor
       * @new: resource descriptor desired by caller
       * @size: requested resource region size
       * @min: minimum boundary to allocate
       * @max: maximum boundary to allocate
       * @align: alignment requested, in bytes
       * @alignf: alignment function, optional, called if not NULL
       * @alignf_data: arbitrary data to pass to the @alignf function
       */
      int allocate_resource(struct resource *root, struct resource *new,
                            resource_size_t size, resource_size_t min,
                            resource_size_t max, resource_size_t align,
                            resource_size_t (*alignf)(void *,
                                                      const struct resource *,
                                                      resource_size_t,
                                                      resource_size_t),
                            void *alignf_data)
      {
              int err;
              struct resource_constraint constraint;
      
              if (!alignf)
                      alignf = simple_align_resource;
      
              constraint.min = min;
              constraint.max = max;
              constraint.align = align;
              constraint.alignf = alignf;
              constraint.alignf_data = alignf_data;
      
              if ( new->parent ) {
                      /* resource is already allocated, try reallocating with
                         the new constraints */
                      return reallocate_resource(root, new, size, &constraint);
              }
      
              write_lock(&resource_lock);
              err = find_resource(root, new, size, &constraint);
              if (err >= 0 && __request_resource(root, new))
                      err = -EBUSY;
              write_unlock(&resource_lock);
              return err;
      }
      
      EXPORT_SYMBOL(allocate_resource);
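
       /*
        * Illustrative sketch (not part of the original file): asking
        * allocate_resource() for a free 64 KiB, 64 KiB-aligned window below
        * 4 GiB under iomem_resource.  The size, bounds and name are examples.
        */
       static struct resource example_window = {
               .name  = "example-window",
               .flags = IORESOURCE_MEM,
       };

       static int example_alloc_window(void)
       {
               return allocate_resource(&iomem_resource, &example_window,
                                        0x10000,               /* size */
                                        0, 0xffffffff,         /* min, max */
                                        0x10000,               /* alignment */
                                        NULL, NULL);           /* default alignf */
       }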
      
      /**
       * lookup_resource - find an existing resource by a resource start address
       * @root: root resource descriptor
       * @start: resource start address
       *
       * Returns a pointer to the resource if found, NULL otherwise
       */
      struct resource *lookup_resource(struct resource *root, resource_size_t start)
      {
              struct resource *res;
      
              read_lock(&resource_lock);
              for (res = root->child; res; res = res->sibling) {
                      if (res->start == start)
                              break;
              }
              read_unlock(&resource_lock);
      
              return res;
      }
      
      /*
       * Insert a resource into the resource tree. If successful, return NULL,
       * otherwise return the conflicting resource (compare to __request_resource())
       */
      static struct resource * __insert_resource(struct resource *parent, struct resource *new)
      {
              struct resource *first, *next;
      
              for (;; parent = first) {
                      first = __request_resource(parent, new);
                      if (!first)
                              return first;
      
                      if (first == parent)
                              return first;
                      if (WARN_ON(first == new))        /* duplicated insertion */
                              return first;
      
                      if ((first->start > new->start) || (first->end < new->end))
                              break;
                      if ((first->start == new->start) && (first->end == new->end))
                              break;
              }
      
              for (next = first; ; next = next->sibling) {
                      /* Partial overlap? Bad, and unfixable */
                      if (next->start < new->start || next->end > new->end)
                              return next;
                      if (!next->sibling)
                              break;
                      if (next->sibling->start > new->end)
                              break;
              }
      
              new->parent = parent;
              new->sibling = next->sibling;
              new->child = first;
      
              next->sibling = NULL;
              for (next = first; next; next = next->sibling)
                      next->parent = new;
      
              if (parent->child == first) {
                      parent->child = new;
              } else {
                      next = parent->child;
                      while (next->sibling != first)
                              next = next->sibling;
                      next->sibling = new;
              }
              return NULL;
      }
      
      /**
       * insert_resource_conflict - Inserts resource in the resource tree
       * @parent: parent of the new resource
       * @new: new resource to insert
       *
        * Returns NULL on success, or the conflicting resource if the resource can't be inserted.
       *
       * This function is equivalent to request_resource_conflict when no conflict
       * happens. If a conflict happens, and the conflicting resources
       * entirely fit within the range of the new resource, then the new
       * resource is inserted and the conflicting resources become children of
       * the new resource.
       */
      struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
      {
              struct resource *conflict;
      
              write_lock(&resource_lock);
              conflict = __insert_resource(parent, new);
              write_unlock(&resource_lock);
              return conflict;
      }
      
      /**
       * insert_resource - Inserts a resource in the resource tree
       * @parent: parent of the new resource
       * @new: new resource to insert
       *
       * Returns 0 on success, -EBUSY if the resource can't be inserted.
       */
      int insert_resource(struct resource *parent, struct resource *new)
      {
              struct resource *conflict;
      
              conflict = insert_resource_conflict(parent, new);
              return conflict ? -EBUSY : 0;
      }
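
       /*
        * Illustrative sketch (not part of the original file): platform code
        * could use insert_resource() to register a firmware-described region
        * so that already-registered resources that fit entirely inside it
        * become its children rather than causing -EBUSY.  Values are made up.
        */
       static struct resource example_fw_region = {
               .name  = "example-firmware-region",
               .start = 0xe0000000,
               .end   = 0xefffffff,
               .flags = IORESOURCE_MEM,
       };

       static int example_insert_fw_region(void)
       {
               return insert_resource(&iomem_resource, &example_fw_region);
       }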
      
      /**
       * insert_resource_expand_to_fit - Insert a resource into the resource tree
       * @root: root resource descriptor
       * @new: new resource to insert
       *
       * Insert a resource into the resource tree, possibly expanding it in order
       * to make it encompass any conflicting resources.
       */
      void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
      {
              if (new->parent)
                      return;
      
              write_lock(&resource_lock);
              for (;;) {
                      struct resource *conflict;
      
                      conflict = __insert_resource(root, new);
                      if (!conflict)
                              break;
                      if (conflict == root)
                              break;
      
                      /* Ok, expand resource to cover the conflict, then try again .. */
                      if (conflict->start < new->start)
                              new->start = conflict->start;
                      if (conflict->end > new->end)
                              new->end = conflict->end;
      
                      printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
              }
              write_unlock(&resource_lock);
      }
      
      static int __adjust_resource(struct resource *res, resource_size_t start,
                                      resource_size_t size)
      {
              struct resource *tmp, *parent = res->parent;
              resource_size_t end = start + size - 1;
              int result = -EBUSY;
      
              if (!parent)
                      goto skip;
      
              if ((start < parent->start) || (end > parent->end))
                      goto out;
      
              if (res->sibling && (res->sibling->start <= end))
                      goto out;
      
              tmp = parent->child;
              if (tmp != res) {
                      while (tmp->sibling != res)
                              tmp = tmp->sibling;
                      if (start <= tmp->end)
                              goto out;
              }
      
      skip:
              for (tmp = res->child; tmp; tmp = tmp->sibling)
                      if ((tmp->start < start) || (tmp->end > end))
                              goto out;
      
              res->start = start;
              res->end = end;
              result = 0;
      
       out:
              return result;
      }
      
      /**
       * adjust_resource - modify a resource's start and size
       * @res: resource to modify
       * @start: new start value
       * @size: new size
       *
       * Given an existing resource, change its start and size to match the
       * arguments.  Returns 0 on success, -EBUSY if it can't fit.
       * Existing children of the resource are assumed to be immutable.
       */
      int adjust_resource(struct resource *res, resource_size_t start,
                              resource_size_t size)
      {
              int result;
      
              write_lock(&resource_lock);
              result = __adjust_resource(res, start, size);
              write_unlock(&resource_lock);
              return result;
      }
      EXPORT_SYMBOL(adjust_resource);
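
       /*
        * Illustrative sketch (not part of the original file): growing an
        * already-registered region in place with adjust_resource().  This only
        * succeeds if the enlarged range still fits its parent and does not
        * collide with a sibling.
        */
       static int example_grow_region(struct resource *res, resource_size_t extra)
       {
               return adjust_resource(res, res->start,
                                      resource_size(res) + extra);
       }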
      
      static void __init __reserve_region_with_split(struct resource *root,
                      resource_size_t start, resource_size_t end,
                      const char *name)
      {
              struct resource *parent = root;
              struct resource *conflict;
              struct resource *res = alloc_resource(GFP_ATOMIC);
              struct resource *next_res = NULL;
      
              if (!res)
                      return;
      
              res->name = name;
              res->start = start;
              res->end = end;
              res->flags = IORESOURCE_BUSY;
      
              while (1) {
      
                      conflict = __request_resource(parent, res);
                      if (!conflict) {
                              if (!next_res)
                                      break;
                              res = next_res;
                              next_res = NULL;
                              continue;
                      }
      
                      /* conflict covered whole area */
                      if (conflict->start <= res->start &&
                                      conflict->end >= res->end) {
                              free_resource(res);
                              WARN_ON(next_res);
                              break;
                      }
      
                      /* failed, split and try again */
                      if (conflict->start > res->start) {
                              end = res->end;
                              res->end = conflict->start - 1;
                              if (conflict->end < end) {
                                      next_res = alloc_resource(GFP_ATOMIC);
                                      if (!next_res) {
                                              free_resource(res);
                                              break;
                                      }
                                      next_res->name = name;
                                      next_res->start = conflict->end + 1;
                                      next_res->end = end;
                                      next_res->flags = IORESOURCE_BUSY;
                              }
                      } else {
                              res->start = conflict->end + 1;
                      }
              }
      
      }
      
      void __init reserve_region_with_split(struct resource *root,
                      resource_size_t start, resource_size_t end,
                      const char *name)
      {
              int abort = 0;
      
              write_lock(&resource_lock);
              if (root->start > start || root->end < end) {
                      pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
                             (unsigned long long)start, (unsigned long long)end,
                             root);
                      if (start > root->end || end < root->start)
                              abort = 1;
                      else {
                              if (end > root->end)
                                      end = root->end;
                              if (start < root->start)
                                      start = root->start;
                              pr_err("fixing request to [0x%llx-0x%llx]\n",
                                     (unsigned long long)start,
                                     (unsigned long long)end);
                      }
                      dump_stack();
              }
              if (!abort)
                      __reserve_region_with_split(root, start, end, name);
              write_unlock(&resource_lock);
      }
      
      /**
       * resource_alignment - calculate resource's alignment
       * @res: resource pointer
       *
       * Returns alignment on success, 0 (invalid alignment) on failure.
       */
      resource_size_t resource_alignment(struct resource *res)
      {
              switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
              case IORESOURCE_SIZEALIGN:
                      return resource_size(res);
              case IORESOURCE_STARTALIGN:
                      return res->start;
              default:
                      return 0;
              }
      }
      
      /*
       * This is compatibility stuff for IO resources.
       *
       * Note how this, unlike the above, knows about
       * the IO flag meanings (busy etc).
       *
       * request_region creates a new busy region.
       *
       * release_region releases a matching busy region.
       */
      
      static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
      
      /**
       * __request_region - create a new busy resource region
       * @parent: parent resource descriptor
       * @start: resource start address
       * @n: resource region size
       * @name: reserving caller's ID string
       * @flags: IO resource flags
       */
      struct resource * __request_region(struct resource *parent,
                                         resource_size_t start, resource_size_t n,
                                         const char *name, int flags)
      {
              DECLARE_WAITQUEUE(wait, current);
              struct resource *res = alloc_resource(GFP_KERNEL);
      
              if (!res)
                      return NULL;
      
              res->name = name;
              res->start = start;
              res->end = start + n - 1;
              res->flags = resource_type(parent);
              res->flags |= IORESOURCE_BUSY | flags;
      
              write_lock(&resource_lock);
      
              for (;;) {
                      struct resource *conflict;
      
                      conflict = __request_resource(parent, res);
                      if (!conflict)
                              break;
                      if (conflict != parent) {
                              if (!(conflict->flags & IORESOURCE_BUSY)) {
                                      parent = conflict;
                                      continue;
                              }
                      }
                      if (conflict->flags & flags & IORESOURCE_MUXED) {
                              add_wait_queue(&muxed_resource_wait, &wait);
                              write_unlock(&resource_lock);
                              set_current_state(TASK_UNINTERRUPTIBLE);
                              schedule();
                              remove_wait_queue(&muxed_resource_wait, &wait);
                              write_lock(&resource_lock);
                              continue;
                      }
                      /* Uhhuh, that didn't work out.. */
                      free_resource(res);
                      res = NULL;
                      break;
              }
              write_unlock(&resource_lock);
              return res;
      }
      EXPORT_SYMBOL(__request_region);
      
      /**
       * __release_region - release a previously reserved resource region
       * @parent: parent resource descriptor
       * @start: resource start address
       * @n: resource region size
       *
       * The described resource region must match a currently busy region.
       */
      void __release_region(struct resource *parent, resource_size_t start,
                              resource_size_t n)
      {
              struct resource **p;
              resource_size_t end;
      
              p = &parent->child;
              end = start + n - 1;
      
              write_lock(&resource_lock);
      
              for (;;) {
                      struct resource *res = *p;
      
                      if (!res)
                              break;
                      if (res->start <= start && res->end >= end) {
                              if (!(res->flags & IORESOURCE_BUSY)) {
                                      p = &res->child;
                                      continue;
                              }
                              if (res->start != start || res->end != end)
                                      break;
                              *p = res->sibling;
                              write_unlock(&resource_lock);
                              if (res->flags & IORESOURCE_MUXED)
                                      wake_up(&muxed_resource_wait);
                              free_resource(res);
                              return;
                      }
                      p = &res->sibling;
              }
      
              write_unlock(&resource_lock);
      
              printk(KERN_WARNING "Trying to free nonexistent resource "
                      "<%016llx-%016llx>\n", (unsigned long long)start,
                      (unsigned long long)end);
      }
      EXPORT_SYMBOL(__release_region);
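
       /*
        * Illustrative sketch (not part of the original file): the usual driver
        * pattern built on the two helpers above, via the request_mem_region()
        * and release_mem_region() wrappers from <linux/ioport.h>.  The base and
        * size values are examples only.
        */
       static int example_claim_registers(void)
       {
               if (!request_mem_region(0xfeb00000, 0x1000, "example-regs"))
                       return -EBUSY;          /* region already marked busy */

               /* ... ioremap() and program the registers here ... */

               release_mem_region(0xfeb00000, 0x1000);
               return 0;
       }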
      
      #ifdef CONFIG_MEMORY_HOTREMOVE
      /**
       * release_mem_region_adjustable - release a previously reserved memory region
       * @parent: parent resource descriptor
       * @start: resource start address
       * @size: resource region size
       *
       * This interface is intended for memory hot-delete.  The requested region
       * is released from a currently busy memory resource.  The requested region
       * must either match exactly or fit into a single busy resource entry.  In
       * the latter case, the remaining resource is adjusted accordingly.
       * Existing children of the busy memory resource must be immutable in the
       * request.
       *
       * Note:
       * - Additional release conditions, such as overlapping region, can be
       *   supported after they are confirmed as valid cases.
       * - When a busy memory resource gets split into two entries, the code
       *   assumes that all children remain in the lower address entry for
       *   simplicity.  Enhance this logic when necessary.
       */
      int release_mem_region_adjustable(struct resource *parent,
                              resource_size_t start, resource_size_t size)
      {
              struct resource **p;
              struct resource *res;
              struct resource *new_res;
              resource_size_t end;
              int ret = -EINVAL;
      
              end = start + size - 1;
              if ((start < parent->start) || (end > parent->end))
                      return ret;
      
              /* The alloc_resource() result gets checked later */
              new_res = alloc_resource(GFP_KERNEL);
      
              p = &parent->child;
              write_lock(&resource_lock);
      
              while ((res = *p)) {
                      if (res->start >= end)
                              break;
      
                       /* look for the next resource if the request does not fit into it */
                      if (res->start > start || res->end < end) {
                              p = &res->sibling;
                              continue;
                      }
      
                      if (!(res->flags & IORESOURCE_MEM))
                              break;
      
                      if (!(res->flags & IORESOURCE_BUSY)) {
                              p = &res->child;
                              continue;
                      }
      
                      /* found the target resource; let's adjust accordingly */
                      if (res->start == start && res->end == end) {
                              /* free the whole entry */
                              *p = res->sibling;
                              free_resource(res);
                              ret = 0;
                      } else if (res->start == start && res->end != end) {
                              /* adjust the start */
                              ret = __adjust_resource(res, end + 1,
                                                      res->end - end);
                      } else if (res->start != start && res->end == end) {
                              /* adjust the end */
                              ret = __adjust_resource(res, res->start,
                                                      start - res->start);
                      } else {
                              /* split into two entries */
                              if (!new_res) {
                                      ret = -ENOMEM;
                                      break;
                              }
                              new_res->name = res->name;
                              new_res->start = end + 1;
                              new_res->end = res->end;
                              new_res->flags = res->flags;
                              new_res->parent = res->parent;
                              new_res->sibling = res->sibling;
                              new_res->child = NULL;
      
                              ret = __adjust_resource(res, res->start,
                                                      start - res->start);
                              if (ret)
                                      break;
                              res->sibling = new_res;
                              new_res = NULL;
                      }
      
                      break;
              }
      
              write_unlock(&resource_lock);
              free_resource(new_res);
              return ret;
      }
      #endif        /* CONFIG_MEMORY_HOTREMOVE */
      
      /*
       * Managed region resource
       */
      static void devm_resource_release(struct device *dev, void *ptr)
      {
              struct resource **r = ptr;
      
              release_resource(*r);
      }
      
      /**
       * devm_request_resource() - request and reserve an I/O or memory resource
       * @dev: device for which to request the resource
       * @root: root of the resource tree from which to request the resource
       * @new: descriptor of the resource to request
       *
       * This is a device-managed version of request_resource(). There is usually
       * no need to release resources requested by this function explicitly since
       * that will be taken care of when the device is unbound from its driver.
       * If for some reason the resource needs to be released explicitly, because
       * of ordering issues for example, drivers must call devm_release_resource()
       * rather than the regular release_resource().
       *
       * When a conflict is detected between any existing resources and the newly
       * requested resource, an error message will be printed.
       *
       * Returns 0 on success or a negative error code on failure.
       */
      int devm_request_resource(struct device *dev, struct resource *root,
                                struct resource *new)
      {
              struct resource *conflict, **ptr;
      
              ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
              if (!ptr)
                      return -ENOMEM;
      
              *ptr = new;
      
              conflict = request_resource_conflict(root, new);
              if (conflict) {
                      dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
                              new, conflict->name, conflict);
                      devres_free(ptr);
                      return -EBUSY;
              }
      
              devres_add(dev, ptr);
              return 0;
      }
      EXPORT_SYMBOL(devm_request_resource);
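
       /*
        * Illustrative sketch (not part of the original file): the device-managed
        * variant as it might be used from a hypothetical probe() routine; the
        * reservation is dropped automatically when the device is unbound.
        */
       static int example_probe(struct device *dev, struct resource *regs)
       {
               int err;

               err = devm_request_resource(dev, &iomem_resource, regs);
               if (err)
                       return err;     /* conflict was already logged */

               /* ... continue with device setup ... */
               return 0;
       }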
      
      static int devm_resource_match(struct device *dev, void *res, void *data)
      {
              struct resource **ptr = res;
      
              return *ptr == data;
      }
      
      /**
       * devm_release_resource() - release a previously requested resource
       * @dev: device for which to release the resource
       * @new: descriptor of the resource to release
       *
       * Releases a resource previously requested using devm_request_resource().
       */
      void devm_release_resource(struct device *dev, struct resource *new)
      {
              WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
                                     new));
      }
      EXPORT_SYMBOL(devm_release_resource);
      
      struct region_devres {
              struct resource *parent;
              resource_size_t start;
              resource_size_t n;
      };
      
      static void devm_region_release(struct device *dev, void *res)
      {
              struct region_devres *this = res;
      
              __release_region(this->parent, this->start, this->n);
      }
      
      static int devm_region_match(struct device *dev, void *res, void *match_data)
      {
              struct region_devres *this = res, *match = match_data;
      
              return this->parent == match->parent &&
                      this->start == match->start && this->n == match->n;
      }
      
      struct resource * __devm_request_region(struct device *dev,
                                      struct resource *parent, resource_size_t start,
                                      resource_size_t n, const char *name)
      {
              struct region_devres *dr = NULL;
              struct resource *res;
      
              dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
                                GFP_KERNEL);
              if (!dr)
                      return NULL;
      
              dr->parent = parent;
              dr->start = start;
              dr->n = n;
      
              res = __request_region(parent, start, n, name, 0);
              if (res)
                      devres_add(dev, dr);
              else
                      devres_free(dr);
      
              return res;
      }
      EXPORT_SYMBOL(__devm_request_region);
      
      void __devm_release_region(struct device *dev, struct resource *parent,
                                 resource_size_t start, resource_size_t n)
      {
              struct region_devres match_data = { parent, start, n };
      
              __release_region(parent, start, n);
              WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
                                     &match_data));
      }
      EXPORT_SYMBOL(__devm_release_region);
      
      /*
       * Called from init/main.c to reserve IO ports.
       */
      #define MAXRESERVE 4
      static int __init reserve_setup(char *str)
      {
              static int reserved;
              static struct resource reserve[MAXRESERVE];
      
              for (;;) {
                      unsigned int io_start, io_num;
                      int x = reserved;
      
                      if (get_option (&str, &io_start) != 2)
                              break;
                      if (get_option (&str, &io_num)   == 0)
                              break;
                      if (x < MAXRESERVE) {
                              struct resource *res = reserve + x;
                              res->name = "reserved";
                              res->start = io_start;
                              res->end = io_start + io_num - 1;
                              res->flags = IORESOURCE_BUSY;
                              res->child = NULL;
                              if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
                                      reserved = x+1;
                      }
              }
              return 1;
      }
      
      __setup("reserve=", reserve_setup);
      
      /*
        * Check if the requested addr and size span more than any single slot
        * in the iomem resource tree.
       */
      int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
      {
              struct resource *p = &iomem_resource;
              int err = 0;
              loff_t l;
      
              read_lock(&resource_lock);
              for (p = p->child; p ; p = r_next(NULL, p, &l)) {
                      /*
                       * We can probably skip the resources without
                       * IORESOURCE_IO attribute?
                       */
                      if (p->start >= addr + size)
                              continue;
                      if (p->end < addr)
                              continue;
                      if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
                          PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
                              continue;
                      /*
                       * if a resource is "BUSY", it's not a hardware resource
                       * but a driver mapping of such a resource; we don't want
                       * to warn for those; some drivers legitimately map only
                       * partial hardware resources. (example: vesafb)
                       */
                      if (p->flags & IORESOURCE_BUSY)
                              continue;
      
                      printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
                             (unsigned long long)addr,
                             (unsigned long long)(addr + size - 1),
                             p->name, p);
                      err = -1;
                      break;
              }
              read_unlock(&resource_lock);
      
              return err;
      }
      
      #ifdef CONFIG_STRICT_DEVMEM
      static int strict_iomem_checks = 1;
      #else
      static int strict_iomem_checks;
      #endif
      
      /*
        * Check if an address is reserved in the iomem resource tree.
        * Returns 1 if reserved, 0 if not reserved.
       */
      int iomem_is_exclusive(u64 addr)
      {
              struct resource *p = &iomem_resource;
              int err = 0;
              loff_t l;
              int size = PAGE_SIZE;
      
              if (!strict_iomem_checks)
                      return 0;
      
              addr = addr & PAGE_MASK;
      
              read_lock(&resource_lock);
              for (p = p->child; p ; p = r_next(NULL, p, &l)) {
                      /*
                       * We can probably skip the resources without
                       * IORESOURCE_IO attribute?
                       */
                      if (p->start >= addr + size)
                              break;
                      if (p->end < addr)
                              continue;
                      if (p->flags & IORESOURCE_BUSY &&
                           p->flags & IORESOURCE_EXCLUSIVE) {
                              err = 1;
                              break;
                      }
              }
              read_unlock(&resource_lock);
      
              return err;
      }
      
      struct resource_entry *resource_list_create_entry(struct resource *res,
                                                        size_t extra_size)
      {
              struct resource_entry *entry;
      
              entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
              if (entry) {
                      INIT_LIST_HEAD(&entry->node);
                      entry->res = res ? res : &entry->__res;
              }
      
              return entry;
      }
      EXPORT_SYMBOL(resource_list_create_entry);
      
      void resource_list_free(struct list_head *head)
      {
              struct resource_entry *entry, *tmp;
      
              list_for_each_entry_safe(entry, tmp, head, node)
                      resource_list_destroy_entry(entry);
      }
      EXPORT_SYMBOL(resource_list_free);
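
       /*
        * Illustrative sketch (not part of the original file): building a
        * detached list of resource entries with the helpers above; the whole
        * list can later be released in one go with resource_list_free(head).
        * resource_list_add_tail() is the list helper from <linux/resource_ext.h>.
        */
       static int example_collect(struct list_head *head, struct resource *res)
       {
               struct resource_entry *entry;

               entry = resource_list_create_entry(res, 0);
               if (!entry)
                       return -ENOMEM;

               resource_list_add_tail(entry, head);
               return 0;
       }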
      
      static int __init strict_iomem(char *str)
      {
              if (strstr(str, "relaxed"))
                      strict_iomem_checks = 0;
              if (strstr(str, "strict"))
                      strict_iomem_checks = 1;
              return 1;
      }
      
      __setup("iomem=", strict_iomem);
      /* FTP extension for connection tracking. */
      
      /* (C) 1999-2001 Paul `Rusty' Russell
       * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
       * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
       * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation.
       */
      
      #include <linux/module.h>
      #include <linux/moduleparam.h>
      #include <linux/netfilter.h>
      #include <linux/ip.h>
      #include <linux/slab.h>
      #include <linux/ipv6.h>
      #include <linux/ctype.h>
      #include <linux/inet.h>
      #include <net/checksum.h>
      #include <net/tcp.h>
      
      #include <net/netfilter/nf_conntrack.h>
      #include <net/netfilter/nf_conntrack_expect.h>
      #include <net/netfilter/nf_conntrack_ecache.h>
      #include <net/netfilter/nf_conntrack_helper.h>
      #include <linux/netfilter/nf_conntrack_ftp.h>
      
      MODULE_LICENSE("GPL");
      MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
      MODULE_DESCRIPTION("ftp connection tracking helper");
      MODULE_ALIAS("ip_conntrack_ftp");
      MODULE_ALIAS_NFCT_HELPER("ftp");
      
      /* This is slow, but it's simple. --RR */
      static char *ftp_buffer;
      
      static DEFINE_SPINLOCK(nf_ftp_lock);
      
      #define MAX_PORTS 8
      static u_int16_t ports[MAX_PORTS];
      static unsigned int ports_c;
      module_param_array(ports, ushort, &ports_c, 0400);
      
      static bool loose;
      module_param(loose, bool, 0600);
      
      unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
                                      enum ip_conntrack_info ctinfo,
                                      enum nf_ct_ftp_type type,
                                      unsigned int protoff,
                                      unsigned int matchoff,
                                      unsigned int matchlen,
                                      struct nf_conntrack_expect *exp);
      EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
      
      static int try_rfc959(const char *, size_t, struct nf_conntrack_man *,
                            char, unsigned int *);
      static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *,
                             char, unsigned int *);
      static int try_eprt(const char *, size_t, struct nf_conntrack_man *,
                          char, unsigned int *);
      static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
                                   char, unsigned int *);
      
      static struct ftp_search {
              const char *pattern;
              size_t plen;
              char skip;
              char term;
              enum nf_ct_ftp_type ftptype;
              int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);
      } search[IP_CT_DIR_MAX][2] = {
              [IP_CT_DIR_ORIGINAL] = {
                      {
                              .pattern        = "PORT",
                              .plen                = sizeof("PORT") - 1,
                              .skip                = ' ',
                              .term                = '\r',
                              .ftptype        = NF_CT_FTP_PORT,
                              .getnum                = try_rfc959,
                      },
                      {
                              .pattern        = "EPRT",
                              .plen                = sizeof("EPRT") - 1,
                              .skip                = ' ',
                              .term                = '\r',
                              .ftptype        = NF_CT_FTP_EPRT,
                              .getnum                = try_eprt,
                      },
              },
              [IP_CT_DIR_REPLY] = {
                      {
                              .pattern        = "227 ",
                              .plen                = sizeof("227 ") - 1,
                              .ftptype        = NF_CT_FTP_PASV,
                              .getnum                = try_rfc1123,
                      },
                      {
                              .pattern        = "229 ",
                              .plen                = sizeof("229 ") - 1,
                              .skip                = '(',
                              .term                = ')',
                              .ftptype        = NF_CT_FTP_EPSV,
                              .getnum                = try_epsv_response,
                      },
              },
      };
      
      static int
      get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
      {
              const char *end;
              int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end);
              if (ret > 0)
                      return (int)(end - src);
              return 0;
      }
      
      static int try_number(const char *data, size_t dlen, u_int32_t array[],
                            int array_size, char sep, char term)
      {
              u_int32_t i, len;
      
              memset(array, 0, sizeof(array[0])*array_size);
      
              /* Keep data pointing at next char. */
              for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
                      if (*data >= '0' && *data <= '9') {
                              array[i] = array[i]*10 + *data - '0';
                      }
                      else if (*data == sep)
                              i++;
                      else {
                              /* Unexpected character; true if it's the
                                 terminator (or we don't care about one)
                                 and we're finished. */
                              if ((*data == term || !term) && i == array_size - 1)
                                      return len;
      
                              pr_debug("Char %u (got %u nums) `%u' unexpected\n",
                                       len, i, *data);
                              return 0;
                      }
              }
              pr_debug("Failed to fill %u numbers separated by %c\n",
                       array_size, sep);
              return 0;
      }
      
      /* Returns 0, or length of numbers: 192,168,1,1,5,6 */
      static int try_rfc959(const char *data, size_t dlen,
                            struct nf_conntrack_man *cmd, char term,
                            unsigned int *offset)
      {
              int length;
              u_int32_t array[6];
      
              length = try_number(data, dlen, array, 6, ',', term);
              if (length == 0)
                      return 0;
      
              cmd->u3.ip =  htonl((array[0] << 24) | (array[1] << 16) |
                                          (array[2] << 8) | array[3]);
              cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
              return length;
      }
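
       /*
        * Worked example (not part of the original file): "PORT 192,168,1,2,4,31"
        * yields array[] = { 192, 168, 1, 2, 4, 31 }; the first four numbers form
        * the IPv4 address 192.168.1.2 and the last two the data port
        * 4 * 256 + 31 = 1055, exactly as the shifts above combine them.
        */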
      
      /*
       * From RFC 1123:
       * The format of the 227 reply to a PASV command is not
       * well standardized.  In particular, an FTP client cannot
       * assume that the parentheses shown on page 40 of RFC-959
       * will be present (and in fact, Figure 3 on page 43 omits
       * them).  Therefore, a User-FTP program that interprets
       * the PASV reply must scan the reply for the first digit
       * of the host and port numbers.
       */
      static int try_rfc1123(const char *data, size_t dlen,
                             struct nf_conntrack_man *cmd, char term,
                             unsigned int *offset)
      {
              int i;
              for (i = 0; i < dlen; i++)
                      if (isdigit(data[i]))
                              break;
      
              if (i == dlen)
                      return 0;
      
              *offset += i;
      
              return try_rfc959(data + i, dlen - i, cmd, 0, offset);
      }
      
      /* Grab port: number up to delimiter */
      static int get_port(const char *data, int start, size_t dlen, char delim,
                          __be16 *port)
      {
              u_int16_t tmp_port = 0;
              int i;
      
              for (i = start; i < dlen; i++) {
                      /* Finished? */
                      if (data[i] == delim) {
                              if (tmp_port == 0)
                                      break;
                              *port = htons(tmp_port);
                              pr_debug("get_port: return %d\n", tmp_port);
                              return i + 1;
                      }
                      else if (data[i] >= '0' && data[i] <= '9')
                              tmp_port = tmp_port*10 + data[i] - '0';
                      else { /* Some other crap */
                              pr_debug("get_port: invalid char.\n");
                              break;
                      }
              }
              return 0;
      }
      
      /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
      static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
                          char term, unsigned int *offset)
      {
              char delim;
              int length;
      
              /* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
                 then delimiter again. */
              if (dlen <= 3) {
                      pr_debug("EPRT: too short\n");
                      return 0;
              }
              delim = data[0];
              if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
                      pr_debug("try_eprt: invalid delimitter.\n");
                      return 0;
              }
      
              if ((cmd->l3num == PF_INET && data[1] != '1') ||
                  (cmd->l3num == PF_INET6 && data[1] != '2')) {
                      pr_debug("EPRT: invalid protocol number.\n");
                      return 0;
              }
      
              pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim);
      
              if (data[1] == '1') {
                      u_int32_t array[4];
      
                      /* Now we have IP address. */
                      length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
                      if (length != 0)
                              cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16)
                                                 | (array[2] << 8) | array[3]);
              } else {
                      /* Now we have IPv6 address. */
                      length = get_ipv6_addr(data + 3, dlen - 3,
                                             (struct in6_addr *)cmd->u3.ip6, delim);
              }
      
              if (length == 0)
                      return 0;
              pr_debug("EPRT: Got IP address!\n");
              /* Start offset includes initial "|1|", and trailing delimiter */
              return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
      }
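
       /*
        * Worked example (illustrative only): for "|1|132.235.1.2|6275|" the
        * delimiter is '|', "1" selects IPv4, try_number() (defined earlier in
        * this file) parses 132.235.1.2 starting at offset 3, and get_port()
        * then picks up 6275 just past the address's trailing delimiter, so
        * cmd->u.tcp.port ends up as htons(6275).
        */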
      
      /* Returns 0, or length of numbers: |||6446| */
      static int try_epsv_response(const char *data, size_t dlen,
                                   struct nf_conntrack_man *cmd, char term,
                                   unsigned int *offset)
      {
              char delim;
      
              /* Three delimiters. */
              if (dlen <= 3) return 0;
              delim = data[0];
              if (isdigit(delim) || delim < 33 || delim > 126 ||
                  data[1] != delim || data[2] != delim)
                      return 0;
      
              return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
      }
      
       /* Return 1 for match, 0 for no match (accept), -1 for partial match. */
      static int find_pattern(const char *data, size_t dlen,
                              const char *pattern, size_t plen,
                              char skip, char term,
                              unsigned int *numoff,
                              unsigned int *numlen,
                              struct nf_conntrack_man *cmd,
                              int (*getnum)(const char *, size_t,
                                            struct nf_conntrack_man *, char,
                                            unsigned int *))
      {
              size_t i = plen;
      
              pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
              if (dlen == 0)
                      return 0;
      
              if (dlen <= plen) {
                      /* Short packet: try for partial? */
                      if (strncasecmp(data, pattern, dlen) == 0)
                              return -1;
                      else return 0;
              }
      
              if (strncasecmp(data, pattern, plen) != 0) {
      #if 0
                      size_t i;
      
                      pr_debug("ftp: string mismatch\n");
                      for (i = 0; i < plen; i++) {
                              pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
                                       i, data[i], data[i],
                                       pattern[i], pattern[i]);
                      }
      #endif
                      return 0;
              }
      
              pr_debug("Pattern matches!\n");
              /* Now we've found the constant string, try to skip
                 to the 'skip' character */
              if (skip) {
                      for (i = plen; data[i] != skip; i++)
                              if (i == dlen - 1) return -1;
      
                      /* Skip over the last character */
                      i++;
              }
      
              pr_debug("Skipped up to 0x%hhx delimiter!\n", skip);
      
              *numoff = i;
              *numlen = getnum(data + i, dlen - i, cmd, term, numoff);
              if (!*numlen)
                      return -1;
      
              pr_debug("Match succeeded!\n");
              return 1;
      }
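
       /*
        * Illustrative note: a partial match means the data ended in the
        * middle of a command, such as "PORT 192,168" at the tail of a
        * segment.  help() below turns that into NF_DROP, presumably so the
        * peer retransmits the command in one piece and the helper never has
        * to reassemble across packets.
        */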
      
      /* Look up to see if we're just after a \n. */
      static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir)
      {
              unsigned int i;
      
              for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
                      if (info->seq_aft_nl[dir][i] == seq)
                              return 1;
              return 0;
      }
      
      /* We don't update if it's older than what we have. */
      static void update_nl_seq(struct nf_conn *ct, u32 nl_seq,
                                struct nf_ct_ftp_master *info, int dir,
                                struct sk_buff *skb)
      {
              unsigned int i, oldest;
      
              /* Look for oldest: if we find exact match, we're done. */
              for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
                      if (info->seq_aft_nl[dir][i] == nl_seq)
                              return;
              }
      
              if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
                      info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
              } else {
                      if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1]))
                              oldest = 0;
                      else
                              oldest = 1;
      
                      if (after(nl_seq, info->seq_aft_nl[dir][oldest]))
                              info->seq_aft_nl[dir][oldest] = nl_seq;
              }
      }
      
      static int help(struct sk_buff *skb,
                      unsigned int protoff,
                      struct nf_conn *ct,
                      enum ip_conntrack_info ctinfo)
      {
              unsigned int dataoff, datalen;
              const struct tcphdr *th;
              struct tcphdr _tcph;
              const char *fb_ptr;
              int ret;
              u32 seq;
              int dir = CTINFO2DIR(ctinfo);
              unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
    2         struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
              struct nf_conntrack_expect *exp;
              union nf_inet_addr *daddr;
    2         struct nf_conntrack_man cmd = {};
              unsigned int i;
              int found = 0, ends_in_nl;
              typeof(nf_nat_ftp_hook) nf_nat_ftp;
      
              /* Until there's been traffic both ways, don't look in packets. */
    2         if (ctinfo != IP_CT_ESTABLISHED &&
                  ctinfo != IP_CT_ESTABLISHED_REPLY) {
                      pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
    2                 return NF_ACCEPT;
              }
      
              th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
              if (th == NULL)
                      return NF_ACCEPT;
      
              dataoff = protoff + th->doff * 4;
              /* No data? */
              if (dataoff >= skb->len) {
                      pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
                               skb->len);
                      return NF_ACCEPT;
              }
              datalen = skb->len - dataoff;
      
              spin_lock_bh(&nf_ftp_lock);
              fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
              BUG_ON(fb_ptr == NULL);
      
              ends_in_nl = (fb_ptr[datalen - 1] == '\n');
              seq = ntohl(th->seq) + datalen;
      
              /* Look up to see if we're just after a \n. */
              if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
                       /* We're picking this up; clear the flag and let it continue */
                      if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) {
                              ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP;
                              goto skip_nl_seq;
                      }
      
                      /* Now if this ends in \n, update ftp info. */
                      pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
                               ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
                               ct_ftp_info->seq_aft_nl[dir][0],
                               ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
                               ct_ftp_info->seq_aft_nl[dir][1]);
                      ret = NF_ACCEPT;
                      goto out_update_nl;
              }
      
      skip_nl_seq:
              /* Initialize IP/IPv6 addr to expected address (it's not mentioned
                 in EPSV responses) */
              cmd.l3num = nf_ct_l3num(ct);
              memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
                     sizeof(cmd.u3.all));
      
              for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
                      found = find_pattern(fb_ptr, datalen,
                                           search[dir][i].pattern,
                                           search[dir][i].plen,
                                           search[dir][i].skip,
                                           search[dir][i].term,
                                           &matchoff, &matchlen,
                                           &cmd,
                                           search[dir][i].getnum);
                      if (found) break;
              }
              if (found == -1) {
                      /* We don't usually drop packets.  After all, this is
                         connection tracking, not packet filtering.
                         However, it is necessary for accurate tracking in
                         this case. */
                      nf_ct_helper_log(skb, ct, "partial matching of `%s'",
                                       search[dir][i].pattern);
                      ret = NF_DROP;
                      goto out;
              } else if (found == 0) { /* No match */
                      ret = NF_ACCEPT;
                      goto out_update_nl;
              }
      
              pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
                       matchlen, fb_ptr + matchoff,
                       matchlen, ntohl(th->seq) + matchoff);
      
              exp = nf_ct_expect_alloc(ct);
              if (exp == NULL) {
                      nf_ct_helper_log(skb, ct, "cannot alloc expectation");
                      ret = NF_DROP;
                      goto out;
              }
      
              /* We refer to the reverse direction ("!dir") tuples here,
               * because we're expecting something in the other direction.
               * Doesn't matter unless NAT is happening.  */
              daddr = &ct->tuplehash[!dir].tuple.dst.u3;
      
              /* Update the ftp info */
              if ((cmd.l3num == nf_ct_l3num(ct)) &&
                  memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
                           sizeof(cmd.u3.all))) {
                      /* Enrico Scholz's passive FTP to partially RNAT'd ftp
                         server: it really wants us to connect to a
                         different IP address.  Simply don't record it for
                         NAT. */
                      if (cmd.l3num == PF_INET) {
                              pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n",
                                       &cmd.u3.ip,
                                       &ct->tuplehash[dir].tuple.src.u3.ip);
                      } else {
                              pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
                                       cmd.u3.ip6,
                                       ct->tuplehash[dir].tuple.src.u3.ip6);
                      }
      
                      /* Thanks to Cristiano Lincoln Mattos
                         <lincoln@cesar.org.br> for reporting this potential
                         problem (DMZ machines opening holes to internal
                         networks, or the packet filter itself). */
                      if (!loose) {
                              ret = NF_ACCEPT;
                              goto out_put_expect;
                      }
                      daddr = &cmd.u3;
              }
      
              nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
                                &ct->tuplehash[!dir].tuple.src.u3, daddr,
                                IPPROTO_TCP, NULL, &cmd.u.tcp.port);
      
              /* Now, NAT might want to mangle the packet, and register the
               * (possibly changed) expectation itself. */
              nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
              if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
                      ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
                                       protoff, matchoff, matchlen, exp);
              else {
                      /* Can't expect this?  Best to drop packet now. */
                      if (nf_ct_expect_related(exp) != 0) {
                              nf_ct_helper_log(skb, ct, "cannot add expectation");
                              ret = NF_DROP;
                      } else
                              ret = NF_ACCEPT;
              }
      
      out_put_expect:
              nf_ct_expect_put(exp);
      
      out_update_nl:
              /* Now if this ends in \n, update ftp info.  Seq may have been
               * adjusted by NAT code. */
              if (ends_in_nl)
                      update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
       out:
              spin_unlock_bh(&nf_ftp_lock);
              return ret;
      }
      
      static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
      {
              struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
      
               /* This conntrack has been injected from user-space; always pick up
               * sequence tracking. Otherwise, the first FTP command after the
               * failover breaks.
               */
              ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP;
              ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP;
              return 0;
      }
      
      static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
      
      static const struct nf_conntrack_expect_policy ftp_exp_policy = {
              .max_expected        = 1,
              .timeout        = 5 * 60,
      };
      
      /* don't make this __exit, since it's called from __init ! */
      static void nf_conntrack_ftp_fini(void)
      {
              int i, j;
              for (i = 0; i < ports_c; i++) {
                      for (j = 0; j < 2; j++) {
                              if (ftp[i][j].me == NULL)
                                      continue;
      
                              pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
                                       "port: %d\n",
                                       ftp[i][j].tuple.src.l3num, ports[i]);
                              nf_conntrack_helper_unregister(&ftp[i][j]);
                      }
              }
      
              kfree(ftp_buffer);
      }
      
      static int __init nf_conntrack_ftp_init(void)
      {
              int i, j = -1, ret = 0;
      
              ftp_buffer = kmalloc(65536, GFP_KERNEL);
              if (!ftp_buffer)
                      return -ENOMEM;
      
              if (ports_c == 0)
                      ports[ports_c++] = FTP_PORT;
      
              /* FIXME should be configurable whether IPv4 and IPv6 FTP connections
                       are tracked or not - YK */
              for (i = 0; i < ports_c; i++) {
                      ftp[i][0].tuple.src.l3num = PF_INET;
                      ftp[i][1].tuple.src.l3num = PF_INET6;
                      for (j = 0; j < 2; j++) {
                              ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
                              ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
                              ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
                              ftp[i][j].expect_policy = &ftp_exp_policy;
                              ftp[i][j].me = THIS_MODULE;
                              ftp[i][j].help = help;
                              ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
                              if (ports[i] == FTP_PORT)
                                      sprintf(ftp[i][j].name, "ftp");
                              else
                                      sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
      
                              pr_debug("nf_ct_ftp: registering helper for pf: %d "
                                       "port: %d\n",
                                       ftp[i][j].tuple.src.l3num, ports[i]);
                              ret = nf_conntrack_helper_register(&ftp[i][j]);
                              if (ret) {
                                      printk(KERN_ERR "nf_ct_ftp: failed to register"
                                             " helper for pf: %d port: %d\n",
                                              ftp[i][j].tuple.src.l3num, ports[i]);
                                      nf_conntrack_ftp_fini();
                                      return ret;
                              }
                      }
              }
      
              return 0;
      }
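
       /*
        * Usage note (assuming the usual module parameters declared earlier in
        * this file): ports[] is normally filled from a "ports=" module
        * parameter, so tracking FTP on an additional port would look like
        *
        *      modprobe nf_conntrack_ftp ports=21,2121
        *
        * and the "loose" knob referenced in help() controls whether PASV
        * replies pointing at a third-party address are used for the
        * expectation.
        */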
      
      module_init(nf_conntrack_ftp_init);
      module_exit(nf_conntrack_ftp_fini);
      /*
       * Lockless get_user_pages_fast for x86
       *
       * Copyright (C) 2008 Nick Piggin
       * Copyright (C) 2008 Novell Inc.
       */
      #include <linux/sched.h>
      #include <linux/mm.h>
      #include <linux/vmstat.h>
      #include <linux/highmem.h>
      #include <linux/swap.h>
      
      #include <asm/pgtable.h>
      
      static inline pte_t gup_get_pte(pte_t *ptep)
      {
      #ifndef CONFIG_X86_PAE
  153         return READ_ONCE(*ptep);
      #else
              /*
               * With get_user_pages_fast, we walk down the pagetables without taking
               * any locks.  For this we would like to load the pointers atomically,
               * but that is not possible (without expensive cmpxchg8b) on PAE.  What
               * we do have is the guarantee that a pte will only either go from not
               * present to present, or present to not present or both -- it will not
               * switch to a completely different present page without a TLB flush in
               * between; something that we are blocking by holding interrupts off.
               *
               * Setting ptes from not present to present goes:
               * ptep->pte_high = h;
               * smp_wmb();
               * ptep->pte_low = l;
               *
               * And present to not present goes:
               * ptep->pte_low = 0;
               * smp_wmb();
               * ptep->pte_high = 0;
               *
               * We must ensure here that the load of pte_low sees l iff pte_high
               * sees h. We load pte_high *after* loading pte_low, which ensures we
               * don't see an older value of pte_high.  *Then* we recheck pte_low,
               * which ensures that we haven't picked up a changed pte high. We might
               * have got rubbish values from pte_low and pte_high, but we are
               * guaranteed that pte_low will not have the present bit set *unless*
               * it is 'l'. And get_user_pages_fast only operates on present ptes, so
               * we're safe.
               *
               * gup_get_pte should not be used or copied outside gup.c without being
               * very careful -- it does not atomically load the pte or anything that
               * is likely to be useful for you.
               */
              pte_t pte;
      
      retry:
              pte.pte_low = ptep->pte_low;
              smp_rmb();
              pte.pte_high = ptep->pte_high;
              smp_rmb();
              if (unlikely(pte.pte_low != ptep->pte_low))
                      goto retry;
      
              return pte;
      #endif
      }
      
      /*
        * The performance-critical leaf functions are made noinline; otherwise gcc
        * inlines everything into a single function, which results in too much
       * register pressure.
       */
      static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                      unsigned long end, int write, struct page **pages, int *nr)
      {
              unsigned long mask;
              pte_t *ptep;
      
              mask = _PAGE_PRESENT|_PAGE_USER;
  153         if (write)
                      mask |= _PAGE_RW;
      
  153         ptep = pte_offset_map(&pmd, addr);
              do {
  153                 pte_t pte = gup_get_pte(ptep);
                      struct page *page;
      
                      /* Similar to the PMD case, NUMA hinting must take slow path */
                      if (pte_protnone(pte)) {
                              pte_unmap(ptep);
                              return 0;
                      }
      
                      if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                              pte_unmap(ptep);
                              return 0;
                      }
  146                 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  146                 page = pte_page(pte);
  146                 get_page(page);
  146                 SetPageReferenced(page);
                      pages[*nr] = page;
                      (*nr)++;
      
              } while (ptep++, addr += PAGE_SIZE, addr != end);
              pte_unmap(ptep - 1);
      
  153         return 1;
      }
      
      static inline void get_head_page_multiple(struct page *page, int nr)
      {
              VM_BUG_ON_PAGE(page != compound_head(page), page);
              VM_BUG_ON_PAGE(page_count(page) == 0, page);
              atomic_add(nr, &page->_count);
              SetPageReferenced(page);
      }
      
      static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                      unsigned long end, int write, struct page **pages, int *nr)
      {
              unsigned long mask;
              struct page *head, *page;
              int refs;
      
              mask = _PAGE_PRESENT|_PAGE_USER;
              if (write)
                      mask |= _PAGE_RW;
              if ((pmd_flags(pmd) & mask) != mask)
                      return 0;
              /* hugepages are never "special" */
              VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
              VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
      
              refs = 0;
              head = pmd_page(pmd);
              page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
              do {
                      VM_BUG_ON_PAGE(compound_head(page) != head, page);
                      pages[*nr] = page;
                      if (PageTail(page))
                              get_huge_page_tail(page);
                      (*nr)++;
                      page++;
                      refs++;
              } while (addr += PAGE_SIZE, addr != end);
              get_head_page_multiple(head, refs);
      
              return 1;
      }
      
      static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                      int write, struct page **pages, int *nr)
      {
              unsigned long next;
              pmd_t *pmdp;
      
  160         pmdp = pmd_offset(&pud, addr);
              do {
  160                 pmd_t pmd = *pmdp;
      
                      next = pmd_addr_end(addr, end);
                      /*
                       * The pmd_trans_splitting() check below explains why
                       * pmdp_splitting_flush has to flush the tlb, to stop
                       * this gup-fast code from running while we set the
                       * splitting bit in the pmd. Returning zero will take
                       * the slow path that will call wait_split_huge_page()
                       * if the pmd is still in splitting state. gup-fast
                       * can't because it has irq disabled and
                       * wait_split_huge_page() would never return as the
                       * tlb flush IPI wouldn't run.
                       */
  160                 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                              return 0;
  160                 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
                              /*
                               * NUMA hinting faults need to be handled in the GUP
                               * slowpath for accounting purposes and so that they
                               * can be serialised against THP migration.
                               */
                              if (pmd_protnone(pmd))
                                      return 0;
                              if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
                                      return 0;
                      } else {
  153                         if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                                      return 0;
                      }
   91         } while (pmdp++, addr = next, addr != end);
      
              return 1;
      }
      
      static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
                      unsigned long end, int write, struct page **pages, int *nr)
      {
              unsigned long mask;
              struct page *head, *page;
              int refs;
      
              mask = _PAGE_PRESENT|_PAGE_USER;
              if (write)
                      mask |= _PAGE_RW;
              if ((pud_flags(pud) & mask) != mask)
                      return 0;
              /* hugepages are never "special" */
              VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
              VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
      
              refs = 0;
              head = pud_page(pud);
              page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
              do {
                      VM_BUG_ON_PAGE(compound_head(page) != head, page);
                      pages[*nr] = page;
                      if (PageTail(page))
                              get_huge_page_tail(page);
                      (*nr)++;
                      page++;
                      refs++;
              } while (addr += PAGE_SIZE, addr != end);
              get_head_page_multiple(head, refs);
      
              return 1;
      }
      
      static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                              int write, struct page **pages, int *nr)
      {
              unsigned long next;
              pud_t *pudp;
      
  160         pudp = pud_offset(&pgd, addr);
              do {
  160                 pud_t pud = *pudp;
      
                      next = pud_addr_end(addr, end);
  160                 if (pud_none(pud))
                              return 0;
  160                 if (unlikely(pud_large(pud))) {
                              if (!gup_huge_pud(pud, addr, next, write, pages, nr))
                                      return 0;
                      } else {
  160                         if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                                      return 0;
                      }
   91         } while (pudp++, addr = next, addr != end);
      
              return 1;
      }
      
      /*
        * Like get_user_pages_fast() except it is IRQ-safe, in that it won't fall
       * back to the regular GUP.
       */
      int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                                struct page **pages)
      {
              struct mm_struct *mm = current->mm;
              unsigned long addr, len, end;
              unsigned long next;
              unsigned long flags;
              pgd_t *pgdp;
              int nr = 0;
      
              start &= PAGE_MASK;
              addr = start;
              len = (unsigned long) nr_pages << PAGE_SHIFT;
              end = start + len;
              if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                              (void __user *)start, len)))
                      return 0;
      
              /*
        * XXX: batch / limit 'nr' to avoid large irq off latency; this needs
        * some instrumenting to determine the common sizes used by
               * important workloads (eg. DB2), and whether limiting the batch size
               * will decrease performance.
               *
               * It seems like we're in the clear for the moment. Direct-IO is
               * the main guy that batches up lots of get_user_pages, and even
               * they are limited to 64-at-a-time which is not so many.
               */
              /*
               * This doesn't prevent pagetable teardown, but does prevent
               * the pagetables and pages from being freed on x86.
               *
               * So long as we atomically load page table pointers versus teardown
               * (which we do on x86, with the above PAE exception), we can follow the
        * address down to the page and take a ref on it.
               */
              local_irq_save(flags);
              pgdp = pgd_offset(mm, addr);
              do {
                      pgd_t pgd = *pgdp;
      
                      next = pgd_addr_end(addr, end);
                      if (pgd_none(pgd))
                              break;
                      if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                              break;
              } while (pgdp++, addr = next, addr != end);
              local_irq_restore(flags);
      
              return nr;
      }
      
      /**
       * get_user_pages_fast() - pin user pages in memory
       * @start:        starting user address
       * @nr_pages:        number of pages from start to pin
       * @write:        whether pages will be written to
       * @pages:        array that receives pointers to the pages pinned.
       *                 Should be at least nr_pages long.
       *
       * Attempt to pin user pages in memory without taking mm->mmap_sem.
       * If not successful, it will fall back to taking the lock and
       * calling get_user_pages().
       *
       * Returns number of pages pinned. This may be fewer than the number
       * requested. If nr_pages is 0 or negative, returns 0. If no pages
       * were pinned, returns -errno.
       */
      int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                              struct page **pages)
      {
  160         struct mm_struct *mm = current->mm;
              unsigned long addr, len, end;
              unsigned long next;
              pgd_t *pgdp;
              int nr = 0;
      
              start &= PAGE_MASK;
              addr = start;
              len = (unsigned long) nr_pages << PAGE_SHIFT;
      
              end = start + len;
              if (end < start)
                      goto slow_irqon;
      
      #ifdef CONFIG_X86_64
  160         if (end >> __VIRTUAL_MASK_SHIFT)
                      goto slow_irqon;
      #endif
      
              /*
        * XXX: batch / limit 'nr' to avoid large irq off latency; this needs
        * some instrumenting to determine the common sizes used by
               * important workloads (eg. DB2), and whether limiting the batch size
               * will decrease performance.
               *
               * It seems like we're in the clear for the moment. Direct-IO is
               * the main guy that batches up lots of get_user_pages, and even
               * they are limited to 64-at-a-time which is not so many.
               */
              /*
               * This doesn't prevent pagetable teardown, but does prevent
               * the pagetables and pages from being freed on x86.
               *
               * So long as we atomically load page table pointers versus teardown
               * (which we do on x86, with the above PAE exception), we can follow the
        * address down to the page and take a ref on it.
               */
  160         local_irq_disable();
              pgdp = pgd_offset(mm, addr);
              do {
  160                 pgd_t pgd = *pgdp;
      
                      next = pgd_addr_end(addr, end);
  160                 if (pgd_none(pgd))
                              goto slow;
  160                 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                              goto slow;
   91         } while (pgdp++, addr = next, addr != end);
   91         local_irq_enable();
      
              VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
              return nr;
      
              {
                      int ret;
      
      slow:
   91                 local_irq_enable();
      slow_irqon:
                      /* Try to get the remaining pages with get_user_pages */
                      start += nr << PAGE_SHIFT;
                      pages += nr;
      
   91                 ret = get_user_pages_unlocked(current, mm, start,
                                                    (end - start) >> PAGE_SHIFT,
                                                    pages, write ? FOLL_WRITE : 0);
      
                      /* Have to be a bit careful with return values */
                      if (nr > 0) {
   68                         if (ret < 0)
                                      ret = nr;
                              else
  160                                 ret += nr;
                      }
      
                      return ret;
              }
      }
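
       /*
        * Caller sketch (illustrative only, compiled out; the function name is
        * hypothetical): pin a small user buffer, use the pages, then drop the
        * references.  The only interfaces assumed are get_user_pages_fast()
        * above and put_page().  Note the careful handling of the return
        * value: it may be fewer pages than requested, or -errno if nothing
        * was pinned.
        */
       #if 0
       static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
                                          struct page **pages)
       {
               int i, pinned;

               pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
               if (pinned < 0)
                       return pinned;  /* nothing pinned: -errno */

               /* ... access pages[0..pinned-1], e.g. via kmap()/kunmap() ... */

               for (i = 0; i < pinned; i++)
                       put_page(pages[i]);

               return pinned == nr_pages ? 0 : -EFAULT;  /* partial pin */
       }
       #endif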
      /*
       *  linux/fs/ext4/dir.c
       *
       * Copyright (C) 1992, 1993, 1994, 1995
       * Remy Card (card@masi.ibp.fr)
       * Laboratoire MASI - Institut Blaise Pascal
       * Universite Pierre et Marie Curie (Paris VI)
       *
       *  from
       *
       *  linux/fs/minix/dir.c
       *
       *  Copyright (C) 1991, 1992  Linus Torvalds
       *
       *  ext4 directory handling functions
       *
       *  Big-endian to little-endian byte-swapping/bitmaps by
       *        David S. Miller (davem@caip.rutgers.edu), 1995
       *
       * Hash Tree Directory indexing (c) 2001  Daniel Phillips
       *
       */
      
      #include <linux/fs.h>
      #include <linux/buffer_head.h>
      #include <linux/slab.h>
      #include "ext4.h"
      #include "xattr.h"
      
      static int ext4_dx_readdir(struct file *, struct dir_context *);
      
      /**
       * Check if the given dir-inode refers to an htree-indexed directory
       * (or a directory which could potentially get converted to use htree
       * indexing).
       *
       * Return 1 if it is a dx dir, 0 if not
       */
      static int is_dx_dir(struct inode *inode)
      {
    9         struct super_block *sb = inode->i_sb;
      
              if (ext4_has_feature_dir_index(inode->i_sb) &&
    9             ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
    9              ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
                   ext4_has_inline_data(inode)))
                      return 1;
      
    9         return 0;
      }
      
      /*
       * Return 0 if the directory entry is OK, and 1 if there is a problem
       *
       * Note: this is the opposite of what ext2 and ext3 historically returned...
       *
       * bh passed here can be an inode block or a dir data block, depending
       * on the inode inline data flag.
       */
      int __ext4_check_dir_entry(const char *function, unsigned int line,
                                 struct inode *dir, struct file *filp,
                                 struct ext4_dir_entry_2 *de,
                                 struct buffer_head *bh, char *buf, int size,
                                 unsigned int offset)
      {
              const char *error_msg = NULL;
   76         const int rlen = ext4_rec_len_from_disk(de->rec_len,
   76                                                 dir->i_sb->s_blocksize);
      
              if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
                      error_msg = "rec_len is smaller than minimal";
   76         else if (unlikely(rlen % 4 != 0))
                      error_msg = "rec_len % 4 != 0";
   76         else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
                      error_msg = "rec_len is too small for name_len";
   76         else if (unlikely(((char *) de - buf) + rlen > size))
                      error_msg = "directory entry overrun";
   76         else if (unlikely(le32_to_cpu(de->inode) >
                              le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                      error_msg = "inode out of bounds";
              else
                      return 0;
      
              if (filp)
                      ext4_error_file(filp, function, line, bh->b_blocknr,
                                      "bad entry in directory: %s - offset=%u, "
                                      "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                      error_msg, offset, le32_to_cpu(de->inode),
                                      rlen, de->name_len, size);
              else
                      ext4_error_inode(dir, function, line, bh->b_blocknr,
                                      "bad entry in directory: %s - offset=%u, "
                                      "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                       error_msg, offset, le32_to_cpu(de->inode),
                                       rlen, de->name_len, size);
      
              return 1;
      }
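
       /*
        * Worked example (illustrative; assumes the usual 8-byte fixed dirent
        * header and 4-byte alignment behind EXT4_DIR_REC_LEN()): the smallest
        * legal entry, EXT4_DIR_REC_LEN(1), is (8 + 1 + 3) & ~3 = 12 bytes, so
        * any rec_len below 12 trips "rec_len is smaller than minimal", while
        * an entry claiming rec_len 12 for a 5-character name (which needs
        * (8 + 5 + 3) & ~3 = 16 bytes) trips "rec_len is too small for
        * name_len".
        */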
      
      static int ext4_readdir(struct file *file, struct dir_context *ctx)
      {
              unsigned int offset;
              int i;
              struct ext4_dir_entry_2 *de;
              int err;
    6         struct inode *inode = file_inode(file);
              struct super_block *sb = inode->i_sb;
              struct buffer_head *bh = NULL;
              int dir_has_error = 0;
              struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
      
              if (ext4_encrypted_inode(inode)) {
                      err = ext4_get_encryption_info(inode);
                      if (err && err != -ENOKEY)
                              return err;
              }
      
              if (is_dx_dir(inode)) {
    6                 err = ext4_dx_readdir(file, ctx);
                      if (err != ERR_BAD_DX_DIR) {
                              return err;
                      }
                      /*
                       * We don't set the inode dirty flag since it's not
                       * critical that it get flushed back to the disk.
                       */
                      ext4_clear_inode_flag(file_inode(file),
                                            EXT4_INODE_INDEX);
              }
      
              if (ext4_has_inline_data(inode)) {
                      int has_inline_data = 1;
                      err = ext4_read_inline_dir(file, ctx,
                                                 &has_inline_data);
                      if (has_inline_data)
                              return err;
              }
      
              if (ext4_encrypted_inode(inode)) {
                      err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN,
                                                           &fname_crypto_str);
                      if (err < 0)
                              return err;
              }
      
              offset = ctx->pos & (sb->s_blocksize - 1);
      
              while (ctx->pos < inode->i_size) {
                      struct ext4_map_blocks map;
      
                      map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                      map.m_len = 1;
                      err = ext4_map_blocks(NULL, inode, &map, 0);
                      if (err > 0) {
                              pgoff_t index = map.m_pblk >>
                                              (PAGE_CACHE_SHIFT - inode->i_blkbits);
                              if (!ra_has_index(&file->f_ra, index))
                                      page_cache_sync_readahead(
                                              sb->s_bdev->bd_inode->i_mapping,
                                              &file->f_ra, file,
                                              index, 1);
                              file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
                              bh = ext4_bread(NULL, inode, map.m_lblk, 0);
                              if (IS_ERR(bh))
    1                                 return PTR_ERR(bh);
                      }
      
                      if (!bh) {
                              if (!dir_has_error) {
                                      EXT4_ERROR_FILE(file, 0,
                                                      "directory contains a "
                                                      "hole at offset %llu",
                                                 (unsigned long long) ctx->pos);
                                      dir_has_error = 1;
                              }
                              /* corrupt size?  Maybe no more blocks to read */
                              if (ctx->pos > inode->i_blocks << 9)
                                      break;
                              ctx->pos += sb->s_blocksize - offset;
                              continue;
                      }
      
                      /* Check the checksum */
                      if (!buffer_verified(bh) &&
                          !ext4_dirent_csum_verify(inode,
                                      (struct ext4_dir_entry *)bh->b_data)) {
                              EXT4_ERROR_FILE(file, 0, "directory fails checksum "
                                              "at offset %llu",
                                              (unsigned long long)ctx->pos);
                              ctx->pos += sb->s_blocksize - offset;
                              brelse(bh);
                              bh = NULL;
                              continue;
                      }
                      set_buffer_verified(bh);
      
                      /* If the dir block has changed since the last call to
                       * readdir(2), then we might be pointing to an invalid
                       * dirent right now.  Scan from the start of the block
                       * to make sure. */
                      if (file->f_version != inode->i_version) {
                              for (i = 0; i < sb->s_blocksize && i < offset; ) {
                                      de = (struct ext4_dir_entry_2 *)
                                              (bh->b_data + i);
                                      /* It's too expensive to do a full
                                       * dirent test each time round this
                                       * loop, but we do have to test at
                                       * least that it is non-zero.  A
                                       * failure will be detected in the
                                       * dirent test below. */
                                      if (ext4_rec_len_from_disk(de->rec_len,
                                              sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                              break;
                                      i += ext4_rec_len_from_disk(de->rec_len,
                                                                  sb->s_blocksize);
                              }
                              offset = i;
                              ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
                                      | offset;
                              file->f_version = inode->i_version;
                      }
      
                      while (ctx->pos < inode->i_size
                             && offset < sb->s_blocksize) {
                              de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
                              if (ext4_check_dir_entry(inode, file, de, bh,
                                                       bh->b_data, bh->b_size,
                                                       offset)) {
                                      /*
                                       * On error, skip to the next block
                                       */
                                      ctx->pos = (ctx->pos |
                                                      (sb->s_blocksize - 1)) + 1;
                                      break;
                              }
                              offset += ext4_rec_len_from_disk(de->rec_len,
                                              sb->s_blocksize);
                              if (le32_to_cpu(de->inode)) {
                                      if (!ext4_encrypted_inode(inode)) {
                                              if (!dir_emit(ctx, de->name,
                                                  de->name_len,
                                                  le32_to_cpu(de->inode),
                                                  get_dtype(sb, de->file_type)))
                                                      goto done;
                                      } else {
                                              int save_len = fname_crypto_str.len;
      
                                              /* Directory is encrypted */
                                              err = ext4_fname_disk_to_usr(inode,
                                                      NULL, de, &fname_crypto_str);
                                              fname_crypto_str.len = save_len;
                                              if (err < 0)
                                                      goto errout;
                                              if (!dir_emit(ctx,
                                                  fname_crypto_str.name, err,
                                                  le32_to_cpu(de->inode),
                                                  get_dtype(sb, de->file_type)))
                                                      goto done;
                                      }
                              }
                              ctx->pos += ext4_rec_len_from_disk(de->rec_len,
                                                      sb->s_blocksize);
                      }
                      if ((ctx->pos < inode->i_size) && !dir_relax(inode))
                              goto done;
                      brelse(bh);
                      bh = NULL;
                      offset = 0;
              }
      done:
              err = 0;
      errout:
      #ifdef CONFIG_EXT4_FS_ENCRYPTION
              ext4_fname_crypto_free_buffer(&fname_crypto_str);
      #endif
              brelse(bh);
    6         return err;
      }
      
      static inline int is_32bit_api(void)
      {
      #ifdef CONFIG_COMPAT
    9         return is_compat_task();
      #else
              return (BITS_PER_LONG == 32);
      #endif
      }
      
      /*
       * These functions convert from the major/minor hash to an f_pos
       * value for dx directories
       *
       * Upper layer (for example NFS) should specify FMODE_32BITHASH or
       * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
        * directly on both 32-bit and 64-bit nodes; in that case, neither
       * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
       */
      static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
      {
              if ((filp->f_mode & FMODE_32BITHASH) ||
    6             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
    6                 return major >> 1;
              else
                      return ((__u64)(major >> 1) << 32) | (__u64)minor;
      }
      
      static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
      {
    5         if ((filp->f_mode & FMODE_32BITHASH) ||
    5             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
    5                 return (pos << 1) & 0xffffffff;
              else
                      return ((pos >> 32) << 1) & 0xffffffff;
      }
      
      static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
      {
              if ((filp->f_mode & FMODE_32BITHASH) ||
    5             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return 0;
              else
                      return pos & 0xffffffff;
      }
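
       /*
        * Worked example (illustrative values): in 64-bit hash mode,
        * hash2pos(filp, 0xdeadbeef, 0x12345678) yields 0x6f56df7712345678;
        * pos2min_hash() recovers 0x12345678 and pos2maj_hash() recovers
        * 0xdeadbeee, since the low bit of the major hash is not representable
        * in the f_pos encoding.  In 32-bit hash mode only major >> 1 survives
        * and the minor hash is reported as 0.
        */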
      
      /*
       * Return 32- or 64-bit end-of-file for dx directories
       */
      static inline loff_t ext4_get_htree_eof(struct file *filp)
      {
    6         if ((filp->f_mode & FMODE_32BITHASH) ||
    9             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return EXT4_HTREE_EOF_32BIT;
              else
                      return EXT4_HTREE_EOF_64BIT;
      }
      
      
      /*
       * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
       * directories, where the "offset" is in terms of the filename hash
       * value instead of the byte offset.
       *
       * Because we may return a 64-bit hash that is well beyond offset limits,
       * we need to pass the max hash as the maximum allowable offset in
       * the htree directory case.
       *
       * For non-htree, ext4_llseek already chooses the proper max offset.
       */
      static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
      {
    4         struct inode *inode = file->f_mapping->host;
              int dx_dir = is_dx_dir(inode);
    4         loff_t htree_max = ext4_get_htree_eof(file);
      
    4         if (likely(dx_dir))
    4                 return generic_file_llseek_size(file, offset, whence,
                                                          htree_max, htree_max);
              else
                      return ext4_llseek(file, offset, whence);
      }
      
      /*
       * This structure holds the nodes of the red-black tree used to store
       * the directory entry in hash order.
       */
      struct fname {
              __u32                hash;
              __u32                minor_hash;
              struct rb_node        rb_hash;
              struct fname        *next;
              __u32                inode;
              __u8                name_len;
              __u8                file_type;
              char                name[0];
      };
      
      /*
        * This function implements a non-recursive way of freeing all of the
       * nodes in the red-black tree.
       */
      static void free_rb_tree_fname(struct rb_root *root)
      {
              struct fname *fname, *next;
      
    5         rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
                      while (fname) {
                              struct fname *old = fname;
    3                         fname = fname->next;
                              kfree(old);
                      }
      
    5         *root = RB_ROOT;
      }
      
      
      static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
                                                                 loff_t pos)
      {
              struct dir_private_info *p;
      
    5         p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
              if (!p)
                      return NULL;
    5         p->curr_hash = pos2maj_hash(filp, pos);
    5         p->curr_minor_hash = pos2min_hash(filp, pos);
              return p;
      }
      
      void ext4_htree_free_dir_info(struct dir_private_info *p)
      {
    1         free_rb_tree_fname(&p->root);
              kfree(p);
      }
      
      /*
       * Given a directory entry, enter it into the fname rb tree.
       *
       * When filename encryption is enabled, the dirent will hold the
        * encrypted filename, while the htree will hold the decrypted filename.
        * The decrypted filename is passed in via the ent_name parameter.
       */
      int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                   __u32 minor_hash,
                                  struct ext4_dir_entry_2 *dirent,
                                  struct ext4_str *ent_name)
      {
              struct rb_node **p, *parent = NULL;
              struct fname *fname, *new_fn;
              struct dir_private_info *info;
              int len;
      
    5         info = dir_file->private_data;
              p = &info->root.rb_node;
      
              /* Create and allocate the fname structure */
              len = sizeof(struct fname) + ent_name->len + 1;
              new_fn = kzalloc(len, GFP_KERNEL);
              if (!new_fn)
                      return -ENOMEM;
    5         new_fn->hash = hash;
              new_fn->minor_hash = minor_hash;
              new_fn->inode = le32_to_cpu(dirent->inode);
              new_fn->name_len = ent_name->len;
              new_fn->file_type = dirent->file_type;
              memcpy(new_fn->name, ent_name->name, ent_name->len);
              new_fn->name[ent_name->len] = 0;
      
    5         while (*p) {
                      parent = *p;
                      fname = rb_entry(parent, struct fname, rb_hash);
      
                      /*
                       * If the hash and minor hash match up, then we put
                       * them on a linked list.  This rarely happens...
                       */
    5                 if ((new_fn->hash == fname->hash) &&
                          (new_fn->minor_hash == fname->minor_hash)) {
                              new_fn->next = fname->next;
                              fname->next = new_fn;
    5                         return 0;
                      }
      
    5                 if (new_fn->hash < fname->hash)
                              p = &(*p)->rb_left;
    5                 else if (new_fn->hash > fname->hash)
                              p = &(*p)->rb_right;
                      else if (new_fn->minor_hash < fname->minor_hash)
    5                         p = &(*p)->rb_left;
                      else /* if (new_fn->minor_hash > fname->minor_hash) */
    5                         p = &(*p)->rb_right;
              }
      
    5         rb_link_node(&new_fn->rb_hash, parent, p);
              rb_insert_color(&new_fn->rb_hash, &info->root);
              return 0;
    5 }
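
       /*
        * Illustration: two directory entries whose names hash to the same
        * (hash, minor_hash) pair share one rb-tree node; the second one is
        * chained through fname->next instead of being inserted again.
        * call_filldir() below walks that chain, and free_rb_tree_fname()
        * above frees it with its inner while loop, since the postorder walk
        * only visits the tree nodes themselves.
        */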
      
      
      
      /*
        * This is a helper function for ext4_dx_readdir.  It calls dir_emit()
        * for all entries on the fname linked list.  (Normally there is only
       * one entry on the linked list, unless there are 62 bit hash collisions.)
       */
      static int call_filldir(struct file *file, struct dir_context *ctx,
                              struct fname *fname)
      {
    6         struct dir_private_info *info = file->private_data;
              struct inode *inode = file_inode(file);
              struct super_block *sb = inode->i_sb;
      
              if (!fname) {
                      ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
                               "called with null fname?!?", __func__, __LINE__,
                               inode->i_ino, current->comm);
                      return 0;
              }
    6         ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
    6         while (fname) {
                      if (!dir_emit(ctx, fname->name,
                                      fname->name_len,
    6                                 fname->inode,
    6                                 get_dtype(sb, fname->file_type))) {
    4                         info->extra_fname = fname;
                              return 1;
                      }
    5                 fname = fname->next;
              }
              return 0;
      }
      
      static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
      {
    6         struct dir_private_info *info = file->private_data;
              struct inode *inode = file_inode(file);
              struct fname *fname;
              int        ret;
      
              if (!info) {
    5                 info = ext4_htree_create_dir_info(file, ctx->pos);
                      if (!info)
                              return -ENOMEM;
                      file->private_data = info;
              }
      
    6         if (ctx->pos == ext4_get_htree_eof(file))
                      return 0;        /* EOF */
      
               /* Someone has messed with f_pos; reset the world */
    6         if (info->last_pos != ctx->pos) {
    1                 free_rb_tree_fname(&info->root);
                      info->curr_node = NULL;
                      info->extra_fname = NULL;
    1                 info->curr_hash = pos2maj_hash(file, ctx->pos);
    1                 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
              }
      
              /*
               * If there are any leftover names on the hash collision
               * chain, return them first.
               */
    6         if (info->extra_fname) {
    2                 if (call_filldir(file, ctx, info->extra_fname))
                              goto finished;
    5                 info->extra_fname = NULL;
                      goto next_node;
    5         } else if (!info->curr_node)
    5                 info->curr_node = rb_first(&info->root);
      
              while (1) {
                      /*
                       * Fill the rbtree if we have no more entries,
                       * or the inode has changed since we last read in the
                       * cached entries.
                       */
    1                 if ((!info->curr_node) ||
    5                     (file->f_version != inode->i_version)) {
    5                         info->curr_node = NULL;
                              free_rb_tree_fname(&info->root);
                              file->f_version = inode->i_version;
                              ret = ext4_htree_fill_tree(file, info->curr_hash,
                                                         info->curr_minor_hash,
                                                         &info->next_hash);
                              if (ret < 0)
                                      return ret;
    5                         if (ret == 0) {
    1                                 ctx->pos = ext4_get_htree_eof(file);
                                      break;
                              }
    5                         info->curr_node = rb_first(&info->root);
                      }
      
    5                 fname = rb_entry(info->curr_node, struct fname, rb_hash);
                      info->curr_hash = fname->hash;
                      info->curr_minor_hash = fname->minor_hash;
    3                 if (call_filldir(file, ctx, fname))
                              break;
              next_node:
    5                 info->curr_node = rb_next(info->curr_node);
                      if (info->curr_node) {
                              fname = rb_entry(info->curr_node, struct fname,
                                               rb_hash);
    5                         info->curr_hash = fname->hash;
                              info->curr_minor_hash = fname->minor_hash;
                      } else {
    3                         if (info->next_hash == ~0) {
    3                                 ctx->pos = ext4_get_htree_eof(file);
                                      break;
                              }
                              info->curr_hash = info->next_hash;
                              info->curr_minor_hash = 0;
                      }
              }
      finished:
    6         info->last_pos = ctx->pos;
              return 0;
      }
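
/*
 * For reference, a rough model of the f_pos encoding used above.  The real
 * helpers, hash2pos()/pos2maj_hash()/pos2min_hash(), are defined elsewhere
 * in this file and also handle 32-bit hash positions; for 64-bit positions
 * the packing is approximately:
 *
 *	pos   ~= ((__u64)(major_hash >> 1) << 32) | minor_hash;
 *	major ~= ((pos >> 32) << 1) & 0xffffffff;
 *	minor ~= pos & 0xffffffff;
 *
 * which is why a caller-modified f_pos is handled above by recomputing
 * curr_hash/curr_minor_hash from ctx->pos and rebuilding the rbtree.
 */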
      
      static int ext4_dir_open(struct inode * inode, struct file * filp)
    6 {
              if (ext4_encrypted_inode(inode))
                      return ext4_get_encryption_info(inode) ? -EACCES : 0;
              return 0;
      }
      
      static int ext4_release_dir(struct inode *inode, struct file *filp)
      {
    2         if (filp->private_data)
    1                 ext4_htree_free_dir_info(filp->private_data);
      
    2         return 0;
      }
      
      int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
                            int buf_size)
      {
              struct ext4_dir_entry_2 *de;
              int nlen, rlen;
              unsigned int offset = 0;
              char *top;
      
              de = (struct ext4_dir_entry_2 *)buf;
              top = buf + buf_size;
              while ((char *) de < top) {
                      if (ext4_check_dir_entry(dir, NULL, de, bh,
                                               buf, buf_size, offset))
                              return -EFSCORRUPTED;
                      nlen = EXT4_DIR_REC_LEN(de->name_len);
                      rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
                      de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
                      offset += rlen;
              }
              if ((char *) de > top)
                      return -EFSCORRUPTED;
      
              return 0;
      }
      
      const struct file_operations ext4_dir_operations = {
              .llseek                = ext4_dir_llseek,
              .read                = generic_read_dir,
              .iterate        = ext4_readdir,
              .unlocked_ioctl = ext4_ioctl,
      #ifdef CONFIG_COMPAT
              .compat_ioctl        = ext4_compat_ioctl,
      #endif
              .fsync                = ext4_sync_file,
              .open                = ext4_dir_open,
              .release        = ext4_release_dir,
      };
      #undef TRACE_SYSTEM
      #define TRACE_SYSTEM sock
      
      #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
      #define _TRACE_SOCK_H
      
      #include <net/sock.h>
      #include <linux/tracepoint.h>
      
    6 TRACE_EVENT(sock_rcvqueue_full,
      
              TP_PROTO(struct sock *sk, struct sk_buff *skb),
      
              TP_ARGS(sk, skb),
      
              TP_STRUCT__entry(
                      __field(int, rmem_alloc)
                      __field(unsigned int, truesize)
                      __field(int, sk_rcvbuf)
              ),
      
              TP_fast_assign(
                      __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                      __entry->truesize   = skb->truesize;
                      __entry->sk_rcvbuf  = sk->sk_rcvbuf;
              ),
      
              TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
                      __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
      );
      
      TRACE_EVENT(sock_exceed_buf_limit,
      
              TP_PROTO(struct sock *sk, struct proto *prot, long allocated),
      
              TP_ARGS(sk, prot, allocated),
      
              TP_STRUCT__entry(
                      __array(char, name, 32)
                      __field(long *, sysctl_mem)
                      __field(long, allocated)
                      __field(int, sysctl_rmem)
                      __field(int, rmem_alloc)
              ),
      
              TP_fast_assign(
                      strncpy(__entry->name, prot->name, 32);
                      __entry->sysctl_mem = prot->sysctl_mem;
                      __entry->allocated = allocated;
                      __entry->sysctl_rmem = prot->sysctl_rmem[0];
                      __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
              ),
      
              TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
                      "sysctl_rmem=%d rmem_alloc=%d",
                      __entry->name,
                      __entry->sysctl_mem[0],
                      __entry->sysctl_mem[1],
                      __entry->sysctl_mem[2],
                      __entry->allocated,
                      __entry->sysctl_rmem,
                      __entry->rmem_alloc)
      );
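
/*
 * A minimal usage sketch: each TRACE_EVENT() above generates a
 * trace_<name>() hook that the networking core can call when the
 * corresponding condition fires, roughly:
 *
 *	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 *	    (unsigned int)sk->sk_rcvbuf) {
 *		trace_sock_rcvqueue_full(sk, skb);
 *		... drop the skb ...
 *	}
 *
 * The arguments are captured into the TP_STRUCT__entry() fields by
 * TP_fast_assign() and rendered later by TP_printk().
 */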
      
      #endif /* _TRACE_SOCK_H */
      
      /* This part must be outside protection */
      #include <trace/define_trace.h>
      /*
       *  linux/fs/exec.c
       *
       *  Copyright (C) 1991, 1992  Linus Torvalds
       */
      
      /*
       * #!-checking implemented by tytso.
       */
      /*
       * Demand-loading implemented 01.12.91 - no need to read anything but
       * the header into memory. The inode of the executable is put into
       * "current->executable", and page faults do the actual loading. Clean.
       *
       * Once more I can proudly say that linux stood up to being changed: it
       * was less than 2 hours work to get demand-loading completely implemented.
       *
       * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
       * current->executable is only used by the procfs.  This allows a dispatch
       * table to check for several different types  of binary formats.  We keep
       * trying until we recognize the file or we run out of supported binary
       * formats.
       */
      
      #include <linux/slab.h>
      #include <linux/file.h>
      #include <linux/fdtable.h>
      #include <linux/mm.h>
      #include <linux/vmacache.h>
      #include <linux/stat.h>
      #include <linux/fcntl.h>
      #include <linux/swap.h>
      #include <linux/string.h>
      #include <linux/init.h>
      #include <linux/pagemap.h>
      #include <linux/perf_event.h>
      #include <linux/highmem.h>
      #include <linux/spinlock.h>
      #include <linux/key.h>
      #include <linux/personality.h>
      #include <linux/binfmts.h>
      #include <linux/utsname.h>
      #include <linux/pid_namespace.h>
      #include <linux/module.h>
      #include <linux/namei.h>
      #include <linux/mount.h>
      #include <linux/security.h>
      #include <linux/syscalls.h>
      #include <linux/tsacct_kern.h>
      #include <linux/cn_proc.h>
      #include <linux/audit.h>
      #include <linux/tracehook.h>
      #include <linux/kmod.h>
      #include <linux/fsnotify.h>
      #include <linux/fs_struct.h>
      #include <linux/pipe_fs_i.h>
      #include <linux/oom.h>
      #include <linux/compat.h>
      #include <linux/user_namespace.h>
      
      #include <asm/uaccess.h>
      #include <asm/mmu_context.h>
      #include <asm/tlb.h>
      
      #include <trace/events/task.h>
      #include "internal.h"
      
      #include <trace/events/sched.h>
      
      int suid_dumpable = 0;
      
      static LIST_HEAD(formats);
      static DEFINE_RWLOCK(binfmt_lock);
      
      void __register_binfmt(struct linux_binfmt * fmt, int insert)
      {
              BUG_ON(!fmt);
              if (WARN_ON(!fmt->load_binary))
                      return;
              write_lock(&binfmt_lock);
              insert ? list_add(&fmt->lh, &formats) :
                       list_add_tail(&fmt->lh, &formats);
              write_unlock(&binfmt_lock);
      }
      
      EXPORT_SYMBOL(__register_binfmt);
      
      void unregister_binfmt(struct linux_binfmt * fmt)
      {
              write_lock(&binfmt_lock);
              list_del(&fmt->lh);
              write_unlock(&binfmt_lock);
      }
      
      EXPORT_SYMBOL(unregister_binfmt);
      
      static inline void put_binfmt(struct linux_binfmt * fmt)
      {
              module_put(fmt->module);
      }
      
      bool path_noexec(const struct path *path)
      {
  270         return (path->mnt->mnt_flags & MNT_NOEXEC) ||
  266                (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
      }
      
      #ifdef CONFIG_USELIB
      /*
 * Note that a shared library must be both readable and executable for
 * security reasons.
 *
 * Also note that we take the address to load from the file itself.
       */
      SYSCALL_DEFINE1(uselib, const char __user *, library)
      {
              struct linux_binfmt *fmt;
              struct file *file;
              struct filename *tmp = getname(library);
              int error = PTR_ERR(tmp);
              static const struct open_flags uselib_flags = {
                      .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                      .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
                      .intent = LOOKUP_OPEN,
                      .lookup_flags = LOOKUP_FOLLOW,
              };
      
              if (IS_ERR(tmp))
                      goto out;
      
              file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
              putname(tmp);
              error = PTR_ERR(file);
              if (IS_ERR(file))
                      goto out;
      
              error = -EINVAL;
              if (!S_ISREG(file_inode(file)->i_mode))
                      goto exit;
      
              error = -EACCES;
              if (path_noexec(&file->f_path))
                      goto exit;
      
              fsnotify_open(file);
      
              error = -ENOEXEC;
      
              read_lock(&binfmt_lock);
              list_for_each_entry(fmt, &formats, lh) {
                      if (!fmt->load_shlib)
                              continue;
                      if (!try_module_get(fmt->module))
                              continue;
                      read_unlock(&binfmt_lock);
                      error = fmt->load_shlib(file);
                      read_lock(&binfmt_lock);
                      put_binfmt(fmt);
                      if (error != -ENOEXEC)
                              break;
              }
              read_unlock(&binfmt_lock);
      exit:
              fput(file);
      out:
                return error;
      }
      #endif /* #ifdef CONFIG_USELIB */
      
      #ifdef CONFIG_MMU
      /*
 * The nascent bprm->mm is not visible until exec_mmap(), but it can
 * use a lot of memory, so account these pages in current->mm
 * temporarily for oom_badness()->get_mm_rss(). Once exec succeeds or
 * fails, we change the counter back via acct_arg_size(0).
       */
      static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
      {
   30         struct mm_struct *mm = current->mm;
              long diff = (long)(pages - bprm->vma_pages);
      
   65         if (!mm || !diff)
                      return;
      
   63         bprm->vma_pages = pages;
              add_mm_counter(mm, MM_ANONPAGES, diff);
      }
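
/*
 * Worked example of the accounting above: if copying argv/envp grows the
 * temporary stack vma to 3 pages, acct_arg_size(bprm, 3) adds (3 - 0) to
 * current->mm's MM_ANONPAGES counter and records bprm->vma_pages = 3.
 * The later acct_arg_size(bprm, 0) in flush_old_exec() (or, on failure,
 * in the exec cleanup path) adds (0 - 3), so the borrowed charge nets
 * out to zero once bprm->mm is either live or abandoned.
 */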
      
      static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                      int write)
      {
              struct page *page;
              int ret;
              unsigned int gup_flags = FOLL_FORCE;
      
      #ifdef CONFIG_STACK_GROWSUP
              if (write) {
                      ret = expand_downwards(bprm->vma, pos);
                      if (ret < 0)
                              return NULL;
              }
      #endif
      
              if (write)
                      gup_flags |= FOLL_WRITE;
      
   63         ret = get_user_pages(current, bprm->mm, pos, 1, gup_flags,
                              &page, NULL);
              if (ret <= 0)
                      return NULL;
      
              if (write) {
   63                 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
                      unsigned long ptr_size, limit;
      
                      /*
                       * Since the stack will hold pointers to the strings, we
                       * must account for them as well.
                       *
                       * The size calculation is the entire vma while each arg page is
                       * built, so each time we get here it's calculating how far it
                       * is currently (rather than each call being just the newly
                       * added size from the arg page).  As a result, we need to
                       * always add the entire size of the pointers, so that on the
                       * last call to get_arg_page() we'll actually have the entire
                       * correct size.
                       */
                      ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
                      if (ptr_size > ULONG_MAX - size)
                              goto fail;
   63                 size += ptr_size;
      
   63                 acct_arg_size(bprm, size / PAGE_SIZE);
      
                      /*
                       * We've historically supported up to 32 pages (ARG_MAX)
                       * of argument strings even with small stacks
                       */
   63                 if (size <= ARG_MAX)
                              return page;
      
                      /*
                       * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
                       * (whichever is smaller) for the argv+env strings.
                       * This ensures that:
                       *  - the remaining binfmt code will not run out of stack space,
                       *  - the program will have a reasonable amount of stack left
                       *    to work from.
                       */
                      limit = _STK_LIM / 4 * 3;
                      limit = min(limit, rlimit(RLIMIT_STACK) / 4);
   63                 if (size > limit)
                              goto fail;
              }
      
    2         return page;
      
      fail:
              put_page(page);
              return NULL;
      }
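
/*
 * A worked example of the limit check above, assuming the common
 * _STK_LIM of 8 MiB and an 8 MiB RLIMIT_STACK:
 *
 *	limit = _STK_LIM / 4 * 3;                 8 MiB * 3/4 = 6 MiB
 *	limit = min(limit, rlimit(STACK) / 4);    min(6 MiB, 2 MiB) = 2 MiB
 *
 * so the argv + envp strings plus their pointer array may grow to at
 * most 2 MiB here (anything up to ARG_MAX is always allowed), leaving
 * the rest of the stack rlimit for the program itself.
 */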
      
      static void put_arg_page(struct page *page)
      {
   63         put_page(page);
      }
      
      static void free_arg_page(struct linux_binprm *bprm, int i)
      {
      }
      
      static void free_arg_pages(struct linux_binprm *bprm)
      {
      }
      
      static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                      struct page *page)
      {
              flush_cache_page(bprm->vma, pos, page_to_pfn(page));
      }
      
      static int __bprm_mm_init(struct linux_binprm *bprm)
      {
              int err;
              struct vm_area_struct *vma = NULL;
              struct mm_struct *mm = bprm->mm;
      
   69         bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
              if (!vma)
                      return -ENOMEM;
      
   69         down_write(&mm->mmap_sem);
              vma->vm_mm = mm;
      
              /*
               * Place the stack at the largest stack address the architecture
               * supports. Later, we'll move this to an appropriate place. We don't
               * use STACK_TOP because that can depend on attributes which aren't
               * configured yet.
               */
              BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
              vma->vm_end = STACK_TOP_MAX;
              vma->vm_start = vma->vm_end - PAGE_SIZE;
   69         vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
              vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
              INIT_LIST_HEAD(&vma->anon_vma_chain);
      
              err = insert_vm_struct(mm, vma);
              if (err)
                      goto err;
      
   69         mm->stack_vm = mm->total_vm = 1;
              arch_bprm_mm_init(mm, vma);
              up_write(&mm->mmap_sem);
              bprm->p = vma->vm_end - sizeof(void *);
              return 0;
      err:
              up_write(&mm->mmap_sem);
              bprm->vma = NULL;
              kmem_cache_free(vm_area_cachep, vma);
              return err;
      }
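
/*
 * Sketch of the temporary stack created above:
 *
 *	STACK_TOP_MAX ---------------------------+  <- vma->vm_end
 *	                  (one page)             |
 *	STACK_TOP_MAX - PAGE_SIZE ---------------+  <- vma->vm_start
 *
 * bprm->p starts at vm_end - sizeof(void *) and moves down as
 * copy_strings() packs argv/envp in; setup_arg_pages() later shifts the
 * whole vma to its final location and fixes the protection flags.
 */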
      
      static bool valid_arg_len(struct linux_binprm *bprm, long len)
      {
   63         return len <= MAX_ARG_STRLEN;
      }
      
      #else
      
      static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
      {
      }
      
      static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                      int write)
      {
              struct page *page;
      
              page = bprm->page[pos / PAGE_SIZE];
              if (!page && write) {
                      page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
                      if (!page)
                              return NULL;
                      bprm->page[pos / PAGE_SIZE] = page;
              }
      
              return page;
      }
      
      static void put_arg_page(struct page *page)
      {
      }
      
      static void free_arg_page(struct linux_binprm *bprm, int i)
      {
              if (bprm->page[i]) {
                      __free_page(bprm->page[i]);
                      bprm->page[i] = NULL;
              }
      }
      
      static void free_arg_pages(struct linux_binprm *bprm)
      {
              int i;
      
              for (i = 0; i < MAX_ARG_PAGES; i++)
                      free_arg_page(bprm, i);
      }
      
      static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                      struct page *page)
      {
      }
      
      static int __bprm_mm_init(struct linux_binprm *bprm)
      {
              bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
              return 0;
      }
      
      static bool valid_arg_len(struct linux_binprm *bprm, long len)
      {
              return len <= bprm->p;
      }
      
      #endif /* CONFIG_MMU */
      
      /*
       * Create a new mm_struct and populate it with a temporary stack
       * vm_area_struct.  We don't have enough context at this point to set the stack
       * flags, permissions, and offset, so we use temporary values.  We'll update
       * them later in setup_arg_pages().
       */
      static int bprm_mm_init(struct linux_binprm *bprm)
      {
              int err;
              struct mm_struct *mm = NULL;
      
              bprm->mm = mm = mm_alloc();
              err = -ENOMEM;
              if (!mm)
                      goto err;
      
   69         err = __bprm_mm_init(bprm);
              if (err)
                      goto err;
      
              return 0;
      
      err:
              if (mm) {
                      bprm->mm = NULL;
                      mmdrop(mm);
              }
      
              return err;
      }
      
      struct user_arg_ptr {
      #ifdef CONFIG_COMPAT
              bool is_compat;
      #endif
              union {
                      const char __user *const __user *native;
      #ifdef CONFIG_COMPAT
                      const compat_uptr_t __user *compat;
      #endif
              } ptr;
      };
      
   69 static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
      {
              const char __user *native;
      
      #ifdef CONFIG_COMPAT
              if (unlikely(argv.is_compat)) {
                      compat_uptr_t compat;
      
   15                 if (get_user(compat, argv.ptr.compat + nr))
                              return ERR_PTR(-EFAULT);
      
   15                 return compat_ptr(compat);
              }
      #endif
      
   68         if (get_user(native, argv.ptr.native + nr))
                      return ERR_PTR(-EFAULT);
      
              return native;
      }
      
      /*
       * count() counts the number of strings in array ARGV.
       */
      static int count(struct user_arg_ptr argv, int max)
      {
              int i = 0;
      
   69         if (argv.ptr.native != NULL) {
                      for (;;) {
   15                         const char __user *p = get_user_arg_ptr(argv, i);
      
                              if (!p)
                                      break;
      
   15                         if (IS_ERR(p))
   65                                 return -EFAULT;
      
   14                         if (i >= max)
                                      return -E2BIG;
   14                         ++i;
      
    1                         if (fatal_signal_pending(current))
                                      return -ERESTARTNOHAND;
   14                         cond_resched();
                      }
              }
              return i;
      }
      
      /*
       * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
       * ensures the destination page is created and not swapped out.
       */
      static int copy_strings(int argc, struct user_arg_ptr argv,
                              struct linux_binprm *bprm)
      {
              struct page *kmapped_page = NULL;
              char *kaddr = NULL;
              unsigned long kpos = 0;
              int ret;
      
   63         while (argc-- > 0) {
                      const char __user *str;
                      int len;
                      unsigned long pos;
      
                      ret = -EFAULT;
   63                 str = get_user_arg_ptr(argv, argc);
                      if (IS_ERR(str))
                              goto out;
      
   63                 len = strnlen_user(str, MAX_ARG_STRLEN);
                      if (!len)
                              goto out;
      
                      ret = -E2BIG;
   63                 if (!valid_arg_len(bprm, len))
                              goto out;
      
                /* We're going to work our way backwards. */
   63                 pos = bprm->p;
                      str += len;
                      bprm->p -= len;
      
   63                 while (len > 0) {
                              int offset, bytes_to_copy;
      
   63                         if (fatal_signal_pending(current)) {
    1                                 ret = -ERESTARTNOHAND;
                                      goto out;
                              }
   63                         cond_resched();
      
                              offset = pos % PAGE_SIZE;
                              if (offset == 0)
                                      offset = PAGE_SIZE;
      
                              bytes_to_copy = offset;
   63                         if (bytes_to_copy > len)
                                      bytes_to_copy = len;
      
                              offset -= bytes_to_copy;
                              pos -= bytes_to_copy;
                              str -= bytes_to_copy;
                              len -= bytes_to_copy;
      
    3                         if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                      struct page *page;
      
   63                                 page = get_arg_page(bprm, pos, 1);
                                      if (!page) {
                                              ret = -E2BIG;
                                              goto out;
                                      }
      
   63                                 if (kmapped_page) {
                                              flush_kernel_dcache_page(kmapped_page);
                                              kunmap(kmapped_page);
    2                                         put_arg_page(kmapped_page);
                                      }
                                      kmapped_page = page;
   63                                 kaddr = kmap(kmapped_page);
                                      kpos = pos & PAGE_MASK;
                                      flush_arg_page(bprm, kpos, kmapped_page);
                              }
   63                         if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                      ret = -EFAULT;
                                      goto out;
                              }
                      }
              }
              ret = 0;
      out:
   63         if (kmapped_page) {
                      flush_kernel_dcache_page(kmapped_page);
                      kunmap(kmapped_page);
   63                 put_arg_page(kmapped_page);
              }
   63         return ret;
      }
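
/*
 * Rough picture of the layout copy_strings() produces in the temporary
 * stack vma (in the usual exec path the filename, then envp, then argv
 * are copied, each batch landing below the previous one):
 *
 *	initial bprm->p -> +------------------------------+
 *	                   | executable name              |
 *	                   | envp strings                 |
 *	                   | argv strings                 |
 *	final bprm->p   -> +------------------------------+
 *
 * Within each string the loop above copies back to front, which is why
 * pos, str and len are all walked downwards in bytes_to_copy chunks.
 */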
      
      /*
       * Like copy_strings, but get argv and its values from kernel memory.
       */
      int copy_strings_kernel(int argc, const char *const *__argv,
                              struct linux_binprm *bprm)
      {
              int r;
   63         mm_segment_t oldfs = get_fs();
              struct user_arg_ptr argv = {
                      .ptr.native = (const char __user *const  __user *)__argv,
              };
      
              set_fs(KERNEL_DS);
              r = copy_strings(argc, argv, bprm);
              set_fs(oldfs);
      
              return r;
      }
      EXPORT_SYMBOL(copy_strings_kernel);
      
      #ifdef CONFIG_MMU
      
      /*
       * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
       * the binfmt code determines where the new stack should reside, we shift it to
       * its final location.  The process proceeds as follows:
       *
       * 1) Use shift to calculate the new vma endpoints.
       * 2) Extend vma to cover both the old and new ranges.  This ensures the
       *    arguments passed to subsequent functions are consistent.
       * 3) Move vma's page tables to the new range.
       * 4) Free up any cleared pgd range.
       * 5) Shrink the vma to cover only the new range.
       */
      static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
      {
              struct mm_struct *mm = vma->vm_mm;
              unsigned long old_start = vma->vm_start;
              unsigned long old_end = vma->vm_end;
              unsigned long length = old_end - old_start;
              unsigned long new_start = old_start - shift;
              unsigned long new_end = old_end - shift;
              struct mmu_gather tlb;
      
              BUG_ON(new_start > new_end);
      
              /*
               * ensure there are no vmas between where we want to go
               * and where we are
               */
              if (vma != find_vma(mm, new_start))
                      return -EFAULT;
      
              /*
               * cover the whole range: [new_start, old_end)
               */
              if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
                      return -ENOMEM;
      
              /*
               * move the page tables downwards, on failure we rely on
               * process cleanup to remove whatever mess we made.
               */
              if (length != move_page_tables(vma, old_start,
                                             vma, new_start, length, false))
                      return -ENOMEM;
      
              lru_add_drain();
              tlb_gather_mmu(&tlb, mm, old_start, old_end);
              if (new_end > old_start) {
                      /*
                       * when the old and new regions overlap clear from new_end.
                       */
                      free_pgd_range(&tlb, new_end, old_end, new_end,
                              vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
              } else {
                      /*
                 * otherwise, clean from old_start; this is done so as not to
                 * touch the address space in [new_end, old_start): some
                 * architectures have constraints on va-space that make this
                 * illegal (IA64), and for the others it's just a little faster.
                       */
                      free_pgd_range(&tlb, old_start, old_end, new_end,
                              vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
              }
              tlb_finish_mmu(&tlb, old_start, old_end);
      
              /*
               * Shrink the vma to just the new range.  Always succeeds.
               */
              vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
      
              return 0;
      }
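
/*
 * Concrete example of the steps above (purely illustrative addresses):
 * with old_start = 0x7000000000, old_end = 0x7000021000 and
 * shift = 0x1000000, the vma is first extended to cover
 * [0x6fff000000, 0x7000021000), move_page_tables() relocates the
 * 0x21000 bytes of mappings down by the shift, free_pgd_range() cleans
 * up the part of the old range that is no longer needed, and the final
 * vma_adjust() shrinks the vma to [0x6fff000000, 0x6fff021000), i.e.
 * [new_start, new_end).
 */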
      
      /*
       * Finalizes the stack vm_area_struct. The flags and permissions are updated,
       * the stack is optionally relocated, and some extra space is added.
       */
      int setup_arg_pages(struct linux_binprm *bprm,
                          unsigned long stack_top,
                          int executable_stack)
      {
              unsigned long ret;
              unsigned long stack_shift;
              struct mm_struct *mm = current->mm;
              struct vm_area_struct *vma = bprm->vma;
              struct vm_area_struct *prev = NULL;
              unsigned long vm_flags;
              unsigned long stack_base;
              unsigned long stack_size;
              unsigned long stack_expand;
              unsigned long rlim_stack;
      
      #ifdef CONFIG_STACK_GROWSUP
              /* Limit stack size */
              stack_base = rlimit_max(RLIMIT_STACK);
              if (stack_base > STACK_SIZE_MAX)
                      stack_base = STACK_SIZE_MAX;
      
              /* Add space for stack randomization. */
              stack_base += (STACK_RND_MASK << PAGE_SHIFT);
      
              /* Make sure we didn't let the argument array grow too large. */
              if (vma->vm_end - vma->vm_start > stack_base)
                      return -ENOMEM;
      
              stack_base = PAGE_ALIGN(stack_top - stack_base);
      
              stack_shift = vma->vm_start - stack_base;
              mm->arg_start = bprm->p - stack_shift;
              bprm->p = vma->vm_end - stack_shift;
      #else
              stack_top = arch_align_stack(stack_top);
              stack_top = PAGE_ALIGN(stack_top);
      
              if (unlikely(stack_top < mmap_min_addr) ||
                  unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                      return -ENOMEM;
      
              stack_shift = vma->vm_end - stack_top;
      
              bprm->p -= stack_shift;
              mm->arg_start = bprm->p;
      #endif
      
              if (bprm->loader)
                      bprm->loader -= stack_shift;
              bprm->exec -= stack_shift;
      
              down_write(&mm->mmap_sem);
              vm_flags = VM_STACK_FLAGS;
      
              /*
               * Adjust stack execute permissions; explicitly enable for
               * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
               * (arch default) otherwise.
               */
              if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                      vm_flags |= VM_EXEC;
              else if (executable_stack == EXSTACK_DISABLE_X)
                      vm_flags &= ~VM_EXEC;
              vm_flags |= mm->def_flags;
              vm_flags |= VM_STACK_INCOMPLETE_SETUP;
      
              ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
                              vm_flags);
              if (ret)
                      goto out_unlock;
              BUG_ON(prev != vma);
      
              /* Move stack pages down in memory. */
              if (stack_shift) {
                      ret = shift_arg_pages(vma, stack_shift);
                      if (ret)
                              goto out_unlock;
              }
      
              /* mprotect_fixup is overkill to remove the temporary stack flags */
              vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
      
              stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
              stack_size = vma->vm_end - vma->vm_start;
              /*
               * Align this down to a page boundary as expand_stack
               * will align it up.
               */
              rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
      #ifdef CONFIG_STACK_GROWSUP
              if (stack_size + stack_expand > rlim_stack)
                      stack_base = vma->vm_start + rlim_stack;
              else
                      stack_base = vma->vm_end + stack_expand;
      #else
              if (stack_size + stack_expand > rlim_stack)
                      stack_base = vma->vm_end - rlim_stack;
              else
                      stack_base = vma->vm_start - stack_expand;
      #endif
              current->mm->start_stack = bprm->p;
              ret = expand_stack(vma, stack_base);
              if (ret)
                      ret = -EFAULT;
      
      out_unlock:
              up_write(&mm->mmap_sem);
              return ret;
      }
      EXPORT_SYMBOL(setup_arg_pages);
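
/*
 * Worked example of the final expansion above for a downward-growing
 * stack, assuming an 8 MiB RLIMIT_STACK and ~36 KiB of argv/envp:
 *
 *	stack_expand = 128 KiB;
 *	stack_size   = vma->vm_end - vma->vm_start;   ~36 KiB
 *	rlim_stack   = 8 MiB & PAGE_MASK;              8 MiB
 *	36 KiB + 128 KiB <= 8 MiB, so
 *	stack_base   = vma->vm_start - 128 KiB;
 *
 * i.e. expand_stack() pre-grows the stack by stack_expand below the
 * strings; only when the strings alone approach the rlimit does the
 * rlimit-based branch clamp the expansion instead.
 */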
      
      #endif /* CONFIG_MMU */
      
      static struct file *do_open_execat(int fd, struct filename *name, int flags)
      {
              struct file *file;
              int err;
   93         struct open_flags open_exec_flags = {
                      .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                      .acc_mode = MAY_EXEC | MAY_OPEN,
                      .intent = LOOKUP_OPEN,
                      .lookup_flags = LOOKUP_FOLLOW,
              };
      
              if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
                      return ERR_PTR(-EINVAL);
   92         if (flags & AT_SYMLINK_NOFOLLOW)
    1                 open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
   92         if (flags & AT_EMPTY_PATH)
   58                 open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
      
   92         file = do_filp_open(fd, name, &open_exec_flags);
              if (IS_ERR(file))
                      goto out;
      
              err = -EACCES;
   76         if (!S_ISREG(file_inode(file)->i_mode))
                      goto exit;
      
   73         if (path_noexec(&file->f_path))
                      goto exit;
      
   71         err = deny_write_access(file);
              if (err)
                      goto exit;
      
   70         if (name->name[0] != '\0')
   23                 fsnotify_open(file);
      
      out:
              return file;
      
      exit:
    8         fput(file);
              return ERR_PTR(err);
      }
      
      struct file *open_exec(const char *name)
      {
   10         struct filename *filename = getname_kernel(name);
              struct file *f = ERR_CAST(filename);
      
              if (!IS_ERR(filename)) {
   10                 f = do_open_execat(AT_FDCWD, filename, 0);
                      putname(filename);
              }
   10         return f;
      }
      EXPORT_SYMBOL(open_exec);
      
      int kernel_read(struct file *file, loff_t offset,
                      char *addr, unsigned long count)
      {
              mm_segment_t old_fs;
   63         loff_t pos = offset;
              int result;
      
              old_fs = get_fs();
              set_fs(get_ds());
              /* The cast to a user pointer is valid due to the set_fs() */
              result = vfs_read(file, (void __user *)addr, count, &pos);
              set_fs(old_fs);
              return result;
      }
      
      EXPORT_SYMBOL(kernel_read);
      
      ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
      {
              ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
              if (res > 0)
                      flush_icache_range(addr, addr + len);
              return res;
      }
      EXPORT_SYMBOL(read_code);
      
      static int exec_mmap(struct mm_struct *mm)
      {
              struct task_struct *tsk;
              struct mm_struct *old_mm, *active_mm;
      
              /* Notify parent that we're no longer interested in the old VM */
              tsk = current;
              old_mm = current->mm;
              mm_release(tsk, old_mm);
      
              if (old_mm) {
                      sync_mm_rss(old_mm);
                      /*
                       * Make sure that if there is a core dump in progress
                       * for the old mm, we get out and die instead of going
                       * through with the exec.  We must hold mmap_sem around
                       * checking core_state and changing tsk->mm.
                       */
                      down_read(&old_mm->mmap_sem);
                      if (unlikely(old_mm->core_state)) {
                              up_read(&old_mm->mmap_sem);
                              return -EINTR;
                      }
              }
              task_lock(tsk);
              active_mm = tsk->active_mm;
              tsk->mm = mm;
              tsk->active_mm = mm;
              activate_mm(active_mm, mm);
              tsk->mm->vmacache_seqnum = 0;
              vmacache_flush(tsk);
              task_unlock(tsk);
              if (old_mm) {
                      up_read(&old_mm->mmap_sem);
                      BUG_ON(active_mm != old_mm);
                      setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                      mm_update_next_owner(old_mm);
                      mmput(old_mm);
                      return 0;
              }
              mmdrop(active_mm);
              return 0;
      }
      
      /*
       * This function makes sure the current process has its own signal table,
       * so that flush_signal_handlers can later reset the handlers without
       * disturbing other processes.  (Other processes might share the signal
       * table via the CLONE_SIGHAND option to clone().)
       */
      static int de_thread(struct task_struct *tsk)
      {
              struct signal_struct *sig = tsk->signal;
              struct sighand_struct *oldsighand = tsk->sighand;
              spinlock_t *lock = &oldsighand->siglock;
      
              if (thread_group_empty(tsk))
                      goto no_thread_group;
      
              /*
               * Kill all other threads in the thread group.
               */
              spin_lock_irq(lock);
              if (signal_group_exit(sig)) {
                      /*
                       * Another group action in progress, just
                       * return so that the signal is processed.
                       */
                      spin_unlock_irq(lock);
                      return -EAGAIN;
              }
      
              sig->group_exit_task = tsk;
              sig->notify_count = zap_other_threads(tsk);
              if (!thread_group_leader(tsk))
                      sig->notify_count--;
      
              while (sig->notify_count) {
                      __set_current_state(TASK_KILLABLE);
                      spin_unlock_irq(lock);
                      schedule();
                      if (unlikely(__fatal_signal_pending(tsk)))
                              goto killed;
                      spin_lock_irq(lock);
              }
              spin_unlock_irq(lock);
      
              /*
               * At this point all other threads have exited, all we have to
               * do is to wait for the thread group leader to become inactive,
               * and to assume its PID:
               */
              if (!thread_group_leader(tsk)) {
                      struct task_struct *leader = tsk->group_leader;
      
                      for (;;) {
                              threadgroup_change_begin(tsk);
                              write_lock_irq(&tasklist_lock);
                              /*
                               * Do this under tasklist_lock to ensure that
                               * exit_notify() can't miss ->group_exit_task
                               */
                              sig->notify_count = -1;
                              if (likely(leader->exit_state))
                                      break;
                              __set_current_state(TASK_KILLABLE);
                              write_unlock_irq(&tasklist_lock);
                              threadgroup_change_end(tsk);
                              schedule();
                              if (unlikely(__fatal_signal_pending(tsk)))
                                      goto killed;
                      }
      
                      /*
                       * The only record we have of the real-time age of a
                       * process, regardless of execs it's done, is start_time.
                       * All the past CPU time is accumulated in signal_struct
                       * from sister threads now dead.  But in this non-leader
                       * exec, nothing survives from the original leader thread,
                       * whose birth marks the true age of this process now.
                       * When we take on its identity by switching to its PID, we
                       * also take its birthdate (always earlier than our own).
                       */
                      tsk->start_time = leader->start_time;
                      tsk->real_start_time = leader->real_start_time;
      
                      BUG_ON(!same_thread_group(leader, tsk));
                      BUG_ON(has_group_leader_pid(tsk));
                      /*
                       * An exec() starts a new thread group with the
                       * TGID of the previous thread group. Rehash the
                       * two threads with a switched PID, and release
                       * the former thread group leader:
                       */
      
                      /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                       * Note: The old leader also uses this pid until release_task
                       *       is called.  Odd but simple and correct.
                       */
                      tsk->pid = leader->pid;
                      change_pid(tsk, PIDTYPE_PID, task_pid(leader));
                      transfer_pid(leader, tsk, PIDTYPE_PGID);
                      transfer_pid(leader, tsk, PIDTYPE_SID);
      
                      list_replace_rcu(&leader->tasks, &tsk->tasks);
                      list_replace_init(&leader->sibling, &tsk->sibling);
      
                      tsk->group_leader = tsk;
                      leader->group_leader = tsk;
      
                      tsk->exit_signal = SIGCHLD;
                      leader->exit_signal = -1;
      
                      BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                      leader->exit_state = EXIT_DEAD;
      
                      /*
                       * We are going to release_task()->ptrace_unlink() silently,
                       * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                       */
                      if (unlikely(leader->ptrace))
                              __wake_up_parent(leader, leader->parent);
                      write_unlock_irq(&tasklist_lock);
                      threadgroup_change_end(tsk);
      
                      release_task(leader);
              }
      
              sig->group_exit_task = NULL;
              sig->notify_count = 0;
      
      no_thread_group:
              /* we have changed execution domain */
              tsk->exit_signal = SIGCHLD;
      
              exit_itimers(sig);
              flush_itimer_signals();
      
              if (atomic_read(&oldsighand->count) != 1) {
                      struct sighand_struct *newsighand;
                      /*
                       * This ->sighand is shared with the CLONE_SIGHAND
                       * but not CLONE_THREAD task, switch to the new one.
                       */
                      newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
                      if (!newsighand)
                              return -ENOMEM;
      
                      atomic_set(&newsighand->count, 1);
                      memcpy(newsighand->action, oldsighand->action,
                             sizeof(newsighand->action));
      
                      write_lock_irq(&tasklist_lock);
                      spin_lock(&oldsighand->siglock);
                      rcu_assign_pointer(tsk->sighand, newsighand);
                      spin_unlock(&oldsighand->siglock);
                      write_unlock_irq(&tasklist_lock);
      
                      __cleanup_sighand(oldsighand);
              }
      
              BUG_ON(!thread_group_leader(tsk));
              return 0;
      
      killed:
              /* protects against exit_notify() and __exit_signal() */
              read_lock(&tasklist_lock);
              sig->group_exit_task = NULL;
              sig->notify_count = 0;
              read_unlock(&tasklist_lock);
              return -EAGAIN;
      }
      
      char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
      {
   24         task_lock(tsk);
              strncpy(buf, tsk->comm, buf_size);
              task_unlock(tsk);
              return buf;
      }
      EXPORT_SYMBOL_GPL(__get_task_comm);
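
/*
 * Typical usage sketch, assuming the usual get_task_comm() wrapper
 * (which passes sizeof(buf) as buf_size):
 *
 *	char comm[TASK_COMM_LEN];
 *
 *	get_task_comm(comm, current);
 *	pr_debug("running as %s\n", comm);
 */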
      
      /*
 * These functions flush out all traces of the currently running executable
       * so that a new one can be started
       */
      
      void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
      {
   52         task_lock(tsk);
   52         trace_task_rename(tsk, buf);
   52         strlcpy(tsk->comm, buf, sizeof(tsk->comm));
              task_unlock(tsk);
              perf_event_comm(tsk, exec);
      }
      
      int flush_old_exec(struct linux_binprm * bprm)
      {
              int retval;
      
              /*
               * Make sure we have a private signal table and that
               * we are unassociated from the previous thread group.
               */
              retval = de_thread(current);
              if (retval)
                      goto out;
      
              /*
               * Must be called _before_ exec_mmap() as bprm->mm is
 * not visible until then. This also enables the update
               * to be lockless.
               */
              set_mm_exe_file(bprm->mm, bprm->file);
      
              /*
               * Release all of the old mmap stuff
               */
              acct_arg_size(bprm, 0);
              retval = exec_mmap(bprm->mm);
              if (retval)
                      goto out;
      
              bprm->mm = NULL;                /* We're using it now */
      
              set_fs(USER_DS);
              current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
                                              PF_NOFREEZE | PF_NO_SETAFFINITY);
              flush_thread();
              current->personality &= ~bprm->per_clear;
      
              /*
               * We have to apply CLOEXEC before we change whether the process is
               * dumpable (in setup_new_exec) to avoid a race with a process in userspace
               * trying to access the should-be-closed file descriptors of a process
               * undergoing exec(2).
               */
              do_close_on_exec(current->files);
              return 0;
      
      out:
              return retval;
      }
      EXPORT_SYMBOL(flush_old_exec);
      
      void would_dump(struct linux_binprm *bprm, struct file *file)
      {
   58         struct inode *inode = file_inode(file);
      
              if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
                      struct user_namespace *old, *user_ns;
      
    1                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
      
                      /* Ensure mm->user_ns contains the executable */
                      user_ns = old = bprm->mm->user_ns;
                      while ((user_ns != &init_user_ns) &&
    1                        !privileged_wrt_inode_uidgid(user_ns, inode))
                              user_ns = user_ns->parent;
      
    1                 if (old != user_ns) {
                              bprm->mm->user_ns = get_user_ns(user_ns);
                              put_user_ns(old);
                      }
              }
   58 }
      EXPORT_SYMBOL(would_dump);
      
      void setup_new_exec(struct linux_binprm * bprm)
      {
              arch_pick_mmap_layout(current->mm);
      
              /* This is the point of no return */
              current->sas_ss_sp = current->sas_ss_size = 0;
      
              if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
                      set_dumpable(current->mm, SUID_DUMP_USER);
              else
                      set_dumpable(current->mm, suid_dumpable);
      
              perf_event_exec();
              __set_task_comm(current, kbasename(bprm->filename), true);
      
              /* Set the new mm task size. We have to do that late because it may
               * depend on TIF_32BIT which is only updated in flush_thread() on
               * some architectures like powerpc
               */
              current->mm->task_size = TASK_SIZE;
      
              /* install the new credentials */
              if (!uid_eq(bprm->cred->uid, current_euid()) ||
                  !gid_eq(bprm->cred->gid, current_egid())) {
                      current->pdeath_signal = 0;
              } else {
                      if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
                              set_dumpable(current->mm, suid_dumpable);
              }
      
        /*
         * An exec changes our domain. We are no longer part of the thread
         * group.
         */
              current->self_exec_id++;
              flush_signal_handlers(current, 0);
      }
      EXPORT_SYMBOL(setup_new_exec);
      
      /*
       * Prepare credentials and lock ->cred_guard_mutex.
       * install_exec_creds() commits the new creds and drops the lock.
 * Or, if exec fails before that, free_bprm() should release ->cred
 * and unlock.
       */
      int prepare_bprm_creds(struct linux_binprm *bprm)
      {
   93         if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                      return -ERESTARTNOINTR;
      
   93         bprm->cred = prepare_exec_creds();
   93         if (likely(bprm->cred))
                      return 0;
      
              mutex_unlock(&current->signal->cred_guard_mutex);
              return -ENOMEM;
      }
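
/*
 * Sketch of the lock/cred pairing described above:
 *
 *	prepare_bprm_creds(bprm)    takes cred_guard_mutex, sets bprm->cred
 *	  ... binfmt loading ...
 *	install_exec_creds(bprm)    commit_creds() + unlock   (success)
 *	free_bprm(bprm)             abort_creds() + unlock    (failure)
 *
 * install_exec_creds() clears bprm->cred, so exactly one of the two
 * releases the mutex for a given bprm.
 */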
      
      static void free_bprm(struct linux_binprm *bprm)
      {
              free_arg_pages(bprm);
   53         if (bprm->cred) {
   53                 mutex_unlock(&current->signal->cred_guard_mutex);
                      abort_creds(bprm->cred);
              }
   53         if (bprm->file) {
   25                 allow_write_access(bprm->file);
                      fput(bprm->file);
              }
              /* If a binfmt changed the interp, free it. */
   53         if (bprm->interp != bprm->filename)
    6                 kfree(bprm->interp);
   53         kfree(bprm);
      }
      
      int bprm_change_interp(char *interp, struct linux_binprm *bprm)
      {
              /* If a binfmt changed the interp, free it first. */
    8         if (bprm->interp != bprm->filename)
    1                 kfree(bprm->interp);
    8         bprm->interp = kstrdup(interp, GFP_KERNEL);
              if (!bprm->interp)
    8                 return -ENOMEM;
              return 0;
      }
      EXPORT_SYMBOL(bprm_change_interp);
      
      /*
       * install the new credentials for this executable
       */
      void install_exec_creds(struct linux_binprm *bprm)
      {
              security_bprm_committing_creds(bprm);
      
              commit_creds(bprm->cred);
              bprm->cred = NULL;
      
              /*
               * Disable monitoring for regular users
               * when executing setuid binaries. Must
               * wait until new credentials are committed
               * by commit_creds() above
               */
              if (get_dumpable(current->mm) != SUID_DUMP_USER)
                      perf_event_exit_task(current);
              /*
               * cred_guard_mutex must be held at least to this point to prevent
               * ptrace_attach() from altering our determination of the task's
               * credentials; any time after this it may be unlocked.
               */
              security_bprm_committed_creds(bprm);
              mutex_unlock(&current->signal->cred_guard_mutex);
      }
      EXPORT_SYMBOL(install_exec_creds);
      
      /*
       * determine how safe it is to execute the proposed program
       * - the caller must hold ->cred_guard_mutex to protect against
       *   PTRACE_ATTACH or seccomp thread-sync
       */
      static void check_unsafe_exec(struct linux_binprm *bprm)
      {
   93         struct task_struct *p = current, *t;
              unsigned n_fs;
      
              if (p->ptrace) {
    1                 if (ptracer_capable(p, current_user_ns()))
                              bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
                      else
    1                         bprm->unsafe |= LSM_UNSAFE_PTRACE;
              }
      
              /*
               * This isn't strictly necessary, but it makes it harder for LSMs to
               * mess up.
               */
   93         if (task_no_new_privs(current))
    2                 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
      
              t = p;
              n_fs = 1;
   93         spin_lock(&p->fs->lock);
   93         rcu_read_lock();
   93         while_each_thread(p, t) {
   93                 if (t->fs == p->fs)
   92                         n_fs++;
              }
   93         rcu_read_unlock();
      
              if (p->fs->users > n_fs)
   53                 bprm->unsafe |= LSM_UNSAFE_SHARE;
              else
   40                 p->fs->in_exec = 1;
   93         spin_unlock(&p->fs->lock);
      }
      
      static void bprm_fill_uid(struct linux_binprm *bprm)
      {
              struct inode *inode;
              unsigned int mode;
              kuid_t uid;
              kgid_t gid;
      
               /* clear any set[ug]id data left over from a previous binary */
   64         bprm->cred->euid = current_euid();
              bprm->cred->egid = current_egid();
      
              if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
                      return;
      
   63         if (task_no_new_privs(current))
                      return;
      
   61         inode = file_inode(bprm->file);
              mode = READ_ONCE(inode->i_mode);
              if (!(mode & (S_ISUID|S_ISGID)))
                      return;
      
              /* Be careful if suid/sgid is set */
    5         mutex_lock(&inode->i_mutex);
      
              /* reload atomically mode/uid/gid now that lock held */
              mode = inode->i_mode;
              uid = inode->i_uid;
              gid = inode->i_gid;
              mutex_unlock(&inode->i_mutex);
      
              /* We ignore suid/sgid if there are no mappings for them in the ns */
              if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
    5                  !kgid_has_mapping(bprm->cred->user_ns, gid))
                      return;
      
    5         if (mode & S_ISUID) {
    2                 bprm->per_clear |= PER_CLEAR_ON_SETID;
                      bprm->cred->euid = uid;
              }
      
    5         if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
    1                 bprm->per_clear |= PER_CLEAR_ON_SETID;
                      bprm->cred->egid = gid;
              }
      }
      
      /*
       * Fill the binprm structure from the inode.
       * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
       *
       * This may be called multiple times for binary chains (scripts for example).
       */
   63 int prepare_binprm(struct linux_binprm *bprm)
      {
              int retval;
      
   64         bprm_fill_uid(bprm);
      
              /* fill in binprm security blob */
   64         retval = security_bprm_set_creds(bprm);
   64         if (retval)
                      return retval;
   63         bprm->cred_prepared = 1;
      
              memset(bprm->buf, 0, BINPRM_BUF_SIZE);
              return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
      }
      
      EXPORT_SYMBOL(prepare_binprm);
      
      /*
        * Arguments are '\0'-separated strings found at the location bprm->p
        * points to; chop off the first one by relocating bprm->p to right after
        * the first '\0' encountered.
       */
    8 int remove_arg_zero(struct linux_binprm *bprm)
      {
              int ret = 0;
              unsigned long offset;
              char *kaddr;
              struct page *page;
      
    8         if (!bprm->argc)
                      return 0;
      
              do {
    2                 offset = bprm->p & ~PAGE_MASK;
    2                 page = get_arg_page(bprm, bprm->p, 0);
                      if (!page) {
                              ret = -EFAULT;
                              goto out;
                      }
    2                 kaddr = kmap_atomic(page);
      
    2                 for (; offset < PAGE_SIZE && kaddr[offset];
    2                                 offset++, bprm->p++)
                              ;
      
    2                 kunmap_atomic(kaddr);
    2                 put_arg_page(page);
      
                      if (offset == PAGE_SIZE)
                              free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
              } while (offset == PAGE_SIZE);
      
    2         bprm->p++;
              bprm->argc--;
    8         ret = 0;
      
      out:
              return ret;
      }
      EXPORT_SYMBOL(remove_arg_zero);
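
       /*
        * Illustrative sketch (not part of this file): roughly how a "#!"-style
        * script loader could combine remove_arg_zero(), copy_strings_kernel(),
        * bprm_change_interp() and prepare_binprm().  This is a condensed
        * approximation of fs/binfmt_script.c, not the real thing: the
        * interpreter name and optional argument are not spliced into argv here,
        * and parse_shebang() is a hypothetical helper that extracts the
        * interpreter path from bprm->buf.
        */
       static int example_load_script(struct linux_binprm *bprm)
       {
               char interp[BINPRM_BUF_SIZE];
               struct file *file;
               int retval;

               if (bprm->buf[0] != '#' || bprm->buf[1] != '!')
                       return -ENOEXEC;        /* not ours, let the next binfmt try */

               parse_shebang(bprm->buf, interp);       /* hypothetical helper */

               /* We will not map this file itself, so release it. */
               allow_write_access(bprm->file);
               fput(bprm->file);
               bprm->file = NULL;

               /* Drop the script's argv[0] and push the script path instead. */
               retval = remove_arg_zero(bprm);
               if (retval)
                       return retval;
               retval = copy_strings_kernel(1, &bprm->interp, bprm);
               if (retval < 0)
                       return retval;
               bprm->argc++;

               /* Record the interpreter and restart with its image. */
               retval = bprm_change_interp(interp, bprm);
               if (retval < 0)
                       return retval;

               file = open_exec(interp);
               if (IS_ERR(file))
                       return PTR_ERR(file);
               bprm->file = file;

               retval = prepare_binprm(bprm);
               if (retval < 0)
                       return retval;
               return search_binary_handler(bprm);
       }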
      
      #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
      /*
        * Cycle through the list of binary format handlers until one recognizes the image.
       */
   58 int search_binary_handler(struct linux_binprm *bprm)
      {
              bool need_retry = IS_ENABLED(CONFIG_MODULES);
              struct linux_binfmt *fmt;
              int retval;
      
              /* This allows 4 levels of binfmt rewrites before failing hard. */
   58         if (bprm->recursion_depth > 5)
                      return -ELOOP;
      
   58         retval = security_bprm_check(bprm);
              if (retval)
                      return retval;
      
              retval = -ENOENT;
       retry:
   58         read_lock(&binfmt_lock);
   50         list_for_each_entry(fmt, &formats, lh) {
   58                 if (!try_module_get(fmt->module))
                              continue;
   58                 read_unlock(&binfmt_lock);
                      bprm->recursion_depth++;
                      retval = fmt->load_binary(bprm);
                      read_lock(&binfmt_lock);
                      put_binfmt(fmt);
                      bprm->recursion_depth--;
   58                 if (retval < 0 && !bprm->mm) {
                              /* we got to flush_old_exec() and failed after it */
                              read_unlock(&binfmt_lock);
                              force_sigsegv(SIGSEGV, current);
                              return retval;
                      }
   58                 if (retval != -ENOEXEC || !bprm->file) {
                              read_unlock(&binfmt_lock);
                              return retval;
                      }
              }
   58         read_unlock(&binfmt_lock);
      
   23         if (need_retry) {
   47                 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
    6                     printable(bprm->buf[2]) && printable(bprm->buf[3]))
                              return retval;
   43                 if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
                              return retval;
                      need_retry = false;
                      goto retry;
              }
      
              return retval;
      }
      EXPORT_SYMBOL(search_binary_handler);
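
       /*
        * Illustrative sketch (not part of this file): the minimal shape of a
        * binary-format handler that search_binary_handler() above would probe.
        * Only the magic-number check is shown; a real loader would go on to set
        * up the new mm, map the image and eventually call install_exec_creds().
        * The 0x7f 'X' magic is made up for the example, and this skeleton never
        * actually claims an image.
        */
       #include <linux/binfmts.h>
       #include <linux/module.h>

       static int example_load_binary(struct linux_binprm *bprm)
       {
               /* bprm->buf holds the first BINPRM_BUF_SIZE bytes of the file. */
               if (bprm->buf[0] != 0x7f || bprm->buf[1] != 'X')
                       return -ENOEXEC;        /* not ours, try the next handler */

               /* ... set up the new mm, load segments, start the thread ... */
               return -ENOEXEC;                /* placeholder only */
       }

       static struct linux_binfmt example_format = {
               .module         = THIS_MODULE,
               .load_binary    = example_load_binary,
       };

       static int __init example_binfmt_init(void)
       {
               register_binfmt(&example_format);
               return 0;
       }

       static void __exit example_binfmt_exit(void)
       {
               unregister_binfmt(&example_format);
       }

       module_init(example_binfmt_init);
       module_exit(example_binfmt_exit);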
      
      static int exec_binprm(struct linux_binprm *bprm)
      {
              pid_t old_pid, old_vpid;
              int ret;
      
              /* Need to fetch pid before load_binary changes it */
              old_pid = current->pid;
   58         rcu_read_lock();
   58         old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
   58         rcu_read_unlock();
      
              ret = search_binary_handler(bprm);
              if (ret >= 0) {
                      audit_bprm(bprm);
                      trace_sched_process_exec(current, old_pid, bprm);
                      ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
                      proc_exec_connector(current);
              }
      
              return ret;
      }
      
      /*
       * sys_execve() executes a new program.
       */
      static int do_execveat_common(int fd, struct filename *filename,
                                    struct user_arg_ptr argv,
                                    struct user_arg_ptr envp,
                                    int flags)
      {
              char *pathbuf = NULL;
              struct linux_binprm *bprm;
              struct file *file;
              struct files_struct *displaced;
              int retval;
      
   94         if (IS_ERR(filename))
    1                 return PTR_ERR(filename);
      
              /*
               * We move the actual failure in case of RLIMIT_NPROC excess from
               * set*uid() to execve() because too many poorly written programs
               * don't check setuid() return code.  Here we additionally recheck
               * whether NPROC limit is still exceeded.
               */
   93         if ((current->flags & PF_NPROC_EXCEEDED) &&
                  atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
                      retval = -EAGAIN;
                      goto out_ret;
              }
      
              /* We're below the limit (still or again), so we don't want to make
               * further execve() calls fail. */
   93         current->flags &= ~PF_NPROC_EXCEEDED;
      
              retval = unshare_files(&displaced);
              if (retval)
                      goto out_ret;
      
              retval = -ENOMEM;
   93         bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
              if (!bprm)
                      goto out_files;
      
   93         retval = prepare_bprm_creds(bprm);
              if (retval)
                      goto out_free;
      
   93         check_unsafe_exec(bprm);
              current->in_execve = 1;
      
              file = do_open_execat(fd, filename, flags);
   25         retval = PTR_ERR(file);
              if (IS_ERR(file))
                      goto out_unmark;
      
   70         sched_exec();
      
              bprm->file = file;
   49         if (fd == AT_FDCWD || filename->name[0] == '/') {
   20                 bprm->filename = filename->name;
              } else {
   49                 if (filename->name[0] == '\0')
   48                         pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
                      else
    1                         pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
                                                  fd, filename->name);
   49                 if (!pathbuf) {
                              retval = -ENOMEM;
                              goto out_unmark;
                      }
                      /*
                       * Record that a name derived from an O_CLOEXEC fd will be
                       * inaccessible after exec. Relies on having exclusive access to
                       * current->files (due to unshare_files above).
                       */
   49                 if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
    1                         bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
   69                 bprm->filename = pathbuf;
              }
   69         bprm->interp = bprm->filename;
      
   69         retval = bprm_mm_init(bprm);
              if (retval)
                      goto out_unmark;
      
              bprm->argc = count(argv, MAX_ARG_STRINGS);
              if ((retval = bprm->argc) < 0)
                      goto out;
      
   65         bprm->envc = count(envp, MAX_ARG_STRINGS);
              if ((retval = bprm->envc) < 0)
                      goto out;
      
   64         retval = prepare_binprm(bprm);
              if (retval < 0)
                      goto out;
      
   63         retval = copy_strings_kernel(1, &bprm->filename, bprm);
              if (retval < 0)
                      goto out;
      
   63         bprm->exec = bprm->p;
              retval = copy_strings(bprm->envc, envp, bprm);
              if (retval < 0)
                      goto out;
      
   63         retval = copy_strings(bprm->argc, argv, bprm);
              if (retval < 0)
                      goto out;
      
   58         would_dump(bprm, bprm->file);
      
   58         retval = exec_binprm(bprm);
              if (retval < 0)
                      goto out;
      
              /* execve succeeded */
              current->fs->in_exec = 0;
              current->in_execve = 0;
              acct_update_integrals(current);
              task_numa_free(current, false);
              free_bprm(bprm);
              kfree(pathbuf);
              putname(filename);
              if (displaced)
                      put_files_struct(displaced);
              return retval;
      
      out:
   30         if (bprm->mm) {
   30                 acct_arg_size(bprm, 0);
   30                 mmput(bprm->mm);
              }
      
      out_unmark:
   53         current->fs->in_exec = 0;
              current->in_execve = 0;
      
      out_free:
   53         free_bprm(bprm);
              kfree(pathbuf);
      
      out_files:
   53         if (displaced)
   52                 reset_files_struct(displaced);
      out_ret:
   42         putname(filename);
   43         return retval;
      }
      
      int do_execve(struct filename *filename,
              const char __user *const __user *__argv,
              const char __user *const __user *__envp)
      {
              struct user_arg_ptr argv = { .ptr.native = __argv };
              struct user_arg_ptr envp = { .ptr.native = __envp };
              return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
      }
      
      int do_execveat(int fd, struct filename *filename,
                      const char __user *const __user *__argv,
                      const char __user *const __user *__envp,
                      int flags)
      {
              struct user_arg_ptr argv = { .ptr.native = __argv };
              struct user_arg_ptr envp = { .ptr.native = __envp };
      
              return do_execveat_common(fd, filename, argv, envp, flags);
      }
      
      #ifdef CONFIG_COMPAT
      static int compat_do_execve(struct filename *filename,
              const compat_uptr_t __user *__argv,
              const compat_uptr_t __user *__envp)
      {
              struct user_arg_ptr argv = {
                      .is_compat = true,
                      .ptr.compat = __argv,
              };
              struct user_arg_ptr envp = {
                      .is_compat = true,
                      .ptr.compat = __envp,
              };
              return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
      }
      
      static int compat_do_execveat(int fd, struct filename *filename,
                                    const compat_uptr_t __user *__argv,
                                    const compat_uptr_t __user *__envp,
                                    int flags)
      {
              struct user_arg_ptr argv = {
                      .is_compat = true,
                      .ptr.compat = __argv,
              };
              struct user_arg_ptr envp = {
                      .is_compat = true,
                      .ptr.compat = __envp,
              };
              return do_execveat_common(fd, filename, argv, envp, flags);
      }
      #endif
      
      void set_binfmt(struct linux_binfmt *new)
      {
              struct mm_struct *mm = current->mm;
      
              if (mm->binfmt)
                      module_put(mm->binfmt->module);
      
              mm->binfmt = new;
              if (new)
                      __module_get(new->module);
      }
      EXPORT_SYMBOL(set_binfmt);
      
      /*
        * set_dumpable stores a three-value SUID_DUMP_* setting into mm->flags.
       */
    1 void set_dumpable(struct mm_struct *mm, int value)
      {
              unsigned long old, new;
      
    1         if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                      return;
      
              do {
    1                 old = ACCESS_ONCE(mm->flags);
                      new = (old & ~MMF_DUMPABLE_MASK) | value;
    1         } while (cmpxchg(&mm->flags, old, new) != old);
      }
      
      SYSCALL_DEFINE3(execve,
                      const char __user *, filename,
                      const char __user *const __user *, argv,
                      const char __user *const __user *, envp)
      {
              return do_execve(getname(filename), argv, envp);
      }
      
      SYSCALL_DEFINE5(execveat,
                      int, fd, const char __user *, filename,
                      const char __user *const __user *, argv,
                      const char __user *const __user *, envp,
                      int, flags)
      {
              int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
      
              return do_execveat(fd,
                                 getname_flags(filename, lookup_flags, NULL),
                                 argv, envp, flags);
      }
      
      #ifdef CONFIG_COMPAT
   28 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
              const compat_uptr_t __user *, argv,
              const compat_uptr_t __user *, envp)
      {
              return compat_do_execve(getname(filename), argv, envp);
      }
      
   67 COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                             const char __user *, filename,
                             const compat_uptr_t __user *, argv,
                             const compat_uptr_t __user *, envp,
                             int,  flags)
      {
              int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
      
              return compat_do_execveat(fd,
                                        getname_flags(filename, lookup_flags, NULL),
                                        argv, envp, flags);
      }
      #endif
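
       /*
        * Illustrative user-space sketch (not part of this file): invoking the
        * execveat() system call defined above with AT_EMPTY_PATH, i.e. executing
        * an already-open file descriptor (fexecve()-style).  Assumes the libc
        * headers expose SYS_execveat; otherwise the syscall number would have to
        * be supplied by hand.  Note that opening with O_CLOEXEC is what leads to
        * the BINPRM_FLAGS_PATH_INACCESSIBLE case handled above.
        */
       #define _GNU_SOURCE
       #include <fcntl.h>
       #include <stdio.h>
       #include <sys/syscall.h>
       #include <unistd.h>

       int main(void)
       {
               char *argv[] = { "true", NULL };
               char *envp[] = { NULL };
               int fd = open("/bin/true", O_RDONLY | O_CLOEXEC);

               if (fd < 0) {
                       perror("open");
                       return 1;
               }
               /* Empty pathname + AT_EMPTY_PATH means "execute fd itself". */
               syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
               perror("execveat");             /* only reached on failure */
               return 1;
       }
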
      /*
       * "splice": joining two ropes together by interweaving their strands.
       *
       * This is the "extended pipe" functionality, where a pipe is used as
       * an arbitrary in-memory buffer. Think of a pipe as a small kernel
       * buffer that you can use to transfer data from one end to the other.
       *
       * The traditional unix read/write is extended with a "splice()" operation
       * that transfers data buffers to or from a pipe buffer.
       *
       * Named by Larry McVoy, original implementation from Linus, extended by
       * Jens to support splicing to files, network, direct splicing, etc and
       * fixing lots of bugs.
       *
       * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
       * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
       * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
       *
       */
      #include <linux/fs.h>
      #include <linux/file.h>
      #include <linux/pagemap.h>
      #include <linux/splice.h>
      #include <linux/memcontrol.h>
      #include <linux/mm_inline.h>
      #include <linux/swap.h>
      #include <linux/writeback.h>
      #include <linux/export.h>
      #include <linux/syscalls.h>
      #include <linux/uio.h>
      #include <linux/security.h>
      #include <linux/gfp.h>
      #include <linux/socket.h>
      #include <linux/compat.h>
      #include "internal.h"
      
      /*
       * Attempt to steal a page from a pipe buffer. This should perhaps go into
        * a vm helper function; it's already simplified quite a bit by the
       * addition of remove_mapping(). If success is returned, the caller may
       * attempt to reuse this page for another destination.
       */
      static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
                                           struct pipe_buffer *buf)
      {
              struct page *page = buf->page;
              struct address_space *mapping;
      
              lock_page(page);
      
              mapping = page_mapping(page);
              if (mapping) {
                      WARN_ON(!PageUptodate(page));
      
                      /*
                       * At least for ext2 with nobh option, we need to wait on
                       * writeback completing on this page, since we'll remove it
                        * from the pagecache.  Otherwise truncate won't wait on the
                       * page, allowing the disk blocks to be reused by someone else
                       * before we actually wrote our data to them. fs corruption
                       * ensues.
                       */
                      wait_on_page_writeback(page);
      
                      if (page_has_private(page) &&
                          !try_to_release_page(page, GFP_KERNEL))
                              goto out_unlock;
      
                      /*
                       * If we succeeded in removing the mapping, set LRU flag
                       * and return good.
                       */
                      if (remove_mapping(mapping, page)) {
                              buf->flags |= PIPE_BUF_FLAG_LRU;
                              return 0;
                      }
              }
      
              /*
               * Raced with truncate or failed to remove page from current
               * address space, unlock and return failure.
               */
      out_unlock:
              unlock_page(page);
              return 1;
      }
      
      static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
                                              struct pipe_buffer *buf)
      {
  957         page_cache_release(buf->page);
              buf->flags &= ~PIPE_BUF_FLAG_LRU;
      }
      
      /*
        * Check whether the contents of buf are OK to access. Since the content
       * is a page cache page, IO may be in flight.
       */
      static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
                                             struct pipe_buffer *buf)
      {
  947         struct page *page = buf->page;
              int err;
      
  947         if (!PageUptodate(page)) {
  278                 lock_page(page);
      
                      /*
                       * Page got truncated/unhashed. This will cause a 0-byte
                       * splice, if this is the first page.
                       */
  278                 if (!page->mapping) {
                              err = -ENODATA;
                              goto error;
                      }
      
                      /*
                       * Uh oh, read-error from disk.
                       */
  274                 if (!PageUptodate(page)) {
                              err = -EIO;
                              goto error;
                      }
      
                      /*
                        * Page is OK after all, we are done.
                       */
  274                 unlock_page(page);
              }
      
              return 0;
      error:
  194         unlock_page(page);
  194         return err;
      }
      
      const struct pipe_buf_operations page_cache_pipe_buf_ops = {
              .can_merge = 0,
              .confirm = page_cache_pipe_buf_confirm,
              .release = page_cache_pipe_buf_release,
              .steal = page_cache_pipe_buf_steal,
              .get = generic_pipe_buf_get,
      };
      
      static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
                                          struct pipe_buffer *buf)
      {
              if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
                      return 1;
      
              buf->flags |= PIPE_BUF_FLAG_LRU;
              return generic_pipe_buf_steal(pipe, buf);
      }
      
      static const struct pipe_buf_operations user_page_pipe_buf_ops = {
              .can_merge = 0,
              .confirm = generic_pipe_buf_confirm,
              .release = page_cache_pipe_buf_release,
              .steal = user_page_pipe_buf_steal,
              .get = generic_pipe_buf_get,
      };
      
      static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
      {
   50         smp_mb();
              if (waitqueue_active(&pipe->wait))
   11                 wake_up_interruptible(&pipe->wait);
   50         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
      }
      
      /**
       * splice_to_pipe - fill passed data into a pipe
       * @pipe:        pipe to fill
       * @spd:        data to fill
       *
       * Description:
       *    @spd contains a map of pages and len/offset tuples, along with
       *    the struct pipe_buf_operations associated with these pages. This
       *    function will link that data to the pipe.
       *
       */
      ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
                             struct splice_pipe_desc *spd)
      {
 1189         unsigned int spd_pages = spd->nr_pages;
              int ret, do_wakeup, page_nr;
      
 1188         if (!spd_pages)
                      return 0;
      
              ret = 0;
              do_wakeup = 0;
              page_nr = 0;
      
 1189         pipe_lock(pipe);
      
              for (;;) {
  990                 if (!pipe->readers) {
    2                         send_sig(SIGPIPE, current, 0);
                              if (!ret)
                                      ret = -EPIPE;
                              break;
                      }
      
 1188                 if (pipe->nrbufs < pipe->buffers) {
 1185                         int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
                              struct pipe_buffer *buf = pipe->bufs + newbuf;
      
                              buf->page = spd->pages[page_nr];
                              buf->offset = spd->partial[page_nr].offset;
                              buf->len = spd->partial[page_nr].len;
                              buf->private = spd->partial[page_nr].private;
                              buf->ops = spd->ops;
                              buf->flags = 0;
                              if (spd->flags & SPLICE_F_GIFT)
    2                                 buf->flags |= PIPE_BUF_FLAG_GIFT;
      
 1185                         pipe->nrbufs++;
                              page_nr++;
                              ret += buf->len;
      
                              if (pipe->files)
                                      do_wakeup = 1;
      
 1185                         if (!--spd->nr_pages)
                                      break;
 1185                         if (pipe->nrbufs < pipe->buffers)
                                      continue;
      
                              break;
                      }
      
    6                 if (spd->flags & SPLICE_F_NONBLOCK) {
    1                         if (!ret)
                                      ret = -EAGAIN;
                              break;
                      }
      
    5                 if (signal_pending(current)) {
    1                         if (!ret)
                                      ret = -ERESTARTSYS;
                              break;
                      }
      
    5                 if (do_wakeup) {
                              smp_mb();
                              if (waitqueue_active(&pipe->wait))
                                      wake_up_interruptible_sync(&pipe->wait);
                              kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                              do_wakeup = 0;
                      }
      
    5                 pipe->waiting_writers++;
                      pipe_wait(pipe);
                      pipe->waiting_writers--;
              }
      
 1187         pipe_unlock(pipe);
      
              if (do_wakeup)
   44                 wakeup_pipe_readers(pipe);
      
 1188         while (page_nr < spd_pages)
    5                 spd->spd_release(spd, page_nr++);
      
              return ret;
      }
      EXPORT_SYMBOL_GPL(splice_to_pipe);
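
       /*
        * Illustrative sketch (not part of this file): feeding one kernel-allocated
        * page into a pipe with splice_to_pipe(), roughly the way drivers that
        * provide their own ->splice_read() do.  The buffer operations and the
        * release hook are borrowed from this file; everything else is schematic.
        */
       static ssize_t example_splice_one_page(struct pipe_inode_info *pipe,
                                              unsigned int flags, size_t len)
       {
               struct page *pages[1];
               struct partial_page partial[1];
               struct splice_pipe_desc spd = {
                       .pages          = pages,
                       .partial        = partial,
                       .nr_pages       = 1,
                       .nr_pages_max   = 1,
                       .flags          = flags,
                       .ops            = &nosteal_pipe_buf_ops,
                       .spd_release    = spd_release_page,
               };

               pages[0] = alloc_page(GFP_KERNEL);
               if (!pages[0])
                       return -ENOMEM;

               /* ... fill the page with up to PAGE_SIZE bytes of data ... */
               partial[0].offset = 0;
               partial[0].len = min_t(size_t, len, PAGE_SIZE);
               partial[0].private = 0;

               /*
                * splice_to_pipe() takes over the page reference; any page it
                * could not place in the pipe is dropped via ->spd_release().
                */
               return splice_to_pipe(pipe, &spd);
       }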
      
      void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
      {
    4         page_cache_release(spd->pages[i]);
      }
      
      /*
       * Check if we need to grow the arrays holding pages and partial page
       * descriptions.
       */
      int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
      {
 1233         unsigned int buffers = ACCESS_ONCE(pipe->buffers);
      
              spd->nr_pages_max = buffers;
              if (buffers <= PIPE_DEF_BUFFERS)
 1233                 return 0;
      
    5         spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
              spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
      
    5         if (spd->pages && spd->partial)
                      return 0;
      
              kfree(spd->pages);
              kfree(spd->partial);
              return -ENOMEM;
      }
      
      void splice_shrink_spd(struct splice_pipe_desc *spd)
      {
 1232         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
                      return;
      
    5         kfree(spd->pages);
              kfree(spd->partial);
      }
      
      static int
      __generic_file_splice_read(struct file *in, loff_t *ppos,
                                 struct pipe_inode_info *pipe, size_t len,
                                 unsigned int flags)
      {
  546         struct address_space *mapping = in->f_mapping;
              unsigned int loff, nr_pages, req_pages;
              struct page *pages[PIPE_DEF_BUFFERS];
              struct partial_page partial[PIPE_DEF_BUFFERS];
              struct page *page;
              pgoff_t index, end_index;
              loff_t isize;
              int error, page_nr;
              struct splice_pipe_desc spd = {
                      .pages = pages,
                      .partial = partial,
                      .nr_pages_max = PIPE_DEF_BUFFERS,
                      .flags = flags,
                      .ops = &page_cache_pipe_buf_ops,
                      .spd_release = spd_release_page,
              };
      
              if (splice_grow_spd(pipe, &spd))
                      return -ENOMEM;
      
  546         index = *ppos >> PAGE_CACHE_SHIFT;
              loff = *ppos & ~PAGE_CACHE_MASK;
              req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
              nr_pages = min(req_pages, spd.nr_pages_max);
      
              /*
                * Look up the (hopefully) full range of pages we need.
               */
              spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
              index += spd.nr_pages;
      
              /*
               * If find_get_pages_contig() returned fewer pages than we needed,
               * readahead/allocate the rest and fill in the holes.
               */
              if (spd.nr_pages < nr_pages)
                      page_cache_sync_readahead(mapping, &in->f_ra, in,
  458                                 index, req_pages - spd.nr_pages);
      
              error = 0;
  546         while (spd.nr_pages < nr_pages) {
                      /*
                        * Page could be there; find_get_pages_contig() breaks on
                        * the first hole.
                       */
  458                 page = find_get_page(mapping, index);
                      if (!page) {
                              /*
                               * page didn't exist, allocate one.
                               */
  280                         page = page_cache_alloc_cold(mapping);
                              if (!page)
                                      break;
      
  280                         error = add_to_page_cache_lru(page, mapping, index,
                                         mapping_gfp_constraint(mapping, GFP_KERNEL));
                              if (unlikely(error)) {
    1                                 page_cache_release(page);
                                      if (error == -EEXIST)
                                              continue;
                                      break;
                              }
                              /*
                                * add_to_page_cache() locks the page; unlock it
                               * to avoid convoluting the logic below even more.
                               */
  280                         unlock_page(page);
                      }
      
  458                 spd.pages[spd.nr_pages++] = page;
                      index++;
              }
      
              /*
               * Now loop over the map and see if we need to start IO on any
               * pages, fill in the partial map, etc.
               */
  546         index = *ppos >> PAGE_CACHE_SHIFT;
              nr_pages = spd.nr_pages;
              spd.nr_pages = 0;
              for (page_nr = 0; page_nr < nr_pages; page_nr++) {
                      unsigned int this_len;
      
  546                 if (!len)
                              break;
      
                      /*
                       * this_len is the max we'll use from this page
                       */
  546                 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
                      page = spd.pages[page_nr];
      
                      if (PageReadahead(page))
                              page_cache_async_readahead(mapping, &in->f_ra, in,
  145                                         page, index, req_pages - page_nr);
      
                      /*
                       * If the page isn't uptodate, we may need to start io on it
                       */
  546                 if (!PageUptodate(page)) {
  310                         lock_page(page);
      
                              /*
                               * Page was truncated, or invalidated by the
                               * filesystem.  Redo the find/create, but this time the
                               * page is kept locked, so there's no chance of another
                               * race with truncate/invalidate.
                               */
  310                         if (!page->mapping) {
  206                                 unlock_page(page);
                                      page = find_or_create_page(mapping, index,
                                                      mapping_gfp_mask(mapping));
      
                                      if (!page) {
                                              error = -ENOMEM;
                                              break;
                                      }
  206                                 page_cache_release(spd.pages[page_nr]);
  546                                 spd.pages[page_nr] = page;
                              }
                              /*
                               * page was already under io and is now done, great
                               */
  310                         if (PageUptodate(page)) {
                                      unlock_page(page);
                                      goto fill_it;
                              }
      
                              /*
                               * need to read in the page
                               */
  289                         error = mapping->a_ops->readpage(in, page);
                              if (unlikely(error)) {
                                      /*
                                       * We really should re-lookup the page here,
                                       * but it complicates things a lot. Instead
                                        * let's just do what we already stored, and
                                       * we'll get it the next time we are called.
                                       */
                                      if (error == AOP_TRUNCATED_PAGE)
                                              error = 0;
      
                                      break;
                              }
                      }
      fill_it:
                      /*
                       * i_size must be checked after PageUptodate.
                       */
  546                 isize = i_size_read(mapping->host);
                      end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
  546                 if (unlikely(!isize || index > end_index))
                              break;
      
                      /*
                       * if this is the last page, see if we need to shrink
                       * the length and stop
                       */
  546                 if (end_index == index) {
                              unsigned int plen;
      
                              /*
                               * max good bytes in this page
                               */
  377                         plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
                              if (plen <= loff)
                                      break;
      
                              /*
                               * force quit after adding this page
                               */
  377                         this_len = min(this_len, plen - loff);
                              len = this_len;
                      }
      
  546                 spd.partial[page_nr].offset = loff;
                      spd.partial[page_nr].len = this_len;
                      len -= this_len;
                      loff = 0;
                      spd.nr_pages++;
                      index++;
              }
      
              /*
               * Release any pages at the end, if we quit early. 'page_nr' is how far
               * we got, 'nr_pages' is how many pages are in the map.
               */
   19         while (page_nr < nr_pages)
   19                 page_cache_release(spd.pages[page_nr++]);
  546         in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
      
              if (spd.nr_pages)
  546                 error = splice_to_pipe(pipe, &spd);
      
  546         splice_shrink_spd(&spd);
              return error;
      }
      
      /**
       * generic_file_splice_read - splice data from file to a pipe
       * @in:                file to splice from
       * @ppos:        position in @in
       * @pipe:        pipe to splice to
       * @len:        number of bytes to splice
       * @flags:        splice modifier flags
       *
       * Description:
        *    Will read pages from the given file and fill them into a pipe. Can be
        *    used as long as the address_space operations for the source implement
        *    a readpage() hook.
       *
       */
      ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
                                       struct pipe_inode_info *pipe, size_t len,
                                       unsigned int flags)
      {
              loff_t isize, left;
              int ret;
      
              if (IS_DAX(in->f_mapping->host))
                      return default_file_splice_read(in, ppos, pipe, len, flags);
      
  552         isize = i_size_read(in->f_mapping->host);
              if (unlikely(*ppos >= isize))
                      return 0;
      
  546         left = isize - *ppos;
              if (unlikely(left < len))
                      len = left;
      
              ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
              if (ret > 0) {
  546                 *ppos += ret;
  552                 file_accessed(in);
              }
      
              return ret;
   19 }
      EXPORT_SYMBOL(generic_file_splice_read);
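
       /*
        * Illustrative sketch (not part of this file): how a filesystem typically
        * wires up generic_file_splice_read().  As the kerneldoc above notes, it
        * only needs a working ->readpage() in the file's address_space
        * operations; the structure below is schematic and not taken from any
        * particular filesystem.
        */
       static const struct file_operations example_fops = {
               .llseek         = generic_file_llseek,
               .read_iter      = generic_file_read_iter,
               .mmap           = generic_file_mmap,
               .splice_read    = generic_file_splice_read,
       };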
      
      static const struct pipe_buf_operations default_pipe_buf_ops = {
              .can_merge = 0,
              .confirm = generic_pipe_buf_confirm,
              .release = generic_pipe_buf_release,
              .steal = generic_pipe_buf_steal,
              .get = generic_pipe_buf_get,
      };
      
      static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
                                          struct pipe_buffer *buf)
      {
              return 1;
      }
      
      /* Pipe buffer operations for a socket and similar. */
      const struct pipe_buf_operations nosteal_pipe_buf_ops = {
              .can_merge = 0,
              .confirm = generic_pipe_buf_confirm,
              .release = generic_pipe_buf_release,
              .steal = generic_pipe_buf_nosteal,
              .get = generic_pipe_buf_get,
      };
      EXPORT_SYMBOL(nosteal_pipe_buf_ops);
      
      static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
                                  unsigned long vlen, loff_t offset)
      {
              mm_segment_t old_fs;
              loff_t pos = offset;
              ssize_t res;
      
              old_fs = get_fs();
              set_fs(get_ds());
              /* The cast to a user pointer is valid due to the set_fs() */
              res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
              set_fs(old_fs);
      
              return res;
      }
      
      ssize_t kernel_write(struct file *file, const char *buf, size_t count,
                                  loff_t pos)
      {
              mm_segment_t old_fs;
              ssize_t res;
      
              old_fs = get_fs();
              set_fs(get_ds());
              /* The cast to a user pointer is valid due to the set_fs() */
              res = vfs_write(file, (__force const char __user *)buf, count, &pos);
              set_fs(old_fs);
      
              return res;
      }
      EXPORT_SYMBOL(kernel_write);
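
       /*
        * Illustrative sketch (not part of this file): kernel_write() as defined
        * above, used to dump a kernel buffer into a file opened with filp_open().
        * Writing files from the kernel is rarely the right design; this only
        * shows the calling convention.
        */
       static ssize_t example_dump_to_file(const char *path, const char *buf,
                                           size_t len)
       {
               struct file *file;
               ssize_t written;

               file = filp_open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
               if (IS_ERR(file))
                       return PTR_ERR(file);

               written = kernel_write(file, buf, len, 0);
               filp_close(file, NULL);
               return written;
       }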
      
      ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
                                       struct pipe_inode_info *pipe, size_t len,
                                       unsigned int flags)
      {
              unsigned int nr_pages;
              unsigned int nr_freed;
              size_t offset;
              struct page *pages[PIPE_DEF_BUFFERS];
              struct partial_page partial[PIPE_DEF_BUFFERS];
              struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
              ssize_t res;
              size_t this_len;
              int error;
              int i;
  301         struct splice_pipe_desc spd = {
                      .pages = pages,
                      .partial = partial,
                      .nr_pages_max = PIPE_DEF_BUFFERS,
                      .flags = flags,
                      .ops = &default_pipe_buf_ops,
                      .spd_release = spd_release_page,
              };
      
              if (splice_grow_spd(pipe, &spd))
                      return -ENOMEM;
      
              res = -ENOMEM;
              vec = __vec;
  301         if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
    1                 vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
                      if (!vec)
                              goto shrink_ret;
              }
      
  301         offset = *ppos & ~PAGE_CACHE_MASK;
              nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
      
  301         for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
                      struct page *page;
      
  301                 page = alloc_page(GFP_USER);
                      error = -ENOMEM;
                      if (!page)
                              goto err;
      
  301                 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
                      vec[i].iov_base = (void __user *) page_address(page);
                      vec[i].iov_len = this_len;
                      spd.pages[i] = page;
                      spd.nr_pages++;
                      len -= this_len;
                      offset = 0;
              }
      
  301         res = kernel_readv(in, vec, spd.nr_pages, *ppos);
              if (res < 0) {
   15                 error = res;
                      goto err;
              }
      
              error = 0;
  289         if (!res)
                      goto err;
      
              nr_freed = 0;
  243         for (i = 0; i < spd.nr_pages; i++) {
  243                 this_len = min_t(size_t, vec[i].iov_len, res);
                      spd.partial[i].offset = 0;
                      spd.partial[i].len = this_len;
                      if (!this_len) {
  119                         __free_page(spd.pages[i]);
                              spd.pages[i] = NULL;
                              nr_freed++;
                      }
  243                 res -= this_len;
              }
  243         spd.nr_pages -= nr_freed;
      
              res = splice_to_pipe(pipe, &spd);
              if (res > 0)
  292                 *ppos += res;
      
      shrink_ret:
  301         if (vec != __vec)
    1                 kfree(vec);
  301         splice_shrink_spd(&spd);
              return res;
      
      err:
  158         for (i = 0; i < spd.nr_pages; i++)
  158                 __free_page(spd.pages[i]);
      
              res = error;
              goto shrink_ret;
      }
      EXPORT_SYMBOL(default_file_splice_read);
      
      /*
       * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
       * using sendpage(). Return the number of bytes sent.
       */
      static int pipe_to_sendpage(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf, struct splice_desc *sd)
      {
  398         struct file *file = sd->u.file;
              loff_t pos = sd->pos;
              int more;
      
              if (!likely(file->f_op->sendpage))
                      return -EINVAL;
      
  398         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
      
  325         if (sd->len < sd->total_len && pipe->nrbufs > 1)
  274                 more |= MSG_SENDPAGE_NOTLAST;
      
  398         return file->f_op->sendpage(file, buf->page, buf->offset,
                                          sd->len, &pos, more);
      }
      
      static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
      {
  244         smp_mb();
              if (waitqueue_active(&pipe->wait))
  218                 wake_up_interruptible(&pipe->wait);
  244         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
      }
      
      /**
       * splice_from_pipe_feed - feed available data from a pipe to a file
       * @pipe:        pipe to splice from
       * @sd:                information to @actor
       * @actor:        handler that splices the data
       *
       * Description:
       *    This function loops over the pipe and calls @actor to do the
       *    actual moving of a single struct pipe_buffer to the desired
        *    destination.  It returns when there are no more buffers left in
       *    the pipe or if the requested number of bytes (@sd->total_len)
       *    have been copied.  It returns a positive number (one) if the
       *    pipe needs to be filled with more data, zero if the required
       *    number of bytes have been copied and -errno on error.
       *
       *    This, together with splice_from_pipe_{begin,end,next}, may be
       *    used to implement the functionality of __splice_from_pipe() when
       *    locking is required around copying the pipe buffers to the
       *    destination.
       */
      static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                                splice_actor *actor)
      {
              int ret;
      
  652         while (pipe->nrbufs) {
  652                 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
                      const struct pipe_buf_operations *ops = buf->ops;
      
                      sd->len = buf->len;
                      if (sd->len > sd->total_len)
   32                         sd->len = sd->total_len;
      
  652                 ret = buf->ops->confirm(pipe, buf);
                      if (unlikely(ret)) {
                              if (ret == -ENODATA)
                                      ret = 0;
                              return ret;
                      }
      
  652                 ret = actor(pipe, buf, sd);
                      if (ret <= 0)
                              return ret;
      
  613                 buf->offset += ret;
                      buf->len -= ret;
      
                      sd->num_spliced += ret;
                      sd->len -= ret;
                      sd->pos += ret;
                      sd->total_len -= ret;
      
                      if (!buf->len) {
  600                         buf->ops = NULL;
                              ops->release(pipe, buf);
                              pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
                              pipe->nrbufs--;
                              if (pipe->files)
  245                                 sd->need_wakeup = true;
                      }
      
  613                 if (!sd->total_len)
                              return 0;
              }
      
              return 1;
      }
      
      /**
       * splice_from_pipe_next - wait for some data to splice from
       * @pipe:        pipe to splice from
       * @sd:                information about the splice operation
       *
       * Description:
       *    This function will wait for some data and return a positive
       *    value (one) if pipe buffers are available.  It will return zero
       *    or -errno if no more data needs to be spliced.
       */
      static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
      {
              /*
               * Check for signal early to make process killable when there are
               * always buffers available
               */
 1385         if (signal_pending(current))
                      return -ERESTARTSYS;
      
 1385         while (!pipe->nrbufs) {
  228                 if (!pipe->writers)
                              return 0;
      
  226                 if (!pipe->waiting_writers && sd->num_spliced)
                              return 0;
      
  209                 if (sd->flags & SPLICE_F_NONBLOCK)
                              return -EAGAIN;
      
  208                 if (signal_pending(current))
 1384                         return -ERESTARTSYS;
      
  208                 if (sd->need_wakeup) {
  191                         wakeup_pipe_writers(pipe);
                              sd->need_wakeup = false;
                      }
      
  208                 pipe_wait(pipe);
              }
      
              return 1;
      }
      
      /**
       * splice_from_pipe_begin - start splicing from pipe
       * @sd:                information about the splice operation
       *
       * Description:
       *    This function should be called before a loop containing
       *    splice_from_pipe_next() and splice_from_pipe_feed() to
       *    initialize the necessary fields of @sd.
       */
      static void splice_from_pipe_begin(struct splice_desc *sd)
      {
  654         sd->num_spliced = 0;
              sd->need_wakeup = false;
      }
      
      /**
       * splice_from_pipe_end - finish splicing from pipe
       * @pipe:        pipe to splice from
       * @sd:                information about the splice operation
       *
       * Description:
       *    This function will wake up pipe writers if necessary.  It should
       *    be called after a loop containing splice_from_pipe_next() and
       *    splice_from_pipe_feed().
       */
      static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
      {
              if (sd->need_wakeup)
   61                 wakeup_pipe_writers(pipe);
      }
      
      /**
       * __splice_from_pipe - splice data from a pipe to given actor
       * @pipe:        pipe to splice from
       * @sd:                information to @actor
       * @actor:        handler that splices the data
       *
       * Description:
       *    This function does little more than loop over the pipe and call
       *    @actor to do the actual moving of a single struct pipe_buffer to
       *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
       *    pipe_to_user.
       *
       */
      ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
                                 splice_actor *actor)
      {
              int ret;
      
  654         splice_from_pipe_begin(sd);
              do {
  655                 cond_resched();
  655                 ret = splice_from_pipe_next(pipe, sd);
                      if (ret > 0)
  652                         ret = splice_from_pipe_feed(pipe, sd, actor);
              } while (ret > 0);
  487         splice_from_pipe_end(pipe, sd);
      
  487         return sd->num_spliced ? sd->num_spliced : ret;
      }
      EXPORT_SYMBOL(__splice_from_pipe);
      
      /**
       * splice_from_pipe - splice data from a pipe to a file
       * @pipe:        pipe to splice from
       * @out:        file to splice to
       * @ppos:        position in @out
       * @len:        how many bytes to splice
       * @flags:        splice modifier flags
       * @actor:        handler that splices the data
       *
       * Description:
       *    See __splice_from_pipe. This function locks the pipe inode,
       *    otherwise it's identical to __splice_from_pipe().
       *
       */
      ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                               loff_t *ppos, size_t len, unsigned int flags,
                               splice_actor *actor)
      {
              ssize_t ret;
  551         struct splice_desc sd = {
                      .total_len = len,
                      .flags = flags,
                      .pos = *ppos,
                      .u.file = out,
              };
      
              pipe_lock(pipe);
              ret = __splice_from_pipe(pipe, &sd, actor);
              pipe_unlock(pipe);
      
              return ret;
      }
      
      /**
       * iter_file_splice_write - splice data from a pipe to a file
       * @pipe:        pipe info
       * @out:        file to write to
       * @ppos:        position in @out
       * @len:        number of bytes to splice
       * @flags:        splice modifier flags
       *
       * Description:
       *    Will either move or copy pages (determined by @flags options) from
       *    the given pipe inode to the given file.
       *    This one is ->write_iter-based.
       *
       */
      ssize_t
      iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                                loff_t *ppos, size_t len, unsigned int flags)
      {
  751         struct splice_desc sd = {
                      .total_len = len,
                      .flags = flags,
                      .pos = *ppos,
                      .u.file = out,
              };
              int nbufs = pipe->buffers;
  751         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
                                              GFP_KERNEL);
              ssize_t ret;
      
              if (unlikely(!array))
                      return -ENOMEM;
      
  751         pipe_lock(pipe);
      
              splice_from_pipe_begin(&sd);
  745         while (sd.total_len) {
                      struct iov_iter from;
                      size_t left;
                      int n, idx;
      
  751                 ret = splice_from_pipe_next(pipe, &sd);
                      if (ret <= 0)
                              break;
      
  749                 if (unlikely(nbufs < pipe->buffers)) {
                              kfree(array);
                              nbufs = pipe->buffers;
                              array = kcalloc(nbufs, sizeof(struct bio_vec),
                                              GFP_KERNEL);
                              if (!array) {
                                      ret = -ENOMEM;
                                      break;
                              }
                      }
      
                      /* build the vector */
  749                 left = sd.total_len;
  749                 for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
  749                         struct pipe_buffer *buf = pipe->bufs + idx;
                              size_t this_len = buf->len;
      
                              if (this_len > left)
                                      this_len = left;
      
                              if (idx == pipe->buffers - 1)
                                      idx = -1;
      
  749                         ret = buf->ops->confirm(pipe, buf);
                              if (unlikely(ret)) {
  194                                 if (ret == -ENODATA)
                                              ret = 0;
  194                                 goto done;
                              }
      
  749                         array[n].bv_page = buf->page;
                              array[n].bv_len = this_len;
                              array[n].bv_offset = buf->offset;
                              left -= this_len;
                      }
      
  749                 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
                                    sd.total_len - left);
                      ret = vfs_iter_write(out, &from, &sd.pos);
                      if (ret <= 0)
                              break;
      
  745                 sd.num_spliced += ret;
                      sd.total_len -= ret;
                      *ppos = sd.pos;
      
                      /* dismiss the fully eaten buffers, adjust the partial one */
  744                 while (ret) {
  745                         struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
                              if (ret >= buf->len) {
  744                                 const struct pipe_buf_operations *ops = buf->ops;
                                      ret -= buf->len;
                                      buf->len = 0;
                                      buf->ops = NULL;
                                      ops->release(pipe, buf);
                                      pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
                                      pipe->nrbufs--;
                                      if (pipe->files)
    3                                         sd.need_wakeup = true;
                              } else {
   12                                 buf->offset += ret;
                                      buf->len -= ret;
   12                                 ret = 0;
                              }
                      }
              }
      done:
  749         kfree(array);
    2         splice_from_pipe_end(pipe, &sd);
      
  749         pipe_unlock(pipe);
      
  749         if (sd.num_spliced)
  746                 ret = sd.num_spliced;
      
              return ret;
      }
      
      EXPORT_SYMBOL(iter_file_splice_write);
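
       /*
        * Illustrative sketch (not part of the original source): a filesystem
        * whose files implement ->write_iter() can normally reuse
        * iter_file_splice_write() directly by wiring it into its
        * file_operations; the "examplefs" name below is made up:
        *
        *        static const struct file_operations examplefs_file_operations = {
        *                .read_iter        = generic_file_read_iter,
        *                .write_iter        = generic_file_write_iter,
        *                .splice_read        = generic_file_splice_read,
        *                .splice_write        = iter_file_splice_write,
        *        };
        *
        * A splice() into such a file then lands in iter_file_splice_write()
        * above, which batches pipe buffers into a bio_vec array and pushes
        * them through vfs_iter_write().
        */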
      
      static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                                struct splice_desc *sd)
      {
              int ret;
              void *data;
  157         loff_t tmp = sd->pos;
      
              data = kmap(buf->page);
              ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
              kunmap(buf->page);
      
              return ret;
      }
      
      static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
                                               struct file *out, loff_t *ppos,
                                               size_t len, unsigned int flags)
      {
              ssize_t ret;
      
  158         ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
              if (ret > 0)
  110                 *ppos += ret;
      
  135         return ret;
      }
      
      /**
       * generic_splice_sendpage - splice data from a pipe to a socket
       * @pipe:        pipe to splice from
       * @out:        socket to write to
       * @ppos:        position in @out
       * @len:        number of bytes to splice
       * @flags:        splice modifier flags
       *
       * Description:
       *    Will send @len bytes from the pipe to a network socket. No data copying
       *    is involved.
       *
       */
      ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
                                      loff_t *ppos, size_t len, unsigned int flags)
      {
  398         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
      }
      
      EXPORT_SYMBOL(generic_splice_sendpage);
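
       /*
        * Illustrative note (assumed caller, not part of the original source):
        * socket files are the main users of this helper. net/socket.c wires
        * it up roughly as
        *
        *        static const struct file_operations socket_file_ops = {
        *                ...
        *                .splice_write        = generic_splice_sendpage,
        *        };
        *
        * so that splicing from a pipe into a socket is routed through the
        * protocol's ->sendpage() by pipe_to_sendpage() without copying the
        * page contents.
        */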
      
      /*
       * Attempt to initiate a splice from pipe to file.
       */
      static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
                                 loff_t *ppos, size_t len, unsigned int flags)
      {
              ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
                                      loff_t *, size_t, unsigned int);
      
  146         if (out->f_op->splice_write)
                      splice_write = out->f_op->splice_write;
              else
                      splice_write = default_file_splice_write;
      
 1287         return splice_write(pipe, out, ppos, len, flags);
      }
      
      /*
       * Attempt to initiate a splice from a file to a pipe.
       */
      static long do_splice_to(struct file *in, loff_t *ppos,
                               struct pipe_inode_info *pipe, size_t len,
                               unsigned int flags)
      {
              ssize_t (*splice_read)(struct file *, loff_t *,
                                     struct pipe_inode_info *, size_t, unsigned int);
              int ret;
      
 1247         if (unlikely(!(in->f_mode & FMODE_READ)))
                      return -EBADF;
      
 1247         ret = rw_verify_area(READ, in, ppos, len);
              if (unlikely(ret < 0))
    1                 return ret;
      
 1246         if (in->f_op->splice_read)
                      splice_read = in->f_op->splice_read;
              else
                      splice_read = default_file_splice_read;
      
 1247         return splice_read(in, ppos, pipe, len, flags);
      }
      
      /**
       * splice_direct_to_actor - splices data directly between two non-pipes
       * @in:                file to splice from
       * @sd:                actor information on where to splice to
       * @actor:        handles the data splicing
       *
       * Description:
       *    This is a special case helper to splice directly between two
       *    points, without requiring an explicit pipe. Internally an allocated
       *    pipe is cached in the process, and reused during the lifetime of
       *    that process.
       *
       */
      ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                                     splice_direct_actor *actor)
      {
              struct pipe_inode_info *pipe;
              long ret, bytes;
              umode_t i_mode;
              size_t len;
              int i, flags, more;
      
              /*
                * We require the input to be a regular file, as we don't want to
                * randomly drop data for e.g. socket -> socket splicing. Use
                * piped splicing for that!
               */
 1205         i_mode = file_inode(in)->i_mode;
              if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
                      return -EINVAL;
      
              /*
               * neither in nor out is a pipe, setup an internal pipe attached to
               * 'out' and transfer the wanted data from 'in' to 'out' through that
               */
 1204         pipe = current->splice_pipe;
              if (unlikely(!pipe)) {
 1168                 pipe = alloc_pipe_info();
                      if (!pipe)
                              return -ENOMEM;
      
                      /*
                       * We don't have an immediate reader, but we'll read the stuff
                       * out of the pipe right after the splice_to_pipe(). So set
                       * PIPE_READERS appropriately.
                       */
 1168                 pipe->readers = 1;
      
                      current->splice_pipe = pipe;
              }
      
              /*
               * Do the splice.
               */
              ret = 0;
              bytes = 0;
 1204         len = sd->total_len;
              flags = sd->flags;
      
              /*
               * Don't block on output, we have to drain the direct pipe.
               */
              sd->flags &= ~SPLICE_F_NONBLOCK;
              more = sd->flags & SPLICE_F_MORE;
      
 1202         while (len) {
                      size_t read_len;
 1203                 loff_t pos = sd->pos, prev_pos = pos;
      
                      ret = do_splice_to(in, &pos, pipe, len, flags);
                      if (unlikely(ret <= 0))
  639                         goto out_release;
      
 1141                 read_len = ret;
                      sd->total_len = read_len;
      
                      /*
                        * If more data is pending, set SPLICE_F_MORE.
                        * If this is the last chunk and SPLICE_F_MORE was not
                        * set initially, clear it.
                       */
                      if (read_len < len)
 1093                         sd->flags |= SPLICE_F_MORE;
   96                 else if (!more)
   96                         sd->flags &= ~SPLICE_F_MORE;
                      /*
                       * NOTE: nonblocking mode only applies to the input. We
                       * must not do the output in nonblocking mode as then we
                       * could get stuck data in the internal pipe:
                       */
 1141                 ret = actor(pipe, sd);
                      if (unlikely(ret <= 0)) {
  313                         sd->pos = prev_pos;
                              goto out_release;
                      }
      
 1082                 bytes += ret;
                      len -= ret;
                      sd->pos = pos;
      
 1071                 if (ret < read_len) {
   40                         sd->pos = prev_pos + ret;
                              goto out_release;
                      }
              }
      
      done:
  704         pipe->nrbufs = pipe->curbuf = 0;
  705         file_accessed(in);
              return bytes;
      
      out_release:
              /*
               * If we did an incomplete transfer we must release
               * the pipe buffers in question:
               */
  639         for (i = 0; i < pipe->buffers; i++) {
  639                 struct pipe_buffer *buf = pipe->bufs + i;
      
                      if (buf->ops) {
  350                         buf->ops->release(pipe, buf);
                              buf->ops = NULL;
                      }
              }
      
  639         if (!bytes)
                      bytes = ret;
      
              goto done;
      }
      EXPORT_SYMBOL(splice_direct_to_actor);
      
      static int direct_splice_actor(struct pipe_inode_info *pipe,
                                     struct splice_desc *sd)
      {
 1141         struct file *file = sd->u.file;
      
 1141         return do_splice_from(pipe, file, sd->opos, sd->total_len,
                                    sd->flags);
      }
      
      /**
       * do_splice_direct - splices data directly between two files
       * @in:                file to splice from
       * @ppos:        input file offset
       * @out:        file to splice to
       * @opos:        output file offset
       * @len:        number of bytes to splice
       * @flags:        splice modifier flags
       *
       * Description:
       *    For use by do_sendfile(). splice can easily emulate sendfile, but
       *    doing it in the application would incur an extra system call
       *    (splice in + splice out, as compared to just sendfile()). So this helper
       *    can splice directly through a process-private pipe.
       *
       */
      long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                            loff_t *opos, size_t len, unsigned int flags)
      {
 1206         struct splice_desc sd = {
                      .len                = len,
                      .total_len        = len,
                      .flags                = flags,
                      .pos                = *ppos,
                      .u.file                = out,
                      .opos                = opos,
              };
              long ret;
      
              if (unlikely(!(out->f_mode & FMODE_WRITE)))
                      return -EBADF;
      
 1206         if (unlikely(out->f_flags & O_APPEND))
                      return -EINVAL;
      
 1205         ret = rw_verify_area(WRITE, out, opos, len);
              if (unlikely(ret < 0))
                      return ret;
      
 1205         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
              if (ret > 0)
  706                 *ppos = sd.pos;
      
              return ret;
      }
      EXPORT_SYMBOL(do_splice_direct);
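
       /*
        * Illustrative sketch (assumed caller, not part of the original
        * source): do_sendfile() in fs/read_write.c is the main user of this
        * helper and invokes it roughly like
        *
        *        file_start_write(out.file);
        *        retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
        *                                  count, fl);
        *        file_end_write(out.file);
        *
        * i.e. sendfile(2) is emulated by a single in-kernel splice through
        * the per-task pipe cached by splice_direct_to_actor().
        */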
      
      static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                                     struct pipe_inode_info *opipe,
                                     size_t len, unsigned int flags);
      
      /*
       * Determine where to splice to/from.
       */
      static long do_splice(struct file *in, loff_t __user *off_in,
                            struct file *out, loff_t __user *off_out,
                            size_t len, unsigned int flags)
      {
              struct pipe_inode_info *ipipe;
              struct pipe_inode_info *opipe;
              loff_t offset;
              long ret;
      
              ipipe = get_pipe_info(in);
              opipe = get_pipe_info(out);
      
  157         if (ipipe && opipe) {
    8                 if (off_in || off_out)
    1                         return -ESPIPE;
      
    7                 if (!(in->f_mode & FMODE_READ))
                              return -EBADF;
      
    7                 if (!(out->f_mode & FMODE_WRITE))
                              return -EBADF;
      
                      /* Splicing to self would be fun, but... */
    7                 if (ipipe == opipe)
                              return -EINVAL;
      
    5                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
              }
      
              if (ipipe) {
  150                 if (off_in)
                              return -ESPIPE;
                      if (off_out) {
    4                         if (!(out->f_mode & FMODE_PWRITE))
                                      return -EINVAL;
    3                         if (copy_from_user(&offset, off_out, sizeof(loff_t)))
  153                                 return -EFAULT;
                      } else {
  145                         offset = out->f_pos;
                      }
      
  148                 if (unlikely(!(out->f_mode & FMODE_WRITE)))
                              return -EBADF;
      
  148                 if (unlikely(out->f_flags & O_APPEND))
                              return -EINVAL;
      
  147                 ret = rw_verify_area(WRITE, out, &offset, len);
                      if (unlikely(ret < 0))
                              return ret;
      
  146                 file_start_write(out);
  146                 ret = do_splice_from(ipipe, out, &offset, len, flags);
   22                 file_end_write(out);
      
   96                 if (!off_out)
   95                         out->f_pos = offset;
    1                 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
                              ret = -EFAULT;
      
                      return ret;
              }
      
   47         if (opipe) {
   46                 if (off_out)
                              return -ESPIPE;
   45                 if (off_in) {
    5                         if (!(in->f_mode & FMODE_PREAD))
                                      return -EINVAL;
                              if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                                      return -EFAULT;
                      } else {
   40                         offset = in->f_pos;
                      }
      
    4                 ret = do_splice_to(in, &offset, opipe, len, flags);
      
                      if (!off_in)
                              in->f_pos = offset;
                      else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
                              ret = -EFAULT;
      
                      return ret;
              }
      
              return -EINVAL;
      }
      
      /*
        * Map an iov into an array of pages and offset/length tuples. With the
        * partial_page structure, we can map several non-contiguous ranges into
        * our one pages[] map instead of splitting that operation into pieces.
       * Could easily be exported as a generic helper for other users, in which
       * case one would probably want to add a 'max_nr_pages' parameter as well.
       */
      static int get_iovec_page_array(const struct iovec __user *iov,
                                      unsigned int nr_vecs, struct page **pages,
                                      struct partial_page *partial, bool aligned,
                                      unsigned int pipe_buffers)
      {
              int buffers = 0, error = 0;
      
              while (nr_vecs) {
                      unsigned long off, npages;
                      struct iovec entry;
                      void __user *base;
                      size_t len;
                      int i;
      
                      error = -EFAULT;
   18                 if (copy_from_user(&entry, iov, sizeof(entry)))
                              break;
      
   18                 base = entry.iov_base;
                      len = entry.iov_len;
      
                      /*
                       * Sanity check this iovec. 0 read succeeds.
                       */
                      error = 0;
                      if (unlikely(!len))
                              break;
   13                 error = -EFAULT;
   15                 if (!access_ok(VERIFY_READ, base, len))
                              break;
      
                      /*
                       * Get this base offset and number of pages, then map
                       * in the user pages.
                       */
   15                 off = (unsigned long) base & ~PAGE_MASK;
      
                      /*
                       * If asked for alignment, the offset must be zero and the
                       * length a multiple of the PAGE_SIZE.
                       */
                      error = -EINVAL;
                      if (aligned && (off || len & ~PAGE_MASK))
                              break;
      
                      npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
                      if (npages > pipe_buffers - buffers)
                              npages = pipe_buffers - buffers;
      
                      error = get_user_pages_fast((unsigned long)base, npages,
                                              0, &pages[buffers]);
      
                      if (unlikely(error <= 0))
                              break;
      
                      /*
                       * Fill this contiguous range into the partial page map.
                       */
                      for (i = 0; i < error; i++) {
   15                         const int plen = min_t(size_t, len, PAGE_SIZE - off);
      
                              partial[buffers].offset = off;
                              partial[buffers].len = plen;
      
                              off = 0;
                              len -= plen;
                              buffers++;
                      }
      
                      /*
                       * We didn't complete this iov, stop here since it probably
                       * means we have to move some of this into a pipe to
                       * be able to continue.
                       */
   15                 if (len)
                              break;
      
                      /*
                       * Don't continue if we mapped fewer pages than we asked for,
                       * or if we mapped the max number of pages that we have
                       * room for.
                       */
   11                 if (error < npages || buffers == pipe_buffers)
                              break;
      
    8                 nr_vecs--;
                      iov++;
              }
      
   18         if (buffers)
                      return buffers;
      
              return error;
      }
      
      static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                              struct splice_desc *sd)
      {
  123         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
  123         return n == sd->len ? n : -EFAULT;
      }
      
      /*
       * For lack of a better implementation, implement vmsplice() to userspace
        * as a simple copy of the pipe's pages to the user iov.
       */
      static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
                                   unsigned long nr_segs, unsigned int flags)
      {
              struct pipe_inode_info *pipe;
              struct splice_desc sd;
              long ret;
              struct iovec iovstack[UIO_FASTIOV];
  127         struct iovec *iov = iovstack;
              struct iov_iter iter;
      
              pipe = get_pipe_info(file);
              if (!pipe)
                      return -EBADF;
      
  126         ret = import_iovec(READ, uiov, nr_segs,
                                 ARRAY_SIZE(iovstack), &iov, &iter);
              if (ret < 0)
                      return ret;
      
  126         sd.total_len = iov_iter_count(&iter);
              sd.len = 0;
              sd.flags = flags;
              sd.u.data = &iter;
              sd.pos = 0;
      
              if (sd.total_len) {
  124                 pipe_lock(pipe);
                      ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
                      pipe_unlock(pipe);
              }
      
    8         kfree(iov);
    9         return ret;
      }
      
      /*
       * vmsplice splices a user address range into a pipe. It can be thought of
       * as splice-from-memory, where the regular splice is splice-from-file (or
       * to file). In both cases the output is a pipe, naturally.
       */
      static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
                                   unsigned long nr_segs, unsigned int flags)
      {
              struct pipe_inode_info *pipe;
              struct page *pages[PIPE_DEF_BUFFERS];
              struct partial_page partial[PIPE_DEF_BUFFERS];
   19         struct splice_pipe_desc spd = {
                      .pages = pages,
                      .partial = partial,
                      .nr_pages_max = PIPE_DEF_BUFFERS,
                      .flags = flags,
                      .ops = &user_page_pipe_buf_ops,
                      .spd_release = spd_release_page,
              };
              long ret;
      
              pipe = get_pipe_info(file);
              if (!pipe)
                      return -EBADF;
      
   18         if (splice_grow_spd(pipe, &spd))
                      return -ENOMEM;
      
   18         spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
                                                  spd.partial, false,
                                                  spd.nr_pages_max);
    4         if (spd.nr_pages <= 0)
                      ret = spd.nr_pages;
              else
   15                 ret = splice_to_pipe(pipe, &spd);
      
   17         splice_shrink_spd(&spd);
              return ret;
      }
      
      /*
       * Note that vmsplice only really supports true splicing _from_ user memory
       * to a pipe, not the other way around. Splicing from user memory is a simple
       * operation that can be supported without any funky alignment restrictions
        * or nasty vm tricks. We simply map in the user memory and fill it into
        * a pipe. The reverse isn't quite as easy, though. There are two possible
        * solutions for that:
        *
        *        - memcpy() the data internally, at which point we might as well just
        *          do a regular read() on the buffer anyway.
        *        - Lots of nasty vm tricks that are neither fast nor flexible
        *          (they impose restrictions on both ends of the pipe).
       *
       * Currently we punt and implement it as a normal copy, see pipe_to_user().
       *
       */
  149 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
                      unsigned long, nr_segs, unsigned int, flags)
      {
              struct fd f;
              long error;
      
              if (unlikely(nr_segs > UIO_MAXIOV))
                      return -EINVAL;
  148         else if (unlikely(!nr_segs))
                      return 0;
      
              error = -EBADF;
  146         f = fdget(fd);
              if (f.file) {
  145                 if (f.file->f_mode & FMODE_WRITE)
   19                         error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
  128                 else if (f.file->f_mode & FMODE_READ)
  127                         error = vmsplice_to_user(f.file, iov, nr_segs, flags);
      
   28                 fdput(f);
              }
      
              return error;
      }
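
       /*
        * Userspace usage sketch (illustration, not part of the original
        * source): gift a buffer of user memory into a pipe with vmsplice(2)
        * and drain it to stdout with splice(2). Error handling is trimmed
        * for brevity.
        *
        *        #define _GNU_SOURCE
        *        #include <fcntl.h>
        *        #include <sys/uio.h>
        *        #include <unistd.h>
        *
        *        int main(void)
        *        {
        *                static char buf[] = "hello from user memory\n";
        *                struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) - 1 };
        *                int pfd[2];
        *
        *                pipe(pfd);
        *                vmsplice(pfd[1], &iov, 1, 0);
        *                splice(pfd[0], NULL, STDOUT_FILENO, NULL, iov.iov_len, 0);
        *                return 0;
        *        }
        *
        * The reverse direction (pipe to user iovecs) is the plain copy
        * implemented by vmsplice_to_user()/pipe_to_user() above.
        */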
      
      #ifdef CONFIG_COMPAT
  151 COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
                          unsigned int, nr_segs, unsigned int, flags)
      {
              unsigned i;
              struct iovec __user *iov;
              if (nr_segs > UIO_MAXIOV)
                      return -EINVAL;
  149         iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
  146         for (i = 0; i < nr_segs; i++) {
                      struct compat_iovec v;
  147                 if (get_user(v.iov_base, &iov32[i].iov_base) ||
  146                     get_user(v.iov_len, &iov32[i].iov_len) ||
  146                     put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
  146                     put_user(v.iov_len, &iov[i].iov_len))
                              return -EFAULT;
              }
              return sys_vmsplice(fd, iov, nr_segs, flags);
      }
      #endif
      
  209 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
                      int, fd_out, loff_t __user *, off_out,
                      size_t, len, unsigned int, flags)
      {
              struct fd in, out;
              long error;
      
              if (unlikely(!len))
                      return 0;
      
              error = -EBADF;
  208         in = fdget(fd_in);
              if (in.file) {
  206                 if (in.file->f_mode & FMODE_READ) {
  205                         out = fdget(fd_out);
                              if (out.file) {
                                      if (out.file->f_mode & FMODE_WRITE)
  203                                         error = do_splice(in.file, off_in,
                                                                out.file, off_out,
                                                                len, flags);
  153                                 fdput(out);
                              }
                      }
  156                 fdput(in);
              }
              return error;
      }
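
       /*
        * Userspace usage sketch (illustration, not part of the original
        * source): copy one open file descriptor to another with splice(2),
        * using a pipe as the intermediate buffer. At least one side of each
        * splice() call must be a pipe, which is exactly the ipipe/opipe
        * dispatch done by do_splice() above. Error handling is trimmed.
        *
        *        #define _GNU_SOURCE
        *        #include <fcntl.h>
        *        #include <unistd.h>
        *
        *        static int splice_copy(int in_fd, int out_fd)
        *        {
        *                int pfd[2];
        *                ssize_t n;
        *
        *                if (pipe(pfd))
        *                        return -1;
        *                while ((n = splice(in_fd, NULL, pfd[1], NULL,
        *                                   65536, SPLICE_F_MOVE)) > 0) {
        *                        while (n > 0) {
        *                                ssize_t m = splice(pfd[0], NULL, out_fd,
        *                                                   NULL, n, SPLICE_F_MOVE);
        *                                if (m <= 0)
        *                                        return -1;
        *                                n -= m;
        *                        }
        *                }
        *                close(pfd[0]);
        *                close(pfd[1]);
        *                return n < 0 ? -1 : 0;
        *        }
        */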
      
      /*
       * Make sure there's data to read. Wait for input if we can, otherwise
       * return an appropriate error.
       */
      static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
      {
              int ret;
      
              /*
               * Check ->nrbufs without the inode lock first. This function
                * is speculative anyway, so missing one is ok.
               */
    9         if (pipe->nrbufs)
                      return 0;
      
              ret = 0;
    9         pipe_lock(pipe);
      
              while (!pipe->nrbufs) {
    9                 if (signal_pending(current)) {
                              ret = -ERESTARTSYS;
                              break;
                      }
    9                 if (!pipe->writers)
                              break;
    7                 if (!pipe->waiting_writers) {
    6                         if (flags & SPLICE_F_NONBLOCK) {
                                      ret = -EAGAIN;
                                      break;
                              }
                      }
    6                 pipe_wait(pipe);
              }
      
    9         pipe_unlock(pipe);
              return ret;
      }
      
      /*
       * Make sure there's writeable room. Wait for room if we can, otherwise
       * return an appropriate error.
       */
      static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
      {
              int ret;
      
              /*
               * Check ->nrbufs without the inode lock first. This function
                * is speculative anyway, so missing one is ok.
               */
   14         if (pipe->nrbufs < pipe->buffers)
                      return 0;
      
              ret = 0;
    4         pipe_lock(pipe);
      
              while (pipe->nrbufs >= pipe->buffers) {
    4                 if (!pipe->readers) {
    1                         send_sig(SIGPIPE, current, 0);
                              ret = -EPIPE;
                              break;
                      }
    3                 if (flags & SPLICE_F_NONBLOCK) {
                              ret = -EAGAIN;
                              break;
                      }
    2                 if (signal_pending(current)) {
                              ret = -ERESTARTSYS;
                              break;
                      }
    2                 pipe->waiting_writers++;
                      pipe_wait(pipe);
                      pipe->waiting_writers--;
              }
      
    4         pipe_unlock(pipe);
              return ret;
      }
      
      /*
       * Splice contents of ipipe to opipe.
       */
      static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                                     struct pipe_inode_info *opipe,
                                     size_t len, unsigned int flags)
      {
              struct pipe_buffer *ibuf, *obuf;
              int ret = 0, nbuf;
              bool input_wakeup = false;
      
      
      retry:
    5         ret = ipipe_prep(ipipe, flags);
              if (ret)
                      return ret;
      
    5         ret = opipe_prep(opipe, flags);
              if (ret)
                      return ret;
      
              /*
               * Potential ABBA deadlock, work around it by ordering lock
               * grabbing by pipe info address. Otherwise two different processes
               * could deadlock (one doing tee from A -> B, the other from B -> A).
               */
    5         pipe_double_lock(ipipe, opipe);
      
              do {
    5                 if (!opipe->readers) {
    1                         send_sig(SIGPIPE, current, 0);
                              if (!ret)
                                      ret = -EPIPE;
                              break;
                      }
      
    4                 if (!ipipe->nrbufs && !ipipe->writers)
                              break;
      
                      /*
                       * Cannot make any progress, because either the input
                       * pipe is empty or the output pipe is full.
                       */
    3                 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
                              /* Already processed some buffers, break */
    2                         if (ret)
                                      break;
      
    1                         if (flags & SPLICE_F_NONBLOCK) {
                                      ret = -EAGAIN;
                                      break;
                              }
      
                              /*
                               * We raced with another reader/writer and haven't
                               * managed to process any buffers.  A zero return
                               * value means EOF, so retry instead.
                               */
    1                         pipe_unlock(ipipe);
                              pipe_unlock(opipe);
                              goto retry;
                      }
      
    3                 ibuf = ipipe->bufs + ipipe->curbuf;
                      nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
                      obuf = opipe->bufs + nbuf;
      
                      if (len >= ibuf->len) {
                              /*
                               * Simply move the whole buffer from ipipe to opipe
                               */
    2                         *obuf = *ibuf;
                              ibuf->ops = NULL;
                              opipe->nrbufs++;
                              ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
                              ipipe->nrbufs--;
                              input_wakeup = true;
                      } else {
                              /*
                               * Get a reference to this pipe buffer,
                               * so we can copy the contents over.
                               */
    1                         ibuf->ops->get(ipipe, ibuf);
                              *obuf = *ibuf;
      
                              /*
                               * Don't inherit the gift flag, we need to
                               * prevent multiple steals of this page.
                               */
                              obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
      
                              obuf->len = len;
                              opipe->nrbufs++;
                              ibuf->offset += obuf->len;
                              ibuf->len -= obuf->len;
                      }
    3                 ret += obuf->len;
                      len -= obuf->len;
              } while (len);
      
    5         pipe_unlock(ipipe);
              pipe_unlock(opipe);
      
              /*
               * If we put data in the output pipe, wakeup any potential readers.
               */
              if (ret > 0)
    3                 wakeup_pipe_readers(opipe);
      
    5         if (input_wakeup)
    2                 wakeup_pipe_writers(ipipe);
      
              return ret;
      }
      
      /*
       * Link contents of ipipe to opipe.
       */
      static int link_pipe(struct pipe_inode_info *ipipe,
                           struct pipe_inode_info *opipe,
                           size_t len, unsigned int flags)
      {
              struct pipe_buffer *ibuf, *obuf;
              int ret = 0, i = 0, nbuf;
      
              /*
               * Potential ABBA deadlock, work around it by ordering lock
               * grabbing by pipe info address. Otherwise two different processes
               * could deadlock (one doing tee from A -> B, the other from B -> A).
               */
    7         pipe_double_lock(ipipe, opipe);
      
              do {
    7                 if (!opipe->readers) {
    1                         send_sig(SIGPIPE, current, 0);
                              if (!ret)
                                      ret = -EPIPE;
                              break;
                      }
      
                      /*
                        * If we have iterated over all input buffers or run out of
                       * output room, break.
                       */
    6                 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
                              break;
      
    5                 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
                      nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
      
                      /*
                       * Get a reference to this pipe buffer,
                       * so we can copy the contents over.
                       */
                      ibuf->ops->get(ipipe, ibuf);
      
                      obuf = opipe->bufs + nbuf;
                      *obuf = *ibuf;
      
                      /*
                       * Don't inherit the gift flag, we need to
                       * prevent multiple steals of this page.
                       */
                      obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
      
                      if (obuf->len > len)
    1                         obuf->len = len;
      
    5                 opipe->nrbufs++;
                      ret += obuf->len;
                      len -= obuf->len;
                      i++;
              } while (len);
      
              /*
                * Return -EAGAIN if there is the potential for more data in the
                * future, otherwise just return 0.
               */
    6         if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
                      ret = -EAGAIN;
      
    7         pipe_unlock(ipipe);
              pipe_unlock(opipe);
      
              /*
               * If we put data in the output pipe, wakeup any potential readers.
               */
              if (ret > 0)
    5                 wakeup_pipe_readers(opipe);
      
              return ret;
      }
      
      /*
       * This is a tee(1) implementation that works on pipes. It doesn't copy
        * any data; it simply references the 'in' pages on the 'out' pipe.
       * The 'flags' used are the SPLICE_F_* variants, currently the only
       * applicable one is SPLICE_F_NONBLOCK.
       */
      static long do_tee(struct file *in, struct file *out, size_t len,
                         unsigned int flags)
      {
   13         struct pipe_inode_info *ipipe = get_pipe_info(in);
              struct pipe_inode_info *opipe = get_pipe_info(out);
              int ret = -EINVAL;
      
              /*
               * Duplicate the contents of ipipe to opipe without actually
               * copying the data.
               */
   12         if (ipipe && opipe && ipipe != opipe) {
                      /*
                       * Keep going, unless we encounter an error. The ipipe/opipe
                       * ordering doesn't really matter.
                       */
    4                 ret = ipipe_prep(ipipe, flags);
    4                 if (!ret) {
    9                         ret = opipe_prep(opipe, flags);
                              if (!ret)
    7                                 ret = link_pipe(ipipe, opipe, len, flags);
                      }
              }
      
              return ret;
      }
      
   18 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
      {
              struct fd in;
              int error;
      
              if (unlikely(!len))
                      return 0;
      
              error = -EBADF;
   17         in = fdget(fdin);
              if (in.file) {
                      if (in.file->f_mode & FMODE_READ) {
   15                         struct fd out = fdget(fdout);
                              if (out.file) {
                                      if (out.file->f_mode & FMODE_WRITE)
   13                                         error = do_tee(in.file, out.file,
                                                              len, flags);
   13                                 fdput(out);
                              }
                      }
   15                  fdput(in);
               }
      
              return error;
      }
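
       /*
        * Userspace usage sketch (illustration, not part of the original
        * source): a minimal tee(1)-like filter that duplicates its stdin
        * pipe to a log file while forwarding the data to its stdout pipe,
        * e.g. "producer | tee_example log | consumer". tee(2) only
        * references the pipe pages; the splice(2) call is what actually
        * consumes them from stdin. Error and partial-transfer handling are
        * trimmed for brevity.
        *
        *        #define _GNU_SOURCE
        *        #include <fcntl.h>
        *        #include <limits.h>
        *        #include <unistd.h>
        *
        *        int main(int argc, char *argv[])
        *        {
        *                int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        *                ssize_t n;
        *
        *                (void)argc;
        *                while ((n = tee(STDIN_FILENO, STDOUT_FILENO,
        *                                INT_MAX, 0)) > 0)
        *                        splice(STDIN_FILENO, NULL, fd, NULL, n,
        *                               SPLICE_F_MOVE);
        *                close(fd);
        *                return n < 0 ? 1 : 0;
        *        }
        */
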
      #undef TRACE_SYSTEM
      #define TRACE_SYSTEM random
      
      #if !defined(_TRACE_RANDOM_H) || defined(TRACE_HEADER_MULTI_READ)
      #define _TRACE_RANDOM_H
      
      #include <linux/writeback.h>
      #include <linux/tracepoint.h>
      
  418 TRACE_EVENT(add_device_randomness,
              TP_PROTO(int bytes, unsigned long IP),
      
              TP_ARGS(bytes, IP),
      
              TP_STRUCT__entry(
                      __field(          int,        bytes                        )
                      __field(unsigned long,        IP                        )
              ),
      
              TP_fast_assign(
                      __entry->bytes                = bytes;
                      __entry->IP                = IP;
              ),
      
              TP_printk("bytes %d caller %pS",
                      __entry->bytes, (void *)__entry->IP)
      );
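
       /*
        * Illustrative sketch (assumed call site, not part of the original
        * source): the TRACE_EVENT() above generates a
        * trace_add_device_randomness() helper which drivers/char/random.c
        * calls from add_device_randomness(), roughly:
        *
        *        void add_device_randomness(const void *buf, unsigned int size)
        *        {
        *                ...
        *                trace_add_device_randomness(size, _RET_IP_);
        *                ...
        *        }
        *
        * Exactly one .c file must #define CREATE_TRACE_POINTS before
        * including this header so the tracepoint bodies are emitted once.
        */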
      
      DECLARE_EVENT_CLASS(random__mix_pool_bytes,
              TP_PROTO(const char *pool_name, int bytes, unsigned long IP),
      
              TP_ARGS(pool_name, bytes, IP),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        bytes                        )
                      __field(unsigned long,        IP                        )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->bytes                = bytes;
                      __entry->IP                = IP;
              ),
      
              TP_printk("%s pool: bytes %d caller %pS",
                        __entry->pool_name, __entry->bytes, (void *)__entry->IP)
      );
      
   19 DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes,
              TP_PROTO(const char *pool_name, int bytes, unsigned long IP),
      
              TP_ARGS(pool_name, bytes, IP)
      );
      
  355 DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock,
              TP_PROTO(const char *pool_name, int bytes, unsigned long IP),
      
              TP_ARGS(pool_name, bytes, IP)
      );
      
   12 TRACE_EVENT(credit_entropy_bits,
              TP_PROTO(const char *pool_name, int bits, int entropy_count,
                       int entropy_total, unsigned long IP),
      
              TP_ARGS(pool_name, bits, entropy_count, entropy_total, IP),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        bits                        )
                      __field(          int,        entropy_count                )
                      __field(          int,        entropy_total                )
                      __field(unsigned long,        IP                        )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->bits                = bits;
                      __entry->entropy_count        = entropy_count;
                      __entry->entropy_total        = entropy_total;
                      __entry->IP                = IP;
              ),
      
              TP_printk("%s pool: bits %d entropy_count %d entropy_total %d "
                        "caller %pS", __entry->pool_name, __entry->bits,
                        __entry->entropy_count, __entry->entropy_total,
                        (void *)__entry->IP)
      );
      
      TRACE_EVENT(push_to_pool,
              TP_PROTO(const char *pool_name, int pool_bits, int input_bits),
      
              TP_ARGS(pool_name, pool_bits, input_bits),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        pool_bits                )
                      __field(          int,        input_bits                )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->pool_bits        = pool_bits;
                      __entry->input_bits        = input_bits;
              ),
      
              TP_printk("%s: pool_bits %d input_pool_bits %d",
                        __entry->pool_name, __entry->pool_bits,
                        __entry->input_bits)
      );
      
  357 TRACE_EVENT(debit_entropy,
              TP_PROTO(const char *pool_name, int debit_bits),
      
              TP_ARGS(pool_name, debit_bits),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        debit_bits                )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->debit_bits        = debit_bits;
              ),
      
              TP_printk("%s: debit_bits %d", __entry->pool_name,
                        __entry->debit_bits)
      );
      
   36 TRACE_EVENT(add_input_randomness,
              TP_PROTO(int input_bits),
      
              TP_ARGS(input_bits),
      
              TP_STRUCT__entry(
                      __field(          int,        input_bits                )
              ),
      
              TP_fast_assign(
                      __entry->input_bits        = input_bits;
              ),
      
              TP_printk("input_pool_bits %d", __entry->input_bits)
      );
      
      TRACE_EVENT(add_disk_randomness,
              TP_PROTO(dev_t dev, int input_bits),
      
              TP_ARGS(dev, input_bits),
      
              TP_STRUCT__entry(
                      __field(        dev_t,        dev                        )
                      __field(          int,        input_bits                )
              ),
      
              TP_fast_assign(
                      __entry->dev                = dev;
                      __entry->input_bits        = input_bits;
              ),
      
              TP_printk("dev %d,%d input_pool_bits %d", MAJOR(__entry->dev),
                        MINOR(__entry->dev), __entry->input_bits)
      );
      
    4 TRACE_EVENT(xfer_secondary_pool,
              TP_PROTO(const char *pool_name, int xfer_bits, int request_bits,
                       int pool_entropy, int input_entropy),
      
              TP_ARGS(pool_name, xfer_bits, request_bits, pool_entropy,
                      input_entropy),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        xfer_bits                )
                      __field(          int,        request_bits                )
                      __field(          int,        pool_entropy                )
                      __field(          int,        input_entropy                )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->xfer_bits        = xfer_bits;
                      __entry->request_bits        = request_bits;
                      __entry->pool_entropy        = pool_entropy;
                      __entry->input_entropy        = input_entropy;
              ),
      
              TP_printk("pool %s xfer_bits %d request_bits %d pool_entropy %d "
                        "input_entropy %d", __entry->pool_name, __entry->xfer_bits,
                        __entry->request_bits, __entry->pool_entropy,
                        __entry->input_entropy)
      );
      
      DECLARE_EVENT_CLASS(random__get_random_bytes,
              TP_PROTO(int nbytes, unsigned long IP),
      
              TP_ARGS(nbytes, IP),
      
              TP_STRUCT__entry(
                      __field(          int,        nbytes                        )
                      __field(unsigned long,        IP                        )
              ),
      
              TP_fast_assign(
                      __entry->nbytes                = nbytes;
                      __entry->IP                = IP;
              ),
      
              TP_printk("nbytes %d caller %pS", __entry->nbytes, (void *)__entry->IP)
      );
      
  337 DEFINE_EVENT(random__get_random_bytes, get_random_bytes,
              TP_PROTO(int nbytes, unsigned long IP),
      
              TP_ARGS(nbytes, IP)
      );
      
      DEFINE_EVENT(random__get_random_bytes, get_random_bytes_arch,
              TP_PROTO(int nbytes, unsigned long IP),
      
              TP_ARGS(nbytes, IP)
      );
      
      DECLARE_EVENT_CLASS(random__extract_entropy,
              TP_PROTO(const char *pool_name, int nbytes, int entropy_count,
                       unsigned long IP),
      
              TP_ARGS(pool_name, nbytes, entropy_count, IP),
      
              TP_STRUCT__entry(
                      __field( const char *,        pool_name                )
                      __field(          int,        nbytes                        )
                      __field(          int,        entropy_count                )
                      __field(unsigned long,        IP                        )
              ),
      
              TP_fast_assign(
                      __entry->pool_name        = pool_name;
                      __entry->nbytes                = nbytes;
                      __entry->entropy_count        = entropy_count;
                      __entry->IP                = IP;
              ),
      
              TP_printk("%s pool: nbytes %d entropy_count %d caller %pS",
                        __entry->pool_name, __entry->nbytes, __entry->entropy_count,
                        (void *)__entry->IP)
      );
      
      
  339 DEFINE_EVENT(random__extract_entropy, extract_entropy,
              TP_PROTO(const char *pool_name, int nbytes, int entropy_count,
                       unsigned long IP),
      
              TP_ARGS(pool_name, nbytes, entropy_count, IP)
      );
      
   21 DEFINE_EVENT(random__extract_entropy, extract_entropy_user,
              TP_PROTO(const char *pool_name, int nbytes, int entropy_count,
                       unsigned long IP),
      
              TP_ARGS(pool_name, nbytes, entropy_count, IP)
      );
      
    2 TRACE_EVENT(random_read,
              TP_PROTO(int got_bits, int need_bits, int pool_left, int input_left),
      
              TP_ARGS(got_bits, need_bits, pool_left, input_left),
      
              TP_STRUCT__entry(
                      __field(          int,        got_bits                )
                      __field(          int,        need_bits                )
                      __field(          int,        pool_left                )
                      __field(          int,        input_left                )
              ),
      
              TP_fast_assign(
                      __entry->got_bits        = got_bits;
                      __entry->need_bits        = need_bits;
                      __entry->pool_left        = pool_left;
                      __entry->input_left        = input_left;
              ),
      
              TP_printk("got_bits %d still_needed_bits %d "
                        "blocking_pool_entropy_left %d input_entropy_left %d",
                         __entry->got_bits, __entry->need_bits, __entry->pool_left,
                        __entry->input_left)
      );
      
   12 TRACE_EVENT(urandom_read,
              TP_PROTO(int got_bits, int pool_left, int input_left),
      
              TP_ARGS(got_bits, pool_left, input_left),
      
              TP_STRUCT__entry(
                      __field(          int,        got_bits                )
                      __field(          int,        pool_left                )
                      __field(          int,        input_left                )
              ),
      
              TP_fast_assign(
                      __entry->got_bits        = got_bits;
                      __entry->pool_left        = pool_left;
                      __entry->input_left        = input_left;
              ),
      
              TP_printk("got_bits %d nonblocking_pool_entropy_left %d "
                        "input_entropy_left %d", __entry->got_bits,
                        __entry->pool_left, __entry->input_left)
      );
      
      #endif /* _TRACE_RANDOM_H */
      
      /* This part must be outside protection */
      #include <trace/define_trace.h>
      #include <linux/pm.h>
      #include <linux/acpi.h>
      
      struct usb_hub_descriptor;
      struct usb_dev_state;
      
      /* Functions local to drivers/usb/core/ */
      
      extern int usb_create_sysfs_dev_files(struct usb_device *dev);
      extern void usb_remove_sysfs_dev_files(struct usb_device *dev);
      extern void usb_create_sysfs_intf_files(struct usb_interface *intf);
      extern void usb_remove_sysfs_intf_files(struct usb_interface *intf);
      extern int usb_create_ep_devs(struct device *parent,
                                      struct usb_host_endpoint *endpoint,
                                      struct usb_device *udev);
      extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint);
      
      extern void usb_enable_endpoint(struct usb_device *dev,
                      struct usb_host_endpoint *ep, bool reset_toggle);
      extern void usb_enable_interface(struct usb_device *dev,
                      struct usb_interface *intf, bool reset_toggles);
      extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr,
                      bool reset_hardware);
      extern void usb_disable_interface(struct usb_device *dev,
                      struct usb_interface *intf, bool reset_hardware);
      extern void usb_release_interface_cache(struct kref *ref);
      extern void usb_disable_device(struct usb_device *dev, int skip_ep0);
      extern int usb_deauthorize_device(struct usb_device *);
      extern int usb_authorize_device(struct usb_device *);
      extern void usb_deauthorize_interface(struct usb_interface *);
      extern void usb_authorize_interface(struct usb_interface *);
      extern void usb_detect_quirks(struct usb_device *udev);
      extern void usb_detect_interface_quirks(struct usb_device *udev);
      extern int usb_remove_device(struct usb_device *udev);
      
      extern int usb_get_device_descriptor(struct usb_device *dev,
                      unsigned int size);
      extern int usb_get_bos_descriptor(struct usb_device *dev);
      extern void usb_release_bos_descriptor(struct usb_device *dev);
      extern char *usb_cache_string(struct usb_device *udev, int index);
      extern int usb_set_configuration(struct usb_device *dev, int configuration);
      extern int usb_choose_configuration(struct usb_device *udev);
      
      static inline unsigned usb_get_max_power(struct usb_device *udev,
                      struct usb_host_config *c)
      {
              /* SuperSpeed power is in 8 mA units; others are in 2 mA units */
              unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 8 : 2);
      
              return c->desc.bMaxPower * mul;
      }
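
       /*
        * Worked example (illustration only): a configuration with
        * bMaxPower == 50 reports 50 * 2 = 100 mA on a high-speed device,
        * but 50 * 8 = 400 mA on a SuperSpeed device, because USB 3.x
        * encodes bMaxPower in 8 mA units.
        */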
      
      extern void usb_kick_hub_wq(struct usb_device *dev);
      extern int usb_match_one_id_intf(struct usb_device *dev,
                                       struct usb_host_interface *intf,
                                       const struct usb_device_id *id);
      extern int usb_match_device(struct usb_device *dev,
                                  const struct usb_device_id *id);
      extern void usb_forced_unbind_intf(struct usb_interface *intf);
      extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev);
      
      extern void usb_hub_release_all_ports(struct usb_device *hdev,
                      struct usb_dev_state *owner);
      extern bool usb_device_is_owned(struct usb_device *udev);
      
      extern int  usb_hub_init(void);
      extern void usb_hub_cleanup(void);
      extern int usb_major_init(void);
      extern void usb_major_cleanup(void);
      extern int usb_device_supports_lpm(struct usb_device *udev);
      
      #ifdef        CONFIG_PM
      
      extern int usb_suspend(struct device *dev, pm_message_t msg);
      extern int usb_resume(struct device *dev, pm_message_t msg);
      extern int usb_resume_complete(struct device *dev);
      
      extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg);
      extern int usb_port_resume(struct usb_device *dev, pm_message_t msg);
      
      extern void usb_autosuspend_device(struct usb_device *udev);
      extern int usb_autoresume_device(struct usb_device *udev);
      extern int usb_remote_wakeup(struct usb_device *dev);
      extern int usb_runtime_suspend(struct device *dev);
      extern int usb_runtime_resume(struct device *dev);
      extern int usb_runtime_idle(struct device *dev);
      extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev);
      extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev);
      
      #else
      
      static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg)
      {
              return 0;
      }
      
      static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg)
      {
              return 0;
      }
      
      #define usb_autosuspend_device(udev)                do {} while (0)
      static inline int usb_autoresume_device(struct usb_device *udev)
      {
              return 0;
      }
      
      static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev)
      {
              return 0;
      }
      
      static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev)
      {
              return 0;
      }
      
      #endif
      
      extern struct bus_type usb_bus_type;
      extern struct mutex usb_port_peer_mutex;
      extern struct device_type usb_device_type;
      extern struct device_type usb_if_device_type;
  420 extern struct device_type usb_ep_device_type;
      extern struct device_type usb_port_device_type;
      extern struct usb_device_driver usb_generic_driver;
      
      static inline int is_usb_device(const struct device *dev)
      {
              return dev->type == &usb_device_type;
      }
      
      static inline int is_usb_interface(const struct device *dev)
      {
              return dev->type == &usb_if_device_type;
      }
      
      static inline int is_usb_endpoint(const struct device *dev)
      {
              return dev->type == &usb_ep_device_type;
      }
      
      static inline int is_usb_port(const struct device *dev)
      {
              return dev->type == &usb_port_device_type;
      }
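
/*
 * Example (illustrative sketch): the type checks above are what make
 * container_of() conversions safe.  The helper below is hypothetical; the
 * in-tree to_usb_device() macro in <linux/usb.h> performs the same cast.
 */
static struct usb_device *example_to_usb_device(struct device *dev)
{
        if (!is_usb_device(dev))
                return NULL;
        return container_of(dev, struct usb_device, dev);
}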
      
      /* Do the same for device drivers and interface drivers. */
      
      static inline int is_usb_device_driver(struct device_driver *drv)
      {
              return container_of(drv, struct usbdrv_wrap, driver)->
                              for_devices;
      }
      
      /* for labeling diagnostics */
      extern const char *usbcore_name;
      
      /* sysfs stuff */
      extern const struct attribute_group *usb_device_groups[];
      extern const struct attribute_group *usb_interface_groups[];
      
      /* usbfs stuff */
      extern struct mutex usbfs_mutex;
      extern struct usb_driver usbfs_driver;
      extern const struct file_operations usbfs_devices_fops;
      extern const struct file_operations usbdev_file_operations;
      extern void usbfs_conn_disc_event(void);
      
      extern int usb_devio_init(void);
      extern void usb_devio_cleanup(void);
      
      /*
       * Firmware specific cookie identifying a port's location. '0' == no location
       * data available
       */
      typedef u32 usb_port_location_t;
      
      /* internal notify stuff */
      extern void usb_notify_add_device(struct usb_device *udev);
      extern void usb_notify_remove_device(struct usb_device *udev);
      extern void usb_notify_add_bus(struct usb_bus *ubus);
      extern void usb_notify_remove_bus(struct usb_bus *ubus);
      extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev,
                      struct usb_hub_descriptor *desc);
      
      #ifdef CONFIG_ACPI
      extern int usb_acpi_register(void);
      extern void usb_acpi_unregister(void);
      extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev,
              int port1);
      #else
      static inline int usb_acpi_register(void) { return 0; };
      static inline void usb_acpi_unregister(void) { };
      #endif
      /* Internal procfs definitions
       *
       * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
       * Written by David Howells (dhowells@redhat.com)
       *
       * This program is free software; you can redistribute it and/or
       * modify it under the terms of the GNU General Public License
       * as published by the Free Software Foundation; either version
       * 2 of the License, or (at your option) any later version.
       */
      
      #include <linux/proc_fs.h>
      #include <linux/proc_ns.h>
      #include <linux/spinlock.h>
      #include <linux/atomic.h>
      #include <linux/binfmts.h>
      
      struct ctl_table_header;
      struct mempolicy;
      
      /*
       * This is not completely implemented yet. The idea is to
       * create an in-memory tree (like the actual /proc filesystem
       * tree) of these proc_dir_entries, so that we can dynamically
       * add new files to /proc.
       *
       * parent/subdir are used for the directory structure (every /proc file has a
       * parent, but "subdir" is empty for all non-directory entries).
       * subdir_node is used to build the rb tree "subdir" of the parent.
       */
      struct proc_dir_entry {
              unsigned int low_ino;
              umode_t mode;
              nlink_t nlink;
              kuid_t uid;
              kgid_t gid;
              loff_t size;
              const struct inode_operations *proc_iops;
              const struct file_operations *proc_fops;
              struct proc_dir_entry *parent;
              struct rb_root subdir;
              struct rb_node subdir_node;
              void *data;
              atomic_t count;                /* use count */
              atomic_t in_use;        /* number of callers into module in progress; */
                              /* negative -> it's going away RSN */
              struct completion *pde_unload_completion;
              struct list_head pde_openers;        /* who did ->open, but not ->release */
              spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
              u8 namelen;
              char name[];
      };
      
      union proc_op {
              int (*proc_get_link)(struct dentry *, struct path *);
              int (*proc_show)(struct seq_file *m,
                      struct pid_namespace *ns, struct pid *pid,
                      struct task_struct *task);
      };
      
      struct proc_inode {
              struct pid *pid;
              int fd;
              union proc_op op;
              struct proc_dir_entry *pde;
              struct ctl_table_header *sysctl;
              struct ctl_table *sysctl_entry;
              const struct proc_ns_operations *ns_ops;
              struct inode vfs_inode;
      };
      
      /*
       * General functions
       */
      static inline struct proc_inode *PROC_I(const struct inode *inode)
      {
              return container_of(inode, struct proc_inode, vfs_inode);
      }
      
      static inline struct proc_dir_entry *PDE(const struct inode *inode)
      {
        return PROC_I(inode)->pde;
      }
      
      static inline void *__PDE_DATA(const struct inode *inode)
      {
        return PDE(inode)->data;
      }
      
      static inline struct pid *proc_pid(struct inode *inode)
      {
        return PROC_I(inode)->pid;
      }
      
      static inline struct task_struct *get_proc_task(struct inode *inode)
      {
        return get_pid_task(proc_pid(inode), PIDTYPE_PID);
      }
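
/*
 * Example (illustrative sketch): get_proc_task() returns a referenced task
 * (or NULL if the pid has died), so callers pair it with put_task_struct().
 * The function name example_show is hypothetical.
 */
static int example_show(struct seq_file *m, struct inode *inode)
{
        struct task_struct *task = get_proc_task(inode);

        if (!task)
                return -ESRCH;
        seq_printf(m, "pid %d\n", task_pid_nr(task));
        put_task_struct(task);
        return 0;
}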
      
      static inline int task_dumpable(struct task_struct *task)
      {
              int dumpable = 0;
              struct mm_struct *mm;
      
        task_lock(task);
        mm = task->mm;
        if (mm)
                dumpable = get_dumpable(mm);
        task_unlock(task);
              if (dumpable == SUID_DUMP_USER)
                      return 1;
              return 0;
      }
      
      static inline unsigned name_to_int(const struct qstr *qstr)
      {
        const char *name = qstr->name;
        int len = qstr->len;
        unsigned n = 0;

        if (len > 1 && *name == '0')
                goto out;
        while (len-- > 0) {
                unsigned c = *name++ - '0';
                if (c > 9)
                        goto out;
                if (n >= (~0U-9)/10)
                        goto out;
                n *= 10;
                      n += c;
              }
              return n;
      out:
              return ~0U;
      }
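
/*
 * Example (illustrative sketch): how a /proc lookup might map a dentry name
 * such as "1234" onto a pid number with name_to_int().  Names with a leading
 * zero or non-digit characters yield ~0U.  example_lookup_tgid is
 * hypothetical.
 */
static unsigned example_lookup_tgid(struct dentry *dentry)
{
        unsigned tgid = name_to_int(&dentry->d_name);

        if (tgid == ~0U)
                return 0;        /* not a pid directory */
        return tgid;
}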
      
      /*
 * Offset of the first process in the /proc root directory.
       */
      #define FIRST_PROCESS_ENTRY 256
      
      /* Worst case buffer size needed for holding an integer. */
      #define PROC_NUMBUF 13
      
      /*
       * array.c
       */
      extern const struct file_operations proc_tid_children_operations;
      
      extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
                               struct pid *, struct task_struct *);
      extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
                                struct pid *, struct task_struct *);
      extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
                                 struct pid *, struct task_struct *);
      extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
                                struct pid *, struct task_struct *);
      
      /*
       * base.c
       */
      extern const struct dentry_operations pid_dentry_operations;
      extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
      extern int proc_setattr(struct dentry *, struct iattr *);
      extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
      extern int pid_revalidate(struct dentry *, unsigned int);
      extern int pid_delete_dentry(const struct dentry *);
      extern int proc_pid_readdir(struct file *, struct dir_context *);
      extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
      extern loff_t mem_lseek(struct file *, loff_t, int);
      
      /* Lookups */
      typedef int instantiate_t(struct inode *, struct dentry *,
                                           struct task_struct *, const void *);
      extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
                                 instantiate_t, struct task_struct *, const void *);
      
      /*
       * generic.c
       */
      extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
      extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
                                           struct dentry *);
      extern int proc_readdir(struct file *, struct dir_context *);
      extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
      
      static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
      {
        atomic_inc(&pde->count);
              return pde;
      }
      extern void pde_put(struct proc_dir_entry *);
      
      static inline bool is_empty_pde(const struct proc_dir_entry *pde)
      {
        return S_ISDIR(pde->mode) && !pde->proc_iops;
      }
      struct proc_dir_entry *proc_create_mount_point(const char *name);
      
      /*
       * inode.c
       */
      struct pde_opener {
              struct file *file;
              struct list_head lh;
              int closing;
              struct completion *c;
      };
      extern const struct inode_operations proc_link_inode_operations;
      
      extern const struct inode_operations proc_pid_link_inode_operations;
      
      extern void proc_init_inodecache(void);
      extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
      extern int proc_fill_super(struct super_block *);
      extern void proc_entry_rundown(struct proc_dir_entry *);
      
      /*
       * proc_namespaces.c
       */
      extern const struct inode_operations proc_ns_dir_inode_operations;
      extern const struct file_operations proc_ns_dir_operations;
      
      /*
       * proc_net.c
       */
      extern const struct file_operations proc_net_operations;
      extern const struct inode_operations proc_net_inode_operations;
      
      #ifdef CONFIG_NET
      extern int proc_net_init(void);
      #else
      static inline int proc_net_init(void) { return 0; }
      #endif
      
      /*
       * proc_self.c
       */
      extern int proc_setup_self(struct super_block *);
      
      /*
       * proc_thread_self.c
       */
      extern int proc_setup_thread_self(struct super_block *);
      extern void proc_thread_self_init(void);
      
      /*
       * proc_sysctl.c
       */
      #ifdef CONFIG_PROC_SYSCTL
      extern int proc_sys_init(void);
      extern void sysctl_head_put(struct ctl_table_header *);
      #else
      static inline void proc_sys_init(void) { }
      static inline void sysctl_head_put(struct ctl_table_header *head) { }
      #endif
      
      /*
       * uid.c
       */
      #ifdef CONFIG_PROC_UID
      extern int proc_uid_init(void);
      #else
      static inline void proc_uid_init(void) { }
      #endif
      
      /*
       * proc_tty.c
       */
      #ifdef CONFIG_TTY
      extern void proc_tty_init(void);
      #else
      static inline void proc_tty_init(void) {}
      #endif
      
      /*
       * root.c
       */
      extern struct proc_dir_entry proc_root;
      
      extern void proc_self_init(void);
      extern int proc_remount(struct super_block *, int *, char *);
      
      /*
       * task_[no]mmu.c
       */
      struct proc_maps_private {
              struct inode *inode;
              struct task_struct *task;
              struct mm_struct *mm;
      #ifdef CONFIG_MMU
              struct vm_area_struct *tail_vma;
      #endif
      #ifdef CONFIG_NUMA
              struct mempolicy *task_mempolicy;
      #endif
      };
      
      struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
      
      extern const struct file_operations proc_pid_maps_operations;
      extern const struct file_operations proc_tid_maps_operations;
      extern const struct file_operations proc_pid_numa_maps_operations;
      extern const struct file_operations proc_tid_numa_maps_operations;
      extern const struct file_operations proc_pid_smaps_operations;
      extern const struct file_operations proc_tid_smaps_operations;
      extern const struct file_operations proc_clear_refs_operations;
      extern const struct file_operations proc_pagemap_operations;
      
      extern unsigned long task_vsize(struct mm_struct *);
      extern unsigned long task_statm(struct mm_struct *,
                                      unsigned long *, unsigned long *,
                                      unsigned long *, unsigned long *);
      extern void task_mem(struct seq_file *, struct mm_struct *);
      #ifndef _ASM_X86_PTRACE_H
      #define _ASM_X86_PTRACE_H
      
      #include <asm/segment.h>
      #include <asm/page_types.h>
      #include <uapi/asm/ptrace.h>
      
      #ifndef __ASSEMBLY__
      #ifdef __i386__
      
      struct pt_regs {
              unsigned long bx;
              unsigned long cx;
              unsigned long dx;
              unsigned long si;
              unsigned long di;
              unsigned long bp;
              unsigned long ax;
              unsigned long ds;
              unsigned long es;
              unsigned long fs;
              unsigned long gs;
              unsigned long orig_ax;
              unsigned long ip;
              unsigned long cs;
              unsigned long flags;
              unsigned long sp;
              unsigned long ss;
      };
      
      #else /* __i386__ */
      
      struct pt_regs {
      /*
       * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
       * unless syscall needs a complete, fully filled "struct pt_regs".
       */
              unsigned long r15;
              unsigned long r14;
              unsigned long r13;
              unsigned long r12;
              unsigned long bp;
              unsigned long bx;
      /* These regs are callee-clobbered. Always saved on kernel entry. */
              unsigned long r11;
              unsigned long r10;
              unsigned long r9;
              unsigned long r8;
              unsigned long ax;
              unsigned long cx;
              unsigned long dx;
              unsigned long si;
              unsigned long di;
      /*
       * On syscall entry, this is syscall#. On CPU exception, this is error code.
       * On hw interrupt, it's IRQ number:
       */
              unsigned long orig_ax;
      /* Return frame for iretq */
              unsigned long ip;
              unsigned long cs;
              unsigned long flags;
              unsigned long sp;
              unsigned long ss;
      /* top of stack page */
      };
      
      #endif /* !__i386__ */
      
      #ifdef CONFIG_PARAVIRT
      #include <asm/paravirt_types.h>
      #endif
      
      struct cpuinfo_x86;
      struct task_struct;
      
      extern unsigned long profile_pc(struct pt_regs *regs);
      #define profile_pc profile_pc
      
      extern unsigned long
      convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
      extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
                               int error_code, int si_code);
      
      
      extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
      extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
                                             unsigned long phase1_result);
      
      extern long syscall_trace_enter(struct pt_regs *);
      
      static inline unsigned long regs_return_value(struct pt_regs *regs)
      {
              return regs->ax;
      }
      
      /*
       * user_mode(regs) determines whether a register set came from user
       * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
       * register set was from protected mode with RPL-3 CS value.  This
       * tricky test checks that with one comparison.
       *
       * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
       * the extra check.
       */
      static inline int user_mode(struct pt_regs *regs)
{
      #ifdef CONFIG_X86_32
              return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
      #else
              return !!(regs->cs & 3);
      #endif
      }
      
      static inline int v8086_mode(struct pt_regs *regs)
      {
      #ifdef CONFIG_X86_32
              return (regs->flags & X86_VM_MASK);
      #else
              return 0;        /* No V86 mode support in long mode */
      #endif
      }
      
      static inline bool user_64bit_mode(struct pt_regs *regs)
      {
      #ifdef CONFIG_X86_64
      #ifndef CONFIG_PARAVIRT
              /*
               * On non-paravirt systems, this is the only long mode CPL 3
               * selector.  We do not allow long mode selectors in the LDT.
               */
              return regs->cs == __USER_CS;
      #else
              /* Headers are too twisted for this to go in paravirt.h. */
              return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
      #endif
      #else /* !CONFIG_X86_64 */
              return false;
      #endif
      }
      
      #ifdef CONFIG_X86_64
      #define current_user_stack_pointer()        current_pt_regs()->sp
      #define compat_user_stack_pointer()        current_pt_regs()->sp
      #endif
      
      #ifdef CONFIG_X86_32
      extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
      #else
      static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
      {
              return regs->sp;
      }
      #endif
      
      #define GET_IP(regs) ((regs)->ip)
      #define GET_FP(regs) ((regs)->bp)
      #define GET_USP(regs) ((regs)->sp)
      
      #include <asm-generic/ptrace.h>
      
      /* Query offset/name of register from its name/offset */
      extern int regs_query_register_offset(const char *name);
      extern const char *regs_query_register_name(unsigned int offset);
      #define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
      
      /**
       * regs_get_register() - get register value from its offset
       * @regs:        pt_regs from which register value is gotten.
       * @offset:        offset number of the register.
       *
       * regs_get_register returns the value of a register. The @offset is the
 * offset of the register in struct pt_regs address which is specified by @regs.
       * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
       */
      static inline unsigned long regs_get_register(struct pt_regs *regs,
                                                    unsigned int offset)
      {
              if (unlikely(offset > MAX_REG_OFFSET))
                      return 0;
      #ifdef CONFIG_X86_32
              /*
               * Traps from the kernel do not save sp and ss.
               * Use the helper function to retrieve sp.
               */
              if (offset == offsetof(struct pt_regs, sp) &&
                  regs->cs == __KERNEL_CS)
                      return kernel_stack_pointer(regs);
      #endif
              return *(unsigned long *)((unsigned long)regs + offset);
      }
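
/*
 * Example (illustrative sketch): fetching a register by name, as a probe
 * handler might, by combining regs_query_register_offset() with
 * regs_get_register().  The helper name show_named_reg is hypothetical.
 */
static void show_named_reg(struct pt_regs *regs, const char *name)
{
        int offset = regs_query_register_offset(name);

        if (offset < 0)
                return;                /* unknown register name */
        pr_info("%s = %#lx\n", name, regs_get_register(regs, offset));
}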
      
      /**
       * regs_within_kernel_stack() - check the address in the stack
       * @regs:        pt_regs which contains kernel stack pointer.
       * @addr:        address which is checked.
       *
 * regs_within_kernel_stack() checks whether @addr is within the kernel stack page(s).
       * If @addr is within the kernel stack, it returns true. If not, returns false.
       */
      static inline int regs_within_kernel_stack(struct pt_regs *regs,
                                                 unsigned long addr)
      {
              return ((addr & ~(THREAD_SIZE - 1))  ==
                      (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
      }
      
      /**
       * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
       * @regs:        pt_regs which contains kernel stack pointer.
       * @n:                stack entry number.
       *
       * regs_get_kernel_stack_nth() returns the address of the @n th entry of the
       * kernel stack which is specified by @regs. If the @n th entry is NOT in
       * the kernel stack, this returns NULL.
       */
      static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
      {
              unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
      
              addr += n;
              if (regs_within_kernel_stack(regs, (unsigned long)addr))
                      return addr;
              else
                      return NULL;
      }
      
      /* To avoid include hell, we can't include uaccess.h */
      extern long probe_kernel_read(void *dst, const void *src, size_t size);
      
      /**
       * regs_get_kernel_stack_nth() - get Nth entry of the stack
       * @regs:        pt_regs which contains kernel stack pointer.
       * @n:                stack entry number.
       *
       * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
       * is specified by @regs. If the @n th entry is NOT in the kernel stack
       * this returns 0.
       */
      static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                                            unsigned int n)
      {
              unsigned long *addr;
              unsigned long val;
              long ret;
      
              addr = regs_get_kernel_stack_nth_addr(regs, n);
              if (addr) {
                      ret = probe_kernel_read(&val, addr, sizeof(val));
                      if (!ret)
                              return val;
              }
              return 0;
      }
      
      #define arch_has_single_step()        (1)
      #ifdef CONFIG_X86_DEBUGCTLMSR
      #define arch_has_block_step()        (1)
      #else
      #define arch_has_block_step()        (boot_cpu_data.x86 >= 6)
      #endif
      
      #define ARCH_HAS_USER_SINGLE_STEP_INFO
      
      /*
       * When hitting ptrace_stop(), we cannot return using SYSRET because
       * that does not restore the full CPU state, only a minimal set.  The
       * ptracer can change arbitrary register values, which is usually okay
       * because the usual ptrace stops run off the signal delivery path which
       * forces IRET; however, ptrace_event() stops happen in arbitrary places
       * in the kernel and don't force IRET path.
       *
       * So force IRET path after a ptrace stop.
       */
      #define arch_ptrace_stop_needed(code, info)                                \
      ({                                                                        \
              force_iret();                                                        \
              false;                                                                \
      })
      
      struct user_desc;
      extern int do_get_thread_area(struct task_struct *p, int idx,
                                    struct user_desc __user *info);
      extern int do_set_thread_area(struct task_struct *p, int idx,
                                    struct user_desc __user *info, int can_allocate);
      
      #endif /* !__ASSEMBLY__ */
      #endif /* _ASM_X86_PTRACE_H */
      /*
       * Hash: Hash algorithms under the crypto API
       * 
       * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
       *
       * This program is free software; you can redistribute it and/or modify it
       * under the terms of the GNU General Public License as published by the Free
       * Software Foundation; either version 2 of the License, or (at your option) 
       * any later version.
       *
       */
      
      #ifndef _CRYPTO_HASH_H
      #define _CRYPTO_HASH_H
      
      #include <linux/crypto.h>
      
      struct crypto_ahash;
      
      /**
       * DOC: Message Digest Algorithm Definitions
       *
       * These data structures define modular message digest algorithm
       * implementations, managed via crypto_register_ahash(),
       * crypto_register_shash(), crypto_unregister_ahash() and
       * crypto_unregister_shash().
       */
      
      /**
       * struct hash_alg_common - define properties of message digest
       * @digestsize: Size of the result of the transformation. A buffer of this size
       *                must be available to the @final and @finup calls, so they can
       *                store the resulting hash into it. For various predefined sizes,
       *                search include/crypto/ using
       *                git grep _DIGEST_SIZE include/crypto.
       * @statesize: Size of the block for partial state of the transformation. A
       *               buffer of this size must be passed to the @export function as it
       *               will save the partial state of the transformation into it. On the
       *               other side, the @import function will load the state from a
       *               buffer of this size as well.
       * @base: Start of data structure of cipher algorithm. The common data
       *          structure of crypto_alg contains information common to all ciphers.
       *          The hash_alg_common data structure now adds the hash-specific
       *          information.
       */
      struct hash_alg_common {
              unsigned int digestsize;
              unsigned int statesize;
      
              struct crypto_alg base;
      };
      
      struct ahash_request {
              struct crypto_async_request base;
      
              unsigned int nbytes;
              struct scatterlist *src;
              u8 *result;
      
              /* This field may only be used by the ahash API code. */
              void *priv;
      
              void *__ctx[] CRYPTO_MINALIGN_ATTR;
      };
      
      #define AHASH_REQUEST_ON_STACK(name, ahash) \
              char __##name##_desc[sizeof(struct ahash_request) + \
                      crypto_ahash_reqsize(ahash)] CRYPTO_MINALIGN_ATTR; \
              struct ahash_request *name = (void *)__##name##_desc
      
      /**
       * struct ahash_alg - asynchronous message digest definition
       * @init: Initialize the transformation context. Intended only to initialize the
       *          state of the HASH transformation at the beginning. This shall fill in
       *          the internal structures used during the entire duration of the whole
       *          transformation. No data processing happens at this point.
       * @update: Push a chunk of data into the driver for transformation. This
       *           function actually pushes blocks of data from upper layers into the
       *           driver, which then passes those to the hardware as seen fit. This
       *           function must not finalize the HASH transformation by calculating the
       *           final message digest as this only adds more data into the
       *           transformation. This function shall not modify the transformation
       *           context, as this function may be called in parallel with the same
       *           transformation object. Data processing can happen synchronously
       *           [SHASH] or asynchronously [AHASH] at this point.
       * @final: Retrieve result from the driver. This function finalizes the
       *           transformation and retrieves the resulting hash from the driver and
       *           pushes it back to upper layers. No data processing happens at this
       *           point.
       * @finup: Combination of @update and @final. This function is effectively a
       *           combination of @update and @final calls issued in sequence. As some
       *           hardware cannot do @update and @final separately, this callback was
       *           added to allow such hardware to be used at least by IPsec. Data
       *           processing can happen synchronously [SHASH] or asynchronously [AHASH]
       *           at this point.
       * @digest: Combination of @init and @update and @final. This function
       *            effectively behaves as the entire chain of operations, @init,
       *            @update and @final issued in sequence. Just like @finup, this was
       *            added for hardware which cannot do even the @finup, but can only do
       *            the whole transformation in one run. Data processing can happen
       *            synchronously [SHASH] or asynchronously [AHASH] at this point.
       * @setkey: Set optional key used by the hashing algorithm. Intended to push
       *            optional key used by the hashing algorithm from upper layers into
       *            the driver. This function can store the key in the transformation
       *            context or can outright program it into the hardware. In the former
       *            case, one must be careful to program the key into the hardware at
       *            appropriate time and one must be careful that .setkey() can be
       *            called multiple times during the existence of the transformation
 *            object. Not all hashing algorithms implement this function as it
       *            is only needed for keyed message digests. SHAx/MDx/CRCx do NOT
       *            implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement
       *            this function. This function must be called before any other of the
       *            @init, @update, @final, @finup, @digest is called. No data
       *            processing happens at this point.
       * @export: Export partial state of the transformation. This function dumps the
       *            entire state of the ongoing transformation into a provided block of
       *            data so it can be @import 'ed back later on. This is useful in case
       *            you want to save partial result of the transformation after
       *            processing certain amount of data and reload this partial result
       *            multiple times later on for multiple re-use. No data processing
       *            happens at this point.
       * @import: Import partial state of the transformation. This function loads the
       *            entire state of the ongoing transformation from a provided block of
       *            data so the transformation can continue from this point onward. No
       *            data processing happens at this point.
       * @halg: see struct hash_alg_common
       */
      struct ahash_alg {
              int (*init)(struct ahash_request *req);
              int (*update)(struct ahash_request *req);
              int (*final)(struct ahash_request *req);
              int (*finup)(struct ahash_request *req);
              int (*digest)(struct ahash_request *req);
              int (*export)(struct ahash_request *req, void *out);
              int (*import)(struct ahash_request *req, const void *in);
              int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
                            unsigned int keylen);
      
              struct hash_alg_common halg;
      };
      
      struct shash_desc {
              struct crypto_shash *tfm;
              u32 flags;
      
              void *__ctx[] CRYPTO_MINALIGN_ATTR;
      };
      
      #define SHASH_DESC_ON_STACK(shash, ctx)                                  \
              char __##shash##_desc[sizeof(struct shash_desc) +          \
                      crypto_shash_descsize(ctx)] CRYPTO_MINALIGN_ATTR; \
              struct shash_desc *shash = (struct shash_desc *)__##shash##_desc
      
      /**
       * struct shash_alg - synchronous message digest definition
       * @init: see struct ahash_alg
       * @update: see struct ahash_alg
       * @final: see struct ahash_alg
       * @finup: see struct ahash_alg
       * @digest: see struct ahash_alg
       * @export: see struct ahash_alg
       * @import: see struct ahash_alg
       * @setkey: see struct ahash_alg
       * @digestsize: see struct ahash_alg
       * @statesize: see struct ahash_alg
       * @descsize: Size of the operational state for the message digest. This state
       *               size is the memory size that needs to be allocated for
       *              shash_desc.__ctx
       * @base: internally used
       */
      struct shash_alg {
              int (*init)(struct shash_desc *desc);
              int (*update)(struct shash_desc *desc, const u8 *data,
                            unsigned int len);
              int (*final)(struct shash_desc *desc, u8 *out);
              int (*finup)(struct shash_desc *desc, const u8 *data,
                           unsigned int len, u8 *out);
              int (*digest)(struct shash_desc *desc, const u8 *data,
                            unsigned int len, u8 *out);
              int (*export)(struct shash_desc *desc, void *out);
              int (*import)(struct shash_desc *desc, const void *in);
              int (*setkey)(struct crypto_shash *tfm, const u8 *key,
                            unsigned int keylen);
      
              unsigned int descsize;
      
              /* These fields must match hash_alg_common. */
              unsigned int digestsize
                      __attribute__ ((aligned(__alignof__(struct hash_alg_common))));
              unsigned int statesize;
      
              struct crypto_alg base;
      };
      
      struct crypto_ahash {
              int (*init)(struct ahash_request *req);
              int (*update)(struct ahash_request *req);
              int (*final)(struct ahash_request *req);
              int (*finup)(struct ahash_request *req);
              int (*digest)(struct ahash_request *req);
              int (*export)(struct ahash_request *req, void *out);
              int (*import)(struct ahash_request *req, const void *in);
              int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
                            unsigned int keylen);
      
              unsigned int reqsize;
              bool has_setkey;
              struct crypto_tfm base;
      };
      
      struct crypto_shash {
              unsigned int descsize;
              struct crypto_tfm base;
      };
      
      /**
       * DOC: Asynchronous Message Digest API
       *
       * The asynchronous message digest API is used with the ciphers of type
       * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto)
       *
       * The asynchronous cipher operation discussion provided for the
       * CRYPTO_ALG_TYPE_ABLKCIPHER API applies here as well.
       */
      
      static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
      {
              return container_of(tfm, struct crypto_ahash, base);
      }
      
      /**
       * crypto_alloc_ahash() - allocate ahash cipher handle
       * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
       *              ahash cipher
       * @type: specifies the type of the cipher
       * @mask: specifies the mask for the cipher
       *
       * Allocate a cipher handle for an ahash. The returned struct
       * crypto_ahash is the cipher handle that is required for any subsequent
       * API invocation for that ahash.
       *
       * Return: allocated cipher handle in case of success; IS_ERR() is true in case
       *           of an error, PTR_ERR() returns the error code.
       */
      struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
                                              u32 mask);
      
      static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm)
      {
              return &tfm->base;
      }
      
      /**
       * crypto_free_ahash() - zeroize and free the ahash handle
       * @tfm: cipher handle to be freed
       */
      static inline void crypto_free_ahash(struct crypto_ahash *tfm)
      {
              crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm));
      }
      
      static inline unsigned int crypto_ahash_alignmask(
              struct crypto_ahash *tfm)
      {
        return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm));
      }
      
      /**
       * crypto_ahash_blocksize() - obtain block size for cipher
       * @tfm: cipher handle
       *
       * The block size for the message digest cipher referenced with the cipher
       * handle is returned.
       *
       * Return: block size of cipher
       */
      static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm)
      {
              return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm));
      }
      
      static inline struct hash_alg_common *__crypto_hash_alg_common(
              struct crypto_alg *alg)
      {
              return container_of(alg, struct hash_alg_common, base);
      }
      
      static inline struct hash_alg_common *crypto_hash_alg_common(
              struct crypto_ahash *tfm)
      {
        return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg);
      }
      
      /**
       * crypto_ahash_digestsize() - obtain message digest size
       * @tfm: cipher handle
       *
       * The size for the message digest created by the message digest cipher
       * referenced with the cipher handle is returned.
       *
       *
       * Return: message digest size of cipher
       */
      static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm)
      {
              return crypto_hash_alg_common(tfm)->digestsize;
      }
      
      static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm)
      {
              return crypto_hash_alg_common(tfm)->statesize;
      }
      
      static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm)
      {
              return crypto_tfm_get_flags(crypto_ahash_tfm(tfm));
      }
      
      static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags)
      {
              crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags);
      }
      
      static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags)
      {
        crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags);
      }
      
      /**
       * crypto_ahash_reqtfm() - obtain cipher handle from request
       * @req: asynchronous request handle that contains the reference to the ahash
       *         cipher handle
       *
       * Return the ahash cipher handle that is registered with the asynchronous
       * request handle ahash_request.
       *
       * Return: ahash cipher handle
       */
      static inline struct crypto_ahash *crypto_ahash_reqtfm(
              struct ahash_request *req)
      {
              return __crypto_ahash_cast(req->base.tfm);
      }
      
      /**
       * crypto_ahash_reqsize() - obtain size of the request data structure
       * @tfm: cipher handle
       *
 * Return the size of the driver-private request context for this cipher
 * handle. Together with sizeof(struct ahash_request), it determines how much
 * memory an ahash_request needs (see ahash_request_alloc() and
 * AHASH_REQUEST_ON_STACK).
 *
 * Return: size of the request data
       */
      static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
      {
              return tfm->reqsize;
      }
      
      static inline void *ahash_request_ctx(struct ahash_request *req)
      {
              return req->__ctx;
      }
      
      /**
       * crypto_ahash_setkey - set key for cipher handle
       * @tfm: cipher handle
       * @key: buffer holding the key
       * @keylen: length of the key in bytes
       *
       * The caller provided key is set for the ahash cipher. The cipher
       * handle must point to a keyed hash in order for this function to succeed.
       *
       * Return: 0 if the setting of the key was successful; < 0 if an error occurred
       */
      int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
                              unsigned int keylen);
      
      static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm)
      {
              return tfm->has_setkey;
      }
      
      /**
       * crypto_ahash_finup() - update and finalize message digest
       * @req: reference to the ahash_request handle that holds all information
       *         needed to perform the cipher operation
       *
       * This function is a "short-hand" for the function calls of
 * crypto_ahash_update and crypto_ahash_final. The parameters have the same
       * meaning as discussed for those separate functions.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_ahash_finup(struct ahash_request *req);
      
      /**
       * crypto_ahash_final() - calculate message digest
       * @req: reference to the ahash_request handle that holds all information
       *         needed to perform the cipher operation
       *
       * Finalize the message digest operation and create the message digest
       * based on all data added to the cipher handle. The message digest is placed
       * into the output buffer registered with the ahash_request handle.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_ahash_final(struct ahash_request *req);
      
      /**
       * crypto_ahash_digest() - calculate message digest for a buffer
       * @req: reference to the ahash_request handle that holds all information
       *         needed to perform the cipher operation
       *
       * This function is a "short-hand" for the function calls of crypto_ahash_init,
       * crypto_ahash_update and crypto_ahash_final. The parameters have the same
       * meaning as discussed for those separate three functions.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_ahash_digest(struct ahash_request *req);
      
      /**
       * crypto_ahash_export() - extract current message digest state
       * @req: reference to the ahash_request handle whose state is exported
       * @out: output buffer of sufficient size that can hold the hash state
       *
       * This function exports the hash state of the ahash_request handle into the
       * caller-allocated output buffer out which must have sufficient size (e.g. by
 * calling crypto_ahash_statesize()).
       *
       * Return: 0 if the export was successful; < 0 if an error occurred
       */
      static inline int crypto_ahash_export(struct ahash_request *req, void *out)
      {
              return crypto_ahash_reqtfm(req)->export(req, out);
      }
      
      /**
       * crypto_ahash_import() - import message digest state
       * @req: reference to ahash_request handle the state is imported into
       * @in: buffer holding the state
       *
       * This function imports the hash state into the ahash_request handle from the
       * input buffer. That buffer should have been generated with the
       * crypto_ahash_export function.
       *
       * Return: 0 if the import was successful; < 0 if an error occurred
       */
      static inline int crypto_ahash_import(struct ahash_request *req, const void *in)
      {
              return crypto_ahash_reqtfm(req)->import(req, in);
      }
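
/*
 * Example (illustrative sketch): saving the partial state of an in-progress
 * hash.  The exported blob is crypto_ahash_statesize() bytes and can later be
 * fed back through crypto_ahash_import().  example_save_state is
 * hypothetical.
 */
static void *example_save_state(struct ahash_request *req, gfp_t gfp)
{
        struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
        void *state = kmalloc(crypto_ahash_statesize(tfm), gfp);

        if (state && crypto_ahash_export(req, state)) {
                kfree(state);
                state = NULL;
        }
        return state;
}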
      
      /**
       * crypto_ahash_init() - (re)initialize message digest handle
       * @req: ahash_request handle that already is initialized with all necessary
       *         data using the ahash_request_* API functions
       *
       * The call (re-)initializes the message digest referenced by the ahash_request
       * handle. Any potentially existing state created by previous operations is
       * discarded.
       *
       * Return: 0 if the message digest initialization was successful; < 0 if an
       *           error occurred
       */
      static inline int crypto_ahash_init(struct ahash_request *req)
      {
              return crypto_ahash_reqtfm(req)->init(req);
      }
      
      /**
       * crypto_ahash_update() - add data to message digest for processing
       * @req: ahash_request handle that was previously initialized with the
       *         crypto_ahash_init call.
       *
       * Updates the message digest state of the &ahash_request handle. The input data
       * is pointed to by the scatter/gather list registered in the &ahash_request
       * handle
       *
       * Return: 0 if the message digest update was successful; < 0 if an error
       *           occurred
       */
      static inline int crypto_ahash_update(struct ahash_request *req)
      {
              return crypto_ahash_reqtfm(req)->update(req);
      }
      
      /**
       * DOC: Asynchronous Hash Request Handle
       *
       * The &ahash_request data structure contains all pointers to data
       * required for the asynchronous cipher operation. This includes the cipher
       * handle (which can be used by multiple &ahash_request instances), pointer
       * to plaintext and the message digest output buffer, asynchronous callback
       * function, etc. It acts as a handle to the ahash_request_* API calls in a
       * similar way as ahash handle to the crypto_ahash_* API calls.
       */
      
      /**
       * ahash_request_set_tfm() - update cipher handle reference in request
       * @req: request handle to be modified
       * @tfm: cipher handle that shall be added to the request handle
       *
       * Allow the caller to replace the existing ahash handle in the request
       * data structure with a different one.
       */
      static inline void ahash_request_set_tfm(struct ahash_request *req,
                                               struct crypto_ahash *tfm)
      {
              req->base.tfm = crypto_ahash_tfm(tfm);
      }
      
      /**
       * ahash_request_alloc() - allocate request data structure
       * @tfm: cipher handle to be registered with the request
       * @gfp: memory allocation flag that is handed to kmalloc by the API call.
       *
       * Allocate the request data structure that must be used with the ahash
 * message digest API calls. During the allocation, the provided ahash handle
 * is registered in the request data structure.
       *
       * Return: allocated request handle in case of success; IS_ERR() is true in case
       *           of an error, PTR_ERR() returns the error code.
       */
      static inline struct ahash_request *ahash_request_alloc(
              struct crypto_ahash *tfm, gfp_t gfp)
      {
              struct ahash_request *req;
      
              req = kmalloc(sizeof(struct ahash_request) +
                            crypto_ahash_reqsize(tfm), gfp);
      
              if (likely(req))
                      ahash_request_set_tfm(req, tfm);
      
              return req;
      }
      
      /**
       * ahash_request_free() - zeroize and free the request data structure
       * @req: request data structure cipher handle to be freed
       */
      static inline void ahash_request_free(struct ahash_request *req)
      {
              kzfree(req);
      }
      
      static inline struct ahash_request *ahash_request_cast(
              struct crypto_async_request *req)
      {
              return container_of(req, struct ahash_request, base);
      }
      
      /**
       * ahash_request_set_callback() - set asynchronous callback function
       * @req: request handle
       * @flags: specify zero or an ORing of the flags
       *           CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
       *           increase the wait queue beyond the initial maximum size;
       *           CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
       * @compl: callback function pointer to be registered with the request handle
       * @data: The data pointer refers to memory that is not used by the kernel
       *          crypto API, but provided to the callback function for it to use. Here,
       *          the caller can provide a reference to memory the callback function can
       *          operate on. As the callback function is invoked asynchronously to the
       *          related functionality, it may need to access data structures of the
       *          related functionality which can be referenced using this pointer. The
       *          callback function can access the memory via the "data" field in the
       *          &crypto_async_request data structure provided to the callback function.
       *
       * This function allows setting the callback function that is triggered once
       * the cipher operation completes.
       *
       * The callback function is registered with the &ahash_request handle and
       * must comply with the following template
       *
       *        void callback_function(struct crypto_async_request *req, int error)
       */
      static inline void ahash_request_set_callback(struct ahash_request *req,
                                                    u32 flags,
                                                    crypto_completion_t compl,
                                                    void *data)
      {
              req->base.complete = compl;
              req->base.data = data;
              req->base.flags = flags;
      }
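
/*
 * Example (illustrative sketch): the completion-based wait pattern commonly
 * used with asynchronous requests.  Register example_done() with
 * ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, example_done,
 * &res); the struct and function names here are hypothetical.
 */
struct example_result {
        struct completion completion;
        int err;
};

static void example_done(struct crypto_async_request *req, int err)
{
        struct example_result *res = req->data;

        if (err == -EINPROGRESS)
                return;                /* backlogged request is now running */
        res->err = err;
        complete(&res->completion);
}

static int example_wait(int ret, struct example_result *res)
{
        if (ret == -EINPROGRESS || ret == -EBUSY) {
                wait_for_completion(&res->completion);
                ret = res->err;
        }
        return ret;
}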
      
      /**
       * ahash_request_set_crypt() - set data buffers
       * @req: ahash_request handle to be updated
       * @src: source scatter/gather list
       * @result: buffer that is filled with the message digest -- the caller must
       *            ensure that the buffer has sufficient space by, for example, calling
       *            crypto_ahash_digestsize()
       * @nbytes: number of bytes to process from the source scatter/gather list
       *
       * By using this call, the caller references the source scatter/gather list.
       * The source scatter/gather list points to the data the message digest is to
       * be calculated for.
       */
      static inline void ahash_request_set_crypt(struct ahash_request *req,
                                                 struct scatterlist *src, u8 *result,
                                                 unsigned int nbytes)
      {
              req->src = src;
              req->nbytes = nbytes;
              req->result = result;
      }
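
/*
 * Example (illustrative sketch): one-shot SHA-256 of a linear buffer with the
 * ahash API.  Requesting CRYPTO_ALG_ASYNC in the mask selects a synchronous-
 * capable implementation so crypto_ahash_digest() completes before returning;
 * buf must be addressable by a scatterlist (e.g. kmalloc'ed, not on the
 * stack) and digest must hold crypto_ahash_digestsize(tfm) bytes.
 * example_sha256 is hypothetical and assumes "sha256" is available.
 */
static int example_sha256(const u8 *buf, unsigned int len, u8 *digest)
{
        struct crypto_ahash *tfm;
        struct ahash_request *req;
        struct scatterlist sg;
        int ret;

        tfm = crypto_alloc_ahash("sha256", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        req = ahash_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
                ret = -ENOMEM;
                goto out_free_tfm;
        }

        sg_init_one(&sg, buf, len);
        ahash_request_set_callback(req, 0, NULL, NULL);
        ahash_request_set_crypt(req, &sg, digest, len);

        ret = crypto_ahash_digest(req);

        ahash_request_free(req);
out_free_tfm:
        crypto_free_ahash(tfm);
        return ret;
}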
      
      /**
       * DOC: Synchronous Message Digest API
       *
       * The synchronous message digest API is used with the ciphers of type
       * CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto)
       *
       * The message digest API is able to maintain state information for the
       * caller.
       *
 * The synchronous message digest API can store user-related context in its
       * shash_desc request data structure.
       */
      
      /**
       * crypto_alloc_shash() - allocate message digest handle
       * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
       *              message digest cipher
       * @type: specifies the type of the cipher
       * @mask: specifies the mask for the cipher
       *
       * Allocate a cipher handle for a message digest. The returned &struct
       * crypto_shash is the cipher handle that is required for any subsequent
       * API invocation for that message digest.
       *
       * Return: allocated cipher handle in case of success; IS_ERR() is true in case
       *           of an error, PTR_ERR() returns the error code.
       */
      struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                              u32 mask);
      
      static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
      {
              return &tfm->base;
      }
      
      /**
       * crypto_free_shash() - zeroize and free the message digest handle
       * @tfm: cipher handle to be freed
       */
      static inline void crypto_free_shash(struct crypto_shash *tfm)
      {
              crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm));
      }
      
      static inline unsigned int crypto_shash_alignmask(
              struct crypto_shash *tfm)
      {
              return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm));
      }
      
      /**
       * crypto_shash_blocksize() - obtain block size for cipher
       * @tfm: cipher handle
       *
       * The block size for the message digest cipher referenced with the cipher
       * handle is returned.
       *
       * Return: block size of cipher
       */
      static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
      {
              return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm));
      }
      
      static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
      {
              return container_of(alg, struct shash_alg, base);
      }
      
      static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
      {
        return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
      }
      
      /**
       * crypto_shash_digestsize() - obtain message digest size
       * @tfm: cipher handle
       *
       * The size for the message digest created by the message digest cipher
       * referenced with the cipher handle is returned.
       *
       * Return: digest size of cipher
       */
      static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
      {
        return crypto_shash_alg(tfm)->digestsize;
      }
      
      static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
      {
              return crypto_shash_alg(tfm)->statesize;
      }
      
      static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
      {
              return crypto_tfm_get_flags(crypto_shash_tfm(tfm));
      }
      
      static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
      {
              crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags);
      }
      
      static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
      {
              crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags);
      }
      
      /**
       * crypto_shash_descsize() - obtain the operational state size
       * @tfm: cipher handle
       *
       * The size of the operational state the cipher needs during operation is
       * returned for the hash referenced with the cipher handle. This size is
 * required to calculate the memory requirements so that the caller can
 * allocate sufficient memory for the operational state.
       *
       * The operational state is defined with struct shash_desc where the size of
       * that data structure is to be calculated as
       * sizeof(struct shash_desc) + crypto_shash_descsize(alg)
       *
       * Return: size of the operational state
       */
      static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
      {
        return tfm->descsize;
      }
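
/*
 * Example (illustrative sketch): heap-allocating a shash_desc when it cannot
 * live on the stack, using the sizeof(struct shash_desc) +
 * crypto_shash_descsize() rule described above.  example_alloc_desc is
 * hypothetical.
 */
static struct shash_desc *example_alloc_desc(struct crypto_shash *tfm,
                                             gfp_t gfp)
{
        struct shash_desc *desc;

        desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), gfp);
        if (desc) {
                desc->tfm = tfm;
                desc->flags = 0;
        }
        return desc;
}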
      
      static inline void *shash_desc_ctx(struct shash_desc *desc)
      {
        return desc->__ctx;
      }
      
      /**
       * crypto_shash_setkey() - set key for message digest
       * @tfm: cipher handle
       * @key: buffer holding the key
       * @keylen: length of the key in bytes
       *
       * The caller provided key is set for the keyed message digest cipher. The
       * cipher handle must point to a keyed message digest cipher in order for this
       * function to succeed.
       *
       * Return: 0 if the setting of the key was successful; < 0 if an error occurred
       */
      int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                              unsigned int keylen);
      
      /**
       * crypto_shash_digest() - calculate message digest for buffer
       * @desc: see crypto_shash_final()
       * @data: see crypto_shash_update()
       * @len: see crypto_shash_update()
       * @out: see crypto_shash_final()
       *
       * This function is a "short-hand" for the function calls of crypto_shash_init,
       * crypto_shash_update and crypto_shash_final. The parameters have the same
       * meaning as discussed for those three separate functions.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out);
      
      /**
       * crypto_shash_export() - extract operational state for message digest
       * @desc: reference to the operational state handle whose state is exported
       * @out: output buffer of sufficient size that can hold the hash state
       *
       * This function exports the hash state of the operational state handle into the
       * caller-allocated output buffer out, which must be large enough to hold the
       * exported state (its size can be obtained with crypto_shash_statesize).
       *
       * Return: 0 if the export creation was successful; < 0 if an error occurred
       */
      static inline int crypto_shash_export(struct shash_desc *desc, void *out)
      {
              return crypto_shash_alg(desc->tfm)->export(desc, out);
      }
      
      /**
       * crypto_shash_import() - import operational state
       * @desc: reference to the operational state handle the state imported into
       * @in: buffer holding the state
       *
       * This function imports the hash state into the operational state handle from
       * the input buffer. That buffer should have been generated with the
       * crypto_shash_export function.
       *
       * Return: 0 if the import was successful; < 0 if an error occurred
       */
      static inline int crypto_shash_import(struct shash_desc *desc, const void *in)
      {
              return crypto_shash_alg(desc->tfm)->import(desc, in);
      }
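
      /*
       * Example (illustrative sketch): checkpointing a partial hash computation
       * from one descriptor and resuming it in another one allocated for the
       * same algorithm.  The helper name and the use of kmalloc()/kfree()
       * (assuming <linux/slab.h>) are assumptions for the sake of the example.
       */
      static inline int example_shash_clone_state(struct shash_desc *src,
                                                  struct shash_desc *dst)
      {
              void *state;
              int err;

              /* the exported state occupies crypto_shash_statesize() bytes */
              state = kmalloc(crypto_shash_statesize(src->tfm), GFP_KERNEL);
              if (!state)
                      return -ENOMEM;

              err = crypto_shash_export(src, state);
              if (!err)
                      err = crypto_shash_import(dst, state);

              kfree(state);
              return err;
      }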
      
      /**
       * crypto_shash_init() - (re)initialize message digest
       * @desc: operational state handle that is already filled
       *
       * The call (re-)initializes the message digest referenced by the
       * operational state handle. Any potentially existing state created by
       * previous operations is discarded.
       *
       * Return: 0 if the message digest initialization was successful; < 0 if an
       *           error occurred
       */
      static inline int crypto_shash_init(struct shash_desc *desc)
      {
              return crypto_shash_alg(desc->tfm)->init(desc);
      }
      
      /**
       * crypto_shash_update() - add data to message digest for processing
       * @desc: operational state handle that is already initialized
       * @data: input data to be added to the message digest
       * @len: length of the input data
       *
       * Updates the message digest state of the operational state handle.
       *
       * Return: 0 if the message digest update was successful; < 0 if an error
       *           occurred
       */
      int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                              unsigned int len);
      
      /**
       * crypto_shash_final() - calculate message digest
       * @desc: operational state handle that is already filled with data
       * @out: output buffer filled with the message digest
       *
       * Finalize the message digest operation and create the message digest
       * based on all data added to the cipher handle. The message digest is placed
       * into the output buffer. The caller must ensure that the output buffer is
       * large enough by using crypto_shash_digestsize.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_shash_final(struct shash_desc *desc, u8 *out);
      
      /**
       * crypto_shash_finup() - calculate message digest of buffer
       * @desc: see crypto_shash_final()
       * @data: see crypto_shash_update()
       * @len: see crypto_shash_update()
       * @out: see crypto_shash_final()
       *
       * This function is a "short-hand" for the function calls of
       * crypto_shash_update and crypto_shash_final. The parameters have the same
       * meaning as discussed for those separate functions.
       *
       * Return: 0 if the message digest creation was successful; < 0 if an error
       *           occurred
       */
      int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                             unsigned int len, u8 *out);
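
      /*
       * Example (illustrative sketch): hashing a message supplied in two chunks
       * with the streaming interface declared above.  The algorithm name
       * "sha256", the helper name and the use of <linux/slab.h>/<linux/err.h>
       * helpers are assumptions; @out is expected to provide
       * crypto_shash_digestsize() bytes.
       */
      static inline int example_shash_two_chunks(const u8 *a, unsigned int alen,
                                                 const u8 *b, unsigned int blen,
                                                 u8 *out)
      {
              struct crypto_shash *tfm;
              struct shash_desc *desc;
              int err;

              tfm = crypto_alloc_shash("sha256", 0, 0);
              if (IS_ERR(tfm))
                      return PTR_ERR(tfm);

              /* room for struct shash_desc plus the algorithm's private state */
              desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
              if (!desc) {
                      crypto_free_shash(tfm);
                      return -ENOMEM;
              }
              desc->tfm = tfm;

              err = crypto_shash_init(desc);
              if (!err)
                      err = crypto_shash_update(desc, a, alen);
              if (!err)
                      err = crypto_shash_finup(desc, b, blen, out);

              kfree(desc);
              crypto_free_shash(tfm);
              return err;
      }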
      
      #endif        /* _CRYPTO_HASH_H */
      /*
       * mm/page-writeback.c
       *
       * Copyright (C) 2002, Linus Torvalds.
       * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
       *
       * Contains functions related to writing back dirty pages at the
       * address_space level.
       *
       * 10Apr2002        Andrew Morton
       *                Initial version
       */
      
      #include <linux/kernel.h>
      #include <linux/export.h>
      #include <linux/spinlock.h>
      #include <linux/fs.h>
      #include <linux/mm.h>
      #include <linux/swap.h>
      #include <linux/slab.h>
      #include <linux/pagemap.h>
      #include <linux/writeback.h>
      #include <linux/init.h>
      #include <linux/backing-dev.h>
      #include <linux/task_io_accounting_ops.h>
      #include <linux/blkdev.h>
      #include <linux/mpage.h>
      #include <linux/rmap.h>
      #include <linux/percpu.h>
      #include <linux/notifier.h>
      #include <linux/smp.h>
      #include <linux/sysctl.h>
      #include <linux/cpu.h>
      #include <linux/syscalls.h>
      #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
      #include <linux/pagevec.h>
      #include <linux/timer.h>
      #include <linux/sched/rt.h>
      #include <linux/mm_inline.h>
      #include <trace/events/writeback.h>
      
      #include "internal.h"
      
      /*
       * Sleep at most 200ms at a time in balance_dirty_pages().
       */
      #define MAX_PAUSE                max(HZ/5, 1)
      
      /*
       * Try to keep balance_dirty_pages() call intervals higher than this many pages
       * by raising the pause time to max_pause when the interval falls below it.
       */
      #define DIRTY_POLL_THRESH        (128 >> (PAGE_SHIFT - 10))
      
      /*
       * Estimate write bandwidth at 200ms intervals.
       */
      #define BANDWIDTH_INTERVAL        max(HZ/5, 1)
      
      #define RATELIMIT_CALC_SHIFT        10
      
      /*
       * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
       * will look to see if it needs to force writeback or throttling.
       */
      static long ratelimit_pages = 32;
      
      /* The following parameters are exported via /proc/sys/vm */
      
      /*
       * Start background writeback (via writeback threads) at this percentage
       */
      int dirty_background_ratio = 10;
      
      /*
       * dirty_background_bytes starts at 0 (disabled) so that it is a function of
       * dirty_background_ratio * the amount of dirtyable memory
       */
      unsigned long dirty_background_bytes;
      
      /*
       * free highmem will not be subtracted from the total free memory
       * for calculating free ratios if vm_highmem_is_dirtyable is true
       */
      int vm_highmem_is_dirtyable;
      
      /*
       * The generator of dirty data starts writeback at this percentage
       */
      int vm_dirty_ratio = 20;
      
      /*
       * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
       * vm_dirty_ratio * the amount of dirtyable memory
       */
      unsigned long vm_dirty_bytes;
      
      /*
       * The interval between `kupdate'-style writebacks
       */
      unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
      
      EXPORT_SYMBOL_GPL(dirty_writeback_interval);
      
      /*
       * The longest time for which data is allowed to remain dirty
       */
      unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
      
      /*
       * Flag that makes the machine dump writes/reads and block dirtyings.
       */
      int block_dump;
      
      /*
       * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
       * a full sync is triggered after this time elapses without any disk activity.
       */
      int laptop_mode;
      
      EXPORT_SYMBOL(laptop_mode);
      
      /* End of sysctl-exported parameters */
      
      struct wb_domain global_wb_domain;
      
      /* consolidated parameters for balance_dirty_pages() and its subroutines */
      struct dirty_throttle_control {
      #ifdef CONFIG_CGROUP_WRITEBACK
              struct wb_domain        *dom;
              struct dirty_throttle_control *gdtc;        /* only set in memcg dtc's */
      #endif
              struct bdi_writeback        *wb;
              struct fprop_local_percpu *wb_completions;
      
              unsigned long                avail;                /* dirtyable */
              unsigned long                dirty;                /* file_dirty + write + nfs */
              unsigned long                thresh;                /* dirty threshold */
              unsigned long                bg_thresh;        /* dirty background threshold */
      
              unsigned long                wb_dirty;        /* per-wb counterparts */
              unsigned long                wb_thresh;
              unsigned long                wb_bg_thresh;
      
              unsigned long                pos_ratio;
      };
      
      /*
       * Length of period for aging writeout fractions of bdis. This is an
       * arbitrarily chosen number. The longer the period, the slower fractions will
       * reflect changes in current writeout rate.
       */
      #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
      
      #ifdef CONFIG_CGROUP_WRITEBACK
      
      #define GDTC_INIT(__wb)                .wb = (__wb),                                \
                                      .dom = &global_wb_domain,                \
                                      .wb_completions = &(__wb)->completions
      
      #define GDTC_INIT_NO_WB                .dom = &global_wb_domain
      
      #define MDTC_INIT(__wb, __gdtc)        .wb = (__wb),                                \
                                      .dom = mem_cgroup_wb_domain(__wb),        \
                                      .wb_completions = &(__wb)->memcg_completions, \
                                      .gdtc = __gdtc
      
      static bool mdtc_valid(struct dirty_throttle_control *dtc)
      {
              return dtc->dom;
      }
      
      static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
      {
              return dtc->dom;
      }
      
      static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
      {
              return mdtc->gdtc;
      }
      
      static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
      {
              return &wb->memcg_completions;
      }
      
      static void wb_min_max_ratio(struct bdi_writeback *wb,
                                   unsigned long *minp, unsigned long *maxp)
      {
              unsigned long this_bw = wb->avg_write_bandwidth;
              unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
              unsigned long long min = wb->bdi->min_ratio;
              unsigned long long max = wb->bdi->max_ratio;
      
              /*
               * @wb may already be clean by the time control reaches here and
               * the total may not include its bw.
               */
              if (this_bw < tot_bw) {
                      if (min) {
                              min *= this_bw;
                              do_div(min, tot_bw);
                      }
                      if (max < 100) {
                              max *= this_bw;
                              do_div(max, tot_bw);
                      }
              }
      
              *minp = min;
              *maxp = max;
      }
      
      #else        /* CONFIG_CGROUP_WRITEBACK */
      
      #define GDTC_INIT(__wb)                .wb = (__wb),                           \
                                      .wb_completions = &(__wb)->completions
      #define GDTC_INIT_NO_WB
      #define MDTC_INIT(__wb, __gdtc)
      
      static bool mdtc_valid(struct dirty_throttle_control *dtc)
      {
              return false;
      }
      
      static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
      {
              return &global_wb_domain;
      }
      
      static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
      {
              return NULL;
      }
      
      static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
      {
              return NULL;
      }
      
      static void wb_min_max_ratio(struct bdi_writeback *wb,
                                   unsigned long *minp, unsigned long *maxp)
      {
              *minp = wb->bdi->min_ratio;
              *maxp = wb->bdi->max_ratio;
      }
      
      #endif        /* CONFIG_CGROUP_WRITEBACK */
      
      /*
       * In a memory zone, there is a certain amount of pages we consider
       * available for the page cache, which is essentially the number of
       * free and reclaimable pages, minus some zone reserves to protect
       * lowmem and the ability to uphold the zone's watermarks without
       * requiring writeback.
       *
       * This number of dirtyable pages is the base value to which the
       * user-configurable dirty ratio is applied to obtain the effective number
       * of pages that are allowed to be actually dirtied, either per individual
       * zone or globally by using the sum of dirtyable pages over all zones.
       *
       * Because the user is allowed to specify the dirty limit globally as
       * absolute number of bytes, calculating the per-zone dirty limit can
       * require translating the configured limit into a percentage of
       * global dirtyable memory first.
       */
      
      /**
       * zone_dirtyable_memory - number of dirtyable pages in a zone
       * @zone: the zone
       *
       * Returns the zone's number of pages potentially available for dirty
       * page cache.  This is the base value for the per-zone dirty limits.
       */
      static unsigned long zone_dirtyable_memory(struct zone *zone)
      {
              unsigned long nr_pages;
      
              nr_pages = zone_page_state(zone, NR_FREE_PAGES);
              nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
      
              nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
              nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
      
              return nr_pages;
      }
      
      static unsigned long highmem_dirtyable_memory(unsigned long total)
      {
      #ifdef CONFIG_HIGHMEM
              int node;
              unsigned long x = 0;
      
              for_each_node_state(node, N_HIGH_MEMORY) {
                      struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
      
                      x += zone_dirtyable_memory(z);
              }
              /*
               * Unreclaimable memory (kernel memory or anonymous memory
               * without swap) can bring down the dirtyable pages below
               * the zone's dirty balance reserve and the above calculation
               * will underflow.  However we still want to add in nodes
               * which are below threshold (negative values) to get a more
               * accurate calculation but make sure that the total never
               * underflows.
               */
              if ((long)x < 0)
                      x = 0;
      
              /*
               * Make sure that the number of highmem pages is never larger
               * than the number of the total dirtyable memory. This can only
               * occur in very strange VM situations but we want to make sure
               * that this does not occur.
               */
              return min(x, total);
      #else
              return 0;
      #endif
      }
      
      /**
       * global_dirtyable_memory - number of globally dirtyable pages
       *
       * Returns the global number of pages potentially available for dirty
       * page cache.  This is the base value for the global dirty limits.
       */
      static unsigned long global_dirtyable_memory(void)
      {
              unsigned long x;
      
              x = global_page_state(NR_FREE_PAGES);
              x -= min(x, dirty_balance_reserve);
      
              x += global_page_state(NR_INACTIVE_FILE);
              x += global_page_state(NR_ACTIVE_FILE);
      
              if (!vm_highmem_is_dirtyable)
                      x -= highmem_dirtyable_memory(x);
      
              return x + 1;        /* Ensure that we never return 0 */
      }
      
      /**
       * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
       * @dtc: dirty_throttle_control of interest
       *
       * Calculate @dtc->thresh and ->bg_thresh considering
       * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
       * must ensure that @dtc->avail is set before calling this function.  The
       * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (i.e. nfsd) and
       * real-time tasks.
       */
      static void domain_dirty_limits(struct dirty_throttle_control *dtc)
      {
              const unsigned long available_memory = dtc->avail;
              struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
              unsigned long bytes = vm_dirty_bytes;
              unsigned long bg_bytes = dirty_background_bytes;
              /* convert ratios to per-PAGE_SIZE for higher precision */
              unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
              unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
              unsigned long thresh;
              unsigned long bg_thresh;
              struct task_struct *tsk;
      
              /* gdtc is !NULL iff @dtc is for memcg domain */
              if (gdtc) {
                      unsigned long global_avail = gdtc->avail;
      
                      /*
                       * The byte settings can't be applied directly to memcg
                       * domains.  Convert them to ratios by scaling against
                       * globally available memory.  As the ratios are in
                       * per-PAGE_SIZE, they can be obtained by dividing bytes by
                       * number of pages.
                       */
                      if (bytes)
                              ratio = min(DIV_ROUND_UP(bytes, global_avail),
                                          PAGE_SIZE);
                      if (bg_bytes)
                              bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                                             PAGE_SIZE);
                      bytes = bg_bytes = 0;
              }
      
              if (bytes)
                      thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
              else
                      thresh = (ratio * available_memory) / PAGE_SIZE;
      
              if (bg_bytes)
                      bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
              else
                      bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

              if (bg_thresh >= thresh)
                      bg_thresh = thresh / 2;
              tsk = current;
              if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                      bg_thresh += bg_thresh / 4;
                      thresh += thresh / 4;
              }
              dtc->thresh = thresh;
              dtc->bg_thresh = bg_thresh;

              /* we should eventually report the domain in the TP */
              if (!gdtc)
                      trace_global_dirty_state(bg_thresh, thresh);
      }
      
      /**
       * global_dirty_limits - background-writeback and dirty-throttling thresholds
       * @pbackground: out parameter for bg_thresh
       * @pdirty: out parameter for thresh
       *
       * Calculate bg_thresh and thresh for global_wb_domain.  See
       * domain_dirty_limits() for details.
       */
      void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
      {
              struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
      
              gdtc.avail = global_dirtyable_memory();
              domain_dirty_limits(&gdtc);
      
              *pbackground = gdtc.bg_thresh;
              *pdirty = gdtc.thresh;
      }
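
      /*
       * Example (illustrative sketch, loosely modelled on the background
       * threshold check done by the writeback code): the helper name is an
       * assumption and the function is not used anywhere in this file.
       */
      static bool example_over_background_thresh(void)
      {
              unsigned long background_thresh, dirty_thresh;

              global_dirty_limits(&background_thresh, &dirty_thresh);

              /* dirty balancing counts dirty plus NFS unstable pages */
              return global_page_state(NR_FILE_DIRTY) +
                     global_page_state(NR_UNSTABLE_NFS) > background_thresh;
      }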
      
      /**
       * zone_dirty_limit - maximum number of dirty pages allowed in a zone
       * @zone: the zone
       *
       * Returns the maximum number of dirty pages allowed in a zone, based
       * on the zone's dirtyable memory.
       */
      static unsigned long zone_dirty_limit(struct zone *zone)
      {
              unsigned long zone_memory = zone_dirtyable_memory(zone);
              struct task_struct *tsk = current;
              unsigned long dirty;

              if (vm_dirty_bytes)
                      dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
                              zone_memory / global_dirtyable_memory();
              else
                      dirty = vm_dirty_ratio * zone_memory / 100;

              if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
                      dirty += dirty / 4;
      
              return dirty;
      }
      
      /**
       * zone_dirty_ok - tells whether a zone is within its dirty limits
       * @zone: the zone to check
       *
       * Returns %true when the dirty pages in @zone are within the zone's
       * dirty limit, %false if the limit is exceeded.
       */
      bool zone_dirty_ok(struct zone *zone)
      {
              unsigned long limit = zone_dirty_limit(zone);

              return zone_page_state(zone, NR_FILE_DIRTY) +
                     zone_page_state(zone, NR_UNSTABLE_NFS) +
                     zone_page_state(zone, NR_WRITEBACK) <= limit;
      }
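
      /*
       * Example (illustrative sketch, in the spirit of how the page allocator
       * consults zone_dirty_ok() to avoid dirtying zones beyond their share):
       * the helper name is an assumption.
       */
      static struct zone *example_first_dirtyable_zone(void)
      {
              struct zone *zone;

              /* return the first populated zone still within its dirty limit */
              for_each_populated_zone(zone)
                      if (zone_dirty_ok(zone))
                              return zone;
              return NULL;
      }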
      
      int dirty_background_ratio_handler(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp,
                      loff_t *ppos)
      {
              int ret;
      
              ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
              if (ret == 0 && write)
                      dirty_background_bytes = 0;
              return ret;
      }
      
      int dirty_background_bytes_handler(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp,
                      loff_t *ppos)
      {
              int ret;
      
              ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
              if (ret == 0 && write)
                      dirty_background_ratio = 0;
              return ret;
      }
      
      int dirty_ratio_handler(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp,
                      loff_t *ppos)
      {
              int old_ratio = vm_dirty_ratio;
              int ret;
      
              ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
              if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
                      writeback_set_ratelimit();
                      vm_dirty_bytes = 0;
              }
              return ret;
      }
      
      int dirty_bytes_handler(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp,
                      loff_t *ppos)
      {
              unsigned long old_bytes = vm_dirty_bytes;
              int ret;
      
              ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
              if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
                      writeback_set_ratelimit();
                      vm_dirty_ratio = 0;
              }
              return ret;
      }
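
      /*
       * Example (illustrative sketch): how a handler such as dirty_ratio_handler
       * is typically wired into a sysctl table.  The real table lives in
       * kernel/sysctl.c; the table name here is an assumption and the
       * extra1/extra2 bounds of the real entry are omitted.
       */
      static struct ctl_table example_vm_table[] = {
              {
                      .procname       = "dirty_ratio",
                      .data           = &vm_dirty_ratio,
                      .maxlen         = sizeof(vm_dirty_ratio),
                      .mode           = 0644,
                      .proc_handler   = dirty_ratio_handler,
              },
              { }
      };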
      
      static unsigned long wp_next_time(unsigned long cur_time)
      {
              cur_time += VM_COMPLETIONS_PERIOD_LEN;
              /* 0 has a special meaning... */
              if (!cur_time)
                      return 1;
              return cur_time;
      }
      
      static void wb_domain_writeout_inc(struct wb_domain *dom,
                                         struct fprop_local_percpu *completions,
                                         unsigned int max_prop_frac)
      {
              __fprop_inc_percpu_max(&dom->completions, completions,
                                     max_prop_frac);
              /* First event after period switching was turned off? */
              if (unlikely(!dom->period_time)) {
                      /*
                       * We can race with other __bdi_writeout_inc calls here but
                       * it does not cause any harm since the resulting time when
                       * timer will fire and what is in writeout_period_time will be
                       * roughly the same.
                       */
                      dom->period_time = wp_next_time(jiffies);
                      mod_timer(&dom->period_timer, dom->period_time);
              }
      }
      
      /*
       * Increment @wb's writeout completion count and the global writeout
       * completion count. Called from test_clear_page_writeback().
       */
      static inline void __wb_writeout_inc(struct bdi_writeback *wb)
      {
              struct wb_domain *cgdom;
      
              __inc_wb_stat(wb, WB_WRITTEN);
              wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
                                     wb->bdi->max_prop_frac);
      
              cgdom = mem_cgroup_wb_domain(wb);
              if (cgdom)
                      wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
                                             wb->bdi->max_prop_frac);
      }
      
      void wb_writeout_inc(struct bdi_writeback *wb)
      {
              unsigned long flags;
      
              local_irq_save(flags);
              __wb_writeout_inc(wb);
              local_irq_restore(flags);
      }
      EXPORT_SYMBOL_GPL(wb_writeout_inc);
      
      /*
       * On an idle system we can be called long after we were scheduled because
       * we use deferred timers, so account for the missed periods.
       */
      static void writeout_period(unsigned long t)
      {
              struct wb_domain *dom = (void *)t;
              int miss_periods = (jiffies - dom->period_time) /
                                                       VM_COMPLETIONS_PERIOD_LEN;
      
              if (fprop_new_period(&dom->completions, miss_periods + 1)) {
                      dom->period_time = wp_next_time(dom->period_time +
                                      miss_periods * VM_COMPLETIONS_PERIOD_LEN);
                      mod_timer(&dom->period_timer, dom->period_time);
              } else {
                      /*
                       * Aging has zeroed all fractions. Stop wasting CPU on period
                       * updates.
                       */
                      dom->period_time = 0;
              }
      }
      
      int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
      {
              memset(dom, 0, sizeof(*dom));
      
              spin_lock_init(&dom->lock);
      
              init_timer_deferrable(&dom->period_timer);
              dom->period_timer.function = writeout_period;
              dom->period_timer.data = (unsigned long)dom;
      
              dom->dirty_limit_tstamp = jiffies;
      
              return fprop_global_init(&dom->completions, gfp);
      }
      
      #ifdef CONFIG_CGROUP_WRITEBACK
      void wb_domain_exit(struct wb_domain *dom)
      {
              del_timer_sync(&dom->period_timer);
              fprop_global_destroy(&dom->completions);
      }
      #endif
      
      /*
       * bdi_min_ratio keeps the sum of the minimum dirty shares of all
       * registered backing devices, which, for obvious reasons, can not
       * exceed 100%.
       */
      static unsigned int bdi_min_ratio;
      
      int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
      {
              int ret = 0;
      
              spin_lock_bh(&bdi_lock);
              if (min_ratio > bdi->max_ratio) {
                      ret = -EINVAL;
              } else {
                      min_ratio -= bdi->min_ratio;
                      if (bdi_min_ratio + min_ratio < 100) {
                              bdi_min_ratio += min_ratio;
                              bdi->min_ratio += min_ratio;
                      } else {
                              ret = -EINVAL;
                      }
              }
              spin_unlock_bh(&bdi_lock);
      
              return ret;
      }
      
      int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
      {
              int ret = 0;
      
              if (max_ratio > 100)
                      return -EINVAL;
      
              spin_lock_bh(&bdi_lock);
              if (bdi->min_ratio > max_ratio) {
                      ret = -EINVAL;
              } else {
                      bdi->max_ratio = max_ratio;
                      bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
              }
              spin_unlock_bh(&bdi_lock);
      
              return ret;
      }
      EXPORT_SYMBOL(bdi_set_max_ratio);
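
      /*
       * Example (illustrative sketch): bounding a device's share of the dirty
       * limit, e.g. for a slow removable disk.  The helper name and the 5%/20%
       * figures are assumptions.
       */
      static int example_bound_bdi_share(struct backing_dev_info *bdi)
      {
              int err;

              err = bdi_set_min_ratio(bdi, 5);        /* guarantee at least 5% */
              if (err)
                      return err;
              return bdi_set_max_ratio(bdi, 20);      /* but never more than 20% */
      }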
      
      static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                                 unsigned long bg_thresh)
      {
              return (thresh + bg_thresh) / 2;
      }
      
      static unsigned long hard_dirty_limit(struct wb_domain *dom,
                                            unsigned long thresh)
      {
              return max(thresh, dom->dirty_limit);
      }
      
      /*
       * Memory which can be further allocated to a memcg domain is capped by
       * system-wide clean memory excluding the amount being used in the domain.
       */
      static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
                                  unsigned long filepages, unsigned long headroom)
      {
              struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
              unsigned long clean = filepages - min(filepages, mdtc->dirty);
              unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
              unsigned long other_clean = global_clean - min(global_clean, clean);
      
              mdtc->avail = filepages + min(headroom, other_clean);
      }
      
      /**
       * __wb_calc_thresh - @wb's share of dirty throttling threshold
       * @dtc: dirty_throttle_context of interest
       *
       * Returns @wb's dirty limit in pages. The term "dirty" in the context of
       * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
       *
       * Note that balance_dirty_pages() will only seriously take it as a hard limit
       * when sleeping max_pause per page is not enough to keep the dirty pages under
       * control. For example, when the device is completely stalled due to some error
       * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
       * In other, normal situations it acts more gently by throttling the tasks
       * more (rather than completely block them) when the wb dirty pages go high.
       *
       * It allocates high/low dirty limits to fast/slow devices, in order to prevent
       * - starving fast devices
       * - piling up dirty pages (that will take long time to sync) on slow devices
       *
       * The wb's share of dirty limit will be adapting to its throughput and
       * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
       */
      static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
      {
              struct wb_domain *dom = dtc_dom(dtc);
              unsigned long thresh = dtc->thresh;
              u64 wb_thresh;
              long numerator, denominator;
              unsigned long wb_min_ratio, wb_max_ratio;
      
              /*
               * Calculate this BDI's share of the thresh ratio.
               */
              fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
                                    &numerator, &denominator);
      
              wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
              wb_thresh *= numerator;
              do_div(wb_thresh, denominator);
      
              wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
      
              wb_thresh += (thresh * wb_min_ratio) / 100;
              if (wb_thresh > (thresh * wb_max_ratio) / 100)
                      wb_thresh = thresh * wb_max_ratio / 100;
      
              return wb_thresh;
      }
      
      unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
      {
              struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
                                                     .thresh = thresh };
              return __wb_calc_thresh(&gdtc);
      }
      
      /*
       *                           setpoint - dirty 3
       *        f(dirty) := 1.0 + (----------------)
       *                           limit - setpoint
       *
       * it's a 3rd order polynomial that subjects to
       *
       * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
       * (2) f(setpoint) = 1.0 => the balance point
       * (3) f(limit)    = 0   => the hard limit
       * (4) df/dx      <= 0         => negative feedback control
       * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
       *     => fast response on large errors; small oscillation near setpoint
       */
      static long long pos_ratio_polynom(unsigned long setpoint,
                                                unsigned long dirty,
                                                unsigned long limit)
      {
              long long pos_ratio;
              long x;
      
              x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                            (limit - setpoint) | 1);
              pos_ratio = x;
              pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
              pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
              pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
      
              return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
      }
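
      /*
       * Worked check (illustrative): with RATELIMIT_CALC_SHIFT == 10 the result
       * is in fixed point where 1024 represents 1.0.  Take setpoint = 1000 and
       * limit = 2000 (so freerun = 0):
       *
       *   dirty = 1000 (setpoint): x = 0     -> pos_ratio = 1024  (1.0)
       *   dirty = 2000 (limit):    x = -1024 -> pos_ratio = 0     (0.0)
       *   dirty = 0    (freerun):  x = 1024  -> pos_ratio = 2048  (2.0)
       *
       * which matches properties (1)-(3) in the comment above.
       */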
      
      /*
       * Dirty position control.
       *
       * (o) global/bdi setpoints
       *
       * We want the dirty pages be balanced around the global/wb setpoints.
       * When the number of dirty pages is higher/lower than the setpoint, the
       * dirty position control ratio (and hence task dirty ratelimit) will be
       * decreased/increased to bring the dirty pages back to the setpoint.
       *
       *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
       *
       *     if (dirty < setpoint) scale up   pos_ratio
       *     if (dirty > setpoint) scale down pos_ratio
       *
       *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
       *     if (wb_dirty > wb_setpoint) scale down pos_ratio
       *
       *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
       *
       * (o) global control line
       *
       *     ^ pos_ratio
       *     |
       *     |            |<===== global dirty control scope ======>|
       * 2.0 .............*
       *     |            .*
       *     |            . *
       *     |            .   *
       *     |            .     *
       *     |            .        *
       *     |            .            *
       * 1.0 ................................*
       *     |            .                  .     *
       *     |            .                  .          *
       *     |            .                  .              *
       *     |            .                  .                 *
       *     |            .                  .                    *
       *   0 +------------.------------------.----------------------*------------->
       *           freerun^          setpoint^                 limit^   dirty pages
       *
       * (o) wb control line
       *
       *     ^ pos_ratio
       *     |
       *     |            *
       *     |              *
       *     |                *
       *     |                  *
       *     |                    * |<=========== span ============>|
       * 1.0 .......................*
       *     |                      . *
       *     |                      .   *
       *     |                      .     *
       *     |                      .       *
       *     |                      .         *
       *     |                      .           *
       *     |                      .             *
       *     |                      .               *
       *     |                      .                 *
       *     |                      .                   *
       *     |                      .                     *
       * 1/4 ...............................................* * * * * * * * * * * *
       *     |                      .                         .
       *     |                      .                           .
       *     |                      .                             .
       *   0 +----------------------.-------------------------------.------------->
       *                wb_setpoint^                    x_intercept^
       *
       * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
       * be smoothly throttled down to normal if it starts high in situations like
       * - start writing to a slow SD card and a fast disk at the same time. The SD
       *   card's wb_dirty may rush to many times higher than wb_setpoint.
       * - the wb dirty thresh drops quickly due to change of JBOD workload
       */
      static void wb_position_ratio(struct dirty_throttle_control *dtc)
      {
              struct bdi_writeback *wb = dtc->wb;
              unsigned long write_bw = wb->avg_write_bandwidth;
              unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
              unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
              unsigned long wb_thresh = dtc->wb_thresh;
              unsigned long x_intercept;
              unsigned long setpoint;                /* dirty pages' target balance point */
              unsigned long wb_setpoint;
              unsigned long span;
              long long pos_ratio;                /* for scaling up/down the rate limit */
              long x;
      
              dtc->pos_ratio = 0;
      
              if (unlikely(dtc->dirty >= limit))
                      return;
      
              /*
               * global setpoint
               *
               * See comment for pos_ratio_polynom().
               */
              setpoint = (freerun + limit) / 2;
              pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
      
              /*
               * The strictlimit feature is a tool preventing mistrusted filesystems
               * from growing a large number of dirty pages before throttling. For
               * such filesystems balance_dirty_pages always checks wb counters
               * against wb limits. Even if global "nr_dirty" is under "freerun".
               * This is especially important for fuse which sets bdi->max_ratio to
               * 1% by default. Without strictlimit feature, fuse writeback may
       * consume an arbitrary amount of RAM because it is accounted in
               * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
               *
               * Here, in wb_position_ratio(), we calculate pos_ratio based on
               * two values: wb_dirty and wb_thresh. Let's consider an example:
               * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
               * limits are set by default to 10% and 20% (background and throttle).
               * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
               * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
               * about ~6K pages (as the average of background and throttle wb
               * limits). The 3rd order polynomial will provide positive feedback if
               * wb_dirty is under wb_setpoint and vice versa.
               *
               * Note, that we cannot use global counters in these calculations
               * because we want to throttle process writing to a strictlimit wb
               * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
               * in the example above).
               */
              if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                      long long wb_pos_ratio;
      
                      if (dtc->wb_dirty < 8) {
                              dtc->pos_ratio = min_t(long long, pos_ratio * 2,
                                                 2 << RATELIMIT_CALC_SHIFT);
                              return;
                      }
      
                      if (dtc->wb_dirty >= wb_thresh)
                              return;
      
                      wb_setpoint = dirty_freerun_ceiling(wb_thresh,
                                                          dtc->wb_bg_thresh);
      
                      if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
                              return;
      
                      wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
                                                       wb_thresh);
      
                      /*
                       * Typically, for strictlimit case, wb_setpoint << setpoint
       * and pos_ratio >> wb_pos_ratio. In other words, the global
       * state ("dirty") is not the limiting factor and we have to
       * make the decision based on wb counters. But there is an
                       * important case when global pos_ratio should get precedence:
                       * global limits are exceeded (e.g. due to activities on other
                       * wb's) while given strictlimit wb is below limit.
                       *
                       * "pos_ratio * wb_pos_ratio" would work for the case above,
                       * but it would look too non-natural for the case of all
                       * activity in the system coming from a single strictlimit wb
                       * with bdi->max_ratio == 100%.
                       *
                       * Note that min() below somewhat changes the dynamics of the
                       * control system. Normally, pos_ratio value can be well over 3
                       * (when globally we are at freerun and wb is well below wb
                       * setpoint). Now the maximum pos_ratio in the same situation
                       * is 2. We might want to tweak this if we observe the control
                       * system is too slow to adapt.
                       */
                      dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
                      return;
              }
      
              /*
               * We have computed basic pos_ratio above based on global situation. If
               * the wb is over/under its share of dirty pages, we want to scale
               * pos_ratio further down/up. That is done by the following mechanism.
               */
      
              /*
               * wb setpoint
               *
               *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
               *
               *                        x_intercept - wb_dirty
               *                     := --------------------------
               *                        x_intercept - wb_setpoint
               *
               * The main wb control line is a linear function that subjects to
               *
               * (1) f(wb_setpoint) = 1.0
               * (2) k = - 1 / (8 * write_bw)  (in single wb case)
               *     or equally: x_intercept = wb_setpoint + 8 * write_bw
               *
               * For single wb case, the dirty pages are observed to fluctuate
               * regularly within range
               *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
       * for various filesystems, where (2) can yield a reasonable 12.5%
               * fluctuation range for pos_ratio.
               *
               * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
               * own size, so move the slope over accordingly and choose a slope that
               * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
               */
              if (unlikely(wb_thresh > dtc->thresh))
                      wb_thresh = dtc->thresh;
              /*
       * It's very possible that wb_thresh is close to 0 not because the
       * device is slow, but because it has remained inactive for a long time.
       * Honour such devices with a reasonably good (hopefully IO efficient)
       * threshold, so that occasional writes won't be blocked and active
       * writes can ramp up the threshold quickly.
               */
              wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
              /*
               * scale global setpoint to wb's:
               *        wb_setpoint = setpoint * wb_thresh / thresh
               */
              x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
              wb_setpoint = setpoint * (u64)x >> 16;
              /*
               * Use span=(8*write_bw) in single wb case as indicated by
               * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
               *
               *        wb_thresh                    thresh - wb_thresh
               * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
               *         thresh                           thresh
               */
              span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
              x_intercept = wb_setpoint + span;
      
              if (dtc->wb_dirty < x_intercept - span / 4) {
                      pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
                                            (x_intercept - wb_setpoint) | 1);
              } else
                      pos_ratio /= 4;
      
              /*
       * wb reserve area, a safeguard against dirty pool underrun and disk idle.
               * It may push the desired control point of global dirty pages higher
               * than setpoint.
               */
              x_intercept = wb_thresh / 2;
              if (dtc->wb_dirty < x_intercept) {
                      if (dtc->wb_dirty > x_intercept / 8)
                              pos_ratio = div_u64(pos_ratio * x_intercept,
                                                  dtc->wb_dirty);
                      else
                              pos_ratio *= 8;
              }
      
              dtc->pos_ratio = pos_ratio;
      }
      
      static void wb_update_write_bandwidth(struct bdi_writeback *wb,
                                            unsigned long elapsed,
                                            unsigned long written)
      {
              const unsigned long period = roundup_pow_of_two(3 * HZ);
              unsigned long avg = wb->avg_write_bandwidth;
              unsigned long old = wb->write_bandwidth;
              u64 bw;
      
              /*
               * bw = written * HZ / elapsed
               *
               *                   bw * elapsed + write_bandwidth * (period - elapsed)
               * write_bandwidth = ---------------------------------------------------
               *                                          period
               *
               * @written may have decreased due to account_page_redirty().
               * Avoid underflowing @bw calculation.
               */
              bw = written - min(written, wb->written_stamp);
              bw *= HZ;
              if (unlikely(elapsed > period)) {
                      do_div(bw, elapsed);
                      avg = bw;
                      goto out;
              }
              bw += (u64)wb->write_bandwidth * (period - elapsed);
              bw >>= ilog2(period);
      
              /*
               * one more level of smoothing, for filtering out sudden spikes
               */
              if (avg > old && old >= (unsigned long)bw)
                      avg -= (avg - old) >> 3;
      
              if (avg < old && old <= (unsigned long)bw)
                      avg += (old - avg) >> 3;
      
      out:
              /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
              avg = max(avg, 1LU);
              if (wb_has_dirty_io(wb)) {
                      long delta = avg - wb->avg_write_bandwidth;
                      WARN_ON_ONCE(atomic_long_add_return(delta,
                                              &wb->bdi->tot_write_bandwidth) <= 0);
              }
              wb->write_bandwidth = bw;
              wb->avg_write_bandwidth = avg;
      }
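
      /*
       * Worked example (illustrative, HZ == 1000 assumed): period =
       * roundup_pow_of_two(3 * HZ) = 4096 jiffies.  Suppose 1000 pages were
       * written during elapsed = 200 jiffies and the previous estimate was
       * wb->write_bandwidth = 4000 pages/s:
       *
       *   instantaneous rate = 1000 * HZ / 200           = 5000 pages/s
       *   bw = 1000 * HZ + 4000 * (4096 - 200)           = 16,584,000
       *   bw >>= ilog2(4096)                             ~= 4048 pages/s
       *
       * i.e. the estimate moves roughly elapsed/period (~5%) of the way from
       * the old value towards the instantaneous rate, before the extra "avg"
       * smoothing step above is applied.
       */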
      
      static void update_dirty_limit(struct dirty_throttle_control *dtc)
      {
              struct wb_domain *dom = dtc_dom(dtc);
              unsigned long thresh = dtc->thresh;
              unsigned long limit = dom->dirty_limit;
      
              /*
               * Follow up in one step.
               */
              if (limit < thresh) {
                      limit = thresh;
                      goto update;
              }
      
              /*
               * Follow down slowly. Use the higher one as the target, because thresh
               * may drop below dirty. This is exactly the reason to introduce
               * dom->dirty_limit which is guaranteed to lie above the dirty pages.
               */
              thresh = max(thresh, dtc->dirty);
              if (limit > thresh) {
                      limit -= (limit - thresh) >> 5;
                      goto update;
              }
              return;
      update:
              dom->dirty_limit = limit;
      }
      
      static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
                                          unsigned long now)
      {
              struct wb_domain *dom = dtc_dom(dtc);
      
              /*
       * check locklessly first to avoid taking the lock most of the time
               */
              if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                      return;
      
              spin_lock(&dom->lock);
              if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
                      update_dirty_limit(dtc);
                      dom->dirty_limit_tstamp = now;
              }
              spin_unlock(&dom->lock);
      }
      
      /*
       * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
       *
       * Normal wb tasks will be curbed at or below it in long term.
       * Obviously it should be around (write_bw / N) when there are N dd tasks.
       */
      static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
                                            unsigned long dirtied,
                                            unsigned long elapsed)
      {
              struct bdi_writeback *wb = dtc->wb;
              unsigned long dirty = dtc->dirty;
              unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
              unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
              unsigned long setpoint = (freerun + limit) / 2;
              unsigned long write_bw = wb->avg_write_bandwidth;
              unsigned long dirty_ratelimit = wb->dirty_ratelimit;
              unsigned long dirty_rate;
              unsigned long task_ratelimit;
              unsigned long balanced_dirty_ratelimit;
              unsigned long step;
              unsigned long x;
              unsigned long shift;
      
              /*
               * The dirty rate will match the writeout rate in long term, except
               * when dirty pages are truncated by userspace or re-dirtied by FS.
               */
              dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
      
              /*
               * task_ratelimit reflects each dd's dirty rate for the past 200ms.
               */
              task_ratelimit = (u64)dirty_ratelimit *
                                              dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
              task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
      
              /*
               * A linear estimation of the "balanced" throttle rate. The theory is,
               * if there are N dd tasks, each throttled at task_ratelimit, the wb's
               * dirty_rate will be measured to be (N * task_ratelimit). So the below
               * formula will yield the balanced rate limit (write_bw / N).
               *
               * Note that the expanded form is not a pure rate feedback:
               *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate)                     (1)
               * but also takes pos_ratio into account:
               *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
               *
               * (1) is not realistic because pos_ratio also takes part in balancing
               * the dirty rate.  Consider the state
               *        pos_ratio = 0.5                                                     (3)
               *        rate = 2 * (write_bw / N)                                     (4)
       * If (1) is used, it will get stuck in that state! Because each dd will
               * be throttled at
               *        task_ratelimit = pos_ratio * rate = (write_bw / N)             (5)
               * yielding
               *        dirty_rate = N * task_ratelimit = write_bw                     (6)
               * put (6) into (1) we get
               *        rate_(i+1) = rate_(i)                                             (7)
               *
               * So we end up using (2) to always keep
               *        rate_(i+1) ~= (write_bw / N)                                     (8)
               * regardless of the value of pos_ratio. As long as (8) is satisfied,
               * pos_ratio is able to drive itself to 1.0, which is not only where
       * the dirty count meets the setpoint, but also where the slope of
       * pos_ratio is flattest and hence task_ratelimit fluctuates the least.
               */
              balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
                                                 dirty_rate | 1);
              /*
               * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
               */
              if (unlikely(balanced_dirty_ratelimit > write_bw))
                      balanced_dirty_ratelimit = write_bw;
      
              /*
               * We could safely do this and return immediately:
               *
               *        wb->dirty_ratelimit = balanced_dirty_ratelimit;
               *
               * However to get a more stable dirty_ratelimit, the below elaborated
               * code makes use of task_ratelimit to filter out singular points and
               * limit the step size.
               *
               * The below code essentially only uses the relative value of
               *
               *        task_ratelimit - dirty_ratelimit
               *        = (pos_ratio - 1) * dirty_ratelimit
               *
               * which reflects the direction and size of dirty position error.
               */
      
              /*
               * dirty_ratelimit will follow balanced_dirty_ratelimit iff
               * task_ratelimit is on the same side of dirty_ratelimit, too.
               * For example, when
               * - dirty_ratelimit > balanced_dirty_ratelimit
               * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
               * lowering dirty_ratelimit will help meet both the position and rate
               * control targets. Otherwise, don't update dirty_ratelimit if it will
                * only help meet the rate target. After all, what users ultimately
                * feel and care about are a stable dirty rate and a small position error.
               *
                * |task_ratelimit - dirty_ratelimit| is used to limit the step size
                * and filter out the singular points of balanced_dirty_ratelimit,
                * which keeps jumping around randomly and can even leap far away at
                * times due to the small 200ms estimation period of dirty_rate (we
                * want to keep that period small to reduce time lags).
               */
              step = 0;
      
              /*
               * For strictlimit case, calculations above were based on wb counters
               * and limits (starting from pos_ratio = wb_position_ratio() and up to
               * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
               * Hence, to calculate "step" properly, we have to use wb_dirty as
               * "dirty" and wb_setpoint as "setpoint".
               *
                * We ramp up dirty_ratelimit forcibly if wb_dirty is low because
               * it's possible that wb_thresh is close to zero due to inactivity
               * of backing device.
               */
              if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                      dirty = dtc->wb_dirty;
                      if (dtc->wb_dirty < 8)
                              setpoint = dtc->wb_dirty + 1;
                      else
                              setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
              }
      
              if (dirty < setpoint) {
                      x = min3(wb->balanced_dirty_ratelimit,
                               balanced_dirty_ratelimit, task_ratelimit);
                      if (dirty_ratelimit < x)
                              step = x - dirty_ratelimit;
              } else {
                      x = max3(wb->balanced_dirty_ratelimit,
                               balanced_dirty_ratelimit, task_ratelimit);
                      if (dirty_ratelimit > x)
                              step = dirty_ratelimit - x;
              }
      
              /*
               * Don't pursue 100% rate matching. It's impossible since the balanced
               * rate itself is constantly fluctuating. So decrease the track speed
               * when it gets close to the target. Helps eliminate pointless tremors.
               */
              shift = dirty_ratelimit / (2 * step + 1);
              if (shift < BITS_PER_LONG)
                      step = DIV_ROUND_UP(step >> shift, 8);
              else
                      step = 0;
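               /*
                * For illustration (made-up numbers): with dirty_ratelimit =
                * 10000 and a raw step of 2000, shift = 10000 / 4001 = 2, so
                * only DIV_ROUND_UP(2000 >> 2, 8) = 63 is applied this round.
                * For a tiny raw step of 10, shift = 10000 / 21 = 476 exceeds
                * BITS_PER_LONG, so the step collapses to 0 and the rate is
                * left untouched.
                */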
      
              if (dirty_ratelimit < balanced_dirty_ratelimit)
                      dirty_ratelimit += step;
              else
                      dirty_ratelimit -= step;
      
              wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
              wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
      
              trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
      }
      
      static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                        struct dirty_throttle_control *mdtc,
                                        unsigned long start_time,
                                        bool update_ratelimit)
      {
              struct bdi_writeback *wb = gdtc->wb;
              unsigned long now = jiffies;
              unsigned long elapsed = now - wb->bw_time_stamp;
              unsigned long dirtied;
              unsigned long written;
      
              lockdep_assert_held(&wb->list_lock);
      
              /*
               * rate-limit, only update once every 200ms.
               */
              if (elapsed < BANDWIDTH_INTERVAL)
                      return;
      
              dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
              written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
      
              /*
               * Skip quiet periods when disk bandwidth is under-utilized.
               * (at least 1s idle time between two flusher runs)
               */
              if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
                      goto snapshot;
      
              if (update_ratelimit) {
                      domain_update_bandwidth(gdtc, now);
                      wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
      
                      /*
                       * @mdtc is always NULL if !CGROUP_WRITEBACK but the
                       * compiler has no way to figure that out.  Help it.
                       */
                      if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
                              domain_update_bandwidth(mdtc, now);
                              wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
                      }
              }
              wb_update_write_bandwidth(wb, elapsed, written);
      
      snapshot:
              wb->dirtied_stamp = dirtied;
              wb->written_stamp = written;
              wb->bw_time_stamp = now;
      }
      
      void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
      {
              struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
      
              __wb_update_bandwidth(&gdtc, NULL, start_time, false);
      }
      
      /*
       * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
       * will look to see if it needs to start dirty throttling.
       *
       * If dirty_poll_interval is too low, big NUMA machines will call the expensive
       * global_page_state() too often. So scale it near-sqrt to the safety margin
       * (the number of pages we may dirty without exceeding the dirty limits).
       */
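       /*
        * For example (made-up numbers): with a safety margin of
        * thresh - dirty = 1000 pages, ilog2(1000) = 9, so the interval is
        * 1 << (9 >> 1) = 16 pages -- the same order of magnitude as
        * sqrt(1000) ~= 31, at a fraction of the cost of a real square root.
        */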
  398 static unsigned long dirty_poll_interval(unsigned long dirty,
                                               unsigned long thresh)
      {
  398         if (thresh > dirty)
  398                 return 1UL << (ilog2(thresh - dirty) >> 1);
      
              return 1;
      }
      
      static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                        unsigned long wb_dirty)
      {
              unsigned long bw = wb->avg_write_bandwidth;
              unsigned long t;
      
              /*
               * Limit pause time for small memory systems. If sleeping for too long
               * time, a small pool of dirty/writeback pages may go empty and disk go
               * idle.
               *
               * 8 serves as the safety ratio.
               */
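               /*
                * For example (made-up numbers, assuming HZ = 1000 and 4KiB
                * pages): roundup_pow_of_two(1 + HZ / 8) = 128.  With
                * avg_write_bandwidth = 25600 pages/s (~100MB/s) and
                * wb_dirty = 2048 pages, t = 2048 / (1 + 25600 / 128) + 1 = 11
                * jiffies, i.e. ~11ms, during which the device cleans only
                * ~280 of the 2048 dirty pages, so the pool cannot drain dry
                * while we sleep.
                */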
              t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
              t++;
      
              return min_t(unsigned long, t, MAX_PAUSE);
      }
      
      static long wb_min_pause(struct bdi_writeback *wb,
                               long max_pause,
                               unsigned long task_ratelimit,
                               unsigned long dirty_ratelimit,
                               int *nr_dirtied_pause)
      {
              long hi = ilog2(wb->avg_write_bandwidth);
              long lo = ilog2(wb->dirty_ratelimit);
              long t;                /* target pause */
              long pause;        /* estimated next pause */
              int pages;        /* target nr_dirtied_pause */
      
              /* target for 10ms pause on 1-dd case */
              t = max(1, HZ / 100);
      
              /*
               * Scale up pause time for concurrent dirtiers in order to reduce CPU
               * overheads.
               *
               * (N * 10ms) on 2^N concurrent tasks.
               */
              if (hi > lo)
                      t += (hi - lo) * (10 * HZ) / 1024;
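               /*
                * For example (made-up numbers, HZ = 1000): with 8 dd tasks
                * sharing the device, dirty_ratelimit is roughly write_bw / 8,
                * so hi - lo ~= ilog2(8) = 3 and the line above adds about
                * 3 * 10ms = 30ms on top of the 10ms base.
                */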
      
              /*
               * This is a bit convoluted. We try to base the next nr_dirtied_pause
               * on the much more stable dirty_ratelimit. However the next pause time
               * will be computed based on task_ratelimit and the two rate limits may
               * depart considerably at some time. Especially if task_ratelimit goes
               * below dirty_ratelimit/2 and the target pause is max_pause, the next
               * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
               * result task_ratelimit won't be executed faithfully, which could
               * eventually bring down dirty_ratelimit.
               *
               * We apply two rules to fix it up:
               * 1) try to estimate the next pause time and if necessary, use a lower
               *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
               *    nr_dirtied_pause will be "dancing" with task_ratelimit.
               * 2) limit the target pause time to max_pause/2, so that the normal
               *    small fluctuations of task_ratelimit won't trigger rule (1) and
               *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
               */
              t = min(t, 1 + max_pause / 2);
              pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
      
              /*
               * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
               * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
               * When the 16 consecutive reads are often interrupted by some dirty
                * throttling pause during the async writes, cfq will go into idling
                * (deadline is fine). So push nr_dirtied_pause as high as possible
                * until it reaches DIRTY_POLL_THRESH=32 pages.
               */
              if (pages < DIRTY_POLL_THRESH) {
                      t = max_pause;
                      pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
                      if (pages > DIRTY_POLL_THRESH) {
                              pages = DIRTY_POLL_THRESH;
                              t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
                      }
              }
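               /*
                * For example (made-up numbers, HZ = 1000): with dirty_ratelimit
                * = 1600 pages/s and an initial t = 10 jiffies, pages =
                * 1600 * 10 / 1024 = 15 < DIRTY_POLL_THRESH.  Retrying with
                * t = max_pause = 200 jiffies gives 312 pages, which overshoots,
                * so we settle on pages = 32 and t = 1000 * 32 / 1600 = 20
                * jiffies (~20ms).
                */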
      
              pause = HZ * pages / (task_ratelimit + 1);
              if (pause > max_pause) {
                      t = max_pause;
                      pages = task_ratelimit * t / roundup_pow_of_two(HZ);
              }
      
              *nr_dirtied_pause = pages;
              /*
               * The minimal pause time will normally be half the target pause time.
               */
              return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
      }
      
      static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
      {
              struct bdi_writeback *wb = dtc->wb;
              unsigned long wb_reclaimable;
      
              /*
                * wb_thresh is not treated as a hard limiting factor the way
                * dirty_thresh is, for two reasons:
               * - in JBOD setup, wb_thresh can fluctuate a lot
               * - in a system with HDD and USB key, the USB key may somehow
               *   go into state (wb_dirty >> wb_thresh) either because
               *   wb_dirty starts high, or because wb_thresh drops low.
               *   In this case we don't want to hard throttle the USB key
               *   dirtiers for 100 seconds until wb_dirty drops under
               *   wb_thresh. Instead the auxiliary wb control line in
               *   wb_position_ratio() will let the dirtier task progress
               *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
               */
              dtc->wb_thresh = __wb_calc_thresh(dtc);
              dtc->wb_bg_thresh = dtc->thresh ?
                      div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
      
              /*
               * In order to avoid the stacked BDI deadlock we need
               * to ensure we accurately count the 'dirty' pages when
               * the threshold is low.
               *
               * Otherwise it would be possible to get thresh+n pages
               * reported dirty, even though there are thresh-m pages
               * actually dirty; with m+n sitting in the percpu
               * deltas.
               */
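               /*
                * Rough illustration (made-up numbers): with per-CPU batching,
                * each CPU may hold a batch of not-yet-folded-in updates, so
                * the cheap wb_stat() reads can be off by several hundred pages
                * on a many-CPU machine.  That error would dwarf a wb_thresh of
                * a few pages, hence the exact (but slower) wb_stat_sum() here.
                */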
              if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
                      wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
                      dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
              } else {
                      wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
                      dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
              }
      }
      
      /*
       * balance_dirty_pages() must be called by processes which are generating dirty
       * data.  It looks at the number of dirty pages in the machine and will force
        * the caller to wait once it crosses (background_thresh + dirty_thresh) / 2.
       * If we're over `background_thresh' then the writeback threads are woken to
       * perform some writeout.
       */
      static void balance_dirty_pages(struct address_space *mapping,
                                      struct bdi_writeback *wb,
                                      unsigned long pages_dirtied)
      {
              struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
              struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
              struct dirty_throttle_control * const gdtc = &gdtc_stor;
              struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                           &mdtc_stor : NULL;
              struct dirty_throttle_control *sdtc;
              unsigned long nr_reclaimable;        /* = file_dirty + unstable_nfs */
              long period;
              long pause;
              long max_pause;
              long min_pause;
              int nr_dirtied_pause;
              bool dirty_exceeded = false;
              unsigned long task_ratelimit;
              unsigned long dirty_ratelimit;
              struct backing_dev_info *bdi = wb->bdi;
              bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
              unsigned long start_time = jiffies;
      
              for (;;) {
  398                 unsigned long now = jiffies;
                      unsigned long dirty, thresh, bg_thresh;
                      unsigned long m_dirty = 0;        /* stop bogus uninit warnings */
                      unsigned long m_thresh = 0;
                      unsigned long m_bg_thresh = 0;
      
                      /*
                       * Unstable writes are a feature of certain networked
                        * filesystems (e.g. NFS) in which data may have been
                       * written to the server's write cache, but has not yet
                       * been flushed to permanent storage.
                       */
                      nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                              global_page_state(NR_UNSTABLE_NFS);
                      gdtc->avail = global_dirtyable_memory();
                      gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
      
                      domain_dirty_limits(gdtc);
      
                      if (unlikely(strictlimit)) {
                              wb_dirty_limits(gdtc);
      
                              dirty = gdtc->wb_dirty;
                              thresh = gdtc->wb_thresh;
                              bg_thresh = gdtc->wb_bg_thresh;
                      } else {
  398                         dirty = gdtc->dirty;
                              thresh = gdtc->thresh;
                              bg_thresh = gdtc->bg_thresh;
                      }
      
                      if (mdtc) {
                              unsigned long filepages, headroom, writeback;
      
                              /*
                               * If @wb belongs to !root memcg, repeat the same
                               * basic calculations for the memcg domain.
                               */
                              mem_cgroup_wb_stats(wb, &filepages, &headroom,
                                                  &mdtc->dirty, &writeback);
                              mdtc->dirty += writeback;
                              mdtc_calc_avail(mdtc, filepages, headroom);
      
                              domain_dirty_limits(mdtc);
      
                              if (unlikely(strictlimit)) {
                                      wb_dirty_limits(mdtc);
                                      m_dirty = mdtc->wb_dirty;
                                      m_thresh = mdtc->wb_thresh;
                                      m_bg_thresh = mdtc->wb_bg_thresh;
                              } else {
                                      m_dirty = mdtc->dirty;
                                      m_thresh = mdtc->thresh;
                                      m_bg_thresh = mdtc->bg_thresh;
                              }
                      }
      
                      /*
                        * Throttle it only when the background writeback cannot
                        * catch up. This avoids (excessively) small writeouts
                        * when the wb limits are ramping up in the !strictlimit case.
                       *
                       * In strictlimit case make decision based on the wb counters
                       * and limits. Small writeouts when the wb limits are ramping
                       * up are the price we consciously pay for strictlimit-ing.
                       *
                       * If memcg domain is in effect, @dirty should be under
                       * both global and memcg freerun ceilings.
                       */
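                       /*
                        * For illustration (made-up numbers): with thresh = 20000
                        * and bg_thresh = 10000 pages, dirty_freerun_ceiling() is
                        * their midpoint, 15000.  A task at dirty = 14000 runs
                        * free and gets a poll interval of
                        * 1 << (ilog2(20000 - 14000) >> 1) = 64 pages before it
                        * must come back here.
                        */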
  398                 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
                          (!mdtc ||
                           m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
  398                         unsigned long intv = dirty_poll_interval(dirty, thresh);
                              unsigned long m_intv = ULONG_MAX;
      
  398                         current->dirty_paused_when = now;
                              current->nr_dirtied = 0;
                              if (mdtc)
                                      m_intv = dirty_poll_interval(m_dirty, m_thresh);
                              current->nr_dirtied_pause = min(intv, m_intv);
                              break;
                      }
      
                      if (unlikely(!writeback_in_progress(wb)))
                              wb_start_background_writeback(wb);
      
                      /*
                       * Calculate global domain's pos_ratio and select the
                       * global dtc by default.
                       */
                      if (!strictlimit)
                              wb_dirty_limits(gdtc);
      
                      dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
                              ((gdtc->dirty > gdtc->thresh) || strictlimit);
      
                      wb_position_ratio(gdtc);
                      sdtc = gdtc;
      
                      if (mdtc) {
                              /*
                               * If memcg domain is in effect, calculate its
                               * pos_ratio.  @wb should satisfy constraints from
                               * both global and memcg domains.  Choose the one
                               * w/ lower pos_ratio.
                               */
                              if (!strictlimit)
                                      wb_dirty_limits(mdtc);
      
                              dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
                                      ((mdtc->dirty > mdtc->thresh) || strictlimit);
      
                              wb_position_ratio(mdtc);
                              if (mdtc->pos_ratio < gdtc->pos_ratio)
                                      sdtc = mdtc;
                      }
      
                      if (dirty_exceeded && !wb->dirty_exceeded)
                              wb->dirty_exceeded = 1;
      
                      if (time_is_before_jiffies(wb->bw_time_stamp +
                                                 BANDWIDTH_INTERVAL)) {
                              spin_lock(&wb->list_lock);
                              __wb_update_bandwidth(gdtc, mdtc, start_time, true);
                              spin_unlock(&wb->list_lock);
                      }
      
                      /* throttle according to the chosen dtc */
                      dirty_ratelimit = wb->dirty_ratelimit;
                      task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                              RATELIMIT_CALC_SHIFT;
                      max_pause = wb_max_pause(wb, sdtc->wb_dirty);
                      min_pause = wb_min_pause(wb, max_pause,
                                               task_ratelimit, dirty_ratelimit,
                                               &nr_dirtied_pause);
      
                      if (unlikely(task_ratelimit == 0)) {
                              period = max_pause;
                              pause = max_pause;
                              goto pause;
                      }
                      period = HZ * pages_dirtied / task_ratelimit;
                      pause = period;
                      if (current->dirty_paused_when)
                              pause -= now - current->dirty_paused_when;
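                       /*
                        * For example (made-up numbers, HZ = 1000): having
                        * dirtied pages_dirtied = 32 pages at task_ratelimit =
                        * 3200 pages/s, period = 1000 * 32 / 3200 = 10 jiffies.
                        * If the task already spent 8ms "thinking" since its
                        * last pause, only 2ms of sleep remain; a longer think
                        * time makes pause negative and is handled just below.
                        */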
                      /*
                        * For less than 1s think time (ext3/4 may block the dirtier
                        * for up to 800ms from time to time on a single HDD; so does
                        * xfs, though much less frequently), try to compensate for it
                        * in future periods by updating the virtual time; otherwise
                        * just do a reset, as it may be a light dirtier.
                       */
                      if (pause < min_pause) {
                              trace_balance_dirty_pages(wb,
                                                        sdtc->thresh,
                                                        sdtc->bg_thresh,
                                                        sdtc->dirty,
                                                        sdtc->wb_thresh,
                                                        sdtc->wb_dirty,
                                                        dirty_ratelimit,
                                                        task_ratelimit,
                                                        pages_dirtied,
                                                        period,
                                                        min(pause, 0L),
                                                        start_time);
                              if (pause < -HZ) {
                                      current->dirty_paused_when = now;
                                      current->nr_dirtied = 0;
                              } else if (period) {
                                      current->dirty_paused_when += period;
                                      current->nr_dirtied = 0;
                              } else if (current->nr_dirtied_pause <= pages_dirtied)
                                      current->nr_dirtied_pause += pages_dirtied;
                              break;
                      }
                      if (unlikely(pause > max_pause)) {
                              /* for occasional dropped task_ratelimit */
                              now += min(pause - max_pause, max_pause);
                              pause = max_pause;
                      }
      
      pause:
                      trace_balance_dirty_pages(wb,
                                                sdtc->thresh,
                                                sdtc->bg_thresh,
                                                sdtc->dirty,
                                                sdtc->wb_thresh,
                                                sdtc->wb_dirty,
                                                dirty_ratelimit,
                                                task_ratelimit,
                                                pages_dirtied,
                                                period,
                                                pause,
                                                start_time);
                      __set_current_state(TASK_KILLABLE);
                      io_schedule_timeout(pause);
      
                      current->dirty_paused_when = now + pause;
                      current->nr_dirtied = 0;
                      current->nr_dirtied_pause = nr_dirtied_pause;
      
                      /*
                       * This is typically equal to (dirty < thresh) and can also
                       * keep "1000+ dd on a slow USB stick" under control.
                       */
                      if (task_ratelimit)
                              break;
      
                      /*
                        * In the case of an unresponsive NFS server whose dirty
                        * pages exceed dirty_thresh, give the other good wb's a pipe
                       * to go through, so that tasks on them still remain responsive.
                       *
                        * In theory 1 page is enough to keep the consumer-producer
                       * pipe going: the flusher cleans 1 page => the task dirties 1
                       * more page. However wb_dirty has accounting errors.  So use
                       * the larger and more IO friendly wb_stat_error.
                       */
                      if (sdtc->wb_dirty <= wb_stat_error(wb))
                              break;
      
                      if (fatal_signal_pending(current))
                              break;
              }
      
  398         if (!dirty_exceeded && wb->dirty_exceeded)
                      wb->dirty_exceeded = 0;
      
  398         if (writeback_in_progress(wb))
                      return;
      
              /*
               * In laptop mode, we wait until hitting the higher threshold before
               * starting background writeout, and then write out all the way down
               * to the lower threshold.  So slow writers cause minimal disk activity.
               *
               * In normal mode, we start background writeout at the lower
               * background_thresh, to keep the amount of dirty memory low.
               */
  391         if (laptop_mode)
                      return;
      
  391         if (nr_reclaimable > gdtc->bg_thresh)
  398                 wb_start_background_writeback(wb);
      }
      
      static DEFINE_PER_CPU(int, bdp_ratelimits);
      
      /*
       * Normal tasks are throttled by
       *        loop {
       *                dirty tsk->nr_dirtied_pause pages;
        *                take a nap in balance_dirty_pages();
       *        }
        * However there is a worst case. If every task exits immediately after dirtying
       * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
       * called to throttle the page dirties. The solution is to save the not yet
       * throttled page dirties in dirty_throttle_leaks on task exit and charge them
       * randomly into the running tasks. This works well for the above worst case,
       * as the new task will pick up and accumulate the old task's leaked dirty
       * count and eventually get throttled.
       */
      DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
      
      /**
       * balance_dirty_pages_ratelimited - balance dirty memory state
       * @mapping: address_space which was dirtied
       *
       * Processes which are dirtying memory should call in here once for each page
       * which was newly dirtied.  The function will periodically check the system's
       * dirty state and will initiate writeback if needed.
       *
       * On really big machines, get_writeback_state is expensive, so try to avoid
       * calling it too often (ratelimiting).  But once we're over the dirty memory
       * limit we decrease the ratelimiting by a lot, to prevent individual processes
       * from overshooting the limit by (ratelimit_pages) each.
       */
      void balance_dirty_pages_ratelimited(struct address_space *mapping)
      {
  861         struct inode *inode = mapping->host;
  861         struct backing_dev_info *bdi = inode_to_bdi(inode);
              struct bdi_writeback *wb = NULL;
              int ratelimit;
              int *p;
      
  861         if (!bdi_cap_account_dirty(bdi))
                      return;
      
              if (inode_cgwb_enabled(inode))
                      wb = wb_get_create_current(bdi, GFP_KERNEL);
              if (!wb)
  398                 wb = &bdi->wb;
      
  479         ratelimit = current->nr_dirtied_pause;
              if (wb->dirty_exceeded)
                      ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
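               /*
                * With 4KiB pages (PAGE_SHIFT == 12) the clamp above is
                * 32 >> 2 = 8 pages, i.e. re-enter balance_dirty_pages() after
                * at most 32KiB of newly dirtied pages once the wb has exceeded
                * its dirty limits.
                */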
      
  479         preempt_disable();
              /*
                * This prevents one CPU from accumulating too many dirtied pages
                * without calling into balance_dirty_pages(), which can happen when
                * there are 1000+ tasks that all start dirtying pages at exactly the
                * same time and hence all honour a too-large initial
                * task->nr_dirtied_pause.
               */
              p =  this_cpu_ptr(&bdp_ratelimits);
              if (unlikely(current->nr_dirtied >= ratelimit))
  394                 *p = 0;
  479         else if (unlikely(*p >= ratelimit_pages)) {
                      *p = 0;
                      ratelimit = 0;
              }
              /*
               * Pick up the dirtied pages by the exited tasks. This avoids lots of
                * short-lived tasks (e.g. gcc invocations in a kernel build) escaping
                * the dirty throttling and livelocking other long-running dirtiers.
               */
  479         p = this_cpu_ptr(&dirty_throttle_leaks);
  109         if (*p > 0 && current->nr_dirtied < ratelimit) {
                      unsigned long nr_pages_dirtied;
  109                 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
                      *p -= nr_pages_dirtied;
                      current->nr_dirtied += nr_pages_dirtied;
              }
  479         preempt_enable();
      
  861         if (unlikely(current->nr_dirtied >= ratelimit))
  398                 balance_dirty_pages(mapping, wb, current->nr_dirtied);
      
              wb_put(wb);
      }
      EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
      
      /**
       * wb_over_bg_thresh - does @wb need to be written back?
       * @wb: bdi_writeback of interest
       *
       * Determines whether background writeback should keep writing @wb or it's
       * clean enough.  Returns %true if writeback should continue.
       */
      bool wb_over_bg_thresh(struct bdi_writeback *wb)
      {
              struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
              struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
              struct dirty_throttle_control * const gdtc = &gdtc_stor;
              struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                           &mdtc_stor : NULL;
      
              /*
               * Similar to balance_dirty_pages() but ignores pages being written
               * as we're trying to decide whether to put more under writeback.
               */
              gdtc->avail = global_dirtyable_memory();
              gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
                            global_page_state(NR_UNSTABLE_NFS);
              domain_dirty_limits(gdtc);
      
              if (gdtc->dirty > gdtc->bg_thresh)
                      return true;
      
              if (wb_stat(wb, WB_RECLAIMABLE) >
                  wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
                      return true;
      
              if (mdtc) {
                      unsigned long filepages, headroom, writeback;
      
                      mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
                                          &writeback);
                      mdtc_calc_avail(mdtc, filepages, headroom);
                      domain_dirty_limits(mdtc);        /* ditto, ignore writeback */
      
                      if (mdtc->dirty > mdtc->bg_thresh)
                              return true;
      
                      if (wb_stat(wb, WB_RECLAIMABLE) >
                          wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
                              return true;
              }
      
              return false;
      }
      
      void throttle_vm_writeout(gfp_t gfp_mask)
      {
              unsigned long background_thresh;
              unsigned long dirty_thresh;
      
              for ( ; ; ) {
    2                 global_dirty_limits(&background_thresh, &dirty_thresh);
                      dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
      
                      /*
                       * Boost the allowable dirty threshold a bit for page
                       * allocators so they don't get DoS'ed by heavy writers
                       */
                      dirty_thresh += dirty_thresh / 10;      /* wheeee... */
      
    2                 if (global_page_state(NR_UNSTABLE_NFS) +
                              global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                      break;
                      congestion_wait(BLK_RW_ASYNC, HZ/10);
      
                      /*
                       * The caller might hold locks which can prevent IO completion
                       * or progress in the filesystem.  So we cannot just sit here
                       * waiting for IO to complete.
                       */
                      if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
                              break;
              }
      }
      
      /*
       * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
       */
      int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
              void __user *buffer, size_t *length, loff_t *ppos)
      {
              proc_dointvec(table, write, buffer, length, ppos);
              return 0;
      }
      
      #ifdef CONFIG_BLOCK
      void laptop_mode_timer_fn(unsigned long data)
      {
              struct request_queue *q = (struct request_queue *)data;
              int nr_pages = global_page_state(NR_FILE_DIRTY) +
                      global_page_state(NR_UNSTABLE_NFS);
              struct bdi_writeback *wb;
      
              /*
               * We want to write everything out, not just down to the dirty
               * threshold
               */
              if (!bdi_has_dirty_io(&q->backing_dev_info))
                      return;
      
              rcu_read_lock();
              list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
                      if (wb_has_dirty_io(wb))
                              wb_start_writeback(wb, nr_pages, true,
                                                 WB_REASON_LAPTOP_TIMER);
              rcu_read_unlock();
      }
      
      /*
       * We've spun up the disk and we're in laptop mode: schedule writeback
       * of all dirty data a few seconds from now.  If the flush is already scheduled
       * then push it back - the user is still using the disk.
       */
      void laptop_io_completion(struct backing_dev_info *info)
      {
              mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
      }
      
      /*
       * We're in laptop mode and we've just synced. The sync's writes will have
       * caused another writeback to be scheduled by laptop_io_completion.
       * Nothing needs to be written back anymore, so we unschedule the writeback.
       */
      void laptop_sync_completion(void)
      {
              struct backing_dev_info *bdi;
      
              rcu_read_lock();
      
              list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                      del_timer(&bdi->laptop_mode_wb_timer);
      
              rcu_read_unlock();
      }
      #endif
      
      /*
       * If ratelimit_pages is too high then we can get into dirty-data overload
       * if a large number of processes all perform writes at the same time.
       * If it is too low then SMP machines will call the (expensive)
       * get_writeback_state too often.
       *
       * Here we set ratelimit_pages to a level which ensures that when all CPUs are
       * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
       * thresholds.
       */
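       /*
        * For example (made-up numbers): with dirty_thresh = 102400 pages
        * (400MB of 4KiB pages) and 8 online CPUs, ratelimit_pages =
        * 102400 / (8 * 32) = 400.  Even if every CPU dirties a full
        * ratelimit_pages before rechecking, the combined overshoot is
        * 8 * 400 = 3200 pages, i.e. 1/32 (~3%) of dirty_thresh.
        */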
      
      void writeback_set_ratelimit(void)
      {
              struct wb_domain *dom = &global_wb_domain;
              unsigned long background_thresh;
              unsigned long dirty_thresh;
      
              global_dirty_limits(&background_thresh, &dirty_thresh);
              dom->dirty_limit = dirty_thresh;
              ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
              if (ratelimit_pages < 16)
                      ratelimit_pages = 16;
      }
      
      static int
      ratelimit_handler(struct notifier_block *self, unsigned long action,
                        void *hcpu)
      {
      
              switch (action & ~CPU_TASKS_FROZEN) {
              case CPU_ONLINE:
              case CPU_DEAD:
                      writeback_set_ratelimit();
                      return NOTIFY_OK;
              default:
                      return NOTIFY_DONE;
              }
      }
      
      static struct notifier_block ratelimit_nb = {
              .notifier_call        = ratelimit_handler,
              .next                = NULL,
      };
      
      /*
       * Called early on to tune the page writeback dirty limits.
       *
       * We used to scale dirty pages according to how total memory
       * related to pages that could be allocated for buffers (by
        * comparing nr_free_buffer_pages() to vm_total_pages).
       *
       * However, that was when we used "dirty_ratio" to scale with
       * all memory, and we don't do that any more. "dirty_ratio"
        * is now applied to total non-HIGHMEM memory (by subtracting
       * totalhigh_pages from vm_total_pages), and as such we can't
       * get into the old insane situation any more where we had
       * large amounts of dirty pages compared to a small amount of
       * non-HIGHMEM memory.
       *
       * But we might still want to scale the dirty_ratio by how
        * much memory the box has.
       */
      void __init page_writeback_init(void)
      {
              BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
      
              writeback_set_ratelimit();
              register_cpu_notifier(&ratelimit_nb);
      }
      
      /**
       * tag_pages_for_writeback - tag pages to be written by write_cache_pages
       * @mapping: address space structure to write
       * @start: starting page index
       * @end: ending page index (inclusive)
       *
       * This function scans the page range from @start to @end (inclusive) and tags
       * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
       * that write_cache_pages (or whoever calls this function) will then use
       * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
       * used to avoid livelocking of writeback by a process steadily creating new
       * dirty pages in the file (thus it is important for this function to be quick
       * so that it can tag pages faster than a dirtying process can create them).
       */
      /*
       * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
       */
      void tag_pages_for_writeback(struct address_space *mapping,
  403                              pgoff_t start, pgoff_t end)
      {
      #define WRITEBACK_TAG_BATCH 4096
              unsigned long tagged;
      
              do {
  403                 spin_lock_irq(&mapping->tree_lock);
                      tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
                                      &start, end, WRITEBACK_TAG_BATCH,
                                      PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
                      spin_unlock_irq(&mapping->tree_lock);
                      WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
  403                 cond_resched();
                      /* We check 'start' to handle wrapping when end == ~0UL */
    1         } while (tagged >= WRITEBACK_TAG_BATCH && start);
  403 }
      EXPORT_SYMBOL(tag_pages_for_writeback);
      
      /**
       * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
       * @mapping: address space structure to write
       * @wbc: subtract the number of written pages from *@wbc->nr_to_write
       * @writepage: function called for each page
       * @data: data passed to writepage function
       *
       * If a page is already under I/O, write_cache_pages() skips it, even
       * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
       * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
       * and msync() need to guarantee that all the data which was dirty at the time
       * the call was made get new I/O started against them.  If wbc->sync_mode is
       * WB_SYNC_ALL then we were called for data integrity and we must wait for
       * existing IO to complete.
       *
       * To avoid livelocks (when other process dirties new pages), we first tag
       * pages which should be written back with TOWRITE tag and only then start
       * writing them. For data-integrity sync we have to be careful so that we do
       * not miss some pages (e.g., because some other process has cleared TOWRITE
       * tag we set). The rule we follow is that TOWRITE tag can be cleared only
       * by the process clearing the DIRTY tag (and submitting the page for IO).
       *
       * To avoid deadlocks between range_cyclic writeback and callers that hold
       * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
       * we do not loop back to the start of the file. Doing so causes a page
       * lock/page writeback access order inversion - we should only ever lock
       * multiple pages in ascending page->index order, and looping back to the start
       * of the file violates that rule and causes deadlocks.
       */
      int write_cache_pages(struct address_space *mapping,
                            struct writeback_control *wbc, writepage_t writepage,
                            void *data)
      {
              int ret = 0;
              int done = 0;
              int error;
              struct pagevec pvec;
              int nr_pages;
              pgoff_t uninitialized_var(writeback_index);
  130         pgoff_t index;
              pgoff_t end;                /* Inclusive */
              pgoff_t done_index;
              int range_whole = 0;
              int tag;
      
              pagevec_init(&pvec, 0);
              if (wbc->range_cyclic) {
                      writeback_index = mapping->writeback_index; /* prev offset */
                      index = writeback_index;
  130                 end = -1;
              } else {
  123                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
                      end = wbc->range_end >> PAGE_CACHE_SHIFT;
                      if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                              range_whole = 1;
  130         }
              if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                      tag = PAGECACHE_TAG_TOWRITE;
              else
                      tag = PAGECACHE_TAG_DIRTY;
  130         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  130                 tag_pages_for_writeback(mapping, index, end);
              done_index = index;
  130         while (!done && (index <= end)) {
                      int i;
      
  130                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                                    min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                      if (nr_pages == 0)
                              break;
      
  122                 for (i = 0; i < nr_pages; i++) {
  122                         struct page *page = pvec.pages[i];
      
                              /*
                               * At this point, the page may be truncated or
  122                          * invalidated (changing page->mapping to NULL), or
                               * even swizzled back from swapper_space to tmpfs file
                               * mapping. However, page->index will not change
                               * because we have a reference on the page.
                               */
                              if (page->index > end) {
                                      /*
                                       * can't be range_cyclic (1st pass) because
                                       * end == -1 in that case.
                                       */
  122                                 done = 1;
                                      break;
   11                         }
      
                              done_index = page->index;
      
  122                         lock_page(page);
      
                              /*
                               * Page truncated or invalidated. We can freely skip it
                               * then, even for data integrity operations: the page
  122                          * has disappeared concurrently, so there could be no
     2                          * real expectation of this data integrity operation
    2                          * even if there is now a new, dirty page at the same
                               * pagecache address.
                               */
                              if (unlikely(page->mapping != mapping)) {
      continue_unlock:
  122                                 unlock_page(page);
  122                                 continue;
                              }
      
  122                         if (!PageDirty(page)) {
  122                                 /* someone wrote it for us */
                                      goto continue_unlock;
                              }
      
                              if (PageWriteback(page)) {
                                      if (wbc->sync_mode != WB_SYNC_NONE)
                                              wait_on_page_writeback(page);
                                      else
                                              goto continue_unlock;
                              }
      
                              BUG_ON(PageWriteback(page));
                              if (!clear_page_dirty_for_io(page))
                                      goto continue_unlock;
      
                              trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
                              error = (*writepage)(page, wbc, data);
                              if (unlikely(error)) {
                                      /*
                                       * Handle errors according to the type of
                                       * writeback. There's no need to continue for
                                       * background writeback. Just push done_index
                                       * past this page so media errors won't choke
                                       * writeout for the entire file. For integrity
                                       * writeback, we must process the entire dirty
                                       * set regardless of errors because the fs may
                                       * still have state to clear for each page. In
                                       * that case we continue processing and return
                                       * the first error.
                                       */
                                      if (error == AOP_WRITEPAGE_ACTIVATE) {
                                              unlock_page(page);
                                              error = 0;
  122                                 } else if (wbc->sync_mode != WB_SYNC_ALL) {
                                              ret = error;
                                              done_index = page->index + 1;
                                              done = 1;
                                              break;
                                      }
  122                                 if (!ret)
  122                                         ret = error;
                              }
  130 
                              /*
                               * We stop writing back only if we are not doing
                               * integrity sync. In case of integrity sync we have to
                               * keep going until we have written all the pages
                               * we tagged for writeback prior to entering this loop.
                               */
                              if (--wbc->nr_to_write <= 0 &&
                                  wbc->sync_mode == WB_SYNC_NONE) {
                                      done = 1;
                                      break;
  130                         }
   79                 }
                      pagevec_release(&pvec);
  130                 cond_resched();
              }
      
              /*
               * If we hit the last page and there is more work to be done: wrap
               * back the index back to the start of the file for the next
               * time we are called.
               */
              if (wbc->range_cyclic && !done)
                      done_index = 0;
              if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                      mapping->writeback_index = done_index;
  122 
              return ret;
  122 }
      EXPORT_SYMBOL(write_cache_pages);
      
      /*
       * Function used by generic_writepages to call the real writepage
       * function and set the mapping flags on error
       */
      static int __writepage(struct page *page, struct writeback_control *wbc,
                             void *data)
      {
              struct address_space *mapping = data;
  130         int ret = mapping->a_ops->writepage(page, wbc);
              mapping_set_error(mapping, ret);
              return ret;
      }
      
      /**
       * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
  130  * @mapping: address space structure to write
       * @wbc: subtract the number of written pages from *@wbc->nr_to_write
       *
  130  * This is a library function, which implements the writepages()
       * address_space_operation.
       */
  130 int generic_writepages(struct address_space *mapping,
                             struct writeback_control *wbc)
      {
              struct blk_plug plug;
              int ret;
      
              /* deal with chardevs and other special file */
              if (!mapping->a_ops->writepage)
                      return 0;
  711 
              blk_start_plug(&plug);
  697         ret = write_cache_pages(mapping, wbc, __writepage, mapping);
  711         blk_finish_plug(&plug);
              return ret;
      }
      
      EXPORT_SYMBOL(generic_writepages);
      
      int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
      {
              int ret;
      
              if (wbc->nr_to_write <= 0)
                      return 0;
              if (mapping->a_ops->writepages)
                      ret = mapping->a_ops->writepages(mapping, wbc);
              else
                      ret = generic_writepages(mapping, wbc);
              return ret;
      }
      
      /**
       * write_one_page - write out a single page and optionally wait on I/O
       * @page: the page to write
       * @wait: if true, wait on writeout
       *
       * The page must be locked by the caller and will be unlocked upon return.
       *
       * write_one_page() returns a negative error code if I/O failed.
       */
      int write_one_page(struct page *page, int wait)
      {
              struct address_space *mapping = page->mapping;
              int ret = 0;
              struct writeback_control wbc = {
                      .sync_mode = WB_SYNC_ALL,
                      .nr_to_write = 1,
              };
      
              BUG_ON(!PageLocked(page));
      
              if (wait)
                      wait_on_page_writeback(page);
      
              if (clear_page_dirty_for_io(page)) {
                      page_cache_get(page);
                      ret = mapping->a_ops->writepage(page, &wbc);
                      if (ret == 0 && wait) {
                              wait_on_page_writeback(page);
                              if (PageError(page))
                                      ret = -EIO;
                      }
  455                 page_cache_release(page);
  429         } else {
                      unlock_page(page);
              }
              return ret;
      }
      EXPORT_SYMBOL(write_one_page);
      
      /*
       * For address_spaces which do not use buffers nor write back.
       */
      int __set_page_dirty_no_writeback(struct page *page)
      {
              if (!PageDirty(page))
                      return !TestSetPageDirty(page);
  695         return 0;
      }
  695 
      /*
  695  * Helper function for set_page_dirty family.
       *
       * Caller must hold mem_cgroup_begin_page_stat().
       *
  695  * NOTE: This relies on being atomic wrt interrupts.
       */
      void account_page_dirtied(struct page *page, struct address_space *mapping,
  695                           struct mem_cgroup *memcg)
      {
              struct inode *inode = mapping->host;
      
              trace_writeback_dirty_page(page, mapping);
      
              if (mapping_cap_account_dirty(mapping)) {
                      struct bdi_writeback *wb;
  695 
                      inode_attach_wb(inode, page);
                      wb = inode_to_wb(inode);
      
                      mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                      __inc_zone_page_state(page, NR_FILE_DIRTY);
                      __inc_zone_page_state(page, NR_DIRTIED);
                      __inc_wb_stat(wb, WB_RECLAIMABLE);
   83                 __inc_wb_stat(wb, WB_DIRTIED);
                      task_io_account_write(PAGE_CACHE_SIZE);
                      current->nr_dirtied++;
   83                 this_cpu_inc(bdp_ratelimits);
              }
   83 }
   83 EXPORT_SYMBOL(account_page_dirtied);
   83 
      /*
   83  * Helper function for deaccounting dirty page without writeback.
       *
       * Caller must hold mem_cgroup_begin_page_stat().
       */
      void account_page_cleaned(struct page *page, struct address_space *mapping,
                                struct mem_cgroup *memcg, struct bdi_writeback *wb)
      {
              if (mapping_cap_account_dirty(mapping)) {
                      mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                      dec_zone_page_state(page, NR_FILE_DIRTY);
                      dec_wb_stat(wb, WB_RECLAIMABLE);
                      task_io_account_cancelled_write(PAGE_CACHE_SIZE);
              }
      }
    2 
      /*
       * For address_spaces which do not use buffers.  Just tag the page as dirty in
       * its radix tree.
       *
    2  * This is also used when a single buffer is being dirtied: we want to set the
       * page dirty in that case, but not all the buffers.  This is a "bottom-up"
       * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
       *
       * The caller must ensure this doesn't race with truncation.  Most will simply
       * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
       * the pte lock held, which also locks out truncation.
       */
      int __set_page_dirty_nobuffers(struct page *page)
       {
               struct mem_cgroup *memcg;

               memcg = mem_cgroup_begin_page_stat(page);
               if (!TestSetPageDirty(page)) {
                      struct address_space *mapping = page_mapping(page);
                      unsigned long flags;
      
                      if (!mapping) {
                              mem_cgroup_end_page_stat(memcg);
                              return 1;
                       }
      
                      spin_lock_irqsave(&mapping->tree_lock, flags);
                      BUG_ON(page_mapping(page) != mapping);
                      WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
                       account_page_dirtied(page, mapping, memcg);
                      radix_tree_tag_set(&mapping->page_tree, page_index(page),
                                         PAGECACHE_TAG_DIRTY);
                      spin_unlock_irqrestore(&mapping->tree_lock, flags);
                      mem_cgroup_end_page_stat(memcg);
      
                      if (mapping->host) {
                              /* !PageAnon && !swapper_space */
                              __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
                      }
                      return 1;
              }
              mem_cgroup_end_page_stat(memcg);
               return 0;
       }
       EXPORT_SYMBOL(__set_page_dirty_nobuffers);

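
       /*
        * Editorial sketch, not part of the original file: address_spaces that
        * keep no buffer heads usually just wire __set_page_dirty_nobuffers()
        * into their operations table.  The table below is hypothetical; real
        * users include tmpfs and several network filesystems.
        */
       static const struct address_space_operations example_nobuffer_aops = {
               .set_page_dirty = __set_page_dirty_nobuffers,
       };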
      /*
       * Call this whenever redirtying a page, to de-account the dirty counters
       * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
        * counters (NR_WRITTEN, BDI_WRITTEN) in the long term.  Mismatches would
        * lead to systematic errors in balanced_dirty_ratelimit and the dirty
        * pages position control.
        */
      void account_page_redirty(struct page *page)
      {
               struct address_space *mapping = page->mapping;
      
              if (mapping && mapping_cap_account_dirty(mapping)) {
                      struct inode *inode = mapping->host;
                      struct bdi_writeback *wb;
                      struct wb_lock_cookie cookie = {};
      
                      wb = unlocked_inode_to_wb_begin(inode, &cookie);
                      current->nr_dirtied--;
                      dec_zone_page_state(page, NR_DIRTIED);
                      dec_wb_stat(wb, WB_DIRTIED);
                      unlocked_inode_to_wb_end(inode, &cookie);
               }
      }
      EXPORT_SYMBOL(account_page_redirty);
      
      /*
       * When a writepage implementation decides that it doesn't want to write this
       * page for some reason, it should redirty the locked page via
       * redirty_page_for_writepage() and it should then unlock the page and return 0
       */
      int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
      {
              int ret;
      
              wbc->pages_skipped++;
              ret = __set_page_dirty_nobuffers(page);
              account_page_redirty(page);
              return ret;
      }
       EXPORT_SYMBOL(redirty_page_for_writepage);
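
       /*
        * Editorial sketch, not part of the original file: the shape of a
        * ->writepage() that declines to write, as described in the comment
        * above.  The sync_mode test stands in for whatever condition the
        * filesystem actually checks before skipping the page.
        */
       static int example_writepage(struct page *page, struct writeback_control *wbc)
       {
               if (wbc->sync_mode != WB_SYNC_ALL) {
                       /* Not writing now: keep the page on the dirty lists */
                       redirty_page_for_writepage(wbc, page);
                       unlock_page(page);
                       return 0;
               }
               /* ... otherwise submit the page for I/O here ... */
               unlock_page(page);
               return 0;
       }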
      
       /*
        * Dirty a page.
        *
        * For pages with a mapping this should be done under the page lock, for
        * the benefit of asynchronous memory error handling, which prefers to
        * see a consistent dirty state.  This rule can be broken in some special
        * cases, but it is better not to.
       *
       * If the mapping doesn't provide a set_page_dirty a_op, then
       * just fall through and assume that it wants buffer_heads.
       */
      int set_page_dirty(struct page *page)
      {
              struct address_space *mapping = page_mapping(page);
      
              if (likely(mapping)) {
                      int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
                       /*
                        * A race with end_page_writeback() can leave PG_readahead
                        * (set by readahead) or PG_reclaim (set by
                        * lru_deactivate_page) behind here.  For readahead this
                        * is harmless: the flag is reset when the page is
                        * written.  For lru_deactivate_page it is also harmless
                        * if the page is merely redirtied; but if the page is
                        * then used by readahead, the stale flag confuses
                        * readahead and makes it restart its window ramp-up.
                        * That is only a minor problem.
                        */
                      if (PageReclaim(page))
                              ClearPageReclaim(page);
      #ifdef CONFIG_BLOCK
                      if (!spd)
                              spd = __set_page_dirty_buffers;
      #endif
                      return (*spd)(page);
              }
              if (!PageDirty(page)) {
                      if (!TestSetPageDirty(page))
                              return 1;
              }
              return 0;
      }
      EXPORT_SYMBOL(set_page_dirty);
      
      /*
        * set_page_dirty() is racy if the caller has no reference against
        * page->mapping->host, and if the page is unlocked.  This is because another
       * CPU could truncate the page off the mapping and then free the mapping.
       *
       * Usually, the page _is_ locked, or the caller is a user-space process which
       * holds a reference on the inode by having an open file.
       *
       * In other cases, the page should be locked before running set_page_dirty().
       */
      int set_page_dirty_lock(struct page *page)
      {
              int ret;
      
              lock_page(page);
              ret = set_page_dirty(page);
              unlock_page(page);
              return ret;
      }
      EXPORT_SYMBOL(set_page_dirty_lock);
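
       /*
        * Editorial sketch, not part of the original file: the classic caller of
        * set_page_dirty_lock() is a driver releasing pages it pinned with
        * get_user_pages() after the device has DMA'd into them.  The pages are
        * not locked at that point, hence the _lock variant.
        */
       static void example_release_pinned_pages(struct page **pages, int nr,
                                                bool dirty)
       {
               int i;

               for (i = 0; i < nr; i++) {
                       if (dirty)
                               set_page_dirty_lock(pages[i]);
                       page_cache_release(pages[i]);
               }
       }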
      
      /*
       * This cancels just the dirty bit on the kernel page itself, it does NOT
        * actually remove dirty bits on any mmap's that may be around. It also
        * leaves the page tagged dirty, so any sync activity will still find it on
        * the dirty lists, and in particular, clear_page_dirty_for_io() will still
        * look at the dirty bits in the VM.
       *
       * Doing this should *normally* only ever be done when a page is truncated,
       * and is not actually mapped anywhere at all. However, fs/buffer.c does
       * this when it notices that somebody has cleaned out all the buffers on a
       * page without actually doing it through the VM. Can you say "ext3 is
        * horribly ugly"? Thought you could.
        */
       void cancel_dirty_page(struct page *page)
      {
              struct address_space *mapping = page_mapping(page);
      
              if (mapping_cap_account_dirty(mapping)) {
                      struct inode *inode = mapping->host;
                       struct bdi_writeback *wb;
                       struct mem_cgroup *memcg;
                       struct wb_lock_cookie cookie = {};
      
                      memcg = mem_cgroup_begin_page_stat(page);
                      wb = unlocked_inode_to_wb_begin(inode, &cookie);
      
                      if (TestClearPageDirty(page))
                              account_page_cleaned(page, mapping, memcg, wb);
      
                      unlocked_inode_to_wb_end(inode, &cookie);
                      mem_cgroup_end_page_stat(memcg);
              } else {
                      ClearPageDirty(page);
              }
      }
      EXPORT_SYMBOL(cancel_dirty_page);
      
      /*
       * Clear a page's dirty flag, while caring for dirty memory accounting.
       * Returns true if the page was previously dirty.
        *
       * This is for preparing to put the page under writeout.  We leave the page
       * tagged as dirty in the radix tree so that a concurrent write-for-sync
       * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
       * implementation will run either set_page_writeback() or set_page_dirty(),
        * at which stage we bring the page's dirty flag and radix-tree dirty tag
        * back into sync.
       *
       * This incoherency between the page's dirty flag and radix-tree tag is
       * unfortunate, but it only exists while the page is locked.
       */
      int clear_page_dirty_for_io(struct page *page)
      {
              struct address_space *mapping = page_mapping(page);
              int ret = 0;
      
              BUG_ON(!PageLocked(page));
      
              if (mapping && mapping_cap_account_dirty(mapping)) {
                      struct inode *inode = mapping->host;
                      struct bdi_writeback *wb;
                      struct mem_cgroup *memcg;
                      struct wb_lock_cookie cookie = {};
      
                      /*
                       * Yes, Virginia, this is indeed insane.
                       *
                       * We use this sequence to make sure that
                       *  (a) we account for dirty stats properly
                       *  (b) we tell the low-level filesystem to
                       *      mark the whole page dirty if it was
                       *      dirty in a pagetable. Only to then
                       *  (c) clean the page again and return 1 to
                       *      cause the writeback.
                       *
                       * This way we avoid all nasty races with the
                       * dirty bit in multiple places and clearing
                        * them concurrently from different threads.
                       *
                       * Note! Normally the "set_page_dirty(page)"
                       * has no effect on the actual dirty bit - since
                       * that will already usually be set. But we
                       * need the side effects, and it can help us
                       * avoid races.
                       *
                       * We basically use the page "master dirty bit"
                       * as a serialization point for all the different
                        * threads doing their things.
                        */
                       if (page_mkclean(page))
                               set_page_dirty(page);
                       /*
                       * We carefully synchronise fault handlers against
                       * installing a dirty pte and marking the page dirty
                       * at this point.  We do this by having them hold the
                       * page lock while dirtying the page, and pages are
                       * always locked coming in here, so we get the desired
                       * exclusion.
                        */
                      memcg = mem_cgroup_begin_page_stat(page);
                      wb = unlocked_inode_to_wb_begin(inode, &cookie);
                      if (TestClearPageDirty(page)) {
                              mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                              dec_zone_page_state(page, NR_FILE_DIRTY);
                               dec_wb_stat(wb, WB_RECLAIMABLE);
                              ret = 1;
                      }
                      unlocked_inode_to_wb_end(inode, &cookie);
                      mem_cgroup_end_page_stat(memcg);
                      return ret;
               }
               return TestClearPageDirty(page);
      }
      EXPORT_SYMBOL(clear_page_dirty_for_io);
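
       /*
        * Editorial sketch, not part of the original file: how a writeback path
        * is expected to use clear_page_dirty_for_io(), mirroring what
        * write_one_page() above does.  Only a page whose dirty bit was actually
        * cleared is handed to ->writepage(); otherwise it is simply unlocked.
        */
       static int example_write_locked_page(struct page *page,
                                            struct writeback_control *wbc)
       {
               struct address_space *mapping = page_mapping(page);

               BUG_ON(!PageLocked(page));
               if (clear_page_dirty_for_io(page))
                       return mapping->a_ops->writepage(page, wbc);
               unlock_page(page);
               return 0;
       }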
      int test_clear_page_writeback(struct page *page)
      {
               struct address_space *mapping = page_mapping(page);
              struct mem_cgroup *memcg;
              int ret;
      
               memcg = mem_cgroup_begin_page_stat(page);
               if (mapping) {
                       struct inode *inode = mapping->host;
                      struct backing_dev_info *bdi = inode_to_bdi(inode);
                      unsigned long flags;
      
                      spin_lock_irqsave(&mapping->tree_lock, flags);
                      ret = TestClearPageWriteback(page);
                      if (ret) {
                              radix_tree_tag_clear(&mapping->page_tree,
                                                      page_index(page),
                                                      PAGECACHE_TAG_WRITEBACK);
                               if (bdi_cap_account_writeback(bdi)) {
                                      struct bdi_writeback *wb = inode_to_wb(inode);
      
                                      __dec_wb_stat(wb, WB_WRITEBACK);
                                       __wb_writeout_inc(wb);
                              }
                      }
                      spin_unlock_irqrestore(&mapping->tree_lock, flags);
              } else {
                       ret = TestClearPageWriteback(page);
              }
              if (ret) {
                      mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
                      dec_zone_page_state(page, NR_WRITEBACK);
                      inc_zone_page_state(page, NR_WRITTEN);
               }
               mem_cgroup_end_page_stat(memcg);
              return ret;
      }

       int __test_set_page_writeback(struct page *page, bool keep_write)
      {
               struct address_space *mapping = page_mapping(page);
              struct mem_cgroup *memcg;
              int ret;
      
               memcg = mem_cgroup_begin_page_stat(page);
               if (mapping) {
                       struct inode *inode = mapping->host;
                       struct backing_dev_info *bdi = inode_to_bdi(inode);
                      unsigned long flags;
      
                       spin_lock_irqsave(&mapping->tree_lock, flags);
                       ret = TestSetPageWriteback(page);
                      if (!ret) {
                              radix_tree_tag_set(&mapping->page_tree,
                                                page_index(page),
                                                      PAGECACHE_TAG_WRITEBACK);
                              if (bdi_cap_account_writeback(bdi))
                                      __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
                      }
                      if (!PageDirty(page))
                               radix_tree_tag_clear(&mapping->page_tree,
                                                      page_index(page),
                                                      PAGECACHE_TAG_DIRTY);
                       if (!keep_write)
                              radix_tree_tag_clear(&mapping->page_tree,
                                                      page_index(page),
                                                      PAGECACHE_TAG_TOWRITE);
                      spin_unlock_irqrestore(&mapping->tree_lock, flags);
              } else {
                      ret = TestSetPageWriteback(page);
              }
              if (!ret) {
                      mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
                      inc_zone_page_state(page, NR_WRITEBACK);
               }
              mem_cgroup_end_page_stat(memcg);
              return ret;
      
      }
      EXPORT_SYMBOL(__test_set_page_writeback);
      
      /*
       * Return true if any of the pages in the mapping are marked with the
       * passed tag.
       */
      int mapping_tagged(struct address_space *mapping, int tag)
      {
              return radix_tree_tagged(&mapping->page_tree, tag);
       }
      EXPORT_SYMBOL(mapping_tagged);
      /**
       * wait_for_stable_page() - wait for writeback to finish, if necessary.
       * @page:        The page to wait on.
       *
       * This function determines if the given page is related to a backing device
       * that requires page contents to be held stable during writeback.  If so, then
       * it will wait for any pending writeback to complete.
       */
      void wait_for_stable_page(struct page *page)
      {
              if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
                      wait_on_page_writeback(page);
      }
      EXPORT_SYMBOL_GPL(wait_for_stable_page);
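
       /*
        * Editorial sketch, not part of the original file: callers such as
        * ->page_mkwrite() implementations invoke wait_for_stable_page() on a
        * locked pagecache page before (re)dirtying it, so that devices which
        * require stable pages (checksumming RAID, DIF/DIX, iSCSI data digests)
        * never see the data change while it is under writeback.
        */
       static inline void example_prepare_to_redirty(struct page *page)
       {
               /* page is locked and belongs to a mapping at this point */
               wait_for_stable_page(page);
               set_page_dirty(page);
       }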
      #ifndef __LINUX_SPINLOCK_H
      #define __LINUX_SPINLOCK_H
      
      /*
       * include/linux/spinlock.h - generic spinlock/rwlock declarations
       *
       * here's the role of the various spinlock/rwlock related include files:
       *
       * on SMP builds:
       *
       *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
       *                        initializers
       *
       *  linux/spinlock_types.h:
       *                        defines the generic type and initializers
       *
       *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
       *                        implementations, mostly inline assembly code
       *
       *   (also included on UP-debug builds:)
       *
       *  linux/spinlock_api_smp.h:
       *                        contains the prototypes for the _spin_*() APIs.
       *
       *  linux/spinlock.h:     builds the final spin_*() APIs.
       *
       * on UP builds:
       *
       *  linux/spinlock_type_up.h:
       *                        contains the generic, simplified UP spinlock type.
       *                        (which is an empty structure on non-debug builds)
       *
       *  linux/spinlock_types.h:
       *                        defines the generic type and initializers
       *
       *  linux/spinlock_up.h:
       *                        contains the arch_spin_*()/etc. version of UP
       *                        builds. (which are NOPs on non-debug, non-preempt
       *                        builds)
       *
       *   (included on UP-non-debug builds:)
       *
       *  linux/spinlock_api_up.h:
       *                        builds the _spin_*() APIs.
       *
       *  linux/spinlock.h:     builds the final spin_*() APIs.
       */
      
      #include <linux/typecheck.h>
      #include <linux/preempt.h>
      #include <linux/linkage.h>
      #include <linux/compiler.h>
      #include <linux/irqflags.h>
      #include <linux/thread_info.h>
      #include <linux/kernel.h>
      #include <linux/stringify.h>
      #include <linux/bottom_half.h>
      #include <asm/barrier.h>
      
      
      /*
       * Must define these before including other files, inline functions need them
       */
      #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME
      
      #define LOCK_SECTION_START(extra)               \
              ".subsection 1\n\t"                     \
              extra                                   \
              ".ifndef " LOCK_SECTION_NAME "\n\t"     \
              LOCK_SECTION_NAME ":\n\t"               \
              ".endif\n"
      
      #define LOCK_SECTION_END                        \
              ".previous\n\t"
      
      #define __lockfunc __attribute__((section(".spinlock.text")))
      
      /*
       * Pull the arch_spinlock_t and arch_rwlock_t definitions:
       */
      #include <linux/spinlock_types.h>
      
      /*
       * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
       */
      #ifdef CONFIG_SMP
      # include <asm/spinlock.h>
      #else
      # include <linux/spinlock_up.h>
      #endif
      
      #ifdef CONFIG_DEBUG_SPINLOCK
        extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                         struct lock_class_key *key);
      # define raw_spin_lock_init(lock)                                \
      do {                                                                \
              static struct lock_class_key __key;                        \
                                                                      \
              __raw_spin_lock_init((lock), #lock, &__key);                \
      } while (0)
      
      #else
      # define raw_spin_lock_init(lock)                                \
              do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
      #endif
      
      #define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)
      
      #ifdef CONFIG_GENERIC_LOCKBREAK
      #define raw_spin_is_contended(lock) ((lock)->break_lock)
      #else
      
      #ifdef arch_spin_is_contended
      #define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
      #else
      #define raw_spin_is_contended(lock)        (((void)(lock), 0))
      #endif /*arch_spin_is_contended*/
      #endif
      
      /*
        * Despite its name it doesn't necessarily have to be a full barrier.
        * It only needs to guarantee that a STORE issued before the critical
        * section cannot be reordered with LOADs and STOREs inside that section;
        * spin_lock() itself is a one-way barrier, so such a LOAD cannot escape
        * out of the region.  The default implementation therefore simply
        * ensures that a STORE cannot move into the critical section: smp_wmb()
        * serializes it with the STORE done by spin_lock().
       */
      #ifndef smp_mb__before_spinlock
      #define smp_mb__before_spinlock()        smp_wmb()
      #endif
      
      /**
       * raw_spin_unlock_wait - wait until the spinlock gets unlocked
       * @lock: the spinlock in question.
       */
      #define raw_spin_unlock_wait(lock)        arch_spin_unlock_wait(&(lock)->raw_lock)
      
      #ifdef CONFIG_DEBUG_SPINLOCK
       extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
      #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
       extern int do_raw_spin_trylock(raw_spinlock_t *lock);
       extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
      #else
      static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
      {
              __acquire(lock);
              arch_spin_lock(&lock->raw_lock);
      }
      
      static inline void
      do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock)
      {
              __acquire(lock);
              arch_spin_lock_flags(&lock->raw_lock, *flags);
      }
      
      static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
      {
              return arch_spin_trylock(&(lock)->raw_lock);
      }
      
      static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
      {
              arch_spin_unlock(&lock->raw_lock);
              __release(lock);
      }
      #endif
      
      /*
       * Define the various spin_lock methods.  Note we define these
       * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The
       * various methods are defined as nops in the case they are not
       * required.
       */
      #define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))
      
      #define raw_spin_lock(lock)        _raw_spin_lock(lock)
      
      #ifdef CONFIG_DEBUG_LOCK_ALLOC
      # define raw_spin_lock_nested(lock, subclass) \
              _raw_spin_lock_nested(lock, subclass)
      # define raw_spin_lock_bh_nested(lock, subclass) \
              _raw_spin_lock_bh_nested(lock, subclass)
      
      # define raw_spin_lock_nest_lock(lock, nest_lock)                        \
               do {                                                                \
                       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                       _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
               } while (0)
      #else
      /*
       * Always evaluate the 'subclass' argument to avoid that the compiler
       * warns about set-but-not-used variables when building with
       * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
       */
      # define raw_spin_lock_nested(lock, subclass)                \
              _raw_spin_lock(((void)(subclass), (lock)))
      # define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
      # define raw_spin_lock_bh_nested(lock, subclass)        _raw_spin_lock_bh(lock)
      #endif
      
      #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
      
      #define raw_spin_lock_irqsave(lock, flags)                        \
              do {                                                \
                      typecheck(unsigned long, flags);        \
                      flags = _raw_spin_lock_irqsave(lock);        \
              } while (0)
      
      #ifdef CONFIG_DEBUG_LOCK_ALLOC
      #define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
              do {                                                                \
                      typecheck(unsigned long, flags);                        \
                      flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
              } while (0)
      #else
      #define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
              do {                                                                \
                      typecheck(unsigned long, flags);                        \
                      flags = _raw_spin_lock_irqsave(lock);                        \
              } while (0)
      #endif
      
      #else
      
      #define raw_spin_lock_irqsave(lock, flags)                \
              do {                                                \
                      typecheck(unsigned long, flags);        \
                      _raw_spin_lock_irqsave(lock, flags);        \
              } while (0)
      
      #define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
              raw_spin_lock_irqsave(lock, flags)
      
      #endif
      
      #define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
      #define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
      #define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
      #define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)
      
      #define raw_spin_unlock_irqrestore(lock, flags)                \
              do {                                                        \
                      typecheck(unsigned long, flags);                \
                      _raw_spin_unlock_irqrestore(lock, flags);        \
              } while (0)
      #define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)
      
      #define raw_spin_trylock_bh(lock) \
              __cond_lock(lock, _raw_spin_trylock_bh(lock))
      
      #define raw_spin_trylock_irq(lock) \
      ({ \
              local_irq_disable(); \
              raw_spin_trylock(lock) ? \
              1 : ({ local_irq_enable(); 0;  }); \
      })
      
      #define raw_spin_trylock_irqsave(lock, flags) \
      ({ \
              local_irq_save(flags); \
              raw_spin_trylock(lock) ? \
              1 : ({ local_irq_restore(flags); 0; }); \
      })
      
      /**
       * raw_spin_can_lock - would raw_spin_trylock() succeed?
       * @lock: the spinlock in question.
       */
      #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
      
      /* Include rwlock functions */
      #include <linux/rwlock.h>
      
      /*
       * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
       */
      #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
      # include <linux/spinlock_api_smp.h>
      #else
      # include <linux/spinlock_api_up.h>
      #endif
      
      /*
       * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
       */
      
      static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
      {
               return &lock->rlock;
      }
      
      #define spin_lock_init(_lock)                                \
      do {                                                        \
              spinlock_check(_lock);                                \
              raw_spin_lock_init(&(_lock)->rlock);                \
      } while (0)
      
      static __always_inline void spin_lock(spinlock_t *lock)
      {
               raw_spin_lock(&lock->rlock);
       }
      
      static __always_inline void spin_lock_bh(spinlock_t *lock)
      {
               raw_spin_lock_bh(&lock->rlock);
      }
      
      static __always_inline int spin_trylock(spinlock_t *lock)
      {
               return raw_spin_trylock(&lock->rlock);
      }
      
      #define spin_lock_nested(lock, subclass)                        \
      do {                                                                \
              raw_spin_lock_nested(spinlock_check(lock), subclass);        \
      } while (0)
      
      #define spin_lock_bh_nested(lock, subclass)                        \
      do {                                                                \
              raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
      } while (0)
      
      #define spin_lock_nest_lock(lock, nest_lock)                                \
      do {                                                                        \
              raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
      } while (0)
      
      static __always_inline void spin_lock_irq(spinlock_t *lock)
      {
               raw_spin_lock_irq(&lock->rlock);
      }
      
      #define spin_lock_irqsave(lock, flags)                                \
      do {                                                                \
              raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
      } while (0)
      
      #define spin_lock_irqsave_nested(lock, flags, subclass)                        \
      do {                                                                        \
              raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
      } while (0)
      
      static __always_inline void spin_unlock(spinlock_t *lock)
      {
               raw_spin_unlock(&lock->rlock);
      }
      
      static __always_inline void spin_unlock_bh(spinlock_t *lock)
      {
               raw_spin_unlock_bh(&lock->rlock);
      }
      
      static __always_inline void spin_unlock_irq(spinlock_t *lock)
      {
               raw_spin_unlock_irq(&lock->rlock);
      }
      
      static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
      {
               raw_spin_unlock_irqrestore(&lock->rlock, flags);
      }
      
      static __always_inline int spin_trylock_bh(spinlock_t *lock)
      {
               return raw_spin_trylock_bh(&lock->rlock);
      }
      
      static __always_inline int spin_trylock_irq(spinlock_t *lock)
      {
              return raw_spin_trylock_irq(&lock->rlock);
      }
      
      #define spin_trylock_irqsave(lock, flags)                        \
      ({                                                                \
              raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
      })
      
      static __always_inline void spin_unlock_wait(spinlock_t *lock)
      {
              raw_spin_unlock_wait(&lock->rlock);
      }
      
      static __always_inline int spin_is_locked(spinlock_t *lock)
      {
               return raw_spin_is_locked(&lock->rlock);
      }
      
      static __always_inline int spin_is_contended(spinlock_t *lock)
      {
               return raw_spin_is_contended(&lock->rlock);
      }
      
      static __always_inline int spin_can_lock(spinlock_t *lock)
      {
              return raw_spin_can_lock(&lock->rlock);
      }
      
      #define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)
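
       /*
        * Editorial sketch, not part of this header: the common usage pattern
        * for the wrappers above.  spin_lock_irqsave() is the safe default when
        * the lock may also be taken from interrupt context; plain spin_lock()
        * suffices when it cannot.  The structure and field names below are
        * hypothetical.
        */
       struct example_counter {
               spinlock_t      lock;   /* initialise with spin_lock_init() */
               unsigned long   value;
       };

       static inline void example_counter_add(struct example_counter *c,
                                              unsigned long delta)
       {
               unsigned long flags;

               spin_lock_irqsave(&c->lock, flags);
               c->value += delta;
               spin_unlock_irqrestore(&c->lock, flags);
       }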
      
      /*
       * Pull the atomic_t declaration:
       * (asm-mips/atomic.h needs above definitions)
       */
      #include <linux/atomic.h>
      /**
       * atomic_dec_and_lock - lock on reaching reference count zero
       * @atomic: the atomic counter
       * @lock: the spinlock in question
       *
       * Decrements @atomic by 1.  If the result is 0, returns true and locks
       * @lock.  Returns false for all other cases.
       */
      extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
      #define atomic_dec_and_lock(atomic, lock) \
                      __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
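
       /*
        * Editorial sketch, not part of this header: atomic_dec_and_lock() lets
        * a refcounted object take its list/table lock only on the final put,
        * keeping the common fast path lock-free.  The helper below is
        * hypothetical.
        */
       static inline void example_put_object(atomic_t *refcount, spinlock_t *lock)
       {
               if (atomic_dec_and_lock(refcount, lock)) {
                       /* last reference: tear the object down under the lock */
                       spin_unlock(lock);
               }
       }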
      
      #endif /* __LINUX_SPINLOCK_H */
      #ifndef __KERNEL_PRINTK__
      #define __KERNEL_PRINTK__
      
      #include <stdarg.h>
      #include <linux/init.h>
      #include <linux/kern_levels.h>
      #include <linux/linkage.h>
      #include <linux/cache.h>
      
      extern const char linux_banner[];
      extern const char linux_proc_banner[];
      
      static inline int printk_get_level(const char *buffer)
      {
               if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
                       switch (buffer[1]) {
                      case '0' ... '7':
                      case 'd':        /* KERN_DEFAULT */
                              return buffer[1];
                      }
              }
              return 0;
      }
      
      static inline const char *printk_skip_level(const char *buffer)
      {
               if (printk_get_level(buffer))
                      return buffer + 2;
      
              return buffer;
      }
      
      #define CONSOLE_EXT_LOG_MAX        8192
      
      /* printk's without a loglevel use this.. */
      #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
      
      /* We show everything that is MORE important than this.. */
      #define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
      #define CONSOLE_LOGLEVEL_MIN         1 /* Minimum loglevel we let people use */
      #define CONSOLE_LOGLEVEL_QUIET         4 /* Shhh ..., when booted with "quiet" */
      #define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
      #define CONSOLE_LOGLEVEL_DEBUG        10 /* issue debug messages */
      #define CONSOLE_LOGLEVEL_MOTORMOUTH 15        /* You can't shut this one up */
      
      extern int console_printk[];
      
      #define console_loglevel (console_printk[0])
      #define default_message_loglevel (console_printk[1])
      #define minimum_console_loglevel (console_printk[2])
      #define default_console_loglevel (console_printk[3])
      
      static inline void console_silent(void)
      {
              console_loglevel = CONSOLE_LOGLEVEL_SILENT;
      }
      
      static inline void console_verbose(void)
      {
              if (console_loglevel)
                      console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
      }
      
      struct va_format {
              const char *fmt;
              va_list *va;
      };
      
      /*
       * FW_BUG
        * Add this to a message where you are sure the firmware is buggy or behaves
        * really stupidly or out of spec. Be aware that the responsible BIOS
        * developer should be able to fix this issue, or at least get a concrete
        * idea of the problem, by reading your message without needing to look at
        * the kernel code.
       *
       * Use it for definite and high priority BIOS bugs.
       *
        * FW_WARN
        * Use it for less clear-cut cases (e.g. the kernel may already have messed
        * things up) and medium priority BIOS bugs.
       *
       * FW_INFO
       * Use this one if you want to tell the user or vendor about something
       * suspicious, but generally harmless related to the firmware.
       *
       * Use it for information or very low priority BIOS bugs.
       */
      #define FW_BUG                "[Firmware Bug]: "
      #define FW_WARN                "[Firmware Warn]: "
      #define FW_INFO                "[Firmware Info]: "
      
      /*
       * HW_ERR
       * Add this to a message for hardware errors, so that user can report
       * it to hardware vendor instead of LKML or software vendor.
       */
      #define HW_ERR                "[Hardware Error]: "
      
      /*
       * DEPRECATED
       * Add this to a message whenever you want to warn user space about the use
       * of a deprecated aspect of an API so they can stop using it
       */
      #define DEPRECATED        "[Deprecated]: "
      
      /*
       * Dummy printk for disabled debugging statements to use whilst maintaining
       * gcc's format and side-effect checking.
       */
      static inline __printf(1, 2)
      int no_printk(const char *fmt, ...)
      {
               return 0;
      }
      
      #ifdef CONFIG_EARLY_PRINTK
      extern asmlinkage __printf(1, 2)
      void early_printk(const char *fmt, ...);
      #else
      static inline __printf(1, 2) __cold
      void early_printk(const char *s, ...) { }
      #endif
      
      typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
      
      #ifdef CONFIG_PRINTK
      asmlinkage __printf(5, 0)
      int vprintk_emit(int facility, int level,
                       const char *dict, size_t dictlen,
                       const char *fmt, va_list args);
      
      asmlinkage __printf(1, 0)
      int vprintk(const char *fmt, va_list args);
      
      asmlinkage __printf(5, 6) __cold
      int printk_emit(int facility, int level,
                      const char *dict, size_t dictlen,
                      const char *fmt, ...);
      
      asmlinkage __printf(1, 2) __cold
      int printk(const char *fmt, ...);
      
      /*
       * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
       */
      __printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
      
      /*
       * Please don't use printk_ratelimit(), because it shares ratelimiting state
       * with all other unrelated printk_ratelimit() callsites.  Instead use
       * printk_ratelimited() or plain old __ratelimit().
       */
      extern int __printk_ratelimit(const char *func);
      #define printk_ratelimit() __printk_ratelimit(__func__)
      extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                         unsigned int interval_msec);
      
      extern int printk_delay_msec;
      extern int dmesg_restrict;
      extern int kptr_restrict;
      
      extern void wake_up_klogd(void);
      
      char *log_buf_addr_get(void);
      u32 log_buf_len_get(void);
      void log_buf_kexec_setup(void);
      void __init setup_log_buf(int early);
      __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
      void dump_stack_print_info(const char *log_lvl);
      void show_regs_print_info(const char *log_lvl);
      #else
      static inline __printf(1, 0)
      int vprintk(const char *s, va_list args)
      {
              return 0;
      }
      static inline __printf(1, 2) __cold
      int printk(const char *s, ...)
      {
              return 0;
      }
      static inline __printf(1, 2) __cold
      int printk_deferred(const char *s, ...)
      {
              return 0;
      }
      static inline int printk_ratelimit(void)
      {
              return 0;
      }
      static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                                unsigned int interval_msec)
      {
              return false;
      }
      
      static inline void wake_up_klogd(void)
      {
      }
      
      static inline char *log_buf_addr_get(void)
      {
              return NULL;
      }
      
      static inline u32 log_buf_len_get(void)
      {
              return 0;
      }
      
      static inline void log_buf_kexec_setup(void)
      {
      }
      
      static inline void setup_log_buf(int early)
      {
      }
      
      static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
      {
      }
      
      static inline void dump_stack_print_info(const char *log_lvl)
      {
      }
      
      static inline void show_regs_print_info(const char *log_lvl)
      {
      }
      #endif
      
      extern asmlinkage void dump_stack(void) __cold;
      
      #ifndef pr_fmt
      #define pr_fmt(fmt) fmt
      #endif
      
      /*
       * These can be used to print at the various log levels.
       * All of these will print unconditionally, although note that pr_debug()
       * and other debug macros are compiled out unless either DEBUG is defined
       * or CONFIG_DYNAMIC_DEBUG is set.
       */
      #define pr_emerg(fmt, ...) \
              printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_alert(fmt, ...) \
              printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_crit(fmt, ...) \
              printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_err(fmt, ...) \
              printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_warning(fmt, ...) \
              printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_warn pr_warning
      #define pr_notice(fmt, ...) \
              printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_info(fmt, ...) \
              printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
      /*
       * Like KERN_CONT, pr_cont() should only be used when continuing
       * a line with no newline ('\n') enclosed. Otherwise it defaults
       * back to KERN_DEFAULT.
       */
      #define pr_cont(fmt, ...) \
              printk(KERN_CONT fmt, ##__VA_ARGS__)
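
       /*
        * Editorial sketch, not part of this header: the usual pattern is for a
        * driver to override pr_fmt() before including any headers, so that every
        * pr_*() call is prefixed consistently.  "mydrv" is hypothetical:
        *
        *   #define pr_fmt(fmt) "mydrv: " fmt
        *   #include <linux/module.h>
        *
        *   pr_info("probing device %d\n", id);
        *   (logs e.g. as "mydrv: probing device 0")
        */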
      
      /* pr_devel() should produce zero code unless DEBUG is defined */
      #ifdef DEBUG
      #define pr_devel(fmt, ...) \
              printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_devel(fmt, ...) \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      #include <linux/dynamic_debug.h>
      
      /* If you are writing a driver, please use dev_dbg instead */
      #if defined(CONFIG_DYNAMIC_DEBUG)
      /* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */
      #define pr_debug(fmt, ...) \
              dynamic_pr_debug(fmt, ##__VA_ARGS__)
      #elif defined(DEBUG)
      #define pr_debug(fmt, ...) \
              printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_debug(fmt, ...) \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      /*
       * Print a one-time message (analogous to WARN_ONCE() et al):
       */
      
      #ifdef CONFIG_PRINTK
      #define printk_once(fmt, ...)                                        \
      ({                                                                \
              static bool __print_once __read_mostly;                        \
                                                                      \
              if (!__print_once) {                                        \
                      __print_once = true;                                \
                      printk(fmt, ##__VA_ARGS__);                        \
              }                                                        \
      })
      #define printk_deferred_once(fmt, ...)                                \
      ({                                                                \
              static bool __print_once __read_mostly;                        \
                                                                      \
              if (!__print_once) {                                        \
                      __print_once = true;                                \
                      printk_deferred(fmt, ##__VA_ARGS__);                \
              }                                                        \
      })
      #else
      #define printk_once(fmt, ...)                                        \
              no_printk(fmt, ##__VA_ARGS__)
      #define printk_deferred_once(fmt, ...)                                \
              no_printk(fmt, ##__VA_ARGS__)
      #endif
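
       /*
        * Editorial sketch, not part of this header: printk_once() suits
        * warnings that would otherwise flood the log, e.g. flagging a
        * deprecated interface the first time it is used:
        *
        *   printk_once(KERN_WARNING "mydrv: FOO ioctl is deprecated\n");
        *
        * Later hits at the same call site are dropped silently.
        */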
      
      #define pr_emerg_once(fmt, ...)                                        \
              printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_alert_once(fmt, ...)                                        \
              printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_crit_once(fmt, ...)                                        \
              printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_err_once(fmt, ...)                                        \
              printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_warn_once(fmt, ...)                                        \
              printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_notice_once(fmt, ...)                                \
              printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_info_once(fmt, ...)                                        \
              printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_cont_once(fmt, ...)                                        \
              printk_once(KERN_CONT pr_fmt(fmt), ##__VA_ARGS__)
      
      #if defined(DEBUG)
      #define pr_devel_once(fmt, ...)                                        \
              printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_devel_once(fmt, ...)                                        \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      /* If you are writing a driver, please use dev_dbg instead */
      #if defined(DEBUG)
      #define pr_debug_once(fmt, ...)                                        \
              printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_debug_once(fmt, ...)                                        \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      /*
       * ratelimited messages with local ratelimit_state,
       * no local ratelimit_state used in the !PRINTK case
       */
      #ifdef CONFIG_PRINTK
      #define printk_ratelimited(fmt, ...)                                        \
      ({                                                                        \
              static DEFINE_RATELIMIT_STATE(_rs,                                \
                                            DEFAULT_RATELIMIT_INTERVAL,        \
                                            DEFAULT_RATELIMIT_BURST);                \
                                                                              \
              if (__ratelimit(&_rs))                                                \
                      printk(fmt, ##__VA_ARGS__);                                \
      })
      #else
      #define printk_ratelimited(fmt, ...)                                        \
              no_printk(fmt, ##__VA_ARGS__)
      #endif
      
      #define pr_emerg_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_alert_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_crit_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_err_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_warn_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_notice_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
      #define pr_info_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
      /* no pr_cont_ratelimited, don't do that... */
      
      #if defined(DEBUG)
      #define pr_devel_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_devel_ratelimited(fmt, ...)                                        \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      /* If you are writing a driver, please use dev_dbg instead */
      #if defined(CONFIG_DYNAMIC_DEBUG)
      /* descriptor check is first to prevent flooding with "callbacks suppressed" */
      #define pr_debug_ratelimited(fmt, ...)                                        \
      do {                                                                        \
              static DEFINE_RATELIMIT_STATE(_rs,                                \
                                            DEFAULT_RATELIMIT_INTERVAL,        \
                                            DEFAULT_RATELIMIT_BURST);                \
              DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));                \
              if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&        \
                  __ratelimit(&_rs))                                                \
                      __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);        \
      } while (0)
      #elif defined(DEBUG)
      #define pr_debug_ratelimited(fmt, ...)                                        \
              printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #else
      #define pr_debug_ratelimited(fmt, ...) \
              no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
      #endif
      
      extern const struct file_operations kmsg_fops;
      
      enum {
              DUMP_PREFIX_NONE,
              DUMP_PREFIX_ADDRESS,
              DUMP_PREFIX_OFFSET
      };
      extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
                                    int groupsize, char *linebuf, size_t linebuflen,
                                    bool ascii);
      #ifdef CONFIG_PRINTK
      extern void print_hex_dump(const char *level, const char *prefix_str,
                                 int prefix_type, int rowsize, int groupsize,
                                 const void *buf, size_t len, bool ascii);
      #if defined(CONFIG_DYNAMIC_DEBUG)
      #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)        \
              dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true)
      #else
      extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                       const void *buf, size_t len);
      #endif /* defined(CONFIG_DYNAMIC_DEBUG) */
      #else
      static inline void print_hex_dump(const char *level, const char *prefix_str,
                                        int prefix_type, int rowsize, int groupsize,
                                        const void *buf, size_t len, bool ascii)
      {
      }
      static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                              const void *buf, size_t len)
      {
      }
      
      #endif
      
      #if defined(CONFIG_DYNAMIC_DEBUG)
      #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,        \
                                   groupsize, buf, len, ascii)        \
              dynamic_hex_dump(prefix_str, prefix_type, rowsize,        \
                               groupsize, buf, len, ascii)
      #elif defined(DEBUG)
      #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,                \
                                   groupsize, buf, len, ascii)                \
              print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,        \
                             groupsize, buf, len, ascii)
      #else
      static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
                                              int rowsize, int groupsize,
                                              const void *buf, size_t len, bool ascii)
      {
      }
      #endif
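
       /*
        * Editorial sketch, not part of this header: dumping a buffer with
        * print_hex_dump(), 16 bytes per row, single-byte groups, byte-offset
        * prefixes and an ASCII column.  The prefix string is arbitrary.
        */
       static inline void example_hex_dump(const void *buf, size_t len)
       {
               print_hex_dump(KERN_DEBUG, "example raw: ", DUMP_PREFIX_OFFSET,
                              16, 1, buf, len, true);
       }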
      
      #endif
      /*
       *  Copyright (C) 1995  Linus Torvalds
       *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
       *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
       */
      #include <linux/sched.h>                /* test_thread_flag(), ...        */
      #include <linux/kdebug.h>                /* oops_begin/end, ...                */
      #include <linux/module.h>                /* search_exception_table        */
      #include <linux/bootmem.h>                /* max_low_pfn                        */
      #include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...                */
      #include <linux/mmiotrace.h>                /* kmmio_handler, ...                */
      #include <linux/perf_event.h>                /* perf_sw_event                */
      #include <linux/hugetlb.h>                /* hstate_index_to_shift        */
      #include <linux/prefetch.h>                /* prefetchw                        */
      #include <linux/context_tracking.h>        /* exception_enter(), ...        */
      #include <linux/uaccess.h>                /* faulthandler_disabled()        */
      
      #include <asm/traps.h>                        /* dotraplinkage, ...                */
      #include <asm/pgalloc.h>                /* pgd_*(), ...                        */
      #include <asm/kmemcheck.h>                /* kmemcheck_*(), ...                */
      #include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
      #include <asm/vsyscall.h>                /* emulate_vsyscall                */
      #include <asm/vm86.h>                        /* struct vm86                        */
      
      #define CREATE_TRACE_POINTS
      #include <asm/trace/exceptions.h>
      
      /*
       * Page fault error code bits:
       *
        *   bit 0 ==    0: no page found        1: protection fault
        *   bit 1 ==    0: read access          1: write access
        *   bit 2 ==    0: kernel-mode access   1: user-mode access
        *   bit 3 ==                            1: use of reserved bit detected
        *   bit 4 ==                            1: fault was an instruction fetch
       */
      enum x86_pf_error_code {
      
              PF_PROT                =                1 << 0,
              PF_WRITE        =                1 << 1,
              PF_USER                =                1 << 2,
              PF_RSVD                =                1 << 3,
              PF_INSTR        =                1 << 4,
      };
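
       /*
        * Editorial sketch, not part of the original file: how the bits above
        * are meant to be read.  PF_PROT clear means the page was simply not
        * present; PF_WRITE/PF_USER/PF_INSTR describe the access that faulted.
        * The helper is hypothetical and only restates the table.
        */
       static inline bool example_is_user_write_fault(unsigned long error_code)
       {
               return (error_code & PF_USER) && (error_code & PF_WRITE);
       }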
      
      /*
       * Returns 0 if mmiotrace is disabled, or if the fault is not
       * handled by mmiotrace:
       */
      static nokprobe_inline int
      kmmio_fault(struct pt_regs *regs, unsigned long addr)
      {
              if (unlikely(is_kmmio_active()))
                      if (kmmio_handler(regs, addr) == 1)
                              return -1;
              return 0;
      }
      
      static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
      {
              int ret = 0;
      
              /* kprobe_running() needs smp_processor_id() */
              if (kprobes_built_in() && !user_mode(regs)) {
                      preempt_disable();
                      if (kprobe_running() && kprobe_fault_handler(regs, 14))
                              ret = 1;
                      preempt_enable();
              }
      
              return ret;
      }
      
      /*
       * Prefetch quirks:
       *
       * 32-bit mode:
       *
       *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
       *   Check that here and ignore it.
       *
       * 64-bit mode:
       *
       *   Sometimes the CPU reports invalid exceptions on prefetch.
       *   Check that here and ignore it.
       *
       * Opcode checker based on code by Richard Brunner.
       */
      static inline int
      check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                            unsigned char opcode, int *prefetch)
      {
              unsigned char instr_hi = opcode & 0xf0;
              unsigned char instr_lo = opcode & 0x0f;
      
              switch (instr_hi) {
              case 0x20:
              case 0x30:
                      /*
                       * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                       * In X86_64 long mode, the CPU will signal invalid
                       * opcode if some of these prefixes are present so
                       * X86_64 will never get here anyway
                       */
                      return ((instr_lo & 7) == 0x6);
      #ifdef CONFIG_X86_64
              case 0x40:
                      /*
                       * In AMD64 long mode 0x40..0x4F are valid REX prefixes
                       * Need to figure out under what instruction mode the
                       * instruction was issued. Could check the LDT for lm,
                       * but for now it's good enough to assume that long
                       * mode only uses well known segments or kernel.
                       */
                      return (!user_mode(regs) || user_64bit_mode(regs));
      #endif
              case 0x60:
                      /* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
                      return 0;
              default:
                      return 0;
              }
      }
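/*
 * Illustrative sketch (not part of the original file): the byte patterns the
 * 0x00 case above is looking for.  AMD 3DNow! PREFETCH/PREFETCHW is encoded
 * as 0x0F 0x0D and SSE PREFETCHh as 0x0F 0x18, so a first byte with low
 * nibble 0xF followed by 0x0D or 0x18 marks a prefetch instruction.  The
 * function below is a compiled-out, hypothetical restatement of that test.
 */
#if 0
static int example_is_prefetch_opcode(unsigned char byte0, unsigned char byte1)
{
        return ((byte0 & 0x0f) == 0xF) && (byte1 == 0x0D || byte1 == 0x18);
}
#endif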
      
      static int
      is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
      {
              unsigned char *max_instr;
              unsigned char *instr;
              int prefetch = 0;
      
        /*
         * If it was an exec (instruction fetch) fault on an NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;
      
        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
                return 0;

        while (instr < max_instr) {
                unsigned char opcode;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr++;

                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
                              break;
              }
              return prefetch;
      }
      
      static void
      force_sig_info_fault(int si_signo, int si_code, unsigned long address,
                           struct task_struct *tsk, int fault)
      {
              unsigned lsb = 0;
              siginfo_t info;
      
        info.si_signo   = si_signo;
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
        if (fault & VM_FAULT_HWPOISON_LARGE)
                lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
        if (fault & VM_FAULT_HWPOISON)
                lsb = PAGE_SHIFT;
        info.si_addr_lsb = lsb;
      
              force_sig_info(si_signo, &info, tsk);
      }
      
      DEFINE_SPINLOCK(pgd_lock);
      LIST_HEAD(pgd_list);
      
      #ifdef CONFIG_X86_32
      static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
      {
              unsigned index = pgd_index(address);
              pgd_t *pgd_k;
              pud_t *pud, *pud_k;
              pmd_t *pmd, *pmd_k;
      
              pgd += index;
              pgd_k = init_mm.pgd + index;
      
              if (!pgd_present(*pgd_k))
                      return NULL;
      
              /*
               * set_pgd(pgd, *pgd_k); here would be useless on PAE
               * and redundant with the set_pmd() on non-PAE. As would
               * set_pud.
               */
              pud = pud_offset(pgd, address);
              pud_k = pud_offset(pgd_k, address);
              if (!pud_present(*pud_k))
                      return NULL;
      
              pmd = pmd_offset(pud, address);
              pmd_k = pmd_offset(pud_k, address);
      
              if (pmd_present(*pmd) != pmd_present(*pmd_k))
                      set_pmd(pmd, *pmd_k);
      
              if (!pmd_present(*pmd_k))
                      return NULL;
              else
                      BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
      
              return pmd_k;
      }
      
      void vmalloc_sync_all(void)
      {
              unsigned long address;
      
              if (SHARED_KERNEL_PMD)
                      return;
      
              for (address = VMALLOC_START & PMD_MASK;
                   address >= TASK_SIZE && address < FIXADDR_TOP;
                   address += PMD_SIZE) {
                      struct page *page;
      
                      spin_lock(&pgd_lock);
                      list_for_each_entry(page, &pgd_list, lru) {
                              spinlock_t *pgt_lock;
      
                              /* the pgt_lock only for Xen */
                              pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
      
                              spin_lock(pgt_lock);
                              vmalloc_sync_one(page_address(page), address);
                              spin_unlock(pgt_lock);
                      }
                      spin_unlock(&pgd_lock);
              }
      }
      
      /*
       * 32-bit:
       *
       *   Handle a fault on the vmalloc or module mapping area
       */
      static noinline int vmalloc_fault(unsigned long address)
      {
              unsigned long pgd_paddr;
              pmd_t *pmd_k;
              pte_t *pte_k;
      
              /* Make sure we are in vmalloc area: */
              if (!(address >= VMALLOC_START && address < VMALLOC_END))
                      return -1;
      
              /*
               * Synchronize this task's top level page-table
               * with the 'reference' page table.
               *
               * Do _not_ use "current" here. We might be inside
               * an interrupt in the middle of a task switch..
               */
              pgd_paddr = read_cr3();
              pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
              if (!pmd_k)
                      return -1;
      
              if (pmd_large(*pmd_k))
                      return 0;
      
              pte_k = pte_offset_kernel(pmd_k, address);
              if (!pte_present(*pte_k))
                      return -1;
      
              return 0;
      }
      NOKPROBE_SYMBOL(vmalloc_fault);
      
      /*
       * Did it hit the DOS screen memory VA from vm86 mode?
       */
      static inline void
      check_v8086_mode(struct pt_regs *regs, unsigned long address,
                       struct task_struct *tsk)
      {
      #ifdef CONFIG_VM86
              unsigned long bit;
      
              if (!v8086_mode(regs) || !tsk->thread.vm86)
                      return;
      
              bit = (address - 0xA0000) >> PAGE_SHIFT;
              if (bit < 32)
                      tsk->thread.vm86->screen_bitmap |= 1 << bit;
      #endif
      }
      
      static bool low_pfn(unsigned long pfn)
      {
              return pfn < max_low_pfn;
      }
      
      static void dump_pagetable(unsigned long address)
      {
              pgd_t *base = __va(read_cr3());
              pgd_t *pgd = &base[pgd_index(address)];
              pmd_t *pmd;
              pte_t *pte;
      
      #ifdef CONFIG_X86_PAE
              printk("*pdpt = %016Lx ", pgd_val(*pgd));
              if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                      goto out;
      #endif
              pmd = pmd_offset(pud_offset(pgd, address), address);
              printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
      
              /*
               * We must not directly access the pte in the highpte
               * case if the page table is located in highmem.
               * And let's rather not kmap-atomic the pte, just in case
               * it's allocated already:
               */
              if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
                      goto out;
      
              pte = pte_offset_kernel(pmd, address);
              printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
      out:
              printk("\n");
      }
      
      #else /* CONFIG_X86_64: */
      
      void vmalloc_sync_all(void)
      {
              sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
      }
      
      /*
       * 64-bit:
       *
       *   Handle a fault on the vmalloc area
       */
      static noinline int vmalloc_fault(unsigned long address)
      {
              pgd_t *pgd, *pgd_ref;
              pud_t *pud, *pud_ref;
              pmd_t *pmd, *pmd_ref;
              pte_t *pte, *pte_ref;
      
              /* Make sure we are in vmalloc area: */
              if (!(address >= VMALLOC_START && address < VMALLOC_END))
                      return -1;
      
        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the latter
         * case just flush:
         */
              pgd = pgd_offset(current->active_mm, address);
              pgd_ref = pgd_offset_k(address);
              if (pgd_none(*pgd_ref))
                      return -1;
      
              if (pgd_none(*pgd)) {
                      set_pgd(pgd, *pgd_ref);
                      arch_flush_lazy_mmu_mode();
              } else {
                      BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
              }
      
              /*
               * Below here mismatches are bugs because these lower tables
               * are shared:
               */
      
              pud = pud_offset(pgd, address);
              pud_ref = pud_offset(pgd_ref, address);
              if (pud_none(*pud_ref))
                      return -1;
      
              if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
                      BUG();
      
              if (pud_large(*pud))
                      return 0;
      
              pmd = pmd_offset(pud, address);
              pmd_ref = pmd_offset(pud_ref, address);
              if (pmd_none(*pmd_ref))
                return -1;
      
              if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
                      BUG();
      
              if (pmd_large(*pmd))
                      return 0;
      
              pte_ref = pte_offset_kernel(pmd_ref, address);
              if (!pte_present(*pte_ref))
                      return -1;
      
              pte = pte_offset_kernel(pmd, address);
      
              /*
               * Don't use pte_page here, because the mappings can point
               * outside mem_map, and the NUMA hash lookup cannot handle
               * that:
               */
              if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                      BUG();
      
              return 0;
      }
      NOKPROBE_SYMBOL(vmalloc_fault);
      
      #ifdef CONFIG_CPU_SUP_AMD
      static const char errata93_warning[] =
      KERN_ERR 
      "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
      "******* Working around it, but it may cause SEGVs or burn power.\n"
      "******* Please consider a BIOS update.\n"
      "******* Disabling USB legacy in the BIOS may also help.\n";
      #endif
      
      /*
       * No vm86 mode in 64-bit mode:
       */
      static inline void
      check_v8086_mode(struct pt_regs *regs, unsigned long address,
                       struct task_struct *tsk)
      {
      }
      
      static int bad_address(void *p)
      {
              unsigned long dummy;
      
              return probe_kernel_address((unsigned long *)p, dummy);
      }
      
      static void dump_pagetable(unsigned long address)
      {
              pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
              pgd_t *pgd = base + pgd_index(address);
              pud_t *pud;
              pmd_t *pmd;
              pte_t *pte;
      
              if (bad_address(pgd))
                      goto bad;
      
              printk("PGD %lx ", pgd_val(*pgd));
      
              if (!pgd_present(*pgd))
                      goto out;
      
              pud = pud_offset(pgd, address);
              if (bad_address(pud))
                      goto bad;
      
              printk("PUD %lx ", pud_val(*pud));
              if (!pud_present(*pud) || pud_large(*pud))
                      goto out;
      
              pmd = pmd_offset(pud, address);
              if (bad_address(pmd))
                      goto bad;
      
              printk("PMD %lx ", pmd_val(*pmd));
              if (!pmd_present(*pmd) || pmd_large(*pmd))
                      goto out;
      
              pte = pte_offset_kernel(pmd, address);
              if (bad_address(pte))
                      goto bad;
      
              printk("PTE %lx", pte_val(*pte));
      out:
              printk("\n");
              return;
      bad:
              printk("BAD\n");
      }
      
      #endif /* CONFIG_X86_64 */
      
      /*
       * Workaround for K8 erratum #93 & buggy BIOS.
       *
       * BIOS SMM functions are required to use a specific workaround
       * to avoid corruption of the 64bit RIP register on C stepping K8.
       *
 * A lot of BIOSes that didn't get tested properly miss this.
       *
       * The OS sees this as a page fault with the upper 32bits of RIP cleared.
       * Try to work around it here.
       *
       * Note we only handle faults in kernel here.
       * Does nothing on 32-bit.
       */
      static int is_errata93(struct pt_regs *regs, unsigned long address)
      {
      #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
              if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
                  || boot_cpu_data.x86 != 0xf)
                      return 0;
      
              if (address != regs->ip)
                      return 0;
      
              if ((address >> 32) != 0)
                      return 0;
      
              address |= 0xffffffffUL << 32;
              if ((address >= (u64)_stext && address <= (u64)_etext) ||
                  (address >= MODULES_VADDR && address <= MODULES_END)) {
                      printk_once(errata93_warning);
                      regs->ip = address;
                      return 1;
              }
      #endif
              return 0;
      }
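/*
 * Illustrative sketch (not part of the original file): the fix-up applied
 * above.  A kernel text address such as 0xffffffff81234567 shows up with its
 * upper 32 bits cleared (0x0000000081234567); OR-ing 0xffffffff00000000 back
 * in recreates a plausible RIP, which is then checked against the
 * kernel/module text ranges.  The value and helper name are made up for
 * illustration, and the block is compiled out.
 */
#if 0
static unsigned long example_errata93_fixup(unsigned long truncated_ip)
{
        return truncated_ip | (0xffffffffUL << 32);
}
#endif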
      
      /*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
       * to illegal addresses >4GB.
       *
       * We catch this in the page fault handler because these addresses
       * are not reachable. Just detect this case and return.  Any code
       * segment in LDT is compatibility mode.
       */
      static int is_errata100(struct pt_regs *regs, unsigned long address)
      {
      #ifdef CONFIG_X86_64
              if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
                      return 1;
      #endif
              return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
      {
      #ifdef CONFIG_X86_F00F_BUG
              unsigned long nr;
      
              /*
               * Pentium F0 0F C7 C8 bug workaround:
               */
              if (boot_cpu_has_bug(X86_BUG_F00F)) {
                      nr = (address - idt_descr.address) >> 3;
      
                      if (nr == 6) {
                              do_invalid_op(regs, 0);
                              return 1;
                      }
              }
      #endif
              return 0;
      }
      
      static const char nx_warning[] = KERN_CRIT
      "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
      static const char smep_warning[] = KERN_CRIT
      "unable to execute userspace code (SMEP?) (uid: %d)\n";
      
      static void
      show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
      {
              if (!oops_may_print())
                      return;
      
              if (error_code & PF_INSTR) {
                      unsigned int level;
                      pgd_t *pgd;
                      pte_t *pte;
      
                      pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
                      pgd += pgd_index(address);
      
                      pte = lookup_address_in_pgd(pgd, address, &level);
      
                      if (pte && pte_present(*pte) && !pte_exec(*pte))
                              printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
                      if (pte && pte_present(*pte) && pte_exec(*pte) &&
                                      (pgd_flags(*pgd) & _PAGE_USER) &&
                                      (__read_cr4() & X86_CR4_SMEP))
                              printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
              }
      
              printk(KERN_ALERT "BUG: unable to handle kernel ");
              if (address < PAGE_SIZE)
                      printk(KERN_CONT "NULL pointer dereference");
              else
                      printk(KERN_CONT "paging request");
      
              printk(KERN_CONT " at %p\n", (void *) address);
              printk(KERN_ALERT "IP:");
              printk_address(regs->ip);
      
              dump_pagetable(address);
      }
      
      static noinline void
      pgtable_bad(struct pt_regs *regs, unsigned long error_code,
                  unsigned long address)
      {
              struct task_struct *tsk;
              unsigned long flags;
              int sig;
      
              flags = oops_begin();
              tsk = current;
              sig = SIGKILL;
      
              printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
                     tsk->comm, address);
              dump_pagetable(address);
      
              tsk->thread.cr2                = address;
              tsk->thread.trap_nr        = X86_TRAP_PF;
              tsk->thread.error_code        = error_code;
      
              if (__die("Bad pagetable", regs, error_code))
                      sig = 0;
      
              oops_end(flags, regs, sig);
      }
      
      static noinline void
      no_context(struct pt_regs *regs, unsigned long error_code,
                 unsigned long address, int signal, int si_code)
      {
              struct task_struct *tsk = current;
              unsigned long flags;
              int sig;
      
              /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs)) {
                /*
                 * Any interrupt that takes a fault gets the fixup. This makes
                 * the below recursive fault logic only apply to faults from
                 * task context.
                 */
                if (in_interrupt())
                        return;

                /*
                 * Per the above we're !in_interrupt(), aka. task context.
                 *
                 * In this case we need to make sure we're not recursively
                 * faulting through the emulate_vsyscall() logic.
                 */
                if (current_thread_info()->sig_on_uaccess_error && signal) {
                        tsk->thread.trap_nr = X86_TRAP_PF;
                        tsk->thread.error_code = error_code | PF_USER;
                        tsk->thread.cr2 = address;

                        /* XXX: hwpoison faults will set the wrong code. */
                        force_sig_info_fault(signal, si_code, address, tsk, 0);
                      }
      
                      /*
                       * Barring that, we can do the fixup and be happy.
                       */
                      return;
              }
      
              /*
               * 32-bit:
               *
               *   Valid to do another page fault here, because if this fault
               *   had been triggered by is_prefetch fixup_exception would have
               *   handled it.
               *
               * 64-bit:
               *
               *   Hall of shame of CPU/BIOS bugs.
               */
              if (is_prefetch(regs, error_code, address))
                      return;
      
              if (is_errata93(regs, address))
                      return;
      
              /*
               * Oops. The kernel tried to access some bad page. We'll have to
               * terminate things with extreme prejudice:
               */
              flags = oops_begin();
      
              show_fault_oops(regs, error_code, address);
      
              if (task_stack_end_corrupted(tsk))
                      printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
      
              tsk->thread.cr2                = address;
              tsk->thread.trap_nr        = X86_TRAP_PF;
              tsk->thread.error_code        = error_code;
      
              sig = SIGKILL;
              if (__die("Oops", regs, error_code))
                      sig = 0;
      
              /* Executive summary in case the body of the oops scrolled away */
              printk(KERN_DEFAULT "CR2: %016lx\n", address);
      
              oops_end(flags, regs, sig);
      }
      
      /*
       * Print out info about fatal segfaults, if the show_unhandled_signals
       * sysctl is set:
       */
      static inline void
      show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address, struct task_struct *tsk)
      {
              if (!unhandled_signal(tsk, SIGSEGV))
                      return;
      
              if (!printk_ratelimit())
                      return;
      
              printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                      task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                      tsk->comm, task_pid_nr(tsk), address,
                      (void *)regs->ip, (void *)regs->sp, error_code);
      
              print_vma_addr(KERN_CONT " in ", regs->ip);
      
              printk(KERN_CONT "\n");
      }
      
      static void
      __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                             unsigned long address, int si_code)
      {
              struct task_struct *tsk = current;
      
              /* User mode accesses just cause a SIGSEGV */
              if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here:
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space:
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

#ifdef CONFIG_X86_64
                /*
                 * Instruction fetch faults in the vsyscall page might need
                 * emulation.
                 */
                if (unlikely((error_code & PF_INSTR) &&
                             ((address & ~0xfff) == VSYSCALL_ADDR))) {
                        if (emulate_vsyscall(regs, address))
                                return;
                }
#endif
                /* Kernel addresses are always protection faults: */
                if (address >= TASK_SIZE)
                        error_code |= PF_PROT;

                if (likely(show_unhandled_signals))
                        show_signal_msg(regs, error_code, address, tsk);

                tsk->thread.cr2         = address;
                tsk->thread.error_code  = error_code;
                tsk->thread.trap_nr     = X86_TRAP_PF;

                force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);

                return;
        }

        if (is_f00f_bug(regs, address))
                return;

              no_context(regs, error_code, address, SIGSEGV, si_code);
      }
      
      static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int si_code)
{
        struct mm_struct *mm = current->mm;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        up_read(&mm->mmap_sem);

        __bad_area_nosemaphore(regs, error_code, address, si_code);
      }
      
      static noinline void
      bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
      {
              __bad_area(regs, error_code, address, SEGV_MAPERR);
      }
      
      static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
      {
              __bad_area(regs, error_code, address, SEGV_ACCERR);
      }
      
      static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          unsigned int fault)
      {
              struct task_struct *tsk = current;
              int code = BUS_ADRERR;
      
              /* Kernel mode? Handle exceptions or die: */
        if (!(error_code & PF_USER)) {
                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        tsk->thread.cr2         = address;
        tsk->thread.error_code  = error_code;
        tsk->thread.trap_nr     = X86_TRAP_PF;

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                      printk(KERN_ERR
              "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                              tsk->comm, tsk->pid, address);
                      code = BUS_MCEERR_AR;
              }
      #endif
              force_sig_info_fault(SIGBUS, code, address, tsk, fault);
      }
      
      static noinline void
      mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address, unsigned int fault)
      {
              if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
                      no_context(regs, error_code, address, 0, 0);
                      return;
              }
      
        if (fault & VM_FAULT_OOM) {
                /* Kernel mode? Handle exceptions or die: */
                if (!(error_code & PF_USER)) {
                        no_context(regs, error_code, address,
                                   SIGSEGV, SEGV_MAPERR);
                        return;
                }

                /*
                 * We ran out of memory, call the OOM killer, and return to
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed):
                 */
                pagefault_out_of_memory();
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
                        bad_area_nosemaphore(regs, error_code, address);
                else
                        BUG();
        }
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
      {
              if ((error_code & PF_WRITE) && !pte_write(*pte))
                      return 0;
      
              if ((error_code & PF_INSTR) && !pte_exec(*pte))
                      return 0;
      
              return 1;
      }
      
      /*
       * Handle a spurious fault caused by a stale TLB entry.
       *
       * This allows us to lazily refresh the TLB when increasing the
       * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
       * eagerly is very expensive since that implies doing a full
       * cross-processor TLB flush, even if no stale TLB entries exist
       * on other processors.
       *
       * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
       * and reserved bit (R = 1) faults are never spurious.
       *
       * There are no security implications to leaving a stale TLB when
       * increasing the permissions on a page.
       *
       * Returns non-zero if a spurious fault was handled, zero otherwise.
       *
       * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
       * (Optional Invalidation).
       */
      static noinline int
      spurious_fault(unsigned long error_code, unsigned long address)
      {
              pgd_t *pgd;
              pud_t *pud;
              pmd_t *pmd;
              pte_t *pte;
              int ret;
      
              /*
               * Only writes to RO or instruction fetches from NX may cause
               * spurious faults.
               *
               * These could be from user or supervisor accesses but the TLB
               * is only lazily flushed after a kernel mapping protection
               * change, so user accesses are not expected to cause spurious
               * faults.
               */
              if (error_code != (PF_WRITE | PF_PROT)
                  && error_code != (PF_INSTR | PF_PROT))
                      return 0;
      
        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
              if (!pud_present(*pud))
                      return 0;
      
              if (pud_large(*pud))
                      return spurious_fault_check(error_code, (pte_t *) pud);
      
              pmd = pmd_offset(pud, address);
              if (!pmd_present(*pmd))
                      return 0;
      
              if (pmd_large(*pmd))
                      return spurious_fault_check(error_code, (pte_t *) pmd);
      
              pte = pte_offset_kernel(pmd, address);
              if (!pte_present(*pte))
                      return 0;
      
              ret = spurious_fault_check(error_code, pte);
              if (!ret)
                      return 0;
      
              /*
               * Make sure we have permissions in PMD.
               * If not, then there's a bug in the page tables:
               */
              ret = spurious_fault_check(error_code, (pte_t *) pmd);
              WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
      
              return ret;
      }
      NOKPROBE_SYMBOL(spurious_fault);
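/*
 * Illustrative sketch (not part of the original file): the only error-code
 * combinations that reach the page-table walk in spurious_fault() above.  A
 * lazy RO->RW (or NX->X) upgrade of a kernel mapping leaves stale, more
 * restrictive TLB entries on other CPUs; their next write or fetch arrives as
 * PF_WRITE|PF_PROT or PF_INSTR|PF_PROT and is resolved locally instead of
 * paying for an eager cross-CPU flush.  The helper name is hypothetical and
 * the block is compiled out.
 */
#if 0
static bool example_may_be_spurious(unsigned long error_code)
{
        return error_code == (PF_WRITE | PF_PROT) ||
               error_code == (PF_INSTR | PF_PROT);
}
#endif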
      
      int show_unhandled_signals = 1;
      
      static inline int
      access_error(unsigned long error_code, struct vm_area_struct *vma)
      {
              if (error_code & PF_WRITE) {
                      /* write, present and write, not present: */
                      if (unlikely(!(vma->vm_flags & VM_WRITE)))
                              return 1;
                return 0;
        }

        /* read, present: */
        if (unlikely(error_code & PF_PROT))
                return 1;

        /* read, not present: */
        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
                return 1;

        return 0;
}

static int fault_in_kernel_space(unsigned long address)
      {
              return address >= TASK_SIZE_MAX;
      }
      
      static inline bool smap_violation(int error_code, struct pt_regs *regs)
      {
              if (!IS_ENABLED(CONFIG_X86_SMAP))
                      return false;
      
              if (!static_cpu_has(X86_FEATURE_SMAP))
                      return false;
      
              if (error_code & PF_USER)
                      return false;
      
              if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
                      return false;
      
              return true;
      }
      
      /*
       * This routine handles page faults.  It determines the address,
       * and the problem, and then passes it off to one of the appropriate
       * routines.
       *
       * This function must have noinline because both callers
       * {,trace_}do_page_fault() have notrace on. Having this an actual function
       * guarantees there's a function trace entry.
       */
      static noinline void
      __do_page_fault(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
      {
              struct vm_area_struct *vma;
              struct task_struct *tsk;
              struct mm_struct *mm;
              int fault, major = 0;
              unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
      
              tsk = current;
              mm = tsk->mm;
      
        /*
         * Detect and handle instructions that would cause a page fault for
         * both a tracked kernel page and a userspace page.
         */
              if (kmemcheck_active(regs))
                      kmemcheck_hide(regs);
              prefetchw(&mm->mmap_sem);
      
              if (unlikely(kmmio_fault(regs, address)))
                      return;
      
              /*
               * We fault-in kernel-space virtual memory on-demand. The
               * 'reference' page table is init_mm.pgd.
               *
               * NOTE! We MUST NOT take any locks for this case. We may
               * be in an interrupt or a critical region, and should
               * only copy the information from the master page table,
               * nothing more.
               *
               * This verifies that the fault happens in kernel space
               * (error_code & 4) == 0, and that the fault was not a
               * protection error (error_code & 9) == 0.
               */
              if (unlikely(fault_in_kernel_space(address))) {
                      if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
                              if (vmalloc_fault(address) >= 0)
                                      return;
      
                        if (kmemcheck_fault(regs, address, error_code))
                                return;
                }

                /* Can handle a stale RO->RW TLB: */
                if (spurious_fault(error_code, address))
                        return;

                /* kprobes don't want to hook the spurious faults: */
                if (kprobes_fault(regs))
                        return;
                      /*
                       * Don't take the mm semaphore here. If we fixup a prefetch
                       * fault we could otherwise deadlock:
                       */
                      bad_area_nosemaphore(regs, error_code, address);
      
                      return;
              }
      
              /* kprobes don't want to hook the spurious faults: */
              if (unlikely(kprobes_fault(regs)))
                      return;
      
              if (unlikely(error_code & PF_RSVD))
                      pgtable_bad(regs, error_code, address);
      
        if (unlikely(smap_violation(error_code, regs))) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * If we're in an interrupt, have no user context, or are running
         * in a region with pagefaults disabled, then we must not take
         * the fault.
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
               * It's safe to allow irq's after cr2 has been saved and the
               * vmalloc fault has been handled.
               *
               * User-mode registers count as a user access even for any
               * potential system fault or CPU buglet:
               */
        if (user_mode(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
                flags |= FAULT_FLAG_USER;
        } else {
                if (regs->flags & X86_EFLAGS_IF)
                        local_irq_enable();
        }

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        if (error_code & PF_WRITE)
                flags |= FAULT_FLAG_WRITE;

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in
         * the kernel and should generate an OOPS.  Unfortunately, in the
               * case of an erroneous fault occurring in a code path which already
               * holds mmap_sem we will deadlock attempting to validate the fault
               * against the address space.  Luckily the kernel only validly
               * references user space from well defined areas of code, which are
               * listed in the exceptions table.
               *
               * As the vast majority of faults will be valid we will only perform
               * the source reference check when there is a possibility of a
               * deadlock. Attempt to lock the address space, if we cannot we then
               * validate the source. If this is invalid we can skip the address
               * space check, thus avoiding the deadlock:
               */
              if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                      if ((error_code & PF_USER) == 0 &&
                          !search_exception_tables(regs->ip)) {
                              bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
retry:
                down_read(&mm->mmap_sem);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in
                 * which case we'll have missed the might_sleep() from
                 * down_read():
                 */
                might_sleep();
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535, $31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
      good_area:
              if (unlikely(access_error(error_code, vma))) {
                      bad_area_access_error(regs, error_code, address);
                      return;
              }
      
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
         */
        fault = handle_mm_fault(mm, vma, address, flags);
        major |= fault & VM_FAULT_MAJOR;

        /*
         * If we need to retry the mmap_sem has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely(fault & VM_FAULT_RETRY)) {
                /* Retry at most once */
                if (flags & FAULT_FLAG_ALLOW_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
                        flags |= FAULT_FLAG_TRIED;
                        if (!fatal_signal_pending(tsk))
                                goto retry;
                }

                /* User mode? Just return to handle the fatal exception */
                      if (flags & FAULT_FLAG_USER)
                              return;
      
                      /* Not returning to user mode? Handle exceptions or die: */
                      no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                      return;
              }
      
              up_read(&mm->mmap_sem);
              if (unlikely(fault & VM_FAULT_ERROR)) {
                      mm_fault_error(regs, error_code, address, fault);
                      return;
        }

        /*
         * Major/minor page fault accounting. If any of the events
         * returned VM_FAULT_MAJOR, we account it as a major fault.
         */
        if (major) {
                tsk->maj_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
        } else {
                tsk->min_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
        }

        check_v8086_mode(regs, address, tsk);
}
      NOKPROBE_SYMBOL(__do_page_fault);
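/*
 * Illustrative sketch (not part of the original file): the trylock-then-
 * validate pattern described in the long mmap_sem comment inside
 * __do_page_fault(), shown in isolation.  A kernel-mode fault whose IP is not
 * in the exception tables is reported instead of blocking on mmap_sem, which
 * avoids a deadlock when the faulting path already holds the semaphore.  The
 * helper name is hypothetical and the block is compiled out.
 */
#if 0
static bool example_lock_mm_or_bail(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long error_code)
{
        if (likely(down_read_trylock(&mm->mmap_sem)))
                return true;

        /* kernel fault from an unexpected IP: treat as a bug, don't block */
        if (!(error_code & PF_USER) && !search_exception_tables(regs->ip))
                return false;

        down_read(&mm->mmap_sem);
        return true;
}
#endif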
      
      dotraplinkage void notrace
      do_page_fault(struct pt_regs *regs, unsigned long error_code)
      {
              unsigned long address = read_cr2(); /* Get the faulting address */
              enum ctx_state prev_state;
      
        /*
         * We must have this function tagged with __kprobes, notrace and call
         * read_cr2() before calling anything else, to avoid calling any kind
         * of tracing machinery before we've observed the CR2 value.
         *
         * exception_{enter,exit}() contain all sorts of tracepoints.
         */
      
              prev_state = exception_enter();
              __do_page_fault(regs, error_code, address);
              exception_exit(prev_state);
      }
      NOKPROBE_SYMBOL(do_page_fault);
      
      #ifdef CONFIG_TRACING
      static nokprobe_inline void
      trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
                               unsigned long error_code)
      {
              if (user_mode(regs))
                      trace_page_fault_user(address, regs, error_code);
              else
                      trace_page_fault_kernel(address, regs, error_code);
      }
      
      dotraplinkage void notrace
      trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
      {
              /*
               * The exception_enter and tracepoint processing could
         * trigger another page fault (user space callchain
               * reading) and destroy the original cr2 value, so read
               * the faulting address now.
               */
              unsigned long address = read_cr2();
              enum ctx_state prev_state;
      
              prev_state = exception_enter();
              trace_page_fault_entries(address, regs, error_code);
              __do_page_fault(regs, error_code, address);
              exception_exit(prev_state);
      }
      NOKPROBE_SYMBOL(trace_do_page_fault);
      #endif /* CONFIG_TRACING */
      /*
       * Generic PPP layer for Linux.
       *
       * Copyright 1999-2002 Paul Mackerras.
       *
       *  This program is free software; you can redistribute it and/or
       *  modify it under the terms of the GNU General Public License
       *  as published by the Free Software Foundation; either version
       *  2 of the License, or (at your option) any later version.
       *
       * The generic PPP layer handles the PPP network interfaces, the
       * /dev/ppp device, packet and VJ compression, and multilink.
       * It talks to PPP `channels' via the interface defined in
       * include/linux/ppp_channel.h.  Channels provide the basic means for
       * sending and receiving PPP frames on some kind of communications
       * channel.
       *
       * Part of the code in this driver was inspired by the old async-only
       * PPP driver, written by Michael Callahan and Al Longyear, and
       * subsequently hacked by Paul Mackerras.
       *
       * ==FILEVERSION 20041108==
       */
      
      #include <linux/module.h>
      #include <linux/kernel.h>
      #include <linux/kmod.h>
      #include <linux/init.h>
      #include <linux/list.h>
      #include <linux/idr.h>
      #include <linux/netdevice.h>
      #include <linux/poll.h>
      #include <linux/ppp_defs.h>
      #include <linux/filter.h>
      #include <linux/ppp-ioctl.h>
      #include <linux/ppp_channel.h>
      #include <linux/ppp-comp.h>
      #include <linux/skbuff.h>
      #include <linux/rtnetlink.h>
      #include <linux/if_arp.h>
      #include <linux/ip.h>
      #include <linux/tcp.h>
      #include <linux/spinlock.h>
      #include <linux/rwsem.h>
      #include <linux/stddef.h>
      #include <linux/device.h>
      #include <linux/mutex.h>
      #include <linux/slab.h>
      #include <asm/unaligned.h>
      #include <net/slhc_vj.h>
      #include <linux/atomic.h>
      
      #include <linux/nsproxy.h>
      #include <net/net_namespace.h>
      #include <net/netns/generic.h>
      
      #define PPP_VERSION        "2.4.2"
      
      /*
       * Network protocols we support.
       */
      #define NP_IP        0                /* Internet Protocol V4 */
      #define NP_IPV6        1                /* Internet Protocol V6 */
      #define NP_IPX        2                /* IPX protocol */
      #define NP_AT        3                /* Appletalk protocol */
      #define NP_MPLS_UC 4                /* MPLS unicast */
      #define NP_MPLS_MC 5                /* MPLS multicast */
      #define NUM_NP        6                /* Number of NPs. */
      
      #define MPHDRLEN        6        /* multilink protocol header length */
      #define MPHDRLEN_SSN        4        /* ditto with short sequence numbers */
      
      /*
       * An instance of /dev/ppp can be associated with either a ppp
       * interface unit or a ppp channel.  In both cases, file->private_data
       * points to one of these.
       */
      struct ppp_file {
              enum {
                      INTERFACE=1, CHANNEL
              }                kind;
              struct sk_buff_head xq;                /* pppd transmit queue */
              struct sk_buff_head rq;                /* receive queue for pppd */
              wait_queue_head_t rwait;        /* for poll on reading /dev/ppp */
              atomic_t        refcnt;                /* # refs (incl /dev/ppp attached) */
              int                hdrlen;                /* space to leave for headers */
              int                index;                /* interface unit / channel number */
              int                dead;                /* unit/channel has been shut down */
      };
      
      #define PF_TO_X(pf, X)                container_of(pf, X, file)
      
      #define PF_TO_PPP(pf)                PF_TO_X(pf, struct ppp)
      #define PF_TO_CHANNEL(pf)        PF_TO_X(pf, struct channel)
      
      /*
       * Data structure to hold primary network stats for which
       * we want to use 64 bit storage.  Other network stats
 * are stored in dev->stats of the ppp structure.
       */
      struct ppp_link_stats {
              u64 rx_packets;
              u64 tx_packets;
              u64 rx_bytes;
              u64 tx_bytes;
      };
      
      /*
       * Data structure describing one ppp unit.
       * A ppp unit corresponds to a ppp network interface device
       * and represents a multilink bundle.
       * It can have 0 or more ppp channels connected to it.
       */
      struct ppp {
              struct ppp_file        file;                /* stuff for read/write/poll 0 */
              struct file        *owner;                /* file that owns this unit 48 */
              struct list_head channels;        /* list of attached channels 4c */
              int                n_channels;        /* how many channels are attached 54 */
              spinlock_t        rlock;                /* lock for receive side 58 */
              spinlock_t        wlock;                /* lock for transmit side 5c */
              int                mru;                /* max receive unit 60 */
              unsigned int        flags;                /* control bits 64 */
              unsigned int        xstate;                /* transmit state bits 68 */
              unsigned int        rstate;                /* receive state bits 6c */
              int                debug;                /* debug flags 70 */
              struct slcompress *vj;                /* state for VJ header compression */
              enum NPmode        npmode[NUM_NP];        /* what to do with each net proto 78 */
              struct sk_buff        *xmit_pending;        /* a packet ready to go out 88 */
              struct compressor *xcomp;        /* transmit packet compressor 8c */
              void                *xc_state;        /* its internal state 90 */
              struct compressor *rcomp;        /* receive decompressor 94 */
              void                *rc_state;        /* its internal state 98 */
              unsigned long        last_xmit;        /* jiffies when last pkt sent 9c */
              unsigned long        last_recv;        /* jiffies when last pkt rcvd a0 */
              struct net_device *dev;                /* network interface device a4 */
              int                closing;        /* is device closing down? a8 */
      #ifdef CONFIG_PPP_MULTILINK
              int                nxchan;                /* next channel to send something on */
              u32                nxseq;                /* next sequence number to send */
              int                mrru;                /* MP: max reconst. receive unit */
              u32                nextseq;        /* MP: seq no of next packet */
              u32                minseq;                /* MP: min of most recent seqnos */
              struct sk_buff_head mrq;        /* MP: receive reconstruction queue */
      #endif /* CONFIG_PPP_MULTILINK */
      #ifdef CONFIG_PPP_FILTER
              struct bpf_prog *pass_filter;        /* filter for packets to pass */
              struct bpf_prog *active_filter; /* filter for pkts to reset idle */
      #endif /* CONFIG_PPP_FILTER */
              struct net        *ppp_net;        /* the net we belong to */
              struct ppp_link_stats stats64;        /* 64 bit network stats */
      };
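/*
 * Illustrative sketch (not part of the original file): recovering the
 * enclosing object from the embedded struct ppp_file, as the PF_TO_X()
 * wrappers above do via container_of().  The helper name is hypothetical and
 * the block is compiled out; in the real read/write/ioctl paths,
 * file->private_data points at the ppp_file member.
 */
#if 0
static struct ppp *example_file_to_ppp(struct file *file)
{
        struct ppp_file *pf = file->private_data;

        return (pf && pf->kind == INTERFACE) ? PF_TO_PPP(pf) : NULL;
}
#endif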
      
      /*
       * Bits in flags: SC_NO_TCP_CCID, SC_CCP_OPEN, SC_CCP_UP, SC_LOOP_TRAFFIC,
       * SC_MULTILINK, SC_MP_SHORTSEQ, SC_MP_XSHORTSEQ, SC_COMP_TCP, SC_REJ_COMP_TCP,
       * SC_MUST_COMP
       * Bits in rstate: SC_DECOMP_RUN, SC_DC_ERROR, SC_DC_FERROR.
       * Bits in xstate: SC_COMP_RUN
       */
      #define SC_FLAG_BITS        (SC_NO_TCP_CCID|SC_CCP_OPEN|SC_CCP_UP|SC_LOOP_TRAFFIC \
                               |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
                               |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)
      
      /*
       * Private data structure for each channel.
       * This includes the data structure used for multilink.
       */
      struct channel {
              struct ppp_file        file;                /* stuff for read/write/poll */
              struct list_head list;                /* link in all/new_channels list */
              struct ppp_channel *chan;        /* public channel data structure */
              struct rw_semaphore chan_sem;        /* protects `chan' during chan ioctl */
              spinlock_t        downl;                /* protects `chan', file.xq dequeue */
              struct ppp        *ppp;                /* ppp unit we're connected to */
              struct net        *chan_net;        /* the net channel belongs to */
              struct list_head clist;                /* link in list of channels per unit */
              rwlock_t        upl;                /* protects `ppp' */
      #ifdef CONFIG_PPP_MULTILINK
              u8                avail;                /* flag used in multilink stuff */
              u8                had_frag;        /* >= 1 fragments have been sent */
              u32                lastseq;        /* MP: last sequence # received */
              int                speed;                /* speed of the corresponding ppp channel*/
      #endif /* CONFIG_PPP_MULTILINK */
      };
      
/*
 * SMP locking issues:
 * Both the ppp.rlock and ppp.wlock locks protect the ppp.channels
 * list and the ppp.n_channels field; you need to take both locks
 * before you modify them.
 * The lock ordering is: channel.upl -> ppp.wlock -> ppp.rlock ->
 * channel.downl (see the sketch below).
 */
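/*
 * Illustrative sketch (not part of the original file): nesting the locks in
 * the order stated above for a hypothetical operation that touches both a
 * unit and one of its channels.  The _bh variants and the helper name are
 * assumptions for the sketch; the block is compiled out.
 */
#if 0
static void example_lock_nesting(struct ppp *ppp, struct channel *pch)
{
        read_lock_bh(&pch->upl);        /* channel.upl first */
        spin_lock_bh(&ppp->wlock);      /* then ppp.wlock */
        spin_lock_bh(&ppp->rlock);      /* then ppp.rlock */
        spin_lock_bh(&pch->downl);      /* channel.downl last */

        /* ... modify ppp->channels / ppp->n_channels / pch->file.xq ... */

        spin_unlock_bh(&pch->downl);
        spin_unlock_bh(&ppp->rlock);
        spin_unlock_bh(&ppp->wlock);
        read_unlock_bh(&pch->upl);
}
#endif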
      
      static DEFINE_MUTEX(ppp_mutex);
      static atomic_t ppp_unit_count = ATOMIC_INIT(0);
      static atomic_t channel_count = ATOMIC_INIT(0);
      
      /* per-net private data for this module */
      static int ppp_net_id __read_mostly;
      struct ppp_net {
              /* units to ppp mapping */
              struct idr units_idr;
      
              /*
               * all_ppp_mutex protects the units_idr mapping.
               * It also ensures that finding a ppp unit in the units_idr
               * map and updating its file.refcnt field is atomic.
               */
              struct mutex all_ppp_mutex;
      
              /* channels */
              struct list_head all_channels;
              struct list_head new_channels;
              int last_channel_index;
      
              /*
               * all_channels_lock protects all_channels and
         * last_channel_index, and the atomicity of finding
         * a channel and updating its file.refcnt field.
               */
              spinlock_t all_channels_lock;
      };
      
      /* Get the PPP protocol number from a skb */
      #define PPP_PROTO(skb)        get_unaligned_be16((skb)->data)
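/*
 * Illustrative sketch (not part of the original file): PPP_PROTO() above reads
 * the 16-bit protocol field at the start of a frame, so a frame beginning
 * 0x00 0x21 carries PPP_IP (IPv4).  The helper name is hypothetical and the
 * block is compiled out.
 */
#if 0
static bool example_is_ipv4_frame(struct sk_buff *skb)
{
        return skb->len >= 2 && PPP_PROTO(skb) == PPP_IP;
}
#endif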
      
      /* We limit the length of ppp->file.rq to this (arbitrary) value */
      #define PPP_MAX_RQLEN        32
      
      /*
       * Maximum number of multilink fragments queued up.
       * This has to be large enough to cope with the maximum latency of
       * the slowest channel relative to the others.  Strictly it should
       * depend on the number of channels and their characteristics.
       */
      #define PPP_MP_MAX_QLEN        128
      
      /* Multilink header bits. */
      #define B        0x80                /* this fragment begins a packet */
      #define E        0x40                /* this fragment ends a packet */
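       /*
        * For example, a packet split into three fragments is tagged
        * B, 0, E in order; a packet that fits in a single fragment
        * carries both bits (B|E), as done in ppp_mp_explode() below.
        */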
      
      /* Compare multilink sequence numbers (assumed to be 32 bits wide) */
      #define seq_before(a, b)        ((s32)((a) - (b)) < 0)
      #define seq_after(a, b)                ((s32)((a) - (b)) > 0)
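       /*
        * The subtraction is done modulo 2^32, so the comparison keeps
        * working across sequence-number wrap-around: for example,
        * seq_before(0xfffffff0, 0x10) is true because the s32
        * difference is negative.
        */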
      
      /* Prototypes. */
      static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
                              struct file *file, unsigned int cmd, unsigned long arg);
      static void ppp_xmit_process(struct ppp *ppp);
      static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb);
      static void ppp_push(struct ppp *ppp);
      static void ppp_channel_push(struct channel *pch);
      static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb,
                                    struct channel *pch);
      static void ppp_receive_error(struct ppp *ppp);
      static void ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb);
      static struct sk_buff *ppp_decompress_frame(struct ppp *ppp,
                                                  struct sk_buff *skb);
      #ifdef CONFIG_PPP_MULTILINK
      static void ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb,
                                      struct channel *pch);
      static void ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb);
      static struct sk_buff *ppp_mp_reconstruct(struct ppp *ppp);
      static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb);
      #endif /* CONFIG_PPP_MULTILINK */
      static int ppp_set_compress(struct ppp *ppp, unsigned long arg);
      static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound);
      static void ppp_ccp_closed(struct ppp *ppp);
      static struct compressor *find_compressor(int type);
      static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st);
      static struct ppp *ppp_create_interface(struct net *net, int unit,
                                              struct file *file, int *retp);
      static void init_ppp_file(struct ppp_file *pf, int kind);
      static void ppp_destroy_interface(struct ppp *ppp);
      static struct ppp *ppp_find_unit(struct ppp_net *pn, int unit);
      static struct channel *ppp_find_channel(struct ppp_net *pn, int unit);
      static int ppp_connect_channel(struct channel *pch, int unit);
      static int ppp_disconnect_channel(struct channel *pch);
      static void ppp_destroy_channel(struct channel *pch);
      static int unit_get(struct idr *p, void *ptr);
      static int unit_set(struct idr *p, void *ptr, int n);
      static void unit_put(struct idr *p, int n);
      static void *unit_find(struct idr *p, int n);
      
      static const struct net_device_ops ppp_netdev_ops;
      
      static struct class *ppp_class;
      
      /* per net-namespace data */
      static inline struct ppp_net *ppp_pernet(struct net *net)
      {
              BUG_ON(!net);
      
              return net_generic(net, ppp_net_id);
      }
      
      /* Translates a PPP protocol number to a NP index (NP == network protocol) */
      static inline int proto_to_npindex(int proto)
      {
              switch (proto) {
              case PPP_IP:
                      return NP_IP;
              case PPP_IPV6:
                      return NP_IPV6;
              case PPP_IPX:
                      return NP_IPX;
              case PPP_AT:
                      return NP_AT;
              case PPP_MPLS_UC:
                      return NP_MPLS_UC;
              case PPP_MPLS_MC:
                      return NP_MPLS_MC;
              }
              return -EINVAL;
      }
      
      /* Translates an NP index into a PPP protocol number */
      static const int npindex_to_proto[NUM_NP] = {
              PPP_IP,
              PPP_IPV6,
              PPP_IPX,
              PPP_AT,
              PPP_MPLS_UC,
              PPP_MPLS_MC,
      };
      
      /* Translates an ethertype into an NP index */
      static inline int ethertype_to_npindex(int ethertype)
      {
              switch (ethertype) {
              case ETH_P_IP:
                      return NP_IP;
              case ETH_P_IPV6:
                      return NP_IPV6;
              case ETH_P_IPX:
                      return NP_IPX;
              case ETH_P_PPPTALK:
              case ETH_P_ATALK:
                      return NP_AT;
              case ETH_P_MPLS_UC:
                      return NP_MPLS_UC;
              case ETH_P_MPLS_MC:
                      return NP_MPLS_MC;
              }
              return -1;
      }
      
      /* Translates an NP index into an ethertype */
      static const int npindex_to_ethertype[NUM_NP] = {
              ETH_P_IP,
              ETH_P_IPV6,
              ETH_P_IPX,
              ETH_P_PPPTALK,
              ETH_P_MPLS_UC,
              ETH_P_MPLS_MC,
      };
      
      /*
       * Locking shorthand.
       */
      #define ppp_xmit_lock(ppp)        spin_lock_bh(&(ppp)->wlock)
      #define ppp_xmit_unlock(ppp)        spin_unlock_bh(&(ppp)->wlock)
      #define ppp_recv_lock(ppp)        spin_lock_bh(&(ppp)->rlock)
      #define ppp_recv_unlock(ppp)        spin_unlock_bh(&(ppp)->rlock)
      #define ppp_lock(ppp)                do { ppp_xmit_lock(ppp); \
                                           ppp_recv_lock(ppp); } while (0)
      #define ppp_unlock(ppp)                do { ppp_recv_unlock(ppp); \
                                           ppp_xmit_unlock(ppp); } while (0)
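       /*
        * Note that ppp_lock()/ppp_unlock() take wlock before rlock and
        * release them in the reverse order, matching the lock ordering
        * documented above.
        */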
      
      /*
       * /dev/ppp device routines.
       * The /dev/ppp device is used by pppd to control the ppp unit.
       * It supports the read, write, ioctl and poll functions.
       * Open instances of /dev/ppp can be in one of three states:
       * unattached, attached to a ppp unit, or attached to a ppp channel.
       */
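       /*
        * Typical userland sequence (illustrative sketch only, roughly
        * what pppd does):
        *
        *      fd = open("/dev/ppp", O_RDWR);
        *      ioctl(fd, PPPIOCNEWUNIT, &unit);
        *              (or PPPIOCATTACH / PPPIOCATTCHAN to attach to an
        *               existing unit or channel)
        *      read()/write() then move whole PPP frames, and poll()
        *      waits for traffic.
        */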
      static int ppp_open(struct inode *inode, struct file *file)
      {
              /*
               * This could (should?) be enforced by the permissions on /dev/ppp.
               */
               if (!capable(CAP_NET_ADMIN))
                      return -EPERM;
              return 0;
      }
      
      static int ppp_release(struct inode *unused, struct file *file)
      {
              struct ppp_file *pf = file->private_data;
              struct ppp *ppp;
      
              if (pf) {
                      file->private_data = NULL;
                      if (pf->kind == INTERFACE) {
                              ppp = PF_TO_PPP(pf);
                              rtnl_lock();
                              if (file == ppp->owner)
                                      unregister_netdevice(ppp->dev);
                              rtnl_unlock();
                      }
                      if (atomic_dec_and_test(&pf->refcnt)) {
                              switch (pf->kind) {
                              case INTERFACE:
                                      ppp_destroy_interface(PF_TO_PPP(pf));
                                      break;
                              case CHANNEL:
                                      ppp_destroy_channel(PF_TO_CHANNEL(pf));
                                      break;
                              }
                      }
              }
              return 0;
      }
      
      static ssize_t ppp_read(struct file *file, char __user *buf,
                              size_t count, loff_t *ppos)
      {
              struct ppp_file *pf = file->private_data;
              DECLARE_WAITQUEUE(wait, current);
              ssize_t ret;
              struct sk_buff *skb = NULL;
              struct iovec iov;
              struct iov_iter to;
      
              ret = count;
      
              if (!pf)
                      return -ENXIO;
              add_wait_queue(&pf->rwait, &wait);
              for (;;) {
                      set_current_state(TASK_INTERRUPTIBLE);
                      skb = skb_dequeue(&pf->rq);
                      if (skb)
                              break;
                      ret = 0;
                      if (pf->dead)
                              break;
                      if (pf->kind == INTERFACE) {
                              /*
                               * Return 0 (EOF) on an interface that has no
                               * channels connected, unless it is looping
                               * network traffic (demand mode).
                               */
                              struct ppp *ppp = PF_TO_PPP(pf);
                              if (ppp->n_channels == 0 &&
                                  (ppp->flags & SC_LOOP_TRAFFIC) == 0)
                                      break;
                      }
                      ret = -EAGAIN;
                      if (file->f_flags & O_NONBLOCK)
                              break;
                      ret = -ERESTARTSYS;
                      if (signal_pending(current))
                              break;
                      schedule();
              }
              set_current_state(TASK_RUNNING);
              remove_wait_queue(&pf->rwait, &wait);
      
              if (!skb)
                      goto out;
      
              ret = -EOVERFLOW;
              if (skb->len > count)
                      goto outf;
              ret = -EFAULT;
              iov.iov_base = buf;
              iov.iov_len = count;
              iov_iter_init(&to, READ, &iov, 1, count);
              if (skb_copy_datagram_iter(skb, 0, &to, skb->len))
                      goto outf;
              ret = skb->len;
      
       outf:
              kfree_skb(skb);
       out:
              return ret;
      }
      
      static ssize_t ppp_write(struct file *file, const char __user *buf,
                               size_t count, loff_t *ppos)
      {
              struct ppp_file *pf = file->private_data;
              struct sk_buff *skb;
              ssize_t ret;
      
              if (!pf)
                      return -ENXIO;
              ret = -ENOMEM;
              skb = alloc_skb(count + pf->hdrlen, GFP_KERNEL);
              if (!skb)
                      goto out;
              skb_reserve(skb, pf->hdrlen);
              ret = -EFAULT;
              if (copy_from_user(skb_put(skb, count), buf, count)) {
                      kfree_skb(skb);
                      goto out;
              }
      
              skb_queue_tail(&pf->xq, skb);
      
              switch (pf->kind) {
              case INTERFACE:
                      ppp_xmit_process(PF_TO_PPP(pf));
                      break;
              case CHANNEL:
                      ppp_channel_push(PF_TO_CHANNEL(pf));
                      break;
              }
      
              ret = count;
      
       out:
              return ret;
      }
      
      /* No kernel lock - fine */
      static unsigned int ppp_poll(struct file *file, poll_table *wait)
      {
              struct ppp_file *pf = file->private_data;
              unsigned int mask;
      
              if (!pf)
                      return 0;
              poll_wait(file, &pf->rwait, wait);
              mask = POLLOUT | POLLWRNORM;
              if (skb_peek(&pf->rq))
                      mask |= POLLIN | POLLRDNORM;
              if (pf->dead)
                      mask |= POLLHUP;
              else if (pf->kind == INTERFACE) {
                      /* see comment in ppp_read */
                      struct ppp *ppp = PF_TO_PPP(pf);
                      if (ppp->n_channels == 0 &&
                          (ppp->flags & SC_LOOP_TRAFFIC) == 0)
                              mask |= POLLIN | POLLRDNORM;
              }
      
              return mask;
      }
      
      #ifdef CONFIG_PPP_FILTER
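       /*
        * Copy a classic BPF filter program in from userland.  Returns
        * the number of instructions (0 for an empty program, with *p
        * set to NULL) or a negative errno.  On success *p points to a
        * kmalloc'd copy that the caller must kfree() once it has built
        * a bpf_prog from it (see PPPIOCSPASS/PPPIOCSACTIVE below).
        */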
      static int get_filter(void __user *arg, struct sock_filter **p)
      {
              struct sock_fprog uprog;
              struct sock_filter *code = NULL;
              int len;
      
              if (copy_from_user(&uprog, arg, sizeof(uprog)))
                      return -EFAULT;
      
              if (!uprog.len) {
                      *p = NULL;
                      return 0;
              }
      
              len = uprog.len * sizeof(struct sock_filter);
              code = memdup_user(uprog.filter, len);
              if (IS_ERR(code))
                      return PTR_ERR(code);
      
              *p = code;
              return uprog.len;
      }
      #endif /* CONFIG_PPP_FILTER */
      
      static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
      {
              struct ppp_file *pf;
              struct ppp *ppp;
              int err = -EFAULT, val, val2, i;
              struct ppp_idle idle;
              struct npioctl npi;
              int unit, cflags;
              struct slcompress *vj;
              void __user *argp = (void __user *)arg;
              int __user *p = argp;
      
              mutex_lock(&ppp_mutex);
      
              pf = file->private_data;
              if (!pf) {
                      err = ppp_unattached_ioctl(current->nsproxy->net_ns,
                                                 pf, file, cmd, arg);
                      goto out;
              }
      
              if (cmd == PPPIOCDETACH) {
                      /*
                       * We have to be careful here... if the file descriptor
                       * has been dup'd, we could have another process in the
                       * middle of a poll using the same file *, so we had
                       * better not free the interface data structures -
                       * instead we fail the ioctl.  Even in this case, we
                       * shut down the interface if we are the owner of it.
                        * Actually, we should get rid of PPPIOCDETACH; userland
                       * (i.e. pppd) could achieve the same effect by closing
                       * this fd and reopening /dev/ppp.
                       */
                      err = -EINVAL;
                      if (pf->kind == INTERFACE) {
                              ppp = PF_TO_PPP(pf);
                              rtnl_lock();
                              if (file == ppp->owner)
                                      unregister_netdevice(ppp->dev);
                              rtnl_unlock();
                      }
                      if (atomic_long_read(&file->f_count) < 2) {
                              ppp_release(NULL, file);
                              err = 0;
                      } else
                              pr_warn("PPPIOCDETACH file->f_count=%ld\n",
                                      atomic_long_read(&file->f_count));
                      goto out;
              }
      
              if (pf->kind == CHANNEL) {
                      struct channel *pch;
                      struct ppp_channel *chan;
      
                      pch = PF_TO_CHANNEL(pf);
      
                      switch (cmd) {
                      case PPPIOCCONNECT:
                              if (get_user(unit, p))
                                      break;
                              err = ppp_connect_channel(pch, unit);
                              break;
      
                      case PPPIOCDISCONN:
                              err = ppp_disconnect_channel(pch);
                              break;
      
                      default:
                              down_read(&pch->chan_sem);
                              chan = pch->chan;
                              err = -ENOTTY;
                              if (chan && chan->ops->ioctl)
                                      err = chan->ops->ioctl(chan, cmd, arg);
                              up_read(&pch->chan_sem);
                      }
                      goto out;
              }
      
              if (pf->kind != INTERFACE) {
                      /* can't happen */
                      pr_err("PPP: not interface or channel??\n");
                      err = -EINVAL;
                      goto out;
              }
      
              ppp = PF_TO_PPP(pf);
              switch (cmd) {
              case PPPIOCSMRU:
                      if (get_user(val, p))
                              break;
                      ppp->mru = val;
                      err = 0;
                      break;
      
              case PPPIOCSFLAGS:
                      if (get_user(val, p))
                              break;
                      ppp_lock(ppp);
                      cflags = ppp->flags & ~val;
      #ifdef CONFIG_PPP_MULTILINK
                      if (!(ppp->flags & SC_MULTILINK) && (val & SC_MULTILINK))
                              ppp->nextseq = 0;
      #endif
                      ppp->flags = val & SC_FLAG_BITS;
                      ppp_unlock(ppp);
                      if (cflags & SC_CCP_OPEN)
                              ppp_ccp_closed(ppp);
                      err = 0;
                      break;
      
              case PPPIOCGFLAGS:
                      val = ppp->flags | ppp->xstate | ppp->rstate;
                      if (put_user(val, p))
                              break;
                      err = 0;
                      break;
      
              case PPPIOCSCOMPRESS:
                      err = ppp_set_compress(ppp, arg);
                      break;
      
              case PPPIOCGUNIT:
                      if (put_user(ppp->file.index, p))
                              break;
                      err = 0;
                      break;
      
              case PPPIOCSDEBUG:
                      if (get_user(val, p))
                              break;
                      ppp->debug = val;
                      err = 0;
                      break;
      
              case PPPIOCGDEBUG:
                      if (put_user(ppp->debug, p))
                              break;
                      err = 0;
                      break;
      
              case PPPIOCGIDLE:
                      idle.xmit_idle = (jiffies - ppp->last_xmit) / HZ;
                      idle.recv_idle = (jiffies - ppp->last_recv) / HZ;
                      if (copy_to_user(argp, &idle, sizeof(idle)))
                              break;
                      err = 0;
                      break;
      
              case PPPIOCSMAXCID:
                      if (get_user(val, p))
                              break;
                      val2 = 15;
                      if ((val >> 16) != 0) {
                              val2 = val >> 16;
                              val &= 0xffff;
                      }
                      vj = slhc_init(val2+1, val+1);
                      if (IS_ERR(vj)) {
                              err = PTR_ERR(vj);
                              break;
                      }
                      ppp_lock(ppp);
                      if (ppp->vj)
                              slhc_free(ppp->vj);
                      ppp->vj = vj;
                      ppp_unlock(ppp);
                      err = 0;
                      break;
      
              case PPPIOCGNPMODE:
              case PPPIOCSNPMODE:
                      if (copy_from_user(&npi, argp, sizeof(npi)))
                              break;
                      err = proto_to_npindex(npi.protocol);
                      if (err < 0)
                              break;
                      i = err;
                      if (cmd == PPPIOCGNPMODE) {
                              err = -EFAULT;
                              npi.mode = ppp->npmode[i];
                              if (copy_to_user(argp, &npi, sizeof(npi)))
                                      break;
                      } else {
                              ppp->npmode[i] = npi.mode;
                              /* we may be able to transmit more packets now (??) */
                              netif_wake_queue(ppp->dev);
                      }
                      err = 0;
                      break;
      
      #ifdef CONFIG_PPP_FILTER
              case PPPIOCSPASS:
              {
                      struct sock_filter *code;
      
                      err = get_filter(argp, &code);
                      if (err >= 0) {
                              struct bpf_prog *pass_filter = NULL;
                              struct sock_fprog_kern fprog = {
                                      .len = err,
                                      .filter = code,
                              };
      
                              err = 0;
                              if (fprog.filter)
                                      err = bpf_prog_create(&pass_filter, &fprog);
                              if (!err) {
                                      ppp_lock(ppp);
                                      if (ppp->pass_filter)
                                              bpf_prog_destroy(ppp->pass_filter);
                                      ppp->pass_filter = pass_filter;
                                      ppp_unlock(ppp);
                              }
                              kfree(code);
                      }
                      break;
              }
              case PPPIOCSACTIVE:
              {
                      struct sock_filter *code;
      
                      err = get_filter(argp, &code);
                      if (err >= 0) {
                              struct bpf_prog *active_filter = NULL;
                              struct sock_fprog_kern fprog = {
                                      .len = err,
                                      .filter = code,
                              };
      
                              err = 0;
                              if (fprog.filter)
                                      err = bpf_prog_create(&active_filter, &fprog);
                              if (!err) {
                                      ppp_lock(ppp);
                                      if (ppp->active_filter)
                                              bpf_prog_destroy(ppp->active_filter);
                                      ppp->active_filter = active_filter;
                                      ppp_unlock(ppp);
                              }
                              kfree(code);
                      }
                      break;
              }
      #endif /* CONFIG_PPP_FILTER */
      
      #ifdef CONFIG_PPP_MULTILINK
              case PPPIOCSMRRU:
                      if (get_user(val, p))
                              break;
                      ppp_recv_lock(ppp);
                      ppp->mrru = val;
                      ppp_recv_unlock(ppp);
                      err = 0;
                      break;
      #endif /* CONFIG_PPP_MULTILINK */
      
              default:
                      err = -ENOTTY;
              }
      
      out:
              mutex_unlock(&ppp_mutex);
      
              return err;
      }
      
      static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
                              struct file *file, unsigned int cmd, unsigned long arg)
      {
              int unit, err = -EFAULT;
              struct ppp *ppp;
              struct channel *chan;
              struct ppp_net *pn;
              int __user *p = (int __user *)arg;
      
              switch (cmd) {
              case PPPIOCNEWUNIT:
                      /* Create a new ppp unit */
                      if (get_user(unit, p))
                              break;
                      ppp = ppp_create_interface(net, unit, file, &err);
                      if (!ppp)
                              break;
                      file->private_data = &ppp->file;
                      err = -EFAULT;
                      if (put_user(ppp->file.index, p))
                              break;
                      err = 0;
                      break;
      
              case PPPIOCATTACH:
                      /* Attach to an existing ppp unit */
                      if (get_user(unit, p))
                              break;
                      err = -ENXIO;
                      pn = ppp_pernet(net);
                      mutex_lock(&pn->all_ppp_mutex);
                      ppp = ppp_find_unit(pn, unit);
                      if (ppp) {
                              atomic_inc(&ppp->file.refcnt);
                              file->private_data = &ppp->file;
                              err = 0;
                      }
                      mutex_unlock(&pn->all_ppp_mutex);
                      break;
      
              case PPPIOCATTCHAN:
                      if (get_user(unit, p))
                              break;
                      err = -ENXIO;
                      pn = ppp_pernet(net);
                      spin_lock_bh(&pn->all_channels_lock);
                      chan = ppp_find_channel(pn, unit);
                      if (chan) {
                              atomic_inc(&chan->file.refcnt);
                              file->private_data = &chan->file;
                              err = 0;
                      }
                      spin_unlock_bh(&pn->all_channels_lock);
                      break;
      
              default:
                      err = -ENOTTY;
              }
      
              return err;
      }
      
      static const struct file_operations ppp_device_fops = {
              .owner                = THIS_MODULE,
              .read                = ppp_read,
              .write                = ppp_write,
              .poll                = ppp_poll,
              .unlocked_ioctl        = ppp_ioctl,
              .open                = ppp_open,
              .release        = ppp_release,
              .llseek                = noop_llseek,
      };
      
      static __net_init int ppp_init_net(struct net *net)
      {
               struct ppp_net *pn = net_generic(net, ppp_net_id);
      
              idr_init(&pn->units_idr);
              mutex_init(&pn->all_ppp_mutex);
      
              INIT_LIST_HEAD(&pn->all_channels);
              INIT_LIST_HEAD(&pn->new_channels);
      
              spin_lock_init(&pn->all_channels_lock);
      
              return 0;
      }
      
      static __net_exit void ppp_exit_net(struct net *net)
      {
              struct ppp_net *pn = net_generic(net, ppp_net_id);
              struct net_device *dev;
              struct net_device *aux;
              struct ppp *ppp;
              LIST_HEAD(list);
              int id;
      
              rtnl_lock();
              for_each_netdev_safe(net, dev, aux) {
                      if (dev->netdev_ops == &ppp_netdev_ops)
                              unregister_netdevice_queue(dev, &list);
              }
      
              idr_for_each_entry(&pn->units_idr, ppp, id)
                      /* Skip devices already unregistered by previous loop */
                      if (!net_eq(dev_net(ppp->dev), net))
                              unregister_netdevice_queue(ppp->dev, &list);
      
              unregister_netdevice_many(&list);
              rtnl_unlock();
      
              mutex_destroy(&pn->all_ppp_mutex);
              idr_destroy(&pn->units_idr);
      }
      
      static struct pernet_operations ppp_net_ops = {
              .init = ppp_init_net,
              .exit = ppp_exit_net,
              .id   = &ppp_net_id,
              .size = sizeof(struct ppp_net),
      };
      
      #define PPP_MAJOR        108
      
      /* Called at boot time if ppp is compiled into the kernel,
         or at module load time (from init_module) if compiled as a module. */
      static int __init ppp_init(void)
      {
              int err;
      
              pr_info("PPP generic driver version " PPP_VERSION "\n");
      
              err = register_pernet_device(&ppp_net_ops);
              if (err) {
                      pr_err("failed to register PPP pernet device (%d)\n", err);
                      goto out;
              }
      
              err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops);
              if (err) {
                      pr_err("failed to register PPP device (%d)\n", err);
                      goto out_net;
              }
      
              ppp_class = class_create(THIS_MODULE, "ppp");
              if (IS_ERR(ppp_class)) {
                      err = PTR_ERR(ppp_class);
                      goto out_chrdev;
              }
      
              /* not a big deal if we fail here :-) */
              device_create(ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp");
      
              return 0;
      
      out_chrdev:
              unregister_chrdev(PPP_MAJOR, "ppp");
      out_net:
              unregister_pernet_device(&ppp_net_ops);
      out:
              return err;
      }
      
      /*
       * Network interface unit routines.
       */
      static netdev_tx_t
      ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
      {
              struct ppp *ppp = netdev_priv(dev);
              int npi, proto;
              unsigned char *pp;
      
              npi = ethertype_to_npindex(ntohs(skb->protocol));
              if (npi < 0)
                      goto outf;
      
              /* Drop, accept or reject the packet */
              switch (ppp->npmode[npi]) {
              case NPMODE_PASS:
                      break;
              case NPMODE_QUEUE:
                      /* it would be nice to have a way to tell the network
                         system to queue this one up for later. */
                      goto outf;
              case NPMODE_DROP:
              case NPMODE_ERROR:
                      goto outf;
              }
      
              /* Put the 2-byte PPP protocol number on the front,
                 making sure there is room for the address and control fields. */
              if (skb_cow_head(skb, PPP_HDRLEN))
                      goto outf;
      
              pp = skb_push(skb, 2);
              proto = npindex_to_proto[npi];
              put_unaligned_be16(proto, pp);
      
              skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev)));
              skb_queue_tail(&ppp->file.xq, skb);
              ppp_xmit_process(ppp);
              return NETDEV_TX_OK;
      
       outf:
              kfree_skb(skb);
              ++dev->stats.tx_dropped;
              return NETDEV_TX_OK;
      }
      
      static int
      ppp_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
      {
              struct ppp *ppp = netdev_priv(dev);
              int err = -EFAULT;
              void __user *addr = (void __user *) ifr->ifr_ifru.ifru_data;
              struct ppp_stats stats;
              struct ppp_comp_stats cstats;
              char *vers;
      
              switch (cmd) {
              case SIOCGPPPSTATS:
                      ppp_get_stats(ppp, &stats);
                      if (copy_to_user(addr, &stats, sizeof(stats)))
                              break;
                      err = 0;
                      break;
      
              case SIOCGPPPCSTATS:
                      memset(&cstats, 0, sizeof(cstats));
                      if (ppp->xc_state)
                              ppp->xcomp->comp_stat(ppp->xc_state, &cstats.c);
                      if (ppp->rc_state)
                              ppp->rcomp->decomp_stat(ppp->rc_state, &cstats.d);
                      if (copy_to_user(addr, &cstats, sizeof(cstats)))
                              break;
                      err = 0;
                      break;
      
              case SIOCGPPPVER:
                      vers = PPP_VERSION;
                      if (copy_to_user(addr, vers, strlen(vers) + 1))
                              break;
                      err = 0;
                      break;
      
              default:
                      err = -EINVAL;
              }
      
              return err;
      }
      
      static struct rtnl_link_stats64*
      ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
      {
              struct ppp *ppp = netdev_priv(dev);
      
              ppp_recv_lock(ppp);
              stats64->rx_packets = ppp->stats64.rx_packets;
              stats64->rx_bytes   = ppp->stats64.rx_bytes;
              ppp_recv_unlock(ppp);
      
              ppp_xmit_lock(ppp);
              stats64->tx_packets = ppp->stats64.tx_packets;
              stats64->tx_bytes   = ppp->stats64.tx_bytes;
              ppp_xmit_unlock(ppp);
      
              stats64->rx_errors        = dev->stats.rx_errors;
              stats64->tx_errors        = dev->stats.tx_errors;
              stats64->rx_dropped       = dev->stats.rx_dropped;
              stats64->tx_dropped       = dev->stats.tx_dropped;
              stats64->rx_length_errors = dev->stats.rx_length_errors;
      
              return stats64;
      }
      
      static struct lock_class_key ppp_tx_busylock;
      static int ppp_dev_init(struct net_device *dev)
      {
              struct ppp *ppp;
      
              dev->qdisc_tx_busylock = &ppp_tx_busylock;
      
              ppp = netdev_priv(dev);
              /* Let the netdevice take a reference on the ppp file. This ensures
               * that ppp_destroy_interface() won't run before the device gets
               * unregistered.
               */
              atomic_inc(&ppp->file.refcnt);
      
              return 0;
      }
      
      static void ppp_dev_uninit(struct net_device *dev)
      {
              struct ppp *ppp = netdev_priv(dev);
              struct ppp_net *pn = ppp_pernet(ppp->ppp_net);
      
              ppp_lock(ppp);
              ppp->closing = 1;
              ppp_unlock(ppp);
      
              mutex_lock(&pn->all_ppp_mutex);
              unit_put(&pn->units_idr, ppp->file.index);
              mutex_unlock(&pn->all_ppp_mutex);
      
              ppp->owner = NULL;
      
              ppp->file.dead = 1;
              wake_up_interruptible(&ppp->file.rwait);
      }
      
      static void ppp_dev_priv_destructor(struct net_device *dev)
      {
              struct ppp *ppp;
      
              ppp = netdev_priv(dev);
              if (atomic_dec_and_test(&ppp->file.refcnt))
                      ppp_destroy_interface(ppp);
      }
      
      static const struct net_device_ops ppp_netdev_ops = {
              .ndo_init         = ppp_dev_init,
              .ndo_uninit      = ppp_dev_uninit,
              .ndo_start_xmit  = ppp_start_xmit,
              .ndo_do_ioctl    = ppp_net_ioctl,
              .ndo_get_stats64 = ppp_get_stats64,
      };
      
      static void ppp_setup(struct net_device *dev)
      {
              dev->netdev_ops = &ppp_netdev_ops;
              dev->hard_header_len = PPP_HDRLEN;
              dev->mtu = PPP_MRU;
              dev->addr_len = 0;
              dev->tx_queue_len = 3;
              dev->type = ARPHRD_PPP;
              dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
              dev->destructor = ppp_dev_priv_destructor;
              netif_keep_dst(dev);
      }
      
      /*
       * Transmit-side routines.
       */
      
      /*
       * Called to do any work queued up on the transmit side
       * that can now be done.
       */
      static void
      ppp_xmit_process(struct ppp *ppp)
      {
              struct sk_buff *skb;
      
              ppp_xmit_lock(ppp);
              if (!ppp->closing) {
                      ppp_push(ppp);
                      while (!ppp->xmit_pending &&
                             (skb = skb_dequeue(&ppp->file.xq)))
                              ppp_send_frame(ppp, skb);
                      /* If there's no work left to do, tell the core net
                         code that we can accept some more. */
                      if (!ppp->xmit_pending && !skb_peek(&ppp->file.xq))
                              netif_wake_queue(ppp->dev);
                      else
                              netif_stop_queue(ppp->dev);
              }
              ppp_xmit_unlock(ppp);
      }
      
      static inline struct sk_buff *
      pad_compress_skb(struct ppp *ppp, struct sk_buff *skb)
      {
              struct sk_buff *new_skb;
              int len;
              int new_skb_size = ppp->dev->mtu +
                      ppp->xcomp->comp_extra + ppp->dev->hard_header_len;
              int compressor_skb_size = ppp->dev->mtu +
                      ppp->xcomp->comp_extra + PPP_HDRLEN;
              new_skb = alloc_skb(new_skb_size, GFP_ATOMIC);
              if (!new_skb) {
                      if (net_ratelimit())
                              netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n");
                      return NULL;
              }
              if (ppp->dev->hard_header_len > PPP_HDRLEN)
                      skb_reserve(new_skb,
                                  ppp->dev->hard_header_len - PPP_HDRLEN);
      
              /* compressor still expects A/C bytes in hdr */
              len = ppp->xcomp->compress(ppp->xc_state, skb->data - 2,
                                         new_skb->data, skb->len + 2,
                                         compressor_skb_size);
              if (len > 0 && (ppp->flags & SC_CCP_UP)) {
                      consume_skb(skb);
                      skb = new_skb;
                      skb_put(skb, len);
                      skb_pull(skb, 2);        /* pull off A/C bytes */
              } else if (len == 0) {
                      /* didn't compress, or CCP not up yet */
                      consume_skb(new_skb);
                      new_skb = skb;
              } else {
                      /*
                       * (len < 0)
                       * MPPE requires that we do not send unencrypted
                       * frames.  The compressor will return -1 if we
                       * should drop the frame.  We cannot simply test
                       * the compress_proto because MPPE and MPPC share
                       * the same number.
                       */
                      if (net_ratelimit())
                              netdev_err(ppp->dev, "ppp: compressor dropped pkt\n");
                      kfree_skb(skb);
                      consume_skb(new_skb);
                      new_skb = NULL;
              }
              return new_skb;
      }
      
      /*
       * Compress and send a frame.
       * The caller should have locked the xmit path,
       * and xmit_pending should be 0.
       */
      static void
      ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
      {
              int proto = PPP_PROTO(skb);
              struct sk_buff *new_skb;
              int len;
              unsigned char *cp;
      
              if (proto < 0x8000) {
      #ifdef CONFIG_PPP_FILTER
                      /* check if we should pass this packet */
                      /* the filter instructions are constructed assuming
                         a four-byte PPP header on each packet */
                      *skb_push(skb, 2) = 1;
                      if (ppp->pass_filter &&
                          BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
                              if (ppp->debug & 1)
                                      netdev_printk(KERN_DEBUG, ppp->dev,
                                                    "PPP: outbound frame "
                                                    "not passed\n");
                              kfree_skb(skb);
                              return;
                      }
                      /* if this packet passes the active filter, record the time */
                      if (!(ppp->active_filter &&
                            BPF_PROG_RUN(ppp->active_filter, skb) == 0))
                              ppp->last_xmit = jiffies;
                      skb_pull(skb, 2);
      #else
                      /* for data packets, record the time */
                      ppp->last_xmit = jiffies;
      #endif /* CONFIG_PPP_FILTER */
              }
      
              ++ppp->stats64.tx_packets;
              ppp->stats64.tx_bytes += skb->len - 2;
      
              switch (proto) {
              case PPP_IP:
                      if (!ppp->vj || (ppp->flags & SC_COMP_TCP) == 0)
                              break;
                      /* try to do VJ TCP header compression */
                      new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2,
                                          GFP_ATOMIC);
                      if (!new_skb) {
                              netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n");
                              goto drop;
                      }
                      skb_reserve(new_skb, ppp->dev->hard_header_len - 2);
                      cp = skb->data + 2;
                      len = slhc_compress(ppp->vj, cp, skb->len - 2,
                                          new_skb->data + 2, &cp,
                                          !(ppp->flags & SC_NO_TCP_CCID));
                      if (cp == skb->data + 2) {
                              /* didn't compress */
                              consume_skb(new_skb);
                      } else {
                              if (cp[0] & SL_TYPE_COMPRESSED_TCP) {
                                      proto = PPP_VJC_COMP;
                                      cp[0] &= ~SL_TYPE_COMPRESSED_TCP;
                              } else {
                                      proto = PPP_VJC_UNCOMP;
                                      cp[0] = skb->data[2];
                              }
                              consume_skb(skb);
                              skb = new_skb;
                              cp = skb_put(skb, len + 2);
                              cp[0] = 0;
                              cp[1] = proto;
                      }
                      break;
      
              case PPP_CCP:
                      /* peek at outbound CCP frames */
                      ppp_ccp_peek(ppp, skb, 0);
                      break;
              }
      
              /* try to do packet compression */
              if ((ppp->xstate & SC_COMP_RUN) && ppp->xc_state &&
                  proto != PPP_LCP && proto != PPP_CCP) {
                      if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) {
                              if (net_ratelimit())
                                      netdev_err(ppp->dev,
                                                 "ppp: compression required but "
                                                 "down - pkt dropped.\n");
                              goto drop;
                      }
                      skb = pad_compress_skb(ppp, skb);
                      if (!skb)
                              goto drop;
              }
      
              /*
               * If we are waiting for traffic (demand dialling),
               * queue it up for pppd to receive.
               */
              if (ppp->flags & SC_LOOP_TRAFFIC) {
                      if (ppp->file.rq.qlen > PPP_MAX_RQLEN)
                              goto drop;
                      skb_queue_tail(&ppp->file.rq, skb);
                      wake_up_interruptible(&ppp->file.rwait);
                      return;
              }
      
              ppp->xmit_pending = skb;
              ppp_push(ppp);
              return;
      
       drop:
              kfree_skb(skb);
              ++ppp->dev->stats.tx_errors;
      }
      
      /*
       * Try to send the frame in xmit_pending.
       * The caller should have the xmit path locked.
       */
      static void
      ppp_push(struct ppp *ppp)
      {
              struct list_head *list;
              struct channel *pch;
              struct sk_buff *skb = ppp->xmit_pending;
      
              if (!skb)
                      return;
      
              list = &ppp->channels;
              if (list_empty(list)) {
                      /* nowhere to send the packet, just drop it */
                      ppp->xmit_pending = NULL;
                      kfree_skb(skb);
                      return;
              }
      
              if ((ppp->flags & SC_MULTILINK) == 0) {
                      /* not doing multilink: send it down the first channel */
                      list = list->next;
                      pch = list_entry(list, struct channel, clist);
      
                      spin_lock_bh(&pch->downl);
                      if (pch->chan) {
                              if (pch->chan->ops->start_xmit(pch->chan, skb))
                                      ppp->xmit_pending = NULL;
                      } else {
                              /* channel got unregistered */
                              kfree_skb(skb);
                              ppp->xmit_pending = NULL;
                      }
                      spin_unlock_bh(&pch->downl);
                      return;
              }
      
      #ifdef CONFIG_PPP_MULTILINK
              /* Multilink: fragment the packet over as many links
                 as can take the packet at the moment. */
              if (!ppp_mp_explode(ppp, skb))
                      return;
      #endif /* CONFIG_PPP_MULTILINK */
      
              ppp->xmit_pending = NULL;
              kfree_skb(skb);
      }
      
      #ifdef CONFIG_PPP_MULTILINK
      static bool mp_protocol_compress __read_mostly = true;
      module_param(mp_protocol_compress, bool, S_IRUGO | S_IWUSR);
      MODULE_PARM_DESC(mp_protocol_compress,
                       "compress protocol id in multilink fragments");
      
      /*
       * Divide a packet to be transmitted into fragments and
       * send them out the individual links.
       */
      static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
      {
              int len, totlen;
              int i, bits, hdrlen, mtu;
              int flen;
              int navail, nfree, nzero;
              int nbigger;
              int totspeed;
              int totfree;
              unsigned char *p, *q;
              struct list_head *list;
              struct channel *pch;
              struct sk_buff *frag;
              struct ppp_channel *chan;
      
               totspeed = 0;   /* total bitrate of the bundle */
               nfree = 0;      /* # channels which have no packet already queued */
               navail = 0;     /* total # of usable channels (not deregistered) */
               nzero = 0;      /* # of channels with zero speed associated */
               totfree = 0;    /* total # of channels available and having no
                                * queued packets before starting the fragmentation
                                */
      
              hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;
              i = 0;
              list_for_each_entry(pch, &ppp->channels, clist) {
                      if (pch->chan) {
                              pch->avail = 1;
                              navail++;
                              pch->speed = pch->chan->speed;
                      } else {
                              pch->avail = 0;
                      }
                      if (pch->avail) {
                       if (skb_queue_empty(&pch->file.xq) ||
                           !pch->had_frag) {
                               if (pch->speed == 0)
                                       nzero++;
                               else
                                       totspeed += pch->speed;

                               pch->avail = 2;
                               ++nfree;
                               ++totfree;
                       }
                       if (!pch->had_frag && i < ppp->nxchan)
                               ppp->nxchan = i;
                      }
                      ++i;
              }
              /*
               * Don't start sending this packet unless at least half of
               * the channels are free.  This gives much better TCP
               * performance if we have a lot of channels.
               */
              if (nfree == 0 || nfree < navail / 2)
                      return 0; /* can't take now, leave it in xmit_pending */
      
              /* Do protocol field compression */
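               /*
                * If the high byte of the 2-byte protocol field is zero
                * (protocol numbers below 0x100), the leading zero byte
                * may be elided, as in PPP protocol-field compression;
                * the fragments then carry the protocol in one byte.
                */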
              p = skb->data;
              len = skb->len;
              if (*p == 0 && mp_protocol_compress) {
                      ++p;
                      --len;
              }
      
              totlen = len;
              nbigger = len % nfree;
      
              /* skip to the channel after the one we last used
                 and start at that one */
              list = &ppp->channels;
              for (i = 0; i < ppp->nxchan; ++i) {
                      list = list->next;
                      if (list == &ppp->channels) {
                              i = 0;
                              break;
                      }
              }
      
              /* create a fragment for each channel */
              bits = B;
              while (len > 0) {
                      list = list->next;
                      if (list == &ppp->channels) {
                              i = 0;
                              continue;
                      }
                      pch = list_entry(list, struct channel, clist);
                      ++i;
                      if (!pch->avail)
                              continue;
      
                      /*
                       * Skip this channel if it has a fragment pending already and
                       * we haven't given a fragment to all of the free channels.
                       */
                      if (pch->avail == 1) {
                              if (nfree > 0)
                                      continue;
                      } else {
                              pch->avail = 1;
                      }
      
                      /* check the channel's mtu and whether it is still attached. */
                      spin_lock_bh(&pch->downl);
                      if (pch->chan == NULL) {
                              /* can't use this channel, it's being deregistered */
                              if (pch->speed == 0)
                                      nzero--;
                              else
                                      totspeed -= pch->speed;
      
                              spin_unlock_bh(&pch->downl);
                              pch->avail = 0;
                              totlen = len;
                              totfree--;
                              nfree--;
                              if (--navail == 0)
                                      break;
                              continue;
                      }
      
                       /*
                        * If the channel speed is not set, divide the
                        * packet evenly among the free channels;
                        * otherwise divide it according to the speed of
                        * the channel we are going to transmit on.
                        */
                      flen = len;
                      if (nfree > 0) {
                              if (pch->speed == 0) {
                                      flen = len/nfree;
                                      if (nbigger > 0) {
                                              flen++;
                                              nbigger--;
                                      }
                              } else {
                                      flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) /
                                              ((totspeed*totfree)/pch->speed)) - hdrlen;
                                      if (nbigger > 0) {
                                              flen += ((totfree - nzero)*pch->speed)/totspeed;
                                              nbigger -= ((totfree - nzero)*pch->speed)/
                                                              totspeed;
                                      }
                              }
                              nfree--;
                      }
      
                       /*
                        * Check if we are on the last channel or we have
                        * exceeded the length of the data to fragment.
                        */
                      if ((nfree <= 0) || (flen > len))
                              flen = len;
                       /*
                        * It is not worth transmitting on slow channels:
                        * in that case the flen resulting from the above
                        * formula will be zero or negative.  Skip the
                        * channel in this case.
                        */
                      if (flen <= 0) {
                              pch->avail = 2;
                              spin_unlock_bh(&pch->downl);
                              continue;
                      }
      
                      /*
                       * hdrlen includes the 2-byte PPP protocol field, but the
                       * MTU counts only the payload excluding the protocol field.
                       * (RFC1661 Section 2)
                       */
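                       /*
                        * E.g. with long sequence numbers the fragment
                        * header built below is 6 bytes (2-byte PPP_MP
                        * protocol plus a 4-byte flags/sequence word),
                        * so mtu here becomes chan->mtu - 4; with short
                        * sequence numbers it becomes chan->mtu - 2.
                        */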
                      mtu = pch->chan->mtu - (hdrlen - 2);
                      if (mtu < 4)
                              mtu = 4;
                      if (flen > mtu)
                              flen = mtu;
                      if (flen == len)
                              bits |= E;
                      frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC);
                      if (!frag)
                              goto noskb;
                      q = skb_put(frag, flen + hdrlen);
      
                      /* make the MP header */
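                       /*
                        * Layout (derived from the stores below): q[0..1]
                        * hold the PPP_MP protocol number; with short
                        * sequence numbers q[2..3] carry the B/E flags in
                        * the top bits plus a 12-bit sequence number, and
                        * with long sequence numbers q[2] carries the
                        * flags and q[3..5] a 24-bit sequence number.
                        */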
                      put_unaligned_be16(PPP_MP, q);
                      if (ppp->flags & SC_MP_XSHORTSEQ) {
                              q[2] = bits + ((ppp->nxseq >> 8) & 0xf);
                              q[3] = ppp->nxseq;
                      } else {
                              q[2] = bits;
                              q[3] = ppp->nxseq >> 16;
                              q[4] = ppp->nxseq >> 8;
                              q[5] = ppp->nxseq;
                      }
      
                      memcpy(q + hdrlen, p, flen);
      
                      /* try to send it down the channel */
                      chan = pch->chan;
                      if (!skb_queue_empty(&pch->file.xq) ||
                              !chan->ops->start_xmit(chan, frag))
                              skb_queue_tail(&pch->file.xq, frag);
                      pch->had_frag = 1;
                      p += flen;
                      len -= flen;
                      ++ppp->nxseq;
                      bits = 0;
                      spin_unlock_bh(&pch->downl);
              }
              ppp->nxchan = i;
      
              return 1;
      
       noskb:
              spin_unlock_bh(&pch->downl);
              if (ppp->debug & 1)
                      netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
              ++ppp->dev->stats.tx_errors;
              ++ppp->nxseq;
              return 1;        /* abandon the frame */
      }
      #endif /* CONFIG_PPP_MULTILINK */
      
      /*
       * Try to send data out on a channel.
       */
      static void
      ppp_channel_push(struct channel *pch)
      {
              struct sk_buff *skb;
              struct ppp *ppp;
      
              spin_lock_bh(&pch->downl);
              if (pch->chan) {
                      while (!skb_queue_empty(&pch->file.xq)) {
                              skb = skb_dequeue(&pch->file.xq);
                              if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
                                      /* put the packet back and try again later */
                                      skb_queue_head(&pch->file.xq, skb);
                                      break;
                              }
                      }
              } else {
                      /* channel got deregistered */
                      skb_queue_purge(&pch->file.xq);
              }
              spin_unlock_bh(&pch->downl);
              /* see if there is anything from the attached unit to be sent */
              if (skb_queue_empty(&pch->file.xq)) {
                      read_lock_bh(&pch->upl);
                      ppp = pch->ppp;
                      if (ppp)
                              ppp_xmit_process(ppp);
                      read_unlock_bh(&pch->upl);
              }
      }
      
      /*
       * Receive-side routines.
       */
      
      struct ppp_mp_skb_parm {
              u32                sequence;
              u8                BEbits;
      };
      #define PPP_MP_CB(skb)        ((struct ppp_mp_skb_parm *)((skb)->cb))
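
       /*
        * Rough illustration (informal, not a spec): the multilink code keeps
        * per-fragment metadata in skb->cb via PPP_MP_CB().  With the B/E flag
        * values used earlier in this file (B marks the first fragment of a
        * packet, E the last, as in RFC 1990), a packet split across three
        * fragments would be tagged along these lines:
        *
        *      frag 0: BEbits = B      sequence = n
        *      frag 1: BEbits = 0      sequence = n + 1
        *      frag 2: BEbits = E      sequence = n + 2
        *
        * and an unfragmented packet goes out as a single fragment with
        * BEbits = B | E.
        */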
      
      static inline void
      ppp_do_recv(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
      {
              ppp_recv_lock(ppp);
              if (!ppp->closing)
                      ppp_receive_frame(ppp, skb, pch);
              else
                      kfree_skb(skb);
              ppp_recv_unlock(ppp);
      }
      
      void
      ppp_input(struct ppp_channel *chan, struct sk_buff *skb)
      {
              struct channel *pch = chan->ppp;
              int proto;
      
              if (!pch) {
                      kfree_skb(skb);
                      return;
              }
      
              read_lock_bh(&pch->upl);
              if (!pskb_may_pull(skb, 2)) {
                      kfree_skb(skb);
                      if (pch->ppp) {
                              ++pch->ppp->dev->stats.rx_length_errors;
                              ppp_receive_error(pch->ppp);
                      }
                      goto done;
              }
      
              proto = PPP_PROTO(skb);
              if (!pch->ppp || proto >= 0xc000 || proto == PPP_CCPFRAG) {
                      /* put it on the channel queue */
                      skb_queue_tail(&pch->file.rq, skb);
                      /* drop old frames if queue too long */
                      while (pch->file.rq.qlen > PPP_MAX_RQLEN &&
                             (skb = skb_dequeue(&pch->file.rq)))
                              kfree_skb(skb);
                      wake_up_interruptible(&pch->file.rwait);
              } else {
                      ppp_do_recv(pch->ppp, skb, pch);
              }
      
      done:
              read_unlock_bh(&pch->upl);
      }
      
      /* Put a 0-length skb in the receive queue as an error indication */
      void
      ppp_input_error(struct ppp_channel *chan, int code)
      {
              struct channel *pch = chan->ppp;
              struct sk_buff *skb;
      
              if (!pch)
                      return;
      
              read_lock_bh(&pch->upl);
              if (pch->ppp) {
                      skb = alloc_skb(0, GFP_ATOMIC);
                      if (skb) {
                              skb->len = 0;                /* probably unnecessary */
                              skb->cb[0] = code;
                              ppp_do_recv(pch->ppp, skb, pch);
                      }
              }
              read_unlock_bh(&pch->upl);
      }
      
      /*
       * We come in here to process a received frame.
       * The receive side of the ppp unit is locked.
       */
      static void
      ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
      {
              /* note: a 0-length skb is used as an error indication */
              if (skb->len > 0) {
                      skb_checksum_complete_unset(skb);
      #ifdef CONFIG_PPP_MULTILINK
                      /* XXX do channel-level decompression here */
                      if (PPP_PROTO(skb) == PPP_MP)
                              ppp_receive_mp_frame(ppp, skb, pch);
                      else
      #endif /* CONFIG_PPP_MULTILINK */
                              ppp_receive_nonmp_frame(ppp, skb);
              } else {
                      kfree_skb(skb);
                      ppp_receive_error(ppp);
              }
      }
      
      static void
      ppp_receive_error(struct ppp *ppp)
      {
              ++ppp->dev->stats.rx_errors;
              if (ppp->vj)
                      slhc_toss(ppp->vj);
      }
      
      static void
      ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
      {
              struct sk_buff *ns;
              int proto, len, npi;
      
              /*
               * Decompress the frame, if compressed.
               * Note that some decompressors need to see uncompressed frames
               * that come in as well as compressed frames.
               */
              if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN) &&
                  (ppp->rstate & (SC_DC_FERROR | SC_DC_ERROR)) == 0)
                      skb = ppp_decompress_frame(ppp, skb);
      
              if (ppp->flags & SC_MUST_COMP && ppp->rstate & SC_DC_FERROR)
                      goto err;
      
              proto = PPP_PROTO(skb);
              switch (proto) {
              case PPP_VJC_COMP:
                      /* decompress VJ compressed packets */
                      if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
                              goto err;
      
                      if (skb_tailroom(skb) < 124 || skb_cloned(skb)) {
                              /* copy to a new sk_buff with more tailroom */
                              ns = dev_alloc_skb(skb->len + 128);
                              if (!ns) {
                                      netdev_err(ppp->dev, "PPP: no memory "
                                                 "(VJ decomp)\n");
                                      goto err;
                              }
                              skb_reserve(ns, 2);
                              skb_copy_bits(skb, 0, skb_put(ns, skb->len), skb->len);
                              consume_skb(skb);
                              skb = ns;
                       } else
                               skb->ip_summed = CHECKSUM_NONE;
      
                      len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2);
                      if (len <= 0) {
                              netdev_printk(KERN_DEBUG, ppp->dev,
                                            "PPP: VJ decompression error\n");
                              goto err;
                      }
                      len += 2;
                      if (len > skb->len)
                              skb_put(skb, len - skb->len);
                      else if (len < skb->len)
                              skb_trim(skb, len);
                      proto = PPP_IP;
                      break;
      
              case PPP_VJC_UNCOMP:
                      if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
                              goto err;
      
                       /* Until we fix the decompressor, we need to make sure
                        * the data portion is linear.
                        */
                      if (!pskb_may_pull(skb, skb->len))
                              goto err;
      
                      if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) {
                              netdev_err(ppp->dev, "PPP: VJ uncompressed error\n");
                              goto err;
                      }
                      proto = PPP_IP;
                      break;
      
              case PPP_CCP:
                      ppp_ccp_peek(ppp, skb, 1);
                      break;
              }
      
              ++ppp->stats64.rx_packets;
              ppp->stats64.rx_bytes += skb->len - 2;
      
              npi = proto_to_npindex(proto);
              if (npi < 0) {
                      /* control or unknown frame - pass it to pppd */
                      skb_queue_tail(&ppp->file.rq, skb);
                      /* limit queue length by dropping old frames */
                      while (ppp->file.rq.qlen > PPP_MAX_RQLEN &&
                             (skb = skb_dequeue(&ppp->file.rq)))
                              kfree_skb(skb);
                      /* wake up any process polling or blocking on read */
                      wake_up_interruptible(&ppp->file.rwait);
      
              } else {
                      /* network protocol frame - give it to the kernel */
      
      #ifdef CONFIG_PPP_FILTER
                      /* check if the packet passes the pass and active filters */
                      /* the filter instructions are constructed assuming
                         a four-byte PPP header on each packet */
                      if (ppp->pass_filter || ppp->active_filter) {
                              if (skb_unclone(skb, GFP_ATOMIC))
                                      goto err;
      
                              *skb_push(skb, 2) = 0;
                              if (ppp->pass_filter &&
                                  BPF_PROG_RUN(ppp->pass_filter, skb) == 0) {
                                      if (ppp->debug & 1)
                                              netdev_printk(KERN_DEBUG, ppp->dev,
                                                            "PPP: inbound frame "
                                                            "not passed\n");
                                      kfree_skb(skb);
                                      return;
                              }
                              if (!(ppp->active_filter &&
                                    BPF_PROG_RUN(ppp->active_filter, skb) == 0))
                                      ppp->last_recv = jiffies;
                              __skb_pull(skb, 2);
                      } else
      #endif /* CONFIG_PPP_FILTER */
                              ppp->last_recv = jiffies;
      
                      if ((ppp->dev->flags & IFF_UP) == 0 ||
                          ppp->npmode[npi] != NPMODE_PASS) {
                              kfree_skb(skb);
                      } else {
                              /* chop off protocol */
                              skb_pull_rcsum(skb, 2);
                              skb->dev = ppp->dev;
                              skb->protocol = htons(npindex_to_ethertype[npi]);
                              skb_reset_mac_header(skb);
                              skb_scrub_packet(skb, !net_eq(ppp->ppp_net,
                                                            dev_net(ppp->dev)));
                              netif_rx(skb);
                      }
              }
              return;
      
       err:
              kfree_skb(skb);
              ppp_receive_error(ppp);
      }
      
      static struct sk_buff *
      ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb)
      {
              int proto = PPP_PROTO(skb);
              struct sk_buff *ns;
              int len;
      
               /* Until we fix all the decompressors, we need to make sure
                * the data portion is linear.
                */
              if (!pskb_may_pull(skb, skb->len))
                      goto err;
      
              if (proto == PPP_COMP) {
                      int obuff_size;
      
                       switch (ppp->rcomp->compress_proto) {
                      case CI_MPPE:
                              obuff_size = ppp->mru + PPP_HDRLEN + 1;
                              break;
                      default:
                              obuff_size = ppp->mru + PPP_HDRLEN;
                              break;
                      }
      
                      ns = dev_alloc_skb(obuff_size);
                      if (!ns) {
                              netdev_err(ppp->dev, "ppp_decompress_frame: "
                                         "no memory\n");
                              goto err;
                      }
                      /* the decompressor still expects the A/C bytes in the hdr */
                      len = ppp->rcomp->decompress(ppp->rc_state, skb->data - 2,
                                      skb->len + 2, ns->data, obuff_size);
                      if (len < 0) {
                              /* Pass the compressed frame to pppd as an
                                 error indication. */
                              if (len == DECOMP_FATALERROR)
                                      ppp->rstate |= SC_DC_FERROR;
                              kfree_skb(ns);
                              goto err;
                      }
      
                      consume_skb(skb);
                      skb = ns;
                      skb_put(skb, len);
                      skb_pull(skb, 2);        /* pull off the A/C bytes */
      
              } else {
                      /* Uncompressed frame - pass to decompressor so it
                         can update its dictionary if necessary. */
                      if (ppp->rcomp->incomp)
                              ppp->rcomp->incomp(ppp->rc_state, skb->data - 2,
                                                 skb->len + 2);
              }
      
              return skb;
      
       err:
              ppp->rstate |= SC_DC_ERROR;
              ppp_receive_error(ppp);
              return skb;
      }
      
      #ifdef CONFIG_PPP_MULTILINK
      /*
       * Receive a multilink frame.
       * We put it on the reconstruction queue and then pull off
       * as many completed frames as we can.
       */
      static void
      ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
      {
              u32 mask, seq;
              struct channel *ch;
               int mphdrlen = (ppp->flags & SC_MP_SHORTSEQ) ? MPHDRLEN_SSN : MPHDRLEN;
      
              if (!pskb_may_pull(skb, mphdrlen + 1) || ppp->mrru == 0)
                      goto err;                /* no good, throw it away */
      
              /* Decode sequence number and begin/end bits */
              if (ppp->flags & SC_MP_SHORTSEQ) {
                      seq = ((skb->data[2] & 0x0f) << 8) | skb->data[3];
                      mask = 0xfff;
              } else {
                       seq = (skb->data[3] << 16) | (skb->data[4] << 8) | skb->data[5];
                      mask = 0xffffff;
              }
              PPP_MP_CB(skb)->BEbits = skb->data[2];
              skb_pull(skb, mphdrlen);        /* pull off PPP and MP headers */
      
              /*
               * Do protocol ID decompression on the first fragment of each packet.
               */
              if ((PPP_MP_CB(skb)->BEbits & B) && (skb->data[0] & 1))
                      *skb_push(skb, 1) = 0;
      
              /*
               * Expand sequence number to 32 bits, making it as close
               * as possible to ppp->minseq.
               */
              seq |= ppp->minseq & ~mask;
              if ((int)(ppp->minseq - seq) > (int)(mask >> 1))
                      seq += mask + 1;
              else if ((int)(seq - ppp->minseq) > (int)(mask >> 1))
                      seq -= mask + 1;        /* should never happen */
              PPP_MP_CB(skb)->sequence = seq;
              pch->lastseq = seq;
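
               /*
                * Worked example (illustrative, assuming 12-bit short sequence
                * numbers, i.e. mask == 0xfff): if ppp->minseq == 0x1ffe and the
                * fragment carries seq == 0x003, the OR above gives 0x1003; since
                * 0x1ffe - 0x1003 exceeds 0x7ff we add 0x1000, yielding
                * seq == 0x2003, i.e. just above minseq as intended.
                */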
      
              /*
               * If this packet comes before the next one we were expecting,
               * drop it.
               */
              if (seq_before(seq, ppp->nextseq)) {
                      kfree_skb(skb);
                      ++ppp->dev->stats.rx_dropped;
                      ppp_receive_error(ppp);
                      return;
              }
      
              /*
               * Reevaluate minseq, the minimum over all channels of the
               * last sequence number received on each channel.  Because of
               * the increasing sequence number rule, we know that any fragment
               * before `minseq' which hasn't arrived is never going to arrive.
               * The list of channels can't change because we have the receive
               * side of the ppp unit locked.
               */
              list_for_each_entry(ch, &ppp->channels, clist) {
                      if (seq_before(ch->lastseq, seq))
                              seq = ch->lastseq;
              }
              if (seq_before(ppp->minseq, seq))
                      ppp->minseq = seq;
      
              /* Put the fragment on the reconstruction queue */
              ppp_mp_insert(ppp, skb);
      
              /* If the queue is getting long, don't wait any longer for packets
                 before the start of the queue. */
              if (skb_queue_len(&ppp->mrq) >= PPP_MP_MAX_QLEN) {
                      struct sk_buff *mskb = skb_peek(&ppp->mrq);
                      if (seq_before(ppp->minseq, PPP_MP_CB(mskb)->sequence))
                              ppp->minseq = PPP_MP_CB(mskb)->sequence;
              }
      
              /* Pull completed packets off the queue and receive them. */
              while ((skb = ppp_mp_reconstruct(ppp))) {
                      if (pskb_may_pull(skb, 2))
                              ppp_receive_nonmp_frame(ppp, skb);
                      else {
                              ++ppp->dev->stats.rx_length_errors;
                              kfree_skb(skb);
                              ppp_receive_error(ppp);
                      }
              }
      
              return;
      
       err:
              kfree_skb(skb);
              ppp_receive_error(ppp);
      }
      
      /*
       * Insert a fragment on the MP reconstruction queue.
       * The queue is ordered by increasing sequence number.
       */
      static void
      ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb)
      {
              struct sk_buff *p;
              struct sk_buff_head *list = &ppp->mrq;
              u32 seq = PPP_MP_CB(skb)->sequence;
      
              /* N.B. we don't need to lock the list lock because we have the
                 ppp unit receive-side lock. */
              skb_queue_walk(list, p) {
                      if (seq_before(seq, PPP_MP_CB(p)->sequence))
                              break;
              }
              __skb_queue_before(list, p, skb);
      }
      
      /*
       * Reconstruct a packet from the MP fragment queue.
       * We go through increasing sequence numbers until we find a
       * complete packet, or we get to the sequence number for a fragment
       * which hasn't arrived but might still do so.
       */
      static struct sk_buff *
      ppp_mp_reconstruct(struct ppp *ppp)
      {
              u32 seq = ppp->nextseq;
              u32 minseq = ppp->minseq;
              struct sk_buff_head *list = &ppp->mrq;
              struct sk_buff *p, *tmp;
              struct sk_buff *head, *tail;
              struct sk_buff *skb = NULL;
              int lost = 0, len = 0;
      
              if (ppp->mrru == 0)        /* do nothing until mrru is set */
                      return NULL;
              head = list->next;
              tail = NULL;
              skb_queue_walk_safe(list, p, tmp) {
              again:
                      if (seq_before(PPP_MP_CB(p)->sequence, seq)) {
                              /* this can't happen, anyway ignore the skb */
                              netdev_err(ppp->dev, "ppp_mp_reconstruct bad "
                                         "seq %u < %u\n",
                                         PPP_MP_CB(p)->sequence, seq);
                              __skb_unlink(p, list);
                              kfree_skb(p);
                              continue;
                      }
                      if (PPP_MP_CB(p)->sequence != seq) {
                              u32 oldseq;
                              /* Fragment `seq' is missing.  If it is after
                                 minseq, it might arrive later, so stop here. */
                              if (seq_after(seq, minseq))
                                      break;
                              /* Fragment `seq' is lost, keep going. */
                              lost = 1;
                              oldseq = seq;
                               seq = seq_before(minseq, PPP_MP_CB(p)->sequence) ?
                                       minseq + 1 : PPP_MP_CB(p)->sequence;
      
                              if (ppp->debug & 1)
                                      netdev_printk(KERN_DEBUG, ppp->dev,
                                                    "lost frag %u..%u\n",
                                                    oldseq, seq-1);
      
                              goto again;
                      }
      
                      /*
                       * At this point we know that all the fragments from
                       * ppp->nextseq to seq are either present or lost.
                       * Also, there are no complete packets in the queue
                       * that have no missing fragments and end before this
                       * fragment.
                       */
      
                      /* B bit set indicates this fragment starts a packet */
                      if (PPP_MP_CB(p)->BEbits & B) {
                              head = p;
                              lost = 0;
                              len = 0;
                      }
      
                      len += p->len;
      
                      /* Got a complete packet yet? */
                      if (lost == 0 && (PPP_MP_CB(p)->BEbits & E) &&
                          (PPP_MP_CB(head)->BEbits & B)) {
                              if (len > ppp->mrru + 2) {
                                      ++ppp->dev->stats.rx_length_errors;
                                      netdev_printk(KERN_DEBUG, ppp->dev,
                                                    "PPP: reconstructed packet"
                                                    " is too long (%d)\n", len);
                              } else {
                                      tail = p;
                                      break;
                              }
                              ppp->nextseq = seq + 1;
                      }
      
                      /*
                       * If this is the ending fragment of a packet,
                       * and we haven't found a complete valid packet yet,
                       * we can discard up to and including this fragment.
                       */
                      if (PPP_MP_CB(p)->BEbits & E) {
                              struct sk_buff *tmp2;
      
                              skb_queue_reverse_walk_from_safe(list, p, tmp2) {
                                      if (ppp->debug & 1)
                                              netdev_printk(KERN_DEBUG, ppp->dev,
                                                            "discarding frag %u\n",
                                                            PPP_MP_CB(p)->sequence);
                                      __skb_unlink(p, list);
                                      kfree_skb(p);
                              }
                              head = skb_peek(list);
                              if (!head)
                                      break;
                      }
                      ++seq;
              }
      
              /* If we have a complete packet, copy it all into one skb. */
              if (tail != NULL) {
                      /* If we have discarded any fragments,
                         signal a receive error. */
                      if (PPP_MP_CB(head)->sequence != ppp->nextseq) {
                              skb_queue_walk_safe(list, p, tmp) {
                                      if (p == head)
                                              break;
                                      if (ppp->debug & 1)
                                              netdev_printk(KERN_DEBUG, ppp->dev,
                                                            "discarding frag %u\n",
                                                            PPP_MP_CB(p)->sequence);
                                      __skb_unlink(p, list);
                                      kfree_skb(p);
                              }
      
                              if (ppp->debug & 1)
                                      netdev_printk(KERN_DEBUG, ppp->dev,
                                                    "  missed pkts %u..%u\n",
                                                    ppp->nextseq,
                                                    PPP_MP_CB(head)->sequence-1);
                              ++ppp->dev->stats.rx_dropped;
                              ppp_receive_error(ppp);
                      }
      
                      skb = head;
                      if (head != tail) {
                              struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list;
                              p = skb_queue_next(list, head);
                              __skb_unlink(skb, list);
                              skb_queue_walk_from_safe(list, p, tmp) {
                                      __skb_unlink(p, list);
                                      *fragpp = p;
                                      p->next = NULL;
                                      fragpp = &p->next;
      
                                      skb->len += p->len;
                                      skb->data_len += p->len;
                                      skb->truesize += p->truesize;
      
                                      if (p == tail)
                                              break;
                              }
                      } else {
                              __skb_unlink(skb, list);
                      }
      
                      ppp->nextseq = PPP_MP_CB(tail)->sequence + 1;
              }
      
              return skb;
      }
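
       /*
        * Illustrative walk-through (not normative): with ppp->nextseq == 5 and
        * the reconstruction queue holding fragments 5 (B), 6 and 7 (E), the
        * loop above finds head == fragment 5 and tail == fragment 7, links
        * fragments 6 and 7 onto fragment 5's frag_list, advances ppp->nextseq
        * to 8 and returns the combined skb.  If fragment 6 were still missing
        * and newer than minseq, the walk would stop there and return NULL,
        * waiting for it to arrive.
        */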
      #endif /* CONFIG_PPP_MULTILINK */
      
      /*
       * Channel interface.
       */
      
      /* Create a new, unattached ppp channel. */
      int ppp_register_channel(struct ppp_channel *chan)
      {
              return ppp_register_net_channel(current->nsproxy->net_ns, chan);
      }
      
      /* Create a new, unattached ppp channel for specified net. */
      int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
      {
              struct channel *pch;
              struct ppp_net *pn;
      
              pch = kzalloc(sizeof(struct channel), GFP_KERNEL);
              if (!pch)
                      return -ENOMEM;
      
              pn = ppp_pernet(net);
      
              pch->ppp = NULL;
              pch->chan = chan;
              pch->chan_net = get_net(net);
              chan->ppp = pch;
              init_ppp_file(&pch->file, CHANNEL);
              pch->file.hdrlen = chan->hdrlen;
      #ifdef CONFIG_PPP_MULTILINK
              pch->lastseq = -1;
      #endif /* CONFIG_PPP_MULTILINK */
              init_rwsem(&pch->chan_sem);
              spin_lock_init(&pch->downl);
              rwlock_init(&pch->upl);
      
              spin_lock_bh(&pn->all_channels_lock);
              pch->file.index = ++pn->last_channel_index;
              list_add(&pch->list, &pn->new_channels);
              atomic_inc(&channel_count);
              spin_unlock_bh(&pn->all_channels_lock);
      
              return 0;
      }
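
       /*
        * Hedged usage sketch (the my_* identifiers are hypothetical, not from
        * this file): a channel driver typically embeds a struct ppp_channel in
        * its own per-link state, fills in the ops and framing parameters, and
        * registers it, roughly:
        *
        *      static const struct ppp_channel_ops my_chan_ops = {
        *              .start_xmit = my_start_xmit,
        *              .ioctl      = my_ioctl,
        *      };
        *
        *      chan->private = my_link;
        *      chan->ops     = &my_chan_ops;
        *      chan->mtu     = my_link_mtu;
        *      chan->hdrlen  = my_hard_header_len;
        *      err = ppp_register_net_channel(dev_net(dev), chan);
        *
        * and calls ppp_unregister_channel() before freeing the link state.
        */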
      
      /*
       * Return the index of a channel.
       */
      int ppp_channel_index(struct ppp_channel *chan)
      {
              struct channel *pch = chan->ppp;
      
              if (pch)
                      return pch->file.index;
              return -1;
      }
      
      /*
       * Return the PPP unit number to which a channel is connected.
       */
      int ppp_unit_number(struct ppp_channel *chan)
      {
              struct channel *pch = chan->ppp;
              int unit = -1;
      
              if (pch) {
                      read_lock_bh(&pch->upl);
                      if (pch->ppp)
                              unit = pch->ppp->file.index;
                      read_unlock_bh(&pch->upl);
              }
              return unit;
      }
      
      /*
       * Return the PPP device interface name of a channel.
       */
      char *ppp_dev_name(struct ppp_channel *chan)
      {
              struct channel *pch = chan->ppp;
              char *name = NULL;
      
              if (pch) {
                      read_lock_bh(&pch->upl);
                      if (pch->ppp && pch->ppp->dev)
                              name = pch->ppp->dev->name;
                      read_unlock_bh(&pch->upl);
              }
              return name;
      }
      
      
      /*
       * Disconnect a channel from the generic layer.
       * This must be called in process context.
       */
      void
      ppp_unregister_channel(struct ppp_channel *chan)
      {
              struct channel *pch = chan->ppp;
              struct ppp_net *pn;
      
              if (!pch)
                      return;                /* should never happen */
      
              chan->ppp = NULL;
      
               /*
                * This ensures that we have returned from any calls into
                * the channel's start_xmit or ioctl routine before we proceed.
                */
              down_write(&pch->chan_sem);
              spin_lock_bh(&pch->downl);
              pch->chan = NULL;
              spin_unlock_bh(&pch->downl);
              up_write(&pch->chan_sem);
              ppp_disconnect_channel(pch);
      
              pn = ppp_pernet(pch->chan_net);
              spin_lock_bh(&pn->all_channels_lock);
              list_del(&pch->list);
              spin_unlock_bh(&pn->all_channels_lock);
      
              pch->file.dead = 1;
              wake_up_interruptible(&pch->file.rwait);
              if (atomic_dec_and_test(&pch->file.refcnt))
                      ppp_destroy_channel(pch);
      }
      
      /*
       * Callback from a channel when it can accept more to transmit.
       * This should be called at BH/softirq level, not interrupt level.
       */
      void
      ppp_output_wakeup(struct ppp_channel *chan)
      {
              struct channel *pch = chan->ppp;
      
              if (!pch)
                      return;
              ppp_channel_push(pch);
      }
      
      /*
       * Compression control.
       */
      
      /* Process the PPPIOCSCOMPRESS ioctl. */
      static int
      ppp_set_compress(struct ppp *ppp, unsigned long arg)
      {
              int err;
              struct compressor *cp, *ocomp;
              struct ppp_option_data data;
              void *state, *ostate;
              unsigned char ccp_option[CCP_MAX_OPTION_LENGTH];
      
              err = -EFAULT;
              if (copy_from_user(&data, (void __user *) arg, sizeof(data)) ||
                  (data.length <= CCP_MAX_OPTION_LENGTH &&
                   copy_from_user(ccp_option, (void __user *) data.ptr, data.length)))
                      goto out;
              err = -EINVAL;
              if (data.length > CCP_MAX_OPTION_LENGTH ||
                  ccp_option[1] < 2 || ccp_option[1] > data.length)
                      goto out;
      
              cp = try_then_request_module(
                      find_compressor(ccp_option[0]),
                      "ppp-compress-%d", ccp_option[0]);
              if (!cp)
                      goto out;
      
              err = -ENOBUFS;
              if (data.transmit) {
                      state = cp->comp_alloc(ccp_option, data.length);
                      if (state) {
                              ppp_xmit_lock(ppp);
                              ppp->xstate &= ~SC_COMP_RUN;
                              ocomp = ppp->xcomp;
                              ostate = ppp->xc_state;
                              ppp->xcomp = cp;
                              ppp->xc_state = state;
                              ppp_xmit_unlock(ppp);
                              if (ostate) {
                                      ocomp->comp_free(ostate);
                                      module_put(ocomp->owner);
                              }
                              err = 0;
                      } else
                              module_put(cp->owner);
      
              } else {
                      state = cp->decomp_alloc(ccp_option, data.length);
                      if (state) {
                              ppp_recv_lock(ppp);
                              ppp->rstate &= ~SC_DECOMP_RUN;
                              ocomp = ppp->rcomp;
                              ostate = ppp->rc_state;
                              ppp->rcomp = cp;
                              ppp->rc_state = state;
                              ppp_recv_unlock(ppp);
                              if (ostate) {
                                      ocomp->decomp_free(ostate);
                                      module_put(ocomp->owner);
                              }
                              err = 0;
                      } else
                              module_put(cp->owner);
              }
      
       out:
              return err;
      }
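
       /*
        * Rough userspace-side sketch (illustrative only; ccp_opt, ccp_opt_len
        * and ppp_fd are hypothetical): pppd reaches this path through the
        * PPPIOCSCOMPRESS ioctl with a struct ppp_option_data whose ptr/length
        * describe the raw CCP option (option[0] = compressor ID, option[1] =
        * option length) and whose transmit flag selects the direction:
        *
        *      struct ppp_option_data od = {
        *              .ptr      = ccp_opt,
        *              .length   = ccp_opt_len,
        *              .transmit = 1,
        *      };
        *      ioctl(ppp_fd, PPPIOCSCOMPRESS, &od);
        */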
      
      /*
       * Look at a CCP packet and update our state accordingly.
       * We assume the caller has the xmit or recv path locked.
       */
      static void
      ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound)
      {
              unsigned char *dp;
              int len;
      
              if (!pskb_may_pull(skb, CCP_HDRLEN + 2))
                      return;        /* no header */
              dp = skb->data + 2;
      
              switch (CCP_CODE(dp)) {
              case CCP_CONFREQ:
      
                      /* A ConfReq starts negotiation of compression
                       * in one direction of transmission,
                       * and hence brings it down...but which way?
                       *
                       * Remember:
                       * A ConfReq indicates what the sender would like to receive
                       */
                       if (inbound)
                               /* He is proposing what I should send */
                               ppp->xstate &= ~SC_COMP_RUN;
                       else
                               /* I am proposing what he should send */
                              ppp->rstate &= ~SC_DECOMP_RUN;
      
                      break;
      
              case CCP_TERMREQ:
              case CCP_TERMACK:
                      /*
                       * CCP is going down, both directions of transmission
                       */
                      ppp->rstate &= ~SC_DECOMP_RUN;
                      ppp->xstate &= ~SC_COMP_RUN;
                      break;
      
              case CCP_CONFACK:
                      if ((ppp->flags & (SC_CCP_OPEN | SC_CCP_UP)) != SC_CCP_OPEN)
                              break;
                      len = CCP_LENGTH(dp);
                      if (!pskb_may_pull(skb, len + 2))
                              return;                /* too short */
                      dp += CCP_HDRLEN;
                      len -= CCP_HDRLEN;
                      if (len < CCP_OPT_MINLEN || len < CCP_OPT_LENGTH(dp))
                              break;
                      if (inbound) {
                              /* we will start receiving compressed packets */
                              if (!ppp->rc_state)
                                      break;
                              if (ppp->rcomp->decomp_init(ppp->rc_state, dp, len,
                                              ppp->file.index, 0, ppp->mru, ppp->debug)) {
                                      ppp->rstate |= SC_DECOMP_RUN;
                                      ppp->rstate &= ~(SC_DC_ERROR | SC_DC_FERROR);
                              }
                      } else {
                              /* we will soon start sending compressed packets */
                              if (!ppp->xc_state)
                                      break;
                              if (ppp->xcomp->comp_init(ppp->xc_state, dp, len,
                                              ppp->file.index, 0, ppp->debug))
                                      ppp->xstate |= SC_COMP_RUN;
                      }
                      break;
      
              case CCP_RESETACK:
                      /* reset the [de]compressor */
                      if ((ppp->flags & SC_CCP_UP) == 0)
                              break;
                      if (inbound) {
                              if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN)) {
                                      ppp->rcomp->decomp_reset(ppp->rc_state);
                                      ppp->rstate &= ~SC_DC_ERROR;
                              }
                      } else {
                              if (ppp->xc_state && (ppp->xstate & SC_COMP_RUN))
                                      ppp->xcomp->comp_reset(ppp->xc_state);
                      }
                      break;
              }
      }
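
       /*
        * Informal summary of the direction handling above: an inbound ConfReq
        * is the peer describing what it wants to receive, so it tears down our
        * transmit compressor; an outbound ConfReq likewise tears down our
        * receive decompressor.  A ConfAck (re)arms the corresponding side once
        * comp_init()/decomp_init() succeeds, and TermReq/TermAck take both
        * directions down.
        */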
      
      /* Free up compression resources. */
      static void
      ppp_ccp_closed(struct ppp *ppp)
      {
              void *xstate, *rstate;
              struct compressor *xcomp, *rcomp;
      
              ppp_lock(ppp);
              ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP);
              ppp->xstate = 0;
              xcomp = ppp->xcomp;
              xstate = ppp->xc_state;
              ppp->xc_state = NULL;
              ppp->rstate = 0;
              rcomp = ppp->rcomp;
              rstate = ppp->rc_state;
              ppp->rc_state = NULL;
              ppp_unlock(ppp);
      
              if (xstate) {
                      xcomp->comp_free(xstate);
                      module_put(xcomp->owner);
              }
              if (rstate) {
                      rcomp->decomp_free(rstate);
                      module_put(rcomp->owner);
              }
      }
      
      /* List of compressors. */
      static LIST_HEAD(compressor_list);
      static DEFINE_SPINLOCK(compressor_list_lock);
      
      struct compressor_entry {
              struct list_head list;
              struct compressor *comp;
      };
      
      static struct compressor_entry *
      find_comp_entry(int proto)
      {
              struct compressor_entry *ce;
      
              list_for_each_entry(ce, &compressor_list, list) {
                      if (ce->comp->compress_proto == proto)
                              return ce;
              }
              return NULL;
      }
      
      /* Register a compressor */
      int
      ppp_register_compressor(struct compressor *cp)
      {
              struct compressor_entry *ce;
              int ret;
              spin_lock(&compressor_list_lock);
              ret = -EEXIST;
              if (find_comp_entry(cp->compress_proto))
                      goto out;
              ret = -ENOMEM;
              ce = kmalloc(sizeof(struct compressor_entry), GFP_ATOMIC);
              if (!ce)
                      goto out;
              ret = 0;
              ce->comp = cp;
              list_add(&ce->list, &compressor_list);
       out:
              spin_unlock(&compressor_list_lock);
              return ret;
      }
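
       /*
        * Hedged sketch of how a compressor module hooks in (the my_* names are
        * hypothetical): it fills in a struct compressor keyed by its CCP
        * protocol number and registers it from its init routine, roughly:
        *
        *      static struct compressor my_comp = {
        *              .compress_proto = CI_DEFLATE,
        *              .comp_alloc     = my_comp_alloc,
        *              .comp_free      = my_comp_free,
        *              .comp_init      = my_comp_init,
        *              .compress       = my_compress,
        *              .decomp_alloc   = my_decomp_alloc,
        *              .decomp_free    = my_decomp_free,
        *              .decomp_init    = my_decomp_init,
        *              .decompress     = my_decompress,
        *              .incomp         = my_incomp,
        *              .owner          = THIS_MODULE,
        *      };
        *
        *      ret = ppp_register_compressor(&my_comp);
        *
        * with a matching ppp_unregister_compressor() in the module exit path;
        * the "ppp-compress-%d" alias used by ppp_set_compress() lets the
        * module be autoloaded on demand.
        */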
      
      /* Unregister a compressor */
      void
      ppp_unregister_compressor(struct compressor *cp)
      {
              struct compressor_entry *ce;
      
              spin_lock(&compressor_list_lock);
              ce = find_comp_entry(cp->compress_proto);
              if (ce && ce->comp == cp) {
                      list_del(&ce->list);
                      kfree(ce);
              }
              spin_unlock(&compressor_list_lock);
      }
      
      /* Find a compressor. */
      static struct compressor *
      find_compressor(int type)
      {
              struct compressor_entry *ce;
              struct compressor *cp = NULL;
      
              spin_lock(&compressor_list_lock);
              ce = find_comp_entry(type);
              if (ce) {
                      cp = ce->comp;
                      if (!try_module_get(cp->owner))
                              cp = NULL;
              }
              spin_unlock(&compressor_list_lock);
              return cp;
      }
      
      /*
        * Miscellaneous stuff.
       */
      
      static void
      ppp_get_stats(struct ppp *ppp, struct ppp_stats *st)
      {
              struct slcompress *vj = ppp->vj;
      
              memset(st, 0, sizeof(*st));
              st->p.ppp_ipackets = ppp->stats64.rx_packets;
              st->p.ppp_ierrors = ppp->dev->stats.rx_errors;
              st->p.ppp_ibytes = ppp->stats64.rx_bytes;
              st->p.ppp_opackets = ppp->stats64.tx_packets;
              st->p.ppp_oerrors = ppp->dev->stats.tx_errors;
              st->p.ppp_obytes = ppp->stats64.tx_bytes;
              if (!vj)
                      return;
              st->vj.vjs_packets = vj->sls_o_compressed + vj->sls_o_uncompressed;
              st->vj.vjs_compressed = vj->sls_o_compressed;
              st->vj.vjs_searches = vj->sls_o_searches;
              st->vj.vjs_misses = vj->sls_o_misses;
              st->vj.vjs_errorin = vj->sls_i_error;
              st->vj.vjs_tossed = vj->sls_i_tossed;
              st->vj.vjs_uncompressedin = vj->sls_i_uncompressed;
              st->vj.vjs_compressedin = vj->sls_i_compressed;
      }
      
      /*
       * Stuff for handling the lists of ppp units and channels
       * and for initialization.
       */
      
      /*
       * Create a new ppp interface unit.  Fails if it can't allocate memory
       * or if there is already a unit with the requested number.
       * unit == -1 means allocate a new number.
       */
      static struct ppp *ppp_create_interface(struct net *net, int unit,
                                              struct file *file, int *retp)
      {
              struct ppp *ppp;
              struct ppp_net *pn;
              struct net_device *dev = NULL;
              int ret = -ENOMEM;
              int i;
      
              dev = alloc_netdev(sizeof(struct ppp), "", NET_NAME_UNKNOWN,
                                 ppp_setup);
              if (!dev)
                      goto out1;
      
              pn = ppp_pernet(net);
      
              ppp = netdev_priv(dev);
              ppp->dev = dev;
              ppp->mru = PPP_MRU;
              init_ppp_file(&ppp->file, INTERFACE);
              ppp->file.hdrlen = PPP_HDRLEN - 2;        /* don't count proto bytes */
              ppp->owner = file;
              for (i = 0; i < NUM_NP; ++i)
                      ppp->npmode[i] = NPMODE_PASS;
              INIT_LIST_HEAD(&ppp->channels);
              spin_lock_init(&ppp->rlock);
              spin_lock_init(&ppp->wlock);
      #ifdef CONFIG_PPP_MULTILINK
              ppp->minseq = -1;
              skb_queue_head_init(&ppp->mrq);
      #endif /* CONFIG_PPP_MULTILINK */
      #ifdef CONFIG_PPP_FILTER
              ppp->pass_filter = NULL;
              ppp->active_filter = NULL;
      #endif /* CONFIG_PPP_FILTER */
      
               /*
                * drum roll: don't forget to set
                * the network namespace this net device belongs to
                */
              dev_net_set(dev, net);
      
              rtnl_lock();
              mutex_lock(&pn->all_ppp_mutex);
      
              if (unit < 0) {
                      unit = unit_get(&pn->units_idr, ppp);
                      if (unit < 0) {
                              ret = unit;
                              goto out2;
                      }
              } else {
                      ret = -EEXIST;
                      if (unit_find(&pn->units_idr, unit))
                              goto out2; /* unit already exists */
                       /*
                        * If the caller asked for a specific unit number,
                        * try to satisfy the request; otherwise the caller
                        * should ask us for a new unit number instead.
                        *
                        * NOTE: returning -EEXIST here is not ideal, but at
                        * least pppd will then ask us to allocate a new unit,
                        * so the user still ends up with a working interface.
                        */
                      unit = unit_set(&pn->units_idr, ppp, unit);
                      if (unit < 0)
                              goto out2;
              }
      
              /* Initialize the new ppp unit */
              ppp->file.index = unit;
              sprintf(dev->name, "ppp%d", unit);
      
              ret = register_netdevice(dev);
              if (ret != 0) {
                      unit_put(&pn->units_idr, unit);
                      netdev_err(ppp->dev, "PPP: couldn't register device %s (%d)\n",
                                 dev->name, ret);
                      goto out2;
              }
      
              ppp->ppp_net = net;
      
              atomic_inc(&ppp_unit_count);
              mutex_unlock(&pn->all_ppp_mutex);
              rtnl_unlock();
      
              *retp = 0;
              return ppp;
      
      out2:
              mutex_unlock(&pn->all_ppp_mutex);
              rtnl_unlock();
              free_netdev(dev);
      out1:
              *retp = ret;
              return NULL;
      }
      
      /*
       * Initialize a ppp_file structure.
       */
      static void
      init_ppp_file(struct ppp_file *pf, int kind)
      {
              pf->kind = kind;
              skb_queue_head_init(&pf->xq);
              skb_queue_head_init(&pf->rq);
              atomic_set(&pf->refcnt, 1);
              init_waitqueue_head(&pf->rwait);
      }
      
      /*
       * Free the memory used by a ppp unit.  This is only called once
       * there are no channels connected to the unit and no file structs
       * that reference the unit.
       */
      static void ppp_destroy_interface(struct ppp *ppp)
      {
              atomic_dec(&ppp_unit_count);
      
              if (!ppp->file.dead || ppp->n_channels) {
                      /* "can't happen" */
                      netdev_err(ppp->dev, "ppp: destroying ppp struct %p "
                                 "but dead=%d n_channels=%d !\n",
                                 ppp, ppp->file.dead, ppp->n_channels);
                      return;
              }
      
              ppp_ccp_closed(ppp);
              if (ppp->vj) {
                      slhc_free(ppp->vj);
                      ppp->vj = NULL;
              }
              skb_queue_purge(&ppp->file.xq);
              skb_queue_purge(&ppp->file.rq);
      #ifdef CONFIG_PPP_MULTILINK
              skb_queue_purge(&ppp->mrq);
      #endif /* CONFIG_PPP_MULTILINK */
      #ifdef CONFIG_PPP_FILTER
              if (ppp->pass_filter) {
                      bpf_prog_destroy(ppp->pass_filter);
                      ppp->pass_filter = NULL;
              }
      
              if (ppp->active_filter) {
                      bpf_prog_destroy(ppp->active_filter);
                      ppp->active_filter = NULL;
              }
      #endif /* CONFIG_PPP_FILTER */
      
              kfree_skb(ppp->xmit_pending);
      
              free_netdev(ppp->dev);
      }
      
      /*
       * Locate an existing ppp unit.
       * The caller should have locked the all_ppp_mutex.
       */
      static struct ppp *
      ppp_find_unit(struct ppp_net *pn, int unit)
      {
              return unit_find(&pn->units_idr, unit);
      }
      
      /*
       * Locate an existing ppp channel.
       * The caller should have locked the all_channels_lock.
       * First we look in the new_channels list, then in the
       * all_channels list.  If found in the new_channels list,
       * we move it to the all_channels list.  This is for speed
       * when we have a lot of channels in use.
       */
      static struct channel *
      ppp_find_channel(struct ppp_net *pn, int unit)
      {
              struct channel *pch;
      
              list_for_each_entry(pch, &pn->new_channels, list) {
                      if (pch->file.index == unit) {
                              list_move(&pch->list, &pn->all_channels);
                              return pch;
                      }
              }
      
              list_for_each_entry(pch, &pn->all_channels, list) {
                      if (pch->file.index == unit)
                              return pch;
              }
      
              return NULL;
      }
      
      /*
       * Connect a PPP channel to a PPP interface unit.
       */
      static int
      ppp_connect_channel(struct channel *pch, int unit)
      {
              struct ppp *ppp;
              struct ppp_net *pn;
              int ret = -ENXIO;
              int hdrlen;
      
              pn = ppp_pernet(pch->chan_net);
      
              mutex_lock(&pn->all_ppp_mutex);
              ppp = ppp_find_unit(pn, unit);
              if (!ppp)
                      goto out;
              write_lock_bh(&pch->upl);
              ret = -EINVAL;
              if (pch->ppp)
                      goto outl;
      
              ppp_lock(ppp);
              spin_lock_bh(&pch->downl);
              if (!pch->chan) {
                      /* Don't connect unregistered channels */
                      spin_unlock_bh(&pch->downl);
                      ppp_unlock(ppp);
                      ret = -ENOTCONN;
                      goto outl;
              }
              spin_unlock_bh(&pch->downl);
              if (pch->file.hdrlen > ppp->file.hdrlen)
                      ppp->file.hdrlen = pch->file.hdrlen;
              hdrlen = pch->file.hdrlen + 2;        /* for protocol bytes */
              if (hdrlen > ppp->dev->hard_header_len)
                      ppp->dev->hard_header_len = hdrlen;
              list_add_tail(&pch->clist, &ppp->channels);
              ++ppp->n_channels;
              pch->ppp = ppp;
              atomic_inc(&ppp->file.refcnt);
              ppp_unlock(ppp);
              ret = 0;
      
       outl:
              write_unlock_bh(&pch->upl);
       out:
              mutex_unlock(&pn->all_ppp_mutex);
              return ret;
      }
      
      /*
       * Disconnect a channel from its ppp unit.
       */
      static int
      ppp_disconnect_channel(struct channel *pch)
      {
              struct ppp *ppp;
              int err = -EINVAL;
      
              write_lock_bh(&pch->upl);
              ppp = pch->ppp;
              pch->ppp = NULL;
              write_unlock_bh(&pch->upl);
              if (ppp) {
                      /* remove it from the ppp unit's list */
                      ppp_lock(ppp);
                      list_del(&pch->clist);
                      if (--ppp->n_channels == 0)
                              wake_up_interruptible(&ppp->file.rwait);
                      ppp_unlock(ppp);
                      if (atomic_dec_and_test(&ppp->file.refcnt))
                              ppp_destroy_interface(ppp);
                      err = 0;
              }
              return err;
      }
      
      /*
       * Free up the resources used by a ppp channel.
       */
      static void ppp_destroy_channel(struct channel *pch)
      {
              put_net(pch->chan_net);
              pch->chan_net = NULL;
      
              atomic_dec(&channel_count);
      
              if (!pch->file.dead) {
                      /* "can't happen" */
                      pr_err("ppp: destroying undead channel %p !\n", pch);
                      return;
              }
              skb_queue_purge(&pch->file.xq);
              skb_queue_purge(&pch->file.rq);
              kfree(pch);
      }
      
      static void __exit ppp_cleanup(void)
      {
              /* should never happen */
              if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count))
                      pr_err("PPP: removing module but units remain!\n");
              unregister_chrdev(PPP_MAJOR, "ppp");
              device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0));
              class_destroy(ppp_class);
              unregister_pernet_device(&ppp_net_ops);
      }
      
      /*
       * Units handling. Caller must protect concurrent access
       * by holding all_ppp_mutex
       */
      
      /* associate pointer with specified number */
      static int unit_set(struct idr *p, void *ptr, int n)
      {
              int unit;
      
              unit = idr_alloc(p, ptr, n, n + 1, GFP_KERNEL);
              if (unit == -ENOSPC)
                      unit = -EINVAL;
              return unit;
      }
      
      /* get new free unit number and associate pointer with it */
      static int unit_get(struct idr *p, void *ptr)
      {
              return idr_alloc(p, ptr, 0, 0, GFP_KERNEL);
      }
      
      /* put unit number back to a pool */
      static void unit_put(struct idr *p, int n)
      {
              idr_remove(p, n);
      }
      
      /* get pointer associated with the number */
      static void *unit_find(struct idr *p, int n)
      {
              return idr_find(p, n);
      }
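
       /*
        * Informal note on the idr usage above: unit_set() asks idr_alloc() for
        * the single slot [n, n + 1), so a request for an already-taken unit
        * comes back as -ENOSPC and is remapped to -EINVAL, while unit_get()
        * passes end == 0, meaning "any free id from 0 upwards".  For example
        * (hypothetical state), with units 0 and 1 in use, unit_get() returns 2
        * and unit_set(p, ptr, 1) fails with -EINVAL until unit_put(p, 1)
        * releases the slot.
        */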
      
      /* Module/initialization stuff */
      
      module_init(ppp_init);
      module_exit(ppp_cleanup);
      
      EXPORT_SYMBOL(ppp_register_net_channel);
      EXPORT_SYMBOL(ppp_register_channel);
      EXPORT_SYMBOL(ppp_unregister_channel);
      EXPORT_SYMBOL(ppp_channel_index);
      EXPORT_SYMBOL(ppp_unit_number);
      EXPORT_SYMBOL(ppp_dev_name);
      EXPORT_SYMBOL(ppp_input);
      EXPORT_SYMBOL(ppp_input_error);
      EXPORT_SYMBOL(ppp_output_wakeup);
      EXPORT_SYMBOL(ppp_register_compressor);
      EXPORT_SYMBOL(ppp_unregister_compressor);
      MODULE_LICENSE("GPL");
      MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0);
      MODULE_ALIAS("devname:ppp");
      /*
       *  mm/pgtable-generic.c
       *
       *  Generic pgtable methods declared in asm-generic/pgtable.h
       *
       *  Copyright (C) 2010  Linus Torvalds
       */
      
      #include <linux/pagemap.h>
      #include <asm/tlb.h>
      #include <asm-generic/pgtable.h>
      
      /*
       * If a p?d_bad entry is found while walking page tables, report
       * the error, before resetting entry to p?d_none.  Usually (but
       * very seldom) called out from the p?d_none_or_clear_bad macros.
       */
      
      void pgd_clear_bad(pgd_t *pgd)
      {
              pgd_ERROR(*pgd);
              pgd_clear(pgd);
      }
      
      void pud_clear_bad(pud_t *pud)
      {
              pud_ERROR(*pud);
              pud_clear(pud);
      }
      
      void pmd_clear_bad(pmd_t *pmd)
      {
              pmd_ERROR(*pmd);
              pmd_clear(pmd);
      }
      
      #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
      /*
        * Only sets the access flags (dirty, accessed), as well as write
        * permission. Furthermore, we know it always gets set to a "more
        * permissive" setting, which allows most architectures to optimize
        * this. We return whether the PTE actually changed, which in turn
        * instructs the caller to do things like update_mmu_cache().  This
        * used to be done in the caller, but sparc needs minor faults to
        * force that call on sun4c, so we changed this macro slightly.
       */
      int ptep_set_access_flags(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep,
                                pte_t entry, int dirty)
      {
              int changed = !pte_same(*ptep, entry);
              if (changed) {
                      set_pte_at(vma->vm_mm, address, ptep, entry);
                      flush_tlb_fix_spurious_fault(vma, address);
              }
              return changed;
      }
      #endif
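
       /*
        * Hedged caller sketch (illustrative, not lifted verbatim from the core
        * fault path): a fault handler that only needs to mark a present PTE
        * young/dirty might do roughly the following, using the return value to
        * decide whether an update_mmu_cache() call is worthwhile:
        *
        *      entry = pte_mkyoung(*ptep);
        *      if (write_fault)
        *              entry = pte_mkdirty(entry);
        *      if (ptep_set_access_flags(vma, address, ptep, entry, write_fault))
        *              update_mmu_cache(vma, address, ptep);
        */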
      
      #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
      int ptep_clear_flush_young(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep)
      {
              int young;
              young = ptep_test_and_clear_young(vma, address, ptep);
              if (young)
                      flush_tlb_page(vma, address);
              return young;
      }
      #endif
      
      #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
      pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                             pte_t *ptep)
      {
               struct mm_struct *mm = (vma)->vm_mm;
               pte_t pte;
               pte = ptep_get_and_clear(mm, address, ptep);
               if (pte_accessible(mm, pte))
                       flush_tlb_page(vma, address);
               return pte;
      }
      #endif
      
      #ifdef CONFIG_TRANSPARENT_HUGEPAGE
      
      #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
      
       /*
        * ARCHes with special requirements for evicting THP backing TLB entries can
        * implement this. Otherwise it can also help optimize the normal TLB flush
        * in the THP regime: the stock flush_tlb_range() typically has an
        * optimization to nuke the entire TLB if the flush span is greater than a
        * threshold, which will likely be true for a single huge page. Thus a
        * single THP flush would invalidate the entire TLB, which is not desirable.
        * e.g. see arch/arc: flush_pmd_tlb_range
        */
      #define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
      #endif
      
      #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
      int pmdp_set_access_flags(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp,
                                pmd_t entry, int dirty)
      {
              int changed = !pmd_same(*pmdp, entry);
              VM_BUG_ON(address & ~HPAGE_PMD_MASK);
              if (changed) {
                      set_pmd_at(vma->vm_mm, address, pmdp, entry);
                      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
              }
              return changed;
      }
      #endif
      
      #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
      int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp)
      {
              int young;
              VM_BUG_ON(address & ~HPAGE_PMD_MASK);
              young = pmdp_test_and_clear_young(vma, address, pmdp);
              if (young)
                      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
              return young;
      }
      #endif
      
      #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
      pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                                  pmd_t *pmdp)
      {
              pmd_t pmd;
              VM_BUG_ON(address & ~HPAGE_PMD_MASK);
              VM_BUG_ON(!pmd_trans_huge(*pmdp));
              pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
              return pmd;
      }
      #endif
      
      #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
      void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
                                pmd_t *pmdp)
      {
              pmd_t pmd = pmd_mksplitting(*pmdp);
              VM_BUG_ON(address & ~HPAGE_PMD_MASK);
              set_pmd_at(vma->vm_mm, address, pmdp, pmd);
              /* tlb flush only to serialize against gup-fast */
              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
      }
      #endif
      
      #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
      void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                      pgtable_t pgtable)
      {
              assert_spin_locked(pmd_lockptr(mm, pmdp));
      
              /* FIFO */
              if (!pmd_huge_pte(mm, pmdp))
                      INIT_LIST_HEAD(&pgtable->lru);
              else
                      list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
              pmd_huge_pte(mm, pmdp) = pgtable;
      }
      #endif
      
      #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
      /* no "address" argument so destroys page coloring of some arch */
      pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
      {
              pgtable_t pgtable;
      
              assert_spin_locked(pmd_lockptr(mm, pmdp));
      
              /* FIFO */
              pgtable = pmd_huge_pte(mm, pmdp);
              if (list_empty(&pgtable->lru))
                      pmd_huge_pte(mm, pmdp) = NULL;
              else {
                      pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
                                                    struct page, lru);
                      list_del(&pgtable->lru);
              }
              return pgtable;
      }
      #endif
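
/*
 * Usage sketch (illustrative, not taken from this file): THP code
 * deposits a preallocated pte page table when it installs a huge pmd
 * and withdraws it again when the huge pmd is split or zapped, with
 * the pmd lock held as asserted above:
 *
 *     spinlock_t *ptl = pmd_lock(mm, pmdp);
 *     pgtable_trans_huge_deposit(mm, pmdp, pgtable);
 *     ...
 *     pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 *     spin_unlock(ptl);
 */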
      
      #ifndef __HAVE_ARCH_PMDP_INVALIDATE
      void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmdp)
      {
              pmd_t entry = *pmdp;
              set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
      }
      #endif
      
      #ifndef pmdp_collapse_flush
      pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                                pmd_t *pmdp)
      {
              /*
               * pmd and hugepage pte format are same. So we could
               * use the same function.
               */
              pmd_t pmd;
      
              VM_BUG_ON(address & ~HPAGE_PMD_MASK);
              VM_BUG_ON(pmd_trans_huge(*pmdp));
              pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
      
              /* collapse entails shooting down ptes not pmd */
              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
              return pmd;
      }
      #endif
      #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
      #ifndef _NF_NAT_CORE_H
      #define _NF_NAT_CORE_H
      #include <linux/list.h>
      #include <net/netfilter/nf_conntrack.h>
      #include <net/netfilter/nf_nat.h>
      
      /* This header used to share core functionality between the standalone
         NAT module, and the compatibility layer's use of NAT for masquerading. */
      
      unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                                 unsigned int hooknum, struct sk_buff *skb);
      
      int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family);
      
      static inline int nf_nat_initialized(struct nf_conn *ct,
                                           enum nf_nat_manip_type manip)
      {
        if (manip == NF_NAT_MANIP_SRC)
                return ct->status & IPS_SRC_NAT_DONE;
        else
                return ct->status & IPS_DST_NAT_DONE;
      }
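
/*
 * Usage sketch (illustrative, not from the original header): NAT hook
 * functions typically consult this before deciding whether a new
 * binding still has to be set up for the current direction, e.g.:
 *
 *     if (!nf_nat_initialized(ct, NF_NAT_MANIP_SRC)) {
 *             // set up the SNAT binding for this conntrack entry
 *             // (the actual setup call depends on the caller)
 *     }
 */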
      
      struct nlattr;
      
      extern int
      (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
                                        enum nf_nat_manip_type manip,
                                        const struct nlattr *attr);
      
      #endif /* _NF_NAT_CORE_H */
      /*
       * kernel/workqueue.c - generic async execution with shared worker pool
       *
       * Copyright (C) 2002                Ingo Molnar
       *
       *   Derived from the taskqueue/keventd code by:
       *     David Woodhouse <dwmw2@infradead.org>
       *     Andrew Morton
       *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
       *     Theodore Ts'o <tytso@mit.edu>
       *
       * Made to use alloc_percpu by Christoph Lameter.
       *
       * Copyright (C) 2010                SUSE Linux Products GmbH
       * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
       *
 * This is the generic async execution mechanism.  Work items are
       * executed in process context.  The worker pool is shared and
       * automatically managed.  There are two worker pools for each CPU (one for
       * normal work items and the other for high priority ones) and some extra
       * pools for workqueues which are not bound to any specific CPU - the
       * number of these backing pools is dynamic.
       *
       * Please read Documentation/workqueue.txt for details.
       */
      
      #include <linux/export.h>
      #include <linux/kernel.h>
      #include <linux/sched.h>
      #include <linux/init.h>
      #include <linux/signal.h>
      #include <linux/completion.h>
      #include <linux/workqueue.h>
      #include <linux/slab.h>
      #include <linux/cpu.h>
      #include <linux/notifier.h>
      #include <linux/kthread.h>
      #include <linux/hardirq.h>
      #include <linux/mempolicy.h>
      #include <linux/freezer.h>
      #include <linux/kallsyms.h>
      #include <linux/debug_locks.h>
      #include <linux/lockdep.h>
      #include <linux/idr.h>
      #include <linux/jhash.h>
      #include <linux/hashtable.h>
      #include <linux/rculist.h>
      #include <linux/nodemask.h>
      #include <linux/moduleparam.h>
      #include <linux/uaccess.h>
      
      #include "workqueue_internal.h"
      
      enum {
              /*
               * worker_pool flags
               *
               * A bound pool is either associated or disassociated with its CPU.
               * While associated (!DISASSOCIATED), all workers are bound to the
               * CPU and none has %WORKER_UNBOUND set and concurrency management
               * is in effect.
               *
               * While DISASSOCIATED, the cpu may be offline and all workers have
               * %WORKER_UNBOUND set and concurrency management disabled, and may
               * be executing on any CPU.  The pool behaves as an unbound one.
               *
               * Note that DISASSOCIATED should be flipped only while holding
               * attach_mutex to avoid changing binding state while
               * worker_attach_to_pool() is in progress.
               */
              POOL_MANAGER_ACTIVE        = 1 << 0,        /* being managed */
              POOL_DISASSOCIATED        = 1 << 2,        /* cpu can't serve workers */
      
              /* worker flags */
              WORKER_DIE                = 1 << 1,        /* die die die */
              WORKER_IDLE                = 1 << 2,        /* is idle */
              WORKER_PREP                = 1 << 3,        /* preparing to run works */
              WORKER_CPU_INTENSIVE        = 1 << 6,        /* cpu intensive */
              WORKER_UNBOUND                = 1 << 7,        /* worker is unbound */
              WORKER_REBOUND                = 1 << 8,        /* worker was rebound */
      
              WORKER_NOT_RUNNING        = WORKER_PREP | WORKER_CPU_INTENSIVE |
                                        WORKER_UNBOUND | WORKER_REBOUND,
      
              NR_STD_WORKER_POOLS        = 2,                /* # standard pools per cpu */
      
              UNBOUND_POOL_HASH_ORDER        = 6,                /* hashed by pool->attrs */
              BUSY_WORKER_HASH_ORDER        = 6,                /* 64 pointers */
      
              MAX_IDLE_WORKERS_RATIO        = 4,                /* 1/4 of busy can be idle */
              IDLE_WORKER_TIMEOUT        = 300 * HZ,        /* keep idle ones for 5 mins */
      
              MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
                                                      /* call for help after 10ms
                                                         (min two ticks) */
              MAYDAY_INTERVAL                = HZ / 10,        /* and then every 100ms */
        CREATE_COOLDOWN                = HZ,                /* time to breathe after fail */
      
              /*
         * Rescue workers are used only in emergencies and shared by
               * all cpus.  Give MIN_NICE.
               */
              RESCUER_NICE_LEVEL        = MIN_NICE,
              HIGHPRI_NICE_LEVEL        = MIN_NICE,
      
              WQ_NAME_LEN                = 24,
      };
      
      /*
       * Structure fields follow one of the following exclusion rules.
       *
       * I: Modifiable by initialization/destruction paths and read-only for
       *    everyone else.
       *
       * P: Preemption protected.  Disabling preemption is enough and should
       *    only be modified and accessed from the local cpu.
       *
       * L: pool->lock protected.  Access with pool->lock held.
       *
       * X: During normal operation, modification requires pool->lock and should
       *    be done only from local cpu.  Either disabling preemption on local
       *    cpu or grabbing pool->lock is enough for read access.  If
       *    POOL_DISASSOCIATED is set, it's identical to L.
       *
       * A: pool->attach_mutex protected.
       *
       * PL: wq_pool_mutex protected.
       *
       * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
       *
       * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
       *
       * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
       *      sched-RCU for reads.
       *
       * WQ: wq->mutex protected.
       *
       * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
       *
       * MD: wq_mayday_lock protected.
       */
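
/*
 * Illustrative example (not part of the original file) of reading the
 * annotations above: a field marked "L:", such as worker_pool->worklist,
 * may only be touched with the pool lock held, e.g.:
 *
 *     spin_lock_irq(&pool->lock);
 *     list_add_tail(&work->entry, &pool->worklist);
 *     spin_unlock_irq(&pool->lock);
 */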
      
      /* struct worker is defined in workqueue_internal.h */
      
      struct worker_pool {
              spinlock_t                lock;                /* the pool lock */
              int                        cpu;                /* I: the associated cpu */
              int                        node;                /* I: the associated node ID */
              int                        id;                /* I: pool ID */
              unsigned int                flags;                /* X: flags */
      
              struct list_head        worklist;        /* L: list of pending works */
              int                        nr_workers;        /* L: total number of workers */
      
              /* nr_idle includes the ones off idle_list for rebinding */
              int                        nr_idle;        /* L: currently idle ones */
      
              struct list_head        idle_list;        /* X: list of idle workers */
              struct timer_list        idle_timer;        /* L: worker idle timeout */
              struct timer_list        mayday_timer;        /* L: SOS timer for workers */
      
        /* a worker is either on busy_hash or idle_list, or the manager */
              DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                      /* L: hash of busy workers */
      
              /* see manage_workers() for details on the two manager mutexes */
              struct worker                *manager;        /* L: purely informational */
              struct mutex                attach_mutex;        /* attach/detach exclusion */
              struct list_head        workers;        /* A: attached workers */
              struct completion        *detach_completion; /* all workers detached */
      
              struct ida                worker_ida;        /* worker IDs for task name */
      
              struct workqueue_attrs        *attrs;                /* I: worker attributes */
              struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
              int                        refcnt;                /* PL: refcnt for unbound pools */
      
              /*
               * The current concurrency level.  As it's likely to be accessed
               * from other CPUs during try_to_wake_up(), put it in a separate
               * cacheline.
               */
              atomic_t                nr_running ____cacheline_aligned_in_smp;
      
              /*
               * Destruction of pool is sched-RCU protected to allow dereferences
               * from get_work_pool().
               */
              struct rcu_head                rcu;
      } ____cacheline_aligned_in_smp;
      
      /*
       * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
       * of work_struct->data are used for flags and the remaining high bits
       * point to the pwq; thus, pwqs need to be aligned at two's power of the
       * number of flag bits.
       */
      struct pool_workqueue {
              struct worker_pool        *pool;                /* I: the associated pool */
              struct workqueue_struct *wq;                /* I: the owning workqueue */
              int                        work_color;        /* L: current color */
              int                        flush_color;        /* L: flushing color */
              int                        refcnt;                /* L: reference count */
              int                        nr_in_flight[WORK_NR_COLORS];
                                                      /* L: nr of in_flight works */
              int                        nr_active;        /* L: nr of active works */
              int                        max_active;        /* L: max active works */
              struct list_head        delayed_works;        /* L: delayed works */
              struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
              struct list_head        mayday_node;        /* MD: node on wq->maydays */
      
              /*
               * Release of unbound pwq is punted to system_wq.  See put_pwq()
               * and pwq_unbound_release_workfn() for details.  pool_workqueue
               * itself is also sched-RCU protected so that the first pwq can be
               * determined without grabbing wq->mutex.
               */
              struct work_struct        unbound_release_work;
              struct rcu_head                rcu;
      } __aligned(1 << WORK_STRUCT_FLAG_BITS);
      
      /*
       * Structure used to wait for workqueue flush.
       */
      struct wq_flusher {
              struct list_head        list;                /* WQ: list of flushers */
              int                        flush_color;        /* WQ: flush color waiting for */
              struct completion        done;                /* flush completion */
      };
      
      struct wq_device;
      
      /*
       * The externally visible workqueue.  It relays the issued work items to
       * the appropriate worker_pool through its pool_workqueues.
       */
      struct workqueue_struct {
              struct list_head        pwqs;                /* WR: all pwqs of this wq */
              struct list_head        list;                /* PR: list of all workqueues */
      
              struct mutex                mutex;                /* protects this wq */
              int                        work_color;        /* WQ: current work color */
              int                        flush_color;        /* WQ: current flush color */
              atomic_t                nr_pwqs_to_flush; /* flush in progress */
              struct wq_flusher        *first_flusher;        /* WQ: first flusher */
              struct list_head        flusher_queue;        /* WQ: flush waiters */
              struct list_head        flusher_overflow; /* WQ: flush overflow list */
      
              struct list_head        maydays;        /* MD: pwqs requesting rescue */
              struct worker                *rescuer;        /* I: rescue worker */
      
              int                        nr_drainers;        /* WQ: drain in progress */
              int                        saved_max_active; /* WQ: saved pwq max_active */
      
              struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
              struct pool_workqueue        *dfl_pwq;        /* PW: only for unbound wqs */
      
      #ifdef CONFIG_SYSFS
              struct wq_device        *wq_dev;        /* I: for sysfs interface */
      #endif
      #ifdef CONFIG_LOCKDEP
              struct lockdep_map        lockdep_map;
      #endif
              char                        name[WQ_NAME_LEN]; /* I: workqueue name */
      
              /*
               * Destruction of workqueue_struct is sched-RCU protected to allow
               * walking the workqueues list without grabbing wq_pool_mutex.
               * This is used to dump all workqueues from sysrq.
               */
              struct rcu_head                rcu;
      
              /* hot fields used during command issue, aligned to cacheline */
              unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
              struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
              struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
      };
      
      static struct kmem_cache *pwq_cache;
      
      static cpumask_var_t *wq_numa_possible_cpumask;
                                              /* possible CPUs of each node */
      
      static bool wq_disable_numa;
      module_param_named(disable_numa, wq_disable_numa, bool, 0444);
      
      /* see the comment above the definition of WQ_POWER_EFFICIENT */
      static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
      module_param_named(power_efficient, wq_power_efficient, bool, 0444);
      
      static bool wq_numa_enabled;                /* unbound NUMA affinity enabled */
      
      /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
      static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
      
      static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
      static DEFINE_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
      static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
      
      static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
      static bool workqueue_freezing;                /* PL: have wqs started freezing? */
      
      static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
      
      /* the per-cpu worker pools */
      static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                           cpu_worker_pools);
      
      static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */
      
      /* PL: hash of all unbound pools keyed by pool->attrs */
      static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
      
      /* I: attributes used when instantiating standard unbound pools on demand */
      static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
      
      /* I: attributes used when instantiating ordered pools on demand */
      static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
      
      struct workqueue_struct *system_wq __read_mostly;
      EXPORT_SYMBOL(system_wq);
      struct workqueue_struct *system_highpri_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_highpri_wq);
      struct workqueue_struct *system_long_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_long_wq);
      struct workqueue_struct *system_unbound_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_unbound_wq);
      struct workqueue_struct *system_freezable_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_freezable_wq);
      struct workqueue_struct *system_power_efficient_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_power_efficient_wq);
      struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
      EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
      
      static int worker_thread(void *__worker);
      static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
      
      #define CREATE_TRACE_POINTS
      #include <trace/events/workqueue.h>
      
      #define assert_rcu_or_pool_mutex()                                        \
              RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                        \
                               !lockdep_is_held(&wq_pool_mutex),                \
                               "sched RCU or wq_pool_mutex should be held")
      
      #define assert_rcu_or_wq_mutex(wq)                                        \
              RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                        \
                               !lockdep_is_held(&wq->mutex),                        \
                               "sched RCU or wq->mutex should be held")
      
      #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
              RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                        \
                               !lockdep_is_held(&wq->mutex) &&                \
                               !lockdep_is_held(&wq_pool_mutex),                \
                               "sched RCU, wq->mutex or wq_pool_mutex should be held")
      
      #define for_each_cpu_worker_pool(pool, cpu)                                \
              for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];                \
                   (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
                   (pool)++)
      
      /**
       * for_each_pool - iterate through all worker_pools in the system
       * @pool: iteration cursor
       * @pi: integer used for iteration
       *
       * This must be called either with wq_pool_mutex held or sched RCU read
       * locked.  If the pool needs to be used beyond the locking in effect, the
       * caller is responsible for guaranteeing that the pool stays online.
       *
       * The if/else clause exists only for the lockdep assertion and can be
       * ignored.
       */
      #define for_each_pool(pool, pi)                                                \
              idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                      if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                      else
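
/*
 * Usage sketch (illustrative, not from the original file): iterating
 * all pools under the sched-RCU read lock, as required by the comment
 * above.  Pools must be pinned separately if used beyond the read
 * section:
 *
 *     struct worker_pool *pool;
 *     int pi;
 *
 *     rcu_read_lock_sched();
 *     for_each_pool(pool, pi) {
 *             // pool may be inspected here
 *     }
 *     rcu_read_unlock_sched();
 */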
      
      /**
       * for_each_pool_worker - iterate through all workers of a worker_pool
       * @worker: iteration cursor
       * @pool: worker_pool to iterate workers of
       *
 * This must be called with @pool->attach_mutex held.
       *
       * The if/else clause exists only for the lockdep assertion and can be
       * ignored.
       */
      #define for_each_pool_worker(worker, pool)                                \
              list_for_each_entry((worker), &(pool)->workers, node)                \
                      if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
                      else
      
      /**
       * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
       * @pwq: iteration cursor
       * @wq: the target workqueue
       *
       * This must be called either with wq->mutex held or sched RCU read locked.
       * If the pwq needs to be used beyond the locking in effect, the caller is
       * responsible for guaranteeing that the pwq stays online.
       *
       * The if/else clause exists only for the lockdep assertion and can be
       * ignored.
       */
      #define for_each_pwq(pwq, wq)                                                \
              list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)                \
                      if (({ assert_rcu_or_wq_mutex(wq); false; })) { }        \
                      else
      
      #ifdef CONFIG_DEBUG_OBJECTS_WORK
      
      static struct debug_obj_descr work_debug_descr;
      
      static void *work_debug_hint(void *addr)
      {
              return ((struct work_struct *) addr)->func;
      }
      
      /*
       * fixup_init is called when:
       * - an active object is initialized
       */
      static int work_fixup_init(void *addr, enum debug_obj_state state)
      {
              struct work_struct *work = addr;
      
              switch (state) {
              case ODEBUG_STATE_ACTIVE:
                      cancel_work_sync(work);
                      debug_object_init(work, &work_debug_descr);
                      return 1;
              default:
                      return 0;
              }
      }
      
      /*
       * fixup_activate is called when:
       * - an active object is activated
       * - an unknown object is activated (might be a statically initialized object)
       */
      static int work_fixup_activate(void *addr, enum debug_obj_state state)
      {
              struct work_struct *work = addr;
      
              switch (state) {
      
              case ODEBUG_STATE_NOTAVAILABLE:
                      /*
                       * This is not really a fixup. The work struct was
                       * statically initialized. We just make sure that it
                       * is tracked in the object tracker.
                       */
                      if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
                              debug_object_init(work, &work_debug_descr);
                              debug_object_activate(work, &work_debug_descr);
                              return 0;
                      }
                      WARN_ON_ONCE(1);
                      return 0;
      
              case ODEBUG_STATE_ACTIVE:
                      WARN_ON(1);
      
              default:
                      return 0;
              }
      }
      
      /*
       * fixup_free is called when:
       * - an active object is freed
       */
      static int work_fixup_free(void *addr, enum debug_obj_state state)
      {
              struct work_struct *work = addr;
      
              switch (state) {
              case ODEBUG_STATE_ACTIVE:
                      cancel_work_sync(work);
                      debug_object_free(work, &work_debug_descr);
                      return 1;
              default:
                      return 0;
              }
      }
      
      static struct debug_obj_descr work_debug_descr = {
              .name                = "work_struct",
              .debug_hint        = work_debug_hint,
              .fixup_init        = work_fixup_init,
              .fixup_activate        = work_fixup_activate,
              .fixup_free        = work_fixup_free,
      };
      
      static inline void debug_work_activate(struct work_struct *work)
      {
        debug_object_activate(work, &work_debug_descr);
      }
      
      static inline void debug_work_deactivate(struct work_struct *work)
      {
        debug_object_deactivate(work, &work_debug_descr);
      }
      
      void __init_work(struct work_struct *work, int onstack)
      {
        if (onstack)
                debug_object_init_on_stack(work, &work_debug_descr);
        else
                debug_object_init(work, &work_debug_descr);
}
      EXPORT_SYMBOL_GPL(__init_work);
      
      void destroy_work_on_stack(struct work_struct *work)
      {
              debug_object_free(work, &work_debug_descr);
      }
      EXPORT_SYMBOL_GPL(destroy_work_on_stack);
      
      void destroy_delayed_work_on_stack(struct delayed_work *work)
      {
              destroy_timer_on_stack(&work->timer);
              debug_object_free(&work->work, &work_debug_descr);
      }
      EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
      
      #else
      static inline void debug_work_activate(struct work_struct *work) { }
      static inline void debug_work_deactivate(struct work_struct *work) { }
      #endif
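
/*
 * Usage sketch (illustrative, not from the original file): on-stack
 * work items pair INIT_WORK_ONSTACK() with destroy_work_on_stack()
 * so the debug-object tracking above stays consistent.  some_func is
 * a placeholder work function:
 *
 *     struct work_struct barr;
 *
 *     INIT_WORK_ONSTACK(&barr, some_func);
 *     schedule_work(&barr);
 *     flush_work(&barr);
 *     destroy_work_on_stack(&barr);
 */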
      
      /**
 * worker_pool_assign_id - allocate ID and assign it to @pool
       * @pool: the pool pointer of interest
       *
       * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
       * successfully, -errno on failure.
       */
      static int worker_pool_assign_id(struct worker_pool *pool)
      {
              int ret;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                              GFP_KERNEL);
              if (ret >= 0) {
                      pool->id = ret;
                      return 0;
              }
              return ret;
      }
      
      /**
       * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
       * @wq: the target workqueue
       * @node: the node ID
       *
       * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
       * read locked.
       * If the pwq needs to be used beyond the locking in effect, the caller is
       * responsible for guaranteeing that the pwq stays online.
       *
       * Return: The unbound pool_workqueue for @node.
       */
      static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
                                                        int node)
      {
        assert_rcu_or_wq_mutex_or_pool_mutex(wq);
      
              /*
               * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
               * delayed item is pending.  The plan is to keep CPU -> NODE
               * mapping valid and stable across CPU on/offlines.  Once that
               * happens, this workaround can be removed.
               */
              if (unlikely(node == NUMA_NO_NODE))
                      return wq->dfl_pwq;
      
        return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
      }
      
      static unsigned int work_color_to_flags(int color)
      {
              return color << WORK_STRUCT_COLOR_SHIFT;
      }
      
      static int get_work_color(struct work_struct *work)
      {
              return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
                      ((1 << WORK_STRUCT_COLOR_BITS) - 1);
      }
      
      static int work_next_color(int color)
      {
              return (color + 1) % WORK_NR_COLORS;
      }
      
      /*
       * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
       * contain the pointer to the queued pwq.  Once execution starts, the flag
       * is cleared and the high bits contain OFFQ flags and pool ID.
       *
       * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
       * and clear_work_data() can be used to set the pwq, pool or clear
       * work->data.  These functions should only be called while the work is
       * owned - ie. while the PENDING bit is set.
       *
       * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
       * corresponding to a work.  Pool is available once the work has been
       * queued anywhere after initialization until it is sync canceled.  pwq is
       * available only while the work item is queued.
       *
       * %WORK_OFFQ_CANCELING is used to mark a work item which is being
       * canceled.  While being canceled, a work item may have its PENDING set
       * but stay off timer and worklist for arbitrarily long and nobody should
       * try to steal the PENDING bit.
       */
      static inline void set_work_data(struct work_struct *work, unsigned long data,
                                       unsigned long flags)
      {
              WARN_ON_ONCE(!work_pending(work));
        atomic_long_set(&work->data, data | flags | work_static(work));
      }
      
      static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                               unsigned long extra_flags)
      {
        set_work_data(work, (unsigned long)pwq,
                            WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
      }
      
      static void set_work_pool_and_keep_pending(struct work_struct *work,
                                                 int pool_id)
      {
        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
                            WORK_STRUCT_PENDING);
      }
      
      static void set_work_pool_and_clear_pending(struct work_struct *work,
                                                  int pool_id)
      {
              /*
               * The following wmb is paired with the implied mb in
               * test_and_set_bit(PENDING) and ensures all updates to @work made
               * here are visible to and precede any updates by the next PENDING
               * owner.
               */
        smp_wmb();
        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
              /*
               * The following mb guarantees that previous clear of a PENDING bit
               * will not be reordered with any speculative LOADS or STORES from
               * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on an attempt to queue
               * the same @work.  E.g. consider this case:
               *
               *   CPU#0                         CPU#1
               *   ----------------------------  --------------------------------
               *
               * 1  STORE event_indicated
               * 2  queue_work_on() {
               * 3    test_and_set_bit(PENDING)
               * 4 }                             set_..._and_clear_pending() {
               * 5                                 set_work_data() # clear bit
               * 6                                 smp_mb()
               * 7                               work->current_func() {
               * 8                                      LOAD event_indicated
               *                                   }
               *
         * Without an explicit full barrier the speculative LOAD on line 8
         * can be executed before CPU#0 does the STORE on line 1.  If that
         * happens, CPU#0 observes that the PENDING bit is still set and a
         * new execution of @work is not queued, in the hope that CPU#1 will
         * eventually finish the queued @work.  Meanwhile CPU#1 does not see
         * that event_indicated is set, because the speculative LOAD was
         * executed before the actual STORE.
               */
              smp_mb();
      }
      
      static void clear_work_data(struct work_struct *work)
      {
              smp_wmb();        /* see set_work_pool_and_clear_pending() */
        set_work_data(work, WORK_STRUCT_NO_POOL, 0);
      }
      
      static struct pool_workqueue *get_work_pwq(struct work_struct *work)
      {
              unsigned long data = atomic_long_read(&work->data);
      
              if (data & WORK_STRUCT_PWQ)
                return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
              else
                      return NULL;
      }
      
      /**
       * get_work_pool - return the worker_pool a given work was associated with
       * @work: the work item of interest
       *
 * Pools are created and destroyed under wq_pool_mutex, and allow read
 * access under the sched-RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or with preemption disabled.
       *
       * All fields of the returned pool are accessible as long as the above
       * mentioned locking is in effect.  If the returned pool needs to be used
       * beyond the critical section, the caller is responsible for ensuring the
       * returned pool is and stays online.
       *
       * Return: The worker_pool @work was last associated with.  %NULL if none.
       */
      static struct worker_pool *get_work_pool(struct work_struct *work)
      {
        unsigned long data = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        if (data & WORK_STRUCT_PWQ)
                return ((struct pool_workqueue *)
                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool;

        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
        if (pool_id == WORK_OFFQ_POOL_NONE)
                return NULL;

        return idr_find(&worker_pool_idr, pool_id);
      }
      
      /**
       * get_work_pool_id - return the worker pool ID a given work is associated with
       * @work: the work item of interest
       *
       * Return: The worker_pool ID @work was last associated with.
       * %WORK_OFFQ_POOL_NONE if none.
       */
      static int get_work_pool_id(struct work_struct *work)
      {
        unsigned long data = atomic_long_read(&work->data);

        if (data & WORK_STRUCT_PWQ)
                return ((struct pool_workqueue *)
                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;

        return data >> WORK_OFFQ_POOL_SHIFT;
      }
      
      static void mark_work_canceling(struct work_struct *work)
      {
        unsigned long pool_id = get_work_pool_id(work);

        pool_id <<= WORK_OFFQ_POOL_SHIFT;
        set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
      }
      
      static bool work_is_canceling(struct work_struct *work)
      {
        unsigned long data = atomic_long_read(&work->data);
      
              return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
      }
      
      /*
       * Policy functions.  These define the policies on how the global worker
       * pools are managed.  Unless noted otherwise, these functions assume that
       * they're being called with pool->lock held.
       */
      
      static bool __need_more_worker(struct worker_pool *pool)
      {
              return !atomic_read(&pool->nr_running);
      }
      
      /*
       * Need to wake up a worker?  Called from anything but currently
       * running workers.
       *
       * Note that, because unbound workers never contribute to nr_running, this
       * function will always return %true for unbound pools as long as the
       * worklist isn't empty.
       */
      static bool need_more_worker(struct worker_pool *pool)
      {
              return !list_empty(&pool->worklist) && __need_more_worker(pool);
      }
      
      /* Can I start working?  Called from busy but !running workers. */
      static bool may_start_working(struct worker_pool *pool)
      {
              return pool->nr_idle;
      }
      
      /* Do I need to keep working?  Called from currently running workers. */
      static bool keep_working(struct worker_pool *pool)
      {
              return !list_empty(&pool->worklist) &&
                      atomic_read(&pool->nr_running) <= 1;
      }
      
      /* Do we need a new worker?  Called from manager. */
      static bool need_to_create_worker(struct worker_pool *pool)
      {
              return need_more_worker(pool) && !may_start_working(pool);
      }
      
      /* Do we have too many workers and should some go away? */
      static bool too_many_workers(struct worker_pool *pool)
      {
              bool managing = pool->flags & POOL_MANAGER_ACTIVE;
              int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
              int nr_busy = pool->nr_workers - nr_idle;
      
              return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
      }
      
      /*
       * Wake up functions.
       */
      
      /* Return the first idle worker.  Safe with preemption disabled */
      static struct worker *first_idle_worker(struct worker_pool *pool)
      {
        if (unlikely(list_empty(&pool->idle_list)))
                      return NULL;
      
              return list_first_entry(&pool->idle_list, struct worker, entry);
      }
      
      /**
       * wake_up_worker - wake up an idle worker
       * @pool: worker pool to wake worker from
       *
       * Wake up the first idle worker of @pool.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void wake_up_worker(struct worker_pool *pool)
      {
        struct worker *worker = first_idle_worker(pool);

        if (likely(worker))
                wake_up_process(worker->task);
      }
      
      /**
       * wq_worker_waking_up - a worker is waking up
       * @task: task waking up
       * @cpu: CPU @task is waking up to
       *
       * This function is called during try_to_wake_up() when a worker is
       * being awoken.
       *
       * CONTEXT:
       * spin_lock_irq(rq->lock)
       */
      void wq_worker_waking_up(struct task_struct *task, int cpu)
      {
        struct worker *worker = kthread_data(task);

        if (!(worker->flags & WORKER_NOT_RUNNING)) {
                WARN_ON_ONCE(worker->pool->cpu != cpu);
                atomic_inc(&worker->pool->nr_running);
        }
}
      
      /**
       * wq_worker_sleeping - a worker is going to sleep
       * @task: task going to sleep
       * @cpu: CPU in question, must be the current CPU number
       *
       * This function is called during schedule() when a busy worker is
       * going to sleep.  Worker on the same cpu can be woken up by
       * returning pointer to its task.
       *
       * CONTEXT:
       * spin_lock_irq(rq->lock)
       *
       * Return:
       * Worker task on @cpu to wake up, %NULL if none.
       */
      struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
      {
              struct worker *worker = kthread_data(task), *to_wakeup = NULL;
              struct worker_pool *pool;
      
              /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here; let's not access anything before
         * checking NOT_RUNNING.
               */
              if (worker->flags & WORKER_NOT_RUNNING)
                      return NULL;
      
              pool = worker->pool;
      
              /* this can only happen on the local cpu */
              if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
                      return NULL;
      
              /*
               * The counterpart of the following dec_and_test, implied mb,
               * worklist not empty test sequence is in insert_work().
               * Please read comment there.
               *
               * NOT_RUNNING is clear.  This means that we're bound to and
               * running on the local cpu w/ rq lock held and preemption
         * disabled, which in turn means that nobody else could be
               * manipulating idle_list, so dereferencing idle_list without pool
               * lock is safe.
               */
              if (atomic_dec_and_test(&pool->nr_running) &&
                  !list_empty(&pool->worklist))
                      to_wakeup = first_idle_worker(pool);
              return to_wakeup ? to_wakeup->task : NULL;
      }
      
      /**
       * worker_set_flags - set worker flags and adjust nr_running accordingly
       * @worker: self
       * @flags: flags to set
       *
       * Set @flags in @worker->flags and adjust nr_running accordingly.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock)
       */
      static inline void worker_set_flags(struct worker *worker, unsigned int flags)
      {
              struct worker_pool *pool = worker->pool;
      
              WARN_ON_ONCE(worker->task != current);
      
              /* If transitioning into NOT_RUNNING, adjust nr_running. */
              if ((flags & WORKER_NOT_RUNNING) &&
                  !(worker->flags & WORKER_NOT_RUNNING)) {
                      atomic_dec(&pool->nr_running);
              }
      
              worker->flags |= flags;
      }
      
      /**
       * worker_clr_flags - clear worker flags and adjust nr_running accordingly
       * @worker: self
       * @flags: flags to clear
       *
       * Clear @flags in @worker->flags and adjust nr_running accordingly.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock)
       */
      static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
      {
              struct worker_pool *pool = worker->pool;
              unsigned int oflags = worker->flags;
      
              WARN_ON_ONCE(worker->task != current);
      
              worker->flags &= ~flags;
      
              /*
               * If transitioning out of NOT_RUNNING, increment nr_running.  Note
         * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
         * of multiple flags, not a single flag.
               */
              if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
                      if (!(worker->flags & WORKER_NOT_RUNNING))
                              atomic_inc(&pool->nr_running);
      }
      
      /**
       * find_worker_executing_work - find worker which is executing a work
       * @pool: pool of interest
       * @work: work to find worker for
       *
       * Find a worker which is executing @work on @pool by searching
       * @pool->busy_hash which is keyed by the address of @work.  For a worker
       * to match, its current execution should match the address of @work and
       * its work function.  This is to avoid unwanted dependency between
       * unrelated work executions through a work item being recycled while still
       * being executed.
       *
       * This is a bit tricky.  A work item may be freed once its execution
       * starts and nothing prevents the freed area from being recycled for
       * another work item.  If the same work item address ends up being reused
       * before the original execution finishes, workqueue will identify the
       * recycled work item as currently executing and make it wait until the
       * current execution finishes, introducing an unwanted dependency.
       *
       * This function checks the work item address and work function to avoid
       * false positives.  Note that this isn't complete as one may construct a
       * work function which can introduce dependency onto itself through a
       * recycled work item.  Well, if somebody wants to shoot oneself in the
       * foot that badly, there's only so much we can do, and if such deadlock
       * actually occurs, it should be easy to locate the culprit work function.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       *
       * Return:
       * Pointer to worker which is executing @work if found, %NULL
       * otherwise.
       */
      static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                       struct work_struct *work)
      {
              struct worker *worker;
      
        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                              return worker;
      
              return NULL;
      }
      
      /**
       * move_linked_works - move linked works to a list
       * @work: start of series of works to be scheduled
       * @head: target list to append @work to
       * @nextp: out parameter for nested worklist walking
       *
       * Schedule linked works starting from @work to @head.  Work series to
       * be scheduled starts at @work and includes any consecutive work with
       * WORK_STRUCT_LINKED set in its predecessor.
       *
       * If @nextp is not NULL, it's updated to point to the next work of
       * the last scheduled work.  This allows move_linked_works() to be
       * nested inside outer list_for_each_entry_safe().
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void move_linked_works(struct work_struct *work, struct list_head *head,
                                    struct work_struct **nextp)
      {
              struct work_struct *n;
      
              /*
         * A linked worklist will always end before the end of the list;
         * use NULL for the list head.
               */
              list_for_each_entry_safe_from(work, n, NULL, entry) {
                      list_move_tail(&work->entry, head);
                      if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                              break;
              }
      
              /*
               * If we're already inside safe list traversal and have moved
               * multiple works to the scheduled queue, the next position
               * needs to be updated.
               */
              if (nextp)
                      *nextp = n;
      }
      
      /**
       * get_pwq - get an extra reference on the specified pool_workqueue
       * @pwq: pool_workqueue to get
       *
       * Obtain an extra reference on @pwq.  The caller should guarantee that
       * @pwq has positive refcnt and be holding the matching pool->lock.
       */
      static void get_pwq(struct pool_workqueue *pwq)
      {
        lockdep_assert_held(&pwq->pool->lock);
        WARN_ON_ONCE(pwq->refcnt <= 0);
        pwq->refcnt++;
      }
      
      /**
       * put_pwq - put a pool_workqueue reference
       * @pwq: pool_workqueue to put
       *
       * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
       * destruction.  The caller should be holding the matching pool->lock.
       */
      static void put_pwq(struct pool_workqueue *pwq)
      {
        lockdep_assert_held(&pwq->pool->lock);
        if (likely(--pwq->refcnt))
                      return;
              if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
                      return;
              /*
               * @pwq can't be released under pool->lock, bounce to
               * pwq_unbound_release_workfn().  This never recurses on the same
               * pool->lock as this path is taken only for unbound workqueues and
               * the release work item is scheduled on a per-cpu workqueue.  To
               * avoid lockdep warning, unbound pool->locks are given lockdep
               * subclass of 1 in get_unbound_pool().
               */
              schedule_work(&pwq->unbound_release_work);
      }
      
      /**
       * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
       * @pwq: pool_workqueue to put (can be %NULL)
       *
       * put_pwq() with locking.  This function also allows %NULL @pwq.
       */
      static void put_pwq_unlocked(struct pool_workqueue *pwq)
      {
              if (pwq) {
                      /*
                       * As both pwqs and pools are sched-RCU protected, the
                       * following lock operations are safe.
                       */
                      spin_lock_irq(&pwq->pool->lock);
                      put_pwq(pwq);
                      spin_unlock_irq(&pwq->pool->lock);
              }
      }
      
      static void pwq_activate_delayed_work(struct work_struct *work)
      {
              struct pool_workqueue *pwq = get_work_pwq(work);
      
              trace_workqueue_activate_work(work);
              move_linked_works(work, &pwq->pool->worklist, NULL);
              __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
              pwq->nr_active++;
      }
      
      static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
      {
              struct work_struct *work = list_first_entry(&pwq->delayed_works,
                                                          struct work_struct, entry);
      
              pwq_activate_delayed_work(work);
      }
      
      /**
       * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
       * @pwq: pwq of interest
       * @color: color of work which left the queue
       *
       * A work either has completed or is removed from pending queue,
       * decrement nr_in_flight of its pwq and handle workqueue flushing.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
      {
              /* uncolored work items don't participate in flushing or nr_active */
        if (color == WORK_NO_COLOR)
                goto out_put;

        pwq->nr_in_flight[color]--;
      
              pwq->nr_active--;
              if (!list_empty(&pwq->delayed_works)) {
                      /* one down, submit a delayed one */
                      if (pwq->nr_active < pwq->max_active)
                              pwq_activate_first_delayed(pwq);
              }
      
              /* is flush in progress and are we at the flushing tip? */
        if (likely(pwq->flush_color != color))
                      goto out_put;
      
              /* are there still in-flight works? */
              if (pwq->nr_in_flight[color])
                      goto out_put;
      
              /* this pwq is done, clear flush_color */
              pwq->flush_color = -1;
      
              /*
               * If this was the last pwq, wake up the first flusher.  It
               * will handle the rest.
               */
              if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                      complete(&pwq->wq->first_flusher->done);
      out_put:
        put_pwq(pwq);
      }
      
      /**
       * try_to_grab_pending - steal work item from worklist and disable irq
       * @work: work item to steal
       * @is_dwork: @work is a delayed_work
       * @flags: place to store irq state
       *
       * Try to grab PENDING bit of @work.  This function can handle @work in any
       * stable state - idle, on timer or on worklist.
       *
       * Return:
       *  1                if @work was pending and we successfully stole PENDING
       *  0                if @work was idle and we claimed PENDING
       *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
       *  -ENOENT        if someone else is canceling @work, this state may persist
       *                for arbitrarily long
       *
       * Note:
       * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
       * interrupted while holding PENDING and @work off queue, irq must be
       * disabled on entry.  This, combined with delayed_work->timer being
        * irqsafe, ensures that we return -EAGAIN for a finite, short period of time.
       *
        * On a successful (>= 0) return, irq is disabled and the caller is
        * responsible for restoring it using local_irq_restore(*@flags).
       *
       * This function is safe to call from any context including IRQ handler.
       */
      static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
                                     unsigned long *flags)
      {
              struct worker_pool *pool;
              struct pool_workqueue *pwq;
      
  719         local_irq_save(*flags);
      
              /* try to steal the timer if it exists */
              if (is_dwork) {
                      struct delayed_work *dwork = to_delayed_work(work);
      
                      /*
                       * dwork->timer is irqsafe.  If del_timer() fails, it's
                       * guaranteed that the timer is not queued anywhere and not
                       * running on the local CPU.
                       */
  684                 if (likely(del_timer(&dwork->timer)))
                              return 1;
              }
      
              /* try to claim PENDING the normal way */
  617         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
  719                 return 0;
      
              /*
               * The queueing is in progress, or it is already queued. Try to
               * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
               */
  128         pool = get_work_pool(work);
              if (!pool)
                      goto fail;
      
  128         spin_lock(&pool->lock);
              /*
               * work->data is guaranteed to point to pwq only while the work
               * item is queued on pwq->wq, and both updating work->data to point
               * to pwq on queueing and to pool on dequeueing are done under
               * pwq->pool->lock.  This in turn guarantees that, if work->data
               * points to pwq which is associated with a locked pool, the work
               * item is currently queued on that pool.
               */
  126         pwq = get_work_pwq(work);
              if (pwq && pwq->pool == pool) {
  126                 debug_work_deactivate(work);
      
                      /*
                       * A delayed work item cannot be grabbed directly because
                       * it might have linked NO_COLOR work items which, if left
                       * on the delayed_list, will confuse pwq->nr_active
                        * management later on and cause a stall.  Make sure the work
                       * item is activated before grabbing.
                       */
                      if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
                              pwq_activate_delayed_work(work);
      
  126                 list_del_init(&work->entry);
                      pwq_dec_nr_in_flight(pwq, get_work_color(work));
      
                      /* work->data points to pwq iff queued, point to pool */
  126                 set_work_pool_and_keep_pending(work, pool->id);
      
                      spin_unlock(&pool->lock);
                      return 1;
              }
   10         spin_unlock(&pool->lock);
      fail:
   10         local_irq_restore(*flags);
   10         if (work_is_canceling(work))
                      return -ENOENT;
   10         cpu_relax();
              return -EAGAIN;
      }
      
      /**
       * insert_work - insert a work into a pool
       * @pwq: pwq @work belongs to
       * @work: work to insert
       * @head: insertion point
       * @extra_flags: extra WORK_STRUCT_* flags to set
       *
       * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
       * work_struct flags.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                              struct list_head *head, unsigned int extra_flags)
      {
 1221         struct worker_pool *pool = pwq->pool;
      
              /* we own @work, set data and link */
 1221         set_work_pwq(work, pwq, extra_flags);
 1221         list_add_tail(&work->entry, head);
 1221         get_pwq(pwq);
      
              /*
               * Ensure either wq_worker_sleeping() sees the above
               * list_add_tail() or we see zero nr_running to avoid workers lying
               * around lazily while there are works to be processed.
               */
              smp_mb();
      
              if (__need_more_worker(pool))
 1208                 wake_up_worker(pool);
 1221 }
      
      /*
       * Test whether @work is being queued from another work executing on the
       * same workqueue.
       */
      static bool is_chained_work(struct workqueue_struct *wq)
      {
              struct worker *worker;
      
              worker = current_wq_worker();
              /*
                * Return %true iff I'm a worker executing a work item on @wq.  If
               * I'm @worker, it's safe to dereference it without locking.
               */
              return worker && worker->current_pwq->wq == wq;
      }
      
      static void __queue_work(int cpu, struct workqueue_struct *wq,
                               struct work_struct *work)
      {
              struct pool_workqueue *pwq;
              struct worker_pool *last_pool;
              struct list_head *worklist;
              unsigned int work_flags;
 1213         unsigned int req_cpu = cpu;
      
              /*
               * While a work item is PENDING && off queue, a task trying to
               * steal the PENDING will busy-loop waiting for it to either get
               * queued or lose PENDING.  Grabbing PENDING and queueing should
               * happen with IRQ disabled.
               */
              WARN_ON_ONCE(!irqs_disabled());
      
 1213         debug_work_activate(work);
      
              /* if draining, only works from the same workqueue are allowed */
              if (unlikely(wq->flags & __WQ_DRAINING) &&
                  WARN_ON_ONCE(!is_chained_work(wq)))
                      return;
      retry:
 1213         if (req_cpu == WORK_CPU_UNBOUND)
 1187                 cpu = raw_smp_processor_id();
      
              /* pwq which will be used unless @work is executing elsewhere */
 1213         if (!(wq->flags & WQ_UNBOUND))
  772                 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
              else
  525                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
      
              /*
               * If @work was previously on a different pool, it might still be
               * running there, in which case the work needs to be queued on that
               * pool to guarantee non-reentrancy.
               */
 1213         last_pool = get_work_pool(work);
  772         if (last_pool && last_pool != pwq->pool) {
                      struct worker *worker;
      
  569                 spin_lock(&last_pool->lock);
      
 1213                 worker = find_worker_executing_work(last_pool, work);
      
  107                 if (worker && worker->current_pwq->wq == wq) {
  577                         pwq = worker->current_pwq;
                      } else {
                              /* meh... not running there, queue here */
  558                         spin_unlock(&last_pool->lock);
                              spin_lock(&pwq->pool->lock);
                      }
              } else {
 1169                 spin_lock(&pwq->pool->lock);
              }
      
              /*
               * pwq is determined and locked.  For unbound pools, we could have
               * raced with pwq release and it could already be dead.  If its
               * refcnt is zero, repeat pwq selection.  Note that pwqs never die
               * without another pwq replacing it in the numa_pwq_tbl or while
               * work items are executing on it, so the retrying is guaranteed to
               * make forward-progress.
               */
 1213         if (unlikely(!pwq->refcnt)) {
                      if (wq->flags & WQ_UNBOUND) {
                              spin_unlock(&pwq->pool->lock);
                              cpu_relax();
                              goto retry;
                      }
                      /* oops */
                      WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                                wq->name, cpu);
              }
      
              /* pwq determined, queue */
 1213         trace_workqueue_queue_work(req_cpu, pwq, work);
      
 1213         if (WARN_ON(!list_empty(&work->entry))) {
                      spin_unlock(&pwq->pool->lock);
                      return;
              }
      
 1213         pwq->nr_in_flight[pwq->work_color]++;
              work_flags = work_color_to_flags(pwq->work_color);
      
              if (likely(pwq->nr_active < pwq->max_active)) {
 1213                 trace_workqueue_activate_work(work);
 1213                 pwq->nr_active++;
                      worklist = &pwq->pool->worklist;
              } else {
    5                 work_flags |= WORK_STRUCT_DELAYED;
                      worklist = &pwq->delayed_works;
              }
      
 1213         insert_work(pwq, work, worklist, work_flags);
      
 1213         spin_unlock(&pwq->pool->lock);
      }
      
      /**
       * queue_work_on - queue work on specific cpu
       * @cpu: CPU number to execute work on
       * @wq: workqueue to use
       * @work: work to queue
       *
        * We queue the work to a specific CPU; the caller must ensure that
        * the CPU can't go away.
       *
       * Return: %false if @work was already on a queue, %true otherwise.
       */
      bool queue_work_on(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
      {
              bool ret = false;
              unsigned long flags;
      
  762         local_irq_save(flags);
      
  707         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                      __queue_work(cpu, wq, work);
                      ret = true;
              }
      
  762         local_irq_restore(flags);
  762         return ret;
      }
      EXPORT_SYMBOL(queue_work_on);
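
       /*
        * Editor's illustrative sketch (not part of workqueue.c): a minimal
        * caller of queue_work_on() that pins a one-shot work item to CPU 0.
        * The example_* names and the choice of CPU and workqueue are
        * hypothetical.
        */
       static void example_work_fn(struct work_struct *work)
       {
               pr_info("example work ran on cpu%d\n", raw_smp_processor_id());
       }

       static DECLARE_WORK(example_work, example_work_fn);

       static void __maybe_unused example_queue_on_cpu0(struct workqueue_struct *wq)
       {
               /* a %false return means example_work was already pending somewhere */
               if (!queue_work_on(0, wq, &example_work))
                       pr_info("example work was already queued\n");
       }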
      
      void delayed_work_timer_fn(unsigned long __data)
      {
              struct delayed_work *dwork = (struct delayed_work *)__data;
      
              /* should have been called from irqsafe timer with irq already off */
              __queue_work(dwork->cpu, dwork->wq, &dwork->work);
      }
      EXPORT_SYMBOL(delayed_work_timer_fn);
      
      static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                      struct delayed_work *dwork, unsigned long delay)
      {
  747         struct timer_list *timer = &dwork->timer;
              struct work_struct *work = &dwork->work;
      
              WARN_ON_ONCE(!wq);
  747         WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
                           timer->data != (unsigned long)dwork);
  747         WARN_ON_ONCE(timer_pending(timer));
  747         WARN_ON_ONCE(!list_empty(&work->entry));
      
              /*
               * If @delay is 0, queue @dwork->work immediately.  This is for
               * both optimization and correctness.  The earliest @timer can
                * expire is on the closest next tick, and delayed_work users depend
                * on there being no such delay when @delay is 0.
               */
  747         if (!delay) {
  658                 __queue_work(cpu, wq, &dwork->work);
                      return;
              }
      
  143         timer_stats_timer_set_start_info(&dwork->timer);
      
  143         dwork->wq = wq;
              dwork->cpu = cpu;
              timer->expires = jiffies + delay;
      
              if (unlikely(cpu != WORK_CPU_UNBOUND))
                      add_timer_on(timer, cpu);
              else
  747                 add_timer(timer);
      }
      
      /**
       * queue_delayed_work_on - queue work on specific CPU after delay
       * @cpu: CPU number to execute work on
       * @wq: workqueue to use
       * @dwork: work to queue
       * @delay: number of jiffies to wait before queueing
       *
       * Return: %false if @work was already on a queue, %true otherwise.  If
       * @delay is zero and @dwork is idle, it will be scheduled for immediate
       * execution.
       */
      bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                                 struct delayed_work *dwork, unsigned long delay)
      {
              struct work_struct *work = &dwork->work;
              bool ret = false;
              unsigned long flags;
      
              /* read the comment in __queue_work() */
  152         local_irq_save(flags);
      
  137         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                      __queue_delayed_work(cpu, wq, dwork, delay);
                      ret = true;
              }
      
  152         local_irq_restore(flags);
  152         return ret;
      }
      EXPORT_SYMBOL(queue_delayed_work_on);
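
       /*
        * Editor's illustrative sketch (not part of workqueue.c): arming a
        * delayed work on the local CPU's pool via WORK_CPU_UNBOUND.  The
        * example_* names, the use of system_wq and the 100ms delay are
        * hypothetical.
        */
       static void example_dwork_fn(struct work_struct *work)
       {
               pr_info("example delayed work expired\n");
       }

       static DECLARE_DELAYED_WORK(example_dwork, example_dwork_fn);

       static void __maybe_unused example_arm_delayed(void)
       {
               /* WORK_CPU_UNBOUND lets __queue_work() pick the local CPU on expiry */
               queue_delayed_work_on(WORK_CPU_UNBOUND, system_wq, &example_dwork,
                                     msecs_to_jiffies(100));
       }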
      
      /**
       * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
       * @cpu: CPU number to execute work on
       * @wq: workqueue to use
       * @dwork: work to queue
       * @delay: number of jiffies to wait before queueing
       *
       * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
       * modify @dwork's timer so that it expires after @delay.  If @delay is
        * zero, @dwork is guaranteed to be scheduled immediately regardless of its
       * current state.
       *
       * Return: %false if @dwork was idle and queued, %true if @dwork was
       * pending and its timer was modified.
       *
       * This function is safe to call from any context including IRQ handler.
       * See try_to_grab_pending() for details.
       */
      bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                               struct delayed_work *dwork, unsigned long delay)
  673 {
              unsigned long flags;
              int ret;
      
              do {
  673                 ret = try_to_grab_pending(&dwork->work, true, &flags);
              } while (unlikely(ret == -EAGAIN));
      
  673         if (likely(ret >= 0)) {
  673                 __queue_delayed_work(cpu, wq, dwork, delay);
  673                 local_irq_restore(flags);
              }
      
              /* -ENOENT from try_to_grab_pending() becomes %true */
  673         return ret;
      }
      EXPORT_SYMBOL_GPL(mod_delayed_work_on);
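
       /*
        * Editor's illustrative sketch (not part of workqueue.c): the
        * "push the timer back" pattern enabled by mod_delayed_work_on().
        * Each call re-arms the same delayed work, so the handler runs only
        * once activity has been quiet for the (hypothetical) 500ms window.
        * All example_* names are made up for illustration.
        */
       static void example_writeback_fn(struct work_struct *work)
       {
               pr_info("example: flushing buffered state\n");
       }

       static DECLARE_DELAYED_WORK(example_writeback_dwork, example_writeback_fn);

       static void __maybe_unused example_note_activity(void)
       {
               /*
                * Unlike queue_delayed_work_on(), this also pushes back the
                * expiry of an already pending work item, giving debounce
                * semantics.
                */
               mod_delayed_work_on(WORK_CPU_UNBOUND, system_wq,
                                   &example_writeback_dwork,
                                   msecs_to_jiffies(500));
       }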
      
      /**
       * worker_enter_idle - enter idle state
       * @worker: worker which is entering idle state
       *
       * @worker is entering idle state.  Update stats and idle timer if
       * necessary.
       *
       * LOCKING:
       * spin_lock_irq(pool->lock).
       */
      static void worker_enter_idle(struct worker *worker)
      {
              struct worker_pool *pool = worker->pool;
      
              if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
                  WARN_ON_ONCE(!list_empty(&worker->entry) &&
                               (worker->hentry.next || worker->hentry.pprev)))
                      return;
      
              /* can't use worker_set_flags(), also called from create_worker() */
              worker->flags |= WORKER_IDLE;
              pool->nr_idle++;
              worker->last_active = jiffies;
      
              /* idle_list is LIFO */
              list_add(&worker->entry, &pool->idle_list);
      
              if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                      mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
      
              /*
               * Sanity check nr_running.  Because wq_unbind_fn() releases
               * pool->lock between setting %WORKER_UNBOUND and zapping
               * nr_running, the warning may trigger spuriously.  Check iff
               * unbind is not in progress.
               */
              WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                           pool->nr_workers == pool->nr_idle &&
                           atomic_read(&pool->nr_running));
      }
      
      /**
       * worker_leave_idle - leave idle state
       * @worker: worker which is leaving idle state
       *
       * @worker is leaving idle state.  Update stats.
       *
       * LOCKING:
       * spin_lock_irq(pool->lock).
       */
      static void worker_leave_idle(struct worker *worker)
      {
              struct worker_pool *pool = worker->pool;
      
              if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
                      return;
              worker_clr_flags(worker, WORKER_IDLE);
              pool->nr_idle--;
              list_del_init(&worker->entry);
      }
      
      static struct worker *alloc_worker(int node)
      {
              struct worker *worker;
      
   24         worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
              if (worker) {
   24                 INIT_LIST_HEAD(&worker->entry);
                      INIT_LIST_HEAD(&worker->scheduled);
                      INIT_LIST_HEAD(&worker->node);
                      /* on creation a worker is in !idle && prep state */
                      worker->flags = WORKER_PREP;
              }
   24         return worker;
      }
      
      /**
       * worker_attach_to_pool() - attach a worker to a pool
       * @worker: worker to be attached
       * @pool: the target pool
       *
       * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
       * cpu-binding of @worker are kept coordinated with the pool across
       * cpu-[un]hotplugs.
       */
      static void worker_attach_to_pool(struct worker *worker,
                                         struct worker_pool *pool)
      {
              mutex_lock(&pool->attach_mutex);
      
              /*
               * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
               * online CPUs.  It'll be re-applied when any of the CPUs come up.
               */
              set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
      
              /*
               * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
               * stable across this function.  See the comments above the
               * flag definition for details.
               */
              if (pool->flags & POOL_DISASSOCIATED)
                      worker->flags |= WORKER_UNBOUND;
      
              list_add_tail(&worker->node, &pool->workers);
      
              mutex_unlock(&pool->attach_mutex);
      }
      
      /**
       * worker_detach_from_pool() - detach a worker from its pool
       * @worker: worker which is attached to its pool
       * @pool: the pool @worker is attached to
       *
        * Undo the attaching which had been done in worker_attach_to_pool().  The
        * caller worker shouldn't access the pool after it is detached unless it
        * holds another reference to the pool.
       */
      static void worker_detach_from_pool(struct worker *worker,
                                          struct worker_pool *pool)
      {
              struct completion *detach_completion = NULL;
      
              mutex_lock(&pool->attach_mutex);
              list_del(&worker->node);
              if (list_empty(&pool->workers))
                      detach_completion = pool->detach_completion;
              mutex_unlock(&pool->attach_mutex);
      
              /* clear leftover flags without pool->lock after it is detached */
              worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
      
              if (detach_completion)
                      complete(detach_completion);
      }
      
      /**
       * create_worker - create a new workqueue worker
       * @pool: pool the new worker will belong to
       *
       * Create and start a new worker which is attached to @pool.
       *
       * CONTEXT:
       * Might sleep.  Does GFP_KERNEL allocations.
       *
       * Return:
       * Pointer to the newly created worker.
       */
      static struct worker *create_worker(struct worker_pool *pool)
      {
              struct worker *worker = NULL;
              int id = -1;
              char id_buf[16];
      
              /* ID is needed to determine kthread name */
              id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
              if (id < 0)
                      goto fail;
      
              worker = alloc_worker(pool->node);
              if (!worker)
                      goto fail;
      
              worker->pool = pool;
              worker->id = id;
      
              if (pool->cpu >= 0)
                      snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
                               pool->attrs->nice < 0  ? "H" : "");
              else
                      snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
      
              worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                                                    "kworker/%s", id_buf);
              if (IS_ERR(worker->task))
                      goto fail;
      
              set_user_nice(worker->task, pool->attrs->nice);
              kthread_bind_mask(worker->task, pool->attrs->cpumask);
      
              /* successful, attach the worker to the pool */
              worker_attach_to_pool(worker, pool);
      
              /* start the newly created worker */
              spin_lock_irq(&pool->lock);
              worker->pool->nr_workers++;
              worker_enter_idle(worker);
              wake_up_process(worker->task);
              spin_unlock_irq(&pool->lock);
      
              return worker;
      
      fail:
              if (id >= 0)
                      ida_simple_remove(&pool->worker_ida, id);
              kfree(worker);
              return NULL;
      }
      
      /**
       * destroy_worker - destroy a workqueue worker
       * @worker: worker to be destroyed
       *
       * Destroy @worker and adjust @pool stats accordingly.  The worker should
       * be idle.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void destroy_worker(struct worker *worker)
      {
              struct worker_pool *pool = worker->pool;
      
              lockdep_assert_held(&pool->lock);
      
              /* sanity check frenzy */
              if (WARN_ON(worker->current_work) ||
                  WARN_ON(!list_empty(&worker->scheduled)) ||
                  WARN_ON(!(worker->flags & WORKER_IDLE)))
                      return;
      
              pool->nr_workers--;
              pool->nr_idle--;
      
              list_del_init(&worker->entry);
              worker->flags |= WORKER_DIE;
              wake_up_process(worker->task);
      }
      
      static void idle_worker_timeout(unsigned long __pool)
      {
              struct worker_pool *pool = (void *)__pool;
      
              spin_lock_irq(&pool->lock);
      
              while (too_many_workers(pool)) {
                      struct worker *worker;
                      unsigned long expires;
      
                      /* idle_list is kept in LIFO order, check the last one */
                      worker = list_entry(pool->idle_list.prev, struct worker, entry);
                      expires = worker->last_active + IDLE_WORKER_TIMEOUT;
      
                      if (time_before(jiffies, expires)) {
                              mod_timer(&pool->idle_timer, expires);
                              break;
                      }
      
                      destroy_worker(worker);
              }
      
              spin_unlock_irq(&pool->lock);
      }
      
      static void send_mayday(struct work_struct *work)
      {
              struct pool_workqueue *pwq = get_work_pwq(work);
              struct workqueue_struct *wq = pwq->wq;
      
              lockdep_assert_held(&wq_mayday_lock);
      
              if (!wq->rescuer)
                      return;
      
              /* mayday mayday mayday */
              if (list_empty(&pwq->mayday_node)) {
                      /*
                       * If @pwq is for an unbound wq, its base ref may be put at
                       * any time due to an attribute change.  Pin @pwq until the
                       * rescuer is done with it.
                       */
                      get_pwq(pwq);
                      list_add_tail(&pwq->mayday_node, &wq->maydays);
                      wake_up_process(wq->rescuer->task);
              }
      }
      
      static void pool_mayday_timeout(unsigned long __pool)
      {
              struct worker_pool *pool = (void *)__pool;
              struct work_struct *work;
      
              spin_lock_irq(&pool->lock);
              spin_lock(&wq_mayday_lock);                /* for wq->maydays */
      
              if (need_to_create_worker(pool)) {
                      /*
                       * We've been trying to create a new worker but
                       * haven't been successful.  We might be hitting an
                       * allocation deadlock.  Send distress signals to
                       * rescuers.
                       */
                      list_for_each_entry(work, &pool->worklist, entry)
                              send_mayday(work);
              }
      
              spin_unlock(&wq_mayday_lock);
              spin_unlock_irq(&pool->lock);
      
              mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
      }
      
      /**
       * maybe_create_worker - create a new worker if necessary
       * @pool: pool to create a new worker for
       *
       * Create a new worker for @pool if necessary.  @pool is guaranteed to
       * have at least one idle worker on return from this function.  If
        * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
        * sent to all rescuers with work items scheduled on @pool to resolve
        * a possible allocation deadlock.
       *
       * On return, need_to_create_worker() is guaranteed to be %false and
       * may_start_working() %true.
       *
       * LOCKING:
       * spin_lock_irq(pool->lock) which may be released and regrabbed
       * multiple times.  Does GFP_KERNEL allocations.  Called only from
       * manager.
       */
      static void maybe_create_worker(struct worker_pool *pool)
      __releases(&pool->lock)
      __acquires(&pool->lock)
      {
      restart:
              spin_unlock_irq(&pool->lock);
      
              /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
              mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
      
              while (true) {
                      if (create_worker(pool) || !need_to_create_worker(pool))
                              break;
      
                      schedule_timeout_interruptible(CREATE_COOLDOWN);
      
                      if (!need_to_create_worker(pool))
                              break;
              }
      
              del_timer_sync(&pool->mayday_timer);
              spin_lock_irq(&pool->lock);
              /*
               * This is necessary even after a new worker was just successfully
               * created as @pool->lock was dropped and the new worker might have
               * already become busy.
               */
              if (need_to_create_worker(pool))
                      goto restart;
      }
      
      /**
       * manage_workers - manage worker pool
       * @worker: self
       *
       * Assume the manager role and manage the worker pool @worker belongs
       * to.  At any given time, there can be only zero or one manager per
       * pool.  The exclusion is handled automatically by this function.
       *
       * The caller can safely start processing works on false return.  On
       * true return, it's guaranteed that need_to_create_worker() is false
       * and may_start_working() is true.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock) which may be released and regrabbed
       * multiple times.  Does GFP_KERNEL allocations.
       *
       * Return:
       * %false if the pool doesn't need management and the caller can safely
       * start processing works, %true if management function was performed and
       * the conditions that the caller verified before calling the function may
       * no longer be true.
       */
      static bool manage_workers(struct worker *worker)
      {
              struct worker_pool *pool = worker->pool;
      
              if (pool->flags & POOL_MANAGER_ACTIVE)
                      return false;
      
              pool->flags |= POOL_MANAGER_ACTIVE;
              pool->manager = worker;
      
              maybe_create_worker(pool);
      
              pool->manager = NULL;
              pool->flags &= ~POOL_MANAGER_ACTIVE;
              wake_up(&wq_manager_wait);
              return true;
      }
      
      /**
       * process_one_work - process single work
       * @worker: self
       * @work: work to process
       *
        * Process @work.  This function contains all the logic necessary to
        * process a single work item, including synchronization against and
        * interaction with other workers on the same cpu, queueing and
        * flushing.  As long as the context requirement is met, any worker can
        * call this function to process a work item.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock) which is released and regrabbed.
       */
      static void process_one_work(struct worker *worker, struct work_struct *work)
      __releases(&pool->lock)
      __acquires(&pool->lock)
      {
              struct pool_workqueue *pwq = get_work_pwq(work);
              struct worker_pool *pool = worker->pool;
              bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
              int work_color;
              struct worker *collision;
      #ifdef CONFIG_LOCKDEP
              /*
                * It is permissible to free the struct work_struct from
                * inside the function that is called from it; we need to take
                * this into account for lockdep too.  To avoid bogus "held
               * lock freed" warnings as well as problems when looking into
               * work->lockdep_map, make a copy and use that here.
               */
              struct lockdep_map lockdep_map;
      
              lockdep_copy_map(&lockdep_map, &work->lockdep_map);
      #endif
              /* ensure we're on the correct CPU */
              WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                           raw_smp_processor_id() != pool->cpu);
      
              /*
               * A single work shouldn't be executed concurrently by
               * multiple workers on a single cpu.  Check whether anyone is
               * already processing the work.  If so, defer the work to the
               * currently executing one.
               */
              collision = find_worker_executing_work(pool, work);
              if (unlikely(collision)) {
                      move_linked_works(work, &collision->scheduled, NULL);
                      return;
              }
      
              /* claim and dequeue */
              debug_work_deactivate(work);
              hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
              worker->current_work = work;
              worker->current_func = work->func;
              worker->current_pwq = pwq;
              work_color = get_work_color(work);
      
              list_del_init(&work->entry);
      
              /*
               * CPU intensive works don't participate in concurrency management.
               * They're the scheduler's responsibility.  This takes @worker out
               * of concurrency management and the next code block will chain
               * execution of the pending work items.
               */
              if (unlikely(cpu_intensive))
                      worker_set_flags(worker, WORKER_CPU_INTENSIVE);
      
              /*
               * Wake up another worker if necessary.  The condition is always
               * false for normal per-cpu workers since nr_running would always
               * be >= 1 at this point.  This is used to chain execution of the
               * pending work items for WORKER_NOT_RUNNING workers such as the
               * UNBOUND and CPU_INTENSIVE ones.
               */
              if (need_more_worker(pool))
                      wake_up_worker(pool);
      
              /*
               * Record the last pool and clear PENDING which should be the last
               * update to @work.  Also, do this inside @pool->lock so that
               * PENDING and queued state changes happen together while IRQ is
               * disabled.
               */
              set_work_pool_and_clear_pending(work, pool->id);
      
              spin_unlock_irq(&pool->lock);
      
              lock_map_acquire_read(&pwq->wq->lockdep_map);
              lock_map_acquire(&lockdep_map);
              trace_workqueue_execute_start(work);
              worker->current_func(work);
              /*
               * While we must be careful to not use "work" after this, the trace
               * point will only record its address.
               */
              trace_workqueue_execute_end(work);
              lock_map_release(&lockdep_map);
              lock_map_release(&pwq->wq->lockdep_map);
      
              if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
                      pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
                             "     last function: %pf\n",
                             current->comm, preempt_count(), task_pid_nr(current),
                             worker->current_func);
                      debug_show_held_locks(current);
                      dump_stack();
              }
      
              /*
               * The following prevents a kworker from hogging CPU on !PREEMPT
               * kernels, where a requeueing work item waiting for something to
               * happen could deadlock with stop_machine as such work item could
               * indefinitely requeue itself while all other CPUs are trapped in
               * stop_machine. At the same time, report a quiescent RCU state so
               * the same condition doesn't freeze RCU.
               */
              cond_resched_rcu_qs();
      
              spin_lock_irq(&pool->lock);
      
              /* clear cpu intensive status */
              if (unlikely(cpu_intensive))
                      worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
      
              /* we're done with it, release */
              hash_del(&worker->hentry);
              worker->current_work = NULL;
              worker->current_func = NULL;
              worker->current_pwq = NULL;
              worker->desc_valid = false;
              pwq_dec_nr_in_flight(pwq, work_color);
      }
      
      /**
       * process_scheduled_works - process scheduled works
       * @worker: self
       *
       * Process all scheduled works.  Please note that the scheduled list
       * may change while processing a work, so this function repeatedly
       * fetches a work from the top and executes it.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock) which may be released and regrabbed
       * multiple times.
       */
      static void process_scheduled_works(struct worker *worker)
      {
              while (!list_empty(&worker->scheduled)) {
                      struct work_struct *work = list_first_entry(&worker->scheduled,
                                                      struct work_struct, entry);
                      process_one_work(worker, work);
              }
      }
      
      /**
       * worker_thread - the worker thread function
       * @__worker: self
       *
        * The worker thread function.  All workers belong to a worker_pool -
        * either a per-cpu one or a dynamic unbound one.  These workers process all
       * work items regardless of their specific target workqueue.  The only
       * exception is work items which belong to workqueues with a rescuer which
       * will be explained in rescuer_thread().
       *
       * Return: 0
       */
      static int worker_thread(void *__worker)
      {
              struct worker *worker = __worker;
              struct worker_pool *pool = worker->pool;
      
              /* tell the scheduler that this is a workqueue worker */
              worker->task->flags |= PF_WQ_WORKER;
      woke_up:
              spin_lock_irq(&pool->lock);
      
              /* am I supposed to die? */
              if (unlikely(worker->flags & WORKER_DIE)) {
                      spin_unlock_irq(&pool->lock);
                      WARN_ON_ONCE(!list_empty(&worker->entry));
                      worker->task->flags &= ~PF_WQ_WORKER;
      
                      set_task_comm(worker->task, "kworker/dying");
                      ida_simple_remove(&pool->worker_ida, worker->id);
                      worker_detach_from_pool(worker, pool);
                      kfree(worker);
                      return 0;
              }
      
              worker_leave_idle(worker);
      recheck:
              /* no more worker necessary? */
              if (!need_more_worker(pool))
                      goto sleep;
      
              /* do we need to manage? */
              if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                      goto recheck;
      
              /*
               * ->scheduled list can only be filled while a worker is
               * preparing to process a work or actually processing it.
               * Make sure nobody diddled with it while I was sleeping.
               */
              WARN_ON_ONCE(!list_empty(&worker->scheduled));
      
              /*
               * Finish PREP stage.  We're guaranteed to have at least one idle
               * worker or that someone else has already assumed the manager
               * role.  This is where @worker starts participating in concurrency
               * management if applicable and concurrency management is restored
               * after being rebound.  See rebind_workers() for details.
               */
              worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
      
              do {
                      struct work_struct *work =
                              list_first_entry(&pool->worklist,
                                               struct work_struct, entry);
      
                      if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
                              /* optimization path, not strictly necessary */
                              process_one_work(worker, work);
                              if (unlikely(!list_empty(&worker->scheduled)))
                                      process_scheduled_works(worker);
                      } else {
                              move_linked_works(work, &worker->scheduled, NULL);
                              process_scheduled_works(worker);
                      }
              } while (keep_working(pool));
      
              worker_set_flags(worker, WORKER_PREP);
      sleep:
              /*
               * pool->lock is held and there's no work to process and no need to
               * manage, sleep.  Workers are woken up only while holding
               * pool->lock or from local cpu, so setting the current state
               * before releasing pool->lock is enough to prevent losing any
               * event.
               */
              worker_enter_idle(worker);
              __set_current_state(TASK_INTERRUPTIBLE);
              spin_unlock_irq(&pool->lock);
              schedule();
              goto woke_up;
      }
      
      /**
       * rescuer_thread - the rescuer thread function
       * @__rescuer: self
       *
       * Workqueue rescuer thread function.  There's one rescuer for each
       * workqueue which has WQ_MEM_RECLAIM set.
       *
        * Regular work processing on a pool may block trying to create a new
        * worker, which uses a GFP_KERNEL allocation that has a slight chance of
        * developing into a deadlock if some work items currently on the same
        * queue need to be processed to satisfy the GFP_KERNEL allocation.  This
        * is the problem the rescuer solves.
       *
        * When such a condition is possible, the pool summons the rescuers of all
        * workqueues which have work items queued on the pool and lets them
        * process those items so that forward progress can be guaranteed.
       *
       * This should happen rarely.
       *
       * Return: 0
       */
      static int rescuer_thread(void *__rescuer)
      {
              struct worker *rescuer = __rescuer;
              struct workqueue_struct *wq = rescuer->rescue_wq;
              struct list_head *scheduled = &rescuer->scheduled;
              bool should_stop;
      
              set_user_nice(current, RESCUER_NICE_LEVEL);
      
              /*
               * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
               * doesn't participate in concurrency management.
               */
              rescuer->task->flags |= PF_WQ_WORKER;
      repeat:
              set_current_state(TASK_INTERRUPTIBLE);
      
              /*
               * By the time the rescuer is requested to stop, the workqueue
               * shouldn't have any work pending, but @wq->maydays may still have
               * pwq(s) queued.  This can happen by non-rescuer workers consuming
               * all the work items before the rescuer got to them.  Go through
               * @wq->maydays processing before acting on should_stop so that the
               * list is always empty on exit.
               */
              should_stop = kthread_should_stop();
      
              /* see whether any pwq is asking for help */
              spin_lock_irq(&wq_mayday_lock);
      
              while (!list_empty(&wq->maydays)) {
                      struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                              struct pool_workqueue, mayday_node);
                      struct worker_pool *pool = pwq->pool;
                      struct work_struct *work, *n;
      
                      __set_current_state(TASK_RUNNING);
                      list_del_init(&pwq->mayday_node);
      
                      spin_unlock_irq(&wq_mayday_lock);
      
                      worker_attach_to_pool(rescuer, pool);
      
                      spin_lock_irq(&pool->lock);
                      rescuer->pool = pool;
      
                      /*
                       * Slurp in all works issued via this workqueue and
                       * process'em.
                       */
                      WARN_ON_ONCE(!list_empty(scheduled));
                      list_for_each_entry_safe(work, n, &pool->worklist, entry)
                              if (get_work_pwq(work) == pwq)
                                      move_linked_works(work, scheduled, &n);
      
                      if (!list_empty(scheduled)) {
                              process_scheduled_works(rescuer);
      
                              /*
                               * The above execution of rescued work items could
                               * have created more to rescue through
                               * pwq_activate_first_delayed() or chained
                               * queueing.  Let's put @pwq back on mayday list so
                               * that such back-to-back work items, which may be
                               * being used to relieve memory pressure, don't
                                * incur MAYDAY_INTERVAL delay in between.
                               */
                              if (need_to_create_worker(pool)) {
                                      spin_lock(&wq_mayday_lock);
                                      get_pwq(pwq);
                                      list_move_tail(&pwq->mayday_node, &wq->maydays);
                                      spin_unlock(&wq_mayday_lock);
                              }
                      }
      
                      /*
                       * Put the reference grabbed by send_mayday().  @pool won't
                       * go away while we're still attached to it.
                       */
                      put_pwq(pwq);
      
                      /*
                       * Leave this pool.  If need_more_worker() is %true, notify a
                       * regular worker; otherwise, we end up with 0 concurrency
                       * and stalling the execution.
                       */
                      if (need_more_worker(pool))
                              wake_up_worker(pool);
      
                      rescuer->pool = NULL;
                      spin_unlock_irq(&pool->lock);
      
                      worker_detach_from_pool(rescuer, pool);
      
                      spin_lock_irq(&wq_mayday_lock);
              }
      
              spin_unlock_irq(&wq_mayday_lock);
      
              if (should_stop) {
                      __set_current_state(TASK_RUNNING);
                      rescuer->task->flags &= ~PF_WQ_WORKER;
                      return 0;
              }
      
              /* rescuers should never participate in concurrency management */
              WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
              schedule();
              goto repeat;
      }
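
       /*
        * Editor's illustrative sketch (not part of workqueue.c): a rescuer is
        * only created for workqueues allocated with WQ_MEM_RECLAIM, as below.
        * The example_* names and the max_active value of 1 are hypothetical.
        */
       static struct workqueue_struct *example_reclaim_wq;

       static int __maybe_unused example_create_reclaim_wq(void)
       {
               /*
                * WQ_MEM_RECLAIM guarantees a rescuer thread, so work queued
                * here can make forward progress even when worker creation is
                * stuck in memory reclaim.  max_active == 1 allows at most one
                * in-flight work item per pool for this workqueue.
                */
               example_reclaim_wq = alloc_workqueue("example_reclaim",
                                                    WQ_MEM_RECLAIM, 1);
               if (!example_reclaim_wq)
                       return -ENOMEM;
               return 0;
       }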
      
      struct wq_barrier {
              struct work_struct        work;
              struct completion        done;
              struct task_struct        *task;        /* purely informational */
      };
      
      static void wq_barrier_func(struct work_struct *work)
      {
              struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
              complete(&barr->done);
      }
      
      /**
       * insert_wq_barrier - insert a barrier work
       * @pwq: pwq to insert barrier into
       * @barr: wq_barrier to insert
       * @target: target work to attach @barr to
       * @worker: worker currently executing @target, NULL if @target is not executing
       *
       * @barr is linked to @target such that @barr is completed only after
       * @target finishes execution.  Please note that the ordering
       * guarantee is observed only with respect to @target and on the local
       * cpu.
       *
       * Currently, a queued barrier can't be canceled.  This is because
       * try_to_grab_pending() can't determine whether the work to be
        * grabbed is at the head of the queue and thus can't clear the LINKED
        * flag of the previous work, while there must be a valid next work
        * after a work with the LINKED flag set.
       *
       * Note that when @worker is non-NULL, @target may be modified
       * underneath us, so we can't reliably determine pwq from @target.
       *
       * CONTEXT:
       * spin_lock_irq(pool->lock).
       */
      static void insert_wq_barrier(struct pool_workqueue *pwq,
                                    struct wq_barrier *barr,
                                    struct work_struct *target, struct worker *worker)
      {
              struct list_head *head;
              unsigned int linked = 0;
      
              /*
               * debugobject calls are safe here even with pool->lock locked
               * as we know for sure that this will not trigger any of the
               * checks and call back into the fixup functions where we
               * might deadlock.
               */
   86         INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
              __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
              init_completion(&barr->done);
              barr->task = current;
      
              /*
               * If @target is currently being executed, schedule the
               * barrier to the worker; otherwise, put it after @target.
               */
              if (worker)
   22                 head = worker->scheduled.next;
              else {
   75                 unsigned long *bits = work_data_bits(target);
      
                      head = target->entry.next;
                      /* there can already be other linked works, inherit and set */
                      linked = *bits & WORK_STRUCT_LINKED;
                      __set_bit(WORK_STRUCT_LINKED_BIT, bits);
              }
      
   86         debug_work_activate(&barr->work);
              insert_work(pwq, &barr->work, head,
                          work_color_to_flags(WORK_NO_COLOR) | linked);
      }
      
      /**
       * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
       * @wq: workqueue being flushed
       * @flush_color: new flush color, < 0 for no-op
       * @work_color: new work color, < 0 for no-op
       *
       * Prepare pwqs for workqueue flushing.
       *
       * If @flush_color is non-negative, flush_color on all pwqs should be
       * -1.  If no pwq has in-flight commands at the specified color, all
       * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
       * has in flight commands, its pwq->flush_color is set to
       * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
       * wakeup logic is armed and %true is returned.
       *
       * The caller should have initialized @wq->first_flusher prior to
       * calling this function with non-negative @flush_color.  If
       * @flush_color is negative, no flush color update is done and %false
       * is returned.
       *
       * If @work_color is non-negative, all pwqs should have the same
       * work_color which is previous to @work_color and all will be
       * advanced to @work_color.
       *
       * CONTEXT:
       * mutex_lock(wq->mutex).
       *
       * Return:
       * %true if @flush_color >= 0 and there's something to flush.  %false
       * otherwise.
       */
      static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                            int flush_color, int work_color)
   93 {
              bool wait = false;
              struct pool_workqueue *pwq;
      
              if (flush_color >= 0) {
   93                 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
   93                 atomic_set(&wq->nr_pwqs_to_flush, 1);
              }
      
   93         for_each_pwq(pwq, wq) {
   93                 struct worker_pool *pool = pwq->pool;
      
                      spin_lock_irq(&pool->lock);
      
                      if (flush_color >= 0) {
   93                         WARN_ON_ONCE(pwq->flush_color != -1);
      
   93                         if (pwq->nr_in_flight[flush_color]) {
                                      pwq->flush_color = flush_color;
                                      atomic_inc(&wq->nr_pwqs_to_flush);
                                      wait = true;
                              }
                      }
      
   93                 if (work_color >= 0) {
   93                         WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
   93                         pwq->work_color = work_color;
                      }
      
   93                 spin_unlock_irq(&pool->lock);
              }
      
   93         if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
   93                 complete(&wq->first_flusher->done);
      
   93         return wait;
      }
      
      /**
       * flush_workqueue - ensure that any scheduled work has run to completion.
       * @wq: workqueue to flush
       *
       * This function sleeps until all work items which were queued on entry
       * have finished execution, but it is not livelocked by new incoming ones.
       */
      void flush_workqueue(struct workqueue_struct *wq)
      {
   93         struct wq_flusher this_flusher = {
                      .list = LIST_HEAD_INIT(this_flusher.list),
                      .flush_color = -1,
                      .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
              };
              int next_color;
      
   93         lock_map_acquire(&wq->lockdep_map);
   93         lock_map_release(&wq->lockdep_map);
      
              mutex_lock(&wq->mutex);
      
              /*
               * Start-to-wait phase
               */
              next_color = work_next_color(wq->work_color);
      
              if (next_color != wq->flush_color) {
                      /*
                       * Color space is not full.  The current work_color
                       * becomes our flush_color and work_color is advanced
                       * by one.
                       */
   93                 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
   93                 this_flusher.flush_color = wq->work_color;
                      wq->work_color = next_color;
      
                      if (!wq->first_flusher) {
                              /* no flush in progress, become the first flusher */
   93                         WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
      
   93                         wq->first_flusher = &this_flusher;
      
                              if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                             wq->work_color)) {
                                      /* nothing to flush, done */
   93                                 wq->flush_color = next_color;
                                      wq->first_flusher = NULL;
                                      goto out_unlock;
                              }
                      } else {
                              /* wait in queue */
                              WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                              list_add_tail(&this_flusher.list, &wq->flusher_queue);
                              flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                      }
              } else {
                      /*
                       * Oops, color space is full, wait on overflow queue.
                       * The next flush completion will assign us
                       * flush_color and transfer to flusher_queue.
                       */
                      list_add_tail(&this_flusher.list, &wq->flusher_overflow);
              }
      
              mutex_unlock(&wq->mutex);
      
              wait_for_completion(&this_flusher.done);
      
              /*
               * Wake-up-and-cascade phase
               *
               * First flushers are responsible for cascading flushes and
               * handling overflow.  Non-first flushers can simply return.
               */
              if (wq->first_flusher != &this_flusher)
   93                 return;
      
              mutex_lock(&wq->mutex);
      
              /* we might have raced, check again with mutex held */
              if (wq->first_flusher != &this_flusher)
                      goto out_unlock;
      
              wq->first_flusher = NULL;
      
              WARN_ON_ONCE(!list_empty(&this_flusher.list));
              WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
      
              while (true) {
                      struct wq_flusher *next, *tmp;
      
                      /* complete all the flushers sharing the current flush color */
                      list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                              if (next->flush_color != wq->flush_color)
                                      break;
                              list_del_init(&next->list);
                              complete(&next->done);
                      }
      
                      WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                                   wq->flush_color != work_next_color(wq->work_color));
      
                      /* this flush_color is finished, advance by one */
                      wq->flush_color = work_next_color(wq->flush_color);
      
                      /* one color has been freed, handle overflow queue */
                      if (!list_empty(&wq->flusher_overflow)) {
                              /*
                               * Assign the same color to all overflowed
                               * flushers, advance work_color and append to
                               * flusher_queue.  This is the start-to-wait
                               * phase for these overflowed flushers.
                               */
                              list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                      tmp->flush_color = wq->work_color;
      
                              wq->work_color = work_next_color(wq->work_color);
      
                              list_splice_tail_init(&wq->flusher_overflow,
                                                    &wq->flusher_queue);
                              flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                      }
      
                      if (list_empty(&wq->flusher_queue)) {
                              WARN_ON_ONCE(wq->flush_color != wq->work_color);
                              break;
                      }
      
                      /*
                       * Need to flush more colors.  Make the next flusher
                       * the new first flusher and arm pwqs.
                       */
                      WARN_ON_ONCE(wq->flush_color == wq->work_color);
                      WARN_ON_ONCE(wq->flush_color != next->flush_color);
      
                      list_del_init(&next->list);
                      wq->first_flusher = next;
      
                      if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                              break;
      
                      /*
                       * Meh... this color is already done, clear first
                       * flusher and repeat cascading.
                       */
                      wq->first_flusher = NULL;
              }
      
      out_unlock:
   93         mutex_unlock(&wq->mutex);
      }
      EXPORT_SYMBOL(flush_workqueue);
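
       /*
        * Illustrative usage sketch, not part of the kernel source: a caller
        * that wants everything it has queued so far to be finished can pair
        * queue_work() with flush_workqueue().  The names my_wq, my_work and
        * my_work_fn below are hypothetical.
        *
        *      static struct workqueue_struct *my_wq;
        *
        *      static void my_work_fn(struct work_struct *work)
        *      {
        *              // deferred processing goes here
        *      }
        *      static DECLARE_WORK(my_work, my_work_fn);
        *
        *      static void my_sync_point(void)
        *      {
        *              queue_work(my_wq, &my_work);
        *              flush_workqueue(my_wq);  // returns once my_work and every
        *                                       // item queued before this call
        *                                       // have finished executing
        *      }
        */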
      
      /**
       * drain_workqueue - drain a workqueue
       * @wq: workqueue to drain
       *
       * Wait until the workqueue becomes empty.  While draining is in progress,
       * only chain queueing is allowed.  IOW, only currently pending or running
       * work items on @wq can queue further work items on it.  @wq is flushed
        * repeatedly until it becomes empty.  The number of flushes is determined
        * by the depth of chaining and should be relatively small.  Whine if it
       * takes too long.
       */
      void drain_workqueue(struct workqueue_struct *wq)
      {
              unsigned int flush_cnt = 0;
              struct pool_workqueue *pwq;
      
              /*
                * __queue_work() needs to test whether there are drainers.  It is much
                * hotter than drain_workqueue() and already looks at @wq->flags.
                * Use __WQ_DRAINING so that the hot path doesn't have to check nr_drainers.
               */
   32         mutex_lock(&wq->mutex);
              if (!wq->nr_drainers++)
   32                 wq->flags |= __WQ_DRAINING;
   32         mutex_unlock(&wq->mutex);
      reflush:
   32         flush_workqueue(wq);
      
              mutex_lock(&wq->mutex);
      
   32         for_each_pwq(pwq, wq) {
                      bool drained;
      
   32                 spin_lock_irq(&pwq->pool->lock);
   32                 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
   32                 spin_unlock_irq(&pwq->pool->lock);
      
                      if (drained)
                              continue;
      
                      if (++flush_cnt == 10 ||
                          (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                              pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
                                      wq->name, flush_cnt);
      
                      mutex_unlock(&wq->mutex);
                      goto reflush;
              }
      
   32         if (!--wq->nr_drainers)
   32                 wq->flags &= ~__WQ_DRAINING;
   32         mutex_unlock(&wq->mutex);
      }
      EXPORT_SYMBOL_GPL(drain_workqueue);
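
       /*
        * Illustrative sketch (my_wq is hypothetical): drain_workqueue() suits
        * teardown paths where work items may requeue themselves.  A single
        * flush_workqueue() only waits for one generation of work, while
        * draining repeats the flush until the requeueing chains die out.
        *
        *      static void my_teardown(void)
        *      {
        *              drain_workqueue(my_wq);    // self-requeueing items have
        *                                         // finished their chains
        *              destroy_workqueue(my_wq);  // destroy_workqueue() also
        *                                         // drains internally
        *      }
        */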
      
      static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
      {
              struct worker *worker = NULL;
              struct worker_pool *pool;
              struct pool_workqueue *pwq;
      
              might_sleep();
      
              local_irq_disable();
              pool = get_work_pool(work);
              if (!pool) {
  112                 local_irq_enable();
                      return false;
              }
      
   90         spin_lock(&pool->lock);
              /* see the comment in try_to_grab_pending() with the same code */
   75         pwq = get_work_pwq(work);
              if (pwq) {
                      if (unlikely(pwq->pool != pool))
                              goto already_gone;
              } else {
   47                 worker = find_worker_executing_work(pool, work);
                      if (!worker)
                              goto already_gone;
   22                 pwq = worker->current_pwq;
              }
      
   86         insert_wq_barrier(pwq, barr, work, worker);
              spin_unlock_irq(&pool->lock);
      
              /*
               * If @max_active is 1 or rescuer is in use, flushing another work
               * item on the same workqueue may lead to deadlock.  Make sure the
               * flusher is not running on the same workqueue by verifying write
               * access.
               */
   86         if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
   32                 lock_map_acquire(&pwq->wq->lockdep_map);
              else
   54                 lock_map_acquire_read(&pwq->wq->lockdep_map);
   86         lock_map_release(&pwq->wq->lockdep_map);
      
              return true;
      already_gone:
   34         spin_unlock_irq(&pool->lock);
              return false;
      }
      
      /**
       * flush_work - wait for a work to finish executing the last queueing instance
       * @work: the work to flush
       *
       * Wait until @work has finished execution.  @work is guaranteed to be idle
       * on return if it hasn't been requeued since flush started.
       *
       * Return:
       * %true if flush_work() waited for the work to finish execution,
       * %false if it was already idle.
       */
      bool flush_work(struct work_struct *work)
      {
              struct wq_barrier barr;
      
  169         lock_map_acquire(&work->lockdep_map);
  169         lock_map_release(&work->lockdep_map);
      
  169         if (start_flush_work(work, &barr)) {
                      wait_for_completion(&barr.done);
                      destroy_work_on_stack(&barr.work);
  169                 return true;
              } else {
                      return false;
              }
      }
      EXPORT_SYMBOL_GPL(flush_work);
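
       /*
        * Illustrative sketch (my_work is hypothetical): flush_work() waits only
        * for the last queueing instance of one specific work item, which is
        * cheaper than flushing the whole workqueue when only that item matters.
        *
        *      static void my_wait_for_item(void)
        *      {
        *              if (flush_work(&my_work))
        *                      pr_debug("waited for my_work to finish\n");
        *              else
        *                      pr_debug("my_work was already idle\n");
        *      }
        */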
      
      struct cwt_wait {
              wait_queue_t                wait;
              struct work_struct        *work;
      };
      
      static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
      {
              struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
      
              if (cwait->work != key)
                      return 0;
              return autoremove_wake_function(wait, mode, sync, key);
      }
      
      static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
   67 {
              static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
              unsigned long flags;
              int ret;
      
              do {
   67                 ret = try_to_grab_pending(work, is_dwork, &flags);
                      /*
                       * If someone else is already canceling, wait for it to
                       * finish.  flush_work() doesn't work for PREEMPT_NONE
                       * because we may get scheduled between @work's completion
                       * and the other canceling task resuming and clearing
                       * CANCELING - flush_work() will return false immediately
                       * as @work is no longer busy, try_to_grab_pending() will
                       * return -ENOENT as @work is still being canceled and the
                       * other canceling task won't be able to clear CANCELING as
                       * we're hogging the CPU.
                       *
                       * Let's wait for completion using a waitqueue.  As this
                       * may lead to the thundering herd problem, use a custom
                       * wake function which matches @work along with exclusive
                       * wait and wakeup.
                       */
                      if (unlikely(ret == -ENOENT)) {
                              struct cwt_wait cwait;
      
                              init_wait(&cwait.wait);
                              cwait.wait.func = cwt_wakefn;
                              cwait.work = work;
      
                              prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
                                                        TASK_UNINTERRUPTIBLE);
                              if (work_is_canceling(work))
                                      schedule();
                              finish_wait(&cancel_waitq, &cwait.wait);
                      }
   67         } while (unlikely(ret < 0));
      
              /* tell other tasks trying to grab @work to back off */
   67         mark_work_canceling(work);
   67         local_irq_restore(flags);
      
   67         flush_work(work);
   67         clear_work_data(work);
      
              /*
               * Paired with prepare_to_wait() above so that either
               * waitqueue_active() is visible here or !work_is_canceling() is
               * visible there.
               */
              smp_mb();
              if (waitqueue_active(&cancel_waitq))
                      __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
      
   67         return ret;
      }
      
      /**
       * cancel_work_sync - cancel a work and wait for it to finish
       * @work: the work to cancel
       *
       * Cancel @work and wait for its execution to finish.  This function
       * can be used even if the work re-queues itself or migrates to
       * another workqueue.  On return from this function, @work is
       * guaranteed to be not pending or executing on any CPU.
       *
        * cancel_work_sync(&delayed_work->work) must not be used for
        * delayed works.  Use cancel_delayed_work_sync() instead.
       *
       * The caller must ensure that the workqueue on which @work was last
       * queued can't be destroyed before this function returns.
       *
       * Return:
       * %true if @work was pending, %false otherwise.
       */
      bool cancel_work_sync(struct work_struct *work)
      {
   35         return __cancel_work_timer(work, false);
      }
      EXPORT_SYMBOL_GPL(cancel_work_sync);
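
       /*
        * Illustrative sketch (my_work and my_work_fn are hypothetical): a driver
        * remove/unbind path typically uses cancel_work_sync() so the handler is
        * neither pending nor running by the time its data is freed.
        *
        *      static void my_remove(void)
        *      {
        *              cancel_work_sync(&my_work);  // my_work is no longer pending
        *                                           // or executing on any CPU
        *              // now safe to free the data my_work_fn() operates on
        *      }
        */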
      
      /**
       * flush_delayed_work - wait for a dwork to finish executing the last queueing
       * @dwork: the delayed work to flush
       *
       * Delayed timer is cancelled and the pending work is queued for
       * immediate execution.  Like flush_work(), this function only
       * considers the last queueing instance of @dwork.
       *
       * Return:
       * %true if flush_work() waited for the work to finish execution,
       * %false if it was already idle.
       */
      bool flush_delayed_work(struct delayed_work *dwork)
      {
   32         local_irq_disable();
              if (del_timer_sync(&dwork->timer))
                      __queue_work(dwork->cpu, dwork->wq, &dwork->work);
   32         local_irq_enable();
              return flush_work(&dwork->work);
      }
      EXPORT_SYMBOL(flush_delayed_work);
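
       /*
        * Illustrative sketch (my_dwork and my_work_fn are hypothetical):
        * flush_delayed_work() is useful when a timer-deferred update has to be
        * pushed out immediately, e.g. right before reading the state it produces.
        *
        *      static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);
        *
        *      static void my_force_update(void)
        *      {
        *              // my_dwork may have been scheduled earlier with e.g.
        *              // schedule_delayed_work(&my_dwork, msecs_to_jiffies(500));
        *              flush_delayed_work(&my_dwork);  // run any pending instance
        *                                              // now instead of later
        *      }
        */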
      
      /**
       * cancel_delayed_work - cancel a delayed work
       * @dwork: delayed_work to cancel
       *
       * Kill off a pending delayed_work.
       *
       * Return: %true if @dwork was pending and canceled; %false if it wasn't
       * pending.
       *
       * Note:
       * The work callback function may still be running on return, unless
       * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
       * use cancel_delayed_work_sync() to wait on it.
       *
       * This function is safe to call from any context including IRQ handler.
       */
      bool cancel_delayed_work(struct delayed_work *dwork)
   36 {
              unsigned long flags;
              int ret;
      
              do {
   36                 ret = try_to_grab_pending(&dwork->work, true, &flags);
              } while (unlikely(ret == -EAGAIN));
      
   36         if (unlikely(ret < 0))
                      return false;
      
   36         set_work_pool_and_clear_pending(&dwork->work,
                                              get_work_pool_id(&dwork->work));
   36         local_irq_restore(flags);
   36         return ret;
      }
      EXPORT_SYMBOL(cancel_delayed_work);
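
       /*
        * Illustrative sketch (my_dwork is hypothetical): cancel_delayed_work()
        * is the cheap, IRQ-safe variant.  It only guarantees that the pending
        * (timer) instance is gone, not that a running callback has finished.
        *
        *      static void my_stop_timer_only(void)
        *      {
        *              if (cancel_delayed_work(&my_dwork))
        *                      pr_debug("pending my_dwork cancelled\n");
        *              // the callback may still be running; use
        *              // cancel_delayed_work_sync() if that matters
        *      }
        */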
      
      /**
       * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
        * @dwork: the delayed work to cancel
       *
       * This is cancel_work_sync() for delayed works.
       *
       * Return:
       * %true if @dwork was pending, %false otherwise.
       */
      bool cancel_delayed_work_sync(struct delayed_work *dwork)
      {
   32         return __cancel_work_timer(&dwork->work, true);
      }
      EXPORT_SYMBOL(cancel_delayed_work_sync);
      
      /**
       * schedule_on_each_cpu - execute a function synchronously on each online CPU
       * @func: the function to call
       *
       * schedule_on_each_cpu() executes @func on each online CPU using the
       * system workqueue and blocks until all CPUs have completed.
       * schedule_on_each_cpu() is very slow.
       *
       * Return:
       * 0 on success, -errno on failure.
       */
      int schedule_on_each_cpu(work_func_t func)
      {
              int cpu;
              struct work_struct __percpu *works;
      
              works = alloc_percpu(struct work_struct);
              if (!works)
                      return -ENOMEM;
      
              get_online_cpus();
      
              for_each_online_cpu(cpu) {
                      struct work_struct *work = per_cpu_ptr(works, cpu);
      
                      INIT_WORK(work, func);
                      schedule_work_on(cpu, work);
              }
      
              for_each_online_cpu(cpu)
                      flush_work(per_cpu_ptr(works, cpu));
      
              put_online_cpus();
              free_percpu(works);
              return 0;
      }
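
       /*
        * Illustrative sketch (my_drain_cpu_cache is hypothetical): a typical
        * caller uses schedule_on_each_cpu() to run a short per-CPU maintenance
        * routine everywhere and proceed only after every CPU has finished.
        *
        *      static void my_drain_cpu_cache(struct work_struct *unused)
        *      {
        *              // runs once on each online CPU, in worker context
        *      }
        *
        *      static int my_drain_all(void)
        *      {
        *              return schedule_on_each_cpu(my_drain_cpu_cache);
        *      }
        */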
      
      /**
       * execute_in_process_context - reliably execute the routine with user context
       * @fn:                the function to execute
       * @ew:                guaranteed storage for the execute work structure (must
       *                be available when the work executes)
       *
       * Executes the function immediately if process context is available,
       * otherwise schedules the function for delayed execution.
       *
       * Return:        0 - function was executed
       *                1 - function was scheduled for execution
       */
      int execute_in_process_context(work_func_t fn, struct execute_work *ew)
      {
              if (!in_interrupt()) {
                      fn(&ew->work);
                      return 0;
              }
      
              INIT_WORK(&ew->work, fn);
              schedule_work(&ew->work);
      
              return 1;
      }
      EXPORT_SYMBOL_GPL(execute_in_process_context);
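
       /*
        * Illustrative sketch (struct my_obj, my_cleanup and my_release are
        * hypothetical): callers that may run in interrupt context but need
        * process context for the real work hand off via
        * execute_in_process_context().  The execute_work storage must outlive
        * the deferred execution, so it usually lives in the object itself.
        *
        *      struct my_obj {
        *              struct execute_work ew;
        *              // other fields
        *      };
        *
        *      static void my_cleanup(struct work_struct *work)
        *      {
        *              struct my_obj *obj = container_of(work, struct my_obj, ew.work);
        *              // process-context cleanup of obj
        *      }
        *
        *      static void my_release(struct my_obj *obj)
        *      {
        *              execute_in_process_context(my_cleanup, &obj->ew);
        *      }
        */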
      
      /**
       * free_workqueue_attrs - free a workqueue_attrs
       * @attrs: workqueue_attrs to free
       *
       * Undo alloc_workqueue_attrs().
       */
      void free_workqueue_attrs(struct workqueue_attrs *attrs)
      {
              if (attrs) {
                      free_cpumask_var(attrs->cpumask);
                      kfree(attrs);
              }
      }
      
      /**
       * alloc_workqueue_attrs - allocate a workqueue_attrs
       * @gfp_mask: allocation mask to use
       *
       * Allocate a new workqueue_attrs, initialize with default settings and
       * return it.
       *
        * Return: The allocated new workqueue_attrs on success.  %NULL on failure.
       */
      struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
      {
              struct workqueue_attrs *attrs;
      
              attrs = kzalloc(sizeof(*attrs), gfp_mask);
              if (!attrs)
                      goto fail;
              if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
                      goto fail;
      
              cpumask_copy(attrs->cpumask, cpu_possible_mask);
              return attrs;
      fail:
              free_workqueue_attrs(attrs);
              return NULL;
      }
      
      static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                       const struct workqueue_attrs *from)
      {
              to->nice = from->nice;
              cpumask_copy(to->cpumask, from->cpumask);
              /*
               * Unlike hash and equality test, this function doesn't ignore
               * ->no_numa as it is used for both pool and wq attrs.  Instead,
               * get_unbound_pool() explicitly clears ->no_numa after copying.
               */
              to->no_numa = from->no_numa;
      }
      
      /* hash value of the content of @attr */
      static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
      {
              u32 hash = 0;
      
              hash = jhash_1word(attrs->nice, hash);
              hash = jhash(cpumask_bits(attrs->cpumask),
                           BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
              return hash;
      }
      
      /* content equality test */
      static bool wqattrs_equal(const struct workqueue_attrs *a,
                                const struct workqueue_attrs *b)
      {
              if (a->nice != b->nice)
                      return false;
              if (!cpumask_equal(a->cpumask, b->cpumask))
                      return false;
              return true;
      }
      
      /**
       * init_worker_pool - initialize a newly zalloc'd worker_pool
       * @pool: worker_pool to initialize
       *
       * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
       *
       * Return: 0 on success, -errno on failure.  Even on failure, all fields
       * inside @pool proper are initialized and put_unbound_pool() can be called
       * on @pool safely to release it.
       */
      static int init_worker_pool(struct worker_pool *pool)
      {
              spin_lock_init(&pool->lock);
              pool->id = -1;
              pool->cpu = -1;
              pool->node = NUMA_NO_NODE;
              pool->flags |= POOL_DISASSOCIATED;
              INIT_LIST_HEAD(&pool->worklist);
              INIT_LIST_HEAD(&pool->idle_list);
              hash_init(pool->busy_hash);
      
              init_timer_deferrable(&pool->idle_timer);
              pool->idle_timer.function = idle_worker_timeout;
              pool->idle_timer.data = (unsigned long)pool;
      
              setup_timer(&pool->mayday_timer, pool_mayday_timeout,
                          (unsigned long)pool);
      
              mutex_init(&pool->attach_mutex);
              INIT_LIST_HEAD(&pool->workers);
      
              ida_init(&pool->worker_ida);
              INIT_HLIST_NODE(&pool->hash_node);
              pool->refcnt = 1;
      
              /* shouldn't fail above this point */
              pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
              if (!pool->attrs)
                      return -ENOMEM;
              return 0;
      }
      
      static void rcu_free_wq(struct rcu_head *rcu)
      {
              struct workqueue_struct *wq =
                      container_of(rcu, struct workqueue_struct, rcu);
      
              if (!(wq->flags & WQ_UNBOUND))
                      free_percpu(wq->cpu_pwqs);
              else
                      free_workqueue_attrs(wq->unbound_attrs);
      
              kfree(wq->rescuer);
              kfree(wq);
      }
      
      static void rcu_free_pool(struct rcu_head *rcu)
      {
              struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
      
              ida_destroy(&pool->worker_ida);
              free_workqueue_attrs(pool->attrs);
              kfree(pool);
      }
      
      /**
       * put_unbound_pool - put a worker_pool
       * @pool: worker_pool to put
       *
        * Put @pool.  If its refcnt reaches zero, it gets destroyed in a sched-RCU
        * safe manner.  get_unbound_pool() calls this function on its failure path
       * and this function should be able to release pools which went through,
       * successfully or not, init_worker_pool().
       *
       * Should be called with wq_pool_mutex held.
       */
      static void put_unbound_pool(struct worker_pool *pool)
      {
              DECLARE_COMPLETION_ONSTACK(detach_completion);
              struct worker *worker;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              if (--pool->refcnt)
                      return;
      
              /* sanity checks */
              if (WARN_ON(!(pool->cpu < 0)) ||
                  WARN_ON(!list_empty(&pool->worklist)))
                      return;
      
              /* release id and unhash */
              if (pool->id >= 0)
                      idr_remove(&worker_pool_idr, pool->id);
              hash_del(&pool->hash_node);
      
              /*
               * Become the manager and destroy all workers.  This prevents
               * @pool's workers from blocking on attach_mutex.  We're the last
               * manager and @pool gets freed with the flag set.
               */
              spin_lock_irq(&pool->lock);
              wait_event_lock_irq(wq_manager_wait,
                                  !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
              pool->flags |= POOL_MANAGER_ACTIVE;
      
              while ((worker = first_idle_worker(pool)))
                      destroy_worker(worker);
              WARN_ON(pool->nr_workers || pool->nr_idle);
              spin_unlock_irq(&pool->lock);
      
              mutex_lock(&pool->attach_mutex);
              if (!list_empty(&pool->workers))
                      pool->detach_completion = &detach_completion;
              mutex_unlock(&pool->attach_mutex);
      
              if (pool->detach_completion)
                      wait_for_completion(pool->detach_completion);
      
              /* shut down the timers */
              del_timer_sync(&pool->idle_timer);
              del_timer_sync(&pool->mayday_timer);
      
              /* sched-RCU protected to allow dereferences from get_work_pool() */
              call_rcu_sched(&pool->rcu, rcu_free_pool);
      }
      
      /**
       * get_unbound_pool - get a worker_pool with the specified attributes
       * @attrs: the attributes of the worker_pool to get
       *
       * Obtain a worker_pool which has the same attributes as @attrs, bump the
       * reference count and return it.  If there already is a matching
       * worker_pool, it will be used; otherwise, this function attempts to
       * create a new one.
       *
       * Should be called with wq_pool_mutex held.
       *
       * Return: On success, a worker_pool with the same attributes as @attrs.
       * On failure, %NULL.
       */
      static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
      {
              u32 hash = wqattrs_hash(attrs);
              struct worker_pool *pool;
              int node;
              int target_node = NUMA_NO_NODE;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              /* do we already have a matching pool? */
              hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                      if (wqattrs_equal(pool->attrs, attrs)) {
                              pool->refcnt++;
                              return pool;
                      }
              }
      
              /* if cpumask is contained inside a NUMA node, we belong to that node */
              if (wq_numa_enabled) {
                      for_each_node(node) {
                              if (cpumask_subset(attrs->cpumask,
                                                 wq_numa_possible_cpumask[node])) {
                                      target_node = node;
                                      break;
                              }
                      }
              }
      
              /* nope, create a new one */
              pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
              if (!pool || init_worker_pool(pool) < 0)
                      goto fail;
      
              lockdep_set_subclass(&pool->lock, 1);        /* see put_pwq() */
              copy_workqueue_attrs(pool->attrs, attrs);
              pool->node = target_node;
      
              /*
               * no_numa isn't a worker_pool attribute, always clear it.  See
               * 'struct workqueue_attrs' comments for detail.
               */
              pool->attrs->no_numa = false;
      
              if (worker_pool_assign_id(pool) < 0)
                      goto fail;
      
              /* create and start the initial worker */
              if (!create_worker(pool))
                      goto fail;
      
              /* install */
              hash_add(unbound_pool_hash, &pool->hash_node, hash);
      
              return pool;
      fail:
              if (pool)
                      put_unbound_pool(pool);
              return NULL;
      }
      
      static void rcu_free_pwq(struct rcu_head *rcu)
      {
              kmem_cache_free(pwq_cache,
                              container_of(rcu, struct pool_workqueue, rcu));
      }
      
      /*
       * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
       * and needs to be destroyed.
       */
      static void pwq_unbound_release_workfn(struct work_struct *work)
      {
              struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                        unbound_release_work);
              struct workqueue_struct *wq = pwq->wq;
              struct worker_pool *pool = pwq->pool;
              bool is_last;
      
              if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
                      return;
      
              mutex_lock(&wq->mutex);
              list_del_rcu(&pwq->pwqs_node);
              is_last = list_empty(&wq->pwqs);
              mutex_unlock(&wq->mutex);
      
              mutex_lock(&wq_pool_mutex);
              put_unbound_pool(pool);
              mutex_unlock(&wq_pool_mutex);
      
              call_rcu_sched(&pwq->rcu, rcu_free_pwq);
      
              /*
               * If we're the last pwq going away, @wq is already dead and no one
               * is gonna access it anymore.  Schedule RCU free.
               */
              if (is_last)
                      call_rcu_sched(&wq->rcu, rcu_free_wq);
      }
      
      /**
       * pwq_adjust_max_active - update a pwq's max_active to the current setting
       * @pwq: target pool_workqueue
       *
       * If @pwq isn't freezing, set @pwq->max_active to the associated
       * workqueue's saved_max_active and activate delayed work items
       * accordingly.  If @pwq is freezing, clear @pwq->max_active to zero.
       */
      static void pwq_adjust_max_active(struct pool_workqueue *pwq)
      {
   24         struct workqueue_struct *wq = pwq->wq;
              bool freezable = wq->flags & WQ_FREEZABLE;
      
              /* for @wq->saved_max_active */
   24         lockdep_assert_held(&wq->mutex);
      
              /* fast exit for non-freezable wqs */
   24         if (!freezable && pwq->max_active == wq->saved_max_active)
                      return;
      
   24         spin_lock_irq(&pwq->pool->lock);
      
              /*
               * During [un]freezing, the caller is responsible for ensuring that
               * this function is called at least once after @workqueue_freezing
               * is updated and visible.
               */
              if (!freezable || !workqueue_freezing) {
   24                 pwq->max_active = wq->saved_max_active;
      
                      while (!list_empty(&pwq->delayed_works) &&
                             pwq->nr_active < pwq->max_active)
                              pwq_activate_first_delayed(pwq);
      
                      /*
                       * Need to kick a worker after thawed or an unbound wq's
                       * max_active is bumped.  It's a slow path.  Do it always.
                       */
   24                 wake_up_worker(pwq->pool);
              } else {
                      pwq->max_active = 0;
              }
      
   24         spin_unlock_irq(&pwq->pool->lock);
      }
      
      /* initialize newly alloced @pwq which is associated with @wq and @pool */
      static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                           struct worker_pool *pool)
      {
   24         BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
      
   24         memset(pwq, 0, sizeof(*pwq));
      
              pwq->pool = pool;
              pwq->wq = wq;
              pwq->flush_color = -1;
              pwq->refcnt = 1;
              INIT_LIST_HEAD(&pwq->delayed_works);
              INIT_LIST_HEAD(&pwq->pwqs_node);
              INIT_LIST_HEAD(&pwq->mayday_node);
              INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
      }
      
      /* sync @pwq with the current state of its associated wq and link it */
      static void link_pwq(struct pool_workqueue *pwq)
      {
   24         struct workqueue_struct *wq = pwq->wq;
      
   24         lockdep_assert_held(&wq->mutex);
      
              /* may be called multiple times, ignore if already linked */
   24         if (!list_empty(&pwq->pwqs_node))
                      return;
      
              /* set the matching work_color */
   24         pwq->work_color = wq->work_color;
      
              /* sync max_active to the current setting */
              pwq_adjust_max_active(pwq);
      
              /* link in @pwq */
   24         list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
      }
      
      /* obtain a pool matching @attr and create a pwq associating the pool and @wq */
      static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                              const struct workqueue_attrs *attrs)
      {
              struct worker_pool *pool;
              struct pool_workqueue *pwq;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              pool = get_unbound_pool(attrs);
              if (!pool)
                      return NULL;
      
              pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
              if (!pwq) {
                      put_unbound_pool(pool);
                      return NULL;
              }
      
              init_pwq(pwq, wq, pool);
              return pwq;
      }
      
      /**
       * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
       * @attrs: the wq_attrs of the default pwq of the target workqueue
       * @node: the target NUMA node
       * @cpu_going_down: if >= 0, the CPU to consider as offline
       * @cpumask: outarg, the resulting cpumask
       *
       * Calculate the cpumask a workqueue with @attrs should use on @node.  If
       * @cpu_going_down is >= 0, that cpu is considered offline during
       * calculation.  The result is stored in @cpumask.
       *
       * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
       * enabled and @node has online CPUs requested by @attrs, the returned
       * cpumask is the intersection of the possible CPUs of @node and
       * @attrs->cpumask.
       *
       * The caller is responsible for ensuring that the cpumask of @node stays
       * stable.
       *
       * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
       * %false if equal.
       */
      static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
                                       int cpu_going_down, cpumask_t *cpumask)
      {
              if (!wq_numa_enabled || attrs->no_numa)
                      goto use_dfl;
      
              /* does @node have any online CPUs @attrs wants? */
              cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
              if (cpu_going_down >= 0)
                      cpumask_clear_cpu(cpu_going_down, cpumask);
      
              if (cpumask_empty(cpumask))
                      goto use_dfl;
      
              /* yeap, return possible CPUs in @node that @attrs wants */
              cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
              return !cpumask_equal(cpumask, attrs->cpumask);
      
      use_dfl:
              cpumask_copy(cpumask, attrs->cpumask);
              return false;
      }
      
      /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
      static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
                                                         int node,
                                                         struct pool_workqueue *pwq)
      {
              struct pool_workqueue *old_pwq;
      
              lockdep_assert_held(&wq_pool_mutex);
              lockdep_assert_held(&wq->mutex);
      
              /* link_pwq() can handle duplicate calls */
              link_pwq(pwq);
      
              old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
              rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
              return old_pwq;
      }
      
      /* context to store the prepared attrs & pwqs before applying */
      struct apply_wqattrs_ctx {
              struct workqueue_struct        *wq;                /* target workqueue */
              struct workqueue_attrs        *attrs;                /* attrs to apply */
              struct list_head        list;                /* queued for batching commit */
              struct pool_workqueue        *dfl_pwq;
              struct pool_workqueue        *pwq_tbl[];
      };
      
      /* free the resources after success or abort */
      static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
      {
              if (ctx) {
                      int node;
      
                      for_each_node(node)
                              put_pwq_unlocked(ctx->pwq_tbl[node]);
                      put_pwq_unlocked(ctx->dfl_pwq);
      
                      free_workqueue_attrs(ctx->attrs);
      
                      kfree(ctx);
              }
      }
      
      /* allocate the attrs and pwqs for later installation */
      static struct apply_wqattrs_ctx *
      apply_wqattrs_prepare(struct workqueue_struct *wq,
                            const struct workqueue_attrs *attrs)
      {
              struct apply_wqattrs_ctx *ctx;
              struct workqueue_attrs *new_attrs, *tmp_attrs;
              int node;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
                            GFP_KERNEL);
      
              new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
              tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
              if (!ctx || !new_attrs || !tmp_attrs)
                      goto out_free;
      
              /*
               * Calculate the attrs of the default pwq.
                * If the user-configured cpumask doesn't overlap with the
                * wq_unbound_cpumask, we fall back to the wq_unbound_cpumask.
               */
              copy_workqueue_attrs(new_attrs, attrs);
              cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
              if (unlikely(cpumask_empty(new_attrs->cpumask)))
                      cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
      
              /*
               * We may create multiple pwqs with differing cpumasks.  Make a
               * copy of @new_attrs which will be modified and used to obtain
               * pools.
               */
              copy_workqueue_attrs(tmp_attrs, new_attrs);
      
              /*
               * If something goes wrong during CPU up/down, we'll fall back to
                * the default pwq covering the whole @attrs->cpumask.  Always create
               * it even if we don't use it immediately.
               */
              ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
              if (!ctx->dfl_pwq)
                      goto out_free;
      
              for_each_node(node) {
                      if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
                              ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
                              if (!ctx->pwq_tbl[node])
                                      goto out_free;
                      } else {
                              ctx->dfl_pwq->refcnt++;
                              ctx->pwq_tbl[node] = ctx->dfl_pwq;
                      }
              }
      
              /* save the user configured attrs and sanitize it. */
              copy_workqueue_attrs(new_attrs, attrs);
              cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
              ctx->attrs = new_attrs;
      
              ctx->wq = wq;
              free_workqueue_attrs(tmp_attrs);
              return ctx;
      
      out_free:
              free_workqueue_attrs(tmp_attrs);
              free_workqueue_attrs(new_attrs);
              apply_wqattrs_cleanup(ctx);
              return NULL;
      }
      
      /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
      static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
      {
              int node;
      
              /* all pwqs have been created successfully, let's install'em */
              mutex_lock(&ctx->wq->mutex);
      
              copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
      
              /* save the previous pwq and install the new one */
              for_each_node(node)
                      ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
                                                                ctx->pwq_tbl[node]);
      
              /* @dfl_pwq might not have been used, ensure it's linked */
              link_pwq(ctx->dfl_pwq);
              swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
      
              mutex_unlock(&ctx->wq->mutex);
      }
      
      static void apply_wqattrs_lock(void)
      {
              /* CPUs should stay stable across pwq creations and installations */
              get_online_cpus();
              mutex_lock(&wq_pool_mutex);
      }
      
      static void apply_wqattrs_unlock(void)
      {
              mutex_unlock(&wq_pool_mutex);
              put_online_cpus();
      }
      
      static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                              const struct workqueue_attrs *attrs)
      {
              struct apply_wqattrs_ctx *ctx;
              int ret = -ENOMEM;
      
              /* only unbound workqueues can change attributes */
              if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                      return -EINVAL;
      
              /* creating multiple pwqs breaks ordering guarantee */
              if (!list_empty(&wq->pwqs)) {
                      if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                              return -EINVAL;
      
                      wq->flags &= ~__WQ_ORDERED;
              }
      
              ctx = apply_wqattrs_prepare(wq, attrs);
      
              /* the ctx has been prepared successfully, let's commit it */
              if (ctx) {
                      apply_wqattrs_commit(ctx);
                      ret = 0;
              }
      
              apply_wqattrs_cleanup(ctx);
      
              return ret;
      }
      
      /**
       * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
       * @wq: the target workqueue
       * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
       *
       * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
       * machines, this function maps a separate pwq to each NUMA node with
        * possible CPUs in @attrs->cpumask so that work items are affine to the
        * NUMA node they were issued on.  Older pwqs are released as in-flight work
       * items finish.  Note that a work item which repeatedly requeues itself
       * back-to-back will stay on its current pwq.
       *
       * Performs GFP_KERNEL allocations.
       *
       * Return: 0 on success and -errno on failure.
       */
      int apply_workqueue_attrs(struct workqueue_struct *wq,
                                const struct workqueue_attrs *attrs)
      {
              int ret;
      
              apply_wqattrs_lock();
              ret = apply_workqueue_attrs_locked(wq, attrs);
              apply_wqattrs_unlock();
      
              return ret;
      }
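
       /*
        * Illustrative sketch for in-kernel callers (my_restrict_to_cpu0 and the
        * workqueue argument are hypothetical; the workqueue must have been
        * created with WQ_UNBOUND): attrs are allocated, adjusted and applied as
        * a whole, and the old pwqs retire as their in-flight work items finish.
        *
        *      static int my_restrict_to_cpu0(struct workqueue_struct *unbound_wq)
        *      {
        *              struct workqueue_attrs *attrs;
        *              int ret;
        *
        *              attrs = alloc_workqueue_attrs(GFP_KERNEL);
        *              if (!attrs)
        *                      return -ENOMEM;
        *
        *              attrs->nice = -5;
        *              cpumask_clear(attrs->cpumask);
        *              cpumask_set_cpu(0, attrs->cpumask);
        *
        *              ret = apply_workqueue_attrs(unbound_wq, attrs);
        *              free_workqueue_attrs(attrs);
        *              return ret;
        *      }
        */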
      
      /**
       * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
       * @wq: the target workqueue
       * @cpu: the CPU coming up or going down
       * @online: whether @cpu is coming up or going down
       *
       * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
       * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
       * @wq accordingly.
       *
       * If NUMA affinity can't be adjusted due to memory allocation failure, it
       * falls back to @wq->dfl_pwq which may not be optimal but is always
       * correct.
       *
       * Note that when the last allowed CPU of a NUMA node goes offline for a
       * workqueue with a cpumask spanning multiple nodes, the workers which were
       * already executing the work items for the workqueue will lose their CPU
       * affinity and may execute on any CPU.  This is similar to how per-cpu
       * workqueues behave on CPU_DOWN.  If a workqueue user wants strict
       * affinity, it's the user's responsibility to flush the work item from
       * CPU_DOWN_PREPARE.
       */
      static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
                                         bool online)
      {
              int node = cpu_to_node(cpu);
              int cpu_off = online ? -1 : cpu;
              struct pool_workqueue *old_pwq = NULL, *pwq;
              struct workqueue_attrs *target_attrs;
              cpumask_t *cpumask;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
                  wq->unbound_attrs->no_numa)
                      return;
      
              /*
               * We don't wanna alloc/free wq_attrs for each wq for each CPU.
               * Let's use a preallocated one.  The following buf is protected by
               * CPU hotplug exclusion.
               */
              target_attrs = wq_update_unbound_numa_attrs_buf;
              cpumask = target_attrs->cpumask;
      
              copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
              pwq = unbound_pwq_by_node(wq, node);
      
              /*
               * Let's determine what needs to be done.  If the target cpumask is
               * different from the default pwq's, we need to compare it to @pwq's
               * and create a new one if they don't match.  If the target cpumask
               * equals the default pwq's, the default pwq should be used.
               */
              if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
                      if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
                              return;
              } else {
                      goto use_dfl_pwq;
              }
      
              /* create a new pwq */
              pwq = alloc_unbound_pwq(wq, target_attrs);
              if (!pwq) {
                      pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
                              wq->name);
                      goto use_dfl_pwq;
              }
      
              /* Install the new pwq. */
              mutex_lock(&wq->mutex);
              old_pwq = numa_pwq_tbl_install(wq, node, pwq);
              goto out_unlock;
      
      use_dfl_pwq:
              mutex_lock(&wq->mutex);
              spin_lock_irq(&wq->dfl_pwq->pool->lock);
              get_pwq(wq->dfl_pwq);
              spin_unlock_irq(&wq->dfl_pwq->pool->lock);
              old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
      out_unlock:
              mutex_unlock(&wq->mutex);
              put_pwq_unlocked(old_pwq);
      }
      
      static int alloc_and_link_pwqs(struct workqueue_struct *wq)
      {
              bool highpri = wq->flags & WQ_HIGHPRI;
              int cpu, ret;
      
              if (!(wq->flags & WQ_UNBOUND)) {
   24                 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
                      if (!wq->cpu_pwqs)
                              return -ENOMEM;
      
   24                 for_each_possible_cpu(cpu) {
                              struct pool_workqueue *pwq =
   24                                 per_cpu_ptr(wq->cpu_pwqs, cpu);
                              struct worker_pool *cpu_pools =
                                      per_cpu(cpu_worker_pools, cpu);
      
   24                         init_pwq(pwq, wq, &cpu_pools[highpri]);
      
                              mutex_lock(&wq->mutex);
                              link_pwq(pwq);
                              mutex_unlock(&wq->mutex);
                      }
                      return 0;
              } else if (wq->flags & __WQ_ORDERED) {
                      ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
                       /* there should be only a single pwq for the ordering guarantee */
                      WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
                                    wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
                           "ordering guarantee broken for workqueue %s\n", wq->name);
                      return ret;
              } else {
                      return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
              }
      }
      
      static int wq_clamp_max_active(int max_active, unsigned int flags,
                                     const char *name)
      {
   24         int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
      
   24         if (max_active < 1 || max_active > lim)
                      pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                              max_active, name, 1, lim);
      
   24         return clamp_val(max_active, 1, lim);
      }
      
      struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                                                     unsigned int flags,
                                                     int max_active,
                                                     struct lock_class_key *key,
                                                     const char *lock_name, ...)
      {
              size_t tbl_size = 0;
              va_list args;
              struct workqueue_struct *wq;
              struct pool_workqueue *pwq;
      
              /*
               * Unbound && max_active == 1 used to imply ordered, which is no
               * longer the case on NUMA machines due to per-node pools.  While
               * alloc_ordered_workqueue() is the right way to create an ordered
               * workqueue, keep the previous behavior to avoid subtle breakages
               * on NUMA.
               */
   24         if ((flags & WQ_UNBOUND) && max_active == 1)
                      flags |= __WQ_ORDERED;
      
              /* see the comment above the definition of WQ_POWER_EFFICIENT */
   24         if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                      flags |= WQ_UNBOUND;
      
              /* allocate wq and format name */
              if (flags & WQ_UNBOUND)
                      tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
      
   24         wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
              if (!wq)
                      return NULL;
      
   24         if (flags & WQ_UNBOUND) {
                      wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
                      if (!wq->unbound_attrs)
                              goto err_free_wq;
              }
      
   24         va_start(args, lock_name);
              vsnprintf(wq->name, sizeof(wq->name), fmt, args);
              va_end(args);
      
              max_active = max_active ?: WQ_DFL_ACTIVE;
   24         max_active = wq_clamp_max_active(max_active, flags, wq->name);
      
              /* init wq */
              wq->flags = flags;
              wq->saved_max_active = max_active;
              mutex_init(&wq->mutex);
              atomic_set(&wq->nr_pwqs_to_flush, 0);
              INIT_LIST_HEAD(&wq->pwqs);
              INIT_LIST_HEAD(&wq->flusher_queue);
              INIT_LIST_HEAD(&wq->flusher_overflow);
              INIT_LIST_HEAD(&wq->maydays);
      
              lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
              INIT_LIST_HEAD(&wq->list);
      
   24         if (alloc_and_link_pwqs(wq) < 0)
                      goto err_free_wq;
      
              /*
               * Workqueues which may be used during memory reclaim should
               * have a rescuer to guarantee forward progress.
               */
   24         if (flags & WQ_MEM_RECLAIM) {
                      struct worker *rescuer;
      
   24                 rescuer = alloc_worker(NUMA_NO_NODE);
                      if (!rescuer)
                              goto err_destroy;
      
   24                 rescuer->rescue_wq = wq;
                      rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
                                                     wq->name);
                      if (IS_ERR(rescuer->task)) {
                              kfree(rescuer);
                              goto err_destroy;
                      }
      
   24                 wq->rescuer = rescuer;
                      kthread_bind_mask(rescuer->task, cpu_possible_mask);
                      wake_up_process(rescuer->task);
              }
      
   24         if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                      goto err_destroy;
      
              /*
               * wq_pool_mutex protects global freeze state and workqueues list.
               * Grab it, adjust max_active and add the new @wq to workqueues
               * list.
               */
   24         mutex_lock(&wq_pool_mutex);
      
              mutex_lock(&wq->mutex);
   24         for_each_pwq(pwq, wq)
   24                 pwq_adjust_max_active(pwq);
   24         mutex_unlock(&wq->mutex);
      
   24         list_add_tail_rcu(&wq->list, &workqueues);
      
   24         mutex_unlock(&wq_pool_mutex);
      
              return wq;
      
      err_free_wq:
              free_workqueue_attrs(wq->unbound_attrs);
              kfree(wq);
   24         return NULL;
      err_destroy:
              destroy_workqueue(wq);
              return NULL;
      }
      EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
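
       /*
        * Illustrative sketch: __alloc_workqueue_key() is normally reached via
        * the alloc_workqueue() / alloc_ordered_workqueue() wrappers.  The name
        * and flags below are hypothetical.
        *
        *      my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
        *      if (!my_wq)
        *              return -ENOMEM;
        *
        * WQ_MEM_RECLAIM makes the rescuer thread above get created, and passing
        * max_active == 0 selects WQ_DFL_ACTIVE.
        */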
      
      /**
       * destroy_workqueue - safely terminate a workqueue
       * @wq: target workqueue
       *
       * Safely destroy a workqueue. All work currently pending will be done first.
       */
      void destroy_workqueue(struct workqueue_struct *wq)
      {
              struct pool_workqueue *pwq;
              int node;
      
              /* drain it before proceeding with destruction */
   32         drain_workqueue(wq);
      
              /* sanity checks */
              mutex_lock(&wq->mutex);
   32         for_each_pwq(pwq, wq) {
                      int i;
      
   32                 for (i = 0; i < WORK_NR_COLORS; i++) {
   32                         if (WARN_ON(pwq->nr_in_flight[i])) {
                                      mutex_unlock(&wq->mutex);
   32                                 return;
                              }
                      }
      
   32                 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
   32                     WARN_ON(pwq->nr_active) ||
   32                     WARN_ON(!list_empty(&pwq->delayed_works))) {
                              mutex_unlock(&wq->mutex);
                              return;
                      }
              }
   32         mutex_unlock(&wq->mutex);
      
              /*
               * wq list is used to freeze wq, remove from list after
               * flushing is complete in case freeze races us.
               */
              mutex_lock(&wq_pool_mutex);
   32         list_del_rcu(&wq->list);
              mutex_unlock(&wq_pool_mutex);
      
              workqueue_sysfs_unregister(wq);
      
   32         if (wq->rescuer)
   32                 kthread_stop(wq->rescuer->task);
      
   32         if (!(wq->flags & WQ_UNBOUND)) {
                      /*
                       * The base ref is never dropped on per-cpu pwqs.  Directly
                       * schedule RCU free.
                       */
   32                 call_rcu_sched(&wq->rcu, rcu_free_wq);
              } else {
                      /*
                       * We're the sole accessor of @wq at this point.  Directly
                       * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
                       * @wq will be freed when the last pwq is released.
                       */
                      for_each_node(node) {
                              pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
                              RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
                              put_pwq_unlocked(pwq);
                      }
      
                      /*
                       * Put dfl_pwq.  @wq may be freed any time after dfl_pwq is
                       * put.  Don't access it afterwards.
                       */
                      pwq = wq->dfl_pwq;
                      wq->dfl_pwq = NULL;
                      put_pwq_unlocked(pwq);
              }
      }
      EXPORT_SYMBOL_GPL(destroy_workqueue);
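
       /*
        * Illustrative lifecycle sketch (my_wq and my_work are hypothetical):
        * users should stop queueing and cancel or flush their own work items
        * before dropping their workqueue.
        *
        *      static void my_exit(void)
        *      {
        *              cancel_work_sync(&my_work);  // nothing of ours is left inside
        *              destroy_workqueue(my_wq);    // drains and releases my_wq
        *      }
        */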
      
      /**
       * workqueue_set_max_active - adjust max_active of a workqueue
       * @wq: target workqueue
       * @max_active: new max_active value.
       *
       * Set max_active of @wq to @max_active.
       *
       * CONTEXT:
       * Don't call from IRQ context.
       */
      void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
      {
              struct pool_workqueue *pwq;
      
              /* disallow meddling with max_active for ordered workqueues */
              if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                      return;
      
              max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
      
              mutex_lock(&wq->mutex);
      
              wq->flags &= ~__WQ_ORDERED;
              wq->saved_max_active = max_active;
      
              for_each_pwq(pwq, wq)
                      pwq_adjust_max_active(pwq);
      
              mutex_unlock(&wq->mutex);
      }
      EXPORT_SYMBOL_GPL(workqueue_set_max_active);
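       /*
        * Editor's note: illustrative sketch only, not part of the original file.
        * It shows how a caller might raise max_active at runtime once it can
        * tolerate more concurrency.  The names example_wq, example_setup() and
        * example_grow_concurrency() are hypothetical.
        */
       #if 0
       static struct workqueue_struct *example_wq;

       static int example_setup(void)
       {
               /* start conservatively: at most one in-flight work item per pool */
               example_wq = alloc_workqueue("example", WQ_UNBOUND, 1);
               return example_wq ? 0 : -ENOMEM;
       }

       static void example_grow_concurrency(void)
       {
               /* later, allow up to 16 concurrent work items on example_wq */
               workqueue_set_max_active(example_wq, 16);
       }
       #endif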
      
      /**
       * current_work - retrieve %current task's work struct
       *
       * Determine if %current task is a workqueue worker and what it's working on.
       * Useful to find out the context that the %current task is running in.
       *
       * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
       */
      struct work_struct *current_work(void)
      {
              struct worker *worker = current_wq_worker();
      
              return worker ? worker->current_work : NULL;
      }
      EXPORT_SYMBOL(current_work);
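       /*
        * Editor's note: illustrative sketch only.  current_work() lets a teardown
        * path detect that it is running inside a given work item and avoid
        * waiting on that work item from its own callback.  The names example_work
        * and example_teardown() are hypothetical.
        */
       #if 0
       static struct work_struct example_work;

       static void example_teardown(void)
       {
               /* waiting for ourselves to finish would deadlock; skip it */
               if (current_work() != &example_work)
                       cancel_work_sync(&example_work);
       }
       #endif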
      
      /**
       * current_is_workqueue_rescuer - is %current workqueue rescuer?
       *
       * Determine whether %current is a workqueue rescuer.  Can be used from
       * work functions to determine whether it's being run off the rescuer task.
       *
       * Return: %true if %current is a workqueue rescuer. %false otherwise.
       */
      bool current_is_workqueue_rescuer(void)
      {
              struct worker *worker = current_wq_worker();
      
              return worker && worker->rescue_wq;
      }
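       /*
        * Editor's note: illustrative sketch only.  A work function may want to
        * behave more conservatively when the rescuer is running it, i.e. under
        * memory pressure.  example_rescuer_aware_fn() is hypothetical.
        */
       #if 0
       static void example_rescuer_aware_fn(struct work_struct *work)
       {
               /* avoid blocking allocations while the rescuer services us */
               gfp_t gfp = current_is_workqueue_rescuer() ? GFP_NOWAIT : GFP_KERNEL;
               void *buf = kmalloc(256, gfp);

               kfree(buf);
       }
       #endif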
      
      /**
       * workqueue_congested - test whether a workqueue is congested
       * @cpu: CPU in question
       * @wq: target workqueue
       *
       * Test whether @wq's cpu workqueue for @cpu is congested.  There is
       * no synchronization around this function and the test result is
       * unreliable and only useful as advisory hints or for debugging.
       *
       * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
       * Note that both per-cpu and unbound workqueues may be associated with
       * multiple pool_workqueues which have separate congested states.  A
       * workqueue being congested on one CPU doesn't mean the workqueue is also
        * congested on other CPUs / NUMA nodes.
       *
       * Return:
       * %true if congested, %false otherwise.
       */
      bool workqueue_congested(int cpu, struct workqueue_struct *wq)
      {
              struct pool_workqueue *pwq;
              bool ret;
      
              rcu_read_lock_sched();
      
              if (cpu == WORK_CPU_UNBOUND)
                      cpu = smp_processor_id();
      
              if (!(wq->flags & WQ_UNBOUND))
                      pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
              else
                      pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
      
              ret = !list_empty(&pwq->delayed_works);
              rcu_read_unlock_sched();
      
              return ret;
      }
      EXPORT_SYMBOL_GPL(workqueue_congested);
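       /*
        * Editor's note: illustrative sketch only.  As documented above, the result
        * is unsynchronized and advisory, so it is only suitable for opportunistic
        * throttling such as deferring low-priority submissions.
        * example_should_defer() is hypothetical.
        */
       #if 0
       static bool example_should_defer(struct workqueue_struct *wq)
       {
               /* back off when the local CPU's pwq already has delayed works */
               return workqueue_congested(WORK_CPU_UNBOUND, wq);
       }
       #endif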
      
      /**
       * work_busy - test whether a work is currently pending or running
       * @work: the work to be tested
       *
       * Test whether @work is currently pending or running.  There is no
       * synchronization around this function and the test result is
       * unreliable and only useful as advisory hints or for debugging.
       *
       * Return:
       * OR'd bitmask of WORK_BUSY_* bits.
       */
      unsigned int work_busy(struct work_struct *work)
      {
              struct worker_pool *pool;
              unsigned long flags;
              unsigned int ret = 0;
      
              if (work_pending(work))
                      ret |= WORK_BUSY_PENDING;
      
              local_irq_save(flags);
              pool = get_work_pool(work);
              if (pool) {
                      spin_lock(&pool->lock);
                      if (find_worker_executing_work(pool, work))
                              ret |= WORK_BUSY_RUNNING;
                      spin_unlock(&pool->lock);
              }
              local_irq_restore(flags);
      
              return ret;
      }
      EXPORT_SYMBOL_GPL(work_busy);
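       /*
        * Editor's note: illustrative sketch only.  The returned bitmask is a
        * snapshot and may be stale by the time it is inspected; use it for
        * debugging output like below, not for synchronization.
        * example_report_work() is hypothetical.
        */
       #if 0
       static void example_report_work(struct work_struct *work)
       {
               unsigned int busy = work_busy(work);

               pr_info("example: work %p pending=%d running=%d\n", work,
                       !!(busy & WORK_BUSY_PENDING), !!(busy & WORK_BUSY_RUNNING));
       }
       #endif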
      
      /**
       * set_worker_desc - set description for the current work item
       * @fmt: printf-style format string
       * @...: arguments for the format string
       *
       * This function can be called by a running work function to describe what
       * the work item is about.  If the worker task gets dumped, this
       * information will be printed out together to help debugging.  The
       * description can be at most WORKER_DESC_LEN including the trailing '\0'.
       */
      void set_worker_desc(const char *fmt, ...)
      {
              struct worker *worker = current_wq_worker();
              va_list args;
      
              if (worker) {
                      va_start(args, fmt);
                      vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
                      va_end(args);
                      worker->desc_valid = true;
              }
      }
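       /*
        * Editor's note: illustrative sketch only.  A work function can tag its
        * worker so that print_worker_info() below shows what it was servicing if
        * the task ever gets dumped.  example_flush_fn() and the "example-flush"
        * description are hypothetical.
        */
       #if 0
       static void example_flush_fn(struct work_struct *work)
       {
               /* the description shows up in task dumps next to the wq name */
               set_worker_desc("example-flush:%d", raw_smp_processor_id());

               /* do the actual flushing here */
       }
       #endif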
      
      /**
       * print_worker_info - print out worker information and description
       * @log_lvl: the log level to use when printing
       * @task: target task
       *
       * If @task is a worker and currently executing a work item, print out the
       * name of the workqueue being serviced and worker description set with
       * set_worker_desc() by the currently executing work item.
       *
       * This function can be safely called on any task as long as the
       * task_struct itself is accessible.  While safe, this function isn't
        * synchronized and may print out garbled information of limited length.
       */
      void print_worker_info(const char *log_lvl, struct task_struct *task)
      {
               work_func_t *fn = NULL;
              char name[WQ_NAME_LEN] = { };
              char desc[WORKER_DESC_LEN] = { };
              struct pool_workqueue *pwq = NULL;
              struct workqueue_struct *wq = NULL;
              bool desc_valid = false;
              struct worker *worker;
      
              if (!(task->flags & PF_WQ_WORKER))
                       return;
      
              /*
               * This function is called without any synchronization and @task
               * could be in any state.  Be careful with dereferences.
               */
              worker = probe_kthread_data(task);
      
              /*
               * Carefully copy the associated workqueue's workfn and name.  Keep
               * the original last '\0' in case the original contains garbage.
               */
              probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
              probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
              probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
              probe_kernel_read(name, wq->name, sizeof(name) - 1);
      
              /* copy worker description */
              probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
              if (desc_valid)
                      probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
      
              if (fn || name[0] || desc[0]) {
                      printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
                      if (desc[0])
                              pr_cont(" (%s)", desc);
                      pr_cont("\n");
              }
      }
      
      static void pr_cont_pool_info(struct worker_pool *pool)
      {
              pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
              if (pool->node != NUMA_NO_NODE)
                      pr_cont(" node=%d", pool->node);
              pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
      }
      
      static void pr_cont_work(bool comma, struct work_struct *work)
      {
              if (work->func == wq_barrier_func) {
                      struct wq_barrier *barr;
      
                      barr = container_of(work, struct wq_barrier, work);
      
                      pr_cont("%s BAR(%d)", comma ? "," : "",
                              task_pid_nr(barr->task));
              } else {
                      pr_cont("%s %pf", comma ? "," : "", work->func);
              }
      }
      
      static void show_pwq(struct pool_workqueue *pwq)
      {
              struct worker_pool *pool = pwq->pool;
              struct work_struct *work;
              struct worker *worker;
              bool has_in_flight = false, has_pending = false;
              int bkt;
      
              pr_info("  pwq %d:", pool->id);
              pr_cont_pool_info(pool);
      
              pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
                      !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
      
              hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                      if (worker->current_pwq == pwq) {
                              has_in_flight = true;
                              break;
                      }
              }
              if (has_in_flight) {
                      bool comma = false;
      
                      pr_info("    in-flight:");
                      hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                              if (worker->current_pwq != pwq)
                                      continue;
      
                              pr_cont("%s %d%s:%pf", comma ? "," : "",
                                      task_pid_nr(worker->task),
                                      worker == pwq->wq->rescuer ? "(RESCUER)" : "",
                                      worker->current_func);
                              list_for_each_entry(work, &worker->scheduled, entry)
                                      pr_cont_work(false, work);
                              comma = true;
                      }
                      pr_cont("\n");
              }
      
              list_for_each_entry(work, &pool->worklist, entry) {
                      if (get_work_pwq(work) == pwq) {
                              has_pending = true;
                              break;
                      }
              }
              if (has_pending) {
                      bool comma = false;
      
                      pr_info("    pending:");
                      list_for_each_entry(work, &pool->worklist, entry) {
                              if (get_work_pwq(work) != pwq)
                                      continue;
      
                              pr_cont_work(comma, work);
                              comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                      }
                      pr_cont("\n");
              }
      
              if (!list_empty(&pwq->delayed_works)) {
                      bool comma = false;
      
                      pr_info("    delayed:");
                      list_for_each_entry(work, &pwq->delayed_works, entry) {
                              pr_cont_work(comma, work);
                              comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                      }
                      pr_cont("\n");
              }
      }
      
      /**
       * show_workqueue_state - dump workqueue state
       *
       * Called from a sysrq handler and prints out all busy workqueues and
       * pools.
       */
      void show_workqueue_state(void)
      {
              struct workqueue_struct *wq;
              struct worker_pool *pool;
              unsigned long flags;
              int pi;
      
              rcu_read_lock_sched();
      
              pr_info("Showing busy workqueues and worker pools:\n");
      
              list_for_each_entry_rcu(wq, &workqueues, list) {
                      struct pool_workqueue *pwq;
                      bool idle = true;
      
                      for_each_pwq(pwq, wq) {
                              if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
                                      idle = false;
                                      break;
                              }
                      }
                      if (idle)
                              continue;
      
                      pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
      
                      for_each_pwq(pwq, wq) {
                              spin_lock_irqsave(&pwq->pool->lock, flags);
                              if (pwq->nr_active || !list_empty(&pwq->delayed_works))
                                      show_pwq(pwq);
                              spin_unlock_irqrestore(&pwq->pool->lock, flags);
                      }
              }
      
              for_each_pool(pool, pi) {
                      struct worker *worker;
                      bool first = true;
      
                      spin_lock_irqsave(&pool->lock, flags);
                      if (pool->nr_workers == pool->nr_idle)
                              goto next_pool;
      
                      pr_info("pool %d:", pool->id);
                      pr_cont_pool_info(pool);
                      pr_cont(" workers=%d", pool->nr_workers);
                      if (pool->manager)
                              pr_cont(" manager: %d",
                                      task_pid_nr(pool->manager->task));
                      list_for_each_entry(worker, &pool->idle_list, entry) {
                              pr_cont(" %s%d", first ? "idle: " : "",
                                      task_pid_nr(worker->task));
                              first = false;
                      }
                      pr_cont("\n");
              next_pool:
                      spin_unlock_irqrestore(&pool->lock, flags);
              }
      
              rcu_read_unlock_sched();
      }
      
      /*
       * CPU hotplug.
       *
       * There are two challenges in supporting CPU hotplug.  Firstly, there
       * are a lot of assumptions on strong associations among work, pwq and
       * pool which make migrating pending and scheduled works very
       * difficult to implement without impacting hot paths.  Secondly,
        * worker pools serve a mix of short, long and very long running works, making
       * blocked draining impractical.
       *
        * This is solved by allowing the pools to be disassociated from the CPU
        * and run as unbound pools, and by reattaching them later when the CPU
        * comes back online.
       */
      
      static void wq_unbind_fn(struct work_struct *work)
      {
              int cpu = smp_processor_id();
              struct worker_pool *pool;
              struct worker *worker;
      
              for_each_cpu_worker_pool(pool, cpu) {
                      mutex_lock(&pool->attach_mutex);
                      spin_lock_irq(&pool->lock);
      
                      /*
                       * We've blocked all attach/detach operations. Make all workers
                       * unbound and set DISASSOCIATED.  Before this, all workers
                       * except for the ones which are still executing works from
                       * before the last CPU down must be on the cpu.  After
                        * this, they may be scheduled on any CPU.
                       */
                      for_each_pool_worker(worker, pool)
                              worker->flags |= WORKER_UNBOUND;
      
                      pool->flags |= POOL_DISASSOCIATED;
      
                      spin_unlock_irq(&pool->lock);
                      mutex_unlock(&pool->attach_mutex);
      
                      /*
                       * Call schedule() so that we cross rq->lock and thus can
                       * guarantee sched callbacks see the %WORKER_UNBOUND flag.
                       * This is necessary as scheduler callbacks may be invoked
                       * from other cpus.
                       */
                      schedule();
      
                      /*
                       * Sched callbacks are disabled now.  Zap nr_running.
                       * After this, nr_running stays zero and need_more_worker()
                       * and keep_working() are always true as long as the
                       * worklist is not empty.  This pool now behaves as an
                       * unbound (in terms of concurrency management) pool which
                        * is served by workers tied to the pool.
                       */
                      atomic_set(&pool->nr_running, 0);
      
                      /*
                       * With concurrency management just turned off, a busy
                       * worker blocking could lead to lengthy stalls.  Kick off
                       * unbound chain execution of currently pending work items.
                       */
                      spin_lock_irq(&pool->lock);
                      wake_up_worker(pool);
                      spin_unlock_irq(&pool->lock);
              }
      }
      
      /**
       * rebind_workers - rebind all workers of a pool to the associated CPU
       * @pool: pool of interest
       *
       * @pool->cpu is coming online.  Rebind all workers to the CPU.
       */
      static void rebind_workers(struct worker_pool *pool)
      {
              struct worker *worker;
      
              lockdep_assert_held(&pool->attach_mutex);
      
              /*
               * Restore CPU affinity of all workers.  As all idle workers should
               * be on the run-queue of the associated CPU before any local
               * wake-ups for concurrency management happen, restore CPU affinity
               * of all workers first and then clear UNBOUND.  As we're called
               * from CPU_ONLINE, the following shouldn't fail.
               */
              for_each_pool_worker(worker, pool)
                      WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                        pool->attrs->cpumask) < 0);
      
              spin_lock_irq(&pool->lock);
      
              /*
               * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
               * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
               * being reworked and this can go away in time.
               */
              if (!(pool->flags & POOL_DISASSOCIATED)) {
                      spin_unlock_irq(&pool->lock);
                      return;
              }
      
              pool->flags &= ~POOL_DISASSOCIATED;
      
              for_each_pool_worker(worker, pool) {
                      unsigned int worker_flags = worker->flags;
      
                      /*
                       * A bound idle worker should actually be on the runqueue
                       * of the associated CPU for local wake-ups targeting it to
                       * work.  Kick all idle workers so that they migrate to the
                       * associated CPU.  Doing this in the same loop as
                       * replacing UNBOUND with REBOUND is safe as no worker will
                       * be bound before @pool->lock is released.
                       */
                      if (worker_flags & WORKER_IDLE)
                              wake_up_process(worker->task);
      
                      /*
                       * We want to clear UNBOUND but can't directly call
                       * worker_clr_flags() or adjust nr_running.  Atomically
                       * replace UNBOUND with another NOT_RUNNING flag REBOUND.
                       * @worker will clear REBOUND using worker_clr_flags() when
                       * it initiates the next execution cycle thus restoring
                       * concurrency management.  Note that when or whether
                       * @worker clears REBOUND doesn't affect correctness.
                       *
                       * ACCESS_ONCE() is necessary because @worker->flags may be
                       * tested without holding any lock in
                       * wq_worker_waking_up().  Without it, NOT_RUNNING test may
                       * fail incorrectly leading to premature concurrency
                       * management operations.
                       */
                      WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
                      worker_flags |= WORKER_REBOUND;
                      worker_flags &= ~WORKER_UNBOUND;
                      ACCESS_ONCE(worker->flags) = worker_flags;
              }
      
              spin_unlock_irq(&pool->lock);
      }
      
      /**
       * restore_unbound_workers_cpumask - restore cpumask of unbound workers
       * @pool: unbound pool of interest
       * @cpu: the CPU which is coming up
       *
       * An unbound pool may end up with a cpumask which doesn't have any online
        * CPUs.  When a worker of such a pool gets scheduled, the scheduler resets
       * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
       * online CPU before, cpus_allowed of all its workers should be restored.
       */
      static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
      {
              static cpumask_t cpumask;
              struct worker *worker;
      
              lockdep_assert_held(&pool->attach_mutex);
      
              /* is @cpu allowed for @pool? */
              if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                      return;
      
              /* is @cpu the only online CPU? */
              cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
              if (cpumask_weight(&cpumask) != 1)
                      return;
      
              /* as we're called from CPU_ONLINE, the following shouldn't fail */
              for_each_pool_worker(worker, pool)
                      WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                        pool->attrs->cpumask) < 0);
      }
      
      /*
       * Workqueues should be brought up before normal priority CPU notifiers.
        * This will be registered as a high priority CPU notifier.
       */
      static int workqueue_cpu_up_callback(struct notifier_block *nfb,
                                                     unsigned long action,
                                                     void *hcpu)
      {
              int cpu = (unsigned long)hcpu;
              struct worker_pool *pool;
              struct workqueue_struct *wq;
              int pi;
      
              switch (action & ~CPU_TASKS_FROZEN) {
              case CPU_UP_PREPARE:
                      for_each_cpu_worker_pool(pool, cpu) {
                              if (pool->nr_workers)
                                      continue;
                              if (!create_worker(pool))
                                      return NOTIFY_BAD;
                      }
                      break;
      
              case CPU_DOWN_FAILED:
              case CPU_ONLINE:
                      mutex_lock(&wq_pool_mutex);
      
                      for_each_pool(pool, pi) {
                              mutex_lock(&pool->attach_mutex);
      
                              if (pool->cpu == cpu)
                                      rebind_workers(pool);
                              else if (pool->cpu < 0)
                                      restore_unbound_workers_cpumask(pool, cpu);
      
                              mutex_unlock(&pool->attach_mutex);
                      }
      
                      /* update NUMA affinity of unbound workqueues */
                      list_for_each_entry(wq, &workqueues, list)
                              wq_update_unbound_numa(wq, cpu, true);
      
                      mutex_unlock(&wq_pool_mutex);
                      break;
              }
              return NOTIFY_OK;
      }
      
      /*
       * Workqueues should be brought down after normal priority CPU notifiers.
        * This will be registered as a low priority CPU notifier.
       */
      static int workqueue_cpu_down_callback(struct notifier_block *nfb,
                                                       unsigned long action,
                                                       void *hcpu)
      {
              int cpu = (unsigned long)hcpu;
              struct work_struct unbind_work;
              struct workqueue_struct *wq;
      
              switch (action & ~CPU_TASKS_FROZEN) {
              case CPU_DOWN_PREPARE:
                      /* unbinding per-cpu workers should happen on the local CPU */
                      INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
                      queue_work_on(cpu, system_highpri_wq, &unbind_work);
      
                      /* update NUMA affinity of unbound workqueues */
                      mutex_lock(&wq_pool_mutex);
                      list_for_each_entry(wq, &workqueues, list)
                              wq_update_unbound_numa(wq, cpu, false);
                      mutex_unlock(&wq_pool_mutex);
      
                      /* wait for per-cpu unbinding to finish */
                      flush_work(&unbind_work);
                      destroy_work_on_stack(&unbind_work);
                      break;
              }
              return NOTIFY_OK;
      }
      
      #ifdef CONFIG_SMP
      
      struct work_for_cpu {
              struct work_struct work;
              long (*fn)(void *);
              void *arg;
              long ret;
      };
      
      static void work_for_cpu_fn(struct work_struct *work)
      {
              struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
      
              wfc->ret = wfc->fn(wfc->arg);
      }
      
      /**
       * work_on_cpu - run a function in user context on a particular cpu
       * @cpu: the cpu to run on
       * @fn: the function to run
       * @arg: the function arg
       *
       * It is up to the caller to ensure that the cpu doesn't go offline.
       * The caller must not hold any locks which would prevent @fn from completing.
       *
       * Return: The value @fn returns.
       */
      long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
      {
              struct work_for_cpu wfc = { .fn = fn, .arg = arg };
      
              INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
              schedule_work_on(cpu, &wfc.work);
              flush_work(&wfc.work);
              destroy_work_on_stack(&wfc.work);
              return wfc.ret;
      }
      EXPORT_SYMBOL_GPL(work_on_cpu);
      #endif /* CONFIG_SMP */
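       /*
        * Editor's note: illustrative sketch only (assumes CONFIG_SMP).  As the
        * comment above says, work_on_cpu() runs @fn synchronously in process
        * context on the chosen CPU, so a caller can query per-cpu state without
        * writing its own completion plumbing.  example_query_fn() and
        * example_query_cpu() are hypothetical.
        */
       #if 0
       static long example_query_fn(void *arg)
       {
               /* executes in a kworker bound to the CPU passed to work_on_cpu() */
               return raw_smp_processor_id();
       }

       static long example_query_cpu(int cpu)
       {
               /* blocks until example_query_fn() has completed on @cpu */
               return work_on_cpu(cpu, example_query_fn, NULL);
       }
       #endif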
      
      #ifdef CONFIG_FREEZER
      
      /**
       * freeze_workqueues_begin - begin freezing workqueues
       *
       * Start freezing workqueues.  After this function returns, all freezable
       * workqueues will queue new works to their delayed_works list instead of
       * pool->worklist.
       *
       * CONTEXT:
       * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
       */
      void freeze_workqueues_begin(void)
      {
              struct workqueue_struct *wq;
              struct pool_workqueue *pwq;
      
              mutex_lock(&wq_pool_mutex);
      
              WARN_ON_ONCE(workqueue_freezing);
              workqueue_freezing = true;
      
              list_for_each_entry(wq, &workqueues, list) {
                      mutex_lock(&wq->mutex);
                      for_each_pwq(pwq, wq)
                              pwq_adjust_max_active(pwq);
                      mutex_unlock(&wq->mutex);
              }
      
              mutex_unlock(&wq_pool_mutex);
      }
      
      /**
       * freeze_workqueues_busy - are freezable workqueues still busy?
       *
       * Check whether freezing is complete.  This function must be called
       * between freeze_workqueues_begin() and thaw_workqueues().
       *
       * CONTEXT:
       * Grabs and releases wq_pool_mutex.
       *
       * Return:
       * %true if some freezable workqueues are still busy.  %false if freezing
       * is complete.
       */
      bool freeze_workqueues_busy(void)
      {
              bool busy = false;
              struct workqueue_struct *wq;
              struct pool_workqueue *pwq;
      
              mutex_lock(&wq_pool_mutex);
      
              WARN_ON_ONCE(!workqueue_freezing);
      
              list_for_each_entry(wq, &workqueues, list) {
                      if (!(wq->flags & WQ_FREEZABLE))
                              continue;
                      /*
                       * nr_active is monotonically decreasing.  It's safe
                       * to peek without lock.
                       */
                      rcu_read_lock_sched();
                      for_each_pwq(pwq, wq) {
                              WARN_ON_ONCE(pwq->nr_active < 0);
                              if (pwq->nr_active) {
                                      busy = true;
                                      rcu_read_unlock_sched();
                                      goto out_unlock;
                              }
                      }
                      rcu_read_unlock_sched();
              }
      out_unlock:
              mutex_unlock(&wq_pool_mutex);
              return busy;
      }
      
      /**
       * thaw_workqueues - thaw workqueues
       *
       * Thaw workqueues.  Normal queueing is restored and all collected
       * frozen works are transferred to their respective pool worklists.
       *
       * CONTEXT:
       * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
       */
      void thaw_workqueues(void)
      {
              struct workqueue_struct *wq;
              struct pool_workqueue *pwq;
      
              mutex_lock(&wq_pool_mutex);
      
              if (!workqueue_freezing)
                      goto out_unlock;
      
              workqueue_freezing = false;
      
              /* restore max_active and repopulate worklist */
              list_for_each_entry(wq, &workqueues, list) {
                      mutex_lock(&wq->mutex);
                      for_each_pwq(pwq, wq)
                              pwq_adjust_max_active(pwq);
                      mutex_unlock(&wq->mutex);
              }
      
      out_unlock:
              mutex_unlock(&wq_pool_mutex);
      }
      #endif /* CONFIG_FREEZER */
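       /*
        * Editor's note: illustrative sketch only (assumes CONFIG_FREEZER and
        * <linux/delay.h> for msleep()).  It shows the begin/poll/thaw sequence
        * the comments above describe, roughly as the PM freezer would drive it.
        * example_freeze_wqs() and its retry budget are hypothetical.
        */
       #if 0
       static int example_freeze_wqs(void)
       {
               int tries = 100;

               freeze_workqueues_begin();
               while (freeze_workqueues_busy()) {
                       if (!--tries) {
                               /* give up and restore normal queueing */
                               thaw_workqueues();
                               return -EBUSY;
                       }
                       msleep(10);
               }
               /* all freezable workqueues are now quiescent */
               return 0;
       }
       #endif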
      
      static int workqueue_apply_unbound_cpumask(void)
      {
              LIST_HEAD(ctxs);
              int ret = 0;
              struct workqueue_struct *wq;
              struct apply_wqattrs_ctx *ctx, *n;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              list_for_each_entry(wq, &workqueues, list) {
                      if (!(wq->flags & WQ_UNBOUND))
                              continue;
                      /* creating multiple pwqs breaks ordering guarantee */
                      if (wq->flags & __WQ_ORDERED)
                              continue;
      
                      ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
                      if (!ctx) {
                              ret = -ENOMEM;
                              break;
                      }
      
                      list_add_tail(&ctx->list, &ctxs);
              }
      
              list_for_each_entry_safe(ctx, n, &ctxs, list) {
                      if (!ret)
                              apply_wqattrs_commit(ctx);
                      apply_wqattrs_cleanup(ctx);
              }
      
              return ret;
      }
      
      /**
       *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
       *  @cpumask: the cpumask to set
       *
       *  The low-level workqueues cpumask is a global cpumask that limits
        *  the affinity of all unbound workqueues.  This function checks @cpumask
        *  and applies it to all unbound workqueues, updating all of their pwqs.
        *
        *  Return:  0       - Success
        *           -EINVAL - Invalid @cpumask
        *           -ENOMEM - Failed to allocate memory for attrs or pwqs.
       */
      int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
      {
              int ret = -EINVAL;
              cpumask_var_t saved_cpumask;
      
              if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
                      return -ENOMEM;
      
              cpumask_and(cpumask, cpumask, cpu_possible_mask);
              if (!cpumask_empty(cpumask)) {
                      apply_wqattrs_lock();
      
                      /* save the old wq_unbound_cpumask. */
                      cpumask_copy(saved_cpumask, wq_unbound_cpumask);
      
                      /* update wq_unbound_cpumask at first and apply it to wqs. */
                      cpumask_copy(wq_unbound_cpumask, cpumask);
                      ret = workqueue_apply_unbound_cpumask();
      
                      /* restore the wq_unbound_cpumask when failed. */
                      if (ret < 0)
                              cpumask_copy(wq_unbound_cpumask, saved_cpumask);
      
                      apply_wqattrs_unlock();
              }
      
              free_cpumask_var(saved_cpumask);
              return ret;
      }
      
      #ifdef CONFIG_SYSFS
      /*
        * Workqueues with the WQ_SYSFS flag set are visible to userland via
       * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
       * following attributes.
       *
       *  per_cpu        RO bool        : whether the workqueue is per-cpu or unbound
       *  max_active        RW int        : maximum number of in-flight work items
       *
       * Unbound workqueues have the following extra attributes.
       *
       *  id                RO int        : the associated pool ID
       *  nice        RW int        : nice value of the workers
       *  cpumask        RW mask        : bitmask of allowed CPUs for the workers
       */
      struct wq_device {
              struct workqueue_struct                *wq;
              struct device                        dev;
      };
      
      static struct workqueue_struct *dev_to_wq(struct device *dev)
      {
              struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
      
              return wq_dev->wq;
      }
      
      static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                                  char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
      
              return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
      }
      static DEVICE_ATTR_RO(per_cpu);
      
      static ssize_t max_active_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
      
              return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
      }
      
      static ssize_t max_active_store(struct device *dev,
                                      struct device_attribute *attr, const char *buf,
                                      size_t count)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              int val;
      
              if (sscanf(buf, "%d", &val) != 1 || val <= 0)
                      return -EINVAL;
      
              workqueue_set_max_active(wq, val);
              return count;
      }
      static DEVICE_ATTR_RW(max_active);
      
      static struct attribute *wq_sysfs_attrs[] = {
              &dev_attr_per_cpu.attr,
              &dev_attr_max_active.attr,
              NULL,
      };
      ATTRIBUTE_GROUPS(wq_sysfs);
      
      static ssize_t wq_pool_ids_show(struct device *dev,
                                      struct device_attribute *attr, char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              const char *delim = "";
              int node, written = 0;
      
              rcu_read_lock_sched();
              for_each_node(node) {
                      written += scnprintf(buf + written, PAGE_SIZE - written,
                                           "%s%d:%d", delim, node,
                                           unbound_pwq_by_node(wq, node)->pool->id);
                      delim = " ";
              }
              written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
              rcu_read_unlock_sched();
      
              return written;
      }
      
      static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                                  char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              int written;
      
              mutex_lock(&wq->mutex);
              written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
              mutex_unlock(&wq->mutex);
      
              return written;
      }
      
      /* prepare workqueue_attrs for sysfs store operations */
      static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
      {
              struct workqueue_attrs *attrs;
      
              lockdep_assert_held(&wq_pool_mutex);
      
              attrs = alloc_workqueue_attrs(GFP_KERNEL);
              if (!attrs)
                      return NULL;
      
              copy_workqueue_attrs(attrs, wq->unbound_attrs);
              return attrs;
      }
      
      static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                                   const char *buf, size_t count)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              struct workqueue_attrs *attrs;
              int ret = -ENOMEM;
      
              apply_wqattrs_lock();
      
              attrs = wq_sysfs_prep_attrs(wq);
              if (!attrs)
                      goto out_unlock;
      
              if (sscanf(buf, "%d", &attrs->nice) == 1 &&
                  attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
                      ret = apply_workqueue_attrs_locked(wq, attrs);
              else
                      ret = -EINVAL;
      
      out_unlock:
              apply_wqattrs_unlock();
              free_workqueue_attrs(attrs);
              return ret ?: count;
      }
      
      static ssize_t wq_cpumask_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              int written;
      
              mutex_lock(&wq->mutex);
              written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                                  cpumask_pr_args(wq->unbound_attrs->cpumask));
              mutex_unlock(&wq->mutex);
              return written;
      }
      
      static ssize_t wq_cpumask_store(struct device *dev,
                                      struct device_attribute *attr,
                                      const char *buf, size_t count)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              struct workqueue_attrs *attrs;
              int ret = -ENOMEM;
      
              apply_wqattrs_lock();
      
              attrs = wq_sysfs_prep_attrs(wq);
              if (!attrs)
                      goto out_unlock;
      
              ret = cpumask_parse(buf, attrs->cpumask);
              if (!ret)
                      ret = apply_workqueue_attrs_locked(wq, attrs);
      
      out_unlock:
              apply_wqattrs_unlock();
              free_workqueue_attrs(attrs);
              return ret ?: count;
      }
      
      static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
                                  char *buf)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              int written;
      
              mutex_lock(&wq->mutex);
              written = scnprintf(buf, PAGE_SIZE, "%d\n",
                                  !wq->unbound_attrs->no_numa);
              mutex_unlock(&wq->mutex);
      
              return written;
      }
      
      static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
                                   const char *buf, size_t count)
      {
              struct workqueue_struct *wq = dev_to_wq(dev);
              struct workqueue_attrs *attrs;
              int v, ret = -ENOMEM;
      
              apply_wqattrs_lock();
      
              attrs = wq_sysfs_prep_attrs(wq);
              if (!attrs)
                      goto out_unlock;
      
              ret = -EINVAL;
              if (sscanf(buf, "%d", &v) == 1) {
                      attrs->no_numa = !v;
                      ret = apply_workqueue_attrs_locked(wq, attrs);
              }
      
      out_unlock:
              apply_wqattrs_unlock();
              free_workqueue_attrs(attrs);
              return ret ?: count;
      }
      
      static struct device_attribute wq_sysfs_unbound_attrs[] = {
              __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
              __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
              __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
              __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
              __ATTR_NULL,
      };
      
      static struct bus_type wq_subsys = {
              .name                                = "workqueue",
              .dev_groups                        = wq_sysfs_groups,
      };
      
      static ssize_t wq_unbound_cpumask_show(struct device *dev,
                      struct device_attribute *attr, char *buf)
      {
              int written;
      
              mutex_lock(&wq_pool_mutex);
              written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                                  cpumask_pr_args(wq_unbound_cpumask));
              mutex_unlock(&wq_pool_mutex);
      
              return written;
      }
      
      static ssize_t wq_unbound_cpumask_store(struct device *dev,
                      struct device_attribute *attr, const char *buf, size_t count)
      {
              cpumask_var_t cpumask;
              int ret;
      
              if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
                      return -ENOMEM;
      
              ret = cpumask_parse(buf, cpumask);
              if (!ret)
                      ret = workqueue_set_unbound_cpumask(cpumask);
      
              free_cpumask_var(cpumask);
              return ret ? ret : count;
      }
      
      static struct device_attribute wq_sysfs_cpumask_attr =
              __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
                     wq_unbound_cpumask_store);
      
      static int __init wq_sysfs_init(void)
      {
              int err;
      
              err = subsys_virtual_register(&wq_subsys, NULL);
              if (err)
                      return err;
      
              return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
      }
      core_initcall(wq_sysfs_init);
      
      static void wq_device_release(struct device *dev)
      {
              struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
      
              kfree(wq_dev);
      }
      
      /**
       * workqueue_sysfs_register - make a workqueue visible in sysfs
       * @wq: the workqueue to register
       *
       * Expose @wq in sysfs under /sys/bus/workqueue/devices.
        * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set,
        * which is the preferred method.
        *
        * A workqueue user should use this function directly iff it wants to apply
        * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
       * apply_workqueue_attrs() may race against userland updating the
       * attributes.
       *
       * Return: 0 on success, -errno on failure.
       */
      int workqueue_sysfs_register(struct workqueue_struct *wq)
      {
              struct wq_device *wq_dev;
              int ret;
      
              /*
               * Adjusting max_active or creating new pwqs by applying
               * attributes breaks ordering guarantee.  Disallow exposing ordered
               * workqueues.
               */
              if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                      return -EINVAL;
      
              wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
              if (!wq_dev)
                      return -ENOMEM;
      
              wq_dev->wq = wq;
              wq_dev->dev.bus = &wq_subsys;
              wq_dev->dev.init_name = wq->name;
              wq_dev->dev.release = wq_device_release;
      
              /*
               * unbound_attrs are created separately.  Suppress uevent until
               * everything is ready.
               */
              dev_set_uevent_suppress(&wq_dev->dev, true);
      
              ret = device_register(&wq_dev->dev);
              if (ret) {
                      put_device(&wq_dev->dev);
                      wq->wq_dev = NULL;
                      return ret;
              }
      
              if (wq->flags & WQ_UNBOUND) {
                      struct device_attribute *attr;
      
                      for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                              ret = device_create_file(&wq_dev->dev, attr);
                              if (ret) {
                                      device_unregister(&wq_dev->dev);
                                      wq->wq_dev = NULL;
                                      return ret;
                              }
                      }
              }
      
              dev_set_uevent_suppress(&wq_dev->dev, false);
              kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
              return 0;
      }
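       /*
        * Editor's note: illustrative sketch only.  Per the comment above, a user
        * that wants to pre-configure attributes creates an unbound workqueue
        * without WQ_SYSFS, applies its attrs, and only then registers it.
        * example_expose_wq() and the nice value are hypothetical.
        */
       #if 0
       static int example_expose_wq(struct workqueue_struct *wq)
       {
               struct workqueue_attrs *attrs;
               int ret;

               /* @wq is assumed unbound and created without WQ_SYSFS */
               attrs = alloc_workqueue_attrs(GFP_KERNEL);
               if (!attrs)
                       return -ENOMEM;

               attrs->nice = -10;
               ret = apply_workqueue_attrs(wq, attrs);
               free_workqueue_attrs(attrs);
               if (ret)
                       return ret;

               /* userland can now see and tweak the attrs without racing us */
               return workqueue_sysfs_register(wq);
       }
       #endif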
      
      /**
       * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
       * @wq: the workqueue to unregister
       *
       * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
       */
      static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
      {
              struct wq_device *wq_dev = wq->wq_dev;
      
              if (!wq->wq_dev)
                      return;
      
              wq->wq_dev = NULL;
              device_unregister(&wq_dev->dev);
      }
      #else        /* CONFIG_SYSFS */
      static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
      #endif        /* CONFIG_SYSFS */
      
      static void __init wq_numa_init(void)
      {
              cpumask_var_t *tbl;
              int node, cpu;
      
              if (num_possible_nodes() <= 1)
                      return;
      
              if (wq_disable_numa) {
                      pr_info("workqueue: NUMA affinity support disabled\n");
                      return;
              }
      
              wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
              BUG_ON(!wq_update_unbound_numa_attrs_buf);
      
              /*
               * We want masks of possible CPUs of each node which isn't readily
               * available.  Build one from cpu_to_node() which should have been
               * fully initialized by now.
               */
              tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
              BUG_ON(!tbl);
      
              for_each_node(node)
                      BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                      node_online(node) ? node : NUMA_NO_NODE));
      
              for_each_possible_cpu(cpu) {
                      node = cpu_to_node(cpu);
                      if (WARN_ON(node == NUMA_NO_NODE)) {
                              pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
                              /* happens iff arch is bonkers, let's just proceed */
                              return;
                      }
                      cpumask_set_cpu(cpu, tbl[node]);
              }
      
              wq_numa_possible_cpumask = tbl;
              wq_numa_enabled = true;
      }
      
      static int __init init_workqueues(void)
      {
              int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
              int i, cpu;
      
              WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
      
              BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
              cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
      
              pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
      
              cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
              hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
      
              wq_numa_init();
      
              /* initialize CPU pools */
              for_each_possible_cpu(cpu) {
                      struct worker_pool *pool;
      
                      i = 0;
                      for_each_cpu_worker_pool(pool, cpu) {
                              BUG_ON(init_worker_pool(pool));
                              pool->cpu = cpu;
                              cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
                              pool->attrs->nice = std_nice[i++];
                              pool->node = cpu_to_node(cpu);
      
                              /* alloc pool ID */
                              mutex_lock(&wq_pool_mutex);
                              BUG_ON(worker_pool_assign_id(pool));
                              mutex_unlock(&wq_pool_mutex);
                      }
              }
      
              /* create the initial worker */
              for_each_online_cpu(cpu) {
                      struct worker_pool *pool;
      
                      for_each_cpu_worker_pool(pool, cpu) {
                              pool->flags &= ~POOL_DISASSOCIATED;
                              BUG_ON(!create_worker(pool));
                      }
              }
      
              /* create default unbound and ordered wq attrs */
              for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                      struct workqueue_attrs *attrs;
      
                      BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
                      attrs->nice = std_nice[i];
                      unbound_std_wq_attrs[i] = attrs;
      
                      /*
                       * An ordered wq should have only one pwq as ordering is
                       * guaranteed by max_active which is enforced by pwqs.
                       * Turn off NUMA so that dfl_pwq is used for all nodes.
                       */
                      BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
                      attrs->nice = std_nice[i];
                      attrs->no_numa = true;
                      ordered_wq_attrs[i] = attrs;
              }
      
              system_wq = alloc_workqueue("events", 0, 0);
              system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
              system_long_wq = alloc_workqueue("events_long", 0, 0);
              system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                                  WQ_UNBOUND_MAX_ACTIVE);
              system_freezable_wq = alloc_workqueue("events_freezable",
                                                    WQ_FREEZABLE, 0);
              system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                                    WQ_POWER_EFFICIENT, 0);
              system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                                                    WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                                    0);
              BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
                     !system_unbound_wq || !system_freezable_wq ||
                     !system_power_efficient_wq ||
                     !system_freezable_power_efficient_wq);
              return 0;
      }
      early_initcall(init_workqueues);
      /*
       * fs/fs-writeback.c
       *
       * Copyright (C) 2002, Linus Torvalds.
       *
       * Contains all the functions related to writing back and waiting
       * upon dirty inodes against superblocks, and writing back dirty
       * pages against inodes.  ie: data writeback.  Writeout of the
       * inode itself is not handled here.
       *
       * 10Apr2002        Andrew Morton
       *                Split out of fs/inode.c
       *                Additions for address_space-based writeback
       */
      
      #include <linux/kernel.h>
      #include <linux/export.h>
      #include <linux/spinlock.h>
      #include <linux/slab.h>
      #include <linux/sched.h>
      #include <linux/fs.h>
      #include <linux/mm.h>
      #include <linux/pagemap.h>
      #include <linux/kthread.h>
      #include <linux/writeback.h>
      #include <linux/blkdev.h>
      #include <linux/backing-dev.h>
      #include <linux/tracepoint.h>
      #include <linux/device.h>
      #include <linux/memcontrol.h>
      #include "internal.h"
      
      /*
       * 4MB minimal write chunk size
       */
      #define MIN_WRITEBACK_PAGES        (4096UL >> (PAGE_CACHE_SHIFT - 10))
      
      struct wb_completion {
              atomic_t                cnt;
      };
      
      /*
       * Passed into wb_writeback(), essentially a subset of writeback_control
       */
      struct wb_writeback_work {
              long nr_pages;
              struct super_block *sb;
              unsigned long *older_than_this;
              enum writeback_sync_modes sync_mode;
              unsigned int tagged_writepages:1;
              unsigned int for_kupdate:1;
              unsigned int range_cyclic:1;
              unsigned int for_background:1;
              unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
              unsigned int auto_free:1;        /* free on completion */
              enum wb_reason reason;                /* why was writeback initiated? */
      
              struct list_head list;                /* pending work list */
              struct wb_completion *done;        /* set if the caller waits */
      };
      
      /*
       * If one wants to wait for one or more wb_writeback_works, each work's
       * ->done should be set to a wb_completion defined using the following
       * macro.  Once all work items are issued with wb_queue_work(), the caller
       * can wait for the completion of all using wb_wait_for_completion().  Work
       * items which are waited upon aren't freed automatically on completion.
       */
      #define DEFINE_WB_COMPLETION_ONSTACK(cmpl)                                \
              struct wb_completion cmpl = {                                        \
                      .cnt                = ATOMIC_INIT(1),                        \
              }
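       /*
        * Editor's note: illustrative sketch only.  It shows the pattern described
        * above: define an on-stack wb_completion, point one or more work items'
        * ->done at it, queue them, then wait.  example_sync_wb() and the
        * caller-provided @work are hypothetical.
        */
       #if 0
       static void example_sync_wb(struct bdi_writeback *wb,
                                   struct wb_writeback_work *work)
       {
               DEFINE_WB_COMPLETION_ONSTACK(done);

               work->done = &done;
               wb_queue_work(wb, work);

               /* returns once every queued work pointing at &done has finished */
               wb_wait_for_completion(wb->bdi, &done);
       }
       #endif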
      
      
      /*
       * If an inode is constantly having its pages dirtied, but then the
       * updates stop dirtytime_expire_interval seconds in the past, it's
       * possible for the worst case time between when an inode has its
       * timestamps updated and when they finally get written out to be two
       * dirtytime_expire_intervals.  We set the default to 12 hours (in
       * seconds), which means most of the time inodes will have their
       * timestamps written to disk after 12 hours, but in the worst case a
        * few inodes might not have their timestamps updated for 24 hours.
       */
      unsigned int dirtytime_expire_interval = 12 * 60 * 60;
      
      static inline struct inode *wb_inode(struct list_head *head)
      {
              return list_entry(head, struct inode, i_io_list);
      }
      
      /*
       * Include the creation of the trace points after defining the
       * wb_writeback_work structure and inline functions so that the definition
       * remains local to this file.
       */
      #define CREATE_TRACE_POINTS
      #include <trace/events/writeback.h>
      
      EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
      
      static bool wb_io_lists_populated(struct bdi_writeback *wb)
      {
               if (wb_has_dirty_io(wb)) {
                       return false;
               } else {
                       set_bit(WB_has_dirty_io, &wb->state);
                       WARN_ON_ONCE(!wb->avg_write_bandwidth);
                       atomic_long_add(wb->avg_write_bandwidth,
                                       &wb->bdi->tot_write_bandwidth);
                      return true;
              }
      }
      
      static void wb_io_lists_depopulated(struct bdi_writeback *wb)
      {
               if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
                   list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
                       clear_bit(WB_has_dirty_io, &wb->state);
                       WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
                                               &wb->bdi->tot_write_bandwidth) < 0);
               }
       }
      
      /**
       * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
       * @inode: inode to be moved
       * @wb: target bdi_writeback
       * @head: one of @wb->b_{dirty|io|more_io}
       *
        * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
       * Returns %true if @inode is the first occupant of the !dirty_time IO
       * lists; otherwise, %false.
       */
      static bool inode_io_list_move_locked(struct inode *inode,
                                            struct bdi_writeback *wb,
                                            struct list_head *head)
      {
  609         assert_spin_locked(&wb->list_lock);
      
  609         list_move(&inode->i_io_list, head);
      
              /* dirty_time doesn't count as dirty_io until expiration */
  609         if (head != &wb->b_dirty_time)
  609                 return wb_io_lists_populated(wb);
      
    1         wb_io_lists_depopulated(wb);
              return false;
   31 }
      
      /**
       * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
       * @inode: inode to be removed
       * @wb: bdi_writeback @inode is being removed from
       *
       * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
       * clear %WB_has_dirty_io if all are empty afterwards.
       */
      static void inode_io_list_del_locked(struct inode *inode,
                                           struct bdi_writeback *wb)
      {
  296         assert_spin_locked(&wb->list_lock);
      
  296         list_del_init(&inode->i_io_list);
              wb_io_lists_depopulated(wb);
      }
      
      static void wb_wakeup(struct bdi_writeback *wb)
      {
              spin_lock_bh(&wb->work_lock);
              if (test_bit(WB_registered, &wb->state))
                      mod_delayed_work(bdi_wq, &wb->dwork, 0);
              spin_unlock_bh(&wb->work_lock);
      }
      
      static void finish_writeback_work(struct bdi_writeback *wb,
                                        struct wb_writeback_work *work)
      {
              struct wb_completion *done = work->done;
      
              if (work->auto_free)
                      kfree(work);
              if (done && atomic_dec_and_test(&done->cnt))
                      wake_up_all(&wb->bdi->wb_waitq);
      }
      
      static void wb_queue_work(struct bdi_writeback *wb,
                                struct wb_writeback_work *work)
      {
   77         trace_writeback_queue(wb, work);
      
   77         if (work->done)
   77                 atomic_inc(&work->done->cnt);
      
   77         spin_lock_bh(&wb->work_lock);
      
              if (test_bit(WB_registered, &wb->state)) {
   77                 list_add_tail(&work->list, &wb->work_list);
   77                 mod_delayed_work(bdi_wq, &wb->dwork, 0);
              } else
                      finish_writeback_work(wb, work);
      
   77         spin_unlock_bh(&wb->work_lock);
      }
      
      /**
       * wb_wait_for_completion - wait for completion of bdi_writeback_works
       * @bdi: bdi work items were issued to
       * @done: target wb_completion
       *
       * Wait for one or more work items issued to @bdi with their ->done field
       * set to @done, which should have been defined with
       * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
       * work items are completed.  Work items which are waited upon aren't freed
       * automatically on completion.
       */
   76 static void wb_wait_for_completion(struct backing_dev_info *bdi,
                                         struct wb_completion *done)
      {
   91         atomic_dec(&done->cnt);                /* put down the initial count */
   76         wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
   91 }
      
      #ifdef CONFIG_CGROUP_WRITEBACK
      
       /* parameters for foreign inode detection, see wbc_detach_inode() */
       #define WB_FRN_TIME_SHIFT        13        /* 1s = 2^13, up to 8 secs w/ 16bit */
      #define WB_FRN_TIME_AVG_SHIFT        3        /* avg = avg * 7/8 + new * 1/8 */
      #define WB_FRN_TIME_CUT_DIV        2        /* ignore rounds < avg / 2 */
      #define WB_FRN_TIME_PERIOD        (2 * (1 << WB_FRN_TIME_SHIFT))        /* 2s */
      
      #define WB_FRN_HIST_SLOTS        16        /* inode->i_wb_frn_history is 16bit */
      #define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
                                              /* each slot's duration is 2s / 16 */
      #define WB_FRN_HIST_THR_SLOTS        (WB_FRN_HIST_SLOTS / 2)
                                              /* if foreign slots >= 8, switch */
      #define WB_FRN_HIST_MAX_SLOTS        (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
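
       /*
        * Worked example (editorial, illustrative) for the constants above: one
        * second is 1 << WB_FRN_TIME_SHIFT == 8192 time units, so
        * WB_FRN_TIME_PERIOD == 16384 (~2s) and WB_FRN_HIST_UNIT == 1024 (~1/8s
        * per history slot).  A round that consumed ~0.5s of IO time shifts
        * DIV_ROUND_UP(4096, 1024) == 4 slots into inode->i_wb_frn_history (the
        * shift is capped at WB_FRN_HIST_MAX_SLOTS == 5), and a switch verdict
        * in wbc_detach_inode() requires strictly more than
        * WB_FRN_HIST_THR_SLOTS == 8 foreign slots to be set.
        */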
      
      static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
      static struct workqueue_struct *isw_wq;
      
      void __inode_attach_wb(struct inode *inode, struct page *page)
      {
              struct backing_dev_info *bdi = inode_to_bdi(inode);
              struct bdi_writeback *wb = NULL;
      
              if (inode_cgwb_enabled(inode)) {
                      struct cgroup_subsys_state *memcg_css;
      
                      if (page) {
                              memcg_css = mem_cgroup_css_from_page(page);
                              wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                      } else {
                              /* must pin memcg_css, see wb_get_create() */
                              memcg_css = task_get_css(current, memory_cgrp_id);
                              wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                              css_put(memcg_css);
                      }
              }
      
              if (!wb)
                      wb = &bdi->wb;
      
              /*
               * There may be multiple instances of this function racing to
               * update the same inode.  Use cmpxchg() to tell the winner.
               */
              if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
                      wb_put(wb);
      }
      
      /**
       * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
       * @inode: inode of interest with i_lock held
       *
       * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
       * held on entry and is released on return.  The returned wb is guaranteed
       * to stay @inode's associated wb until its list_lock is released.
       */
      static struct bdi_writeback *
      locked_inode_to_wb_and_lock_list(struct inode *inode)
              __releases(&inode->i_lock)
              __acquires(&wb->list_lock)
      {
              while (true) {
                      struct bdi_writeback *wb = inode_to_wb(inode);
      
                      /*
                       * inode_to_wb() association is protected by both
                       * @inode->i_lock and @wb->list_lock but list_lock nests
                       * outside i_lock.  Drop i_lock and verify that the
                       * association hasn't changed after acquiring list_lock.
                       */
                      wb_get(wb);
                      spin_unlock(&inode->i_lock);
                      spin_lock(&wb->list_lock);
      
                       /* i_wb may have changed in between, can't use inode_to_wb() */
                      if (likely(wb == inode->i_wb)) {
                              wb_put(wb);        /* @inode already has ref */
                              return wb;
                      }
      
                      spin_unlock(&wb->list_lock);
                      wb_put(wb);
                      cpu_relax();
                      spin_lock(&inode->i_lock);
              }
      }
      
      /**
       * inode_to_wb_and_lock_list - determine an inode's wb and lock it
       * @inode: inode of interest
       *
       * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
       * on entry.
       */
      static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
              __acquires(&wb->list_lock)
      {
              spin_lock(&inode->i_lock);
              return locked_inode_to_wb_and_lock_list(inode);
      }
      
      struct inode_switch_wbs_context {
              struct inode                *inode;
              struct bdi_writeback        *new_wb;
      
              struct rcu_head                rcu_head;
              struct work_struct        work;
      };
      
      static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
      {
              down_write(&bdi->wb_switch_rwsem);
      }
      
      static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
      {
              up_write(&bdi->wb_switch_rwsem);
      }
      
      static void inode_switch_wbs_work_fn(struct work_struct *work)
      {
              struct inode_switch_wbs_context *isw =
                      container_of(work, struct inode_switch_wbs_context, work);
              struct inode *inode = isw->inode;
              struct backing_dev_info *bdi = inode_to_bdi(inode);
              struct address_space *mapping = inode->i_mapping;
              struct bdi_writeback *old_wb = inode->i_wb;
              struct bdi_writeback *new_wb = isw->new_wb;
              struct radix_tree_iter iter;
              bool switched = false;
              void **slot;
      
              /*
               * If @inode switches cgwb membership while sync_inodes_sb() is
               * being issued, sync_inodes_sb() might miss it.  Synchronize.
               */
              down_read(&bdi->wb_switch_rwsem);
      
              /*
               * By the time control reaches here, RCU grace period has passed
               * since I_WB_SWITCH assertion and all wb stat update transactions
               * between unlocked_inode_to_wb_begin/end() are guaranteed to be
               * synchronizing against mapping->tree_lock.
               *
               * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
               * gives us exclusion against all wb related operations on @inode
               * including IO list manipulations and stat updates.
               */
              if (old_wb < new_wb) {
                      spin_lock(&old_wb->list_lock);
                      spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
              } else {
                      spin_lock(&new_wb->list_lock);
                      spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
              }
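               /*
                * Editorial aside (illustrative, not from the original file):
                * the block above is the standard address-ordered idiom for
                * taking two locks of the same lock class, generically
                *
                *      if (a < b) {
                *              spin_lock(&a->lock);
                *              spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
                *      } else {
                *              spin_lock(&b->lock);
                *              spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
                *      }
                *
                * Every switcher picks the same order for a given wb pair, so
                * concurrent switches cannot ABBA-deadlock, and the _nested
                * annotation keeps lockdep happy about the second list_lock.
                */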
              spin_lock(&inode->i_lock);
              spin_lock_irq(&mapping->tree_lock);
      
              /*
               * Once I_FREEING is visible under i_lock, the eviction path owns
               * the inode and we shouldn't modify ->i_io_list.
               */
              if (unlikely(inode->i_state & I_FREEING))
                      goto skip_switch;
      
              /*
               * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
               * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
                * pages actually under writeback.
               */
              radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
                                         PAGECACHE_TAG_DIRTY) {
                      struct page *page = radix_tree_deref_slot_protected(slot,
                                                              &mapping->tree_lock);
                      if (likely(page) && PageDirty(page)) {
                              __dec_wb_stat(old_wb, WB_RECLAIMABLE);
                              __inc_wb_stat(new_wb, WB_RECLAIMABLE);
                      }
              }
      
              radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
                                         PAGECACHE_TAG_WRITEBACK) {
                      struct page *page = radix_tree_deref_slot_protected(slot,
                                                              &mapping->tree_lock);
                      if (likely(page)) {
                              WARN_ON_ONCE(!PageWriteback(page));
                              __dec_wb_stat(old_wb, WB_WRITEBACK);
                              __inc_wb_stat(new_wb, WB_WRITEBACK);
                      }
              }
      
              wb_get(new_wb);
      
              /*
               * Transfer to @new_wb's IO list if necessary.  The specific list
               * @inode was on is ignored and the inode is put on ->b_dirty which
               * is always correct including from ->b_dirty_time.  The transfer
               * preserves @inode->dirtied_when ordering.
               */
              if (!list_empty(&inode->i_io_list)) {
                      struct inode *pos;
      
                      inode_io_list_del_locked(inode, old_wb);
                      inode->i_wb = new_wb;
                      list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
                              if (time_after_eq(inode->dirtied_when,
                                                pos->dirtied_when))
                                      break;
                      inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
              } else {
                      inode->i_wb = new_wb;
              }
      
               /* ->i_wb_frn updates may race wbc_detach_inode() but that doesn't matter */
              inode->i_wb_frn_winner = 0;
              inode->i_wb_frn_avg_time = 0;
              inode->i_wb_frn_history = 0;
              switched = true;
      skip_switch:
              /*
               * Paired with load_acquire in unlocked_inode_to_wb_begin() and
               * ensures that the new wb is visible if they see !I_WB_SWITCH.
               */
              smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
      
              spin_unlock_irq(&mapping->tree_lock);
              spin_unlock(&inode->i_lock);
              spin_unlock(&new_wb->list_lock);
              spin_unlock(&old_wb->list_lock);
      
              up_read(&bdi->wb_switch_rwsem);
      
              if (switched) {
                      wb_wakeup(new_wb);
                      wb_put(old_wb);
              }
              wb_put(new_wb);
      
              iput(inode);
              kfree(isw);
      
              atomic_dec(&isw_nr_in_flight);
      }
      
      static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
      {
              struct inode_switch_wbs_context *isw = container_of(rcu_head,
                                      struct inode_switch_wbs_context, rcu_head);
      
              /* needs to grab bh-unsafe locks, bounce to work item */
              INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
              queue_work(isw_wq, &isw->work);
      }
      
      /**
       * inode_switch_wbs - change the wb association of an inode
       * @inode: target inode
       * @new_wb_id: ID of the new wb
       *
       * Switch @inode's wb association to the wb identified by @new_wb_id.  The
       * switching is performed asynchronously and may fail silently.
       */
      static void inode_switch_wbs(struct inode *inode, int new_wb_id)
      {
              struct backing_dev_info *bdi = inode_to_bdi(inode);
              struct cgroup_subsys_state *memcg_css;
              struct inode_switch_wbs_context *isw;
      
              /* noop if seems to be already in progress */
              if (inode->i_state & I_WB_SWITCH)
                      return;
      
              /*
               * Avoid starting new switches while sync_inodes_sb() is in
               * progress.  Otherwise, if the down_write protected issue path
               * blocks heavily, we might end up starting a large number of
               * switches which will block on the rwsem.
               */
              if (!down_read_trylock(&bdi->wb_switch_rwsem))
                      return;
      
              isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
              if (!isw)
                      goto out_unlock;
      
              /* find and pin the new wb */
              rcu_read_lock();
              memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
              if (memcg_css)
                      isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
              rcu_read_unlock();
              if (!isw->new_wb)
                      goto out_free;
      
              /* while holding I_WB_SWITCH, no one else can update the association */
              spin_lock(&inode->i_lock);
              if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
                  inode->i_state & (I_WB_SWITCH | I_FREEING) ||
                  inode_to_wb(inode) == isw->new_wb) {
                      spin_unlock(&inode->i_lock);
                      goto out_free;
              }
              inode->i_state |= I_WB_SWITCH;
              spin_unlock(&inode->i_lock);
      
              ihold(inode);
              isw->inode = inode;
      
              /*
               * In addition to synchronizing among switchers, I_WB_SWITCH tells
               * the RCU protected stat update paths to grab the mapping's
               * tree_lock so that stat transfer can synchronize against them.
               * Let's continue after I_WB_SWITCH is guaranteed to be visible.
               */
              call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
      
              atomic_inc(&isw_nr_in_flight);
      
              goto out_unlock;
      
      out_free:
              if (isw->new_wb)
                      wb_put(isw->new_wb);
              kfree(isw);
      out_unlock:
              up_read(&bdi->wb_switch_rwsem);
      }
      
      /**
       * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
       * @wbc: writeback_control of interest
       * @inode: target inode
       *
       * @inode is locked and about to be written back under the control of @wbc.
       * Record @inode's writeback context into @wbc and unlock the i_lock.  On
       * writeback completion, wbc_detach_inode() should be called.  This is used
       * to track the cgroup writeback context.
       */
      void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                       struct inode *inode)
      {
              if (!inode_cgwb_enabled(inode)) {
                      spin_unlock(&inode->i_lock);
                      return;
              }
      
              wbc->wb = inode_to_wb(inode);
              wbc->inode = inode;
      
              wbc->wb_id = wbc->wb->memcg_css->id;
              wbc->wb_lcand_id = inode->i_wb_frn_winner;
              wbc->wb_tcand_id = 0;
              wbc->wb_bytes = 0;
              wbc->wb_lcand_bytes = 0;
              wbc->wb_tcand_bytes = 0;
      
              wb_get(wbc->wb);
              spin_unlock(&inode->i_lock);
      
              /*
               * A dying wb indicates that either the blkcg associated with the
               * memcg changed or the associated memcg is dying.  In the first
               * case, a replacement wb should already be available and we should
               * refresh the wb immediately.  In the second case, trying to
               * refresh will keep failing.
               */
              if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
                      inode_switch_wbs(inode, wbc->wb_id);
      }
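
       /*
        * Illustrative lifecycle sketch (editorial; mirrors what
        * writeback_single_inode() below actually does):
        *
        *      spin_lock(&inode->i_lock);
        *      ...
        *      wbc_attach_and_unlock_inode(wbc, inode);    releases i_lock
        *      ret = __writeback_single_inode(inode, wbc);
        *      wbc_detach_inode(wbc);                      foreign detection runs
        *
        * While attached, the filesystem's writepage path is expected to call
        * wbc_account_io() so that every byte written is attributed to a
        * candidate wb for the majority vote below.
        */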
      
      /**
       * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
       * @wbc: writeback_control of the just finished writeback
       *
       * To be called after a writeback attempt of an inode finishes and undoes
       * wbc_attach_and_unlock_inode().  Can be called under any context.
       *
       * As concurrent write sharing of an inode is expected to be very rare and
       * memcg only tracks page ownership on first-use basis severely confining
       * the usefulness of such sharing, cgroup writeback tracks ownership
       * per-inode.  While the support for concurrent write sharing of an inode
       * is deemed unnecessary, an inode being written to by different cgroups at
       * different points in time is a lot more common, and, more importantly,
       * charging only by first-use can too readily lead to grossly incorrect
        * behaviors (a single foreign page can cause gigabytes of writeback to
        * be incorrectly attributed).
       *
       * To resolve this issue, cgroup writeback detects the majority dirtier of
        * an inode and transfers the ownership to it.  To avoid unnecessary
       * oscillation, the detection mechanism keeps track of history and gives
       * out the switch verdict only if the foreign usage pattern is stable over
       * a certain amount of time and/or writeback attempts.
       *
       * On each writeback attempt, @wbc tries to detect the majority writer
       * using Boyer-Moore majority vote algorithm.  In addition to the byte
       * count from the majority voting, it also counts the bytes written for the
       * current wb and the last round's winner wb (max of last round's current
       * wb, the winner from two rounds ago, and the last round's majority
       * candidate).  Keeping track of the historical winner helps the algorithm
       * to semi-reliably detect the most active writer even when it's not the
       * absolute majority.
       *
       * Once the winner of the round is determined, whether the winner is
       * foreign or not and how much IO time the round consumed is recorded in
       * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
       * over a certain threshold, the switch verdict is given.
       */
      void wbc_detach_inode(struct writeback_control *wbc)
      {
              struct bdi_writeback *wb = wbc->wb;
              struct inode *inode = wbc->inode;
              unsigned long avg_time, max_bytes, max_time;
              u16 history;
              int max_id;
      
              if (!wb)
                      return;
      
              history = inode->i_wb_frn_history;
              avg_time = inode->i_wb_frn_avg_time;
      
              /* pick the winner of this round */
              if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
                  wbc->wb_bytes >= wbc->wb_tcand_bytes) {
                      max_id = wbc->wb_id;
                      max_bytes = wbc->wb_bytes;
              } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
                      max_id = wbc->wb_lcand_id;
                      max_bytes = wbc->wb_lcand_bytes;
              } else {
                      max_id = wbc->wb_tcand_id;
                      max_bytes = wbc->wb_tcand_bytes;
              }
      
              /*
               * Calculate the amount of IO time the winner consumed and fold it
               * into the running average kept per inode.  If the consumed IO
                * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
               * deciding whether to switch or not.  This is to prevent one-off
               * small dirtiers from skewing the verdict.
               */
              max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
                                      wb->avg_write_bandwidth);
              if (avg_time)
                      avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
                                  (avg_time >> WB_FRN_TIME_AVG_SHIFT);
              else
                      avg_time = max_time;        /* immediate catch up on first run */
      
              if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
                      int slots;
      
                      /*
                       * The switch verdict is reached if foreign wb's consume
                       * more than a certain proportion of IO time in a
                       * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
                       * history mask where each bit represents one sixteenth of
                       * the period.  Determine the number of slots to shift into
                       * history from @max_time.
                       */
                      slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                                  (unsigned long)WB_FRN_HIST_MAX_SLOTS);
                      history <<= slots;
                      if (wbc->wb_id != max_id)
                              history |= (1U << slots) - 1;
      
                      /*
                       * Switch if the current wb isn't the consistent winner.
                       * If there are multiple closely competing dirtiers, the
                       * inode may switch across them repeatedly over time, which
                       * is okay.  The main goal is avoiding keeping an inode on
                       * the wrong wb for an extended period of time.
                       */
                      if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
                              inode_switch_wbs(inode, max_id);
              }
      
              /*
               * Multiple instances of this function may race to update the
                * following fields but we don't mind occasional inaccuracies.
               */
              inode->i_wb_frn_winner = max_id;
              inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
              inode->i_wb_frn_history = history;
      
              wb_put(wbc->wb);
              wbc->wb = NULL;
      }
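
       /*
        * Worked example (editorial, illustrative) of the verdict math above:
        * starting from history == 0x001f, a foreign round worth 3 slots gives
        * (0x001f << 3) | 0x7 == 0x00ff, whose popcount of 8 is still not
        * strictly greater than WB_FRN_HIST_THR_SLOTS (8).  One more foreign
        * round worth a single slot yields 0x01ff, hweight32() == 9 > 8, and
        * inode_switch_wbs() is called with that round's winner.  Bits shifted
        * off the 16-bit history simply age out of the ~2s window.
        */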
      
      /**
       * wbc_account_io - account IO issued during writeback
       * @wbc: writeback_control of the writeback in progress
       * @page: page being written out
       * @bytes: number of bytes being written out
       *
        * @bytes from @page are about to be written out during the writeback
       * controlled by @wbc.  Keep the book for foreign inode detection.  See
       * wbc_detach_inode().
       */
      void wbc_account_io(struct writeback_control *wbc, struct page *page,
                          size_t bytes)
      {
              int id;
      
              /*
               * pageout() path doesn't attach @wbc to the inode being written
               * out.  This is intentional as we don't want the function to block
               * behind a slow cgroup.  Ultimately, we want pageout() to kick off
               * regular writeback instead of writing things out itself.
               */
              if (!wbc->wb)
                      return;
      
              rcu_read_lock();
              id = mem_cgroup_css_from_page(page)->id;
              rcu_read_unlock();
      
              if (id == wbc->wb_id) {
                      wbc->wb_bytes += bytes;
                      return;
              }
      
              if (id == wbc->wb_lcand_id)
                      wbc->wb_lcand_bytes += bytes;
      
              /* Boyer-Moore majority vote algorithm */
              if (!wbc->wb_tcand_bytes)
                      wbc->wb_tcand_id = id;
              if (id == wbc->wb_tcand_id)
                      wbc->wb_tcand_bytes += bytes;
              else
                      wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
      }
      EXPORT_SYMBOL_GPL(wbc_account_io);
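
       /*
        * Worked example (editorial, illustrative): suppose the current wb
        * belongs to cgroup B and the pages written during one attempt are
        * owned by A, A, B, A at 4k each.  The B page adds 4k to wb_bytes,
        * while the A pages elect A as the majority candidate
        * (wb_tcand_id == A, wb_tcand_bytes == 12k).  In wbc_detach_inode()
        * the candidate out-scores the current wb, so A is that round's winner
        * and the foreign history starts accumulating toward a switch.
        */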
      
      /**
       * inode_congested - test whether an inode is congested
       * @inode: inode to test for congestion (may be NULL)
       * @cong_bits: mask of WB_[a]sync_congested bits to test
       *
       * Tests whether @inode is congested.  @cong_bits is the mask of congestion
       * bits to test and the return value is the mask of set bits.
       *
       * If cgroup writeback is enabled for @inode, the congestion state is
       * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
       * associated with @inode is congested; otherwise, the root wb's congestion
       * state is used.
       *
       * @inode is allowed to be NULL as this function is often called on
       * mapping->host which is NULL for the swapper space.
       */
      int inode_congested(struct inode *inode, int cong_bits)
      {
              /*
               * Once set, ->i_wb never becomes NULL while the inode is alive.
               * Start transaction iff ->i_wb is visible.
               */
              if (inode && inode_to_wb_is_valid(inode)) {
                      struct bdi_writeback *wb;
                      struct wb_lock_cookie lock_cookie = {};
                      bool congested;
      
                      wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
                      congested = wb_congested(wb, cong_bits);
                      unlocked_inode_to_wb_end(inode, &lock_cookie);
                      return congested;
              }
      
              return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
      }
      EXPORT_SYMBOL_GPL(inode_congested);
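
       /*
        * Illustrative usage sketch (editorial; the caller and the congestion
        * bit chosen here are hypothetical):
        *
        *      if (inode_congested(mapping->host, 1 << WB_async_congested))
        *              return;         skip optional IO for now
        *
        * mapping->host may legitimately be NULL (swapper space), which is why
        * the NULL check is folded into the function itself.
        */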
      
      /**
       * wb_split_bdi_pages - split nr_pages to write according to bandwidth
       * @wb: target bdi_writeback to split @nr_pages to
       * @nr_pages: number of pages to write for the whole bdi
       *
       * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
       * relation to the total write bandwidth of all wb's w/ dirty inodes on
       * @wb->bdi.
       */
      static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
      {
              unsigned long this_bw = wb->avg_write_bandwidth;
              unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
      
              if (nr_pages == LONG_MAX)
                      return LONG_MAX;
      
              /*
               * This may be called on clean wb's and proportional distribution
               * may not make sense, just use the original @nr_pages in those
                * cases.  In general, we want to err on the side of writing more.
               */
              if (!tot_bw || this_bw >= tot_bw)
                      return nr_pages;
              else
                      return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
      }
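
       /*
        * Worked example (editorial, illustrative): for nr_pages == 1024 with
        * this_bw == 25MB/s and tot_bw == 100MB/s across the bdi's wbs, the wb
        * is asked to write DIV_ROUND_UP_ULL(1024 * 25, 100) == 256 pages.  A
        * wb owning all of the bandwidth, or a bdi with no recorded bandwidth
        * at all, simply gets the full nr_pages.
        */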
      
      /**
       * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
       * @bdi: target backing_dev_info
       * @base_work: wb_writeback_work to issue
       * @skip_if_busy: skip wb's which already have writeback in progress
       *
       * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
        * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
       * distributed to the busy wbs according to each wb's proportion in the
       * total active write bandwidth of @bdi.
       */
      static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                        struct wb_writeback_work *base_work,
                                        bool skip_if_busy)
      {
              struct bdi_writeback *last_wb = NULL;
              struct bdi_writeback *wb = list_entry(&bdi->wb_list,
                                                    struct bdi_writeback, bdi_node);
      
              might_sleep();
      restart:
              rcu_read_lock();
              list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                      DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                      struct wb_writeback_work fallback_work;
                      struct wb_writeback_work *work;
                      long nr_pages;
      
                      if (last_wb) {
                              wb_put(last_wb);
                              last_wb = NULL;
                      }
      
                      /* SYNC_ALL writes out I_DIRTY_TIME too */
                      if (!wb_has_dirty_io(wb) &&
                          (base_work->sync_mode == WB_SYNC_NONE ||
                           list_empty(&wb->b_dirty_time)))
                              continue;
                      if (skip_if_busy && writeback_in_progress(wb))
                              continue;
      
                      nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
      
                      work = kmalloc(sizeof(*work), GFP_ATOMIC);
                      if (work) {
                              *work = *base_work;
                              work->nr_pages = nr_pages;
                              work->auto_free = 1;
                              wb_queue_work(wb, work);
                              continue;
                      }
      
                      /* alloc failed, execute synchronously using on-stack fallback */
                      work = &fallback_work;
                      *work = *base_work;
                      work->nr_pages = nr_pages;
                      work->auto_free = 0;
                      work->done = &fallback_work_done;
      
                      wb_queue_work(wb, work);
      
                      /*
                       * Pin @wb so that it stays on @bdi->wb_list.  This allows
                       * continuing iteration from @wb after dropping and
                       * regrabbing rcu read lock.
                       */
                      wb_get(wb);
                      last_wb = wb;
      
                      rcu_read_unlock();
                      wb_wait_for_completion(bdi, &fallback_work_done);
                      goto restart;
              }
              rcu_read_unlock();
      
              if (last_wb)
                      wb_put(last_wb);
      }
      
       /**
       * cgroup_writeback_umount - flush inode wb switches for umount
       *
       * This function is called when a super_block is about to be destroyed and
       * flushes in-flight inode wb switches.  An inode wb switch goes through
       * RCU and then workqueue, so the two need to be flushed in order to ensure
       * that all previously scheduled switches are finished.  As wb switches are
       * rare occurrences and synchronize_rcu() can take a while, perform
       * flushing iff wb switches are in flight.
       */
  332 void cgroup_writeback_umount(void)
      {
              if (atomic_read(&isw_nr_in_flight)) {
                      /*
                       * Use rcu_barrier() to wait for all pending callbacks to
                       * ensure that all in-flight wb switches are in the workqueue.
                       */
                      rcu_barrier();
                      flush_workqueue(isw_wq);
              }
      }
      
      static int __init cgroup_writeback_init(void)
      {
              isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
   91         if (!isw_wq)
                      return -ENOMEM;
   31         return 0;
   64 }
      fs_initcall(cgroup_writeback_init);
      
      #else        /* CONFIG_CGROUP_WRITEBACK */
      
      static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
      static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
      
      static struct bdi_writeback *
      locked_inode_to_wb_and_lock_list(struct inode *inode)
              __releases(&inode->i_lock)
              __acquires(&wb->list_lock)
   13 {
              struct bdi_writeback *wb = inode_to_wb(inode);
      
              spin_unlock(&inode->i_lock);
              spin_lock(&wb->list_lock);
              return wb;
      }
   13 
      static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
              __acquires(&wb->list_lock)
      {
              struct bdi_writeback *wb = inode_to_wb(inode);
      
              spin_lock(&wb->list_lock);
   13         return wb;
      }
      
      static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
      {
              return nr_pages;
   13 }
      
      static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                        struct wb_writeback_work *base_work,
                                        bool skip_if_busy)
      {
              might_sleep();
      
              if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
                      base_work->auto_free = 0;
                      wb_queue_work(&bdi->wb, base_work);
              }
      }
      
      #endif        /* CONFIG_CGROUP_WRITEBACK */
      
      void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                              bool range_cyclic, enum wb_reason reason)
      {
              struct wb_writeback_work *work;
      
              if (!wb_has_dirty_io(wb))
                      return;
      
              /*
               * This is WB_SYNC_NONE writeback, so if allocation fails just
               * wakeup the thread for old dirty data writeback
               */
              work = kzalloc(sizeof(*work), GFP_ATOMIC);
              if (!work) {
   17                 trace_writeback_nowork(wb);
                      wb_wakeup(wb);
                      return;
              }
      
              work->sync_mode        = WB_SYNC_NONE;
              work->nr_pages        = nr_pages;
              work->range_cyclic = range_cyclic;
              work->reason        = reason;
              work->auto_free        = 1;
      
              wb_queue_work(wb, work);
      }
      
      /**
       * wb_start_background_writeback - start background writeback
       * @wb: bdi_writback to write from
       *
       * Description:
       *   This makes sure WB_SYNC_NONE background writeback happens. When
        *   this function returns, it is only guaranteed that for the given wb
        *   some IO is happening if we are over the background dirty threshold.
       *   Caller need not hold sb s_umount semaphore.
       */
      void wb_start_background_writeback(struct bdi_writeback *wb)
      {
              /*
               * We just wake up the flusher thread. It will perform background
               * writeback as soon as there is no other work to do.
               */
              trace_writeback_wake_background(wb);
              wb_wakeup(wb);
      }
      
      /*
       * Remove the inode from the writeback list it is on.
       */
      void inode_io_list_del(struct inode *inode)
      {
              struct bdi_writeback *wb;
      
              wb = inode_to_wb_and_lock_list(inode);
              inode_io_list_del_locked(inode, wb);
              spin_unlock(&wb->list_lock);
      }
      
      /*
       * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
       * furthest end of its superblock's dirty-inode list.
       *
       * Before stamping the inode's ->dirtied_when, we check to see whether it is
       * already the most-recently-dirtied inode on the b_dirty list.  If that is
       * the case then the inode must have been redirtied while it was being written
       * out and we don't reset its dirtied_when.
       */
      static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
      {
              if (!list_empty(&wb->b_dirty)) {
                      struct inode *tail;
      
                      tail = wb_inode(wb->b_dirty.next);
                      if (time_before(inode->dirtied_when, tail->dirtied_when))
                              inode->dirtied_when = jiffies;
              }
              inode_io_list_move_locked(inode, wb, &wb->b_dirty);
      }
      
      /*
       * requeue inode for re-scanning after bdi->b_io list is exhausted.
       */
      static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
      {
              inode_io_list_move_locked(inode, wb, &wb->b_more_io);
      }
      
      static void inode_sync_complete(struct inode *inode)
      {
              inode->i_state &= ~I_SYNC;
               /* If inode is clean and unused, put it into LRU now... */
              inode_add_lru(inode);
              /* Waiters must see I_SYNC cleared before being woken up */
              smp_mb();
              wake_up_bit(&inode->i_state, __I_SYNC);
      }
      
      static bool inode_dirtied_after(struct inode *inode, unsigned long t)
      {
              bool ret = time_after(inode->dirtied_when, t);
      #ifndef CONFIG_64BIT
              /*
               * For inodes being constantly redirtied, dirtied_when can get stuck.
               * It _appears_ to be in the future, but is actually in distant past.
               * This test is necessary to prevent such wrapped-around relative times
               * from permanently stopping the whole bdi writeback.
               */
              ret = ret && time_before_eq(inode->dirtied_when, jiffies);
      #endif
              return ret;
      }
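
       /*
        * Worked example (editorial, illustrative; 32-bit jiffies, HZ == 1000):
        * the counter wraps every ~49.7 days and time_after() compares by
        * signed difference, so a dirtied_when stamp more than ~24.8 days old
        * can spuriously look newer than t.  Such a stale stamp also looks
        * newer than the current jiffies, so the extra
        * time_before_eq(dirtied_when, jiffies) test rejects it and the inode
        * is treated as expired instead of being skipped forever.
        */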
      
      #define EXPIRE_DIRTY_ATIME 0x0001
      
      /*
       * Move expired (dirtied before work->older_than_this) dirty inodes from
       * @delaying_queue to @dispatch_queue.
       */
      static int move_expired_inodes(struct list_head *delaying_queue,
                                     struct list_head *dispatch_queue,
                                     int flags,
                                     struct wb_writeback_work *work)
      {
              unsigned long *older_than_this = NULL;
              unsigned long expire_time;
              LIST_HEAD(tmp);
              struct list_head *pos, *node;
              struct super_block *sb = NULL;
              struct inode *inode;
              int do_sb_sort = 0;
              int moved = 0;
      
              if ((flags & EXPIRE_DIRTY_ATIME) == 0)
                      older_than_this = work->older_than_this;
              else if (!work->for_sync) {
                      expire_time = jiffies - (dirtytime_expire_interval * HZ);
                      older_than_this = &expire_time;
              }
              while (!list_empty(delaying_queue)) {
                      inode = wb_inode(delaying_queue->prev);
                      if (older_than_this &&
                          inode_dirtied_after(inode, *older_than_this))
                              break;
                      list_move(&inode->i_io_list, &tmp);
                      moved++;
                      if (flags & EXPIRE_DIRTY_ATIME)
                              set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
                      if (sb_is_blkdev_sb(inode->i_sb))
                              continue;
                      if (sb && sb != inode->i_sb)
                              do_sb_sort = 1;
                      sb = inode->i_sb;
              }
      
              /* just one sb in list, splice to dispatch_queue and we're done */
              if (!do_sb_sort) {
                      list_splice(&tmp, dispatch_queue);
                      goto out;
              }
      
              /* Move inodes from one superblock together */
  318         while (!list_empty(&tmp)) {
  318                 sb = wb_inode(tmp.prev)->i_sb;
  318                 list_for_each_prev_safe(pos, node, &tmp) {
  318                         inode = wb_inode(pos);
                              if (inode->i_sb == sb)
                                      list_move(&inode->i_io_list, dispatch_queue);
                      }
              }
      out:
              return moved;
      }
      
      /*
       * Queue all expired dirty inodes for io, eldest first.
       * Before
       *         newly dirtied     b_dirty    b_io    b_more_io
       *         =============>    gf         edc     BA
        * After
        *         newly dirtied     b_dirty    b_io    b_more_io
        *         =============>    g          fBAedc
        *                                           |
        *                                           +--> dequeue for IO
        */
      static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
      {
              int moved;
      
              assert_spin_locked(&wb->list_lock);
              list_splice_init(&wb->b_more_io, &wb->b_io);
              moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
              moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                           EXPIRE_DIRTY_ATIME, work);
              if (moved)
                      wb_io_lists_populated(wb);
  703         trace_writeback_queue_io(wb, work, moved);
      }
      
      static int write_inode(struct inode *inode, struct writeback_control *wbc)
      {
              int ret;
      
              if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
                      trace_writeback_write_inode_start(inode, wbc);
                      ret = inode->i_sb->s_op->write_inode(inode, wbc);
                      trace_writeback_write_inode(inode, wbc);
                      return ret;
              }
              return 0;
      }
      
      /*
       * Wait for writeback on an inode to complete. Called with i_lock held.
       * Caller must make sure inode cannot go away when we drop i_lock.
       */
      static void __inode_wait_for_writeback(struct inode *inode)
              __releases(inode->i_lock)
              __acquires(inode->i_lock)
      {
              DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
              wait_queue_head_t *wqh;
      
              wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
              while (inode->i_state & I_SYNC) {
                      spin_unlock(&inode->i_lock);
                      __wait_on_bit(wqh, &wq, bit_wait,
                                    TASK_UNINTERRUPTIBLE);
                      spin_lock(&inode->i_lock);
              }
      }
      
      /*
       * Wait for writeback on an inode to complete. Caller must have inode pinned.
       */
      void inode_wait_for_writeback(struct inode *inode)
      {
              spin_lock(&inode->i_lock);
              __inode_wait_for_writeback(inode);
              spin_unlock(&inode->i_lock);
      }
      
      /*
       * Sleep until I_SYNC is cleared. This function must be called with i_lock
       * held and drops it. It is aimed for callers not holding any inode reference
       * so once i_lock is dropped, inode can go away.
       */
      static void inode_sleep_on_writeback(struct inode *inode)
              __releases(inode->i_lock)
      {
              DEFINE_WAIT(wait);
              wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
              int sleep;
      
              prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
              sleep = inode->i_state & I_SYNC;
              spin_unlock(&inode->i_lock);
              if (sleep)
                      schedule();
              finish_wait(wqh, &wait);
      }
      
      /*
       * Find proper writeback list for the inode depending on its current state and
        * possibly also a change of its state while we were doing writeback.  Here we
        * handle things such as livelock prevention or fairness of writeback among
        * inodes. This function can be called only by the flusher thread - no one else
       * processes all inodes in writeback lists and requeueing inodes behind flusher
       * thread's back can have unexpected consequences.
       */
      static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                                struct writeback_control *wbc)
      {
              if (inode->i_state & I_FREEING)
                      return;
      
              /*
               * Sync livelock prevention. Each inode is tagged and synced in one
               * shot. If still dirty, it will be redirty_tail()'ed below.  Update
               * the dirty time to prevent enqueue and sync it again.
               */
              if ((inode->i_state & I_DIRTY) &&
                  (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
                      inode->dirtied_when = jiffies;
      
              if (wbc->pages_skipped) {
                      /*
                       * writeback is not making progress due to locked
                       * buffers. Skip this inode for now.
                       */
                      redirty_tail(inode, wb);
                      return;
              }
      
              if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                       /*
                        * We didn't write back all the pages.  nfs_writepages()
                        * sometimes bails out without doing anything.
                       */
                      if (wbc->nr_to_write <= 0) {
                              /* Slice used up. Queue for next turn. */
                              requeue_io(inode, wb);
  322                 } else {
                              /*
                                * Writeback blocked by something other than
                               * congestion. Delay the inode for some time to
                               * avoid spinning on the CPU (100% iowait)
                               * retrying writeback of the dirty page/inode
                               * that cannot be performed immediately.
                               */
                              redirty_tail(inode, wb);
                      }
              } else if (inode->i_state & I_DIRTY) {
                      /*
                        * Filesystems can dirty the inode during writeback operations,
                       * such as delayed allocation during submission or metadata
                       * updates after data IO completion.
                       */
                      redirty_tail(inode, wb);
              } else if (inode->i_state & I_DIRTY_TIME) {
                      inode->dirtied_when = jiffies;
                      inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
              } else {
                      /* The inode is clean. Remove from writeback lists. */
  322                 inode_io_list_del_locked(inode, wb);
              }
      }
      
      /*
       * Write out an inode and its dirty pages. Do not update the writeback list
       * linkage. That is left to the caller. The caller is also responsible for
       * setting I_SYNC flag and calling inode_sync_complete() to clear it.
       */
      static int
      __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
      {
              struct address_space *mapping = inode->i_mapping;
  322         long nr_to_write = wbc->nr_to_write;
  322         unsigned dirty;
              int ret;
      
              WARN_ON(!(inode->i_state & I_SYNC));
      
              trace_writeback_single_inode_start(inode, wbc, nr_to_write);
      
              ret = do_writepages(mapping, wbc);
      
              /*
               * Make sure to wait on the data before writing out the metadata.
               * This is important for filesystems that modify metadata on data
               * I/O completion. We don't do it for sync(2) writeback because it has a
               * separate, external IO completion path and ->sync_fs for guaranteeing
               * inode metadata is written back correctly.
               */
   79         if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                      int err = filemap_fdatawait(mapping);
  322                 if (ret == 0)
                              ret = err;
              }
      
              /*
                * Some filesystems may redirty the inode during the writeback
                * due to delalloc, clear dirty metadata flags right before
                * write_inode()
               */
              spin_lock(&inode->i_lock);
  322 
  322         dirty = inode->i_state & I_DIRTY;
              if (inode->i_state & I_DIRTY_TIME) {
                      if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
                          unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
                          unlikely(time_after(jiffies,
                                              (inode->dirtied_time_when +
                                               dirtytime_expire_interval * HZ)))) {
                              dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
                              trace_writeback_lazytime(inode);
                      }
              } else
                      inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
              inode->i_state &= ~dirty;
      
              /*
               * Paired with smp_mb() in __mark_inode_dirty().  This allows
               * __mark_inode_dirty() to test i_state without grabbing i_lock -
                * either they see the I_DIRTY bits cleared or we see the dirtied
                * inode.
                *
                * I_DIRTY_PAGES is always cleared together above even if @mapping
                * still has dirty pages.  The flag is reinstated after smp_mb() if
                * necessary.  This guarantees that either __mark_inode_dirty()
                * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
                */
              smp_mb();
      
              if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                      inode->i_state |= I_DIRTY_PAGES;
      
              spin_unlock(&inode->i_lock);
    2 
              if (dirty & I_DIRTY_TIME)
                      mark_inode_dirty_sync(inode);
              /* Don't write the inode if only I_DIRTY_PAGES was set */
              if (dirty & ~I_DIRTY_PAGES) {
                      int err = write_inode(inode, wbc);
                      if (ret == 0)
                              ret = err;
              }
              trace_writeback_single_inode(inode, wbc, nr_to_write);
              return ret;
  322 }
   27 
       /*
        * Write out an inode's dirty pages. Either the caller has an active reference
        * on the inode or the inode has I_WILL_FREE set.
        *
        * This function is meant for writing back one inode at a time, e.g. on
        * behalf of a filesystem. The flusher thread instead uses
        * __writeback_single_inode() and does more profound writeback list
        * handling in writeback_sb_inodes().
       */
      static int writeback_single_inode(struct inode *inode,
                                        struct writeback_control *wbc)
      {
              struct bdi_writeback *wb;
              int ret = 0;
      
              spin_lock(&inode->i_lock);
              if (!atomic_read(&inode->i_count))
  286                 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
  322         else
                      WARN_ON(inode->i_state & I_WILL_FREE);
      
  322         if (inode->i_state & I_SYNC) {
                      if (wbc->sync_mode != WB_SYNC_ALL)
                              goto out;
                      /*
                       * It's a data-integrity sync. We must wait. Since callers hold
                       * inode reference or inode has I_WILL_FREE set, it cannot go
                       * away under us.
                       */
                      __inode_wait_for_writeback(inode);
              }
              WARN_ON(inode->i_state & I_SYNC);
              /*
               * Skip inode if it is clean and we have no outstanding writeback in
               * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
               * function since flusher thread may be doing for example sync in
               * parallel and if we move the inode, it could get skipped. So here we
               * make sure inode is on some writeback list and leave it there unless
               * we have completely cleaned the inode.
               */
              if (!(inode->i_state & I_DIRTY_ALL) &&
                  (wbc->sync_mode != WB_SYNC_ALL ||
                   !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                      goto out;
              inode->i_state |= I_SYNC;
              wbc_attach_and_unlock_inode(wbc, inode);
      
              ret = __writeback_single_inode(inode, wbc);
      
              wbc_detach_inode(wbc);
      
              wb = inode_to_wb_and_lock_list(inode);
              spin_lock(&inode->i_lock);
              /*
               * If inode is clean, remove it from writeback lists. Otherwise don't
               * touch it. See comment above for explanation.
               */
              if (!(inode->i_state & I_DIRTY_ALL))
                      inode_io_list_del_locked(inode, wb);
              spin_unlock(&wb->list_lock);
              inode_sync_complete(inode);
      out:
              spin_unlock(&inode->i_lock);
              return ret;
      }
      
      static long writeback_chunk_size(struct bdi_writeback *wb,
                                       struct wb_writeback_work *work)
      {
              long pages;
      
              /*
               * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
               * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
               * here avoids calling into writeback_inodes_wb() more than once.
               *
               * The intended call sequence for WB_SYNC_ALL writeback is:
               *
               *      wb_writeback()
               *          writeback_sb_inodes()       <== called only once
               *              write_cache_pages()     <== called once for each inode
               *                   (quickly) tag currently dirty pages
               *                   (maybe slowly) sync all tagged pages
               */
              if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                      pages = LONG_MAX;
              else {
                      pages = min(wb->avg_write_bandwidth / 2,
                                  global_wb_domain.dirty_limit / DIRTY_SCOPE);
                      pages = min(pages, work->nr_pages);
                      pages = round_down(pages + MIN_WRITEBACK_PAGES,
                                         MIN_WRITEBACK_PAGES);
              }
      
              return pages;
      }
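       
       /*
        * Editor's note: a worked example of the WB_SYNC_NONE branch above,
        * assuming MIN_WRITEBACK_PAGES is 1024 pages (4MB with 4KB pages):
        *
        *      avg_write_bandwidth / 2               = 3000 pages
        *      dirty_limit / DIRTY_SCOPE             = 5000 pages
        *      work->nr_pages                        = LONG_MAX
        *
        *      pages = min(3000, 5000)               = 3000
        *      pages = min(3000, LONG_MAX)           = 3000
        *      pages = round_down(3000 + 1024, 1024) = 3072
        *
        * i.e. the chunk is rounded to a multiple of MIN_WRITEBACK_PAGES and
        * never drops below it, so a low bandwidth estimate still yields a
        * reasonably sized (>= 4MB) chunk per inode.
        */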
      
      /*
       * Write a portion of b_io inodes which belong to @sb.
       *
       * Return the number of pages and/or inodes written.
       *
       * NOTE! This is called with wb->list_lock held, and will
       * unlock and relock that for each inode it ends up doing
       * IO for.
       */
      static long writeback_sb_inodes(struct super_block *sb,
                                      struct bdi_writeback *wb,
                                      struct wb_writeback_work *work)
      {
              struct writeback_control wbc = {
                      .sync_mode                = work->sync_mode,
                      .tagged_writepages        = work->tagged_writepages,
                      .for_kupdate                = work->for_kupdate,
                      .for_background                = work->for_background,
                      .for_sync                = work->for_sync,
                      .range_cyclic                = work->range_cyclic,
                      .range_start                = 0,
                      .range_end                = LLONG_MAX,
              };
              unsigned long start_time = jiffies;
              long write_chunk;
              long wrote = 0;  /* count both pages and inodes */
      
              while (!list_empty(&wb->b_io)) {
                      struct inode *inode = wb_inode(wb->b_io.prev);
                      struct bdi_writeback *tmp_wb;
      
                      if (inode->i_sb != sb) {
                              if (work->sb) {
                                      /*
                                       * We only want to write back data for this
                                       * superblock, move all inodes not belonging
                                       * to it back onto the dirty list.
                                       */
                                      redirty_tail(inode, wb);
                                      continue;
                              }
      
                              /*
                               * The inode belongs to a different superblock.
                               * Bounce back to the caller to unpin this and
                               * pin the next superblock.
                               */
                              break;
                      }
      
                      /*
                        * Don't bother with new inodes or inodes being freed: the
                        * first kind does not need periodic writeout yet, and for
                        * the latter writeout is handled by the freer.
                       */
                      spin_lock(&inode->i_lock);
                      if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                              spin_unlock(&inode->i_lock);
                              redirty_tail(inode, wb);
                              continue;
                      }
                      if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
                              /*
                               * If this inode is locked for writeback and we are not
                               * doing writeback-for-data-integrity, move it to
                               * b_more_io so that writeback can proceed with the
                               * other inodes on s_io.
                               *
                               * We'll have another go at writing back this inode
                                * when we have completed a full scan of b_io.
                               */
                              spin_unlock(&inode->i_lock);
                              requeue_io(inode, wb);
                              trace_writeback_sb_inodes_requeue(inode);
                              continue;
                      }
                      spin_unlock(&wb->list_lock);
      
                      /*
                       * We already requeued the inode if it had I_SYNC set and we
                       * are doing WB_SYNC_NONE writeback. So this catches only the
                       * WB_SYNC_ALL case.
                       */
                      if (inode->i_state & I_SYNC) {
                              /* Wait for I_SYNC. This function drops i_lock... */
                              inode_sleep_on_writeback(inode);
                              /* Inode may be gone, start again */
                              spin_lock(&wb->list_lock);
                              continue;
                      }
                      inode->i_state |= I_SYNC;
                      wbc_attach_and_unlock_inode(&wbc, inode);
      
                      write_chunk = writeback_chunk_size(wb, work);
                      wbc.nr_to_write = write_chunk;
                      wbc.pages_skipped = 0;
      
                      /*
                       * We use I_SYNC to pin the inode in memory. While it is set
                       * evict_inode() will wait so the inode cannot be freed.
                       */
                      __writeback_single_inode(inode, &wbc);
      
                      wbc_detach_inode(&wbc);
                      work->nr_pages -= write_chunk - wbc.nr_to_write;
                      wrote += write_chunk - wbc.nr_to_write;
      
                      if (need_resched()) {
                              /*
                               * We're trying to balance between building up a nice
                               * long list of IOs to improve our merge rate, and
                               * getting those IOs out quickly for anyone throttling
                               * in balance_dirty_pages().  cond_resched() doesn't
                               * unplug, so get our IOs out the door before we
                               * give up the CPU.
                               */
                              blk_flush_plug(current);
                              cond_resched();
                      }
      
                      /*
                       * Requeue @inode if still dirty.  Be careful as @inode may
                       * have been switched to another wb in the meantime.
                       */
                      tmp_wb = inode_to_wb_and_lock_list(inode);
                      spin_lock(&inode->i_lock);
                      if (!(inode->i_state & I_DIRTY_ALL))
                              wrote++;
                      requeue_inode(inode, tmp_wb, &wbc);
                      inode_sync_complete(inode);
                      spin_unlock(&inode->i_lock);
      
                      if (unlikely(tmp_wb != wb)) {
                              spin_unlock(&tmp_wb->list_lock);
                              spin_lock(&wb->list_lock);
                      }
      
                      /*
                       * bail out to wb_writeback() often enough to check
                       * background threshold and other termination conditions.
                       */
                      if (wrote) {
                              if (time_is_before_jiffies(start_time + HZ / 10UL))
                                      break;
                              if (work->nr_pages <= 0)
                                      break;
                      }
              }
              return wrote;
      }
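       
       /*
        * Editor's note on the return value: "wrote" above counts pages written
        * (write_chunk - wbc.nr_to_write per inode) plus one for every inode
        * that ended up completely clean.  For example, writing 2048 pages
        * spread over three inodes, two of which become clean, returns 2050.
        */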
      
      static long __writeback_inodes_wb(struct bdi_writeback *wb,
                                        struct wb_writeback_work *work)
      {
              unsigned long start_time = jiffies;
              long wrote = 0;
      
              while (!list_empty(&wb->b_io)) {
                      struct inode *inode = wb_inode(wb->b_io.prev);
                      struct super_block *sb = inode->i_sb;
      
                      if (!trylock_super(sb)) {
                              /*
                               * trylock_super() may fail consistently due to
                               * s_umount being grabbed by someone else. Don't use
                               * requeue_io() to avoid busy retrying the inode/sb.
                               */
                              redirty_tail(inode, wb);
                              continue;
                      }
                      wrote += writeback_sb_inodes(sb, wb, work);
                      up_read(&sb->s_umount);
      
                      /* refer to the same tests at the end of writeback_sb_inodes */
                      if (wrote) {
                              if (time_is_before_jiffies(start_time + HZ / 10UL))
                                      break;
                              if (work->nr_pages <= 0)
                                      break;
                      }
              }
              /* Leave any unwritten inodes on b_io */
              return wrote;
      }
      
      static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                                      enum wb_reason reason)
      {
              struct wb_writeback_work work = {
                      .nr_pages        = nr_pages,
                      .sync_mode        = WB_SYNC_NONE,
                      .range_cyclic        = 1,
                      .reason                = reason,
              };
              struct blk_plug plug;
      
              blk_start_plug(&plug);
              spin_lock(&wb->list_lock);
              if (list_empty(&wb->b_io))
                      queue_io(wb, &work);
              __writeback_inodes_wb(wb, &work);
              spin_unlock(&wb->list_lock);
              blk_finish_plug(&plug);
      
              return nr_pages - work.nr_pages;
      }
      
      /*
       * Explicit flushing or periodic writeback of "old" data.
       *
       * Define "old": the first time one of an inode's pages is dirtied, we mark the
       * dirtying-time in the inode's address_space.  So this periodic writeback code
       * just walks the superblock inode list, writing back any inodes which are
       * older than a specific point in time.
       *
        * Try to run once per dirty_writeback_interval.  But if a writeback event
        * takes longer than one dirty_writeback_interval, then leave a
        * one-second gap.
       *
       * older_than_this takes precedence over nr_to_write.  So we'll only write back
       * all dirty pages if they are all attached to "old" mappings.
       */
      static long wb_writeback(struct bdi_writeback *wb,
                               struct wb_writeback_work *work)
      {
              unsigned long wb_start = jiffies;
              long nr_pages = work->nr_pages;
              unsigned long oldest_jif;
              struct inode *inode;
              long progress;
              struct blk_plug plug;
      
              oldest_jif = jiffies;
              work->older_than_this = &oldest_jif;
      
              blk_start_plug(&plug);
              spin_lock(&wb->list_lock);
              for (;;) {
                      /*
                       * Stop writeback when nr_pages has been consumed
                       */
                      if (work->nr_pages <= 0)
                              break;
      
                      /*
                       * Background writeout and kupdate-style writeback may
                       * run forever. Stop them if there is other work to do
                       * so that e.g. sync can proceed. They'll be restarted
                       * after the other works are all done.
                       */
                      if ((work->for_background || work->for_kupdate) &&
                          !list_empty(&wb->work_list))
                              break;
      
                      /*
                       * For background writeout, stop when we are below the
                       * background dirty threshold
                       */
                      if (work->for_background && !wb_over_bg_thresh(wb))
                              break;
      
                      /*
                       * Kupdate and background works are special and we want to
                       * include all inodes that need writing. Livelock avoidance is
                       * handled by these works yielding to any other work so we are
                       * safe.
                       */
                      if (work->for_kupdate) {
                              oldest_jif = jiffies -
                                      msecs_to_jiffies(dirty_expire_interval * 10);
                      } else if (work->for_background)
                              oldest_jif = jiffies;
      
                      trace_writeback_start(wb, work);
                      if (list_empty(&wb->b_io))
                              queue_io(wb, work);
                      if (work->sb)
                              progress = writeback_sb_inodes(work->sb, wb, work);
                      else
                              progress = __writeback_inodes_wb(wb, work);
                      trace_writeback_written(wb, work);
      
                      wb_update_bandwidth(wb, wb_start);
      
                      /*
                       * Did we write something? Try for more
                       *
                       * Dirty inodes are moved to b_io for writeback in batches.
                       * The completion of the current batch does not necessarily
                       * mean the overall work is done. So we keep looping as long
                        * as we made some progress on cleaning pages or inodes.
                       */
                      if (progress)
                              continue;
                      /*
                       * No more inodes for IO, bail
                       */
                      if (list_empty(&wb->b_more_io))
                              break;
                      /*
                       * Nothing written. Wait for some inode to
                       * become available for writeback. Otherwise
                        * we'll just busyloop.
                       */
                      if (!list_empty(&wb->b_more_io))  {
                              trace_writeback_wait(wb, work);
                              inode = wb_inode(wb->b_more_io.prev);
                              spin_lock(&inode->i_lock);
                              spin_unlock(&wb->list_lock);
                              /* This function drops i_lock... */
                              inode_sleep_on_writeback(inode);
                              spin_lock(&wb->list_lock);
                      }
              }
              spin_unlock(&wb->list_lock);
              blk_finish_plug(&plug);
      
              return nr_pages - work->nr_pages;
      }
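       
       /*
        * Editor's note: a hedged sketch of the kind of work item a
        * data-integrity sync is expected to feed into wb_writeback().  Only
        * fields already used in this file are set; the completion/wakeup
        * plumbing of the real sync path is omitted.
        */
       #if 0	/* illustrative sketch, not part of the original source */
       static long example_sync_all(struct bdi_writeback *wb,
                                    struct super_block *sb)
       {
               struct wb_writeback_work work = {
                       .sb             = sb,
                       .sync_mode      = WB_SYNC_ALL,
                       .nr_pages       = LONG_MAX,
                       .range_cyclic   = 0,
                       .for_sync       = 1,
                       .reason         = WB_REASON_SYNC,
               };
       
               /*
                * WB_SYNC_ALL makes writeback_chunk_size() return LONG_MAX, so
                * each inode is written out in full in a single pass.
                */
               return wb_writeback(wb, &work);
       }
       #endif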
      
      /*
       * Return the next wb_writeback_work struct that hasn't been processed yet.
       */
      static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
      {
              struct wb_writeback_work *work = NULL;
      
              spin_lock_bh(&wb->work_lock);
              if (!list_empty(&wb->work_list)) {
                      work = list_entry(wb->work_list.next,
                                        struct wb_writeback_work, list);
                      list_del_init(&work->list);
              }
              spin_unlock_bh(&wb->work_lock);
              return work;
      }
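       
       /*
        * Editor's note: for context, a hedged sketch of the producer side that
        * pairs with get_next_work_item().  The real helper earlier in this
        * file (wb_queue_work()) additionally handles completions and the
        * WB_registered check; this only shows the list/lock/wakeup pairing.
        */
       #if 0	/* illustrative sketch, not part of the original source */
       static void example_queue_work(struct bdi_writeback *wb,
                                      struct wb_writeback_work *work)
       {
               spin_lock_bh(&wb->work_lock);
               list_add_tail(&work->list, &wb->work_list);
               spin_unlock_bh(&wb->work_lock);
       
               /* kick the per-wb worker so wb_do_writeback() dequeues it */
               mod_delayed_work(bdi_wq, &wb->dwork, 0);
       }
       #endif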
      
      /*
       * Add in the number of potentially dirty inodes, because each inode
       * write can dirty pagecache in the underlying blockdev.
       */
      static unsigned long get_nr_dirty_pages(void)
      {
              return global_page_state(NR_FILE_DIRTY) +
                      global_page_state(NR_UNSTABLE_NFS) +
                      get_nr_dirty_inodes();
      }
      
      static long wb_check_background_flush(struct bdi_writeback *wb)
      {
              if (wb_over_bg_thresh(wb)) {
      
                      struct wb_writeback_work work = {
                              .nr_pages        = LONG_MAX,
                              .sync_mode        = WB_SYNC_NONE,
                              .for_background        = 1,
                              .range_cyclic        = 1,
                              .reason                = WB_REASON_BACKGROUND,
                      };
      
                      return wb_writeback(wb, &work);
              }
      
              return 0;
      }
      
      static long wb_check_old_data_flush(struct bdi_writeback *wb)
      {
              unsigned long expired;
              long nr_pages;
      
              /*
               * When set to zero, disable periodic writeback
               */
              if (!dirty_writeback_interval)
                      return 0;
      
              expired = wb->last_old_flush +
                              msecs_to_jiffies(dirty_writeback_interval * 10);
              if (time_before(jiffies, expired))
                      return 0;
      
              wb->last_old_flush = jiffies;
              nr_pages = get_nr_dirty_pages();
      
              if (nr_pages) {
                      struct wb_writeback_work work = {
                              .nr_pages        = nr_pages,
                              .sync_mode        = WB_SYNC_NONE,
                              .for_kupdate        = 1,
                              .range_cyclic        = 1,
                              .reason                = WB_REASON_PERIODIC,
                      };
      
                      return wb_writeback(wb, &work);
              }
      
              return 0;
      }
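       
       /*
        * Editor's note: dirty_writeback_interval and dirty_expire_interval are
        * sysctls expressed in centiseconds (vm.dirty_writeback_centisecs and
        * vm.dirty_expire_centisecs), hence the "* 10" conversions to msecs
        * here and in wb_writeback().  With the usual defaults of 500 and 3000,
        * periodic writeback runs at most every 5 seconds and only picks up
        * inodes dirtied more than 30 seconds ago (via oldest_jif).
        */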
      
      /*
       * Retrieve work items and do the writeback they describe
       */
      static long wb_do_writeback(struct bdi_writeback *wb)
      {
              struct wb_writeback_work *work;
              long wrote = 0;
      
              set_bit(WB_writeback_running, &wb->state);
              while ((work = get_next_work_item(wb)) != NULL) {
                      trace_writeback_exec(wb, work);
                      wrote += wb_writeback(wb, work);
                      finish_writeback_work(wb, work);
              }
      
              /*
               * Check for periodic writeback, kupdated() style
               */
              wrote += wb_check_old_data_flush(wb);
              wrote += wb_check_background_flush(wb);
              clear_bit(WB_writeback_running, &wb->state);
      
              return wrote;
      }
      
      /*
       * Handle writeback of dirty data for the device backed by this bdi. Also
       * reschedules periodically and does kupdated style flushing.
       */
      void wb_workfn(struct work_struct *work)
      {
              struct bdi_writeback *wb = container_of(to_delayed_work(work),
                                                      struct bdi_writeback, dwork);
              long pages_written;
      
              set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
               current->flags |= PF_SWAPWRITE;
       
               if (likely(!current_is_workqueue_rescuer() ||
                          !test_bit(WB_registered, &wb->state))) {
                       /*
                        * The normal path.  Keep writing back @wb until its
                        * work_list is empty.  Note that this path is also taken
                        * if @wb is shutting down even when we're running off the
                        * rescuer as work_list needs to be drained.
                        */
                       do {
                               pages_written = wb_do_writeback(wb);
                               trace_writeback_pages_written(pages_written);
                       } while (!list_empty(&wb->work_list));
               } else {
                      /*
                       * bdi_wq can't get enough workers and we're running off
                       * the emergency worker.  Don't hog it.  Hopefully, 1024 is
                       * enough for efficient IO.
                       */
                      pages_written = writeback_inodes_wb(wb, 1024,
                                                          WB_REASON_FORKER_THREAD);
                      trace_writeback_pages_written(pages_written);
              }
      
              if (!list_empty(&wb->work_list))
                      wb_wakeup(wb);
              else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                      wb_wakeup_delayed(wb);
      
              current->flags &= ~PF_SWAPWRITE;
      }
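       
       /*
        * Editor's note: wb_workfn() runs off the shared "writeback" workqueue
        * (bdi_wq).  A hedged sketch of how that queue is created elsewhere
        * (mm/backing-dev.c); treat the exact flag set as an assumption rather
        * than a quotation.
        */
       #if 0	/* illustrative sketch, not part of the original source */
       static int __init example_bdi_init(void)
       {
               /*
                * WQ_MEM_RECLAIM guarantees a rescuer thread, which is why
                * wb_workfn() has the current_is_workqueue_rescuer() branch.
                */
               bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM |
                                        WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0);
               return bdi_wq ? 0 : -ENOMEM;
       }
       #endif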
      
      /*
       * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
       * the whole world.
       */
      void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
      {
              struct backing_dev_info *bdi;
      
              if (!nr_pages)
                      nr_pages = get_nr_dirty_pages();
      
              rcu_read_lock();
              list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                      struct bdi_writeback *wb;
      
                      i