/*
       * linux/fs/ext4/readpage.c
       *
       * Copyright (C) 2002, Linus Torvalds.
       * Copyright (C) 2015, Google, Inc.
       *
       * This was originally taken from fs/mpage.c
       *
 * The ext4_mpage_readpages() function here is intended to replace
 * mpage_readpages() in the general case, not just for encrypted
 * files.  It has some limitations (see below), where it will fall
 * back to block_read_full_page(), but these limitations should only
 * be hit when page_size != block_size.
       *
       * This will allow us to attach a callback function to support ext4
       * encryption.
       *
       * If anything unusual happens, such as:
       *
       * - encountering a page which has buffers
       * - encountering a page which has a non-hole after a hole
       * - encountering a page with non-contiguous blocks
       *
       * then this code just gives up and calls the buffer_head-based read function.
       * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file page on blocksize < PAGE_CACHE_SIZE setups.
       *
       */
      
      #include <linux/kernel.h>
      #include <linux/export.h>
      #include <linux/mm.h>
      #include <linux/kdev_t.h>
      #include <linux/gfp.h>
      #include <linux/bio.h>
      #include <linux/fs.h>
      #include <linux/buffer_head.h>
      #include <linux/blkdev.h>
      #include <linux/highmem.h>
      #include <linux/prefetch.h>
      #include <linux/mpage.h>
      #include <linux/writeback.h>
      #include <linux/backing-dev.h>
      #include <linux/pagevec.h>
      #include <linux/cleancache.h>
      
      #include "ext4.h"
      #include <trace/events/android_fs.h>
      
      /*
       * Call ext4_decrypt on every single page, reusing the encryption
       * context.
       */
      static void completion_pages(struct work_struct *work)
      {
      #ifdef CONFIG_EXT4_FS_ENCRYPTION
              struct ext4_crypto_ctx *ctx =
                      container_of(work, struct ext4_crypto_ctx, r.work);
              struct bio        *bio        = ctx->r.bio;
              struct bio_vec        *bv;
              int                i;
      
              bio_for_each_segment_all(bv, bio, i) {
                      struct page *page = bv->bv_page;
      
                      int ret = ext4_decrypt(page);
                      if (ret) {
                              WARN_ON_ONCE(1);
                              SetPageError(page);
                      } else
                              SetPageUptodate(page);
                      unlock_page(page);
              }
              ext4_release_crypto_ctx(ctx);
              bio_put(bio);
      #else
              BUG();
      #endif
      }
      
      static inline bool ext4_bio_encrypted(struct bio *bio)
      {
      #ifdef CONFIG_EXT4_FS_ENCRYPTION
              return unlikely(bio->bi_private != NULL);
      #else
              return false;
      #endif
      }
      
      static void
      ext4_trace_read_completion(struct bio *bio)
      {
              struct page *first_page = bio->bi_io_vec[0].bv_page;
      
              if (first_page != NULL)
                      trace_android_fs_dataread_end(first_page->mapping->host,
                                                    page_offset(first_page),
                                                    bio->bi_iter.bi_size);
      }
      
      /*
       * I/O completion handler for multipage BIOs.
       *
       * The mpage code never puts partial pages into a BIO (except for end-of-file).
       * If a page does not map to a contiguous run of blocks then it simply falls
       * back to block_read_full_page().
       *
       * Why is this?  If a page's completion depends on a number of different BIOs
       * which can complete in any order (or at the same time) then determining the
       * status of that page is hard.  See end_buffer_async_read() for the details.
       * There is no point in duplicating all that complexity.
       */
      static void mpage_end_io(struct bio *bio)
      {
              struct bio_vec *bv;
              int i;
      
              if (trace_android_fs_dataread_start_enabled())
                      ext4_trace_read_completion(bio);
      
              if (ext4_bio_encrypted(bio)) {
                      struct ext4_crypto_ctx *ctx = bio->bi_private;
      
                      if (bio->bi_error) {
                              ext4_release_crypto_ctx(ctx);
                      } else {
                              INIT_WORK(&ctx->r.work, completion_pages);
                              ctx->r.bio = bio;
                              queue_work(ext4_read_workqueue, &ctx->r.work);
                              return;
                      }
              }
              bio_for_each_segment_all(bv, bio, i) {
                      struct page *page = bv->bv_page;
      
                      if (!bio->bi_error) {
                              SetPageUptodate(page);
                      } else {
                              ClearPageUptodate(page);
                              SetPageError(page);
                      }
                      unlock_page(page);
              }
      
              bio_put(bio);
      }
      
      static void
      ext4_submit_bio_read(struct bio *bio)
      {
  291         if (trace_android_fs_dataread_start_enabled()) {
                      struct page *first_page = bio->bi_io_vec[0].bv_page;
      
                      if (first_page != NULL) {
                              char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
      
                              path = android_fstrace_get_pathname(pathbuf,
                                                          MAX_TRACE_PATHBUF_LEN,
                                                          first_page->mapping->host);
                              trace_android_fs_dataread_start(
                                      first_page->mapping->host,
                                      page_offset(first_page),
                                      bio->bi_iter.bi_size,
                                      current->pid,
                                      path,
                                      current->comm);
                      }
              }
  291         submit_bio(READ, bio);
      }
      
      int ext4_mpage_readpages(struct address_space *mapping,
                               struct list_head *pages, struct page *page,
                               unsigned nr_pages)
      {
              struct bio *bio = NULL;
              unsigned page_idx;
              sector_t last_block_in_bio = 0;
      
  430         struct inode *inode = mapping->host;
              const unsigned blkbits = inode->i_blkbits;
              const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
              const unsigned blocksize = 1 << blkbits;
              sector_t block_in_file;
              sector_t last_block;
              sector_t last_block_in_file;
              sector_t blocks[MAX_BUF_PER_PAGE];
              unsigned page_block;
              struct block_device *bdev = inode->i_sb->s_bdev;
              int length;
              unsigned relative_block = 0;
              struct ext4_map_blocks map;
      
              map.m_pblk = 0;
              map.m_lblk = 0;
              map.m_len = 0;
              map.m_flags = 0;
      
  430         for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
                      int fully_mapped = 1;
                      unsigned first_hole = blocks_per_page;
      
  430                 prefetchw(&page->flags);
                      if (pages) {
  426                         page = list_entry(pages->prev, struct page, lru);
  426                         list_del(&page->lru);
                              if (add_to_page_cache_lru(page, mapping, page->index,
                                        mapping_gfp_constraint(mapping, GFP_KERNEL)))
                                      goto next_page;
                      }
      
  430                 if (page_has_buffers(page))
                              goto confused;
      
  429                 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
                      last_block = block_in_file + nr_pages * blocks_per_page;
                      last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
                      if (last_block > last_block_in_file)
                              last_block = last_block_in_file;
                      page_block = 0;
      
                      /*
                       * Map blocks using the previous result first.
                       */
                      if ((map.m_flags & EXT4_MAP_MAPPED) &&
  251                     block_in_file > map.m_lblk &&
  251                     block_in_file < (map.m_lblk + map.m_len)) {
  248                         unsigned map_offset = block_in_file - map.m_lblk;
                              unsigned last = map.m_len - map_offset;
      
                              for (relative_block = 0; ; relative_block++) {
                                      if (relative_block == last) {
                                              /* needed? */
  248                                         map.m_flags &= ~EXT4_MAP_MAPPED;
                                              break;
                                      }
  248                                 if (page_block == blocks_per_page)
                                              break;
  248                                 blocks[page_block] = map.m_pblk + map_offset +
                                              relative_block;
                                      page_block++;
                                      block_in_file++;
                              }
                      }
      
                      /*
                       * Then do more ext4_map_blocks() calls until we are
                       * done with this page.
                       */
  429                 while (page_block < blocks_per_page) {
  429                         if (block_in_file < last_block) {
  426                                 map.m_lblk = block_in_file;
                                      map.m_len = last_block - block_in_file;
      
                                      if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
                                      set_error_page:
                                              SetPageError(page);
                                              zero_user_segment(page, 0,
                                                                PAGE_CACHE_SIZE);
                                              unlock_page(page);
                                              goto next_page;
                                      }
                              }
  429                         if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
                                      fully_mapped = 0;
  393                                 if (first_hole == blocks_per_page)
                                              first_hole = page_block;
  393                                 page_block++;
                                      block_in_file++;
                                      continue;
                              }
  291                         if (first_hole != blocks_per_page)
                                      goto confused;                /* hole -> non-hole */
      
                              /* Contiguous blocks? */
  291                         if (page_block && blocks[page_block-1] != map.m_pblk-1)
                                      goto confused;
                              for (relative_block = 0; ; relative_block++) {
  291                                 if (relative_block == map.m_len) {
                                              /* needed? */
  279                                         map.m_flags &= ~EXT4_MAP_MAPPED;
                                              break;
  251                                 } else if (page_block == blocks_per_page)
                                              break;
  291                                 blocks[page_block] = map.m_pblk+relative_block;
                                      page_block++;
                                      block_in_file++;
                              }
                      }
  429                 if (first_hole != blocks_per_page) {
  393                         zero_user_segment(page, first_hole << blkbits,
                                                PAGE_CACHE_SIZE);
  393                         if (first_hole == 0) {
  393                                 SetPageUptodate(page);
                                      unlock_page(page);
                                      goto next_page;
                              }
  291                 } else if (fully_mapped) {
  291                         SetPageMappedToDisk(page);
                      }
  291                 if (fully_mapped && blocks_per_page == 1 &&
  291                     !PageUptodate(page) && cleancache_get_page(page) == 0) {
                              SetPageUptodate(page);
                              goto confused;
                      }
      
                      /*
                       * This page will go to BIO.  Do we need to send this
                       * BIO off first?
                       */
  291                 if (bio && (last_block_in_bio != blocks[0] - 1)) {
                      submit_and_realloc:
  197                         ext4_submit_bio_read(bio);
                              bio = NULL;
                      }
                      if (bio == NULL) {
                              struct ext4_crypto_ctx *ctx = NULL;
      
                              if (ext4_encrypted_inode(inode) &&
                                  S_ISREG(inode->i_mode)) {
                                      ctx = ext4_get_crypto_ctx(inode, GFP_NOFS);
                                      if (IS_ERR(ctx))
                                              goto set_error_page;
                              }
                              bio = bio_alloc(GFP_KERNEL,
  291                                 min_t(int, nr_pages, BIO_MAX_PAGES));
                              if (!bio) {
                                      if (ctx)
                                              ext4_release_crypto_ctx(ctx);
                                      goto set_error_page;
                              }
  291                         bio->bi_bdev = bdev;
                              bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
                              bio->bi_end_io = mpage_end_io;
                              bio->bi_private = ctx;
                      }
      
  291                 length = first_hole << blkbits;
                      if (bio_add_page(bio, page, length, 0) < length)
                              goto submit_and_realloc;
      
  291                 if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
  291                      (relative_block == map.m_len)) ||
                          (first_hole != blocks_per_page)) {
                              ext4_submit_bio_read(bio);
                              bio = NULL;
                      } else
  291                         last_block_in_bio = blocks[blocks_per_page - 1];
                      goto next_page;
              confused:
    6                 if (bio) {
                              ext4_submit_bio_read(bio);
                              bio = NULL;
                      }
    6                 if (!PageUptodate(page))
    6                         block_read_full_page(page, ext4_get_block);
                      else
                              unlock_page(page);
              next_page:
  430                 if (pages)
  426                         page_cache_release(page);
              }
  430         BUG_ON(pages && !list_empty(pages));
  430         if (bio)
  291                 ext4_submit_bio_read(bio);
  430         return 0;
      }
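
/*
 * A minimal sketch of how ext4_mpage_readpages() is typically wired into
 * the read path described in the header comment: the ->readpage() hook
 * passes a single locked page, while the ->readpages() readahead hook
 * passes a page list (ext4_mpage_readpages() adds those pages to the page
 * cache itself).  The *_sketch names are illustrative placeholders, not
 * real ext4 symbols.
 */
#if 0	/* illustration only */
static int ext4_readpage_sketch(struct file *file, struct page *page)
{
	/* Single-page read: no page list, nr_pages == 1. */
	return ext4_mpage_readpages(page->mapping, NULL, page, 1);
}

static int ext4_readpages_sketch(struct file *file,
				 struct address_space *mapping,
				 struct list_head *pages, unsigned nr_pages)
{
	/* Readahead: pages arrive on a list, no locked page yet. */
	return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
#endif
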
      #include <linux/mm.h>
      #include <linux/highmem.h>
      #include <linux/sched.h>
      #include <linux/hugetlb.h>
      
      static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
      {
              pte_t *pte;
              int err = 0;
      
    2         pte = pte_offset_map(pmd, addr);
              for (;;) {
    2                 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                      if (err)
                             break;
                      addr += PAGE_SIZE;
    2                 if (addr == end)
                              break;
    1                 pte++;
              }
      
              pte_unmap(pte);
              return err;
      }
      
      static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
      {
              pmd_t *pmd;
              unsigned long next;
              int err = 0;
      
  105         pmd = pmd_offset(pud, addr);
              do {
      again:
  105                 next = pmd_addr_end(addr, end);
  105                 if (pmd_none(*pmd) || !walk->vma) {
   94                         if (walk->pte_hole)
   55                                 err = walk->pte_hole(addr, next, walk);
                              if (err)
                                      break;
                              continue;
                      }
                      /*
                       * This implies that each ->pmd_entry() handler
                       * needs to know about pmd_trans_huge() pmds
                       */
   93                 if (walk->pmd_entry)
   91                         err = walk->pmd_entry(pmd, addr, next, walk);
                      if (err)
                              break;
      
                      /*
                       * Check this here so we only break down trans_huge
                       * pages when we _need_ to
                       */
   90                 if (!walk->pte_entry)
                              continue;
      
                      split_huge_page_pmd_mm(walk->mm, addr, pmd);
                      if (pmd_trans_unstable(pmd))
                              goto again;
    2                 err = walk_pte_range(pmd, addr, next, walk);
                      if (err)
                              break;
   95         } while (pmd++, addr = next, addr != end);
      
              return err;
      }
      
      static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
      {
              pud_t *pud;
              unsigned long next;
              int err = 0;
      
  106         pud = pud_offset(pgd, addr);
              do {
  106                 next = pud_addr_end(addr, end);
  106                 if (pud_none_or_clear_bad(pud)) {
    1                         if (walk->pte_hole)
    1                                 err = walk->pte_hole(addr, next, walk);
                              if (err)
                                      break;
                              continue;
                      }
  105                 if (walk->pmd_entry || walk->pte_entry)
  105                         err = walk_pmd_range(pud, addr, next, walk);
                      if (err)
                              break;
   96         } while (pud++, addr = next, addr != end);
      
              return err;
      }
      
      static int walk_pgd_range(unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
      {
              pgd_t *pgd;
              unsigned long next;
              int err = 0;
      
  106         pgd = pgd_offset(walk->mm, addr);
              do {
  106                 next = pgd_addr_end(addr, end);
  106                 if (pgd_none_or_clear_bad(pgd)) {
                              if (walk->pte_hole)
                                      err = walk->pte_hole(addr, next, walk);
                              if (err)
                                      break;
                              continue;
                      }
  106                 if (walk->pmd_entry || walk->pte_entry)
  106                         err = walk_pud_range(pgd, addr, next, walk);
                      if (err)
                              break;
   96         } while (pgd++, addr = next, addr != end);
      
  106         return err;
      }
      
      #ifdef CONFIG_HUGETLB_PAGE
      static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                             unsigned long end)
      {
              unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
              return boundary < end ? boundary : end;
      }
      
      static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                                    struct mm_walk *walk)
      {
              struct vm_area_struct *vma = walk->vma;
              struct hstate *h = hstate_vma(vma);
              unsigned long next;
              unsigned long hmask = huge_page_mask(h);
              pte_t *pte;
              int err = 0;
      
              do {
                      next = hugetlb_entry_end(h, addr, end);
                      pte = huge_pte_offset(walk->mm, addr & hmask);
      
                      if (pte)
                              err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
                      else if (walk->pte_hole)
                              err = walk->pte_hole(addr, next, walk);
      
                      if (err)
                              break;
              } while (addr = next, addr != end);
      
              return err;
      }
      
      #else /* CONFIG_HUGETLB_PAGE */
      static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                                    struct mm_walk *walk)
      {
              return 0;
      }
      
      #endif /* CONFIG_HUGETLB_PAGE */
      
      /*
       * Decide whether we really walk over the current vma on [@start, @end)
       * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case the current walk is aborted.
       */
    5 static int walk_page_test(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
      {
   82         struct vm_area_struct *vma = walk->vma;
      
   97         if (walk->test_walk)
   97                 return walk->test_walk(start, end, walk);
      
              /*
 * A vma with VM_PFNMAP doesn't have any valid struct pages behind its
 * range, so we don't walk over it as we do for normal vmas. However,
 * some callers are interested in handling hole ranges and don't want
 * any single address range to be silently ignored. Such users are
 * expected to define their ->pte_hole() callbacks, so let's delegate
 * handling of VM_PFNMAP vmas to them.
               */
              if (vma->vm_flags & VM_PFNMAP) {
                      int err = 1;
    5                 if (walk->pte_hole)
    2                         err = walk->pte_hole(start, end, walk);
                      return err ? err : 1;
              }
              return 0;
      }
      
      static int __walk_page_range(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
      {
              int err = 0;
              struct vm_area_struct *vma = walk->vma;
      
              if (vma && is_vm_hugetlb_page(vma)) {
                      if (walk->hugetlb_entry)
                              err = walk_hugetlb_range(start, end, walk);
              } else
   79                 err = walk_pgd_range(start, end, walk);
      
   27         return err;
      }
      
      /**
       * walk_page_range - walk page table with caller specific callbacks
       *
       * Recursively walk the page table tree of the process represented by @walk->mm
       * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly interpreted as follows:
 *  - 0  : the current entry was handled successfully; if the end address
 *         has not been reached yet, continue the walk.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry; return to the caller with an
 *         error code.
       *
 * Before starting to walk the page table, some callers want to check whether
       * they really want to walk over the current vma, typically by checking
       * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
       * purpose.
       *
       * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
       * caller-specific data to callbacks, @walk->private should be helpful.
       *
       * Locking:
       *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access the vma's data.
       */
      int walk_page_range(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
      {
              int err = 0;
              unsigned long next;
              struct vm_area_struct *vma;
      
   79         if (start >= end)
   79                 return -EINVAL;
      
   79         if (!walk->mm)
                      return -EINVAL;
      
   79         VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
      
   79         vma = find_vma(walk->mm, start);
              do {
   79                 if (!vma) { /* after the last vma */
   12                         walk->vma = NULL;
                              next = end;
   79                 } else if (start < vma->vm_start) { /* outside vma */
   61                         walk->vma = NULL;
                              next = min(end, vma->vm_start);
                      } else { /* inside vma */
   70                         walk->vma = vma;
                              next = min(end, vma->vm_end);
                              vma = vma->vm_next;
      
                              err = walk_page_test(start, next, walk);
                              if (err > 0) {
                                      /*
                                       * positive return values are purely for
                                       * controlling the pagewalk, so should never
                                       * be passed to the callers.
                                       */
                                      err = 0;
                                      continue;
                              }
   70                         if (err < 0)
                                      break;
                      }
   79                 if (walk->vma || walk->pte_hole)
   79                         err = __walk_page_range(start, next, walk);
                      if (err)
                              break;
   69         } while (start = next, start < end);
              return err;
      }
      
      int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
      {
              int err;
      
   27         if (!walk->mm)
                      return -EINVAL;
      
   27         VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
   27         VM_BUG_ON(!vma);
   27         walk->vma = vma;
              err = walk_page_test(vma->vm_start, vma->vm_end, walk);
              if (err > 0)
                      return 0;
   27         if (err < 0)
                      return err;
   27         return __walk_page_range(vma->vm_start, vma->vm_end, walk);
      }
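
/*
 * A minimal sketch of a pagewalk user, following the callback contract
 * documented above walk_page_range(): count the present PTEs in a range.
 * struct pte_counter and the *_sketch names are illustrative placeholders,
 * not part of this file.
 */
#if 0	/* illustration only */
struct pte_counter {
	unsigned long present;
};

static int count_pte_sketch(pte_t *pte, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
{
	struct pte_counter *pc = walk->private;

	if (pte_present(*pte))
		pc->present++;
	return 0;	/* 0: entry handled, keep walking */
}

static unsigned long count_present_ptes_sketch(struct mm_struct *mm,
					       unsigned long start,
					       unsigned long end)
{
	struct pte_counter pc = { 0 };
	struct mm_walk walk = {
		.pte_entry	= count_pte_sketch,
		.mm		= mm,
		.private	= &pc,
	};

	/* Per the locking note above, mmap_sem must be held across the walk. */
	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return pc.present;
}
#endif
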
      #undef TRACE_SYSTEM
      #define TRACE_SYSTEM filemap
      
      #if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
      #define _TRACE_FILEMAP_H
      
      #include <linux/types.h>
      #include <linux/tracepoint.h>
      #include <linux/mm.h>
      #include <linux/memcontrol.h>
      #include <linux/device.h>
      #include <linux/kdev_t.h>
      
      DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
      
              TP_PROTO(struct page *page),
      
              TP_ARGS(page),
      
              TP_STRUCT__entry(
                      __field(unsigned long, pfn)
                      __field(unsigned long, i_ino)
                      __field(unsigned long, index)
                      __field(dev_t, s_dev)
              ),
      
              TP_fast_assign(
                      __entry->pfn = page_to_pfn(page);
                      __entry->i_ino = page->mapping->host->i_ino;
                      __entry->index = page->index;
                      if (page->mapping->host->i_sb)
                              __entry->s_dev = page->mapping->host->i_sb->s_dev;
                      else
                              __entry->s_dev = page->mapping->host->i_rdev;
              ),
      
              TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu",
                      MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                      __entry->i_ino,
                      pfn_to_page(__entry->pfn),
                      __entry->pfn,
                      __entry->index << PAGE_SHIFT)
      );
      
  662 DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
              TP_PROTO(struct page *page),
              TP_ARGS(page)
              );
      
  814 DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
              TP_PROTO(struct page *page),
              TP_ARGS(page)
              );
      
      #endif /* _TRACE_FILEMAP_H */
      
      /* This part must be outside protection */
      #include <trace/define_trace.h>
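
/*
 * For reference, each DEFINE_EVENT() above generates a trace_<event>()
 * static inline taking the TP_PROTO arguments; a page-cache path would
 * fire it roughly as sketched below (the surrounding function is an
 * illustrative placeholder, not code copied from mm/filemap.c).
 */
#if 0	/* illustration only */
static void filemap_add_sketch(struct page *page)
{
	/* ... after inserting @page into the page cache ... */
	trace_mm_filemap_add_to_page_cache(page);
}
#endif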
      /*
 * inet_diag.c	Module for monitoring INET transport protocol sockets.
       *
       * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
       *
       *        This program is free software; you can redistribute it and/or
       *      modify it under the terms of the GNU General Public License
       *      as published by the Free Software Foundation; either version
       *      2 of the License, or (at your option) any later version.
       */
      
      #include <linux/kernel.h>
      #include <linux/module.h>
      #include <linux/types.h>
      #include <linux/fcntl.h>
      #include <linux/random.h>
      #include <linux/slab.h>
      #include <linux/cache.h>
      #include <linux/init.h>
      #include <linux/time.h>
      
      #include <net/icmp.h>
      #include <net/tcp.h>
      #include <net/ipv6.h>
      #include <net/inet_common.h>
      #include <net/inet_connection_sock.h>
      #include <net/inet_hashtables.h>
      #include <net/inet_timewait_sock.h>
      #include <net/inet6_hashtables.h>
      #include <net/netlink.h>
      
      #include <linux/inet.h>
      #include <linux/stddef.h>
      
      #include <linux/inet_diag.h>
      #include <linux/sock_diag.h>
      
      static const struct inet_diag_handler **inet_diag_table;
      
      struct inet_diag_entry {
              const __be32 *saddr;
              const __be32 *daddr;
              u16 sport;
              u16 dport;
              u16 family;
              u16 userlocks;
              u32 ifindex;
              u32 mark;
      };
      
      static DEFINE_MUTEX(inet_diag_table_mutex);
      
      static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
      {
   83         if (!inet_diag_table[proto])
    2                 request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
                                     NETLINK_SOCK_DIAG, AF_INET, proto);
      
   83         mutex_lock(&inet_diag_table_mutex);
              if (!inet_diag_table[proto])
   83                 return ERR_PTR(-ENOENT);
      
              return inet_diag_table[proto];
      }
      
      static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
      {
   46         mutex_unlock(&inet_diag_table_mutex);
      }
      
    4 static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
      {
   24         r->idiag_family = sk->sk_family;
      
              r->id.idiag_sport = htons(sk->sk_num);
              r->id.idiag_dport = sk->sk_dport;
              r->id.idiag_if = sk->sk_bound_dev_if;
              sock_diag_save_cookie(sk, r->id.idiag_cookie);
      
      #if IS_ENABLED(CONFIG_IPV6)
              if (sk->sk_family == AF_INET6) {
   22                 *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
                      *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
              } else
      #endif
	{
		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));

		r->id.idiag_src[0] = sk->sk_rcv_saddr;
		r->id.idiag_dst[0] = sk->sk_daddr;
	}
   24 }
      
      static size_t inet_sk_attr_size(void)
      {
              return          nla_total_size(sizeof(struct tcp_info))
                      + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
                      + nla_total_size(1) /* INET_DIAG_TOS */
                      + nla_total_size(1) /* INET_DIAG_TCLASS */
                      + nla_total_size(4) /* INET_DIAG_MARK */
                      + nla_total_size(sizeof(struct inet_diag_meminfo))
                      + nla_total_size(sizeof(struct inet_diag_msg))
                      + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
                      + nla_total_size(TCP_CA_NAME_MAX)
                      + nla_total_size(sizeof(struct tcpvegas_info))
                      + 64;
      }
      
      int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
                            struct sk_buff *skb, const struct inet_diag_req_v2 *req,
                            struct user_namespace *user_ns,
                            u32 portid, u32 seq, u16 nlmsg_flags,
                            const struct nlmsghdr *unlh,
                            bool net_admin)
      {
              const struct inet_sock *inet = inet_sk(sk);
              const struct tcp_congestion_ops *ca_ops;
              const struct inet_diag_handler *handler;
   24         int ext = req->idiag_ext;
              struct inet_diag_msg *r;
              struct nlmsghdr  *nlh;
              struct nlattr *attr;
              void *info = NULL;
      
   24         handler = inet_diag_table[req->sdiag_protocol];
              BUG_ON(!handler);
      
   24         nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
                              nlmsg_flags);
              if (!nlh)
                      return -EMSGSIZE;
      
   24         r = nlmsg_data(nlh);
              BUG_ON(!sk_fullsock(sk));
      
   24         inet_diag_msg_common_fill(r, sk);
              r->idiag_state = sk->sk_state;
              r->idiag_timer = 0;
              r->idiag_retrans = 0;
      
              if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
                      goto errout;
      
              /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
               * hence this needs to be included regardless of socket family.
               */
   24         if (ext & (1 << (INET_DIAG_TOS - 1)))
   20                 if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
                              goto errout;
      
      #if IS_ENABLED(CONFIG_IPV6)
   24         if (r->idiag_family == AF_INET6) {
   22                 if (ext & (1 << (INET_DIAG_TCLASS - 1)))
                              if (nla_put_u8(skb, INET_DIAG_TCLASS,
   18                                        inet6_sk(sk)->tclass) < 0)
                                      goto errout;
      
   22                 if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
   17                     nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
                              goto errout;
              }
      #endif
      
   24         if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
                      goto errout;
      
   24         r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
              r->idiag_inode = sock_i_ino(sk);
      
              if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
                      struct inet_diag_meminfo minfo = {
   19                         .idiag_rmem = sk_rmem_alloc_get(sk),
                              .idiag_wmem = sk->sk_wmem_queued,
                              .idiag_fmem = sk->sk_forward_alloc,
                              .idiag_tmem = sk_wmem_alloc_get(sk),
                      };
      
   19                 if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
                              goto errout;
              }
      
   24         if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
   19                 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
                              goto errout;
      
   24         if (!icsk) {
    4                 handler->idiag_get_info(sk, r, NULL);
                      goto out;
              }
      
      #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
      
   20         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
                  icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
                  icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
    2                 r->idiag_timer = 1;
                      r->idiag_retrans = icsk->icsk_retransmits;
                      r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
   20         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
    2                 r->idiag_timer = 4;
                      r->idiag_retrans = icsk->icsk_probes_out;
                      r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
   19         } else if (timer_pending(&sk->sk_timer)) {
    2                 r->idiag_timer = 2;
                      r->idiag_retrans = icsk->icsk_probes_out;
                      r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
              } else {
   17                 r->idiag_timer = 0;
                      r->idiag_expires = 0;
              }
      #undef EXPIRES_IN_MS
      
   20         if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
   16                 attr = nla_reserve(skb, INET_DIAG_INFO,
                                         handler->idiag_info_size);
                      if (!attr)
                              goto errout;
      
   16                 info = nla_data(attr);
              }
      
   16         if (ext & (1 << (INET_DIAG_CONG - 1))) {
                      int err = 0;
      
   15                 rcu_read_lock();
   15                 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
                      if (ca_ops)
                              err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
   15                 rcu_read_unlock();
                      if (err < 0)
                              goto errout;
              }
      
   20         handler->idiag_get_info(sk, r, info);
      
              if (sk->sk_state < TCP_TIME_WAIT) {
                      union tcp_cc_info info;
                      size_t sz = 0;
                      int attr;
      
    5                 rcu_read_lock();
    5                 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
                      if (ca_ops && ca_ops->get_info)
                              sz = ca_ops->get_info(sk, ext, &attr, &info);
    5                 rcu_read_unlock();
    5                 if (sz && nla_put(skb, attr, sz, &info) < 0)
                              goto errout;
              }
      
      out:
   24         nlmsg_end(skb, nlh);
    4         return 0;
      
      errout:
    3         nlmsg_cancel(skb, nlh);
   24         return -EMSGSIZE;
      }
      EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
      
      static int inet_csk_diag_fill(struct sock *sk,
                                    struct sk_buff *skb,
                                    const struct inet_diag_req_v2 *req,
                                    struct user_namespace *user_ns,
                                    u32 portid, u32 seq, u16 nlmsg_flags,
                                    const struct nlmsghdr *unlh,
                                    bool net_admin)
      {
   10         return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
                                       portid, seq, nlmsg_flags, unlh, net_admin);
      }
      
      static int inet_twsk_diag_fill(struct sock *sk,
                                     struct sk_buff *skb,
                                     u32 portid, u32 seq, u16 nlmsg_flags,
                                     const struct nlmsghdr *unlh)
      {
              struct inet_timewait_sock *tw = inet_twsk(sk);
              struct inet_diag_msg *r;
              struct nlmsghdr *nlh;
              long tmo;
      
    1         nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
                              nlmsg_flags);
              if (!nlh)
                      return -EMSGSIZE;
      
    1         r = nlmsg_data(nlh);
              BUG_ON(tw->tw_state != TCP_TIME_WAIT);
      
    1         tmo = tw->tw_timer.expires - jiffies;
              if (tmo < 0)
                      tmo = 0;
      
              inet_diag_msg_common_fill(r, sk);
              r->idiag_retrans      = 0;
      
              r->idiag_state              = tw->tw_substate;
              r->idiag_timer              = 3;
              r->idiag_expires      = jiffies_to_msecs(tmo);
              r->idiag_rqueue              = 0;
              r->idiag_wqueue              = 0;
              r->idiag_uid              = 0;
              r->idiag_inode              = 0;
      
              nlmsg_end(skb, nlh);
              return 0;
      }
      
      static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
                                    u32 portid, u32 seq, u16 nlmsg_flags,
                                    const struct nlmsghdr *unlh, bool net_admin)
      {
              struct request_sock *reqsk = inet_reqsk(sk);
              struct inet_diag_msg *r;
              struct nlmsghdr *nlh;
              long tmo;
      
    2         nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
                              nlmsg_flags);
              if (!nlh)
                      return -EMSGSIZE;
      
    2         r = nlmsg_data(nlh);
              inet_diag_msg_common_fill(r, sk);
              r->idiag_state = TCP_SYN_RECV;
              r->idiag_timer = 1;
              r->idiag_retrans = reqsk->num_retrans;
      
              BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
                           offsetof(struct sock, sk_cookie));
      
              tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
    2         r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
              r->idiag_rqueue        = 0;
              r->idiag_wqueue        = 0;
              r->idiag_uid        = 0;
              r->idiag_inode        = 0;
      
    1         if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
                                           inet_rsk(reqsk)->ir_mark))
                      return -EMSGSIZE;
      
    2         nlmsg_end(skb, nlh);
              return 0;
      }
      
   10 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
                              const struct inet_diag_req_v2 *r,
                              struct user_namespace *user_ns,
                              u32 portid, u32 seq, u16 nlmsg_flags,
                              const struct nlmsghdr *unlh, bool net_admin)
      {
   10         if (sk->sk_state == TCP_TIME_WAIT)
    1                 return inet_twsk_diag_fill(sk, skb, portid, seq,
                                                 nlmsg_flags, unlh);
      
   10         if (sk->sk_state == TCP_NEW_SYN_RECV)
    2                 return inet_req_diag_fill(sk, skb, portid, seq,
                                                nlmsg_flags, unlh, net_admin);
      
   10         return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
                                        nlmsg_flags, unlh, net_admin);
      }
      
      struct sock *inet_diag_find_one_icsk(struct net *net,
                                           struct inet_hashinfo *hashinfo,
                                           const struct inet_diag_req_v2 *req)
      {
              struct sock *sk;
      
   23         if (req->sdiag_family == AF_INET)
   11                 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
                                       req->id.idiag_dport, req->id.idiag_src[0],
   11                                  req->id.idiag_sport, req->id.idiag_if);
      #if IS_ENABLED(CONFIG_IPV6)
   12         else if (req->sdiag_family == AF_INET6) {
   11                 if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
    1                     ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
                              sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
                                               req->id.idiag_dport, req->id.idiag_src[3],
                                               req->id.idiag_sport, req->id.idiag_if);
                      else
                              sk = inet6_lookup(net, hashinfo,
                                                (struct in6_addr *)req->id.idiag_dst,
                                                req->id.idiag_dport,
   11                                           (struct in6_addr *)req->id.idiag_src,
                                                req->id.idiag_sport,
                                                req->id.idiag_if);
              }
      #endif
              else
                      return ERR_PTR(-EINVAL);
      
   22         if (!sk)
   23                 return ERR_PTR(-ENOENT);
      
   12         if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
    5                 sock_gen_put(sk);
                      return ERR_PTR(-ENOENT);
              }
      
              return sk;
      }
      EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
      
      int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
                                  struct sk_buff *in_skb,
                                  const struct nlmsghdr *nlh,
                                  const struct inet_diag_req_v2 *req)
      {
   17         struct net *net = sock_net(in_skb->sk);
              struct sk_buff *rep;
              struct sock *sk;
              int err;
      
              sk = inet_diag_find_one_icsk(net, hashinfo, req);
              if (IS_ERR(sk))
   14                 return PTR_ERR(sk);
      
    3         rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
              if (!rep) {
                      err = -ENOMEM;
                      goto out;
              }
      
              err = sk_diag_fill(sk, rep, req,
                                 sk_user_ns(NETLINK_CB(in_skb).sk),
                                 NETLINK_CB(in_skb).portid,
                                 nlh->nlmsg_seq, 0, nlh,
    3                            netlink_net_capable(in_skb, CAP_NET_ADMIN));
              if (err < 0) {
                      WARN_ON(err == -EMSGSIZE);
                      nlmsg_free(rep);
                      goto out;
              }
    3         err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
                                    MSG_DONTWAIT);
              if (err > 0)
                      err = 0;
      
      out:
    3         if (sk)
   17                 sock_gen_put(sk);
      
              return err;
      }
      EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
      
      static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
                                     const struct nlmsghdr *nlh,
                                     const struct inet_diag_req_v2 *req)
      {
              const struct inet_diag_handler *handler;
              int err;
      
   46         handler = inet_diag_lock_handler(req->sdiag_protocol);
              if (IS_ERR(handler))
    1                 err = PTR_ERR(handler);
   45         else if (cmd == SOCK_DIAG_BY_FAMILY)
   19                 err = handler->dump_one(in_skb, nlh, req);
   26         else if (cmd == SOCK_DESTROY_BACKPORT && handler->destroy)
   26                 err = handler->destroy(in_skb, req);
              else
                      err = -EOPNOTSUPP;
   46         inet_diag_unlock_handler(handler);
      
              return err;
      }
      
      static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
      {
              int words = bits >> 5;
      
              bits &= 0x1f;
      
              if (words) {
                      if (memcmp(a1, a2, words << 2))
                              return 0;
              }
              if (bits) {
                      __be32 w1, w2;
                      __be32 mask;
      
                      w1 = a1[words];
                      w2 = a2[words];
      
                      mask = htonl((0xffffffff) << (32 - bits));
      
                      if ((w1 ^ w2) & mask)
                              return 0;
              }
      
              return 1;
      }
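
/*
 * Worked example: with bits = 9 we get words = 0 and
 * mask = htonl(0xffffffff << 23) = htonl(0xff800000), so only the top
 * nine bits of the (network byte order) word are compared - i.e. a /9
 * prefix match.
 */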
      
      static int inet_diag_bc_run(const struct nlattr *_bc,
                                  const struct inet_diag_entry *entry)
      {
   19         const void *bc = nla_data(_bc);
              int len = nla_len(_bc);
      
   19         while (len > 0) {
                      int yes = 1;
                      const struct inet_diag_bc_op *op = bc;
      
   19                 switch (op->code) {
                      case INET_DIAG_BC_NOP:
                              break;
                      case INET_DIAG_BC_JMP:
                              yes = 0;
                              break;
                      case INET_DIAG_BC_S_GE:
    3                         yes = entry->sport >= op[1].no;
                              break;
                      case INET_DIAG_BC_S_LE:
    1                         yes = entry->sport <= op[1].no;
                              break;
                      case INET_DIAG_BC_D_GE:
    1                         yes = entry->dport >= op[1].no;
                              break;
                      case INET_DIAG_BC_D_LE:
    2                         yes = entry->dport <= op[1].no;
                              break;
                      case INET_DIAG_BC_AUTO:
    1                         yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
                              break;
                      case INET_DIAG_BC_S_COND:
                      case INET_DIAG_BC_D_COND: {
                              const struct inet_diag_hostcond *cond;
                              const __be32 *addr;
      
                              cond = (const struct inet_diag_hostcond *)(op + 1);
    4                         if (cond->port != -1 &&
                                  cond->port != (op->code == INET_DIAG_BC_S_COND ?
    3                                              entry->sport : entry->dport)) {
                                      yes = 0;
                                      break;
                              }
      
    1                         if (op->code == INET_DIAG_BC_S_COND)
                                      addr = entry->saddr;
                              else
                                      addr = entry->daddr;
      
    2                         if (cond->family != AF_UNSPEC &&
                                  cond->family != entry->family) {
                                      if (entry->family == AF_INET6 &&
                                          cond->family == AF_INET) {
                                              if (addr[0] == 0 && addr[1] == 0 &&
                                                  addr[2] == htonl(0xffff) &&
                                                  bitstring_match(addr + 3,
                                                                  cond->addr,
                                                                  cond->prefix_len))
                                                      break;
                                      }
                                      yes = 0;
                                      break;
                              }
      
    2                         if (cond->prefix_len == 0)
                                      break;
                              if (bitstring_match(addr, cond->addr,
                                                  cond->prefix_len))
                                      break;
                              yes = 0;
                              break;
                      }
                      case INET_DIAG_BC_DEV_COND: {
                              u32 ifindex;
      
    2                         ifindex = *((const u32 *)(op + 1));
                              if (ifindex != entry->ifindex)
                                      yes = 0;
                              break;
                      }
                      case INET_DIAG_BC_MARK_COND: {
                              struct inet_diag_markcond *cond;
      
                              cond = (struct inet_diag_markcond *)(op + 1);
    2                         if ((entry->mark & cond->mask) != cond->mark)
                                      yes = 0;
                              break;
                      }
                      }
      
    8                 if (yes) {
   14                         len -= op->yes;
                              bc += op->yes;
                      } else {
    6                         len -= op->no;
                              bc += op->no;
                      }
              }
   27         return len == 0;
      }
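
/*
 * A sketch of how a filter consumed by inet_diag_bc_run() can be laid out,
 * here "sport >= 1024 && sport <= 4096".  Each port comparison is an
 * inet_diag_bc_op followed by a second op whose ->no field carries the
 * operand; ->yes is the byte offset to jump on match and ->no the offset
 * on mismatch (jumping 4 bytes past the end rejects, since the program is
 * accepted only when len reaches exactly 0).  The array below is purely
 * illustrative.
 */
#if 0	/* illustration only */
static const struct inet_diag_bc_op sport_range_sketch[] = {
	/* offset 0: sport >= 1024? yes -> offset 8, no -> offset 20 (reject) */
	{ .code = INET_DIAG_BC_S_GE, .yes = 8, .no = 20 },
	/* offset 4: operand for the comparison above */
	{ .no = 1024 },
	/* offset 8: sport <= 4096? yes -> offset 16 (accept), no -> reject */
	{ .code = INET_DIAG_BC_S_LE, .yes = 8, .no = 12 },
	/* offset 12: operand for the comparison above */
	{ .no = 4096 },
};
#endif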
      
/* This helper is available for all sockets (ESTABLISHED, TIMEWAIT, SYN_RECV). */
      static void entry_fill_addrs(struct inet_diag_entry *entry,
                                   const struct sock *sk)
      {
      #if IS_ENABLED(CONFIG_IPV6)
              if (sk->sk_family == AF_INET6) {
   17                 entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
                      entry->daddr = sk->sk_v6_daddr.s6_addr32;
              } else
      #endif
              {
    2                 entry->saddr = &sk->sk_rcv_saddr;
                      entry->daddr = &sk->sk_daddr;
              }
      }
      
      int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
      {
              struct inet_sock *inet = inet_sk(sk);
              struct inet_diag_entry entry;
      
   27         if (!bc)
                      return 1;
      
   19         entry.family = sk->sk_family;
   19         entry_fill_addrs(&entry, sk);
   19         entry.sport = inet->inet_num;
              entry.dport = ntohs(inet->inet_dport);
              entry.ifindex = sk->sk_bound_dev_if;
   19         entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
   19         if (sk_fullsock(sk))
   19                 entry.mark = sk->sk_mark;
    4         else if (sk->sk_state == TCP_NEW_SYN_RECV)
    2                 entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
              else
                      entry.mark = 0;
      
   19         return inet_diag_bc_run(bc, &entry);
      }
      EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
      
      static int valid_cc(const void *bc, int len, int cc)
      {
              while (len >= 0) {
                      const struct inet_diag_bc_op *op = bc;
      
    3                 if (cc > len)
                              return 0;
    5                 if (cc == len)
                              return 1;
    5                 if (op->yes < 4 || op->yes & 3)
                              return 0;
    3                 len -= op->yes;
                      bc  += op->yes;
              }
              return 0;
      }
      
      /* data is u32 ifindex */
      static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
                                int *min_len)
      {
              /* Check ifindex space. */
              *min_len += sizeof(u32);
    6         if (len < *min_len)
                      return false;
      
              return true;
      }
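
       /*
        * Illustrative layout sketch (not part of the original file): a
        * device condition is one inet_diag_bc_op immediately followed by a
        * u32 ifindex in the same attribute, which is what the sizeof(u32)
        * check above accounts for:
        *
        *      [ struct inet_diag_bc_op, code = INET_DIAG_BC_DEV_COND ]
        *      [ u32 ifindex                                          ]
        *
        * inet_diag_bc_run() then reads it as *((const u32 *)(op + 1)).
        */
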
      /* Validate an inet_diag_hostcond. */
      static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
                                 int *min_len)
      {
              struct inet_diag_hostcond *cond;
              int addr_len;
      
              /* Check hostcond space. */
              *min_len += sizeof(struct inet_diag_hostcond);
    8         if (len < *min_len)
                      return false;
              cond = (struct inet_diag_hostcond *)(op + 1);
      
              /* Check address family and address length. */
    7         switch (cond->family) {
              case AF_UNSPEC:
                      addr_len = 0;
                      break;
              case AF_INET:
                      addr_len = sizeof(struct in_addr);
                      break;
              case AF_INET6:
                      addr_len = sizeof(struct in6_addr);
                      break;
              default:
                      return false;
              }
              *min_len += addr_len;
    6         if (len < *min_len)
                      return false;
      
              /* Check prefix length (in bits) vs address length (in bytes). */
    5         if (cond->prefix_len > 8 * addr_len)
                      return false;
      
              return true;
      }
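
       /*
        * Illustrative sketch (not part of the original file): for AF_INET,
        * the checks above mean a host condition must provide at least
        *
        *      sizeof(struct inet_diag_bc_op) +
        *      sizeof(struct inet_diag_hostcond) +
        *      sizeof(struct in_addr)
        *
        * bytes, with cond->prefix_len no larger than 32
        * (8 * sizeof(struct in_addr)).  AF_INET6 works the same way with a
        * 16-byte address and a 128-bit prefix limit, and AF_UNSPEC carries
        * no address at all.
        */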
      
      /* Validate a port comparison operator. */
      static bool valid_port_comparison(const struct inet_diag_bc_op *op,
                                        int len, int *min_len)
      {
              /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
              *min_len += sizeof(struct inet_diag_bc_op);
   13         if (len < *min_len)
                      return false;
              return true;
      }
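
       /*
        * Illustrative sketch (not part of the original file): a filter such
        * as "source port >= 1024" is laid out as two consecutive ops, the
        * second one existing only to carry the port value (which field of
        * the follow-on op holds the port is an assumption here, made for
        * illustration only):
        *
        *      struct inet_diag_bc_op ops[2] = {
        *              { .code = INET_DIAG_BC_S_GE, .yes = 8, .no = 12 },
        *              { .no = 1024 },
        *      };
        *
        * valid_port_comparison() only verifies that this second op actually
        * fits inside the attribute.
        */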
      
      static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
                                 int *min_len)
      {
              *min_len += sizeof(struct inet_diag_markcond);
              return len >= *min_len;
      }
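
       /*
        * Illustrative sketch (not part of the original file): a mark
        * condition is one inet_diag_bc_op followed by the mark/mask pair
        * that inet_diag_bc_run() tests above:
        *
        *      matches when (entry->mark & cond->mask) == cond->mark
        *
        * and inet_diag_bc_audit() below additionally requires CAP_NET_ADMIN
        * before accepting this op at all.
        */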
      
      static int inet_diag_bc_audit(const struct nlattr *attr,
                                    const struct sk_buff *skb)
      {
   46         bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
              const void *bytecode, *bc;
              int bytecode_len, len;
      
   42         if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
   46                 return -EINVAL;
      
   41         bytecode = bc = nla_data(attr);
              len = bytecode_len = nla_len(attr);
      
              while (len > 0) {
                      int min_len = sizeof(struct inet_diag_bc_op);
                      const struct inet_diag_bc_op *op = bc;
      
   41                 switch (op->code) {
                      case INET_DIAG_BC_S_COND:
                      case INET_DIAG_BC_D_COND:
    8                         if (!valid_hostcond(bc, len, &min_len))
                                      return -EINVAL;
                              break;
                      case INET_DIAG_BC_DEV_COND:
    6                         if (!valid_devcond(bc, len, &min_len))
                                      return -EINVAL;
                              break;
                      case INET_DIAG_BC_S_GE:
                      case INET_DIAG_BC_S_LE:
                      case INET_DIAG_BC_D_GE:
                      case INET_DIAG_BC_D_LE:
   13                         if (!valid_port_comparison(bc, len, &min_len))
                                      return -EINVAL;
                              break;
                      case INET_DIAG_BC_MARK_COND:
    6                         if (!net_admin)
                                      return -EPERM;
    5                         if (!valid_markcond(bc, len, &min_len))
                                      return -EINVAL;
                              break;
                      case INET_DIAG_BC_AUTO:
                      case INET_DIAG_BC_JMP:
                      case INET_DIAG_BC_NOP:
                              break;
                      default:
                              return -EINVAL;
                      }
      
   25                 if (op->code != INET_DIAG_BC_NOP) {
   28                         if (op->no < min_len || op->no > len + 4 || op->no & 3)
                                      return -EINVAL;
   25                         if (op->no < len &&
    5                             !valid_cc(bytecode, bytecode_len, len - op->no))
                                      return -EINVAL;
                      }
      
   26                 if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
                              return -EINVAL;
   24                 bc  += op->yes;
                      len -= op->yes;
              }
   23         return len == 0 ? 0 : -EINVAL;
      }
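
       /*
        * Illustrative sketch (not part of the original file): the smallest
        * program that passes this audit is a single 4-byte NOP whose "yes"
        * offset steps exactly to the end of the attribute:
        *
        *      struct inet_diag_bc_op nop = {
        *              .code   = INET_DIAG_BC_NOP,
        *              .yes    = sizeof(nop),
        *              .no     = sizeof(nop),
        *      };
        *
        * supplied as the INET_DIAG_REQ_BYTECODE attribute.  For every other
        * op the "no" branch target is checked as well, including the
        * valid_cc() walk that makes sure an in-range "no" target lands on a
        * valid op boundary.
        */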
      
      static int inet_csk_diag_dump(struct sock *sk,
                                    struct sk_buff *skb,
                                    struct netlink_callback *cb,
                                    const struct inet_diag_req_v2 *r,
                                    const struct nlattr *bc,
                                    bool net_admin)
      {
              if (!inet_diag_bc_sk(bc, sk))
                      return 0;
      
   12         return inet_csk_diag_fill(sk, skb, r,
                                        sk_user_ns(NETLINK_CB(cb->skb).sk),
                                        NETLINK_CB(cb->skb).portid,
                                        cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
                                        net_admin);
      }
      
      static void twsk_build_assert(void)
      {
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
                           offsetof(struct sock, sk_family));
      
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
                           offsetof(struct inet_sock, inet_num));
      
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
                           offsetof(struct inet_sock, inet_dport));
      
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
                           offsetof(struct inet_sock, inet_rcv_saddr));
      
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
                           offsetof(struct inet_sock, inet_daddr));
      
      #if IS_ENABLED(CONFIG_IPV6)
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
                           offsetof(struct sock, sk_v6_rcv_saddr));
      
              BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
                           offsetof(struct sock, sk_v6_daddr));
      #endif
      }
      
      void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
                               struct netlink_callback *cb,
                               const struct inet_diag_req_v2 *r, struct nlattr *bc)
      {
   28         struct net *net = sock_net(skb->sk);
              int i, num, s_i, s_num;
              u32 idiag_states = r->idiag_states;
              bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
      
              if (idiag_states & TCPF_SYN_RECV)
    7                 idiag_states |= TCPF_NEW_SYN_RECV;
   28         s_i = cb->args[1];
              s_num = num = cb->args[2];
      
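               /* First pass (cb->args[0] == 0): walk the listening hash if LISTEN sockets were requested. */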
              if (cb->args[0] == 0) {
   28                 if (!(idiag_states & TCPF_LISTEN))
                              goto skip_listen_ht;
      
   27                 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
                              struct inet_listen_hashbucket *ilb;
                              struct hlist_nulls_node *node;
                              struct sock *sk;
      
                              num = 0;
                              ilb = &hashinfo->listening_hash[i];
   27                         spin_lock_bh(&ilb->lock);
   27                         sk_nulls_for_each(sk, node, &ilb->head) {
                                      struct inet_sock *inet = inet_sk(sk);
      
   27                                 if (!net_eq(sock_net(sk), net))
                                              continue;
      
   21                                 if (num < s_num) {
                                              num++;
                                              continue;
                                      }
      
   21                                 if (r->sdiag_family != AF_UNSPEC &&
   19                                     sk->sk_family != r->sdiag_family)
                                              goto next_listen;
      
   20                                 if (r->id.idiag_sport != inet->inet_sport &&
                                          r->id.idiag_sport)
                                              goto next_listen;
      
   19                                 if (r->id.idiag_dport ||
   18                                     cb->args[3] > 0)
                                              goto next_listen;
      
   12                                 if (inet_csk_diag_dump(sk, skb, cb, r,
                                                             bc, net_admin) < 0) {
    1                                         spin_unlock_bh(&ilb->lock);
                                              goto done;
                                      }
      
      next_listen:
   21                                 cb->args[3] = 0;
                                      cb->args[4] = 0;
                                      ++num;
                              }
   27                         spin_unlock_bh(&ilb->lock);
      
                              s_num = 0;
                              cb->args[3] = 0;
                              cb->args[4] = 0;
                      }
      skip_listen_ht:
   27                 cb->args[0] = 1;
                      s_i = num = s_num = 0;
              }
      
   27         if (!(idiag_states & ~TCPF_LISTEN))
                      goto out;
      
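       /* Second pass: walk the established hash (ehash) for the remaining states. */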
   25         for (i = s_i; i <= hashinfo->ehash_mask; i++) {
   25                 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
   25                 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
                      struct hlist_nulls_node *node;
                      struct sock *sk;
      
                      num = 0;
      
                      if (hlist_nulls_empty(&head->chain))
                              continue;
      
   25                 if (i > s_i)
                              s_num = 0;
      
                      spin_lock_bh(lock);
   25                 sk_nulls_for_each(sk, node, &head->chain) {
                              int state, res;
      
   25                         if (!net_eq(sock_net(sk), net))
                                      continue;
   19                         if (num < s_num)
                                      goto next_normal;
   19                         state = (sk->sk_state == TCP_TIME_WAIT) ?
   19                                 inet_twsk(sk)->tw_substate : sk->sk_state;
   19                         if (!(idiag_states & (1 << state)))
                                      goto next_normal;
   10                         if (r->sdiag_family != AF_UNSPEC &&
    7                             sk->sk_family != r->sdiag_family)
                                      goto next_normal;
   10                         if (r->id.idiag_sport != htons(sk->sk_num) &&
                                  r->id.idiag_sport)
                                      goto next_normal;
    9                         if (r->id.idiag_dport != sk->sk_dport &&
                                  r->id.idiag_dport)
                                      goto next_normal;
                              twsk_build_assert();
      
                              if (!inet_diag_bc_sk(bc, sk))
                                      goto next_normal;
      
    7                         res = sk_diag_fill(sk, skb, r,
                                                 sk_user_ns(NETLINK_CB(cb->skb).sk),
                                                 NETLINK_CB(cb->skb).portid,
                                                 cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                                 cb->nlh, net_admin);
                              if (res < 0) {
    1                                 spin_unlock_bh(lock);
                                      goto done;
                              }
      next_normal:
   19                         ++num;
                      }
      
   25                 spin_unlock_bh(lock);
              }
      
      done:
   26         cb->args[1] = i;
              cb->args[2] = num;
      out:
              ;
   28 }
      EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
      
      static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
                                  const struct inet_diag_req_v2 *r,
                                  struct nlattr *bc)
      {
              const struct inet_diag_handler *handler;
              int err = 0;
      
   37         handler = inet_diag_lock_handler(r->sdiag_protocol);
              if (!IS_ERR(handler))
   36                 handler->dump(skb, cb, r, bc);
              else
    1                 err = PTR_ERR(handler);
              inet_diag_unlock_handler(handler);
      
   37         return err ? : skb->len;
      }
      
      static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
      {
              int hdrlen = sizeof(struct inet_diag_req_v2);
              struct nlattr *bc = NULL;
      
   30         if (nlmsg_attrlen(cb->nlh, hdrlen))
   22                 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
      
   30         return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
      }
      
      static int inet_diag_type2proto(int type)
      {
              switch (type) {
              case TCPDIAG_GETSOCK:
                      return IPPROTO_TCP;
              case DCCPDIAG_GETSOCK:
                      return IPPROTO_DCCP;
              default:
                      return 0;
              }
      }
      
      static int inet_diag_dump_compat(struct sk_buff *skb,
                                       struct netlink_callback *cb)
      {
    7         struct inet_diag_req *rc = nlmsg_data(cb->nlh);
              int hdrlen = sizeof(struct inet_diag_req);
              struct inet_diag_req_v2 req;
              struct nlattr *bc = NULL;
      
              req.sdiag_family = AF_UNSPEC; /* compatibility */
    7         req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
              req.idiag_ext = rc->idiag_ext;
              req.idiag_states = rc->idiag_states;
              req.id = rc->id;
      
              if (nlmsg_attrlen(cb->nlh, hdrlen))
    1                 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
      
    7         return __inet_diag_dump(skb, cb, &req, bc);
      }
      
      static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
                                            const struct nlmsghdr *nlh)
      {
              struct inet_diag_req *rc = nlmsg_data(nlh);
              struct inet_diag_req_v2 req;
      
    2         req.sdiag_family = rc->idiag_family;
    2         req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
              req.idiag_ext = rc->idiag_ext;
              req.idiag_states = rc->idiag_states;
              req.id = rc->id;
      
              return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
      }
      
      static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
      {
              int hdrlen = sizeof(struct inet_diag_req);
   10         struct net *net = sock_net(skb->sk);
      
   13         if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
   13             nlmsg_len(nlh) < hdrlen)
                      return -EINVAL;
      
   12         if (nlh->nlmsg_flags & NLM_F_DUMP) {
                      if (nlmsg_attrlen(nlh, hdrlen)) {
                              struct nlattr *attr;
                              int err;
      
    4                         attr = nlmsg_find_attr(nlh, hdrlen,
                                                     INET_DIAG_REQ_BYTECODE);
                              err = inet_diag_bc_audit(attr, skb);
   13                         if (err)
                                      return err;
                      }
                      {
    7                         struct netlink_dump_control c = {
                                      .dump = inet_diag_dump_compat,
                              };
                              return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
                      }
              }
      
    2         return inet_diag_get_exact_compat(skb, nlh);
      }
      
      static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
      {
              int hdrlen = sizeof(struct inet_diag_req_v2);
   50         struct net *net = sock_net(skb->sk);
      
   95         if (nlmsg_len(h) < hdrlen)
                      return -EINVAL;
      
   94         if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
   68             h->nlmsg_flags & NLM_F_DUMP) {
                      if (nlmsg_attrlen(h, hdrlen)) {
                              struct nlattr *attr;
                              int err;
      
   42                         attr = nlmsg_find_attr(h, hdrlen,
                                                     INET_DIAG_REQ_BYTECODE);
                              err = inet_diag_bc_audit(attr, skb);
                              if (err)
                                      return err;
                      }
                      {
   30                         struct netlink_dump_control c = {
                                      .dump = inet_diag_dump,
                              };
                              return netlink_dump_start(net->diag_nlsk, skb, h, &c);
                      }
              }
      
   95         return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
      }
      
      static
      int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
      {
              const struct inet_diag_handler *handler;
              struct nlmsghdr *nlh;
              struct nlattr *attr;
              struct inet_diag_msg *r;
              void *info = NULL;
              int err = 0;
      
              nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0);
              if (!nlh)
                      return -ENOMEM;
      
              r = nlmsg_data(nlh);
              memset(r, 0, sizeof(*r));
              inet_diag_msg_common_fill(r, sk);
              if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM)
                      r->id.idiag_sport = inet_sk(sk)->inet_sport;
              r->idiag_state = sk->sk_state;
      
              if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) {
                      nlmsg_cancel(skb, nlh);
                      return err;
              }
      
              handler = inet_diag_lock_handler(sk->sk_protocol);
              if (IS_ERR(handler)) {
                      inet_diag_unlock_handler(handler);
                      nlmsg_cancel(skb, nlh);
                      return PTR_ERR(handler);
              }
      
              attr = handler->idiag_info_size
                      ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
                      : NULL;
              if (attr)
                      info = nla_data(attr);
      
              handler->idiag_get_info(sk, r, info);
              inet_diag_unlock_handler(handler);
      
              nlmsg_end(skb, nlh);
              return 0;
      }
      
      static const struct sock_diag_handler inet_diag_handler = {
              .family = AF_INET,
              .dump = inet_diag_handler_cmd,
              .get_info = inet_diag_handler_get_info,
              .destroy = inet_diag_handler_cmd,
      };
      
      static const struct sock_diag_handler inet6_diag_handler = {
              .family = AF_INET6,
              .dump = inet_diag_handler_cmd,
              .get_info = inet_diag_handler_get_info,
              .destroy = inet_diag_handler_cmd,
      };
      
      int inet_diag_register(const struct inet_diag_handler *h)
      {
              const __u16 type = h->idiag_type;
              int err = -EINVAL;
      
              if (type >= IPPROTO_MAX)
                      goto out;
      
              mutex_lock(&inet_diag_table_mutex);
              err = -EEXIST;
              if (!inet_diag_table[type]) {
                      inet_diag_table[type] = h;
                      err = 0;
              }
              mutex_unlock(&inet_diag_table_mutex);
      out:
              return err;
      }
      EXPORT_SYMBOL_GPL(inet_diag_register);
      
      void inet_diag_unregister(const struct inet_diag_handler *h)
      {
              const __u16 type = h->idiag_type;
      
              if (type >= IPPROTO_MAX)
                      return;
      
              mutex_lock(&inet_diag_table_mutex);
              inet_diag_table[type] = NULL;
              mutex_unlock(&inet_diag_table_mutex);
      }
      EXPORT_SYMBOL_GPL(inet_diag_unregister);
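
       /*
        * Illustrative sketch (not part of the original file): a protocol
        * diag module registers a handler roughly like this; the handler and
        * callback names are made up, and only the fields used in this file
        * (.dump, .idiag_get_info, .idiag_info_size, .idiag_type) are shown:
        *
        *      static const struct inet_diag_handler my_diag_handler = {
        *              .dump            = my_diag_dump,
        *              .idiag_get_info  = my_diag_get_info,
        *              .idiag_info_size = sizeof(struct my_diag_info),
        *              .idiag_type      = IPPROTO_TCP,
        *      };
        *
        *      err = inet_diag_register(&my_diag_handler);
        *      if (err)
        *              return err;
        *
        * with a matching inet_diag_unregister(&my_diag_handler) on module
        * exit.
        */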
      
      static int __init inet_diag_init(void)
      {
              const int inet_diag_table_size = (IPPROTO_MAX *
                                                sizeof(struct inet_diag_handler *));
              int err = -ENOMEM;
      
              inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
              if (!inet_diag_table)
                      goto out;
      
              err = sock_diag_register(&inet_diag_handler);
              if (err)
                      goto out_free_nl;
      
              err = sock_diag_register(&inet6_diag_handler);
              if (err)
                      goto out_free_inet;
      
              sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
      out:
              return err;
      
      out_free_inet:
              sock_diag_unregister(&inet_diag_handler);
      out_free_nl:
              kfree(inet_diag_table);
              goto out;
      }
      
      static void __exit inet_diag_exit(void)
      {
              sock_diag_unregister(&inet6_diag_handler);
              sock_diag_unregister(&inet_diag_handler);
              sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
              kfree(inet_diag_table);
      }
      
      module_init(inet_diag_init);
      module_exit(inet_diag_exit);
      MODULE_LICENSE("GPL");
      MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
      MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
      /*
        * fs/sysfs/mount.c - operations for initializing and mounting sysfs
       *
       * Copyright (c) 2001-3 Patrick Mochel
       * Copyright (c) 2007 SUSE Linux Products GmbH
       * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
       *
       * This file is released under the GPLv2.
       *
       * Please see Documentation/filesystems/sysfs.txt for more information.
       */
      
      #define DEBUG
      
      #include <linux/fs.h>
      #include <linux/magic.h>
      #include <linux/mount.h>
      #include <linux/init.h>
      #include <linux/user_namespace.h>
      
      #include "sysfs.h"
      
      static struct kernfs_root *sysfs_root;
      struct kernfs_node *sysfs_root_kn;
      
      static struct dentry *sysfs_mount(struct file_system_type *fs_type,
              int flags, const char *dev_name, void *data)
      {
              struct dentry *root;
              void *ns;
              bool new_sb;
      
   29         if (!(flags & MS_KERNMOUNT)) {
   29                 if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
                              return ERR_PTR(-EPERM);
              }
      
   29         ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
              root = kernfs_mount_ns(fs_type, flags, sysfs_root,
                                      SYSFS_MAGIC, &new_sb, ns);
   29         if (IS_ERR(root) || !new_sb)
   29                 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
              else if (new_sb)
                      /* Userspace would break if executables appear on sysfs */
   26                 root->d_sb->s_iflags |= SB_I_NOEXEC;
      
              return root;
      }
      
      static void sysfs_kill_sb(struct super_block *sb)
      {
    8         void *ns = (void *)kernfs_super_ns(sb);
      
              kernfs_kill_sb(sb);
              kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
      }
      
      static struct file_system_type sysfs_fs_type = {
              .name                = "sysfs",
              .mount                = sysfs_mount,
              .kill_sb        = sysfs_kill_sb,
              .fs_flags        = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
      };
      
      int __init sysfs_init(void)
      {
              int err;
      
              sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
                                              NULL);
              if (IS_ERR(sysfs_root))
                      return PTR_ERR(sysfs_root);
      
              sysfs_root_kn = sysfs_root->kn;
      
              err = register_filesystem(&sysfs_fs_type);
              if (err) {
                      kernfs_destroy_root(sysfs_root);
                      return err;
              }
      
              return 0;
      }
      #ifndef _NF_CONNTRACK_TIMEOUT_H
      #define _NF_CONNTRACK_TIMEOUT_H
      
      #include <net/net_namespace.h>
      #include <linux/netfilter/nf_conntrack_common.h>
      #include <linux/netfilter/nf_conntrack_tuple_common.h>
      #include <net/netfilter/nf_conntrack.h>
      #include <net/netfilter/nf_conntrack_extend.h>
      
      #define CTNL_TIMEOUT_NAME_MAX        32
      
      struct ctnl_timeout {
              struct list_head        head;
              struct rcu_head                rcu_head;
              atomic_t                refcnt;
              char                        name[CTNL_TIMEOUT_NAME_MAX];
              __u16                        l3num;
              struct nf_conntrack_l4proto *l4proto;
              char                        data[0];
      };
      
      struct nf_conn_timeout {
              struct ctnl_timeout __rcu *timeout;
      };
      
      static inline unsigned int *
      nf_ct_timeout_data(struct nf_conn_timeout *t)
      {
              struct ctnl_timeout *timeout;
      
              timeout = rcu_dereference(t->timeout);
              if (timeout == NULL)
                      return NULL;
      
              return (unsigned int *)timeout->data;
      }
      
      static inline
      struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct)
      {
      #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
              return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT);
      #else
              return NULL;
      #endif
      }
      
      static inline
      struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct,
                                                    struct ctnl_timeout *timeout,
                                                    gfp_t gfp)
      {
      #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
              struct nf_conn_timeout *timeout_ext;
      
              timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp);
              if (timeout_ext == NULL)
                      return NULL;
      
              rcu_assign_pointer(timeout_ext->timeout, timeout);
      
              return timeout_ext;
      #else
              return NULL;
      #endif
       }
      
      static inline unsigned int *
      nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct,
                           struct nf_conntrack_l4proto *l4proto)
      {
      #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
              struct nf_conn_timeout *timeout_ext;
              unsigned int *timeouts;
      
              timeout_ext = nf_ct_timeout_find(ct);
              if (timeout_ext) {
                      timeouts = nf_ct_timeout_data(timeout_ext);
                      if (unlikely(!timeouts))
                              timeouts = l4proto->get_timeouts(net);
              } else {
                      timeouts = l4proto->get_timeouts(net);
              }
      
              return timeouts;
      #else
 1409         return l4proto->get_timeouts(net);
      #endif
      }
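
       /*
        * Illustrative sketch (not part of the original file): a conntrack
        * L4 protocol handler typically fetches its timeout table once per
        * packet and then indexes it by protocol state, e.g.
        *
        *      unsigned int *timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
        *
        *      nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
        *
        * nf_ct_refresh_acct() and new_state are outside this header and are
        * used here purely for illustration.
        */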
      
      #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
      int nf_conntrack_timeout_init(void);
      void nf_conntrack_timeout_fini(void);
      #else
      static inline int nf_conntrack_timeout_init(void)
      {
              return 0;
      }
      
      static inline void nf_conntrack_timeout_fini(void)
      {
              return;
      }
      #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
      
      #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
      extern struct ctnl_timeout *(*nf_ct_timeout_find_get_hook)(const char *name);
      extern void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout);
      #endif
      
      #endif /* _NF_CONNTRACK_TIMEOUT_H */
      /*
       * syscalls.h - Linux syscall interfaces (non-arch-specific)
       *
       * Copyright (c) 2004 Randy Dunlap
       * Copyright (c) 2004 Open Source Development Labs
       *
       * This file is released under the GPLv2.
       * See the file COPYING for more details.
       */
      
      #ifndef _LINUX_SYSCALLS_H
      #define _LINUX_SYSCALLS_H
      
      struct epoll_event;
      struct iattr;
      struct inode;
      struct iocb;
      struct io_event;
      struct iovec;
      struct itimerspec;
      struct itimerval;
      struct kexec_segment;
      struct linux_dirent;
      struct linux_dirent64;
      struct list_head;
      struct mmap_arg_struct;
      struct msgbuf;
      struct user_msghdr;
      struct mmsghdr;
      struct msqid_ds;
      struct new_utsname;
      struct nfsctl_arg;
      struct __old_kernel_stat;
      struct oldold_utsname;
      struct old_utsname;
      struct pollfd;
      struct rlimit;
      struct rlimit64;
      struct rusage;
      struct sched_param;
      struct sched_attr;
      struct sel_arg_struct;
      struct semaphore;
      struct sembuf;
      struct shmid_ds;
      struct sockaddr;
      struct stat;
      struct stat64;
      struct statfs;
      struct statfs64;
      struct __sysctl_args;
      struct sysinfo;
      struct timespec;
      struct timeval;
      struct timex;
      struct timezone;
      struct tms;
      struct utimbuf;
      struct mq_attr;
      struct compat_stat;
      struct compat_timeval;
      struct robust_list_head;
      struct getcpu_cache;
      struct old_linux_dirent;
      struct perf_event_attr;
      struct file_handle;
      struct sigaltstack;
      union bpf_attr;
      
      #include <linux/types.h>
      #include <linux/aio_abi.h>
      #include <linux/capability.h>
      #include <linux/signal.h>
      #include <linux/list.h>
      #include <linux/bug.h>
      #include <linux/sem.h>
      #include <asm/siginfo.h>
      #include <linux/unistd.h>
      #include <linux/quota.h>
      #include <linux/key.h>
      #include <trace/syscall.h>
      
      /*
       * __MAP - apply a macro to syscall arguments
       * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to
       *    m(t1, a1), m(t2, a2), ..., m(tn, an)
        * The first argument must be equal to the number of type/name
       * pairs given.  Note that this list of pairs (i.e. the arguments
       * of __MAP starting at the third one) is in the same format as
       * for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
       */
      #define __MAP0(m,...)
      #define __MAP1(m,t,a) m(t,a)
      #define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
      #define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
      #define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
      #define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__)
      #define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__)
      #define __MAP(n,...) __MAP##n(__VA_ARGS__)
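
       /*
        * Illustrative expansion (not part of the original file): with the
        * __SC_DECL() helper defined just below,
        *
        *      __MAP(2, __SC_DECL, unsigned int, fd, char __user *, buf)
        *
        * expands to
        *
        *      __SC_DECL(unsigned int, fd), __SC_DECL(char __user *, buf)
        *
        * i.e. "unsigned int fd, char __user * buf", which is how
        * SYSCALL_DEFINEx() builds a syscall's parameter list.
        */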
      
      #define __SC_DECL(t, a)        t a
      #define __TYPE_IS_L(t)        (__same_type((t)0, 0L))
      #define __TYPE_IS_UL(t)        (__same_type((t)0, 0UL))
      #define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL))
      #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
      #define __SC_CAST(t, a)        (t) a
      #define __SC_ARGS(t, a)        a
      #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
      
      #ifdef CONFIG_FTRACE_SYSCALLS
      #define __SC_STR_ADECL(t, a)        #a
      #define __SC_STR_TDECL(t, a)        #t
      
      extern struct trace_event_class event_class_syscall_enter;
      extern struct trace_event_class event_class_syscall_exit;
      extern struct trace_event_functions enter_syscall_print_funcs;
      extern struct trace_event_functions exit_syscall_print_funcs;
      
      #define SYSCALL_TRACE_ENTER_EVENT(sname)                                \
              static struct syscall_metadata __syscall_meta_##sname;                \
              static struct trace_event_call __used                                \
                event_enter_##sname = {                                        \
                      .class                        = &event_class_syscall_enter,        \
                      {                                                        \
                              .name                   = "sys_enter"#sname,        \
                      },                                                        \
                      .event.funcs            = &enter_syscall_print_funcs,        \
                      .data                        = (void *)&__syscall_meta_##sname,\
                      .flags                  = TRACE_EVENT_FL_CAP_ANY,        \
              };                                                                \
              static struct trace_event_call __used                                \
                __attribute__((section("_ftrace_events")))                        \
               *__event_enter_##sname = &event_enter_##sname;
      
      #define SYSCALL_TRACE_EXIT_EVENT(sname)                                        \
              static struct syscall_metadata __syscall_meta_##sname;                \
              static struct trace_event_call __used                                \
                event_exit_##sname = {                                        \
                      .class                        = &event_class_syscall_exit,        \
                      {                                                        \
                              .name                   = "sys_exit"#sname,        \
                      },                                                        \
                      .event.funcs                = &exit_syscall_print_funcs,        \
                      .data                        = (void *)&__syscall_meta_##sname,\
                      .flags                  = TRACE_EVENT_FL_CAP_ANY,        \
              };                                                                \
              static struct trace_event_call __used                                \
                __attribute__((section("_ftrace_events")))                        \
              *__event_exit_##sname = &event_exit_##sname;
      
      #define SYSCALL_METADATA(sname, nb, ...)                        \
              static const char *types_##sname[] = {                        \
                      __MAP(nb,__SC_STR_TDECL,__VA_ARGS__)                \
              };                                                        \
              static const char *args_##sname[] = {                        \
                      __MAP(nb,__SC_STR_ADECL,__VA_ARGS__)                \
              };                                                        \
              SYSCALL_TRACE_ENTER_EVENT(sname);                        \
              SYSCALL_TRACE_EXIT_EVENT(sname);                        \
              static struct syscall_metadata __used                        \
                __syscall_meta_##sname = {                                \
                      .name                 = "sys"#sname,                        \
                      .syscall_nr        = -1,        /* Filled in at boot */        \
                      .nb_args         = nb,                                \
                      .types                = nb ? types_##sname : NULL,        \
                      .args                = nb ? args_##sname : NULL,        \
                      .enter_event        = &event_enter_##sname,                \
                      .exit_event        = &event_exit_##sname,                \
                      .enter_fields        = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
              };                                                        \
              static struct syscall_metadata __used                        \
                __attribute__((section("__syscalls_metadata")))        \
               *__p_syscall_meta_##sname = &__syscall_meta_##sname;
      #else
      #define SYSCALL_METADATA(sname, nb, ...)
      #endif
      
      #define SYSCALL_DEFINE0(sname)                                        \
              SYSCALL_METADATA(_##sname, 0);                                \
              asmlinkage long sys_##sname(void)
      
      #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
      #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
      #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
      #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
      #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
      #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
      
      #define SYSCALL_DEFINEx(x, sname, ...)                                \
              SYSCALL_METADATA(sname, x, __VA_ARGS__)                        \
              __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
      
      #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
      #define __SYSCALL_DEFINEx(x, name, ...)                                        \
              asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))        \
                      __attribute__((alias(__stringify(SyS##name))));                \
              static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));        \
              asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));        \
              asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))        \
              {                                                                \
                      long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));        \
                      __MAP(x,__SC_TEST,__VA_ARGS__);                                \
                      __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));        \
                      return ret;                                                \
              }                                                                \
              static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
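
       /*
        * Illustrative sketch (not part of the original file): with the
        * macros above,
        *
        *      SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
        *
        * roughly produces (ignoring the tracing metadata)
        *
        *      asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
        *              __attribute__((alias("SyS_dup2")));
        *      asmlinkage long SyS_dup2(long oldfd, long newfd)
        *      {
        *              long ret = SYSC_dup2((unsigned int) oldfd,
        *                                   (unsigned int) newfd);
        *              return ret;
        *      }
        *      static inline long SYSC_dup2(unsigned int oldfd,
        *                                   unsigned int newfd)
        *
        * with the actual function body following the macro invocation.
        */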
      
      asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
                                     qid_t id, void __user *addr);
      asmlinkage long sys_time(time_t __user *tloc);
      asmlinkage long sys_stime(time_t __user *tptr);
      asmlinkage long sys_gettimeofday(struct timeval __user *tv,
                                      struct timezone __user *tz);
      asmlinkage long sys_settimeofday(struct timeval __user *tv,
                                      struct timezone __user *tz);
      asmlinkage long sys_adjtimex(struct timex __user *txc_p);
      
      asmlinkage long sys_times(struct tms __user *tbuf);
  717 
      asmlinkage long sys_gettid(void);
      asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
      asmlinkage long sys_alarm(unsigned int seconds);
      asmlinkage long sys_getpid(void);
  717 asmlinkage long sys_getppid(void);
      asmlinkage long sys_getuid(void);
      asmlinkage long sys_geteuid(void);
      asmlinkage long sys_getgid(void);
      asmlinkage long sys_getegid(void);
      asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid);
      asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid);
      asmlinkage long sys_getpgid(pid_t pid);
      asmlinkage long sys_getpgrp(void);
      asmlinkage long sys_getsid(pid_t pid);
      asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist);
      
      asmlinkage long sys_setregid(gid_t rgid, gid_t egid);
      asmlinkage long sys_setgid(gid_t gid);
      asmlinkage long sys_setreuid(uid_t ruid, uid_t euid);
      asmlinkage long sys_setuid(uid_t uid);
      asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
      asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
      asmlinkage long sys_setfsuid(uid_t uid);
      asmlinkage long sys_setfsgid(gid_t gid);
      asmlinkage long sys_setpgid(pid_t pid, pid_t pgid);
      asmlinkage long sys_setsid(void);
      asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist);
      
      asmlinkage long sys_acct(const char __user *name);
      asmlinkage long sys_capget(cap_user_header_t header,
                                      cap_user_data_t dataptr);
      asmlinkage long sys_capset(cap_user_header_t header,
                                      const cap_user_data_t data);
      asmlinkage long sys_personality(unsigned int personality);
      
      asmlinkage long sys_sigpending(old_sigset_t __user *set);
      asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
                                      old_sigset_t __user *oset);
      asmlinkage long sys_sigaltstack(const struct sigaltstack __user *uss,
                                      struct sigaltstack __user *uoss);
      
      asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
      asmlinkage long sys_setitimer(int which,
                                      struct itimerval __user *value,
                                      struct itimerval __user *ovalue);
      asmlinkage long sys_timer_create(clockid_t which_clock,
                                       struct sigevent __user *timer_event_spec,
                                       timer_t __user * created_timer_id);
      asmlinkage long sys_timer_gettime(timer_t timer_id,
                                      struct itimerspec __user *setting);
      asmlinkage long sys_timer_getoverrun(timer_t timer_id);
      asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
                                      const struct itimerspec __user *new_setting,
                                      struct itimerspec __user *old_setting);
      asmlinkage long sys_timer_delete(timer_t timer_id);
      asmlinkage long sys_clock_settime(clockid_t which_clock,
                                      const struct timespec __user *tp);
      asmlinkage long sys_clock_gettime(clockid_t which_clock,
                                      struct timespec __user *tp);
      asmlinkage long sys_clock_adjtime(clockid_t which_clock,
                                      struct timex __user *tx);
      asmlinkage long sys_clock_getres(clockid_t which_clock,
                                      struct timespec __user *tp);
      asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
                                      const struct timespec __user *rqtp,
                                      struct timespec __user *rmtp);
      
      asmlinkage long sys_nice(int increment);
      asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
                                              struct sched_param __user *param);
      asmlinkage long sys_sched_setparam(pid_t pid,
                                              struct sched_param __user *param);
      asmlinkage long sys_sched_setattr(pid_t pid,
                                              struct sched_attr __user *attr,
                                              unsigned int flags);
      asmlinkage long sys_sched_getscheduler(pid_t pid);
      asmlinkage long sys_sched_getparam(pid_t pid,
                                              struct sched_param __user *param);
      asmlinkage long sys_sched_getattr(pid_t pid,
                                              struct sched_attr __user *attr,
                                              unsigned int size,
                                              unsigned int flags);
      asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                              unsigned long __user *user_mask_ptr);
      asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
                                              unsigned long __user *user_mask_ptr);
      asmlinkage long sys_sched_yield(void);
      asmlinkage long sys_sched_get_priority_max(int policy);
      asmlinkage long sys_sched_get_priority_min(int policy);
      asmlinkage long sys_sched_rr_get_interval(pid_t pid,
                                              struct timespec __user *interval);
      asmlinkage long sys_setpriority(int which, int who, int niceval);
      asmlinkage long sys_getpriority(int which, int who);
      
      asmlinkage long sys_shutdown(int, int);
      asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
                                      void __user *arg);
      asmlinkage long sys_restart_syscall(void);
      asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                                      struct kexec_segment __user *segments,
                                      unsigned long flags);
      asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd,
                                          unsigned long cmdline_len,
                                          const char __user *cmdline_ptr,
                                          unsigned long flags);
      
      asmlinkage long sys_exit(int error_code);
      asmlinkage long sys_exit_group(int error_code);
      asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
                                      int options, struct rusage __user *ru);
      asmlinkage long sys_waitid(int which, pid_t pid,
                                 struct siginfo __user *infop,
                                 int options, struct rusage __user *ru);
      asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
      asmlinkage long sys_set_tid_address(int __user *tidptr);
      asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
                              struct timespec __user *utime, u32 __user *uaddr2,
                              u32 val3);
      
      asmlinkage long sys_init_module(void __user *umod, unsigned long len,
                                      const char __user *uargs);
      asmlinkage long sys_delete_module(const char __user *name_user,
                                      unsigned int flags);
      
      #ifdef CONFIG_OLD_SIGSUSPEND
      asmlinkage long sys_sigsuspend(old_sigset_t mask);
      #endif
      
      #ifdef CONFIG_OLD_SIGSUSPEND3
      asmlinkage long sys_sigsuspend(int unused1, int unused2, old_sigset_t mask);
      #endif
      
      asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
      
      #ifdef CONFIG_OLD_SIGACTION
      asmlinkage long sys_sigaction(int, const struct old_sigaction __user *,
                                      struct old_sigaction __user *);
      #endif
      
      #ifndef CONFIG_ODD_RT_SIGACTION
      asmlinkage long sys_rt_sigaction(int,
                                       const struct sigaction __user *,
                                       struct sigaction __user *,
                                       size_t);
      #endif
      asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
                                      sigset_t __user *oset, size_t sigsetsize);
      asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
      asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
                                      siginfo_t __user *uinfo,
                                      const struct timespec __user *uts,
                                      size_t sigsetsize);
      asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t  pid, int sig,
                      siginfo_t __user *uinfo);
      asmlinkage long sys_kill(int pid, int sig);
      asmlinkage long sys_tgkill(int tgid, int pid, int sig);
      asmlinkage long sys_tkill(int pid, int sig);
      asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
      asmlinkage long sys_sgetmask(void);
      asmlinkage long sys_ssetmask(int newmask);
      asmlinkage long sys_signal(int sig, __sighandler_t handler);
      asmlinkage long sys_pause(void);
      
      asmlinkage long sys_sync(void);
      asmlinkage long sys_fsync(unsigned int fd);
      asmlinkage long sys_fdatasync(unsigned int fd);
      asmlinkage long sys_bdflush(int func, long data);
      asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
                                      char __user *type, unsigned long flags,
                                      void __user *data);
      asmlinkage long sys_umount(char __user *name, int flags);
      asmlinkage long sys_oldumount(char __user *name);
      asmlinkage long sys_truncate(const char __user *path, long length);
      asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
      asmlinkage long sys_stat(const char __user *filename,
                              struct __old_kernel_stat __user *statbuf);
      asmlinkage long sys_statfs(const char __user * path,
                                      struct statfs __user *buf);
      asmlinkage long sys_statfs64(const char __user *path, size_t sz,
                                      struct statfs64 __user *buf);
      asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
      asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
                                      struct statfs64 __user *buf);
      asmlinkage long sys_lstat(const char __user *filename,
                              struct __old_kernel_stat __user *statbuf);
      asmlinkage long sys_fstat(unsigned int fd,
                              struct __old_kernel_stat __user *statbuf);
      asmlinkage long sys_newstat(const char __user *filename,
                                      struct stat __user *statbuf);
      asmlinkage long sys_newlstat(const char __user *filename,
                                      struct stat __user *statbuf);
      asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
      asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
      #if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
      asmlinkage long sys_stat64(const char __user *filename,
                                      struct stat64 __user *statbuf);
      asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
      asmlinkage long sys_lstat64(const char __user *filename,
                                      struct stat64 __user *statbuf);
      asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
                                     struct stat64 __user *statbuf, int flag);
      #endif
      #if BITS_PER_LONG == 32
      asmlinkage long sys_truncate64(const char __user *path, loff_t length);
      asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
      #endif
      
      asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
                                   const void __user *value, size_t size, int flags);
      asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
                                    const void __user *value, size_t size, int flags);
      asmlinkage long sys_fsetxattr(int fd, const char __user *name,
                                    const void __user *value, size_t size, int flags);
      asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
                                   void __user *value, size_t size);
      asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
                                    void __user *value, size_t size);
      asmlinkage long sys_fgetxattr(int fd, const char __user *name,
                                    void __user *value, size_t size);
      asmlinkage long sys_listxattr(const char __user *path, char __user *list,
                                    size_t size);
      asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
                                     size_t size);
      asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
      asmlinkage long sys_removexattr(const char __user *path,
                                      const char __user *name);
      asmlinkage long sys_lremovexattr(const char __user *path,
                                       const char __user *name);
      asmlinkage long sys_fremovexattr(int fd, const char __user *name);
      
      asmlinkage long sys_brk(unsigned long brk);
      asmlinkage long sys_mprotect(unsigned long start, size_t len,
                                      unsigned long prot);
      asmlinkage long sys_mremap(unsigned long addr,
                                 unsigned long old_len, unsigned long new_len,
                                 unsigned long flags, unsigned long new_addr);
      asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
                              unsigned long prot, unsigned long pgoff,
                              unsigned long flags);
      asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
      asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
      asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
      asmlinkage long sys_munmap(unsigned long addr, size_t len);
      asmlinkage long sys_mlock(unsigned long start, size_t len);
      asmlinkage long sys_munlock(unsigned long start, size_t len);
      asmlinkage long sys_mlockall(int flags);
      asmlinkage long sys_munlockall(void);
      asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
      asmlinkage long sys_mincore(unsigned long start, size_t len,
                                      unsigned char __user * vec);
      
      asmlinkage long sys_pivot_root(const char __user *new_root,
                                      const char __user *put_old);
      asmlinkage long sys_chroot(const char __user *filename);
      asmlinkage long sys_mknod(const char __user *filename, umode_t mode,
                                      unsigned dev);
      asmlinkage long sys_link(const char __user *oldname,
                                      const char __user *newname);
      asmlinkage long sys_symlink(const char __user *old, const char __user *new);
      asmlinkage long sys_unlink(const char __user *pathname);
      asmlinkage long sys_rename(const char __user *oldname,
                                      const char __user *newname);
      asmlinkage long sys_chmod(const char __user *filename, umode_t mode);
      asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
      
      asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg);
      #if BITS_PER_LONG == 32
      asmlinkage long sys_fcntl64(unsigned int fd,
                                      unsigned int cmd, unsigned long arg);
      #endif
      asmlinkage long sys_pipe(int __user *fildes);
      asmlinkage long sys_pipe2(int __user *fildes, int flags);
      asmlinkage long sys_dup(unsigned int fildes);
      asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
      asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags);
      asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
      asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd,
                                      unsigned long arg);
      asmlinkage long sys_flock(unsigned int fd, unsigned int cmd);
      asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
      asmlinkage long sys_io_destroy(aio_context_t ctx);
      asmlinkage long sys_io_getevents(aio_context_t ctx_id,
                                      long min_nr,
                                      long nr,
                                      struct io_event __user *events,
                                      struct timespec __user *timeout);
      asmlinkage long sys_io_submit(aio_context_t, long,
                                      struct iocb __user * __user *);
      asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
                                    struct io_event __user *result);
      asmlinkage long sys_sendfile(int out_fd, int in_fd,
                                   off_t __user *offset, size_t count);
      asmlinkage long sys_sendfile64(int out_fd, int in_fd,
                                     loff_t __user *offset, size_t count);
      asmlinkage long sys_readlink(const char __user *path,
                                      char __user *buf, int bufsiz);
      asmlinkage long sys_creat(const char __user *pathname, umode_t mode);
      asmlinkage long sys_open(const char __user *filename,
                                      int flags, umode_t mode);
      asmlinkage long sys_close(unsigned int fd);
      asmlinkage long sys_access(const char __user *filename, int mode);
      asmlinkage long sys_vhangup(void);
      asmlinkage long sys_chown(const char __user *filename,
                                      uid_t user, gid_t group);
      asmlinkage long sys_lchown(const char __user *filename,
                                      uid_t user, gid_t group);
      asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
      #ifdef CONFIG_HAVE_UID16
      asmlinkage long sys_chown16(const char __user *filename,
                                      old_uid_t user, old_gid_t group);
      asmlinkage long sys_lchown16(const char __user *filename,
                                      old_uid_t user, old_gid_t group);
      asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group);
      asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid);
      asmlinkage long sys_setgid16(old_gid_t gid);
      asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid);
      asmlinkage long sys_setuid16(old_uid_t uid);
      asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid);
      asmlinkage long sys_getresuid16(old_uid_t __user *ruid,
                                      old_uid_t __user *euid, old_uid_t __user *suid);
      asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid);
      asmlinkage long sys_getresgid16(old_gid_t __user *rgid,
                                      old_gid_t __user *egid, old_gid_t __user *sgid);
      asmlinkage long sys_setfsuid16(old_uid_t uid);
      asmlinkage long sys_setfsgid16(old_gid_t gid);
      asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist);
      asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist);
      asmlinkage long sys_getuid16(void);
      asmlinkage long sys_geteuid16(void);
      asmlinkage long sys_getgid16(void);
      asmlinkage long sys_getegid16(void);
      #endif
      
      asmlinkage long sys_utime(char __user *filename,
                                      struct utimbuf __user *times);
      asmlinkage long sys_utimes(char __user *filename,
                                      struct timeval __user *utimes);
      asmlinkage long sys_lseek(unsigned int fd, off_t offset,
                                unsigned int whence);
      asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
                              unsigned long offset_low, loff_t __user *result,
                              unsigned int whence);
      asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
      asmlinkage long sys_readahead(int fd, loff_t offset, size_t count);
      asmlinkage long sys_readv(unsigned long fd,
                                const struct iovec __user *vec,
                                unsigned long vlen);
      asmlinkage long sys_write(unsigned int fd, const char __user *buf,
                                size_t count);
      asmlinkage long sys_writev(unsigned long fd,
                                 const struct iovec __user *vec,
                                 unsigned long vlen);
      asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
                                  size_t count, loff_t pos);
      asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
                                   size_t count, loff_t pos);
      asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
                                 unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
      asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
                                  unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
      asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
      asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
      asmlinkage long sys_chdir(const char __user *filename);
      asmlinkage long sys_fchdir(unsigned int fd);
      asmlinkage long sys_rmdir(const char __user *pathname);
      asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
      asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
                                      qid_t id, void __user *addr);
      asmlinkage long sys_getdents(unsigned int fd,
                                      struct linux_dirent __user *dirent,
                                      unsigned int count);
      asmlinkage long sys_getdents64(unsigned int fd,
                                      struct linux_dirent64 __user *dirent,
                                      unsigned int count);
      
      asmlinkage long sys_setsockopt(int fd, int level, int optname,
                                      char __user *optval, int optlen);
      asmlinkage long sys_getsockopt(int fd, int level, int optname,
                                      char __user *optval, int __user *optlen);
      asmlinkage long sys_bind(int, struct sockaddr __user *, int);
      asmlinkage long sys_connect(int, struct sockaddr __user *, int);
      asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
      asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
      asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
      asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
      asmlinkage long sys_send(int, void __user *, size_t, unsigned);
      asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
                                      struct sockaddr __user *, int);
      asmlinkage long sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
      asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
                                   unsigned int vlen, unsigned flags);
      asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
      asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
                                      struct sockaddr __user *, int __user *);
      asmlinkage long sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
      asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
                                   unsigned int vlen, unsigned flags,
                                   struct timespec __user *timeout);
      asmlinkage long sys_socket(int, int, int);
      asmlinkage long sys_socketpair(int, int, int, int __user *);
      asmlinkage long sys_socketcall(int call, unsigned long __user *args);
      asmlinkage long sys_listen(int, int);
      asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                                      int timeout);
      asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                              fd_set __user *exp, struct timeval __user *tvp);
      asmlinkage long sys_old_select(struct sel_arg_struct __user *arg);
      asmlinkage long sys_epoll_create(int size);
      asmlinkage long sys_epoll_create1(int flags);
      asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
                                      struct epoll_event __user *event);
      asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                                      int maxevents, int timeout);
      asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
                                      int maxevents, int timeout,
                                      const sigset_t __user *sigmask,
                                      size_t sigsetsize);
      asmlinkage long sys_gethostname(char __user *name, int len);
      asmlinkage long sys_sethostname(char __user *name, int len);
      asmlinkage long sys_setdomainname(char __user *name, int len);
      asmlinkage long sys_newuname(struct new_utsname __user *name);
      asmlinkage long sys_uname(struct old_utsname __user *);
      asmlinkage long sys_olduname(struct oldold_utsname __user *);
      
      asmlinkage long sys_getrlimit(unsigned int resource,
                                      struct rlimit __user *rlim);
      #if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64))
      asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
      #endif
      asmlinkage long sys_setrlimit(unsigned int resource,
                                      struct rlimit __user *rlim);
      asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
                                      const struct rlimit64 __user *new_rlim,
                                      struct rlimit64 __user *old_rlim);
      asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
      asmlinkage long sys_umask(int mask);
      
      asmlinkage long sys_msgget(key_t key, int msgflg);
      asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp,
                                      size_t msgsz, int msgflg);
      asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp,
                                      size_t msgsz, long msgtyp, int msgflg);
      asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf);
      
      asmlinkage long sys_semget(key_t key, int nsems, int semflg);
      asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
                                      unsigned nsops);
      asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg);
      asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
                                      unsigned nsops,
                                      const struct timespec __user *timeout);
      asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
      asmlinkage long sys_shmget(key_t key, size_t size, int flag);
      asmlinkage long sys_shmdt(char __user *shmaddr);
      asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
      asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
                      unsigned long third, void __user *ptr, long fifth);
      
      asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
      asmlinkage long sys_mq_unlink(const char __user *name);
      asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
      asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
      asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
      asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
      
      asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
      asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
                                      unsigned long off, unsigned long len,
                                      void __user *buf);
      asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn,
                                      unsigned long off, unsigned long len,
                                      void __user *buf);
      
      asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
                              unsigned long arg4, unsigned long arg5);
      asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
      asmlinkage long sys_swapoff(const char __user *specialfile);
      asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
      asmlinkage long sys_sysinfo(struct sysinfo __user *info);
      asmlinkage long sys_sysfs(int option,
                                      unsigned long arg1, unsigned long arg2);
      asmlinkage long sys_syslog(int type, char __user *buf, int len);
      asmlinkage long sys_uselib(const char __user *library);
      asmlinkage long sys_ni_syscall(void);
      asmlinkage long sys_ptrace(long request, long pid, unsigned long addr,
                                 unsigned long data);
      
      asmlinkage long sys_add_key(const char __user *_type,
                                  const char __user *_description,
                                  const void __user *_payload,
                                  size_t plen,
                                  key_serial_t destringid);
      
      asmlinkage long sys_request_key(const char __user *_type,
                                      const char __user *_description,
                                      const char __user *_callout_info,
                                      key_serial_t destringid);
      
      asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
                                 unsigned long arg4, unsigned long arg5);
      
      asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
      asmlinkage long sys_ioprio_get(int which, int who);
      asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
                                      unsigned long maxnode);
      asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                                      const unsigned long __user *from,
                                      const unsigned long __user *to);
      asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
                                      const void __user * __user *pages,
                                      const int __user *nodes,
                                      int __user *status,
                                      int flags);
      asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                                      unsigned long mode,
                                      const unsigned long __user *nmask,
                                      unsigned long maxnode,
                                      unsigned flags);
      asmlinkage long sys_get_mempolicy(int __user *policy,
                                      unsigned long __user *nmask,
                                      unsigned long maxnode,
                                      unsigned long addr, unsigned long flags);
      
      asmlinkage long sys_inotify_init(void);
      asmlinkage long sys_inotify_init1(int flags);
      asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
                                              u32 mask);
      asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
      
      asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
                                       __u32 __user *ustatus);
      asmlinkage long sys_spu_create(const char __user *name,
                      unsigned int flags, umode_t mode, int fd);
      
      asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode,
                                  unsigned dev);
      asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode);
      asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag);
      asmlinkage long sys_symlinkat(const char __user * oldname,
                                    int newdfd, const char __user * newname);
      asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
                                 int newdfd, const char __user *newname, int flags);
      asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
                                   int newdfd, const char __user * newname);
      asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
                                    int newdfd, const char __user *newname,
                                    unsigned int flags);
      asmlinkage long sys_futimesat(int dfd, const char __user *filename,
                                    struct timeval __user *utimes);
      asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
      asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
                                   umode_t mode);
      asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
                                   gid_t group, int flag);
      asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
                                 umode_t mode);
      asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
                                     struct stat __user *statbuf, int flag);
      asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
                                     int bufsiz);
      asmlinkage long sys_utimensat(int dfd, const char __user *filename,
                                      struct timespec __user *utimes, int flags);
      asmlinkage long sys_unshare(unsigned long unshare_flags);
      
      asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
                                 int fd_out, loff_t __user *off_out,
                                 size_t len, unsigned int flags);
      
      asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
                                   unsigned long nr_segs, unsigned int flags);
      
      asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
      
      asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
                                              unsigned int flags);
      asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
                                           loff_t offset, loff_t nbytes);
      asmlinkage long sys_get_robust_list(int pid,
                                          struct robust_list_head __user * __user *head_ptr,
                                          size_t __user *len_ptr);
      asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
                                          size_t len);
      asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
      asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
      asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
      asmlinkage long sys_timerfd_create(int clockid, int flags);
      asmlinkage long sys_timerfd_settime(int ufd, int flags,
                                          const struct itimerspec __user *utmr,
                                          struct itimerspec __user *otmr);
      asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
      asmlinkage long sys_eventfd(unsigned int count);
      asmlinkage long sys_eventfd2(unsigned int count, int flags);
      asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
      asmlinkage long sys_userfaultfd(int flags);
      asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
      asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
      asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
                                   fd_set __user *, struct timespec __user *,
                                   void __user *);
      asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
                                struct timespec __user *, const sigset_t __user *,
                                size_t);
      asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
      asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
                                        u64 mask, int fd,
                                        const char  __user *pathname);
      asmlinkage long sys_syncfs(int fd);
      
      asmlinkage long sys_fork(void);
      asmlinkage long sys_vfork(void);
      #ifdef CONFIG_CLONE_BACKWARDS
      asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long,
                     int __user *);
      #else
      #ifdef CONFIG_CLONE_BACKWARDS3
      asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
                                int __user *, unsigned long);
      #else
      asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
                     int __user *, unsigned long);
      #endif
      #endif
      
      asmlinkage long sys_execve(const char __user *filename,
                      const char __user *const __user *argv,
                      const char __user *const __user *envp);
      
      asmlinkage long sys_perf_event_open(
                      struct perf_event_attr __user *attr_uptr,
                      pid_t pid, int cpu, int group_fd, unsigned long flags);
      
      asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff);
      asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
      asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
                                            struct file_handle __user *handle,
                                            int __user *mnt_id, int flag);
      asmlinkage long sys_open_by_handle_at(int mountdirfd,
                                            struct file_handle __user *handle,
                                            int flags);
      asmlinkage long sys_setns(int fd, int nstype);
      asmlinkage long sys_process_vm_readv(pid_t pid,
                                           const struct iovec __user *lvec,
                                           unsigned long liovcnt,
                                           const struct iovec __user *rvec,
                                           unsigned long riovcnt,
                                           unsigned long flags);
      asmlinkage long sys_process_vm_writev(pid_t pid,
                                            const struct iovec __user *lvec,
                                            unsigned long liovcnt,
                                            const struct iovec __user *rvec,
                                            unsigned long riovcnt,
                                            unsigned long flags);
      
      asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
                               unsigned long idx1, unsigned long idx2);
      asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
      asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
                                  const char __user *uargs);
      asmlinkage long sys_getrandom(char __user *buf, size_t count,
                                    unsigned int flags);
      asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
      
      asmlinkage long sys_execveat(int dfd, const char __user *filename,
                              const char __user *const __user *argv,
                              const char __user *const __user *envp, int flags);
      
      asmlinkage long sys_membarrier(int cmd, int flags);
      
      asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
      
      #endif
      #ifndef _ASM_X86_SMP_H
      #define _ASM_X86_SMP_H
      #ifndef __ASSEMBLY__
      #include <linux/cpumask.h>
      #include <asm/percpu.h>
      
      /*
       * We need the APIC definitions automatically as part of 'smp.h'
       */
      #ifdef CONFIG_X86_LOCAL_APIC
      # include <asm/mpspec.h>
      # include <asm/apic.h>
      # ifdef CONFIG_X86_IO_APIC
      #  include <asm/io_apic.h>
      # endif
      #endif
      #include <asm/thread_info.h>
      #include <asm/cpumask.h>
      
      extern int smp_num_siblings;
      extern unsigned int num_processors;
      
      DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
      DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
      /* cpus sharing the last level cache: */
      DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
      DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
      DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
      
      static inline struct cpumask *cpu_llc_shared_mask(int cpu)
      {
              return per_cpu(cpu_llc_shared_map, cpu);
      }
      
      DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
      DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
      #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
      DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
      #endif
      
      /* Static state in head.S used to set up a CPU */
      extern unsigned long stack_start; /* Initial stack pointer address */
      
      struct task_struct;
      
      struct smp_ops {
              void (*smp_prepare_boot_cpu)(void);
              void (*smp_prepare_cpus)(unsigned max_cpus);
              void (*smp_cpus_done)(unsigned max_cpus);
      
              void (*stop_other_cpus)(int wait);
              void (*smp_send_reschedule)(int cpu);
      
              int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
              int (*cpu_disable)(void);
              void (*cpu_die)(unsigned int cpu);
              void (*play_dead)(void);
      
              void (*send_call_func_ipi)(const struct cpumask *mask);
              void (*send_call_func_single_ipi)(int cpu);
      };
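
/*
 * Illustrative sketch (not part of this header): the hooks above are
 * filled in by the platform, typically with the native_* helpers
 * declared further down in this file.  The hypothetical initializer
 * below only shows the shape of that wiring; the real table lives in
 * arch/x86 C code, not here.
 *
 *	struct smp_ops smp_ops = {
 *		.smp_prepare_boot_cpu		= native_smp_prepare_boot_cpu,
 *		.smp_prepare_cpus		= native_smp_prepare_cpus,
 *		.smp_cpus_done			= native_smp_cpus_done,
 *		.cpu_up				= native_cpu_up,
 *		.cpu_die			= native_cpu_die,
 *		.play_dead			= native_play_dead,
 *		.send_call_func_ipi		= native_send_call_func_ipi,
 *		.send_call_func_single_ipi	= native_send_call_func_single_ipi,
 *	};
 */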
      
      /* Globals due to paravirt */
      extern void set_cpu_sibling_map(int cpu);
      
      #ifdef CONFIG_SMP
      #ifndef CONFIG_PARAVIRT
      #define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
      #endif
      extern struct smp_ops smp_ops;
      
      static inline void smp_send_stop(void)
      {
              smp_ops.stop_other_cpus(0);
      }
      
      static inline void stop_other_cpus(void)
      {
              smp_ops.stop_other_cpus(1);
      }
      
      static inline void smp_prepare_boot_cpu(void)
      {
              smp_ops.smp_prepare_boot_cpu();
      }
      
      static inline void smp_prepare_cpus(unsigned int max_cpus)
      {
              smp_ops.smp_prepare_cpus(max_cpus);
      }
      
      static inline void smp_cpus_done(unsigned int max_cpus)
      {
              smp_ops.smp_cpus_done(max_cpus);
      }
      
      static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
      {
              return smp_ops.cpu_up(cpu, tidle);
      }
      
      static inline int __cpu_disable(void)
      {
              return smp_ops.cpu_disable();
      }
      
      static inline void __cpu_die(unsigned int cpu)
      {
              smp_ops.cpu_die(cpu);
      }
      
      static inline void play_dead(void)
      {
              smp_ops.play_dead();
      }
      
      static inline void smp_send_reschedule(int cpu)
      {
              smp_ops.smp_send_reschedule(cpu);
      }
      
      static inline void arch_send_call_function_single_ipi(int cpu)
      {
        smp_ops.send_call_func_single_ipi(cpu);
      }
      
      static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
      {
              smp_ops.send_call_func_ipi(mask);
      }
      
      void cpu_disable_common(void);
      void native_smp_prepare_boot_cpu(void);
      void native_smp_prepare_cpus(unsigned int max_cpus);
      void native_smp_cpus_done(unsigned int max_cpus);
      void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
      int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
      int native_cpu_disable(void);
      int common_cpu_die(unsigned int cpu);
      void native_cpu_die(unsigned int cpu);
      void native_play_dead(void);
      void play_dead_common(void);
      void wbinvd_on_cpu(int cpu);
      int wbinvd_on_all_cpus(void);
      
      void native_send_call_func_ipi(const struct cpumask *mask);
      void native_send_call_func_single_ipi(int cpu);
      void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
      
      void smp_store_boot_cpu_info(void);
      void smp_store_cpu_info(int id);
      #define cpu_physical_id(cpu)        per_cpu(x86_cpu_to_apicid, cpu)
      
      #else /* !CONFIG_SMP */
      #define wbinvd_on_cpu(cpu)     wbinvd()
      static inline int wbinvd_on_all_cpus(void)
      {
              wbinvd();
              return 0;
      }
      #endif /* CONFIG_SMP */
      
      extern unsigned disabled_cpus;
      
      #ifdef CONFIG_X86_32_SMP
      /*
       * This function is needed by all SMP systems. It must _always_ be valid
       * from the initial startup. We map APIC_BASE very early in page_setup(),
       * so this is correct in the x86 case.
       */
      #define raw_smp_processor_id() (this_cpu_read(cpu_number))
      extern int safe_smp_processor_id(void);
      
      #elif defined(CONFIG_X86_64_SMP)
      #define raw_smp_processor_id() (this_cpu_read(cpu_number))
      
      #define stack_smp_processor_id()                                        \
      ({                                                                \
              struct thread_info *ti;                                                \
              __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));        \
              ti->cpu;                                                        \
      })
      #define safe_smp_processor_id()                smp_processor_id()
      
      #endif
      
      #ifdef CONFIG_X86_LOCAL_APIC
      
      #ifndef CONFIG_X86_64
      static inline int logical_smp_processor_id(void)
      {
              /* we don't want to mark this access volatile - bad code generation */
              return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
      }
      
      #endif
      
      extern int hard_smp_processor_id(void);
      
      #else /* CONFIG_X86_LOCAL_APIC */
      
      # ifndef CONFIG_SMP
      #  define hard_smp_processor_id()        0
      # endif
      
      #endif /* CONFIG_X86_LOCAL_APIC */
      
      #ifdef CONFIG_DEBUG_NMI_SELFTEST
      extern void nmi_selftest(void);
      #else
      #define nmi_selftest() do { } while (0)
      #endif
      
      #endif /* __ASSEMBLY__ */
      #endif /* _ASM_X86_SMP_H */
      /*
       * Scatterlist Cryptographic API.
       *
       * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
       * Copyright (c) 2002 David S. Miller (davem@redhat.com)
       * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
       *
       * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
       * and Nettle, by Niels Möller.
       *
       * This program is free software; you can redistribute it and/or modify it
       * under the terms of the GNU General Public License as published by the Free
       * Software Foundation; either version 2 of the License, or (at your option)
       * any later version.
       *
       */
      
      #include <linux/err.h>
      #include <linux/errno.h>
      #include <linux/kernel.h>
      #include <linux/kmod.h>
      #include <linux/module.h>
      #include <linux/param.h>
      #include <linux/sched.h>
      #include <linux/slab.h>
      #include <linux/string.h>
      #include <linux/completion.h>
      #include "internal.h"
      
      LIST_HEAD(crypto_alg_list);
      EXPORT_SYMBOL_GPL(crypto_alg_list);
      DECLARE_RWSEM(crypto_alg_sem);
      EXPORT_SYMBOL_GPL(crypto_alg_sem);
      
      BLOCKING_NOTIFIER_HEAD(crypto_chain);
      EXPORT_SYMBOL_GPL(crypto_chain);
      
      static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
      
      struct crypto_alg *crypto_mod_get(struct crypto_alg *alg)
      {
        return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL;
      }
      EXPORT_SYMBOL_GPL(crypto_mod_get);
      
      void crypto_mod_put(struct crypto_alg *alg)
      {
        struct module *module = alg->cra_module;

        crypto_alg_put(alg);
        module_put(module);
      }
      EXPORT_SYMBOL_GPL(crypto_mod_put);
      
      static inline int crypto_is_test_larval(struct crypto_larval *larval)
      {
        return larval->alg.cra_driver_name[0];
      }
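
/*
 * __crypto_alg_lookup() scans crypto_alg_list for an algorithm matching
 * @name: dead entries and larvals registered with a different mask are
 * skipped, an exact cra_driver_name match wins immediately, otherwise the
 * highest-priority cra_name match is returned.  A module/refcount
 * reference is taken on the winner via crypto_mod_get().  Callers must
 * hold crypto_alg_sem.
 */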
      
      static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
                                                    u32 mask)
      {
              struct crypto_alg *q, *alg = NULL;
              int best = -2;
      
        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                int exact, fuzzy;

                if (crypto_is_moribund(q))
                        continue;

                if ((q->cra_flags ^ type) & mask)
                        continue;

                if (crypto_is_larval(q) &&
                    !crypto_is_test_larval((struct crypto_larval *)q) &&
                    ((struct crypto_larval *)q)->mask != mask)
                        continue;

                exact = !strcmp(q->cra_driver_name, name);
                fuzzy = !strcmp(q->cra_name, name);
                if (!exact && !(fuzzy && q->cra_priority > best))
                        continue;

                if (unlikely(!crypto_mod_get(q)))
                        continue;

                best = q->cra_priority;
                if (alg)
                        crypto_mod_put(alg);
                alg = q;

                if (exact)
                        break;
        }

        return alg;
      }
      
      static void crypto_larval_destroy(struct crypto_alg *alg)
      {
              struct crypto_larval *larval = (void *)alg;
      
        BUG_ON(!crypto_is_larval(alg));
        if (larval->adult)
                crypto_mod_put(larval->adult);
        kfree(larval);
      }
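
/*
 * crypto_larval_alloc() and friends implement a placeholder ("larval")
 * entry: while an algorithm is being loaded or constructed, a larval with
 * the requested name sits on crypto_alg_list so that concurrent lookups
 * block on larval->completion instead of racing to load the module again.
 * Once the real ("adult") algorithm is available, waiters pick it up from
 * larval->adult; crypto_larval_kill() removes the placeholder and wakes
 * everyone.
 */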
      
      struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask)
      {
              struct crypto_larval *larval;
      
        larval = kzalloc(sizeof(*larval), GFP_KERNEL);
        if (!larval)
                return ERR_PTR(-ENOMEM);

        larval->mask = mask;
        larval->alg.cra_flags = CRYPTO_ALG_LARVAL | type;
        larval->alg.cra_priority = -1;
        larval->alg.cra_destroy = crypto_larval_destroy;

        strlcpy(larval->alg.cra_name, name, CRYPTO_MAX_ALG_NAME);
        init_completion(&larval->completion);

        return larval;
      }
      EXPORT_SYMBOL_GPL(crypto_larval_alloc);
      
      static struct crypto_alg *crypto_larval_add(const char *name, u32 type,
                                                  u32 mask)
      {
              struct crypto_alg *alg;
              struct crypto_larval *larval;
      
        larval = crypto_larval_alloc(name, type, mask);
        if (IS_ERR(larval))
                return ERR_CAST(larval);

        atomic_set(&larval->alg.cra_refcnt, 2);

        down_write(&crypto_alg_sem);
        alg = __crypto_alg_lookup(name, type, mask);
        if (!alg) {
                alg = &larval->alg;
                list_add(&alg->cra_list, &crypto_alg_list);
        }
        up_write(&crypto_alg_sem);

        if (alg != &larval->alg) {
                kfree(larval);
                if (crypto_is_larval(alg))
                        alg = crypto_larval_wait(alg);
              }
      
              return alg;
      }
      
void crypto_larval_kill(struct crypto_alg *alg)
{
        struct crypto_larval *larval = (void *)alg;

        down_write(&crypto_alg_sem);
        list_del(&alg->cra_list);
        up_write(&crypto_alg_sem);
        complete_all(&larval->completion);
        crypto_alg_put(alg);
}
      EXPORT_SYMBOL_GPL(crypto_larval_kill);
      
      static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg)
      {
              struct crypto_larval *larval = (void *)alg;
              long timeout;
      
        timeout = wait_for_completion_killable_timeout(
                &larval->completion, 60 * HZ);

        alg = larval->adult;
        if (timeout < 0)
                alg = ERR_PTR(-EINTR);
        else if (!timeout)
                alg = ERR_PTR(-ETIMEDOUT);
        else if (!alg)
                alg = ERR_PTR(-ENOENT);
        else if (crypto_is_test_larval(larval) &&
                 !(alg->cra_flags & CRYPTO_ALG_TESTED))
                alg = ERR_PTR(-EAGAIN);
        else if (!crypto_mod_get(alg))
                alg = ERR_PTR(-EAGAIN);
        crypto_mod_put(&larval->alg);
      
              return alg;
      }
      
      struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask)
      {
              struct crypto_alg *alg;
      
        down_read(&crypto_alg_sem);
              alg = __crypto_alg_lookup(name, type, mask);
              up_read(&crypto_alg_sem);
      
              return alg;
      }
      EXPORT_SYMBOL_GPL(crypto_alg_lookup);
      
      struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask)
      {
              struct crypto_alg *alg;
      
        if (!name)
                return ERR_PTR(-ENOENT);

        mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
        type &= mask;

        alg = crypto_alg_lookup(name, type, mask);
        if (!alg) {
                request_module("crypto-%s", name);

                if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
                      CRYPTO_ALG_NEED_FALLBACK))
                        request_module("crypto-%s-all", name);

                alg = crypto_alg_lookup(name, type, mask);
        }

        if (alg)
                return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg;

        return crypto_larval_add(name, type, mask);
      }
      EXPORT_SYMBOL_GPL(crypto_larval_lookup);
      
      int crypto_probing_notify(unsigned long val, void *v)
      {
              int ok;
      
        ok = blocking_notifier_call_chain(&crypto_chain, val, v);
        if (ok == NOTIFY_DONE) {
                request_module("cryptomgr");
                ok = blocking_notifier_call_chain(&crypto_chain, val, v);
        }

        return ok;
      }
      EXPORT_SYMBOL_GPL(crypto_probing_notify);
      
      struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
      {
              struct crypto_alg *alg;
              struct crypto_alg *larval;
              int ok;
      
        if (!((type | mask) & CRYPTO_ALG_TESTED)) {
                type |= CRYPTO_ALG_TESTED;
                mask |= CRYPTO_ALG_TESTED;
        }

        /*
         * If the internal flag is set for a cipher, require a caller to
         * invoke the cipher with the internal flag to use that cipher.
         * Also, if a caller wants to allocate a cipher that may or may
         * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and
         * !(mask & CRYPTO_ALG_INTERNAL); see the sketch after this
         * function.
         */
        if (!((type | mask) & CRYPTO_ALG_INTERNAL))
                mask |= CRYPTO_ALG_INTERNAL;

        larval = crypto_larval_lookup(name, type, mask);
        if (IS_ERR(larval) || !crypto_is_larval(larval))
                return larval;

        ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval);

        if (ok == NOTIFY_STOP)
                alg = crypto_larval_wait(larval);
        else {
                crypto_mod_put(larval);
                alg = ERR_PTR(-ENOENT);
        }
        crypto_larval_kill(larval);
              return alg;
      }
      EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup);
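
/*
 * Illustrative sketch (not part of this file) of the type/mask convention
 * described in crypto_alg_mod_lookup() above; "aes" is only an example
 * name.
 *
 *	// Plain lookup: internal-only implementations are masked out.
 *	alg = crypto_alg_mod_lookup("aes", 0, 0);
 *
 *	// Accept an implementation that may or may not be internal:
 *	alg = crypto_alg_mod_lookup("aes", CRYPTO_ALG_INTERNAL, 0);
 */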
      
      static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
      {
              const struct crypto_type *type_obj = tfm->__crt_alg->cra_type;
      
              if (type_obj)
                return type_obj->init(tfm, type, mask);

        switch (crypto_tfm_alg_type(tfm)) {
        case CRYPTO_ALG_TYPE_CIPHER:
                return crypto_init_cipher_ops(tfm);

        case CRYPTO_ALG_TYPE_COMPRESS:
                return crypto_init_compress_ops(tfm);
      
              default:
                      break;
              }
      
              BUG();
              return -EINVAL;
      }
      
      static void crypto_exit_ops(struct crypto_tfm *tfm)
      {
              const struct crypto_type *type = tfm->__crt_alg->cra_type;
      
              if (type) {
                      if (tfm->exit)
                              tfm->exit(tfm);
                      return;
              }
      
              switch (crypto_tfm_alg_type(tfm)) {
              case CRYPTO_ALG_TYPE_CIPHER:
                      crypto_exit_cipher_ops(tfm);
                      break;
      
              case CRYPTO_ALG_TYPE_COMPRESS:
                      crypto_exit_compress_ops(tfm);
                      break;
      
              default:
                      BUG();
              }
      }
      
      static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
      {
        const struct crypto_type *type_obj = alg->cra_type;
        unsigned int len;

        len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
        if (type_obj)
                return len + type_obj->ctxsize(alg, type, mask);

        switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
        default:
                BUG();

        case CRYPTO_ALG_TYPE_CIPHER:
                len += crypto_cipher_ctxsize(alg);
                break;

        case CRYPTO_ALG_TYPE_COMPRESS:
                len += crypto_compress_ctxsize(alg);
                      break;
              }
      
              return len;
      }
      
      void crypto_shoot_alg(struct crypto_alg *alg)
      {
              down_write(&crypto_alg_sem);
              alg->cra_flags |= CRYPTO_ALG_DYING;
              up_write(&crypto_alg_sem);
      }
      EXPORT_SYMBOL_GPL(crypto_shoot_alg);
      
      struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                            u32 mask)
      {
              struct crypto_tfm *tfm = NULL;
              unsigned int tfm_size;
              int err = -ENOMEM;
      
        tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask);
        tfm = kzalloc(tfm_size, GFP_KERNEL);
        if (tfm == NULL)
                goto out_err;

        tfm->__crt_alg = alg;

        err = crypto_init_ops(tfm, type, mask);
        if (err)
                goto out_free_tfm;

        if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
                      goto cra_init_failed;
      
              goto out;
      
      cra_init_failed:
              crypto_exit_ops(tfm);
      out_free_tfm:
              if (err == -EAGAIN)
                      crypto_shoot_alg(alg);
              kfree(tfm);
      out_err:
              tfm = ERR_PTR(err);
      out:
        return tfm;
      }
      EXPORT_SYMBOL_GPL(__crypto_alloc_tfm);
      
      /*
       *        crypto_alloc_base - Locate algorithm and allocate transform
       *        @alg_name: Name of algorithm
       *        @type: Type of algorithm
       *        @mask: Mask for type comparison
       *
       *        This function should not be used by new algorithm types.
       *        Please use crypto_alloc_tfm instead.
       *
       *        crypto_alloc_base() will first attempt to locate an already loaded
       *        algorithm.  If that fails and the kernel supports dynamically loadable
       *        modules, it will then attempt to load a module of the same name or
       *        alias.  If that fails it will send a query to any loaded crypto manager
       *        to construct an algorithm on the fly.  A refcount is grabbed on the
       *        algorithm which is then associated with the new transform.
       *
       *        The returned transform is of a non-determinate type.  Most people
       *        should use one of the more specific allocation functions such as
       *        crypto_alloc_blkcipher.
       *
       *        In case of error the return value is an error pointer.
       */
      struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
      {
              struct crypto_tfm *tfm;
              int err;
      
              for (;;) {
                      struct crypto_alg *alg;
      
                alg = crypto_alg_mod_lookup(alg_name, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                        goto err;
                }

                tfm = __crypto_alloc_tfm(alg, type, mask);
                if (!IS_ERR(tfm))
                              return tfm;
      
                      crypto_mod_put(alg);
                      err = PTR_ERR(tfm);
      
      err:
                      if (err != -EAGAIN)
                              break;
                      if (fatal_signal_pending(current)) {
                              err = -EINTR;
                              break;
                      }
              }
      
              return ERR_PTR(err);
      }
      EXPORT_SYMBOL_GPL(crypto_alloc_base);
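
/*
 * Illustrative sketch (not part of this file): a minimal caller of
 * crypto_alloc_base(), assuming the generic crypto_free_tfm() helper from
 * <linux/crypto.h> is used to drop the transform again; "sha256" is only
 * an example name.
 *
 *	struct crypto_tfm *tfm;
 *
 *	tfm = crypto_alloc_base("sha256", 0, 0);
 *	if (IS_ERR(tfm))
 *		return PTR_ERR(tfm);
 *	...
 *	crypto_free_tfm(tfm);
 */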
      
      void *crypto_create_tfm(struct crypto_alg *alg,
                              const struct crypto_type *frontend)
      {
              char *mem;
              struct crypto_tfm *tfm = NULL;
              unsigned int tfmsize;
              unsigned int total;
              int err = -ENOMEM;
      
        tfmsize = frontend->tfmsize;
        total = tfmsize + sizeof(*tfm) + frontend->extsize(alg);

        mem = kzalloc(total, GFP_KERNEL);
        if (mem == NULL)
                goto out_err;

        tfm = (struct crypto_tfm *)(mem + tfmsize);
        tfm->__crt_alg = alg;

        err = frontend->init_tfm(tfm);
        if (err)
                goto out_free_tfm;

        if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
                      goto cra_init_failed;
      
              goto out;
      
      cra_init_failed:
              crypto_exit_ops(tfm);
      out_free_tfm:
              if (err == -EAGAIN)
                      crypto_shoot_alg(alg);
              kfree(mem);
      out_err:
              mem = ERR_PTR(err);
      out:
        return mem;
      }
      EXPORT_SYMBOL_GPL(crypto_create_tfm);
      
      struct crypto_alg *crypto_find_alg(const char *alg_name,
                                         const struct crypto_type *frontend,
                                         u32 type, u32 mask)
      {
              struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask) =
                      crypto_alg_mod_lookup;
      
        if (frontend) {
                type &= frontend->maskclear;
                mask &= frontend->maskclear;
                type |= frontend->type;
                mask |= frontend->maskset;

                if (frontend->lookup)
                        lookup = frontend->lookup;
        }

        return lookup(alg_name, type, mask);
      }
      EXPORT_SYMBOL_GPL(crypto_find_alg);
      
      /*
       *        crypto_alloc_tfm - Locate algorithm and allocate transform
       *        @alg_name: Name of algorithm
       *        @frontend: Frontend algorithm type
       *        @type: Type of algorithm
       *        @mask: Mask for type comparison
       *
       *        crypto_alloc_tfm() will first attempt to locate an already loaded
       *        algorithm.  If that fails and the kernel supports dynamically loadable
       *        modules, it will then attempt to load a module of the same name or
       *        alias.  If that fails it will send a query to any loaded crypto manager
       *        to construct an algorithm on the fly.  A refcount is grabbed on the
       *        algorithm which is then associated with the new transform.
       *
       *        The returned transform is of a non-determinate type.  Most people
       *        should use one of the more specific allocation functions such as
       *        crypto_alloc_blkcipher.
       *
       *        In case of error the return value is an error pointer.
       */
      void *crypto_alloc_tfm(const char *alg_name,
                             const struct crypto_type *frontend, u32 type, u32 mask)
      {
              void *tfm;
              int err;
      
              for (;;) {
                      struct crypto_alg *alg;
      
                alg = crypto_find_alg(alg_name, frontend, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                        goto err;
                }

                tfm = crypto_create_tfm(alg, frontend);
                if (!IS_ERR(tfm))
                              return tfm;
      
                      crypto_mod_put(alg);
                      err = PTR_ERR(tfm);
      
      err:
                      if (err != -EAGAIN)
                              break;
                      if (fatal_signal_pending(current)) {
                              err = -EINTR;
                              break;
                      }
              }
      
              return ERR_PTR(err);
      }
      EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
      
      /*
       *        crypto_destroy_tfm - Free crypto transform
       *        @mem: Start of tfm slab
       *        @tfm: Transform to free
       *
       *        This function frees up the transform and any associated resources,
       *        then drops the refcount on the associated algorithm.
       */
      void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
      {
              struct crypto_alg *alg;
      
              if (unlikely(!mem))
                      return;
      
              alg = tfm->__crt_alg;
      
              if (!tfm->exit && alg->cra_exit)
                      alg->cra_exit(tfm);
              crypto_exit_ops(tfm);
              crypto_mod_put(alg);
              kzfree(mem);
      }
      EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
      
      int crypto_has_alg(const char *name, u32 type, u32 mask)
      {
              int ret = 0;
        struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask);

        if (!IS_ERR(alg)) {
                crypto_mod_put(alg);
                ret = 1;
        }

        return ret;
      }
      EXPORT_SYMBOL_GPL(crypto_has_alg);
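
/*
 * Illustrative sketch (not part of this file): probing for algorithm
 * availability before offering a feature; "xts(aes)" is only an example
 * name.
 *
 *	if (!crypto_has_alg("xts(aes)", 0, 0))
 *		return -ENOENT;
 */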
      
      void crypto_req_done(struct crypto_async_request *req, int err)
      {
              struct crypto_wait *wait = req->data;
      
              if (err == -EINPROGRESS)
                      return;
      
              wait->err = err;
              complete(&wait->completion);
      }
      EXPORT_SYMBOL_GPL(crypto_req_done);
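
/*
 * Illustrative sketch (not part of this file): crypto_req_done() is meant
 * to serve as the completion callback of an asynchronous request paired
 * with a crypto_wait, assuming the DECLARE_CRYPTO_WAIT()/crypto_wait_req()
 * helpers from <linux/crypto.h> are available in this tree.
 *
 *	DECLARE_CRYPTO_WAIT(wait);
 *
 *	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 *				      crypto_req_done, &wait);
 *	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 */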
      
      MODULE_DESCRIPTION("Cryptographic core API");
      MODULE_LICENSE("GPL");
      /*
       * Functions related to sysfs handling
       */
      #include <linux/kernel.h>
      #include <linux/slab.h>
      #include <linux/module.h>
      #include <linux/bio.h>
      #include <linux/blkdev.h>
      #include <linux/backing-dev.h>
      #include <linux/blktrace_api.h>
      #include <linux/blk-mq.h>
      #include <linux/blk-cgroup.h>
      
      #include "blk.h"
      #include "blk-mq.h"
      
      struct queue_sysfs_entry {
              struct attribute attr;
              ssize_t (*show)(struct request_queue *, char *);
              ssize_t (*store)(struct request_queue *, const char *, size_t);
      };
      
      static ssize_t
      queue_var_show(unsigned long var, char *page)
      {
              return sprintf(page, "%lu\n", var);
      }
      
      static ssize_t
      queue_var_store(unsigned long *var, const char *page, size_t count)
      {
              int err;
              unsigned long v;
      
              err = kstrtoul(page, 10, &v);
              if (err || v > UINT_MAX)
                      return -EINVAL;
      
              *var = v;
      
              return count;
      }
      
      static ssize_t queue_requests_show(struct request_queue *q, char *page)
      {
              return queue_var_show(q->nr_requests, (page));
      }
      
      static ssize_t
      queue_requests_store(struct request_queue *q, const char *page, size_t count)
      {
              unsigned long nr;
              int ret, err;
      
              if (!q->request_fn && !q->mq_ops)
                      return -EINVAL;
      
              ret = queue_var_store(&nr, page, count);
              if (ret < 0)
                      return ret;
      
              if (nr < BLKDEV_MIN_RQ)
                      nr = BLKDEV_MIN_RQ;
      
              if (q->request_fn)
                      err = blk_update_nr_requests(q, nr);
              else
                      err = blk_mq_update_nr_requests(q, nr);
      
              if (err)
                      return err;
      
              return ret;
      }
      
      static ssize_t queue_ra_show(struct request_queue *q, char *page)
      {
              unsigned long ra_kb = q->backing_dev_info.ra_pages <<
                                              (PAGE_CACHE_SHIFT - 10);
      
              return queue_var_show(ra_kb, (page));
      }
      
      static ssize_t
      queue_ra_store(struct request_queue *q, const char *page, size_t count)
      {
              unsigned long ra_kb;
              ssize_t ret = queue_var_store(&ra_kb, page, count);
      
              if (ret < 0)
                      return ret;
      
              q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
      
              return ret;
      }
      
      static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
      {
              int max_sectors_kb = queue_max_sectors(q) >> 1;
      
              return queue_var_show(max_sectors_kb, (page));
      }
      
      static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_max_segments(q), (page));
      }
      
      static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
      {
              return queue_var_show(q->limits.max_integrity_segments, (page));
      }
      
      static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
      {
              if (blk_queue_cluster(q))
                      return queue_var_show(queue_max_segment_size(q), (page));
      
              return queue_var_show(PAGE_CACHE_SIZE, (page));
      }
      
      static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_logical_block_size(q), page);
      }
      
      static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_physical_block_size(q), page);
      }
      
      static ssize_t queue_io_min_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_io_min(q), page);
      }
      
      static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_io_opt(q), page);
      }
      
      static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
      {
              return queue_var_show(q->limits.discard_granularity, page);
      }
      
      static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
      {
              unsigned long long val;
      
              val = q->limits.max_hw_discard_sectors << 9;
              return sprintf(page, "%llu\n", val);
      }
      
      static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
      {
              return sprintf(page, "%llu\n",
                             (unsigned long long)q->limits.max_discard_sectors << 9);
      }
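
      /*
       * Note: "discard_max_bytes" is written in bytes; the value must be
       * aligned to the discard granularity, is converted to 512-byte sectors,
       * and is clamped to the hardware limit (max_hw_discard_sectors) by the
       * store method below.
       */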
      
      static ssize_t queue_discard_max_store(struct request_queue *q,
                                             const char *page, size_t count)
      {
              unsigned long max_discard;
              ssize_t ret = queue_var_store(&max_discard, page, count);
      
              if (ret < 0)
                      return ret;
      
              if (max_discard & (q->limits.discard_granularity - 1))
                      return -EINVAL;
      
              max_discard >>= 9;
              if (max_discard > UINT_MAX)
                      return -EINVAL;
      
              if (max_discard > q->limits.max_hw_discard_sectors)
                      max_discard = q->limits.max_hw_discard_sectors;
      
              q->limits.max_discard_sectors = max_discard;
              return ret;
      }
      
      static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
      {
              return queue_var_show(queue_discard_zeroes_data(q), page);
      }
      
      static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
      {
              return sprintf(page, "%llu\n",
                      (unsigned long long)q->limits.max_write_same_sectors << 9);
      }
      
      
      static ssize_t
      queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
      {
              unsigned long max_sectors_kb,
                      max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
                              page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
              ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
      
              if (ret < 0)
                      return ret;
      
              max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
                                               q->limits.max_dev_sectors >> 1);
      
              if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
                      return -EINVAL;
      
              spin_lock_irq(q->queue_lock);
              q->limits.max_sectors = max_sectors_kb << 1;
              spin_unlock_irq(q->queue_lock);
      
              return ret;
      }
      
      static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
      {
              int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
      
              return queue_var_show(max_hw_sectors_kb, (page));
      }
      
      #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)                                \
      static ssize_t                                                                \
      queue_show_##name(struct request_queue *q, char *page)                        \
      {                                                                        \
              int bit;                                                        \
              bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags);                \
              return queue_var_show(neg ? !bit : bit, page);                        \
      }                                                                        \
      static ssize_t                                                                \
      queue_store_##name(struct request_queue *q, const char *page, size_t count) \
      {                                                                        \
              unsigned long val;                                                \
              ssize_t ret;                                                        \
              ret = queue_var_store(&val, page, count);                        \
              if (ret < 0)                                                        \
                       return ret;                                                \
              if (neg)                                                        \
                      val = !val;                                                \
                                                                              \
              spin_lock_irq(q->queue_lock);                                        \
              if (val)                                                        \
                      queue_flag_set(QUEUE_FLAG_##flag, q);                        \
              else                                                                \
                      queue_flag_clear(QUEUE_FLAG_##flag, q);                        \
              spin_unlock_irq(q->queue_lock);                                        \
              return ret;                                                        \
      }
      
      QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
      QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
      QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
      #undef QUEUE_SYSFS_BIT_FNS
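
      /*
       * For reference, QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1) above expands to
       * roughly the following show helper (the non-zero "neg" argument inverts
       * the bit, so the "rotational" attribute reads as the inverse of
       * QUEUE_FLAG_NONROT):
       *
       *      static ssize_t queue_show_nonrot(struct request_queue *q, char *page)
       *      {
       *              int bit = test_bit(QUEUE_FLAG_NONROT, &q->queue_flags);
       *              return queue_var_show(!bit, page);
       *      }
       *
       * with a matching queue_store_nonrot() that sets or clears the flag under
       * q->queue_lock.
       */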
      
      static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
      {
              return queue_var_show((blk_queue_nomerges(q) << 1) |
                                     blk_queue_noxmerges(q), page);
      }
      
      static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
                                          size_t count)
      {
              unsigned long nm;
              ssize_t ret = queue_var_store(&nm, page, count);
      
              if (ret < 0)
                      return ret;
      
              spin_lock_irq(q->queue_lock);
              queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
              queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
              if (nm == 2)
                      queue_flag_set(QUEUE_FLAG_NOMERGES, q);
              else if (nm)
                      queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
              spin_unlock_irq(q->queue_lock);
      
              return ret;
      }
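
      /*
       * Note on the encoding used by the "nomerges" pair above: 0 enables
       * merging, 1 sets only QUEUE_FLAG_NOXMERGES (skip the more expensive
       * extended merge lookups), and 2 sets QUEUE_FLAG_NOMERGES (disable
       * merging entirely).
       */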
      
      static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
      {
              bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
              bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
      
              return queue_var_show(set << force, page);
      }
      
      static ssize_t
      queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
      {
              ssize_t ret = -EINVAL;
      #ifdef CONFIG_SMP
              unsigned long val;
      
              ret = queue_var_store(&val, page, count);
              if (ret < 0)
                      return ret;
      
              spin_lock_irq(q->queue_lock);
              if (val == 2) {
                      queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
                      queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
              } else if (val == 1) {
                      queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
                      queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
              } else if (val == 0) {
                      queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
                      queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
              }
              spin_unlock_irq(q->queue_lock);
      #endif
              return ret;
      }
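
      /*
       * Note on the "rq_affinity" values handled above: 0 disables completion
       * steering, 1 sets QUEUE_FLAG_SAME_COMP (complete in the submitting
       * CPU's group), and 2 additionally sets QUEUE_FLAG_SAME_FORCE (force
       * completion on the exact submitting CPU).
       */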
      
      static ssize_t queue_poll_show(struct request_queue *q, char *page)
      {
              return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
      }
      
      static ssize_t queue_poll_store(struct request_queue *q, const char *page,
                                      size_t count)
      {
              unsigned long poll_on;
              ssize_t ret;
      
              if (!q->mq_ops || !q->mq_ops->poll)
                      return -EINVAL;
      
              ret = queue_var_store(&poll_on, page, count);
              if (ret < 0)
                      return ret;
      
              spin_lock_irq(q->queue_lock);
              if (poll_on)
                      queue_flag_set(QUEUE_FLAG_POLL, q);
              else
                      queue_flag_clear(QUEUE_FLAG_POLL, q);
              spin_unlock_irq(q->queue_lock);
      
              return ret;
      }
      
      static struct queue_sysfs_entry queue_requests_entry = {
              .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
              .show = queue_requests_show,
              .store = queue_requests_store,
      };
      
      static struct queue_sysfs_entry queue_ra_entry = {
              .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
              .show = queue_ra_show,
              .store = queue_ra_store,
      };
      
      static struct queue_sysfs_entry queue_max_sectors_entry = {
              .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
              .show = queue_max_sectors_show,
              .store = queue_max_sectors_store,
      };
      
      static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
              .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
              .show = queue_max_hw_sectors_show,
      };
      
      static struct queue_sysfs_entry queue_max_segments_entry = {
              .attr = {.name = "max_segments", .mode = S_IRUGO },
              .show = queue_max_segments_show,
      };
      
      static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
              .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
              .show = queue_max_integrity_segments_show,
      };
      
      static struct queue_sysfs_entry queue_max_segment_size_entry = {
              .attr = {.name = "max_segment_size", .mode = S_IRUGO },
              .show = queue_max_segment_size_show,
      };
      
      static struct queue_sysfs_entry queue_iosched_entry = {
              .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
              .show = elv_iosched_show,
              .store = elv_iosched_store,
      };
      
      static struct queue_sysfs_entry queue_hw_sector_size_entry = {
              .attr = {.name = "hw_sector_size", .mode = S_IRUGO },
              .show = queue_logical_block_size_show,
      };
      
      static struct queue_sysfs_entry queue_logical_block_size_entry = {
              .attr = {.name = "logical_block_size", .mode = S_IRUGO },
              .show = queue_logical_block_size_show,
      };
      
      static struct queue_sysfs_entry queue_physical_block_size_entry = {
              .attr = {.name = "physical_block_size", .mode = S_IRUGO },
              .show = queue_physical_block_size_show,
      };
      
      static struct queue_sysfs_entry queue_io_min_entry = {
              .attr = {.name = "minimum_io_size", .mode = S_IRUGO },
              .show = queue_io_min_show,
      };
      
      static struct queue_sysfs_entry queue_io_opt_entry = {
              .attr = {.name = "optimal_io_size", .mode = S_IRUGO },
              .show = queue_io_opt_show,
      };
      
      static struct queue_sysfs_entry queue_discard_granularity_entry = {
              .attr = {.name = "discard_granularity", .mode = S_IRUGO },
              .show = queue_discard_granularity_show,
      };
      
      static struct queue_sysfs_entry queue_discard_max_hw_entry = {
              .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
              .show = queue_discard_max_hw_show,
      };
      
      static struct queue_sysfs_entry queue_discard_max_entry = {
              .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
              .show = queue_discard_max_show,
              .store = queue_discard_max_store,
      };
      
      static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
              .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
              .show = queue_discard_zeroes_data_show,
      };
      
      static struct queue_sysfs_entry queue_write_same_max_entry = {
              .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
              .show = queue_write_same_max_show,
      };
      
      static struct queue_sysfs_entry queue_nonrot_entry = {
              .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
              .show = queue_show_nonrot,
              .store = queue_store_nonrot,
      };
      
      static struct queue_sysfs_entry queue_nomerges_entry = {
              .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
              .show = queue_nomerges_show,
              .store = queue_nomerges_store,
      };
      
      static struct queue_sysfs_entry queue_rq_affinity_entry = {
              .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
              .show = queue_rq_affinity_show,
              .store = queue_rq_affinity_store,
      };
      
      static struct queue_sysfs_entry queue_iostats_entry = {
              .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
              .show = queue_show_iostats,
              .store = queue_store_iostats,
      };
      
      static struct queue_sysfs_entry queue_random_entry = {
              .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
              .show = queue_show_random,
              .store = queue_store_random,
      };
      
      static struct queue_sysfs_entry queue_poll_entry = {
              .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
              .show = queue_poll_show,
              .store = queue_poll_store,
      };
      
      static struct attribute *default_attrs[] = {
              &queue_requests_entry.attr,
              &queue_ra_entry.attr,
              &queue_max_hw_sectors_entry.attr,
              &queue_max_sectors_entry.attr,
              &queue_max_segments_entry.attr,
              &queue_max_integrity_segments_entry.attr,
              &queue_max_segment_size_entry.attr,
              &queue_iosched_entry.attr,
              &queue_hw_sector_size_entry.attr,
              &queue_logical_block_size_entry.attr,
              &queue_physical_block_size_entry.attr,
              &queue_io_min_entry.attr,
              &queue_io_opt_entry.attr,
              &queue_discard_granularity_entry.attr,
              &queue_discard_max_entry.attr,
              &queue_discard_max_hw_entry.attr,
              &queue_discard_zeroes_data_entry.attr,
              &queue_write_same_max_entry.attr,
              &queue_nonrot_entry.attr,
              &queue_nomerges_entry.attr,
              &queue_rq_affinity_entry.attr,
              &queue_iostats_entry.attr,
              &queue_random_entry.attr,
              &queue_poll_entry.attr,
              NULL,
      };
      
      #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
      
      static ssize_t
      queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
      {
              struct queue_sysfs_entry *entry = to_queue(attr);
              struct request_queue *q =
                      container_of(kobj, struct request_queue, kobj);
              ssize_t res;
      
              if (!entry->show)
                      return -EIO;
              mutex_lock(&q->sysfs_lock);
              if (blk_queue_dying(q)) {
                      mutex_unlock(&q->sysfs_lock);
                      return -ENOENT;
              }
              res = entry->show(q, page);
              mutex_unlock(&q->sysfs_lock);
              return res;
      }
      
      static ssize_t
      queue_attr_store(struct kobject *kobj, struct attribute *attr,
                          const char *page, size_t length)
      {
              struct queue_sysfs_entry *entry = to_queue(attr);
              struct request_queue *q;
              ssize_t res;
      
              if (!entry->store)
                      return -EIO;
      
              q = container_of(kobj, struct request_queue, kobj);
              mutex_lock(&q->sysfs_lock);
              if (blk_queue_dying(q)) {
                      mutex_unlock(&q->sysfs_lock);
                      return -ENOENT;
              }
              res = entry->store(q, page, length);
              mutex_unlock(&q->sysfs_lock);
              return res;
      }
      
      static void blk_free_queue_rcu(struct rcu_head *rcu_head)
      {
              struct request_queue *q = container_of(rcu_head, struct request_queue,
                                                     rcu_head);
              kmem_cache_free(blk_requestq_cachep, q);
      }
      
      /**
       * blk_release_queue: - release a &struct request_queue when it is no longer needed
       * @kobj:    the kobj belonging to the request queue to be released
       *
       * Description:
       *     blk_release_queue is the pair to blk_init_queue() or
       *     blk_queue_make_request().  It should be called when a request queue is
       *     being released; typically when a block device is being de-registered.
       *     Currently, its primary task is to free all the &struct request
       *     structures that were allocated to the queue and the queue itself.
       *
       * Note:
       *     The low level driver must have finished any outstanding requests first
       *     via blk_cleanup_queue().
       **/
      static void blk_release_queue(struct kobject *kobj)
      {
              struct request_queue *q =
                      container_of(kobj, struct request_queue, kobj);

              bdi_exit(&q->backing_dev_info);
              blkcg_exit_queue(q);

              if (q->elevator) {
                      spin_lock_irq(q->queue_lock);
                      ioc_clear_queue(q);
                      spin_unlock_irq(q->queue_lock);
                      elevator_exit(q->elevator);
              }

              blk_exit_rl(&q->root_rl);

              if (q->queue_tags)
                      __blk_queue_free_tags(q);

              if (!q->mq_ops)
                      blk_free_flush_queue(q->fq);
              else
                      blk_mq_release(q);

              blk_trace_shutdown(q);

              if (q->bio_split)
                      bioset_free(q->bio_split);

              ida_simple_remove(&blk_queue_ida, q->id);
              call_rcu(&q->rcu_head, blk_free_queue_rcu);
      }
      
      static const struct sysfs_ops queue_sysfs_ops = {
              .show        = queue_attr_show,
              .store        = queue_attr_store,
      };
      
      struct kobj_type blk_queue_ktype = {
              .sysfs_ops        = &queue_sysfs_ops,
              .default_attrs        = default_attrs,
              .release        = blk_release_queue,
      };
      
      int blk_register_queue(struct gendisk *disk)
      {
              int ret;
              struct device *dev = disk_to_dev(disk);
              struct request_queue *q = disk->queue;

              if (WARN_ON(!q))
                      return -ENXIO;
      
              /*
               * SCSI probing may synchronously create and destroy a lot of
               * request_queues for non-existent devices.  Shutting down a fully
               * functional queue takes measurable wallclock time as RCU grace
               * periods are involved.  To avoid excessive latency in these
               * cases, a request_queue starts out in a degraded mode which is
               * faster to shut down and is made fully functional here as
               * request_queues for non-existent devices never get registered.
               */
              if (!blk_queue_init_done(q)) {
                      queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
                      percpu_ref_switch_to_percpu(&q->q_usage_counter);
                      blk_queue_bypass_end(q);
              }
      
              ret = blk_trace_init_sysfs(dev);
              if (ret)
                      return ret;
      
              ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
              if (ret < 0) {
                      blk_trace_remove_sysfs(dev);
                      return ret;
              }
      
              kobject_uevent(&q->kobj, KOBJ_ADD);

              if (q->mq_ops)
                      blk_mq_register_disk(disk);

              if (!q->request_fn)
                      return 0;
      
              ret = elv_register_queue(q);
              if (ret) {
                      kobject_uevent(&q->kobj, KOBJ_REMOVE);
                      kobject_del(&q->kobj);
                      blk_trace_remove_sysfs(dev);
                      kobject_put(&dev->kobj);
                      return ret;
              }
      
              return 0;
      }
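
      /*
       * Note: because blk_register_queue() adds the queue kobject under the
       * disk's device kobject with the name "queue", the attributes in
       * default_attrs[] surface to userspace as /sys/block/<disk>/queue/<name>
       * (for example /sys/block/<disk>/queue/nr_requests).
       */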
      
      void blk_unregister_queue(struct gendisk *disk)
      {
              struct request_queue *q = disk->queue;

              if (WARN_ON(!q))
                      return;

              if (q->mq_ops)
                      blk_mq_unregister_disk(disk);

              if (q->request_fn)
                      elv_unregister_queue(q);

              kobject_uevent(&q->kobj, KOBJ_REMOVE);
              kobject_del(&q->kobj);
              blk_trace_remove_sysfs(disk_to_dev(disk));
              kobject_put(&disk_to_dev(disk)->kobj);
      }
      /*
       * linux/mm/compaction.c
       *
       * Memory compaction for the reduction of external fragmentation. Note that
       * this heavily depends upon page migration to do all the real heavy
       * lifting
       *
       * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
       */
      #include <linux/swap.h>
      #include <linux/migrate.h>
      #include <linux/compaction.h>
      #include <linux/mm_inline.h>
      #include <linux/backing-dev.h>
      #include <linux/sysctl.h>
      #include <linux/sysfs.h>
      #include <linux/balloon_compaction.h>
      #include <linux/page-isolation.h>
      #include <linux/kasan.h>
      #include "internal.h"
      
      #ifdef CONFIG_COMPACTION
      static inline void count_compact_event(enum vm_event_item item)
      {
              count_vm_event(item);
      }
      
      static inline void count_compact_events(enum vm_event_item item, long delta)
      {
              count_vm_events(item, delta);
      }
      #else
      #define count_compact_event(item) do { } while (0)
      #define count_compact_events(item, delta) do { } while (0)
      #endif
      
      #if defined CONFIG_COMPACTION || defined CONFIG_CMA
      
      #define CREATE_TRACE_POINTS
      #include <trace/events/compaction.h>
      
      static unsigned long release_freepages(struct list_head *freelist)
      {
              struct page *page, *next;
              unsigned long high_pfn = 0;
      
              list_for_each_entry_safe(page, next, freelist, lru) {
                      unsigned long pfn = page_to_pfn(page);
                      list_del(&page->lru);
                      __free_page(page);
                      if (pfn > high_pfn)
                              high_pfn = pfn;
              }

              return high_pfn;
      }
      
      static void map_pages(struct list_head *list)
      {
              struct page *page;
      
              list_for_each_entry(page, list, lru) {
                      arch_alloc_page(page, 0);
                      kernel_map_pages(page, 1, 1);
                      kasan_alloc_pages(page, 0);
              }
      }
      
      static inline bool migrate_async_suitable(int migratetype)
      {
              return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
      }
      
      /*
       * Check that the whole (or subset of) a pageblock given by the interval of
       * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
       * with the migration or free compaction scanner. The scanners then need to
       * use only pfn_valid_within() check for arches that allow holes within
       * pageblocks.
       *
       * Return struct page pointer of start_pfn, or NULL if checks were not passed.
       *
       * It's possible on some configurations to have a setup like node0 node1 node0
       * i.e. it's possible that all pages within a zones range of pages do not
       * belong to a single zone. We assume that a border between node0 and node1
       * can occur within a single pageblock, but not a node0 node1 node0
       * interleaving within a single pageblock. It is therefore sufficient to check
       * the first and last page of a pageblock and avoid checking each individual
       * page in a pageblock.
       */
      static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                      unsigned long end_pfn, struct zone *zone)
      {
              struct page *start_page;
              struct page *end_page;
      
              /* end_pfn is one past the range we are checking */
              end_pfn--;

              if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
                      return NULL;

              start_page = pfn_to_page(start_pfn);

              if (page_zone(start_page) != zone)
                      return NULL;

              end_page = pfn_to_page(end_pfn);
      
              /* This gives a shorter code than deriving page_zone(end_page) */
              if (page_zone_id(start_page) != page_zone_id(end_page))
                      return NULL;
      
              return start_page;
      }
      
      #ifdef CONFIG_COMPACTION
      
      /* Do not skip compaction more than 64 times */
      #define COMPACT_MAX_DEFER_SHIFT 6
      
      /*
       * Compaction is deferred when compaction fails to result in a page
       * allocation success. 1 << compact_defer_shift compactions are skipped up
       * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
       */
      void defer_compaction(struct zone *zone, int order)
      {
              zone->compact_considered = 0;
              zone->compact_defer_shift++;
      
              if (order < zone->compact_order_failed)
                      zone->compact_order_failed = order;
      
              if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                      zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
      
              trace_mm_compaction_defer_compaction(zone, order);
      }
      
      /* Returns true if compaction should be skipped this time */
      bool compaction_deferred(struct zone *zone, int order)
      {
              unsigned long defer_limit = 1UL << zone->compact_defer_shift;
      
              if (order < zone->compact_order_failed)
                      return false;
      
              /* Avoid possible overflow */
              if (++zone->compact_considered > defer_limit)
                      zone->compact_considered = defer_limit;
      
              if (zone->compact_considered >= defer_limit)
                      return false;
      
              trace_mm_compaction_deferred(zone, order);
      
              return true;
      }
      
      /*
       * Update defer tracking counters after successful compaction of given order,
       * which means an allocation either succeeded (alloc_success == true) or is
       * expected to succeed.
       */
      void compaction_defer_reset(struct zone *zone, int order,
                      bool alloc_success)
      {
              if (alloc_success) {
                      zone->compact_considered = 0;
                      zone->compact_defer_shift = 0;
              }
              if (order >= zone->compact_order_failed)
                      zone->compact_order_failed = order + 1;

              trace_mm_compaction_defer_reset(zone, order);
      }
      
      /* Returns true if restarting compaction after many failures */
      bool compaction_restarting(struct zone *zone, int order)
      {
              if (order < zone->compact_order_failed)
                      return false;
      
              return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
                      zone->compact_considered >= 1UL << zone->compact_defer_shift;
      }
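
      /*
       * Illustrative only: a simplified sketch (not the page allocator's actual
       * code) of how the deferral helpers above are typically combined by a
       * caller.  try_zone_compaction() is a hypothetical helper returning true
       * when compaction satisfied the allocation.
       */
      #if 0
      static void example_compact_zone(struct zone *zone, int order)
      {
              if (compaction_deferred(zone, order))
                      return;                         /* still in the back-off window */

              if (try_zone_compaction(zone, order))
                      compaction_defer_reset(zone, order, true);
              else
                      defer_compaction(zone, order);  /* lengthen the back-off */
      }
      #endif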
      
      /* Returns true if the pageblock should be scanned for pages to isolate. */
      static inline bool isolation_suitable(struct compact_control *cc,
                                              struct page *page)
      {
              if (cc->ignore_skip_hint)
                      return true;

              return !get_pageblock_skip(page);
      }
      
      static void reset_cached_positions(struct zone *zone)
      {
              zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
              zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
              zone->compact_cached_free_pfn =
                              round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
      }
      
      /*
       * This function is called to clear all cached information on pageblocks that
       * should be skipped for page isolation when the migrate and free page scanner
       * meet.
       */
      static void __reset_isolation_suitable(struct zone *zone)
      {
              unsigned long start_pfn = zone->zone_start_pfn;
              unsigned long end_pfn = zone_end_pfn(zone);
              unsigned long pfn;
      
              zone->compact_blockskip_flush = false;
      
              /* Walk the zone and mark every pageblock as suitable for isolation */
              for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                      struct page *page;
      
                      cond_resched();
      
                      if (!pfn_valid(pfn))
                              continue;
      
                      page = pfn_to_page(pfn);
                      if (zone != page_zone(page))
                              continue;
      
                      clear_pageblock_skip(page);
              }
      
              reset_cached_positions(zone);
      }
      
      void reset_isolation_suitable(pg_data_t *pgdat)
      {
              int zoneid;
      
              for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
                      struct zone *zone = &pgdat->node_zones[zoneid];
                      if (!populated_zone(zone))
                              continue;
      
                      /* Only flush if a full compaction finished recently */
                      if (zone->compact_blockskip_flush)
                              __reset_isolation_suitable(zone);
              }
      }
      
      /*
       * If no pages were isolated then mark this pageblock to be skipped in the
       * future. The information is later cleared by __reset_isolation_suitable().
       */
      static void update_pageblock_skip(struct compact_control *cc,
                              struct page *page, unsigned long nr_isolated,
                              bool migrate_scanner)
      {
              struct zone *zone = cc->zone;
              unsigned long pfn;

              if (cc->ignore_skip_hint)
                      return;

              if (!page)
                      return;

              if (nr_isolated)
                      return;
      
              set_pageblock_skip(page);
      
              pfn = page_to_pfn(page);
      
              /* Update where async and sync compaction should restart */
              if (migrate_scanner) {
                      if (pfn > zone->compact_cached_migrate_pfn[0])
                              zone->compact_cached_migrate_pfn[0] = pfn;
                      if (cc->mode != MIGRATE_ASYNC &&
                          pfn > zone->compact_cached_migrate_pfn[1])
                              zone->compact_cached_migrate_pfn[1] = pfn;
              } else {
                      if (pfn < zone->compact_cached_free_pfn)
                              zone->compact_cached_free_pfn = pfn;
              }
      }
      #else
      static inline bool isolation_suitable(struct compact_control *cc,
                                              struct page *page)
      {
              return true;
      }
      
      static void update_pageblock_skip(struct compact_control *cc,
                              struct page *page, unsigned long nr_isolated,
                              bool migrate_scanner)
      {
      }
      #endif /* CONFIG_COMPACTION */
      
      /*
       * Compaction requires the taking of some coarse locks that are potentially
       * very heavily contended. For async compaction, back out if the lock cannot
       * be taken immediately. For sync compaction, spin on the lock if needed.
       *
       * Returns true if the lock is held
       * Returns false if the lock is not held and compaction should abort
       */
      static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
                                                      struct compact_control *cc)
      {
              if (cc->mode == MIGRATE_ASYNC) {
                      if (!spin_trylock_irqsave(lock, *flags)) {
                              cc->contended = COMPACT_CONTENDED_LOCK;
                              return false;
                      }
              } else {
                      spin_lock_irqsave(lock, *flags);
              }
      
              return true;
      }
      
      /*
       * Compaction requires the taking of some coarse locks that are potentially
       * very heavily contended. The lock should be periodically unlocked to avoid
       * having disabled IRQs for a long time, even when there is nobody waiting on
       * the lock. It might also be that allowing the IRQs will result in
       * need_resched() becoming true. If scheduling is needed, async compaction
       * aborts. Sync compaction schedules.
       * Either compaction type will also abort if a fatal signal is pending.
       * In either case if the lock was locked, it is dropped and not regained.
       *
       * Returns true if compaction should abort due to fatal signal pending, or
       *                async compaction due to need_resched()
       * Returns false when compaction can continue (sync compaction might have
       *                scheduled)
       */
      static bool compact_unlock_should_abort(spinlock_t *lock,
                      unsigned long flags, bool *locked, struct compact_control *cc)
      {
              if (*locked) {
                      spin_unlock_irqrestore(lock, flags);
                      *locked = false;
              }

              if (fatal_signal_pending(current)) {
                      cc->contended = COMPACT_CONTENDED_SCHED;
                      return true;
              }

              if (need_resched()) {
                      if (cc->mode == MIGRATE_ASYNC) {
                              cc->contended = COMPACT_CONTENDED_SCHED;
                              return true;
                      }
                      cond_resched();
              }
      
              return false;
      }
      
      /*
       * Aside from avoiding lock contention, compaction also periodically checks
       * need_resched() and either schedules in sync compaction or aborts async
       * compaction. This is similar to what compact_unlock_should_abort() does, but
       * is used where no lock is concerned.
       *
       * Returns false when no scheduling was needed, or sync compaction scheduled.
       * Returns true when async compaction should abort.
       */
      static inline bool compact_should_abort(struct compact_control *cc)
      {
              /* async compaction aborts if contended */
              if (need_resched()) {
                      if (cc->mode == MIGRATE_ASYNC) {
                              cc->contended = COMPACT_CONTENDED_SCHED;
                              return true;
                      }

                      cond_resched();
              }
      
              return false;
      }
      
      /*
       * Isolate free pages onto a private freelist. If @strict is true, will abort
       * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
       * (even though it may still end up isolating some pages).
       */
      static unsigned long isolate_freepages_block(struct compact_control *cc,
                                      unsigned long *start_pfn,
                                      unsigned long end_pfn,
                                      struct list_head *freelist,
                                      bool strict)
      {
              int nr_scanned = 0, total_isolated = 0;
              struct page *cursor, *valid_page = NULL;
              unsigned long flags = 0;
              bool locked = false;
              unsigned long blockpfn = *start_pfn;
      
              cursor = pfn_to_page(blockpfn);
      
              /* Isolate free pages. */
              for (; blockpfn < end_pfn; blockpfn++, cursor++) {
                      int isolated, i;
                      struct page *page = cursor;
      
                      /*
                       * Periodically drop the lock (if held) regardless of its
                       * contention, to give chance to IRQs. Abort if fatal signal
                       * pending or async compaction detects need_resched()
                       */
                      if (!(blockpfn % SWAP_CLUSTER_MAX)
                          && compact_unlock_should_abort(&cc->zone->lock, flags,
                                                                      &locked, cc))
                              break;
      
                      nr_scanned++;
                      if (!pfn_valid_within(blockpfn))
                              goto isolate_fail;
      
                      if (!valid_page)
                              valid_page = page;
      
                      /*
                       * For compound pages such as THP and hugetlbfs, we can save
                       * potentially a lot of iterations if we skip them at once.
                       * The check is racy, but we can consider only valid values
                       * and the only danger is skipping too much.
                       */
                      if (PageCompound(page)) {
                              unsigned int comp_order = compound_order(page);

                              if (likely(comp_order < MAX_ORDER)) {
                                      blockpfn += (1UL << comp_order) - 1;
                                      cursor += (1UL << comp_order) - 1;
                              }
      
                              goto isolate_fail;
                      }
      
                      if (!PageBuddy(page))
                              goto isolate_fail;
      
                      /*
                       * If we already hold the lock, we can skip some rechecking.
                       * Note that if we hold the lock now, checked_pageblock was
                       * already set in some previous iteration (or strict is true),
                       * so it is correct to skip the suitable migration target
                       * recheck as well.
                       */
                      if (!locked) {
                              /*
                               * The zone lock must be held to isolate freepages.
                               * Unfortunately this is a very coarse lock and can be
                               * heavily contended if there are parallel allocations
                               * or parallel compactions. For async compaction do not
                               * spin on the lock and we acquire the lock as late as
                               * possible.
                               */
                              locked = compact_trylock_irqsave(&cc->zone->lock,
                                                                      &flags, cc);
                              if (!locked)
                                      break;
      
                              /* Recheck this is a buddy page under lock */
                              if (!PageBuddy(page))
                                      goto isolate_fail;
                      }
      
                      /* Found a free page, break it into order-0 pages */
                      isolated = split_free_page(page);
                      if (!isolated)
                              break;
      
                      total_isolated += isolated;
                      cc->nr_freepages += isolated;
                      for (i = 0; i < isolated; i++) {
                              list_add(&page->lru, freelist);
                              page++;
                      }
                      if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
                              blockpfn += isolated;
                              break;
                      }
                      /* Advance to the end of split page */
                      blockpfn += isolated - 1;
                      cursor += isolated - 1;
                      continue;
      
      isolate_fail:
                      if (strict)
                              break;
                      else
                              continue;
      
              }
      
              if (locked)
                      spin_unlock_irqrestore(&cc->zone->lock, flags);
      
              /*
               * There is a tiny chance that we have read bogus compound_order(),
               * so be careful to not go outside of the pageblock.
               */
              if (unlikely(blockpfn > end_pfn))
                      blockpfn = end_pfn;
      
              trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
                                              nr_scanned, total_isolated);
      
              /* Record how far we have got within the block */
              *start_pfn = blockpfn;
      
              /*
               * If strict isolation is requested by CMA then check that all the
               * pages requested were isolated. If there were any failures, 0 is
               * returned and CMA will fail.
               */
              if (strict && blockpfn < end_pfn)
                      total_isolated = 0;
      
              /* Update the pageblock-skip if the whole pageblock was scanned */
              if (blockpfn == end_pfn)
                      update_pageblock_skip(cc, valid_page, total_isolated, false);
      
              count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
              if (total_isolated)
                      count_compact_events(COMPACTISOLATED, total_isolated);
              return total_isolated;
      }
      
      /**
       * isolate_freepages_range() - isolate free pages.
       * @start_pfn: The first PFN to start isolating.
       * @end_pfn:   The one-past-last PFN.
       *
       * Non-free pages, invalid PFNs, or zone boundaries within the
       * [start_pfn, end_pfn) range are considered errors and cause the function
       * undo its actions and return zero.
       *
       * Otherwise, the function returns the one-past-the-last PFN of the
       * isolated pages (which may be greater than end_pfn if the end fell in
       * the middle of a free page).
       */
      unsigned long
      isolate_freepages_range(struct compact_control *cc,
                              unsigned long start_pfn, unsigned long end_pfn)
      {
              unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
              LIST_HEAD(freelist);
      
              pfn = start_pfn;
              block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
              if (block_start_pfn < cc->zone->zone_start_pfn)
                      block_start_pfn = cc->zone->zone_start_pfn;
              block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
      
              for (; pfn < end_pfn; pfn += isolated,
                                      block_start_pfn = block_end_pfn,
                                      block_end_pfn += pageblock_nr_pages) {
                      /* Protect pfn from changing by isolate_freepages_block */
                      unsigned long isolate_start_pfn = pfn;
      
                      block_end_pfn = min(block_end_pfn, end_pfn);
      
                      /*
                       * pfn could pass block_end_pfn if an isolated freepage is
                       * larger than a pageblock. In this case, adjust the
                       * scanning range to the correct block.
                       */
                      if (pfn >= block_end_pfn) {
                              block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
                              block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
                              block_end_pfn = min(block_end_pfn, end_pfn);
                      }
      
                      if (!pageblock_pfn_to_page(block_start_pfn,
                                              block_end_pfn, cc->zone))
                              break;
      
                      isolated = isolate_freepages_block(cc, &isolate_start_pfn,
                                                      block_end_pfn, &freelist, true);
      
                      /*
                       * In strict mode, isolate_freepages_block() returns 0 if
                       * there are any holes in the block (ie. invalid PFNs or
                       * non-free pages).
                       */
                      if (!isolated)
                              break;
      
                      /*
                       * If we managed to isolate pages, it is always (1 << n) *
                       * pageblock_nr_pages for some non-negative n.  (Max order
                       * page may span two pageblocks).
                       */
              }
      
              /* split_free_page does not map the pages */
              map_pages(&freelist);
      
              if (pfn < end_pfn) {
                      /* Loop terminated early, cleanup. */
                      release_freepages(&freelist);
                      return 0;
              }
      
              /* We don't use freelists for anything. */
              return pfn;
      }
      
      /* Update the number of anon and file isolated pages in the zone */
      static void acct_isolated(struct zone *zone, struct compact_control *cc)
      {
              struct page *page;
              unsigned int count[2] = { 0, };
      
              if (list_empty(&cc->migratepages))
                      return;
      
              list_for_each_entry(page, &cc->migratepages, lru)
                      count[!!page_is_file_cache(page)]++;

              mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
              mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
      }
      
      /* Similar to reclaim, but different enough that they don't share logic */
      static bool too_many_isolated(struct zone *zone)
      {
              unsigned long active, inactive, isolated;
      
              inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
                                              zone_page_state(zone, NR_INACTIVE_ANON);
              active = zone_page_state(zone, NR_ACTIVE_FILE) +
                                              zone_page_state(zone, NR_ACTIVE_ANON);
              isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
                                              zone_page_state(zone, NR_ISOLATED_ANON);
      
              return isolated > (inactive + active) / 2;
      }
      
      /**
       * isolate_migratepages_block() - isolate all migrate-able pages within
       *                                  a single pageblock
       * @cc:                Compaction control structure.
       * @low_pfn:        The first PFN to isolate
       * @end_pfn:        The one-past-the-last PFN to isolate, within same pageblock
       * @isolate_mode: Isolation mode to be used.
       *
       * Isolate all pages that can be migrated from the range specified by
       * [low_pfn, end_pfn). The range is expected to be within same pageblock.
       * Returns zero if there is a fatal signal pending, otherwise PFN of the
       * first page that was not scanned (which may be less than, equal to, or
       * greater than end_pfn).
       *
       * The pages are isolated on cc->migratepages list (not required to be empty),
       * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
       * is neither read nor updated.
       */
      static unsigned long
      isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                              unsigned long end_pfn, isolate_mode_t isolate_mode)
      {
              struct zone *zone = cc->zone;
              unsigned long nr_scanned = 0, nr_isolated = 0;
              struct list_head *migratelist = &cc->migratepages;
              struct lruvec *lruvec;
              unsigned long flags = 0;
              bool locked = false;
              struct page *page = NULL, *valid_page = NULL;
              unsigned long start_pfn = low_pfn;
      
              /*
               * Ensure that there are not too many pages isolated from the LRU
               * list by either parallel reclaimers or compaction. If there are,
               * delay for some time until fewer pages are isolated
               */
              while (unlikely(too_many_isolated(zone))) {
                      /* async migration should just abort */
                      if (cc->mode == MIGRATE_ASYNC)
                              return 0;
      
                      congestion_wait(BLK_RW_ASYNC, HZ/10);
      
                      if (fatal_signal_pending(current))
                              return 0;
              }
      
              if (compact_should_abort(cc))
                      return 0;
      
              /* Time to isolate some pages for migration */
              for (; low_pfn < end_pfn; low_pfn++) {
                      bool is_lru;
      
                      /*
                       * Periodically drop the lock (if held) regardless of its
                       * contention, to give chance to IRQs. Abort async compaction
                       * if contended.
                       */
                      if (!(low_pfn % SWAP_CLUSTER_MAX)
                          && compact_unlock_should_abort(&zone->lru_lock, flags,
                                                                      &locked, cc))
                              break;
      
                      if (!pfn_valid_within(low_pfn))
                              continue;
                      nr_scanned++;
      
                      page = pfn_to_page(low_pfn);
      
                      if (!valid_page)
                              valid_page = page;
      
                      /*
                       * Skip if free. We read page order here without zone lock
                       * which is generally unsafe, but the race window is small and
                       * the worst thing that can happen is that we skip some
                       * potential isolation targets.
                       */
                      if (PageBuddy(page)) {
                              unsigned long freepage_order = page_order_unsafe(page);
      
                              /*
                               * Without lock, we cannot be sure that what we got is
                               * a valid page order. Consider only values in the
                               * valid order range to prevent low_pfn overflow.
                               */
                              if (freepage_order > 0 && freepage_order < MAX_ORDER)
                                      low_pfn += (1UL << freepage_order) - 1;
                              continue;
                      }
      
                      /*
                       * Check may be lockless but that's ok as we recheck later.
                       * It's possible to migrate LRU pages and balloon pages
                       * Skip any other type of page
                       */
                      is_lru = PageLRU(page);
                      if (!is_lru) {
                              if (unlikely(balloon_page_movable(page))) {
                                      if (balloon_page_isolate(page)) {
                                              /* Successfully isolated */
                                              goto isolate_success;
                                      }
                              }
                      }
      
                      /*
                       * Regardless of being on LRU, compound pages such as THP and
                       * hugetlbfs are not to be compacted. We can potentially save
                       * a lot of iterations if we skip them at once. The check is
                       * racy, but we can consider only valid values and the only
                       * danger is skipping too much.
                       */
                      if (PageCompound(page)) {
                              unsigned int comp_order = compound_order(page);

                              if (likely(comp_order < MAX_ORDER))
                                      low_pfn += (1UL << comp_order) - 1;
      
                              continue;
                      }
      
                      if (!is_lru)
                              continue;
      
                      /*
                       * Migration will fail if an anonymous page is pinned in memory,
                       * so avoid taking lru_lock and isolating it unnecessarily in an
                       * admittedly racy check.
                       */
                      if (!page_mapping(page) &&
                          page_count(page) > page_mapcount(page))
                              continue;
      
                      /* If we already hold the lock, we can skip some rechecking */
    1                 if (!locked) {
    1                         locked = compact_trylock_irqsave(&zone->lru_lock,
                                                                      &flags, cc);
                              if (!locked)
                                      break;
      
                              /* Recheck PageLRU and PageCompound under lock */
    1                         if (!PageLRU(page))
                                      continue;
      
                              /*
                                * Page became compound since the non-locked check,
                               * and it's on LRU. It can only be a THP so the order
                               * is safe to read and it's 0 for tail pages.
                               */
    1                         if (unlikely(PageCompound(page))) {
                                      low_pfn += (1UL << compound_order(page)) - 1;
                                      continue;
                              }
                      }
      
                      lruvec = mem_cgroup_page_lruvec(page, zone);
      
                      /* Try isolate the page */
    1                 if (__isolate_lru_page(page, isolate_mode) != 0)
                              continue;
      
    1                 VM_BUG_ON_PAGE(PageCompound(page), page);
      
                      /* Successfully isolated */
    1                 del_page_from_lru_list(page, lruvec, page_lru(page));
      
      isolate_success:
    1                 list_add(&page->lru, migratelist);
    1                 cc->nr_migratepages++;
                      nr_isolated++;
      
                      /* Avoid isolating too much */
                      if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
    1                         ++low_pfn;
                              break;
                      }
              }
      
              /*
               * The PageBuddy() check could have potentially brought us outside
               * the range to be scanned.
               */
    1         if (unlikely(low_pfn > end_pfn))
                      low_pfn = end_pfn;
      
    1         if (locked)
    1                 spin_unlock_irqrestore(&zone->lru_lock, flags);
      
              /*
               * Update the pageblock-skip information and cached scanner pfn,
               * if the whole pageblock was scanned without isolating any page.
               */
    1         if (low_pfn == end_pfn)
                      update_pageblock_skip(cc, valid_page, nr_isolated, true);
      
    1         trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
                                                      nr_scanned, nr_isolated);
      
              count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
              if (nr_isolated)
                      count_compact_events(COMPACTISOLATED, nr_isolated);
      
              return low_pfn;
      }
      
      /**
       * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
       * @cc:        Compaction control structure.
       * @start_pfn: The first PFN to start isolating.
       * @end_pfn:   The one-past-last PFN.
       *
        * Returns zero if isolation fails fatally due to e.g. a pending signal.
        * Otherwise, the function returns the one-past-the-last PFN of isolated
        * pages (which may be greater than end_pfn if the end fell in the middle
        * of a THP page).
       */
      unsigned long
      isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                                              unsigned long end_pfn)
      {
              unsigned long pfn, block_start_pfn, block_end_pfn;
      
              /* Scan block by block. First and last block may be incomplete */
              pfn = start_pfn;
              block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
              if (block_start_pfn < cc->zone->zone_start_pfn)
                      block_start_pfn = cc->zone->zone_start_pfn;
              block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
      
              for (; pfn < end_pfn; pfn = block_end_pfn,
                                      block_start_pfn = block_end_pfn,
                                      block_end_pfn += pageblock_nr_pages) {
      
                      block_end_pfn = min(block_end_pfn, end_pfn);
      
                      if (!pageblock_pfn_to_page(block_start_pfn,
                                              block_end_pfn, cc->zone))
                              continue;
      
                      pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
                                                              ISOLATE_UNEVICTABLE);
      
                      if (!pfn)
                              break;
      
                      if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
                              break;
              }
              acct_isolated(cc->zone, cc);
      
              return pfn;
      }
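
/*
 * Illustrative userspace sketch (not part of this file): the pageblock
 * rounding used by the scanners above.  pageblock_nr_pages is assumed to
 * be 512 here (a typical value for 4K pages and a 2MB pageblock); the
 * kernel derives it from pageblock_order, and the pfn values are made up.
 */
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* assumption for the example */
#define ALIGN_UP(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long pfn = 262733;
        unsigned long block_start = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long block_end   = ALIGN_UP(pfn + 1, PAGEBLOCK_NR_PAGES);

        /* Prints the pageblock [262656, 263168) that contains pfn 262733. */
        printf("pfn %lu -> pageblock [%lu, %lu)\n", pfn, block_start, block_end);
        return 0;
}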
      
      #endif /* CONFIG_COMPACTION || CONFIG_CMA */
      #ifdef CONFIG_COMPACTION
      
      /* Returns true if the page is within a block suitable for migration to */
      static bool suitable_migration_target(struct page *page)
      {
              /* If the page is a large free page, then disallow migration */
    1         if (PageBuddy(page)) {
                      /*
                       * We are checking page_order without zone->lock taken. But
                       * the only small danger is that we skip a potentially suitable
                * pageblock, so it's not worth checking the order for a valid range.
                       */
                      if (page_order_unsafe(page) >= pageblock_order)
                              return false;
              }
      
              /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
    1         if (migrate_async_suitable(get_pageblock_migratetype(page)))
                      return true;
      
              /* Otherwise skip the block */
              return false;
      }
      
      /*
       * Test whether the free scanner has reached the same or lower pageblock than
       * the migration scanner, and compaction should thus terminate.
       */
      static inline bool compact_scanners_met(struct compact_control *cc)
      {
    1         return (cc->free_pfn >> pageblock_order)
                      <= (cc->migrate_pfn >> pageblock_order);
      }
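
/*
 * Illustrative userspace sketch (not part of this file): the scanners-met
 * test above compares pageblock indices rather than raw pfns, so the two
 * scanners are considered to have met as soon as they reach the same
 * pageblock.  pageblock_order is assumed to be 9 (512 pages) and the pfn
 * values are made up.
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_ORDER 9	/* assumption for the example */

static bool scanners_met(unsigned long free_pfn, unsigned long migrate_pfn)
{
        return (free_pfn >> PAGEBLOCK_ORDER) <= (migrate_pfn >> PAGEBLOCK_ORDER);
}

int main(void)
{
        /* Different pfns, same pageblock 513: the scanners have met. */
        printf("%d\n", scanners_met(262700, 262660));	/* prints 1 */
        /* Free scanner still one pageblock above the migrate scanner. */
        printf("%d\n", scanners_met(263200, 262660));	/* prints 0 */
        return 0;
}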
      
      /*
       * Based on information in the current compact_control, find blocks
       * suitable for isolating free pages from and then isolate them.
       */
      static void isolate_freepages(struct compact_control *cc)
      {
    1         struct zone *zone = cc->zone;
              struct page *page;
              unsigned long block_start_pfn;        /* start of current pageblock */
              unsigned long isolate_start_pfn; /* exact pfn we start at */
              unsigned long block_end_pfn;        /* end of current pageblock */
              unsigned long low_pfn;             /* lowest pfn scanner is able to scan */
              struct list_head *freelist = &cc->freepages;
      
              /*
               * Initialise the free scanner. The starting point is where we last
               * successfully isolated from, zone-cached value, or the end of the
               * zone when isolating for the first time. For looping we also need
               * this pfn aligned down to the pageblock boundary, because we do
               * block_start_pfn -= pageblock_nr_pages in the for loop.
                * For the ending point, take care when isolating in the last
                * pageblock of a zone which ends in the middle of a pageblock.
               * The low boundary is the end of the pageblock the migration scanner
               * is using.
               */
              isolate_start_pfn = cc->free_pfn;
              block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
              block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
                                                      zone_end_pfn(zone));
              low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
      
              /*
               * Isolate free pages until enough are available to migrate the
               * pages on cc->migratepages. We stop searching if the migrate
               * and free page scanners meet or enough free pages are isolated.
               */
              for (; block_start_pfn >= low_pfn;
                                      block_end_pfn = block_start_pfn,
    1                                 block_start_pfn -= pageblock_nr_pages,
                                      isolate_start_pfn = block_start_pfn) {
                      /*
                       * This can iterate a massively long zone without finding any
                       * suitable migration targets, so periodically check if we need
                       * to schedule, or even abort async compaction.
                       */
    1                 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
    1                                                 && compact_should_abort(cc))
                              break;
      
    1                 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                                                              zone);
                      if (!page)
                              continue;
      
                      /* Check the block is suitable for migration */
    1                 if (!suitable_migration_target(page))
                              continue;
      
                      /* If isolation recently failed, do not retry */
    1                 if (!isolation_suitable(cc, page))
                              continue;
      
                      /* Found a block suitable for isolating free pages from. */
    1                 isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
                                              freelist, false);
      
                      /*
                       * If we isolated enough freepages, or aborted due to lock
                       * contention, terminate.
                       */
                      if ((cc->nr_freepages >= cc->nr_migratepages)
    1                                                         || cc->contended) {
    1                         if (isolate_start_pfn >= block_end_pfn) {
                                      /*
                                       * Restart at previous pageblock if more
                                       * freepages can be isolated next time.
                                       */
                                      isolate_start_pfn =
                                              block_start_pfn - pageblock_nr_pages;
                              }
                              break;
    1                 } else if (isolate_start_pfn < block_end_pfn) {
                              /*
                               * If isolation failed early, do not continue
                               * needlessly.
                               */
                              break;
                      }
              }
      
              /* split_free_page does not map the pages */
    1         map_pages(freelist);
      
              /*
               * Record where the free scanner will restart next time. Either we
               * broke from the loop and set isolate_start_pfn based on the last
               * call to isolate_freepages_block(), or we met the migration scanner
               * and the loop terminated due to isolate_start_pfn < low_pfn
               */
    1         cc->free_pfn = isolate_start_pfn;
      }
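
/*
 * Illustrative userspace sketch (not part of this file): the free scanner
 * above walks pageblocks backwards, from its cached position near the zone
 * end towards the boundary the migration scanner has reached.  The pfn
 * values and the 512-page pageblock size are assumptions for the example.
 */
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* assumption for the example */

int main(void)
{
        unsigned long low_pfn = 262144;		/* migrate scanner boundary */
        unsigned long block_start = 264192;	/* cached free pfn, block aligned */

        for (; block_start >= low_pfn; block_start -= PAGEBLOCK_NR_PAGES)
                printf("scan pageblock starting at pfn %lu\n", block_start);
        return 0;
}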
      
      /*
       * This is a migrate-callback that "allocates" freepages by taking pages
       * from the isolated freelists in the block we are migrating to.
       */
      static struct page *compaction_alloc(struct page *migratepage,
                                              unsigned long data,
                                              int **result)
      {
    1         struct compact_control *cc = (struct compact_control *)data;
              struct page *freepage;
      
              /*
               * Isolate free pages if necessary, and if we are not aborting due to
               * contention.
               */
              if (list_empty(&cc->freepages)) {
    1                 if (!cc->contended)
    1                         isolate_freepages(cc);
      
                      if (list_empty(&cc->freepages))
                              return NULL;
              }
      
    1         freepage = list_entry(cc->freepages.next, struct page, lru);
    1         list_del(&freepage->lru);
              cc->nr_freepages--;
      
    1         return freepage;
      }
      
      /*
       * This is a migrate-callback that "frees" freepages back to the isolated
       * freelist.  All pages on the freelist are from the same zone, so there is no
       * special handling needed for NUMA.
       */
      static void compaction_free(struct page *page, unsigned long data)
      {
              struct compact_control *cc = (struct compact_control *)data;
      
              list_add(&page->lru, &cc->freepages);
              cc->nr_freepages++;
      }
      
      /* possible outcome of isolate_migratepages */
      typedef enum {
              ISOLATE_ABORT,                /* Abort compaction now */
              ISOLATE_NONE,                /* No pages isolated, continue scanning */
              ISOLATE_SUCCESS,        /* Pages isolated, migrate */
      } isolate_migrate_t;
      
      /*
       * Allow userspace to control policy on scanning the unevictable LRU for
       * compactable pages.
       */
      int sysctl_compact_unevictable_allowed __read_mostly = 1;
      
      /*
       * Isolate all pages that can be migrated from the first suitable block,
       * starting at the block pointed to by the migrate scanner pfn within
       * compact_control.
       */
      static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                              struct compact_control *cc)
      {
              unsigned long block_start_pfn;
              unsigned long block_end_pfn;
              unsigned long low_pfn;
              unsigned long isolate_start_pfn;
              struct page *page;
              const isolate_mode_t isolate_mode =
    1                 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
    1                 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
      
              /*
               * Start at where we last stopped, or beginning of the zone as
               * initialized by compact_zone()
               */
    1         low_pfn = cc->migrate_pfn;
              block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
              if (block_start_pfn < zone->zone_start_pfn)
                      block_start_pfn = zone->zone_start_pfn;
      
              /* Only scan within a pageblock boundary */
              block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
      
              /*
               * Iterate over whole pageblocks until we find the first suitable.
               * Do not cross the free scanner.
               */
              for (; block_end_pfn <= cc->free_pfn;
                              low_pfn = block_end_pfn,
                              block_start_pfn = block_end_pfn,
                              block_end_pfn += pageblock_nr_pages) {
      
                      /*
                       * This can potentially iterate a massively long zone with
                       * many pageblocks unsuitable, so periodically check if we
                       * need to schedule, or even abort async compaction.
                       */
    1                 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
    1                                                 && compact_should_abort(cc))
                              break;
      
    1                 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                                                              zone);
                      if (!page)
                              continue;
      
                      /* If isolation recently failed, do not retry */
    1                 if (!isolation_suitable(cc, page))
                              continue;
      
                      /*
                       * For async compaction, also only scan in MOVABLE blocks.
                       * Async compaction is optimistic to see if the minimum amount
                       * of work satisfies the allocation.
                       */
    1                 if (cc->mode == MIGRATE_ASYNC &&
    1                     !migrate_async_suitable(get_pageblock_migratetype(page)))
                              continue;
      
                      /* Perform the isolation */
                      isolate_start_pfn = low_pfn;
    1                 low_pfn = isolate_migratepages_block(cc, low_pfn,
                                                      block_end_pfn, isolate_mode);
      
    1                 if (!low_pfn || cc->contended) {
                              acct_isolated(zone, cc);
                              return ISOLATE_ABORT;
                      }
      
                      /*
                       * Record where we could have freed pages by migration and not
                       * yet flushed them to buddy allocator.
                       * - this is the lowest page that could have been isolated and
                       * then freed by migration.
                       */
    1                 if (cc->nr_migratepages && !cc->last_migrated_pfn)
    1                         cc->last_migrated_pfn = isolate_start_pfn;
      
                      /*
                        * Either we isolated something and will proceed with
                        * migration, or we failed and compact_zone() should decide
                        * whether to continue or not.
                       */
                      break;
              }
      
    1         acct_isolated(zone, cc);
              /* Record where migration scanner will be restarted. */
              cc->migrate_pfn = low_pfn;
      
              return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
      }
      
      /*
       * order == -1 is expected when compacting via
       * /proc/sys/vm/compact_memory
       */
      static inline bool is_via_compact_memory(int order)
      {
              return order == -1;
      }
      
      static int __compact_finished(struct zone *zone, struct compact_control *cc,
                                  const int migratetype)
      {
              unsigned int order;
              unsigned long watermark;
      
    1         if (cc->contended || fatal_signal_pending(current))
                      return COMPACT_CONTENDED;
      
              /* Compaction run completes if the migrate and free scanner meet */
    1         if (compact_scanners_met(cc)) {
                      /* Let the next compaction start anew. */
                      reset_cached_positions(zone);
      
                      /*
                       * Mark that the PG_migrate_skip information should be cleared
                       * by kswapd when it goes to sleep. kswapd does not set the
                        * flag itself as the decision to clear it should be based
                        * directly on an allocation request.
                       */
                      if (!current_is_kswapd())
                              zone->compact_blockskip_flush = true;
      
                      return COMPACT_COMPLETE;
              }
      
    1         if (is_via_compact_memory(cc->order))
                      return COMPACT_CONTINUE;
      
              /* Compaction run is not finished if the watermark is not met */
    1         watermark = low_wmark_pages(zone);
      
              if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
                                                              cc->alloc_flags))
                      return COMPACT_CONTINUE;
      
              /* Direct compactor: Is a suitable page free? */
    1         for (order = cc->order; order < MAX_ORDER; order++) {
    1                 struct free_area *area = &zone->free_area[order];
                      bool can_steal;
      
                      /* Job done if page is free of the right migratetype */
                      if (!list_empty(&area->free_list[migratetype]))
    1                         return COMPACT_PARTIAL;
      
      #ifdef CONFIG_CMA
                      /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
                      if (migratetype == MIGRATE_MOVABLE &&
                              !list_empty(&area->free_list[MIGRATE_CMA]))
                              return COMPACT_PARTIAL;
      #endif
                      /*
                       * Job done if allocation would steal freepages from
                       * other migratetype buddy lists.
                       */
    1                 if (find_suitable_fallback(area, order, migratetype,
                                                      true, &can_steal) != -1)
                              return COMPACT_PARTIAL;
              }
      
              return COMPACT_NO_SUITABLE_PAGE;
      }
      
      static int compact_finished(struct zone *zone, struct compact_control *cc,
                                  const int migratetype)
      {
              int ret;
      
    1         ret = __compact_finished(zone, cc, migratetype);
    1         trace_mm_compaction_finished(zone, cc->order, ret);
    1         if (ret == COMPACT_NO_SUITABLE_PAGE)
                      ret = COMPACT_CONTINUE;
      
              return ret;
      }
      
      /*
       * compaction_suitable: Is this suitable to run compaction on this zone now?
       * Returns
       *   COMPACT_SKIPPED  - If there are too few free pages for compaction
       *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
       *   COMPACT_CONTINUE - If compaction should run now
       */
      static unsigned long __compaction_suitable(struct zone *zone, int order,
                                              int alloc_flags, int classzone_idx)
      {
              int fragindex;
              unsigned long watermark;
      
    2         if (is_via_compact_memory(order))
                      return COMPACT_CONTINUE;
      
    2         watermark = low_wmark_pages(zone);
              /*
               * If watermarks for high-order allocation are already met, there
               * should be no need for compaction at all.
               */
              if (zone_watermark_ok(zone, order, watermark, classzone_idx,
                                                                      alloc_flags))
                      return COMPACT_PARTIAL;
      
              /*
               * Watermarks for order-0 must be met for compaction. Note the 2UL.
               * This is because during migration, copies of pages need to be
                * allocated, and for a short time the footprint is higher.
               */
    1         watermark += (2UL << order);
              if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
                      return COMPACT_SKIPPED;
      
              /*
               * fragmentation index determines if allocation failures are due to
               * low memory or external fragmentation
               *
               * index of -1000 would imply allocations might succeed depending on
               * watermarks, but we already failed the high-order watermark check
               * index towards 0 implies failure is due to lack of memory
               * index towards 1000 implies failure is due to fragmentation
               *
               * Only compact if a failure would be due to fragmentation.
               */
    1         fragindex = fragmentation_index(zone, order);
              if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
                      return COMPACT_NOT_SUITABLE_ZONE;
      
              return COMPACT_CONTINUE;
      }
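
/*
 * Illustrative userspace sketch (not part of this file): how much headroom
 * the order-0 watermark check above demands.  For an order-9 request (a 2MB
 * THP with 4K pages), 2UL << 9 = 1024 extra base pages are required on top
 * of the low watermark before compaction is attempted.  The watermark value
 * below is made up.
 */
#include <stdio.h>

int main(void)
{
        unsigned long low_wmark = 12000;	/* assumed zone low watermark, in pages */

        for (int order = 0; order <= 9; order += 3)
                printf("order %d: need %lu free pages\n",
                       order, low_wmark + (2UL << order));
        return 0;
}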
      
      unsigned long compaction_suitable(struct zone *zone, int order,
                                              int alloc_flags, int classzone_idx)
      {
              unsigned long ret;
      
    2         ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
    2         trace_mm_compaction_suitable(zone, order, ret);
    2         if (ret == COMPACT_NOT_SUITABLE_ZONE)
                      ret = COMPACT_SKIPPED;
      
    2         return ret;
      }
      
      static int compact_zone(struct zone *zone, struct compact_control *cc)
      {
              int ret;
    1         unsigned long start_pfn = zone->zone_start_pfn;
    1         unsigned long end_pfn = zone_end_pfn(zone);
    1         const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
    1         const bool sync = cc->mode != MIGRATE_ASYNC;
      
              ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                              cc->classzone_idx);
              switch (ret) {
              case COMPACT_PARTIAL:
              case COMPACT_SKIPPED:
                      /* Compaction is likely to fail */
                      return ret;
              case COMPACT_CONTINUE:
                      /* Fall through to compaction */
                      ;
              }
      
              /*
               * Clear pageblock skip if there were failures recently and compaction
               * is about to be retried after being deferred. kswapd does not do
               * this reset as it'll reset the cached information when going to sleep.
               */
    1         if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
                      __reset_isolation_suitable(zone);
      
              /*
                * Set up to move all movable pages to the end of the zone. Use the
                * cached information on where the scanners should start, but check
                * that it is initialised by ensuring the values are within zone
                * boundaries.
               */
              cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
    1         cc->free_pfn = zone->compact_cached_free_pfn;
    1         if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
                      cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
                      zone->compact_cached_free_pfn = cc->free_pfn;
              }
    1         if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
                      cc->migrate_pfn = start_pfn;
                      zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                      zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
              }
    1         cc->last_migrated_pfn = 0;
      
    1         trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                      cc->free_pfn, end_pfn, sync);
      
    1         migrate_prep_local();
      
    1         while ((ret = compact_finished(zone, cc, migratetype)) ==
                                                      COMPACT_CONTINUE) {
                      int err;
      
    1                 switch (isolate_migratepages(zone, cc)) {
                      case ISOLATE_ABORT:
                              ret = COMPACT_CONTENDED;
                              putback_movable_pages(&cc->migratepages);
                              cc->nr_migratepages = 0;
                              goto out;
                      case ISOLATE_NONE:
                              /*
                               * We haven't isolated and migrated anything, but
                               * there might still be unflushed migrations from
                               * previous cc->order aligned block.
                               */
                              goto check_drain;
                      case ISOLATE_SUCCESS:
                              ;
                      }
      
    1                 err = migrate_pages(&cc->migratepages, compaction_alloc,
                                      compaction_free, (unsigned long)cc, cc->mode,
                                      MR_COMPACTION);
      
    1                 trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                                                              &cc->migratepages);
      
                      /* All pages were either migrated or will be released */
    1                 cc->nr_migratepages = 0;
                      if (err) {
                              putback_movable_pages(&cc->migratepages);
                              /*
                               * migrate_pages() may return -ENOMEM when scanners meet
                               * and we want compact_finished() to detect it
                               */
                              if (err == -ENOMEM && !compact_scanners_met(cc)) {
                                      ret = COMPACT_CONTENDED;
                                      goto out;
                              }
                      }
      
      check_drain:
                      /*
                       * Has the migration scanner moved away from the previous
                       * cc->order aligned block where we migrated from? If yes,
                       * flush the pages that were freed, so that they can merge and
                       * compact_finished() can detect immediately if allocation
                       * would succeed.
                       */
    1                 if (cc->order > 0 && cc->last_migrated_pfn) {
                              int cpu;
                              unsigned long current_block_start =
    1                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
      
    1                         if (cc->last_migrated_pfn < current_block_start) {
    1                                 cpu = get_cpu();
                                      lru_add_drain_cpu(cpu);
                                      drain_local_pages(zone);
                                      put_cpu();
                                      /* No more flushing until we migrate again */
    1                                 cc->last_migrated_pfn = 0;
                              }
                      }
      
              }
      
      out:
              /*
               * Release free pages and update where the free scanner should restart,
               * so we don't leave any returned pages behind in the next attempt.
               */
    1         if (cc->nr_freepages > 0) {
    1                 unsigned long free_pfn = release_freepages(&cc->freepages);
      
                      cc->nr_freepages = 0;
                      VM_BUG_ON(free_pfn == 0);
                      /* The cached pfn is always the first in a pageblock */
    1                 free_pfn &= ~(pageblock_nr_pages-1);
                      /*
                        * Only go back, not forward. The cached pfn might already
                        * have been reset to the zone end in compact_finished().
                       */
                      if (free_pfn > zone->compact_cached_free_pfn)
                              zone->compact_cached_free_pfn = free_pfn;
              }
      
    1         trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                                      cc->free_pfn, end_pfn, sync, ret);
      
    1         if (ret == COMPACT_CONTENDED)
    1                 ret = COMPACT_PARTIAL;
      
              return ret;
      }
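
/*
 * Illustrative userspace sketch (not part of this file): the check_drain
 * step above only flushes per-cpu lists once the migrate scanner has left
 * the cc->order aligned block it last migrated from.  Order 9 (a 2MB block
 * with 4K pages) and the pfn values are assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
        int order = 9;
        unsigned long last_migrated_pfn = 262700;
        unsigned long migrate_pfn = 263900;
        unsigned long current_block_start = migrate_pfn & ~((1UL << order) - 1);

        if (last_migrated_pfn < current_block_start)
                printf("drain: scanner left the block starting at pfn %lu\n",
                       current_block_start);
        return 0;
}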
      
      static unsigned long compact_zone_order(struct zone *zone, int order,
                      gfp_t gfp_mask, enum migrate_mode mode, int *contended,
                      int alloc_flags, int classzone_idx)
      {
              unsigned long ret;
    1         struct compact_control cc = {
                      .nr_freepages = 0,
                      .nr_migratepages = 0,
                      .order = order,
                      .gfp_mask = gfp_mask,
                      .zone = zone,
                      .mode = mode,
                      .alloc_flags = alloc_flags,
                      .classzone_idx = classzone_idx,
              };
              INIT_LIST_HEAD(&cc.freepages);
              INIT_LIST_HEAD(&cc.migratepages);
      
              ret = compact_zone(zone, &cc);
      
              VM_BUG_ON(!list_empty(&cc.freepages));
    1         VM_BUG_ON(!list_empty(&cc.migratepages));
      
    1         *contended = cc.contended;
              return ret;
      }
      
      int sysctl_extfrag_threshold = 500;
      
      /**
       * try_to_compact_pages - Direct compact to satisfy a high-order allocation
       * @gfp_mask: The GFP mask of the current allocation
       * @order: The order of the current allocation
       * @alloc_flags: The allocation flags of the current allocation
       * @ac: The context of current allocation
       * @mode: The migration mode for async, sync light, or sync migration
       * @contended: Return value that determines if compaction was aborted due to
       *               need_resched() or lock contention
       *
       * This is the main entry point for direct page compaction.
       */
      unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                              int alloc_flags, const struct alloc_context *ac,
                              enum migrate_mode mode, int *contended)
      {
    1         int may_enter_fs = gfp_mask & __GFP_FS;
              int may_perform_io = gfp_mask & __GFP_IO;
              struct zoneref *z;
              struct zone *zone;
              int rc = COMPACT_DEFERRED;
              int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
      
    1         *contended = COMPACT_CONTENDED_NONE;
      
              /* Check if the GFP flags allow compaction */
    1         if (!order || !may_enter_fs || !may_perform_io)
                      return COMPACT_SKIPPED;
      
    1         trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
      
              /* Compact each zone in the list */
    1         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                                                      ac->nodemask) {
                      int status;
                      int zone_contended;
      
    1                 if (compaction_deferred(zone, order))
                              continue;
      
    1                 status = compact_zone_order(zone, order, gfp_mask, mode,
                                      &zone_contended, alloc_flags,
                                      ac->classzone_idx);
                      rc = max(status, rc);
                      /*
                       * It takes at least one zone that wasn't lock contended
                       * to clear all_zones_contended.
                       */
                      all_zones_contended &= zone_contended;
      
                      /* If a normal allocation would succeed, stop compacting */
                      if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
                                              ac->classzone_idx, alloc_flags)) {
                              /*
                               * We think the allocation will succeed in this zone,
                               * but it is not certain, hence the false. The caller
                               * will repeat this with true if allocation indeed
                               * succeeds in this zone.
                               */
    1                         compaction_defer_reset(zone, order, false);
                              /*
                               * It is possible that async compaction aborted due to
                               * need_resched() and the watermarks were ok thanks to
                               * somebody else freeing memory. The allocation can
                               * however still fail so we better signal the
                               * need_resched() contention anyway (this will not
                               * prevent the allocation attempt).
                               */
                              if (zone_contended == COMPACT_CONTENDED_SCHED)
                                      *contended = COMPACT_CONTENDED_SCHED;
      
                              goto break_loop;
                      }
      
                      if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
                              /*
                               * We think that allocation won't succeed in this zone
                               * so we defer compaction there. If it ends up
                               * succeeding after all, it will be reset.
                               */
                              defer_compaction(zone, order);
                      }
      
                      /*
                       * We might have stopped compacting due to need_resched() in
                        * async compaction, or because a fatal signal was detected. In that
                       * case do not try further zones and signal need_resched()
                       * contention.
                       */
                      if ((zone_contended == COMPACT_CONTENDED_SCHED)
                                              || fatal_signal_pending(current)) {
                              *contended = COMPACT_CONTENDED_SCHED;
                              goto break_loop;
                      }
      
                      continue;
      break_loop:
                      /*
                        * We might not have tried all the zones, so be conservative
                       * and assume they are not all lock contended.
                       */
                      all_zones_contended = 0;
    1                 break;
              }
      
              /*
               * If at least one zone wasn't deferred or skipped, we report if all
               * zones that were tried were lock contended.
               */
              if (rc > COMPACT_SKIPPED && all_zones_contended)
                      *contended = COMPACT_CONTENDED_LOCK;
      
              return rc;
      }
      
      
      /* Compact all zones within a node */
      static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
      {
              int zoneid;
              struct zone *zone;
      
              for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
      
                      zone = &pgdat->node_zones[zoneid];
                      if (!populated_zone(zone))
                              continue;
      
                      cc->nr_freepages = 0;
                      cc->nr_migratepages = 0;
                      cc->zone = zone;
                      INIT_LIST_HEAD(&cc->freepages);
                      INIT_LIST_HEAD(&cc->migratepages);
      
                      /*
                       * When called via /proc/sys/vm/compact_memory
                       * this makes sure we compact the whole zone regardless of
                       * cached scanner positions.
                       */
                      if (is_via_compact_memory(cc->order))
                              __reset_isolation_suitable(zone);
      
                      if (is_via_compact_memory(cc->order) ||
                                      !compaction_deferred(zone, cc->order))
                              compact_zone(zone, cc);
      
                      if (cc->order > 0) {
                              if (zone_watermark_ok(zone, cc->order,
                                                      low_wmark_pages(zone), 0, 0))
                                      compaction_defer_reset(zone, cc->order, false);
                      }
      
                      VM_BUG_ON(!list_empty(&cc->freepages));
                      VM_BUG_ON(!list_empty(&cc->migratepages));
              }
      }
      
      void compact_pgdat(pg_data_t *pgdat, int order)
      {
              struct compact_control cc = {
                      .order = order,
                      .mode = MIGRATE_ASYNC,
              };
      
              if (!order)
                      return;
      
              __compact_pgdat(pgdat, &cc);
      }
      
      static void compact_node(int nid)
      {
              struct compact_control cc = {
                      .order = -1,
                      .mode = MIGRATE_SYNC,
                      .ignore_skip_hint = true,
              };
      
              __compact_pgdat(NODE_DATA(nid), &cc);
      }
      
      /* Compact all nodes in the system */
      static void compact_nodes(void)
      {
              int nid;
      
              /* Flush pending updates to the LRU lists */
              lru_add_drain_all();
      
              for_each_online_node(nid)
                      compact_node(nid);
      }
      
      /* The written value is actually unused, all memory is compacted */
      int sysctl_compact_memory;
      
      /* This is the entry point for compacting all nodes via /proc/sys/vm */
      int sysctl_compaction_handler(struct ctl_table *table, int write,
                              void __user *buffer, size_t *length, loff_t *ppos)
      {
              if (write)
                      compact_nodes();
      
              return 0;
      }
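
/*
 * Illustrative userspace sketch (not part of this file): triggering the
 * handler above by writing to /proc/sys/vm/compact_memory.  As noted above,
 * the written value is ignored; the write itself compacts all nodes.
 * Requires root.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/vm/compact_memory", "w");

        if (!f) {
                perror("/proc/sys/vm/compact_memory");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}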
      
      int sysctl_extfrag_handler(struct ctl_table *table, int write,
                              void __user *buffer, size_t *length, loff_t *ppos)
      {
              proc_dointvec_minmax(table, write, buffer, length, ppos);
      
              return 0;
      }
      
      #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
      static ssize_t sysfs_compact_node(struct device *dev,
                              struct device_attribute *attr,
                              const char *buf, size_t count)
      {
              int nid = dev->id;
      
              if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
                      /* Flush pending updates to the LRU lists */
                      lru_add_drain_all();
      
                      compact_node(nid);
              }
      
              return count;
      }
      static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
      
      int compaction_register_node(struct node *node)
      {
              return device_create_file(&node->dev, &dev_attr_compact);
      }
      
      void compaction_unregister_node(struct node *node)
      {
              return device_remove_file(&node->dev, &dev_attr_compact);
      }
      #endif /* CONFIG_SYSFS && CONFIG_NUMA */
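
/*
 * Illustrative userspace sketch (not part of this file): the per-node
 * "compact" attribute registered above is typically exposed as
 * /sys/devices/system/node/node<N>/compact; writing anything to it compacts
 * just that node, unlike the global /proc/sys/vm/compact_memory knob.
 * Requires root and a NUMA-enabled kernel.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/devices/system/node/node0/compact", "w");

        if (!f) {
                perror("node0/compact");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}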
      
      #endif /* CONFIG_COMPACTION */
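
/*
 * The remainder of this listing appears to come from a different source
 * file, net/core/net_namespace.c, concatenated after the compaction code
 * above.
 */
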
      #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      
      #include <linux/workqueue.h>
      #include <linux/rtnetlink.h>
      #include <linux/cache.h>
      #include <linux/slab.h>
      #include <linux/list.h>
      #include <linux/delay.h>
      #include <linux/sched.h>
      #include <linux/idr.h>
      #include <linux/rculist.h>
      #include <linux/nsproxy.h>
      #include <linux/fs.h>
      #include <linux/proc_ns.h>
      #include <linux/file.h>
      #include <linux/export.h>
      #include <linux/user_namespace.h>
      #include <linux/net_namespace.h>
      #include <net/sock.h>
      #include <net/netlink.h>
      #include <net/net_namespace.h>
      #include <net/netns/generic.h>
      
      /*
       *        Our network namespace constructor/destructor lists
       */
      
      static LIST_HEAD(pernet_list);
      static struct list_head *first_device = &pernet_list;
      DEFINE_MUTEX(net_mutex);
      
      LIST_HEAD(net_namespace_list);
      EXPORT_SYMBOL_GPL(net_namespace_list);
      
      struct net init_net = {
              .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
      };
      EXPORT_SYMBOL(init_net);
      
      #define INITIAL_NET_GEN_PTRS        13 /* +1 for len +2 for rcu_head */
      
      static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
      
      static struct net_generic *net_alloc_generic(void)
      {
              struct net_generic *ng;
   80         size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
      
              ng = kzalloc(generic_size, GFP_KERNEL);
              if (ng)
   80                 ng->len = max_gen_ptrs;
      
   80         return ng;
      }
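
/*
 * Illustrative userspace sketch (not part of this file): sizing a structure
 * with a trailing array via offsetof(), as net_alloc_generic() does above.
 * The struct here is a stand-in, not the kernel's struct net_generic, and
 * the variable array index relies on the GNU offsetof extension that the
 * kernel itself uses.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

struct generic {
        unsigned int len;
        void *ptr[];		/* flexible array member */
};

int main(void)
{
        unsigned int n = 13;
        size_t size = offsetof(struct generic, ptr[n]);
        struct generic *g = calloc(1, size);

        if (!g)
                return 1;
        g->len = n;
        printf("allocated %zu bytes for %u slots\n", size, g->len);
        free(g);
        return 0;
}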
      
      static int net_assign_generic(struct net *net, int id, void *data)
      {
              struct net_generic *ng, *old_ng;
      
              BUG_ON(!mutex_is_locked(&net_mutex));
   39         BUG_ON(id == 0);
      
   39         old_ng = rcu_dereference_protected(net->gen,
                                                 lockdep_is_held(&net_mutex));
              ng = old_ng;
              if (old_ng->len >= id)
                      goto assign;
      
              ng = net_alloc_generic();
              if (ng == NULL)
                      return -ENOMEM;
      
              /*
               * Some synchronisation notes:
               *
                * net_generic() explores the net->gen array inside an rcu
                * read-side section. Besides, once set, the net->gen->ptr[x]
               * pointer never changes (see rules in netns/generic.h).
               *
               * That said, we simply duplicate this array and schedule
               * the old copy for kfree after a grace period.
               */
      
              memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
      
              rcu_assign_pointer(net->gen, ng);
              kfree_rcu(old_ng, rcu);
      assign:
   39         ng->ptr[id - 1] = data;
              return 0;
      }
      
      static int ops_init(const struct pernet_operations *ops, struct net *net)
      {
              int err = -ENOMEM;
              void *data = NULL;
      
   39         if (ops->id && ops->size) {
   39                 data = kzalloc(ops->size, GFP_KERNEL);
                      if (!data)
                              goto out;
      
   39                 err = net_assign_generic(net, *ops->id, data);
                      if (err)
                              goto cleanup;
              }
              err = 0;
   39         if (ops->init)
   39                 err = ops->init(net);
              if (!err)
                      return 0;
      
      cleanup:
   39         kfree(data);
      
      out:
              return err;
      }
      
      static void ops_free(const struct pernet_operations *ops, struct net *net)
      {
              if (ops->id && ops->size) {
                      int id = *ops->id;
                      kfree(net_generic(net, id));
              }
      }
      
      static void ops_exit_list(const struct pernet_operations *ops,
                                struct list_head *net_exit_list)
      {
              struct net *net;
              if (ops->exit) {
                      list_for_each_entry(net, net_exit_list, exit_list)
                              ops->exit(net);
              }
              if (ops->exit_batch)
                      ops->exit_batch(net_exit_list);
      }
      
      static void ops_free_list(const struct pernet_operations *ops,
                                struct list_head *net_exit_list)
      {
              struct net *net;
              if (ops->size && ops->id) {
                      list_for_each_entry(net, net_exit_list, exit_list)
                              ops_free(ops, net);
              }
      }
      
      /* should be called with nsid_lock held */
      static int alloc_netid(struct net *net, struct net *peer, int reqid)
      {
              int min = 0, max = 0;
      
              if (reqid >= 0) {
                      min = reqid;
                      max = reqid + 1;
              }
      
              return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
      }
      
      /* This function is used by idr_for_each(). If net is equal to peer, the
        * function returns the id so that idr_for_each() stops. Because we cannot
        * return the id 0 (idr_for_each() would not stop), we return the magic value
       * NET_ID_ZERO (-1) for it.
       */
      #define NET_ID_ZERO -1
      static int net_eq_idr(int id, void *net, void *peer)
      {
    6         if (net_eq(net, peer))
    4                 return id ? : NET_ID_ZERO;
              return 0;
      }
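
/*
 * Illustrative userspace sketch (not part of this file): the "id ? : x"
 * form used above is the GNU C conditional-with-omitted-operand extension;
 * it evaluates to id when id is non-zero and to x otherwise, which is how
 * id 0 gets mapped to the NET_ID_ZERO marker.  Builds with gcc or clang.
 */
#include <stdio.h>

#define NET_ID_ZERO -1

int main(void)
{
        int id = 0;

        printf("%d\n", id ? : NET_ID_ZERO);	/* prints -1 */
        id = 7;
        printf("%d\n", id ? : NET_ID_ZERO);	/* prints 7 */
        return 0;
}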
      
      /* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
       * is set to true, thus the caller knows that the new id must be notified via
       * rtnl.
       */
      static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
      {
  350         int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
              bool alloc_it = *alloc;
      
              *alloc = false;
      
              /* Magic value for id 0. */
              if (id == NET_ID_ZERO)
                      return 0;
  348         if (id > 0)
                      return id;
      
  346         if (alloc_it) {
                      id = alloc_netid(net, peer, -1);
                      *alloc = true;
  350                 return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
              }
      
              return NETNSA_NSID_NOT_ASSIGNED;
      }
      
      /* should be called with nsid_lock held */
      static int __peernet2id(struct net *net, struct net *peer)
      {
              bool no = false;
      
              return __peernet2id_alloc(net, peer, &no);
      }
      
      static void rtnl_net_notifyid(struct net *net, int cmd, int id);
      /* This function returns the id of a peer netns. If no id is assigned, one will
       * be allocated and returned.
       */
      int peernet2id_alloc(struct net *net, struct net *peer)
      {
              unsigned long flags;
              bool alloc;
              int id;
      
              if (atomic_read(&net->count) == 0)
                      return NETNSA_NSID_NOT_ASSIGNED;
              spin_lock_irqsave(&net->nsid_lock, flags);
              alloc = atomic_read(&peer->count) == 0 ? false : true;
              id = __peernet2id_alloc(net, peer, &alloc);
              spin_unlock_irqrestore(&net->nsid_lock, flags);
              if (alloc && id >= 0)
                      rtnl_net_notifyid(net, RTM_NEWNSID, id);
              return id;
      }
      EXPORT_SYMBOL(peernet2id_alloc);
      
      /* This function returns, if assigned, the id of a peer netns. */
      int peernet2id(struct net *net, struct net *peer)
      {
              unsigned long flags;
              int id;
      
  348         spin_lock_irqsave(&net->nsid_lock, flags);
              id = __peernet2id(net, peer);
              spin_unlock_irqrestore(&net->nsid_lock, flags);
              return id;
      }
      
       /* This function returns true if the peer netns has an id assigned in the
       * current netns.
       */
      bool peernet_has_id(struct net *net, struct net *peer)
      {
   11         return peernet2id(net, peer) >= 0;
      }
      
      struct net *get_net_ns_by_id(struct net *net, int id)
      {
              unsigned long flags;
              struct net *peer;
      
              if (id < 0)
                      return NULL;
      
              rcu_read_lock();
              spin_lock_irqsave(&net->nsid_lock, flags);
              peer = idr_find(&net->netns_ids, id);
              if (peer)
                      peer = maybe_get_net(peer);
              spin_unlock_irqrestore(&net->nsid_lock, flags);
              rcu_read_unlock();
      
              return peer;
      }
      
      /*
       * setup_net runs the initializers for the network namespace object.
       */
      static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
      {
              /* Must be called with net_mutex held */
              const struct pernet_operations *ops, *saved_ops;
              int error = 0;
   39         LIST_HEAD(net_exit_list);
      
              atomic_set(&net->count, 1);
              atomic_set(&net->passive, 1);
              get_random_bytes(&net->hash_mix, sizeof(u32));
              net->dev_base_seq = 1;
              net->user_ns = user_ns;
              idr_init(&net->netns_ids);
              spin_lock_init(&net->nsid_lock);
   39 
   39         list_for_each_entry(ops, &pernet_list, list) {
                      error = ops_init(ops, net);
                      if (error < 0)
                              goto out_undo;
              }
      out:
              return error;
      
      out_undo:
              /* Walk through the list backwards calling the exit functions
               * for the pernet modules whose init functions did not fail.
               */
              list_add(&net->exit_list, &net_exit_list);
              saved_ops = ops;
              list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                      ops_exit_list(ops, &net_exit_list);
      
              ops = saved_ops;
              list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                      ops_free_list(ops, &net_exit_list);
      
              rcu_barrier();
              goto out;
      }
      
      static int __net_init net_defaults_init_net(struct net *net)
   39 {
              net->core.sysctl_somaxconn = SOMAXCONN;
              return 0;
      }
      
      static struct pernet_operations net_defaults_ops = {
              .init = net_defaults_init_net,
      };
      
      static __init int net_defaults_init(void)
      {
              if (register_pernet_subsys(&net_defaults_ops))
                      panic("Cannot initialize net default settings");
      
              return 0;
      }
      
      core_initcall(net_defaults_init);
      
      #ifdef CONFIG_NET_NS
      static struct kmem_cache *net_cachep;
      static struct workqueue_struct *netns_wq;
      
      static struct net *net_alloc(void)
      {
              struct net *net = NULL;
              struct net_generic *ng;
   80 
              ng = net_alloc_generic();
              if (!ng)
                      goto out;
   80 
              net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
              if (!net)
                      goto out_free;
   80 
              rcu_assign_pointer(net->gen, ng);
      out:
              return net;
      
      out_free:
              kfree(ng);
              goto out;
      }
      
      static void net_free(struct net *net)
      {
              kfree(rcu_access_pointer(net->gen));
              kmem_cache_free(net_cachep, net);
      }
      
      void net_drop_ns(void *p)
      {
   12         struct net *ns = p;
              if (ns && atomic_dec_and_test(&ns->passive))
   12                 net_free(ns);
      }
      
      struct net *copy_net_ns(unsigned long flags,
                              struct user_namespace *user_ns, struct net *old_net)
      {
              struct net *net;
              int rv;
  145 
   66         if (!(flags & CLONE_NEWNET))
                      return get_net(old_net);
   80 
              net = net_alloc();
              if (!net)
                      return ERR_PTR(-ENOMEM);
   80 
              get_user_ns(user_ns);
   80 
              mutex_lock(&net_mutex);
              rv = setup_net(net, user_ns);
              if (rv == 0) {
                      rtnl_lock();
                      list_add_tail_rcu(&net->list, &net_namespace_list);
                      rtnl_unlock();
              }
              mutex_unlock(&net_mutex);
              if (rv < 0) {
                      put_user_ns(user_ns);
                      net_drop_ns(net);
                      return ERR_PTR(rv);
              }
              return net;
      }
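
/*
 * Illustrative userspace sketch (not part of this file): the CLONE_NEWNET
 * path through copy_net_ns() above is what unshare(2) with CLONE_NEWNET
 * ultimately exercises.  Requires CAP_SYS_ADMIN.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        if (unshare(CLONE_NEWNET) < 0) {
                perror("unshare(CLONE_NEWNET)");
                return 1;
        }
        /* The calling process now runs in a freshly set-up net namespace. */
        printf("new network namespace created\n");
        return 0;
}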
      
      static DEFINE_SPINLOCK(cleanup_list_lock);
      static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
      
      static void cleanup_net(struct work_struct *work)
      {
              const struct pernet_operations *ops;
              struct net *net, *tmp;
              struct list_head net_kill_list;
              LIST_HEAD(net_exit_list);
      
              /* Atomically snapshot the list of namespaces to cleanup */
              spin_lock_irq(&cleanup_list_lock);
              list_replace_init(&cleanup_list, &net_kill_list);
              spin_unlock_irq(&cleanup_list_lock);
      
              mutex_lock(&net_mutex);
      
              /* Don't let anyone else find us. */
              rtnl_lock();
              list_for_each_entry(net, &net_kill_list, cleanup_list) {
                      list_del_rcu(&net->list);
                      list_add_tail(&net->exit_list, &net_exit_list);
                      for_each_net(tmp) {
                              int id;
      
                              spin_lock_irq(&tmp->nsid_lock);
                              id = __peernet2id(tmp, net);
                              if (id >= 0)
                                      idr_remove(&tmp->netns_ids, id);
                              spin_unlock_irq(&tmp->nsid_lock);
                              if (id >= 0)
                                      rtnl_net_notifyid(tmp, RTM_DELNSID, id);
                      }
                      spin_lock_irq(&net->nsid_lock);
                      idr_destroy(&net->netns_ids);
                      spin_unlock_irq(&net->nsid_lock);
      
              }
              rtnl_unlock();
      
              /*
         * Another CPU might be rcu-iterating the list; wait for it.
         * This needs to happen before calling the exit() notifiers, so
         * the rcu_barrier() below isn't sufficient on its own.
               */
              synchronize_rcu();
      
              /* Run all of the network namespace exit methods */
              list_for_each_entry_reverse(ops, &pernet_list, list)
                      ops_exit_list(ops, &net_exit_list);
      
              /* Free the net generic variables */
              list_for_each_entry_reverse(ops, &pernet_list, list)
                      ops_free_list(ops, &net_exit_list);
      
              mutex_unlock(&net_mutex);
      
              /* Ensure there are no outstanding rcu callbacks using this
               * network namespace.
               */
              rcu_barrier();
      
              /* Finally it is safe to free my network namespace structure */
              list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                      list_del_init(&net->exit_list);
                      put_user_ns(net->user_ns);
                      net_drop_ns(net);
              }
      }
      static DECLARE_WORK(net_cleanup_work, cleanup_net);
      
      void __put_net(struct net *net)
      {
              /* Cleanup the network namespace in process context */
              unsigned long flags;
      
              spin_lock_irqsave(&cleanup_list_lock, flags);
              list_add(&net->cleanup_list, &cleanup_list);
              spin_unlock_irqrestore(&cleanup_list_lock, flags);
      
              queue_work(netns_wq, &net_cleanup_work);
      }
      EXPORT_SYMBOL_GPL(__put_net);
      
      struct net *get_net_ns_by_fd(int fd)
      {
              struct file *file;
              struct ns_common *ns;
        struct net *net;

        file = proc_ns_fget(fd);
        if (IS_ERR(file))
                return ERR_CAST(file);

        ns = get_proc_ns(file_inode(file));
        if (ns->ops == &netns_operations)
                net = get_net(container_of(ns, struct net, ns));
        else
                net = ERR_PTR(-EINVAL);

        fput(file);
        return net;
      }
      
      #else
      struct net *get_net_ns_by_fd(int fd)
      {
              return ERR_PTR(-EINVAL);
      }
      #endif
      EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
      
      struct net *get_net_ns_by_pid(pid_t pid)
      {
              struct task_struct *tsk;
              struct net *net;
      
              /* Lookup the network namespace */
        net = ERR_PTR(-ESRCH);
        rcu_read_lock();
        tsk = find_task_by_vpid(pid);
        if (tsk) {
                struct nsproxy *nsproxy;

                task_lock(tsk);
                nsproxy = tsk->nsproxy;
                if (nsproxy)
                        net = get_net(nsproxy->net_ns);
                task_unlock(tsk);
        }
              rcu_read_unlock();
              return net;
      }
      EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
      
      static __net_init int net_ns_net_init(struct net *net)
      {
#ifdef CONFIG_NET_NS
              net->ns.ops = &netns_operations;
      #endif
              return ns_alloc_inum(&net->ns);
      }
      
      static __net_exit void net_ns_net_exit(struct net *net)
      {
              ns_free_inum(&net->ns);
      }
      
      static struct pernet_operations __net_initdata net_ns_ops = {
              .init = net_ns_net_init,
              .exit = net_ns_net_exit,
      };
      
      static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
              [NETNSA_NONE]                = { .type = NLA_UNSPEC },
              [NETNSA_NSID]                = { .type = NLA_S32 },
              [NETNSA_PID]                = { .type = NLA_U32 },
              [NETNSA_FD]                = { .type = NLA_U32 },
      };
      
      static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
              struct net *net = sock_net(skb->sk);
              struct nlattr *tb[NETNSA_MAX + 1];
              unsigned long flags;
              struct net *peer;
        int nsid, err;

        err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
                          rtnl_net_policy);
        if (err < 0)
                return err;
        if (!tb[NETNSA_NSID])
                return -EINVAL;
        nsid = nla_get_s32(tb[NETNSA_NSID]);

        if (tb[NETNSA_PID])
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
        else if (tb[NETNSA_FD])
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
        else
                return -EINVAL;
        if (IS_ERR(peer))
                return PTR_ERR(peer);

        spin_lock_irqsave(&net->nsid_lock, flags);
        if (__peernet2id(net, peer) >= 0) {
                      spin_unlock_irqrestore(&net->nsid_lock, flags);
                      err = -EEXIST;
                      goto out;
              }
      
              err = alloc_netid(net, peer, nsid);
              spin_unlock_irqrestore(&net->nsid_lock, flags);
              if (err >= 0) {
                      rtnl_net_notifyid(net, RTM_NEWNSID, err);
                      err = 0;
              }
out:
              put_net(peer);
              return err;
      }
      
      static int rtnl_net_get_size(void)
      {
              return NLMSG_ALIGN(sizeof(struct rtgenmsg))
                     + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
                     ;
      }
      
      static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
                               int cmd, struct net *net, int nsid)
      {
              struct nlmsghdr *nlh;
        struct rtgenmsg *rth;

        nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
        if (!nlh)
                return -EMSGSIZE;

        rth = nlmsg_data(nlh);
        rth->rtgen_family = AF_UNSPEC;

        if (nla_put_s32(skb, NETNSA_NSID, nsid))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
              return 0;
      
      nla_put_failure:
              nlmsg_cancel(skb, nlh);
              return -EMSGSIZE;
      }
      
      static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
{
              struct net *net = sock_net(skb->sk);
              struct nlattr *tb[NETNSA_MAX + 1];
              struct sk_buff *msg;
              struct net *peer;
        int err, id;

        err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
                          rtnl_net_policy);
        if (err < 0)
                return err;
        if (tb[NETNSA_PID])
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
        else if (tb[NETNSA_FD])
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
        else
                return -EINVAL;

        if (IS_ERR(peer))
                return PTR_ERR(peer);

        msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
              if (!msg) {
                      err = -ENOMEM;
                      goto out;
        }

        id = peernet2id(net, peer);
              err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
                                  RTM_NEWNSID, net, id);
              if (err < 0)
                goto err_out;

        err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
              goto out;
      
      err_out:
              nlmsg_free(msg);
out:
              put_net(peer);
              return err;
      }
      
      struct rtnl_net_dump_cb {
              struct net *net;
              struct sk_buff *skb;
              struct netlink_callback *cb;
              int idx;
              int s_idx;
      };
      
      static int rtnl_net_dumpid_one(int id, void *peer, void *data)
      {
              struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
              int ret;
      
              if (net_cb->idx < net_cb->s_idx)
                      goto cont;
      
              ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
                                  net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                  RTM_NEWNSID, net_cb->net, id);
              if (ret < 0)
                      return ret;
      
      cont:
              net_cb->idx++;
              return 0;
      }
      
      static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
              struct net *net = sock_net(skb->sk);
              struct rtnl_net_dump_cb net_cb = {
                      .net = net,
                      .skb = skb,
                      .cb = cb,
                      .idx = 0,
                      .s_idx = cb->args[0],
              };
              unsigned long flags;
      
              spin_lock_irqsave(&net->nsid_lock, flags);
              idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
              spin_unlock_irqrestore(&net->nsid_lock, flags);
      
              cb->args[0] = net_cb.idx;
              return skb->len;
      }
      
      static void rtnl_net_notifyid(struct net *net, int cmd, int id)
      {
              struct sk_buff *msg;
              int err = -ENOMEM;
      
              msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
              if (!msg)
                      goto out;
      
              err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
              if (err < 0)
                      goto err_out;
      
              rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
              return;
      
      err_out:
              nlmsg_free(msg);
      out:
              rtnl_set_sk_err(net, RTNLGRP_NSID, err);
      }
      
      static int __init net_ns_init(void)
      {
              struct net_generic *ng;
      
      #ifdef CONFIG_NET_NS
              net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
                                              SMP_CACHE_BYTES,
                                              SLAB_PANIC, NULL);
      
              /* Create workqueue for cleanup */
              netns_wq = create_singlethread_workqueue("netns");
              if (!netns_wq)
                      panic("Could not create netns workq");
      #endif
      
              ng = net_alloc_generic();
              if (!ng)
                      panic("Could not allocate generic netns");
      
              rcu_assign_pointer(init_net.gen, ng);
      
              mutex_lock(&net_mutex);
              if (setup_net(&init_net, &init_user_ns))
                      panic("Could not setup the initial network namespace");
      
              rtnl_lock();
              list_add_tail_rcu(&init_net.list, &net_namespace_list);
              rtnl_unlock();
      
              mutex_unlock(&net_mutex);
      
              register_pernet_subsys(&net_ns_ops);
      
              rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
              rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
                            NULL);
      
              return 0;
      }
      
      pure_initcall(net_ns_init);
      
      #ifdef CONFIG_NET_NS
      static int __register_pernet_operations(struct list_head *list,
                                              struct pernet_operations *ops)
      {
              struct net *net;
              int error;
              LIST_HEAD(net_exit_list);
      
              list_add_tail(&ops->list, list);
              if (ops->init || (ops->id && ops->size)) {
                      for_each_net(net) {
                              error = ops_init(ops, net);
                              if (error)
                                      goto out_undo;
                              list_add_tail(&net->exit_list, &net_exit_list);
                      }
              }
              return 0;
      
      out_undo:
        /* If an error occurred, clean up all the namespaces we initialized */
              list_del(&ops->list);
              ops_exit_list(ops, &net_exit_list);
              ops_free_list(ops, &net_exit_list);
              return error;
      }
      
      static void __unregister_pernet_operations(struct pernet_operations *ops)
      {
              struct net *net;
              LIST_HEAD(net_exit_list);
      
              list_del(&ops->list);
              for_each_net(net)
                      list_add_tail(&net->exit_list, &net_exit_list);
              ops_exit_list(ops, &net_exit_list);
              ops_free_list(ops, &net_exit_list);
      }
      
      #else
      
      static int __register_pernet_operations(struct list_head *list,
                                              struct pernet_operations *ops)
      {
              return ops_init(ops, &init_net);
      }
      
      static void __unregister_pernet_operations(struct pernet_operations *ops)
      {
              LIST_HEAD(net_exit_list);
              list_add(&init_net.exit_list, &net_exit_list);
              ops_exit_list(ops, &net_exit_list);
              ops_free_list(ops, &net_exit_list);
      }
      
      #endif /* CONFIG_NET_NS */
      
      static DEFINE_IDA(net_generic_ids);
      
      static int register_pernet_operations(struct list_head *list,
                                            struct pernet_operations *ops)
      {
              int error;
      
              if (ops->id) {
      again:
                      error = ida_get_new_above(&net_generic_ids, 1, ops->id);
                      if (error < 0) {
                              if (error == -EAGAIN) {
                                      ida_pre_get(&net_generic_ids, GFP_KERNEL);
                                      goto again;
                              }
                              return error;
                      }
                      max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
              }
              error = __register_pernet_operations(list, ops);
              if (error) {
                      rcu_barrier();
                      if (ops->id)
                              ida_remove(&net_generic_ids, *ops->id);
              }
      
              return error;
      }
      
      static void unregister_pernet_operations(struct pernet_operations *ops)
{
        __unregister_pernet_operations(ops);
              rcu_barrier();
              if (ops->id)
                      ida_remove(&net_generic_ids, *ops->id);
      }
      
      /**
       *      register_pernet_subsys - register a network namespace subsystem
       *        @ops:  pernet operations structure for the subsystem
       *
       *        Register a subsystem which has init and exit functions
       *        that are called when network namespaces are created and
       *        destroyed respectively.
       *
 *        When registered, all network namespace init functions are
 *        called for every existing network namespace, allowing kernel
 *        modules to have a race-free view of the set of network namespaces.
       *
       *        When a new network namespace is created all of the init
       *        methods are called in the order in which they were registered.
       *
       *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order in which they were
       *        registered.
       */
      int register_pernet_subsys(struct pernet_operations *ops)
      {
              int error;
              mutex_lock(&net_mutex);
        error = register_pernet_operations(first_device, ops);
              mutex_unlock(&net_mutex);
              return error;
      }
      EXPORT_SYMBOL_GPL(register_pernet_subsys);
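/*
 * Usage sketch (illustrative, hypothetical names - not part of this file):
 * a subsystem that keeps private per-namespace state.  Setting ->id and
 * ->size asks the pernet machinery to allocate an id at registration time
 * and to zero-allocate ->size bytes for each namespace before ->init runs;
 * net_generic() (from <net/netns/generic.h>) then returns that pointer.
 *
 *	struct example_pernet {
 *		unsigned long packets_seen;
 *	};
 *	static int example_net_id __read_mostly;
 *
 *	static int __net_init example_net_init(struct net *net)
 *	{
 *		struct example_pernet *pn = net_generic(net, example_net_id);
 *
 *		pn->packets_seen = 0;	// storage preallocated via ->size
 *		return 0;
 *	}
 *
 *	static void __net_exit example_net_exit(struct net *net)
 *	{
 *		// release whatever example_net_init() acquired
 *	}
 *
 *	static struct pernet_operations example_net_ops = {
 *		.init = example_net_init,
 *		.exit = example_net_exit,
 *		.id   = &example_net_id,
 *		.size = sizeof(struct example_pernet),
 *	};
 *
 *	// registered once at boot/module init, torn down symmetrically:
 *	//	register_pernet_subsys(&example_net_ops);
 *	//	unregister_pernet_subsys(&example_net_ops);
 */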
      
      /**
       *      unregister_pernet_subsys - unregister a network namespace subsystem
       *        @ops: pernet operations structure to manipulate
       *
       *        Remove the pernet operations structure from the list to be
       *        used when network namespaces are created or destroyed.  In
       *        addition run the exit method for all existing network
       *        namespaces.
       */
      void unregister_pernet_subsys(struct pernet_operations *ops)
      {
              mutex_lock(&net_mutex);
              unregister_pernet_operations(ops);
              mutex_unlock(&net_mutex);
      }
      EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
      
      /**
       *      register_pernet_device - register a network namespace device
       *        @ops:  pernet operations structure for the subsystem
       *
       *        Register a device which has init and exit functions
       *        that are called when network namespaces are created and
       *        destroyed respectively.
       *
 *        When registered, all network namespace init functions are
 *        called for every existing network namespace, allowing kernel
 *        modules to have a race-free view of the set of network namespaces.
       *
       *        When a new network namespace is created all of the init
       *        methods are called in the order in which they were registered.
       *
       *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order in which they were
       *        registered.
       */
      int register_pernet_device(struct pernet_operations *ops)
      {
              int error;
              mutex_lock(&net_mutex);
              error = register_pernet_operations(&pernet_list, ops);
              if (!error && (first_device == &pernet_list))
                      first_device = &ops->list;
              mutex_unlock(&net_mutex);
              return error;
      }
      EXPORT_SYMBOL_GPL(register_pernet_device);
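/*
 * Usage sketch (illustrative, hypothetical names): per-namespace device
 * drivers typically create their devices in ->init and tear them down in
 * ->exit_batch, which ops_exit_list() invokes once with the whole list of
 * exiting namespaces, so the rtnl lock is taken once per batch rather than
 * once per namespace.
 *
 *	static __net_init int example_dev_init(struct net *net)
 *	{
 *		// register this namespace's virtual device(s) here
 *		return 0;
 *	}
 *
 *	static void __net_exit example_dev_exit_batch(struct list_head *net_list)
 *	{
 *		struct net *net;
 *
 *		rtnl_lock();
 *		list_for_each_entry(net, net_list, exit_list) {
 *			// unregister this namespace's devices
 *		}
 *		rtnl_unlock();
 *	}
 *
 *	static struct pernet_operations example_dev_ops = {
 *		.init	    = example_dev_init,
 *		.exit_batch = example_dev_exit_batch,
 *	};
 *
 *	//	register_pernet_device(&example_dev_ops);
 */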
      
      /**
       *      unregister_pernet_device - unregister a network namespace netdevice
       *        @ops: pernet operations structure to manipulate
       *
       *        Remove the pernet operations structure from the list to be
       *        used when network namespaces are created or destroyed.  In
       *        addition run the exit method for all existing network
       *        namespaces.
       */
      void unregister_pernet_device(struct pernet_operations *ops)
      {
              mutex_lock(&net_mutex);
              if (&ops->list == first_device)
                      first_device = first_device->next;
              unregister_pernet_operations(ops);
              mutex_unlock(&net_mutex);
      }
      EXPORT_SYMBOL_GPL(unregister_pernet_device);
      
      #ifdef CONFIG_NET_NS
      static struct ns_common *netns_get(struct task_struct *task)
      {
              struct net *net = NULL;
        struct nsproxy *nsproxy;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (nsproxy)
                net = get_net(nsproxy->net_ns);
        task_unlock(task);

        return net ? &net->ns : NULL;
      }
      
      static inline struct net *to_net_ns(struct ns_common *ns)
{
              return container_of(ns, struct net, ns);
      }
      
      static void netns_put(struct ns_common *ns)
{
        put_net(to_net_ns(ns));
      }
      
      static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
        struct net *net = to_net_ns(ns);

        if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
                return -EPERM;

        put_net(nsproxy->net_ns);
        nsproxy->net_ns = get_net(net);
              return 0;
      }
      
      const struct proc_ns_operations netns_operations = {
              .name                = "net",
              .type                = CLONE_NEWNET,
              .get                = netns_get,
              .put                = netns_put,
              .install        = netns_install,
      };
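/*
 * Userspace-facing sketch (illustrative, not kernel code): setns(2) on an
 * fd for /proc/<pid>/ns/net ends up in netns_install() above, while the
 * /proc namespace file itself is pinned via netns_get()/netns_put().
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	static int join_net_ns(pid_t pid)
 *	{
 *		char path[64];
 *		int fd, ret;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/ns/net", (int)pid);
 *		fd = open(path, O_RDONLY);
 *		if (fd < 0)
 *			return -1;
 *		ret = setns(fd, CLONE_NEWNET);	// reaches netns_install()
 *		close(fd);
 *		return ret;
 *	}
 */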
      #endif
      /*
       * Virtio PCI driver - common functionality for all device versions
       *
       * This module allows virtio devices to be used over a virtual PCI device.
       * This can be used with QEMU based VMMs like KVM or Xen.
       *
       * Copyright IBM Corp. 2007
       * Copyright Red Hat, Inc. 2014
       *
       * Authors:
       *  Anthony Liguori  <aliguori@us.ibm.com>
       *  Rusty Russell <rusty@rustcorp.com.au>
       *  Michael S. Tsirkin <mst@redhat.com>
       *
       * This work is licensed under the terms of the GNU GPL, version 2 or later.
       * See the COPYING file in the top-level directory.
       *
       */
      
      #include "virtio_pci_common.h"
      
      static bool force_legacy = false;
      
      #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
      module_param(force_legacy, bool, 0444);
      MODULE_PARM_DESC(force_legacy,
                       "Force legacy mode for transitional virtio 1 devices");
      #endif
      
      /* wait for pending irq handlers */
      void vp_synchronize_vectors(struct virtio_device *vdev)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              int i;
      
              if (vp_dev->intx_enabled)
                      synchronize_irq(vp_dev->pci_dev->irq);
      
              for (i = 0; i < vp_dev->msix_vectors; ++i)
                      synchronize_irq(vp_dev->msix_entries[i].vector);
      }
      
      /* the notify function used when creating a virt queue */
      bool vp_notify(struct virtqueue *vq)
      {
              /* we write the queue's selector into the notification register to
               * signal the other end */
        iowrite16(vq->index, (void __iomem *)vq->priv);
              return true;
      }
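/* Where vq->priv points is transport-specific; roughly, and shown only for
 * illustration (field names as in the legacy/modern transport code, 'off'
 * being the device-provided queue_notify_off):
 *
 *	// legacy device: the notify register lives in the I/O BAR
 *	vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
 *
 *	// modern device: per-queue offset inside the mapped notify area
 *	vq->priv = (void __force *)vp_dev->notify_base +
 *		   off * vp_dev->notify_offset_multiplier;
 */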
      
      /* Handle a configuration change: Tell driver if it wants to know. */
      static irqreturn_t vp_config_changed(int irq, void *opaque)
      {
              struct virtio_pci_device *vp_dev = opaque;
      
              virtio_config_changed(&vp_dev->vdev);
              return IRQ_HANDLED;
      }
      
      /* Notify all virtqueues on an interrupt. */
      static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
      {
              struct virtio_pci_device *vp_dev = opaque;
              struct virtio_pci_vq_info *info;
              irqreturn_t ret = IRQ_NONE;
              unsigned long flags;
      
              spin_lock_irqsave(&vp_dev->lock, flags);
              list_for_each_entry(info, &vp_dev->virtqueues, node) {
                      if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
                              ret = IRQ_HANDLED;
              }
              spin_unlock_irqrestore(&vp_dev->lock, flags);
      
              return ret;
      }
      
/* A small wrapper to also acknowledge the interrupt when it's handled.
 * I really need an EIO hook for the vring so I can ack the interrupt once we
 * know that we'll be handling the IRQ, but before we invoke the callback,
 * since the callback may notify the host, which results in the host
 * attempting to raise an interrupt that we would then mask once we
 * acknowledged the interrupt. */
      static irqreturn_t vp_interrupt(int irq, void *opaque)
      {
              struct virtio_pci_device *vp_dev = opaque;
              u8 isr;
      
              /* reading the ISR has the effect of also clearing it so it's very
               * important to save off the value. */
              isr = ioread8(vp_dev->isr);
      
              /* It's definitely not us if the ISR was not high */
              if (!isr)
                      return IRQ_NONE;
      
              /* Configuration change?  Tell driver if it wants to know. */
              if (isr & VIRTIO_PCI_ISR_CONFIG)
                      vp_config_changed(irq, opaque);
      
              return vp_vring_interrupt(irq, opaque);
      }
      
      static void vp_free_vectors(struct virtio_device *vdev)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              int i;
      
              if (vp_dev->intx_enabled) {
                      free_irq(vp_dev->pci_dev->irq, vp_dev);
                      vp_dev->intx_enabled = 0;
              }
      
              for (i = 0; i < vp_dev->msix_used_vectors; ++i)
                      free_irq(vp_dev->msix_entries[i].vector, vp_dev);
      
              for (i = 0; i < vp_dev->msix_vectors; i++)
                      if (vp_dev->msix_affinity_masks[i])
                              free_cpumask_var(vp_dev->msix_affinity_masks[i]);
      
              if (vp_dev->msix_enabled) {
                      /* Disable the vector used for configuration */
                      vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
      
                      pci_disable_msix(vp_dev->pci_dev);
                      vp_dev->msix_enabled = 0;
              }
      
              vp_dev->msix_vectors = 0;
              vp_dev->msix_used_vectors = 0;
              kfree(vp_dev->msix_names);
              vp_dev->msix_names = NULL;
              kfree(vp_dev->msix_entries);
              vp_dev->msix_entries = NULL;
              kfree(vp_dev->msix_affinity_masks);
              vp_dev->msix_affinity_masks = NULL;
      }
      
      static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
                                         bool per_vq_vectors)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              const char *name = dev_name(&vp_dev->vdev.dev);
              unsigned i, v;
              int err = -ENOMEM;
      
              vp_dev->msix_vectors = nvectors;
      
              vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
                                             GFP_KERNEL);
              if (!vp_dev->msix_entries)
                      goto error;
              vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
                                           GFP_KERNEL);
              if (!vp_dev->msix_names)
                      goto error;
              vp_dev->msix_affinity_masks
                      = kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks,
                                GFP_KERNEL);
              if (!vp_dev->msix_affinity_masks)
                      goto error;
              for (i = 0; i < nvectors; ++i)
                      if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
                                              GFP_KERNEL))
                              goto error;
      
              for (i = 0; i < nvectors; ++i)
                      vp_dev->msix_entries[i].entry = i;
      
              err = pci_enable_msix_exact(vp_dev->pci_dev,
                                          vp_dev->msix_entries, nvectors);
              if (err)
                      goto error;
              vp_dev->msix_enabled = 1;
      
              /* Set the vector used for configuration */
              v = vp_dev->msix_used_vectors;
              snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
                       "%s-config", name);
              err = request_irq(vp_dev->msix_entries[v].vector,
                                vp_config_changed, 0, vp_dev->msix_names[v],
                                vp_dev);
              if (err)
                      goto error;
              ++vp_dev->msix_used_vectors;
      
              v = vp_dev->config_vector(vp_dev, v);
              /* Verify we had enough resources to assign the vector */
              if (v == VIRTIO_MSI_NO_VECTOR) {
                      err = -EBUSY;
                      goto error;
              }
      
              if (!per_vq_vectors) {
                      /* Shared vector for all VQs */
                      v = vp_dev->msix_used_vectors;
                      snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
                               "%s-virtqueues", name);
                      err = request_irq(vp_dev->msix_entries[v].vector,
                                        vp_vring_interrupt, 0, vp_dev->msix_names[v],
                                        vp_dev);
                      if (err)
                              goto error;
                      ++vp_dev->msix_used_vectors;
              }
              return 0;
      error:
              vp_free_vectors(vdev);
              return err;
      }
      
      static int vp_request_intx(struct virtio_device *vdev)
      {
              int err;
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
      
              err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
                                IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
              if (!err)
                      vp_dev->intx_enabled = 1;
              return err;
      }
      
      static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index,
                                           void (*callback)(struct virtqueue *vq),
                                           const char *name,
                                           u16 msix_vec)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL);
              struct virtqueue *vq;
              unsigned long flags;
      
              /* fill out our structure that represents an active queue */
              if (!info)
                      return ERR_PTR(-ENOMEM);
      
              vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, msix_vec);
              if (IS_ERR(vq))
                      goto out_info;
      
              info->vq = vq;
              if (callback) {
                      spin_lock_irqsave(&vp_dev->lock, flags);
                      list_add(&info->node, &vp_dev->virtqueues);
                      spin_unlock_irqrestore(&vp_dev->lock, flags);
              } else {
                      INIT_LIST_HEAD(&info->node);
              }
      
              vp_dev->vqs[index] = info;
              return vq;
      
      out_info:
              kfree(info);
              return vq;
      }
      
      static void vp_del_vq(struct virtqueue *vq)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
              struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
              unsigned long flags;
      
              spin_lock_irqsave(&vp_dev->lock, flags);
              list_del(&info->node);
              spin_unlock_irqrestore(&vp_dev->lock, flags);
      
              vp_dev->del_vq(info);
              kfree(info);
      }
      
      /* the config->del_vqs() implementation */
      void vp_del_vqs(struct virtio_device *vdev)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              struct virtqueue *vq, *n;
              struct virtio_pci_vq_info *info;
      
              list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
                      info = vp_dev->vqs[vq->index];
                      if (vp_dev->per_vq_vectors &&
                              info->msix_vector != VIRTIO_MSI_NO_VECTOR)
                              free_irq(vp_dev->msix_entries[info->msix_vector].vector,
                                       vq);
                      vp_del_vq(vq);
              }
              vp_dev->per_vq_vectors = false;
      
              vp_free_vectors(vdev);
              kfree(vp_dev->vqs);
              vp_dev->vqs = NULL;
      }
      
      static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
                                    struct virtqueue *vqs[],
                                    vq_callback_t *callbacks[],
                                    const char * const names[],
                                    bool use_msix,
                                    bool per_vq_vectors)
      {
              struct virtio_pci_device *vp_dev = to_vp_device(vdev);
              u16 msix_vec;
              int i, err, nvectors, allocated_vectors;
      
              vp_dev->vqs = kmalloc(nvqs * sizeof *vp_dev->vqs, GFP_KERNEL);
              if (!vp_dev->vqs)
                      return -ENOMEM;
      
              if (!use_msix) {
                      /* Old style: one normal interrupt for change and all vqs. */
                      err = vp_request_intx(vdev);
                      if (err)
                              goto error_find;
              } else {
                      if (per_vq_vectors) {
                              /* Best option: one for change interrupt, one per vq. */
                              nvectors = 1;
                              for (i = 0; i < nvqs; ++i)
                                      if (callbacks[i])
                                              ++nvectors;
                      } else {
                              /* Second best: one for change, shared for all vqs. */
                              nvectors = 2;
                      }
      
                      err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
                      if (err)
                              goto error_find;
              }
      
              vp_dev->per_vq_vectors = per_vq_vectors;
              allocated_vectors = vp_dev->msix_used_vectors;
              for (i = 0; i < nvqs; ++i) {
                      if (!names[i]) {
                              vqs[i] = NULL;
                              continue;
                      } else if (!callbacks[i] || !vp_dev->msix_enabled)
                              msix_vec = VIRTIO_MSI_NO_VECTOR;
                      else if (vp_dev->per_vq_vectors)
                              msix_vec = allocated_vectors++;
                      else
                              msix_vec = VP_MSIX_VQ_VECTOR;
                      vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
                      if (IS_ERR(vqs[i])) {
                              err = PTR_ERR(vqs[i]);
                              goto error_find;
                      }
      
                      if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
                              continue;
      
                      /* allocate per-vq irq if available and necessary */
                      snprintf(vp_dev->msix_names[msix_vec],
                               sizeof *vp_dev->msix_names,
                               "%s-%s",
                               dev_name(&vp_dev->vdev.dev), names[i]);
                      err = request_irq(vp_dev->msix_entries[msix_vec].vector,
                                        vring_interrupt, 0,
                                        vp_dev->msix_names[msix_vec],
                                        vqs[i]);
                      if (err) {
                              vp_del_vq(vqs[i]);
                              goto error_find;
                      }
              }
              return 0;
      
      error_find:
              vp_del_vqs(vdev);
              return err;
      }
      
      /* the config->find_vqs() implementation */
      int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
                      struct virtqueue *vqs[],
                      vq_callback_t *callbacks[],
                      const char * const names[])
      {
              int err;
      
              /* Try MSI-X with one vector per queue. */
              err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true);
              if (!err)
                      return 0;
              /* Fallback: MSI-X with one vector for config, one shared for queues. */