// SPDX-License-Identifier: GPL-2.0-only
      /*
       *  Copyright (C) 2009  Red Hat, Inc.
       */
      
      #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      
      #include <linux/mm.h>
      #include <linux/sched.h>
      #include <linux/sched/coredump.h>
      #include <linux/sched/numa_balancing.h>
      #include <linux/highmem.h>
      #include <linux/hugetlb.h>
      #include <linux/mmu_notifier.h>
      #include <linux/rmap.h>
      #include <linux/swap.h>
      #include <linux/shrinker.h>
      #include <linux/mm_inline.h>
      #include <linux/swapops.h>
      #include <linux/dax.h>
      #include <linux/khugepaged.h>
      #include <linux/freezer.h>
      #include <linux/pfn_t.h>
      #include <linux/mman.h>
      #include <linux/memremap.h>
      #include <linux/pagemap.h>
      #include <linux/debugfs.h>
      #include <linux/migrate.h>
      #include <linux/hashtable.h>
      #include <linux/userfaultfd_k.h>
      #include <linux/page_idle.h>
      #include <linux/shmem_fs.h>
      #include <linux/oom.h>
      #include <linux/numa.h>
      #include <linux/page_owner.h>
      
      #include <asm/tlb.h>
      #include <asm/pgalloc.h>
      #include "internal.h"
      
      /*
       * By default, transparent hugepage support is disabled in order to avoid
       * risking an increased memory footprint for applications that are not
       * guaranteed to benefit from it. When transparent hugepage support is
       * enabled, it is for all mappings, and khugepaged scans all mappings.
       * Defrag is invoked by khugepaged hugepage allocations and by page faults
       * for all hugepage allocations.
       */
      unsigned long transparent_hugepage_flags __read_mostly =
      #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
              (1<<TRANSPARENT_HUGEPAGE_FLAG)|
      #endif
      #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
              (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
      #endif
              (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
              (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
              (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
      
      static struct shrinker deferred_split_shrinker;
      
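/*
 * The huge zero page is allocated lazily on first use and shared by all
 * users that can serve a read fault without a private page.
 * huge_zero_refcount counts its users; the allocator takes one extra
 * reference that is only dropped by the shrinker, which frees the page once
 * no other users remain.
 */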
      static atomic_t huge_zero_refcount;
      struct page *huge_zero_page __read_mostly;
      
      bool transparent_hugepage_enabled(struct vm_area_struct *vma)
      {
	/*
	 * addr is chosen so that transhuge_vma_suitable() checks whether
	 * the vma is large enough to hold an aligned huge page.
	 */
              unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
      
              if (!transhuge_vma_suitable(vma, addr))
                      return false;
              if (vma_is_anonymous(vma))
                      return __transparent_hugepage_enabled(vma);
              if (vma_is_shmem(vma))
                      return shmem_huge_enabled(vma);
      
              return false;
      }
      
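/*
 * Take a reference on the huge zero page, allocating it on first use.  A
 * racing allocation is resolved with cmpxchg(): the loser frees its freshly
 * allocated page and retries against the winner's copy.
 */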
      static struct page *get_huge_zero_page(void)
      {
              struct page *zero_page;
      retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
                      return READ_ONCE(huge_zero_page);
      
              zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                              HPAGE_PMD_ORDER);
              if (!zero_page) {
                      count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
                      return NULL;
              }
              count_vm_event(THP_ZERO_PAGE_ALLOC);
              preempt_disable();
              if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
                      preempt_enable();
                      __free_pages(zero_page, compound_order(zero_page));
                      goto retry;
              }
      
	/* We take an additional reference here; it will be put back by the shrinker */
              atomic_set(&huge_zero_refcount, 2);
              preempt_enable();
	return READ_ONCE(huge_zero_page);
      }
      
      static void put_huge_zero_page(void)
      {
	/*
	 * The counter should never drop to zero here; only the shrinker
	 * may put the last reference.
	 */
              BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
      }
      
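/*
 * Each mm takes at most one reference on the huge zero page, recorded with
 * MMF_HUGE_ZERO_PAGE; mm_put_huge_zero_page() drops it again (typically when
 * the mm is torn down).
 */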
      struct page *mm_get_huge_zero_page(struct mm_struct *mm)
      {
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
      }
      
      void mm_put_huge_zero_page(struct mm_struct *mm)
      {
              if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                      put_huge_zero_page();
      }
      
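/*
 * Shrinker for the huge zero page: report HPAGE_PMD_NR reclaimable pages
 * only while the shrinker's own reference is the last one left, and free
 * the page when asked to scan in that state.
 */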
      static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                              struct shrink_control *sc)
      {
	/* we can free the zero page only if the last reference remains */
              return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
      }
      
      static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
                                             struct shrink_control *sc)
      {
              if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
                      struct page *zero_page = xchg(&huge_zero_page, NULL);
                      BUG_ON(zero_page == NULL);
                      __free_pages(zero_page, compound_order(zero_page));
                      return HPAGE_PMD_NR;
              }
      
              return 0;
      }
      
      static struct shrinker huge_zero_page_shrinker = {
              .count_objects = shrink_huge_zero_page_count,
              .scan_objects = shrink_huge_zero_page_scan,
              .seeks = DEFAULT_SEEKS,
      };
      
      #ifdef CONFIG_SYSFS
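/*
 * The sysfs knobs live under /sys/kernel/mm/transparent_hugepage/.  The
 * "enabled" and "defrag" files print the available choices with the current
 * one in brackets, e.g. "[always] madvise never".
 */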
      static ssize_t enabled_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
      {
              if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "[always] madvise never\n");
              else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "always [madvise] never\n");
              else
                      return sprintf(buf, "always madvise [never]\n");
      }
      
      static ssize_t enabled_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
      {
              ssize_t ret = count;
      
              if (sysfs_streq(buf, "always")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "madvise")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "never")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
              } else
                      ret = -EINVAL;
      
              if (ret > 0) {
                      int err = start_stop_khugepaged();
                      if (err)
                              ret = err;
              }
              return ret;
      }
      static struct kobj_attribute enabled_attr =
              __ATTR(enabled, 0644, enabled_show, enabled_store);
      
      ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                      struct kobj_attribute *attr, char *buf,
                                      enum transparent_hugepage_flag flag)
      {
              return sprintf(buf, "%d\n",
                             !!test_bit(flag, &transparent_hugepage_flags));
      }
      
      ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                       struct kobj_attribute *attr,
                                       const char *buf, size_t count,
                                       enum transparent_hugepage_flag flag)
      {
              unsigned long value;
              int ret;
      
              ret = kstrtoul(buf, 10, &value);
              if (ret < 0)
                      return ret;
              if (value > 1)
                      return -EINVAL;
      
              if (value)
                      set_bit(flag, &transparent_hugepage_flags);
              else
                      clear_bit(flag, &transparent_hugepage_flags);
      
              return count;
      }
      
      static ssize_t defrag_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
      {
              if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "[always] defer defer+madvise madvise never\n");
              if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "always [defer] defer+madvise madvise never\n");
              if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "always defer [defer+madvise] madvise never\n");
              if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                      return sprintf(buf, "always defer defer+madvise [madvise] never\n");
              return sprintf(buf, "always defer defer+madvise madvise [never]\n");
      }
      
      static ssize_t defrag_store(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  const char *buf, size_t count)
      {
              if (sysfs_streq(buf, "always")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "defer+madvise")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "defer")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "madvise")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
              } else if (sysfs_streq(buf, "never")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
              } else
                      return -EINVAL;
      
              return count;
      }
      static struct kobj_attribute defrag_attr =
              __ATTR(defrag, 0644, defrag_show, defrag_store);
      
      static ssize_t use_zero_page_show(struct kobject *kobj,
                      struct kobj_attribute *attr, char *buf)
      {
              return single_hugepage_flag_show(kobj, attr, buf,
                                      TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
      }
      static ssize_t use_zero_page_store(struct kobject *kobj,
                      struct kobj_attribute *attr, const char *buf, size_t count)
      {
              return single_hugepage_flag_store(kobj, attr, buf, count,
                                       TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
      }
      static struct kobj_attribute use_zero_page_attr =
              __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
      
      static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                      struct kobj_attribute *attr, char *buf)
      {
              return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
      }
      static struct kobj_attribute hpage_pmd_size_attr =
              __ATTR_RO(hpage_pmd_size);
      
      #ifdef CONFIG_DEBUG_VM
      static ssize_t debug_cow_show(struct kobject *kobj,
                                      struct kobj_attribute *attr, char *buf)
      {
              return single_hugepage_flag_show(kobj, attr, buf,
                                      TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
      }
      static ssize_t debug_cow_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
      {
              return single_hugepage_flag_store(kobj, attr, buf, count,
                                       TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
      }
      static struct kobj_attribute debug_cow_attr =
              __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
      #endif /* CONFIG_DEBUG_VM */
      
      static struct attribute *hugepage_attr[] = {
              &enabled_attr.attr,
              &defrag_attr.attr,
              &use_zero_page_attr.attr,
              &hpage_pmd_size_attr.attr,
      #ifdef CONFIG_SHMEM
              &shmem_enabled_attr.attr,
      #endif
      #ifdef CONFIG_DEBUG_VM
              &debug_cow_attr.attr,
      #endif
              NULL,
      };
      
      static const struct attribute_group hugepage_attr_group = {
              .attrs = hugepage_attr,
      };
      
      static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
      {
              int err;
      
              *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
              if (unlikely(!*hugepage_kobj)) {
                      pr_err("failed to create transparent hugepage kobject\n");
                      return -ENOMEM;
              }
      
              err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
              if (err) {
                      pr_err("failed to register transparent hugepage group\n");
                      goto delete_obj;
              }
      
              err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
              if (err) {
                      pr_err("failed to register transparent hugepage group\n");
                      goto remove_hp_group;
              }
      
              return 0;
      
      remove_hp_group:
              sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
      delete_obj:
              kobject_put(*hugepage_kobj);
              return err;
      }
      
      static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
      {
              sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
              sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
              kobject_put(hugepage_kobj);
      }
      #else
      static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
      {
              return 0;
      }
      
      static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
      {
      }
      #endif /* CONFIG_SYSFS */
      
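/*
 * Bring up THP at boot: register sysfs, khugepaged and the two shrinkers.
 * On machines with less than 512MB of RAM, transparent hugepages are left
 * disabled by default; the admin can still enable them through sysfs.
 */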
      static int __init hugepage_init(void)
      {
              int err;
              struct kobject *hugepage_kobj;
      
              if (!has_transparent_hugepage()) {
                      transparent_hugepage_flags = 0;
                      return -EINVAL;
              }
      
              /*
               * hugepages can't be allocated by the buddy allocator
               */
              MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
              /*
               * we use page->mapping and page->index in second tail page
               * as list_head: assuming THP order >= 2
               */
              MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
      
              err = hugepage_init_sysfs(&hugepage_kobj);
              if (err)
                      goto err_sysfs;
      
              err = khugepaged_init();
              if (err)
                      goto err_slab;
      
              err = register_shrinker(&huge_zero_page_shrinker);
              if (err)
                      goto err_hzp_shrinker;
              err = register_shrinker(&deferred_split_shrinker);
              if (err)
                      goto err_split_shrinker;
      
              /*
               * By default disable transparent hugepages on smaller systems,
               * where the extra memory used could hurt more than TLB overhead
               * is likely to save.  The admin can still enable it through /sys.
               */
              if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                      transparent_hugepage_flags = 0;
                      return 0;
              }
      
              err = start_stop_khugepaged();
              if (err)
                      goto err_khugepaged;
      
              return 0;
      err_khugepaged:
              unregister_shrinker(&deferred_split_shrinker);
      err_split_shrinker:
              unregister_shrinker(&huge_zero_page_shrinker);
      err_hzp_shrinker:
              khugepaged_destroy();
      err_slab:
              hugepage_exit_sysfs(hugepage_kobj);
      err_sysfs:
              return err;
      }
      subsys_initcall(hugepage_init);
      
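/*
 * Command line parsing for "transparent_hugepage=".  The accepted values
 * mirror the sysfs "enabled" file: always, madvise or never.
 */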
      static int __init setup_transparent_hugepage(char *str)
      {
              int ret = 0;
              if (!str)
                      goto out;
              if (!strcmp(str, "always")) {
                      set_bit(TRANSPARENT_HUGEPAGE_FLAG,
                              &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                                &transparent_hugepage_flags);
                      ret = 1;
              } else if (!strcmp(str, "madvise")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                                &transparent_hugepage_flags);
                      set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                              &transparent_hugepage_flags);
                      ret = 1;
              } else if (!strcmp(str, "never")) {
                      clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                                &transparent_hugepage_flags);
                      clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                                &transparent_hugepage_flags);
                      ret = 1;
              }
      out:
              if (!ret)
		pr_warn("transparent_hugepage= cannot be parsed, ignored\n");
              return ret;
      }
      __setup("transparent_hugepage=", setup_transparent_hugepage);
      
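/*
 * Only mark the pmd writable when the vma itself is writable; read-only
 * mappings keep a write-protected pmd.
 */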
      pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
      {
              if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
              return pmd;
      }
      
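/*
 * Deferred-split queues are per-memcg when CONFIG_MEMCG is enabled and the
 * page is charged to a memcg, otherwise per-node.
 */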
      #ifdef CONFIG_MEMCG
      static inline struct deferred_split *get_deferred_split_queue(struct page *page)
      {
	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

	if (memcg)
		return &memcg->deferred_split_queue;
              else
                      return &pgdat->deferred_split_queue;
      }
      #else
      static inline struct deferred_split *get_deferred_split_queue(struct page *page)
      {
              struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
      
              return &pgdat->deferred_split_queue;
      }
      #endif
      
      void prep_transhuge_page(struct page *page)
      {
	/*
	 * we use page->mapping and page->index in the second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
      }
      
      bool is_transparent_hugepage(struct page *page)
      {
              if (!PageCompound(page))
                      return false;
      
              page = compound_head(page);
              return is_huge_zero_page(page) ||
                     page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
      }
      EXPORT_SYMBOL_GPL(is_transparent_hugepage);
      
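/*
 * Helper for thp_get_unmapped_area(): ask for @len plus @size bytes so the
 * result can be shifted up until the mapping address is congruent to @off
 * modulo @size, letting huge pages in the file map at pmd-aligned addresses.
 * Returns 0 when padding cannot help, or the (possibly adjusted) address.
 */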
      static unsigned long __thp_get_unmapped_area(struct file *filp,
                      unsigned long addr, unsigned long len,
                      loff_t off, unsigned long flags, unsigned long size)
      {
              loff_t off_end = off + len;
              loff_t off_align = round_up(off, size);
              unsigned long len_pad, ret;
      
              if (off_end <= off_align || (off_end - off_align) < size)
                      return 0;
      
              len_pad = len + size;
              if (len_pad < len || (off + len_pad) < off)
                      return 0;
      
              ret = current->mm->get_unmapped_area(filp, addr, len_pad,
                                                    off >> PAGE_SHIFT, flags);
      
              /*
               * The failure might be due to length padding. The caller will retry
               * without the padding.
               */
              if (IS_ERR_VALUE(ret))
                      return 0;
      
              /*
               * Do not try to align to THP boundary if allocation at the address
               * hint succeeds.
               */
              if (ret == addr)
                      return addr;
      
              ret += (off - ret) & (size - 1);
              return ret;
      }
      
      unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                      unsigned long len, unsigned long pgoff, unsigned long flags)
      {
              unsigned long ret;
              loff_t off = (loff_t)pgoff << PAGE_SHIFT;
      
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
                      goto out;
      
              ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
              if (ret)
                      return ret;
      out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
      }
      EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
      
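/*
 * Slow path of the anonymous huge pmd fault: charge @page to the memcg,
 * pre-allocate a pte page table for the deposit, clear the huge page and
 * then install the pmd under the page table lock, unless the pmd got
 * populated in the meantime, the address space has become unstable, or
 * userfaultfd wants to handle the missing fault itself.
 */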
      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                              struct page *page, gfp_t gfp)
      {
              struct vm_area_struct *vma = vmf->vma;
              pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
              vm_fault_t ret = 0;
      
              VM_BUG_ON_PAGE(!PageCompound(page), page);
      
	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);
      
              pgtable = pte_alloc_one(vma->vm_mm);
              if (unlikely(!pgtable)) {
                      ret = VM_FAULT_OOM;
                      goto release;
              }
      
	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);
      
              vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
              if (unlikely(!pmd_none(*vmf->pmd))) {
                      goto unlock_release;
              } else {
                      pmd_t entry;
      
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
                              vm_fault_t ret2;
      
                              spin_unlock(vmf->ptl);
                              put_page(page);
                              pte_free(vma->vm_mm, pgtable);
                              ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
                              VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
                              return ret2;
                      }
      
		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
              }
      
              return 0;
      unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	put_page(page);
              return ret;
      
      }
      
      /*
       * always: directly stall for all thp allocations
       * defer: wake kswapd and fail if not immediately available
       * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
       *                  fail if not immediately available
       * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
       *            available
       * never: never stall for any thp allocation
       */
      static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
      {
              const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
      
              /* Always do synchronous compaction */
              if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                      return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
      
              /* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
                             (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
      
              return GFP_TRANSHUGE_LIGHT;
      }
      
      /* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
	return true;
}
      
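/*
 * Anonymous huge pmd fault entry point.  Returns VM_FAULT_FALLBACK so the
 * caller retries with normal pages when the vma is not suitable or the huge
 * page cannot be allocated; read faults may instead be served with the
 * shared huge zero page when that is enabled.
 */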
      vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
      {
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
                      pgtable_t pgtable;
                      struct page *zero_page;
                      bool set;
                      vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
                      if (unlikely(!zero_page)) {
                              pte_free(vma->vm_mm, pgtable);
                              count_vm_event(THP_FAULT_FALLBACK);
                              return VM_FAULT_FALLBACK;
                      }
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		set = false;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
                                                         haddr, vmf->pmd, zero_page);
                                      spin_unlock(vmf->ptl);
                                      set = true;
                              }
                      } else
			spin_unlock(vmf->ptl);
		if (!set)
			pte_free(vma->vm_mm, pgtable);
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
      }
      
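/*
 * Install a huge pmd for a raw pfn (used by the vmf_insert_pfn_pmd paths,
 * e.g. DAX).  If a mapping already exists, only a write to the same pfn may
 * upgrade the access bits; anything else is left untouched.
 */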
      static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                      pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                      pgtable_t pgtable)
      {
              struct mm_struct *mm = vma->vm_mm;
              pmd_t entry;
              spinlock_t *ptl;
      
              ptl = pmd_lock(mm, pmd);
              if (!pmd_none(*pmd)) {
                      if (write) {
                              if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
                                      WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
                                      goto out_unlock;
                              }
                              entry = pmd_mkyoung(*pmd);
                              entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                              if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
                                      update_mmu_cache_pmd(vma, addr, pmd);
                      }
      
                      goto out_unlock;
              }
      
              entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
              if (pfn_t_devmap(pfn))
                      entry = pmd_mkdevmap(entry);
              if (write) {
                      entry = pmd_mkyoung(pmd_mkdirty(entry));
                      entry = maybe_pmd_mkwrite(entry, vma);
              }
      
              if (pgtable) {
                      pgtable_trans_huge_deposit(mm, pmd, pgtable);
                      mm_inc_nr_ptes(mm);
                      pgtable = NULL;
              }
      
              set_pmd_at(mm, addr, pmd, entry);
              update_mmu_cache_pmd(vma, addr, pmd);
      
      out_unlock:
              spin_unlock(ptl);
              if (pgtable)
                      pte_free(mm, pgtable);
      }
      
      /**
       * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
       * @vmf: Structure describing the fault
       * @pfn: pfn to insert
       * @pgprot: page protection to use
       * @write: whether it's a write fault
       *
       * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
       * also consult the vmf_insert_mixed_prot() documentation when
       * @pgprot != @vmf->vma->vm_page_prot.
       *
       * Return: vm_fault_t value.
       */
      vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
                                         pgprot_t pgprot, bool write)
      {
              unsigned long addr = vmf->address & PMD_MASK;
              struct vm_area_struct *vma = vmf->vma;
              pgtable_t pgtable = NULL;
      
              /*
               * If we had pmd_special, we could avoid all these restrictions,
               * but we need to be consistent with PTEs and architectures that
               * can't support a 'special' bit.
               */
              BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                              !pfn_t_devmap(pfn));
              BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                      (VM_PFNMAP|VM_MIXEDMAP));
              BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
      
	if (addr < vma->vm_start || addr >= vma->vm_end)