// SPDX-License-Identifier: GPL-2.0
      /*
       * Functions related to io context handling
       */
      #include <linux/kernel.h>
      #include <linux/module.h>
      #include <linux/init.h>
      #include <linux/bio.h>
      #include <linux/blkdev.h>
      #include <linux/slab.h>
      #include <linux/sched/task.h>
      
      #include "blk.h"
      
      /*
       * For io context allocations
       */
      static struct kmem_cache *iocontext_cachep;
      
      /**
       * get_io_context - increment reference count to io_context
       * @ioc: io_context to get
       *
       * Increment reference count to @ioc.
       */
      void get_io_context(struct io_context *ioc)
      {
               BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
               atomic_long_inc(&ioc->refcount);
      }
      EXPORT_SYMBOL(get_io_context);
      
      static void icq_free_icq_rcu(struct rcu_head *head)
      {
              struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
      
              kmem_cache_free(icq->__rcu_icq_cache, icq);
      }
      
      /*
       * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
       * and queue locked for legacy.
       */
      static void ioc_exit_icq(struct io_cq *icq)
      {
              struct elevator_type *et = icq->q->elevator->type;
      
              if (icq->flags & ICQ_EXITED)
                      return;
      
              if (et->uses_mq && et->ops.mq.exit_icq)
                      et->ops.mq.exit_icq(icq);
              else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
                      et->ops.sq.elevator_exit_icq_fn(icq);
      
              icq->flags |= ICQ_EXITED;
      }
      
      /*
       * Release an icq. Called with ioc locked for blk-mq, and with both ioc
       * and queue locked for legacy.
       */
      static void ioc_destroy_icq(struct io_cq *icq)
      {
              struct io_context *ioc = icq->ioc;
              struct request_queue *q = icq->q;
              struct elevator_type *et = q->elevator->type;
      
              lockdep_assert_held(&ioc->lock);
      
              radix_tree_delete(&ioc->icq_tree, icq->q->id);
              hlist_del_init(&icq->ioc_node);
              list_del_init(&icq->q_node);
      
              /*
               * Both setting lookup hint to and clearing it from @icq are done
               * under queue_lock.  If it's not pointing to @icq now, it never
               * will.  Hint assignment itself can race safely.
               */
              if (rcu_access_pointer(ioc->icq_hint) == icq)
                      rcu_assign_pointer(ioc->icq_hint, NULL);
      
              ioc_exit_icq(icq);
      
              /*
               * @icq->q might have gone away by the time RCU callback runs
               * making it impossible to determine icq_cache.  Record it in @icq.
               */
              icq->__rcu_icq_cache = et->icq_cache;
              call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
      }
      
      /*
       * Slow path for ioc release in put_io_context().  Performs double-lock
       * dancing to unlink all icq's and then frees ioc.
       */
      static void ioc_release_fn(struct work_struct *work)
      {
              struct io_context *ioc = container_of(work, struct io_context,
                                                    release_work);
              unsigned long flags;
      
              /*
               * Exiting icq may call into put_io_context() through elevator
               * which will trigger lockdep warning.  The ioc's are guaranteed to
               * be different, use a different locking subclass here.  Use
               * irqsave variant as there's no spin_lock_irq_nested().
               */
              spin_lock_irqsave_nested(&ioc->lock, flags, 1);
      
              while (!hlist_empty(&ioc->icq_list)) {
                      struct io_cq *icq = hlist_entry(ioc->icq_list.first,
                                                      struct io_cq, ioc_node);
                      struct request_queue *q = icq->q;
      
                      if (spin_trylock(q->queue_lock)) {
                              ioc_destroy_icq(icq);
                              spin_unlock(q->queue_lock);
                      } else {
                              spin_unlock_irqrestore(&ioc->lock, flags);
                              cpu_relax();
                              spin_lock_irqsave_nested(&ioc->lock, flags, 1);
                      }
              }
      
              spin_unlock_irqrestore(&ioc->lock, flags);
      
              kmem_cache_free(iocontext_cachep, ioc);
      }
      
      /**
       * put_io_context - put a reference of io_context
       * @ioc: io_context to put
       *
       * Decrement reference count of @ioc and release it if the count reaches
       * zero.
       */
      void put_io_context(struct io_context *ioc)
      {
              unsigned long flags;
              bool free_ioc = false;
      
               if (ioc == NULL)
                       return;

               BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
      
              /*
               * Releasing ioc requires reverse order double locking and we may
               * already be holding a queue_lock.  Do it asynchronously from wq.
               */
               if (atomic_long_dec_and_test(&ioc->refcount)) {
                       spin_lock_irqsave(&ioc->lock, flags);
                       if (!hlist_empty(&ioc->icq_list))
                               queue_work(system_power_efficient_wq,
                                               &ioc->release_work);
                       else
                               free_ioc = true;
                       spin_unlock_irqrestore(&ioc->lock, flags);
              }
      
              if (free_ioc)
                      kmem_cache_free(iocontext_cachep, ioc);
      }
      EXPORT_SYMBOL(put_io_context);
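
       /*
        * Illustrative sketch only (example_ioc_borrow is a hypothetical helper,
        * not a kernel API): the usual get/put pairing around an io_context the
        * caller already holds a reference on.
        */
       static void __maybe_unused example_ioc_borrow(struct io_context *ioc)
       {
               get_io_context(ioc);    /* caller must already hold a reference */
               /* ... hand @ioc to something that will drop this reference ... */
               put_io_context(ioc);    /* may defer the final free to a workqueue */
       }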
      
      /**
       * put_io_context_active - put active reference on ioc
       * @ioc: ioc of interest
       *
       * Undo get_io_context_active().  If active reference reaches zero after
       * put, @ioc can never issue further IOs and ioscheds are notified.
       */
       void put_io_context_active(struct io_context *ioc)
      {
              struct elevator_type *et;
              unsigned long flags;
              struct io_cq *icq;
      
               if (!atomic_dec_and_test(&ioc->active_ref)) {
                       put_io_context(ioc);
                      return;
              }
      
              /*
               * Need ioc lock to walk icq_list and q lock to exit icq.  Perform
               * reverse double locking.  Read comment in ioc_release_fn() for
               * explanation on the nested locking annotation.
               */
      retry:
               spin_lock_irqsave_nested(&ioc->lock, flags, 1);
              hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
                      if (icq->flags & ICQ_EXITED)
                              continue;
      
                      et = icq->q->elevator->type;
                      if (et->uses_mq) {
                              ioc_exit_icq(icq);
                      } else {
                              if (spin_trylock(icq->q->queue_lock)) {
                                      ioc_exit_icq(icq);
                                      spin_unlock(icq->q->queue_lock);
                              } else {
                                      spin_unlock_irqrestore(&ioc->lock, flags);
                                      cpu_relax();
                                      goto retry;
                              }
                      }
              }
               spin_unlock_irqrestore(&ioc->lock, flags);
      
              put_io_context(ioc);
      }
      
      /* Called by the exiting task */
      void exit_io_context(struct task_struct *task)
      {
              struct io_context *ioc;
      
               task_lock(task);
              ioc = task->io_context;
              task->io_context = NULL;
              task_unlock(task);
      
              atomic_dec(&ioc->nr_tasks);
              put_io_context_active(ioc);
      }
      
      static void __ioc_clear_queue(struct list_head *icq_list)
      {
              unsigned long flags;
      
              while (!list_empty(icq_list)) {
                      struct io_cq *icq = list_entry(icq_list->next,
                                                     struct io_cq, q_node);
                      struct io_context *ioc = icq->ioc;
      
                      spin_lock_irqsave(&ioc->lock, flags);
                      ioc_destroy_icq(icq);
                      spin_unlock_irqrestore(&ioc->lock, flags);
              }
      }
      
      /**
       * ioc_clear_queue - break any ioc association with the specified queue
       * @q: request_queue being cleared
       *
       * Walk @q->icq_list and exit all io_cq's.
       */
      void ioc_clear_queue(struct request_queue *q)
      {
              LIST_HEAD(icq_list);
      
              spin_lock_irq(q->queue_lock);
              list_splice_init(&q->icq_list, &icq_list);
      
              if (q->mq_ops) {
                      spin_unlock_irq(q->queue_lock);
                      __ioc_clear_queue(&icq_list);
              } else {
                      __ioc_clear_queue(&icq_list);
                      spin_unlock_irq(q->queue_lock);
              }
      }
      
      int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
      {
              struct io_context *ioc;
              int ret;
      
               ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
                                          node);
              if (unlikely(!ioc))
                      return -ENOMEM;
      
              /* initialize */
               atomic_long_set(&ioc->refcount, 1);
              atomic_set(&ioc->nr_tasks, 1);
              atomic_set(&ioc->active_ref, 1);
              spin_lock_init(&ioc->lock);
              INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
              INIT_HLIST_HEAD(&ioc->icq_list);
              INIT_WORK(&ioc->release_work, ioc_release_fn);
      
              /*
               * Try to install.  ioc shouldn't be installed if someone else
               * already did or @task, which isn't %current, is exiting.  Note
               * that we need to allow ioc creation on exiting %current as exit
               * path may issue IOs from e.g. exit_files().  The exit path is
               * responsible for not issuing IO after exit_io_context().
               */
              task_lock(task);
              if (!task->io_context &&
                   (task == current || !(task->flags & PF_EXITING)))
                       task->io_context = ioc;
               else
                       kmem_cache_free(iocontext_cachep, ioc);

               ret = task->io_context ? 0 : -EBUSY;

               task_unlock(task);

               return ret;
      }
      
      /**
       * get_task_io_context - get io_context of a task
       * @task: task of interest
       * @gfp_flags: allocation flags, used if allocation is necessary
       * @node: allocation node, used if allocation is necessary
       *
       * Return io_context of @task.  If it doesn't exist, it is created with
       * @gfp_flags and @node.  The returned io_context has its reference count
       * incremented.
       *
       * This function always goes through task_lock() and it's better to use
       * %current->io_context + get_io_context() for %current.
       */
      struct io_context *get_task_io_context(struct task_struct *task,
                                             gfp_t gfp_flags, int node)
      {
              struct io_context *ioc;
      
               might_sleep_if(gfpflags_allow_blocking(gfp_flags));
      
              do {
                       task_lock(task);
                       ioc = task->io_context;
                       if (likely(ioc)) {
                               get_io_context(ioc);
                               task_unlock(task);
                               return ioc;
                       }
                       task_unlock(task);
              } while (!create_task_io_context(task, gfp_flags, node));
      
              return NULL;
      }
      EXPORT_SYMBOL(get_task_io_context);
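
       /*
        * Illustrative sketch (example_get_current_ioc and its @node policy are
        * hypothetical): the fast path recommended above for %current - use the
        * cached pointer plus get_io_context() and only fall back to
        * get_task_io_context() when no io_context exists yet.
        */
       static __maybe_unused struct io_context *
       example_get_current_ioc(gfp_t gfp_mask, int node)
       {
               struct io_context *ioc = current->io_context;

               if (ioc) {
                       /* skips task_lock() for the common case */
                       get_io_context(ioc);
                       return ioc;
               }
               return get_task_io_context(current, gfp_mask, node);
       }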
      
      /**
       * ioc_lookup_icq - lookup io_cq from ioc
       * @ioc: the associated io_context
       * @q: the associated request_queue
       *
       * Look up io_cq associated with @ioc - @q pair from @ioc.  Must be called
       * with @q->queue_lock held.
       */
      struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
      {
              struct io_cq *icq;
      
              lockdep_assert_held(q->queue_lock);
      
              /*
               * icq's are indexed from @ioc using radix tree and hint pointer,
               * both of which are protected with RCU.  All removals are done
               * holding both q and ioc locks, and we're holding q lock - if we
                * find an icq that points to us, it's guaranteed to be valid.
               */
              rcu_read_lock();
              icq = rcu_dereference(ioc->icq_hint);
              if (icq && icq->q == q)
                      goto out;
      
              icq = radix_tree_lookup(&ioc->icq_tree, q->id);
              if (icq && icq->q == q)
                      rcu_assign_pointer(ioc->icq_hint, icq);        /* allowed to race */
              else
                      icq = NULL;
      out:
              rcu_read_unlock();
              return icq;
      }
      EXPORT_SYMBOL(ioc_lookup_icq);
      
      /**
       * ioc_create_icq - create and link io_cq
       * @ioc: io_context of interest
       * @q: request_queue of interest
       * @gfp_mask: allocation mask
       *
        * Make sure io_cq linking @ioc and @q exists.  If the icq doesn't
        * exist, it is created using @gfp_mask.
       *
       * The caller is responsible for ensuring @ioc won't go away and @q is
       * alive and will stay alive until this function returns.
       */
      struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
                                   gfp_t gfp_mask)
      {
              struct elevator_type *et = q->elevator->type;
              struct io_cq *icq;
      
              /* allocate stuff */
              icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
                                          q->node);
              if (!icq)
                      return NULL;
      
              if (radix_tree_maybe_preload(gfp_mask) < 0) {
                      kmem_cache_free(et->icq_cache, icq);
                      return NULL;
              }
      
              icq->ioc = ioc;
              icq->q = q;
              INIT_LIST_HEAD(&icq->q_node);
              INIT_HLIST_NODE(&icq->ioc_node);
      
              /* lock both q and ioc and try to link @icq */
              spin_lock_irq(q->queue_lock);
              spin_lock(&ioc->lock);
      
              if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
                      hlist_add_head(&icq->ioc_node, &ioc->icq_list);
                      list_add(&icq->q_node, &q->icq_list);
                      if (et->uses_mq && et->ops.mq.init_icq)
                              et->ops.mq.init_icq(icq);
                      else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
                              et->ops.sq.elevator_init_icq_fn(icq);
              } else {
                      kmem_cache_free(et->icq_cache, icq);
                      icq = ioc_lookup_icq(ioc, q);
                      if (!icq)
                              printk(KERN_ERR "cfq: icq link failed!\n");
              }
      
              spin_unlock(&ioc->lock);
              spin_unlock_irq(q->queue_lock);
              radix_tree_preload_end();
              return icq;
      }
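
       /*
        * Illustrative sketch (example_find_or_create_icq is hypothetical; the
        * real callers live in the elevator code): the usual lookup-then-create
        * pattern, honouring the locking rule of ioc_lookup_icq().
        */
       static __maybe_unused struct io_cq *
       example_find_or_create_icq(struct io_context *ioc, struct request_queue *q,
                                  gfp_t gfp_mask)
       {
               struct io_cq *icq;

               spin_lock_irq(q->queue_lock);
               icq = ioc_lookup_icq(ioc, q);   /* requires q->queue_lock */
               spin_unlock_irq(q->queue_lock);

               if (!icq)
                       icq = ioc_create_icq(ioc, q, gfp_mask); /* takes its own locks */
               return icq;
       }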
      
      static int __init blk_ioc_init(void)
      {
              iocontext_cachep = kmem_cache_create("blkdev_ioc",
                              sizeof(struct io_context), 0, SLAB_PANIC, NULL);
              return 0;
      }
      subsys_initcall(blk_ioc_init);
      /*
       *  Generic Timer-queue
       *
       *  Manages a simple queue of timers, ordered by expiration time.
       *  Uses rbtrees for quick list adds and expiration.
       *
       *  NOTE: All of the following functions need to be serialized
       *  to avoid races. No locking is done by this library code.
       *
       *  This program is free software; you can redistribute it and/or modify
       *  it under the terms of the GNU General Public License as published by
       *  the Free Software Foundation; either version 2 of the License, or
       *  (at your option) any later version.
       *
       *  This program is distributed in the hope that it will be useful,
       *  but WITHOUT ANY WARRANTY; without even the implied warranty of
       *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       *  GNU General Public License for more details.
       *
       *  You should have received a copy of the GNU General Public License
       *  along with this program; if not, write to the Free Software
       *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
       */
      
      #include <linux/bug.h>
      #include <linux/timerqueue.h>
      #include <linux/rbtree.h>
      #include <linux/export.h>
      
      /**
       * timerqueue_add - Adds timer to timerqueue.
       *
       * @head: head of timerqueue
       * @node: timer node to be added
       *
        * Adds the timer node to the timerqueue, sorted by the
        * node's expires value.  Returns true if the newly added timer is
        * the new earliest-expiring timer (i.e. it becomes head->next).
       */
      bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
      {
               struct rb_node **p = &head->head.rb_node;
              struct rb_node *parent = NULL;
              struct timerqueue_node  *ptr;
      
              /* Make sure we don't add nodes that are already added */
              WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
      
               while (*p) {
                       parent = *p;
                       ptr = rb_entry(parent, struct timerqueue_node, node);
                       if (node->expires < ptr->expires)
                               p = &(*p)->rb_left;
                       else
                               p = &(*p)->rb_right;
               }
               rb_link_node(&node->node, parent, p);
               rb_insert_color(&node->node, &head->head);

               if (!head->next || node->expires < head->next->expires) {
                       head->next = node;
                      return true;
              }
              return false;
      }
      EXPORT_SYMBOL_GPL(timerqueue_add);
      
      /**
       * timerqueue_del - Removes a timer from the timerqueue.
       *
       * @head: head of timerqueue
       * @node: timer node to be removed
       *
        * Removes the timer node from the timerqueue.  Returns true if the
        * queue still contains at least one timer after the removal.
       */
      bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
      {
               WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));

               /* update next pointer */
               if (head->next == node) {
                       struct rb_node *rbn = rb_next(&node->node);

                       head->next = rb_entry_safe(rbn, struct timerqueue_node, node);
               }
               rb_erase(&node->node, &head->head);
              RB_CLEAR_NODE(&node->node);
              return head->next != NULL;
      }
      EXPORT_SYMBOL_GPL(timerqueue_del);
      
      /**
       * timerqueue_iterate_next - Returns the timer after the provided timer
       *
       * @node: Pointer to a timer.
       *
       * Provides the timer that is after the given node. This is used, when
       * necessary, to iterate through the list of timers in a timer list
       * without modifying the list.
       */
      struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
      {
              struct rb_node *next;
      
              if (!node)
                      return NULL;
              next = rb_next(&node->node);
              if (!next)
                      return NULL;
              return container_of(next, struct timerqueue_node, node);
      }
      EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
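
       /*
        * Illustrative sketch (timerqueue_example is hypothetical): basic use of
        * the timerqueue API.  The caller is responsible for serialization, as
        * noted in the header comment above.
        */
       static void __maybe_unused timerqueue_example(void)
       {
               struct timerqueue_head head;
               struct timerqueue_node a, b;

               timerqueue_init_head(&head);
               timerqueue_init(&a);
               timerqueue_init(&b);

               a.expires = ns_to_ktime(100);
               b.expires = ns_to_ktime(50);

               timerqueue_add(&head, &a);      /* true: queue was empty */
               timerqueue_add(&head, &b);      /* true: 50 < 100, new head */

               /* earliest node first, then walk in expiry order */
               WARN_ON(timerqueue_getnext(&head) != &b);
               WARN_ON(timerqueue_iterate_next(&b) != &a);

               timerqueue_del(&head, &b);      /* true: &a is still queued */
       }
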
      // SPDX-License-Identifier: GPL-2.0
      #include <linux/spinlock.h>
      #include <linux/task_work.h>
      #include <linux/tracehook.h>
      
      static struct callback_head work_exited; /* all we need is ->next == NULL */
      
      /**
       * task_work_add - ask the @task to execute @work->func()
       * @task: the task which should run the callback
       * @work: the callback to run
       * @notify: send the notification if true
       *
       * Queue @work for task_work_run() below and notify the @task if @notify.
       * Fails if the @task is exiting/exited and thus it can't process this @work.
       * Otherwise @work->func() will be called when the @task returns from kernel
       * mode or exits.
       *
       * This is like the signal handler which runs in kernel mode, but it doesn't
       * try to wake up the @task.
       *
       * Note: there is no ordering guarantee on works queued here.
       *
       * RETURNS:
        * 0 on success, or -ESRCH if @task is already exiting/exited.
       */
      int
      task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
      {
              struct callback_head *head;
      
              do {
                       head = READ_ONCE(task->task_works);
                       if (unlikely(head == &work_exited))
                               return -ESRCH;
                       work->next = head;
               } while (cmpxchg(&task->task_works, head, work) != head);

               if (notify)
                       set_notify_resume(task);
              return 0;
      }
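
       /*
        * Illustrative sketch (all example_* names are hypothetical): queueing a
        * callback that the target task will run from task_work_run(), either on
        * its way back to user mode (because of the notification) or when it
        * exits.
        */
       static void example_twork_func(struct callback_head *work)
       {
               /* runs in the target task's context and may sleep */
       }

       static struct callback_head example_twork = {
               .func = example_twork_func,
       };

       static int __maybe_unused example_queue_twork(struct task_struct *task)
       {
               /* returns -ESRCH once @task has run its final task_work_run() */
               return task_work_add(task, &example_twork, true);
       }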
      
      /**
       * task_work_cancel - cancel a pending work added by task_work_add()
       * @task: the task which should execute the work
       * @func: identifies the work to remove
       *
       * Find the last queued pending work with ->func == @func and remove
        * it from the queue.
       *
       * RETURNS:
       * The found work or NULL if not found.
       */
      struct callback_head *
      task_work_cancel(struct task_struct *task, task_work_func_t func)
      {
              struct callback_head **pprev = &task->task_works;
              struct callback_head *work;
              unsigned long flags;
      
               if (likely(!task->task_works))
                      return NULL;
              /*
                * If cmpxchg() fails we continue without updating pprev.
                * Either we raced with task_work_add() which added the
                * new entry before this work, in which case we will find
                * it again, or we raced with task_work_run() and
                * *pprev == NULL/exited.
               */
              raw_spin_lock_irqsave(&task->pi_lock, flags);
              while ((work = READ_ONCE(*pprev))) {
                      if (work->func != func)
                              pprev = &work->next;
                      else if (cmpxchg(pprev, work, work->next) == work)
                              break;
              }
              raw_spin_unlock_irqrestore(&task->pi_lock, flags);
      
              return work;
      }
      
      /**
       * task_work_run - execute the works added by task_work_add()
       *
       * Flush the pending works. Should be used by the core kernel code.
        * Called before the task returns to user mode or stops, or when
       * it exits. In the latter case task_work_add() can no longer add the
       * new work after task_work_run() returns.
       */
      void task_work_run(void)
      {
               struct task_struct *task = current;
              struct callback_head *work, *head, *next;
      
              for (;;) {
                      /*
                       * work->func() can do task_work_add(), do not set
                       * work_exited unless the list is empty.
                       */
                       raw_spin_lock_irq(&task->pi_lock);
                       do {
                               work = READ_ONCE(task->task_works);
                               head = !work && (task->flags & PF_EXITING) ?
                                       &work_exited : NULL;
                       } while (cmpxchg(&task->task_works, work, head) != work);
                       raw_spin_unlock_irq(&task->pi_lock);
      
                      if (!work)
                              break;
      
                      do {
                               next = work->next;
                              work->func(work);
                              work = next;
                              cond_resched();
                      } while (work);
              }
      }
      /*
       * common LSM auditing functions
       *
       * Based on code written for SELinux by :
       *                        Stephen Smalley, <sds@tycho.nsa.gov>
       *                         James Morris <jmorris@redhat.com>
       * Author : Etienne Basset, <etienne.basset@ensta.org>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2,
       * as published by the Free Software Foundation.
       */
      
      #include <linux/types.h>
      #include <linux/stddef.h>
      #include <linux/kernel.h>
      #include <linux/gfp.h>
      #include <linux/fs.h>
      #include <linux/init.h>
      #include <net/sock.h>
      #include <linux/un.h>
      #include <net/af_unix.h>
      #include <linux/audit.h>
      #include <linux/ipv6.h>
      #include <linux/ip.h>
      #include <net/ip.h>
      #include <net/ipv6.h>
      #include <linux/tcp.h>
      #include <linux/udp.h>
      #include <linux/dccp.h>
      #include <linux/sctp.h>
      #include <linux/lsm_audit.h>
      
      /**
       * ipv4_skb_to_auditdata : fill auditdata from skb
       * @skb : the skb
       * @ad : the audit data to fill
       * @proto : the layer 4 protocol
       *
       * return  0 on success
       */
      int ipv4_skb_to_auditdata(struct sk_buff *skb,
                      struct common_audit_data *ad, u8 *proto)
      {
              int ret = 0;
              struct iphdr *ih;
      
              ih = ip_hdr(skb);
              if (ih == NULL)
                      return -EINVAL;
      
              ad->u.net->v4info.saddr = ih->saddr;
              ad->u.net->v4info.daddr = ih->daddr;
      
              if (proto)
                      *proto = ih->protocol;
              /* non initial fragment */
              if (ntohs(ih->frag_off) & IP_OFFSET)
                      return 0;
      
              switch (ih->protocol) {
              case IPPROTO_TCP: {
                      struct tcphdr *th = tcp_hdr(skb);
                      if (th == NULL)
                              break;
      
                      ad->u.net->sport = th->source;
                      ad->u.net->dport = th->dest;
                      break;
              }
              case IPPROTO_UDP: {
                      struct udphdr *uh = udp_hdr(skb);
                      if (uh == NULL)
                              break;
      
                      ad->u.net->sport = uh->source;
                      ad->u.net->dport = uh->dest;
                      break;
              }
              case IPPROTO_DCCP: {
                      struct dccp_hdr *dh = dccp_hdr(skb);
                      if (dh == NULL)
                              break;
      
                      ad->u.net->sport = dh->dccph_sport;
                      ad->u.net->dport = dh->dccph_dport;
                      break;
              }
              case IPPROTO_SCTP: {
                      struct sctphdr *sh = sctp_hdr(skb);
                      if (sh == NULL)
                              break;
                      ad->u.net->sport = sh->source;
                      ad->u.net->dport = sh->dest;
                      break;
              }
              default:
                      ret = -EINVAL;
              }
              return ret;
      }
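
       /*
        * Illustrative sketch (example_audit_ipv4_skb is hypothetical): filling
        * the network portion of a common_audit_data from an incoming IPv4 skb
        * and emitting an audit record for it.  Assumes the skb's network header
        * is already set.
        */
       static int __maybe_unused example_audit_ipv4_skb(struct sk_buff *skb)
       {
               struct lsm_network_audit net = { .netif = skb->skb_iif };
               struct common_audit_data ad = { .type = LSM_AUDIT_DATA_NET };
               u8 proto = 0;
               int ret;

               ad.u.net = &net;
               ret = ipv4_skb_to_auditdata(skb, &ad, &proto);
               if (ret)
                       return ret;

               net.family = AF_INET;
               common_lsm_audit(&ad, NULL, NULL);      /* NULL callbacks are allowed */
               return 0;
       }
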
      #if IS_ENABLED(CONFIG_IPV6)
      /**
       * ipv6_skb_to_auditdata : fill auditdata from skb
       * @skb : the skb
       * @ad : the audit data to fill
       * @proto : the layer 4 protocol
       *
       * return  0 on success
       */
      int ipv6_skb_to_auditdata(struct sk_buff *skb,
                      struct common_audit_data *ad, u8 *proto)
      {
              int offset, ret = 0;
              struct ipv6hdr *ip6;
              u8 nexthdr;
              __be16 frag_off;
      
              ip6 = ipv6_hdr(skb);
              if (ip6 == NULL)
                      return -EINVAL;
              ad->u.net->v6info.saddr = ip6->saddr;
              ad->u.net->v6info.daddr = ip6->daddr;
              ret = 0;
               /* IPv6 can have several extension headers before the transport
                * header; skip them */
              offset = skb_network_offset(skb);
              offset += sizeof(*ip6);
              nexthdr = ip6->nexthdr;
              offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
              if (offset < 0)
                      return 0;
              if (proto)
                      *proto = nexthdr;
              switch (nexthdr) {
              case IPPROTO_TCP: {
                      struct tcphdr _tcph, *th;
      
                      th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
                      if (th == NULL)
                              break;
      
                      ad->u.net->sport = th->source;
                      ad->u.net->dport = th->dest;
                      break;
              }
              case IPPROTO_UDP: {
                      struct udphdr _udph, *uh;
      
                      uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
                      if (uh == NULL)
                              break;
      
                      ad->u.net->sport = uh->source;
                      ad->u.net->dport = uh->dest;
                      break;
              }
              case IPPROTO_DCCP: {
                      struct dccp_hdr _dccph, *dh;
      
                      dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
                      if (dh == NULL)
                              break;
      
                      ad->u.net->sport = dh->dccph_sport;
                      ad->u.net->dport = dh->dccph_dport;
                      break;
              }
              case IPPROTO_SCTP: {
                      struct sctphdr _sctph, *sh;
      
                      sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
                      if (sh == NULL)
                              break;
                      ad->u.net->sport = sh->source;
                      ad->u.net->dport = sh->dest;
                      break;
              }
              default:
                      ret = -EINVAL;
              }
              return ret;
      }
      #endif
      
      
      static inline void print_ipv6_addr(struct audit_buffer *ab,
                                         struct in6_addr *addr, __be16 port,
                                         char *name1, char *name2)
      {
              if (!ipv6_addr_any(addr))
                      audit_log_format(ab, " %s=%pI6c", name1, addr);
              if (port)
                      audit_log_format(ab, " %s=%d", name2, ntohs(port));
      }
      
      static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
                                         __be16 port, char *name1, char *name2)
      {
              if (addr)
                      audit_log_format(ab, " %s=%pI4", name1, &addr);
              if (port)
                      audit_log_format(ab, " %s=%d", name2, ntohs(port));
      }
      
      /**
       * dump_common_audit_data - helper to dump common audit data
       * @a : common audit data
       *
       */
      static void dump_common_audit_data(struct audit_buffer *ab,
                                         struct common_audit_data *a)
      {
              char comm[sizeof(current->comm)];
      
              /*
                * To keep stack sizes in check, force programmers to notice if they
               * start making this union too large!  See struct lsm_network_audit
               * as an example of how to deal with large data.
               */
              BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2);
      
               audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
              audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm)));
      
              switch (a->type) {
              case LSM_AUDIT_DATA_NONE:
                      return;
              case LSM_AUDIT_DATA_IPC:
                      audit_log_format(ab, " key=%d ", a->u.ipc_id);
                      break;
              case LSM_AUDIT_DATA_CAP:
                       audit_log_format(ab, " capability=%d ", a->u.cap);
                      break;
              case LSM_AUDIT_DATA_PATH: {
                      struct inode *inode;
      
                      audit_log_d_path(ab, " path=", &a->u.path);
      
                      inode = d_backing_inode(a->u.path.dentry);
                      if (inode) {
                              audit_log_format(ab, " dev=");
                              audit_log_untrustedstring(ab, inode->i_sb->s_id);
                              audit_log_format(ab, " ino=%lu", inode->i_ino);
                      }
                      break;
              }
              case LSM_AUDIT_DATA_FILE: {
                      struct inode *inode;
      
                       audit_log_d_path(ab, " path=", &a->u.file->f_path);

                       inode = file_inode(a->u.file);
                       if (inode) {
                               audit_log_format(ab, " dev=");
                              audit_log_untrustedstring(ab, inode->i_sb->s_id);
                              audit_log_format(ab, " ino=%lu", inode->i_ino);
                      }
                      break;
              }
              case LSM_AUDIT_DATA_IOCTL_OP: {
                      struct inode *inode;
      
                       audit_log_d_path(ab, " path=", &a->u.op->path);

                       inode = a->u.op->path.dentry->d_inode;
                       if (inode) {
                               audit_log_format(ab, " dev=");
                               audit_log_untrustedstring(ab, inode->i_sb->s_id);
                               audit_log_format(ab, " ino=%lu", inode->i_ino);
                       }

                       audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd);
                      break;
              }
              case LSM_AUDIT_DATA_DENTRY: {
                      struct inode *inode;
      
                       audit_log_format(ab, " name=");
                       audit_log_untrustedstring(ab, a->u.dentry->d_name.name);

                       inode = d_backing_inode(a->u.dentry);
                       if (inode) {
                               audit_log_format(ab, " dev=");
                              audit_log_untrustedstring(ab, inode->i_sb->s_id);
                              audit_log_format(ab, " ino=%lu", inode->i_ino);
                      }
                      break;
              }
              case LSM_AUDIT_DATA_INODE: {
                      struct dentry *dentry;
                      struct inode *inode;
      
                       inode = a->u.inode;
                       dentry = d_find_alias(inode);
                       if (dentry) {
                               audit_log_format(ab, " name=");
                               audit_log_untrustedstring(ab,
                                                dentry->d_name.name);
                               dput(dentry);
                       }
                       audit_log_format(ab, " dev=");
                      audit_log_untrustedstring(ab, inode->i_sb->s_id);
                      audit_log_format(ab, " ino=%lu", inode->i_ino);
                      break;
              }
              case LSM_AUDIT_DATA_TASK: {
                      struct task_struct *tsk = a->u.tsk;
                      if (tsk) {
                              pid_t pid = task_tgid_nr(tsk);
                              if (pid) {
                                      char comm[sizeof(tsk->comm)];
                                      audit_log_format(ab, " opid=%d ocomm=", pid);
                                      audit_log_untrustedstring(ab,
                                          memcpy(comm, tsk->comm, sizeof(comm)));
                              }
                      }
                      break;
              }
              case LSM_AUDIT_DATA_NET:
                       if (a->u.net->sk) {
                              struct sock *sk = a->u.net->sk;
                              struct unix_sock *u;
                              struct unix_address *addr;
                              int len = 0;
                              char *p = NULL;
      
                               switch (sk->sk_family) {
                              case AF_INET: {
                                      struct inet_sock *inet = inet_sk(sk);
      
                                      print_ipv4_addr(ab, inet->inet_rcv_saddr,
                                                      inet->inet_sport,
                                                      "laddr", "lport");
                                      print_ipv4_addr(ab, inet->inet_daddr,
                                                      inet->inet_dport,
                                                      "faddr", "fport");
                                      break;
                              }
      #if IS_ENABLED(CONFIG_IPV6)
                              case AF_INET6: {
                                      struct inet_sock *inet = inet_sk(sk);
      
                                      print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr,
                                                      inet->inet_sport,
                                                      "laddr", "lport");
                                      print_ipv6_addr(ab, &sk->sk_v6_daddr,
                                                      inet->inet_dport,
                                                      "faddr", "fport");
                                      break;
                              }
      #endif
                              case AF_UNIX:
                                      u = unix_sk(sk);
                                      addr = smp_load_acquire(&u->addr);
                                      if (!addr)
                                              break;
                                      if (u->path.dentry) {
                                              audit_log_d_path(ab, " path=", &u->path);
                                              break;
                                      }
                                      len = addr->len-sizeof(short);
                                      p = &addr->name->sun_path[0];
                                      audit_log_format(ab, " path=");
                                      if (*p)
                                              audit_log_untrustedstring(ab, p);
                                      else
                                              audit_log_n_hex(ab, p, len);
                                      break;
                              }
                      }
      
                       switch (a->u.net->family) {
                      case AF_INET:
                              print_ipv4_addr(ab, a->u.net->v4info.saddr,
                                              a->u.net->sport,
                                              "saddr", "src");
                              print_ipv4_addr(ab, a->u.net->v4info.daddr,
                                              a->u.net->dport,
                                              "daddr", "dest");
                              break;
                      case AF_INET6:
                              print_ipv6_addr(ab, &a->u.net->v6info.saddr,
                                              a->u.net->sport,
                                              "saddr", "src");
                              print_ipv6_addr(ab, &a->u.net->v6info.daddr,
                                              a->u.net->dport,
                                              "daddr", "dest");
                              break;
                      }
                       if (a->u.net->netif > 0) {
                              struct net_device *dev;
      
                              /* NOTE: we always use init's namespace */
                              dev = dev_get_by_index(&init_net, a->u.net->netif);
                              if (dev) {
                                      audit_log_format(ab, " netif=%s", dev->name);
                                      dev_put(dev);
                              }
                      }
                      break;
      #ifdef CONFIG_KEYS
              case LSM_AUDIT_DATA_KEY:
                      audit_log_format(ab, " key_serial=%u", a->u.key_struct.key);
                      if (a->u.key_struct.key_desc) {
                              audit_log_format(ab, " key_desc=");
                               audit_log_untrustedstring(ab, a->u.key_struct.key_desc);
                      }
                      break;
      #endif
              case LSM_AUDIT_DATA_KMOD:
                      audit_log_format(ab, " kmod=");
                      audit_log_untrustedstring(ab, a->u.kmod_name);
                      break;
              case LSM_AUDIT_DATA_IBPKEY: {
                      struct in6_addr sbn_pfx;
      
                      memset(&sbn_pfx.s6_addr, 0,
                             sizeof(sbn_pfx.s6_addr));
                      memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix,
                             sizeof(a->u.ibpkey->subnet_prefix));
                      audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c",
                                       a->u.ibpkey->pkey, &sbn_pfx);
                      break;
              }
              case LSM_AUDIT_DATA_IBENDPORT:
                      audit_log_format(ab, " device=%s port_num=%u",
                                       a->u.ibendport->dev_name,
                                       a->u.ibendport->port);
                      break;
              } /* switch (a->type) */
      }
      
      /**
       * common_lsm_audit - generic LSM auditing function
       * @a:  auxiliary audit data
       * @pre_audit: lsm-specific pre-audit callback
       * @post_audit: lsm-specific post-audit callback
       *
        * Sets up the audit buffer for common security information and uses
        * the callbacks to print LSM-specific information.
       */
      void common_lsm_audit(struct common_audit_data *a,
              void (*pre_audit)(struct audit_buffer *, void *),
              void (*post_audit)(struct audit_buffer *, void *))
      {
              struct audit_buffer *ab;
      
               if (a == NULL)
                       return;
               /* we use GFP_ATOMIC so we won't sleep */
               ab = audit_log_start(current->audit_context, GFP_ATOMIC | __GFP_NOWARN,
                                   AUDIT_AVC);
      
              if (ab == NULL)
                      return;
      
               if (pre_audit)
                       pre_audit(ab, a);

               dump_common_audit_data(ab, a);

               if (post_audit)
                       post_audit(ab, a);

               audit_log_end(ab);
      }
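
       /*
        * Illustrative sketch (the example_* callbacks are hypothetical): how an
        * LSM typically drives common_lsm_audit(), bracketing the common data
        * with its own prefix and verdict.
        */
       static void example_pre_audit(struct audit_buffer *ab, void *a)
       {
               audit_log_format(ab, "lsm=example op=read");
       }

       static void example_post_audit(struct audit_buffer *ab, void *a)
       {
               audit_log_format(ab, " result=denied");
       }

       static void __maybe_unused example_audit_file(struct file *file)
       {
               struct common_audit_data ad;

               ad.type = LSM_AUDIT_DATA_FILE;
               ad.u.file = file;
               common_lsm_audit(&ad, example_pre_audit, example_post_audit);
       }
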
      #include <linux/kdebug.h>
      #include <linux/kprobes.h>
      #include <linux/export.h>
      #include <linux/notifier.h>
      #include <linux/rcupdate.h>
      #include <linux/vmalloc.h>
      #include <linux/reboot.h>
      
      /*
       *        Notifier list for kernel code which wants to be called
       *        at shutdown. This is used to stop any idling DMA operations
       *        and the like.
       */
      BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
      
      /*
       *        Notifier chain core routines.  The exported routines below
       *        are layered on top of these, with appropriate locking added.
       */
      
      static int notifier_chain_register(struct notifier_block **nl,
                      struct notifier_block *n)
      {
              while ((*nl) != NULL) {
                      if (n->priority > (*nl)->priority)
                              break;
                      nl = &((*nl)->next);
              }
              n->next = *nl;
              rcu_assign_pointer(*nl, n);
              return 0;
      }
      
      static int notifier_chain_cond_register(struct notifier_block **nl,
                      struct notifier_block *n)
      {
              while ((*nl) != NULL) {
                      if ((*nl) == n)
                              return 0;
                      if (n->priority > (*nl)->priority)
                              break;
                      nl = &((*nl)->next);
              }
              n->next = *nl;
              rcu_assign_pointer(*nl, n);
              return 0;
      }
      
      static int notifier_chain_unregister(struct notifier_block **nl,
                      struct notifier_block *n)
      {
              while ((*nl) != NULL) {
                      if ((*nl) == n) {
                              rcu_assign_pointer(*nl, n->next);
                              return 0;
                      }
                      nl = &((*nl)->next);
              }
              return -ENOENT;
      }
      
       /**
        * notifier_call_chain - Informs the registered notifiers about an event.
        *        @nl:                Pointer to head of the notifier chain
        *        @val:                Value passed unmodified to notifier function
        *        @v:                Pointer passed unmodified to notifier function
        *        @nr_to_call:        Number of notifier functions to be called; pass -1
        *                        to call every function on the chain.
        *        @nr_calls:        Records the number of notifications sent; may be
        *                        NULL if the caller doesn't care.
        *        @returns:        notifier_call_chain returns the value returned by the
        *                        last notifier function called.
        */
      static int notifier_call_chain(struct notifier_block **nl,
                                     unsigned long val, void *v,
                                     int nr_to_call, int *nr_calls)
      {
              int ret = NOTIFY_DONE;
              struct notifier_block *nb, *next_nb;
      
               nb = rcu_dereference_raw(*nl);

               while (nb && nr_to_call) {
                       next_nb = rcu_dereference_raw(nb->next);
      
      #ifdef CONFIG_DEBUG_NOTIFIERS
                      if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                              WARN(1, "Invalid notifier called!");
                              nb = next_nb;
                              continue;
                      }
      #endif
                       ret = nb->notifier_call(nb, val, v);
      
                      if (nr_calls)
                              (*nr_calls)++;
      
                       if (ret & NOTIFY_STOP_MASK)
                              break;
                      nb = next_nb;
                       nr_to_call--;
              }
               return ret;
      }
      NOKPROBE_SYMBOL(notifier_call_chain);
      
      /*
       *        Atomic notifier chain routines.  Registration and unregistration
       *        use a spinlock, and call_chain is synchronized by RCU (no locks).
       */
      
      /**
       *        atomic_notifier_chain_register - Add notifier to an atomic notifier chain
       *        @nh: Pointer to head of the atomic notifier chain
       *        @n: New entry in notifier chain
       *
       *        Adds a notifier to an atomic notifier chain.
       *
       *        Currently always returns zero.
       */
      int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                      struct notifier_block *n)
      {
              unsigned long flags;
              int ret;
      
              spin_lock_irqsave(&nh->lock, flags);
              ret = notifier_chain_register(&nh->head, n);
              spin_unlock_irqrestore(&nh->lock, flags);
              return ret;
      }
      EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
      
      /**
       *        atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
       *        @nh: Pointer to head of the atomic notifier chain
       *        @n: Entry to remove from notifier chain
       *
       *        Removes a notifier from an atomic notifier chain.
       *
       *        Returns zero on success or %-ENOENT on failure.
       */
      int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                      struct notifier_block *n)
      {
              unsigned long flags;
              int ret;
      
              spin_lock_irqsave(&nh->lock, flags);
              ret = notifier_chain_unregister(&nh->head, n);
              spin_unlock_irqrestore(&nh->lock, flags);
              synchronize_rcu();
              return ret;
      }
      EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
      
      /**
       *        __atomic_notifier_call_chain - Call functions in an atomic notifier chain
       *        @nh: Pointer to head of the atomic notifier chain
       *        @val: Value passed unmodified to notifier function
       *        @v: Pointer passed unmodified to notifier function
       *        @nr_to_call: See the comment for notifier_call_chain.
       *        @nr_calls: See the comment for notifier_call_chain.
       *
       *        Calls each function in a notifier chain in turn.  The functions
       *        run in an atomic context, so they must not block.
       *        This routine uses RCU to synchronize with changes to the chain.
       *
       *        If the return value of the notifier can be and'ed
       *        with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
       *        will return immediately, with the return value of
       *        the notifier function which halted execution.
       *        Otherwise the return value is the return value
       *        of the last notifier function called.
       */
      int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                                       unsigned long val, void *v,
                                       int nr_to_call, int *nr_calls)
      {
              int ret;
      
               rcu_read_lock();
               ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
               rcu_read_unlock();
              return ret;
      }
      EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
      NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
      
      int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                                     unsigned long val, void *v)
      {
               return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
      }
      EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
      NOKPROBE_SYMBOL(atomic_notifier_call_chain);
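
       /*
        * Illustrative sketch (example_chain and the example_* names are
        * hypothetical): registering on an atomic chain and raising an event.
        * The callback runs under rcu_read_lock() and must not block.
        */
       static int example_notifier_event(struct notifier_block *nb,
                                         unsigned long action, void *data)
       {
               return NOTIFY_OK;
       }

       static struct notifier_block example_nb = {
               .notifier_call = example_notifier_event,
       };

       static ATOMIC_NOTIFIER_HEAD(example_chain);

       static void __maybe_unused example_atomic_notifier_usage(void)
       {
               atomic_notifier_chain_register(&example_chain, &example_nb);
               atomic_notifier_call_chain(&example_chain, 1, NULL);
               atomic_notifier_chain_unregister(&example_chain, &example_nb);
       }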
      
      /*
       *        Blocking notifier chain routines.  All access to the chain is
       *        synchronized by an rwsem.
       */
      
      /**
       *        blocking_notifier_chain_register - Add notifier to a blocking notifier chain
       *        @nh: Pointer to head of the blocking notifier chain
       *        @n: New entry in notifier chain
       *
       *        Adds a notifier to a blocking notifier chain.
       *        Must be called in process context.
       *
       *        Currently always returns zero.
       */
      int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                      struct notifier_block *n)
      {
              int ret;
      
              /*
               * This code gets used during boot-up, when task switching is
               * not yet working and interrupts must remain disabled.  At
               * such times we must not call down_write().
               */
              if (unlikely(system_state == SYSTEM_BOOTING))
                      return notifier_chain_register(&nh->head, n);
      
              down_write(&nh->rwsem);
              ret = notifier_chain_register(&nh->head, n);
              up_write(&nh->rwsem);
              return ret;
      }
      EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
      
      /**
        *        blocking_notifier_chain_cond_register - Conditionally add notifier to a blocking notifier chain
       *        @nh: Pointer to head of the blocking notifier chain
       *        @n: New entry in notifier chain
       *
       *        Adds a notifier to a blocking notifier chain, only if not already
       *        present in the chain.
       *        Must be called in process context.
       *
       *        Currently always returns zero.
       */
      int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
                      struct notifier_block *n)
      {
              int ret;
      
              down_write(&nh->rwsem);
              ret = notifier_chain_cond_register(&nh->head, n);
              up_write(&nh->rwsem);
              return ret;
      }
      EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
      
      /**
       *        blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
       *        @nh: Pointer to head of the blocking notifier chain
       *        @n: Entry to remove from notifier chain
       *
       *        Removes a notifier from a blocking notifier chain.
       *        Must be called from process context.
       *
       *        Returns zero on success or %-ENOENT on failure.
       */
      int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                      struct notifier_block *n)
      {
              int ret;
      
              /*
               * This code gets used during boot-up, when task switching is
               * not yet working and interrupts must remain disabled.  At
               * such times we must not call down_write().
               */
              if (unlikely(system_state == SYSTEM_BOOTING))
                      return notifier_chain_unregister(&nh->head, n);
      
              down_write(&nh->rwsem);
              ret = notifier_chain_unregister(&nh->head, n);
              up_write(&nh->rwsem);
              return ret;
      }
      EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
      
      /**
       *        __blocking_notifier_call_chain - Call functions in a blocking notifier chain
       *        @nh: Pointer to head of the blocking notifier chain
       *        @val: Value passed unmodified to notifier function
       *        @v: Pointer passed unmodified to notifier function
       *        @nr_to_call: See comment for notifier_call_chain.
       *        @nr_calls: See comment for notifier_call_chain.
       *
       *        Calls each function in a notifier chain in turn.  The functions
       *        run in a process context, so they are allowed to block.
       *
       *        If the return value of the notifier can be and'ed
       *        with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
       *        will return immediately, with the return value of
       *        the notifier function which halted execution.
       *        Otherwise the return value is the return value
       *        of the last notifier function called.
       */
       int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                                         unsigned long val, void *v,
                                         int nr_to_call, int *nr_calls)
      {
              int ret = NOTIFY_DONE;
      
              /*
               * We check the head outside the lock, but if this access is
               * racy then it does not matter what the result of the test
               * is, we re-check the list after having taken the lock anyway:
               */
  666         if (rcu_access_pointer(nh->head)) {
  530                 down_read(&nh->rwsem);
                      ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
                                              nr_calls);
                      up_read(&nh->rwsem);
              }
              return ret;
      }
      EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
      
  530 int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                      unsigned long val, void *v)
      {
  666         return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
      }
      EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
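       
       /*
        * Usage sketch for the blocking chain helpers above.  The chain head,
        * callback and event value are illustrative (they are not part of this
        * file); only the notifier API calls are real.  A callback may sleep,
        * and returning NOTIFY_STOP instead of NOTIFY_OK halts the chain.
        */
       #if 0  /* illustrative only, not built */
       static BLOCKING_NOTIFIER_HEAD(example_chain);
       
       static int example_event_cb(struct notifier_block *nb,
                                   unsigned long event, void *data)
       {
               pr_info("example: got event %lu\n", event);
               return NOTIFY_OK;
       }
       
       static struct notifier_block example_nb = {
               .notifier_call = example_event_cb,
       };
       
       static void example_usage(void)
       {
               blocking_notifier_chain_register(&example_chain, &example_nb);
               blocking_notifier_call_chain(&example_chain, 1, NULL);
               blocking_notifier_chain_unregister(&example_chain, &example_nb);
       }
       #endif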
      
      /*
       *        Raw notifier chain routines.  There is no protection;
       *        the caller must provide it.  Use at your own risk!
       */
      
      /**
       *        raw_notifier_chain_register - Add notifier to a raw notifier chain
       *        @nh: Pointer to head of the raw notifier chain
       *        @n: New entry in notifier chain
       *
       *        Adds a notifier to a raw notifier chain.
       *        All locking must be provided by the caller.
       *
       *        Currently always returns zero.
       */
      int raw_notifier_chain_register(struct raw_notifier_head *nh,
                      struct notifier_block *n)
      {
              return notifier_chain_register(&nh->head, n);
      }
      EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
      
      /**
       *        raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
       *        @nh: Pointer to head of the raw notifier chain
       *        @n: Entry to remove from notifier chain
       *
       *        Removes a notifier from a raw notifier chain.
       *        All locking must be provided by the caller.
       *
       *        Returns zero on success or %-ENOENT on failure.
       */
      int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                      struct notifier_block *n)
      {
              return notifier_chain_unregister(&nh->head, n);
      }
      EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
      
      /**
       *        __raw_notifier_call_chain - Call functions in a raw notifier chain
       *        @nh: Pointer to head of the raw notifier chain
       *        @val: Value passed unmodified to notifier function
       *        @v: Pointer passed unmodified to notifier function
       *        @nr_to_call: See comment for notifier_call_chain.
       *        @nr_calls: See comment for notifier_call_chain
       *
       *        Calls each function in a notifier chain in turn.  The functions
       *        run in an undefined context.
       *        All locking must be provided by the caller.
       *
       *        If the return value of the notifier can be and'ed
       *        with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
       *        will return immediately, with the return value of
       *        the notifier function which halted execution.
       *        Otherwise the return value is the return value
       *        of the last notifier function called.
       */
      int __raw_notifier_call_chain(struct raw_notifier_head *nh,
                                    unsigned long val, void *v,
                                    int nr_to_call, int *nr_calls)
      {
 1039         return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
      }
      EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
      
      int raw_notifier_call_chain(struct raw_notifier_head *nh,
                      unsigned long val, void *v)
      {
 1039         return __raw_notifier_call_chain(nh, val, v, -1, NULL);
      }
      EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
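       
       /*
        * Usage sketch for the raw chain helpers above.  The chain, lock and
        * function below are illustrative; the point is that the caller
        * provides all serialization itself (here a spinlock), so registered
        * callbacks run with that lock held and must not sleep.
        */
       #if 0  /* illustrative only, not built */
       static RAW_NOTIFIER_HEAD(example_raw_chain);
       static DEFINE_SPINLOCK(example_raw_lock);
       
       static void example_raw_notify(unsigned long event, void *data)
       {
               unsigned long flags;
       
               spin_lock_irqsave(&example_raw_lock, flags);
               raw_notifier_call_chain(&example_raw_chain, event, data);
               spin_unlock_irqrestore(&example_raw_lock, flags);
       }
       #endif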
      
      #ifdef CONFIG_SRCU
      /*
       *        SRCU notifier chain routines.    Registration and unregistration
       *        use a mutex, and call_chain is synchronized by SRCU (no locks).
       */
      
      /**
       *        srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
       *        @nh: Pointer to head of the SRCU notifier chain
       *        @n: New entry in notifier chain
       *
       *        Adds a notifier to an SRCU notifier chain.
       *        Must be called in process context.
       *
       *        Currently always returns zero.
       */
      int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                      struct notifier_block *n)
      {
              int ret;
      
              /*
               * This code gets used during boot-up, when task switching is
               * not yet working and interrupts must remain disabled.  At
               * such times we must not call mutex_lock().
               */
              if (unlikely(system_state == SYSTEM_BOOTING))
                      return notifier_chain_register(&nh->head, n);
      
              mutex_lock(&nh->mutex);
              ret = notifier_chain_register(&nh->head, n);
              mutex_unlock(&nh->mutex);
              return ret;
      }
      EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
      
      /**
       *        srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
       *        @nh: Pointer to head of the SRCU notifier chain
       *        @n: Entry to remove from notifier chain
       *
       *        Removes a notifier from an SRCU notifier chain.
       *        Must be called from process context.
       *
       *        Returns zero on success or %-ENOENT on failure.
       */
      int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                      struct notifier_block *n)
      {
              int ret;
      
              /*
               * This code gets used during boot-up, when task switching is
               * not yet working and interrupts must remain disabled.  At
               * such times we must not call mutex_lock().
               */
              if (unlikely(system_state == SYSTEM_BOOTING))
                      return notifier_chain_unregister(&nh->head, n);
      
              mutex_lock(&nh->mutex);
              ret = notifier_chain_unregister(&nh->head, n);
              mutex_unlock(&nh->mutex);
              synchronize_srcu(&nh->srcu);
              return ret;
      }
      EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
      
      /**
       *        __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
       *        @nh: Pointer to head of the SRCU notifier chain
       *        @val: Value passed unmodified to notifier function
       *        @v: Pointer passed unmodified to notifier function
       *        @nr_to_call: See comment for notifier_call_chain.
       *        @nr_calls: See comment for notifier_call_chain
       *
       *        Calls each function in a notifier chain in turn.  The functions
       *        run in a process context, so they are allowed to block.
       *
       *        If the return value of the notifier can be and'ed
       *        with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
       *        will return immediately, with the return value of
       *        the notifier function which halted execution.
       *        Otherwise the return value is the return value
       *        of the last notifier function called.
       */
      int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                                     unsigned long val, void *v,
                                     int nr_to_call, int *nr_calls)
      {
              int ret;
              int idx;
      
              idx = srcu_read_lock(&nh->srcu);
              ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
              srcu_read_unlock(&nh->srcu, idx);
              return ret;
      }
      EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
      
      int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                      unsigned long val, void *v)
      {
              return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
      }
      EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
      
      /**
       *        srcu_init_notifier_head - Initialize an SRCU notifier head
       *        @nh: Pointer to head of the srcu notifier chain
       *
       *        Unlike other sorts of notifier heads, SRCU notifier heads require
       *        dynamic initialization.  Be sure to call this routine before
       *        calling any of the other SRCU notifier routines for this head.
       *
       *        If an SRCU notifier head is deallocated, it must first be cleaned
       *        up by calling srcu_cleanup_notifier_head().  Otherwise the head's
       *        per-cpu data (used by the SRCU mechanism) will leak.
       */
      void srcu_init_notifier_head(struct srcu_notifier_head *nh)
      {
              mutex_init(&nh->mutex);
              if (init_srcu_struct(&nh->srcu) < 0)
                      BUG();
              nh->head = NULL;
      }
      EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
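       
       /*
        * Usage sketch for the SRCU chain rules documented above: the head is
        * initialized dynamically before first use and cleaned up before it is
        * freed.  The names are illustrative; srcu_cleanup_notifier_head() is
        * the cleanup helper referred to in the comment above.
        */
       #if 0  /* illustrative only, not built */
       static struct srcu_notifier_head example_srcu_chain;
       
       static int __init example_srcu_setup(void)
       {
               srcu_init_notifier_head(&example_srcu_chain);
               return 0;
       }
       
       static void example_srcu_teardown(void)
       {
               /* all notifier blocks must already be unregistered */
               srcu_cleanup_notifier_head(&example_srcu_chain);
       }
       #endif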
      
      #endif /* CONFIG_SRCU */
      
      static ATOMIC_NOTIFIER_HEAD(die_chain);
      
      int notrace notify_die(enum die_val val, const char *str,
                     struct pt_regs *regs, long err, int trap, int sig)
      {
   15         struct die_args args = {
                      .regs        = regs,
                      .str        = str,
                      .err        = err,
                      .trapnr        = trap,
                       .signr        = sig,
               };
   15         RCU_LOCKDEP_WARN(!rcu_is_watching(),
                                 "notify_die called but RCU thinks we're quiescent");
   15         return atomic_notifier_call_chain(&die_chain, val, &args);
      }
      NOKPROBE_SYMBOL(notify_die);
      
      int register_die_notifier(struct notifier_block *nb)
      {
              vmalloc_sync_all();
              return atomic_notifier_chain_register(&die_chain, nb);
      }
      EXPORT_SYMBOL_GPL(register_die_notifier);
      
      int unregister_die_notifier(struct notifier_block *nb)
      {
              return atomic_notifier_chain_unregister(&die_chain, nb);
      }
      EXPORT_SYMBOL_GPL(unregister_die_notifier);
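       
       /*
        * Sketch of a die-chain consumer.  The callback name is illustrative;
        * @val is an arch-specific enum die_val and @data points at the
        * struct die_args built by notify_die() above.  The chain is atomic
        * and runs in exception context, so the callback must not sleep.
        */
       #if 0  /* illustrative only, not built */
       static int example_die_cb(struct notifier_block *nb, unsigned long val,
                                 void *data)
       {
               struct die_args *args = data;
       
               pr_emerg("example: die event %lu (%s)\n", val, args->str);
               return NOTIFY_DONE;     /* let the rest of the chain run */
       }
       
       static struct notifier_block example_die_nb = {
               .notifier_call = example_die_cb,
       };
       
       static int __init example_die_init(void)
       {
               return register_die_notifier(&example_die_nb);
       }
       #endif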
      // SPDX-License-Identifier: GPL-2.0
      /*
       * trace context switch
       *
       * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
       *
       */
      #include <linux/module.h>
      #include <linux/kallsyms.h>
      #include <linux/uaccess.h>
      #include <linux/ftrace.h>
      #include <trace/events/sched.h>
      
      #include "trace.h"
      
      #define RECORD_CMDLINE        1
      #define RECORD_TGID        2
      
      static int                sched_cmdline_ref;
      static int                sched_tgid_ref;
      static DEFINE_MUTEX(sched_register_mutex);
      
      static void
      probe_sched_switch(void *ignore, bool preempt,
                         struct task_struct *prev, struct task_struct *next)
      {
              int flags;
      
 2224         flags = (RECORD_TGID * !!sched_tgid_ref) +
                      (RECORD_CMDLINE * !!sched_cmdline_ref);
      
 2224         if (!flags)
                      return;
 2224         tracing_record_taskinfo_sched_switch(prev, next, flags);
      }
      
      static void
      probe_sched_wakeup(void *ignore, struct task_struct *wakee)
      {
              int flags;
      
 1030         flags = (RECORD_TGID * !!sched_tgid_ref) +
                      (RECORD_CMDLINE * !!sched_cmdline_ref);
      
 1030         if (!flags)
                      return;
 1030         tracing_record_taskinfo(current, flags);
      }
      
      static int tracing_sched_register(void)
      {
              int ret;
      
              ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
              if (ret) {
                      pr_info("wakeup trace: Couldn't activate tracepoint"
                              " probe to kernel_sched_wakeup\n");
                      return ret;
              }
      
              ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
              if (ret) {
                      pr_info("wakeup trace: Couldn't activate tracepoint"
                              " probe to kernel_sched_wakeup_new\n");
                      goto fail_deprobe;
              }
      
              ret = register_trace_sched_switch(probe_sched_switch, NULL);
              if (ret) {
                      pr_info("sched trace: Couldn't activate tracepoint"
                              " probe to kernel_sched_switch\n");
                      goto fail_deprobe_wake_new;
              }
      
              return ret;
      fail_deprobe_wake_new:
              unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
      fail_deprobe:
              unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
              return ret;
      }
      
      static void tracing_sched_unregister(void)
      {
              unregister_trace_sched_switch(probe_sched_switch, NULL);
              unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
              unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
      }
      
      static void tracing_start_sched_switch(int ops)
      {
               bool sched_register;
       
               mutex_lock(&sched_register_mutex);
               /*
                * Sample the reference counts under the mutex so that a
                * concurrent caller cannot also see them both as zero and
                * register the tracepoint probes a second time.
                */
               sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
      
              switch (ops) {
              case RECORD_CMDLINE:
                      sched_cmdline_ref++;
                      break;
      
              case RECORD_TGID:
                      sched_tgid_ref++;
                      break;
              }
      
              if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
                      tracing_sched_register();
              mutex_unlock(&sched_register_mutex);
      }
      
      static void tracing_stop_sched_switch(int ops)
      {
              mutex_lock(&sched_register_mutex);
      
              switch (ops) {
              case RECORD_CMDLINE:
                      sched_cmdline_ref--;
                      break;
      
              case RECORD_TGID:
                      sched_tgid_ref--;
                      break;
              }
      
              if (!sched_cmdline_ref && !sched_tgid_ref)
                      tracing_sched_unregister();
              mutex_unlock(&sched_register_mutex);
      }
      
      void tracing_start_cmdline_record(void)
      {
              tracing_start_sched_switch(RECORD_CMDLINE);
      }
      
      void tracing_stop_cmdline_record(void)
      {
              tracing_stop_sched_switch(RECORD_CMDLINE);
      }
      
      void tracing_start_tgid_record(void)
      {
              tracing_start_sched_switch(RECORD_TGID);
      }
      
      void tracing_stop_tgid_record(void)
      {
              tracing_stop_sched_switch(RECORD_TGID);
      }
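       
       /*
        * Sketch of how the start/stop pairs above are used.  The tracer name
        * is illustrative: it takes a cmdline-recording reference while active
        * and drops it on reset, and tracing_start/stop_sched_switch() only
        * keep the sched tracepoint probes registered while at least one such
        * reference remains.
        */
       #if 0  /* illustrative only, not built */
       static int example_tracer_init(struct trace_array *tr)
       {
               tracing_start_cmdline_record();
               return 0;
       }
       
       static void example_tracer_reset(struct trace_array *tr)
       {
               tracing_stop_cmdline_record();
       }
       #endif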
      /*
        * ratelimit.c - Simple rate-limiting helper (see ___ratelimit() below).
       *
       * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
       *
       * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
       * parameter. Now every user can use their own standalone ratelimit_state.
       *
       * This file is released under the GPLv2.
       */
      
      #include <linux/ratelimit.h>
      #include <linux/jiffies.h>
      #include <linux/export.h>
      
      /*
        * ___ratelimit - rate limiting
       * @rs: ratelimit_state data
       * @func: name of calling function
       *
       * This enforces a rate limit: not more than @rs->burst callbacks
        * in every @rs->interval.
       *
       * RETURNS:
       * 0 means callbacks will be suppressed.
       * 1 means go ahead and do it.
       */
 1037 int ___ratelimit(struct ratelimit_state *rs, const char *func)
      {
              unsigned long flags;
              int ret;
      
 1272         if (!rs->interval)
                      return 1;
      
              /*
               * If we contend on this state's lock then almost
               * by definition we are too busy to print a message,
               * in addition to the one that will be printed by
               * the entity that is holding the lock already:
               */
 1036         if (!raw_spin_trylock_irqsave(&rs->lock, flags))
                      return 0;
      
 1037         if (!rs->begin)
                      rs->begin = jiffies;
      
 1037         if (time_is_before_jiffies(rs->begin + rs->interval)) {
  209                 if (rs->missed) {
   83                         if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
   83                                 printk_deferred(KERN_WARNING
                                                      "%s: %d callbacks suppressed\n",
                                                      func, rs->missed);
                                      rs->missed = 0;
                              }
                      }
  209                 rs->begin   = jiffies;
                      rs->printed = 0;
              }
 1037         if (rs->burst && rs->burst > rs->printed) {
  861                 rs->printed++;
                      ret = 1;
              } else {
  383                 rs->missed++;
                      ret = 0;
              }
 1037         raw_spin_unlock_irqrestore(&rs->lock, flags);
      
              return ret;
      }
      EXPORT_SYMBOL(___ratelimit);
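       
       /*
        * Usage sketch.  ___ratelimit() is normally reached through the
        * __ratelimit() macro, which passes __func__ as @func.  The state and
        * function below are illustrative: at most 10 messages per 5 seconds,
        * with suppressed calls counted and reported later.
        */
       #if 0  /* illustrative only, not built */
       static DEFINE_RATELIMIT_STATE(example_rs, 5 * HZ, 10);
       
       static void example_warn(int err)
       {
               if (__ratelimit(&example_rs))
                       pr_warn("example: transient error %d\n", err);
       }
       #endif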
      // SPDX-License-Identifier: GPL-2.0
      /*
       * This file contains the procedures for the handling of select and poll
       *
       * Created for Linux based loosely upon Mathius Lattner's minix
       * patches by Peter MacDonald. Heavily edited by Linus.
       *
       *  4 February 1994
       *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
       *     flag set in its personality we do *not* modify the given timeout
       *     parameter to reflect time remaining.
       *
       *  24 January 2000
       *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
       *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
       */
      
      #include <linux/kernel.h>
      #include <linux/sched/signal.h>
      #include <linux/sched/rt.h>
      #include <linux/syscalls.h>
      #include <linux/export.h>
      #include <linux/slab.h>
      #include <linux/poll.h>
      #include <linux/personality.h> /* for STICKY_TIMEOUTS */
      #include <linux/file.h>
      #include <linux/fdtable.h>
      #include <linux/fs.h>
      #include <linux/rcupdate.h>
      #include <linux/hrtimer.h>
      #include <linux/freezer.h>
      #include <net/busy_poll.h>
      #include <linux/vmalloc.h>
      
      #include <linux/uaccess.h>
      
      
      /*
       * Estimate expected accuracy in ns from a timeval.
       *
       * After quite a bit of churning around, we've settled on
       * a simple thing of taking 0.1% of the timeout as the
       * slack, with a cap of 100 msec.
       * "nice" tasks get a 0.5% slack instead.
       *
       * Consider this comment an open invitation to come up with even
       * better solutions..
       */
      
      #define MAX_SLACK        (100 * NSEC_PER_MSEC)
      
      static long __estimate_accuracy(struct timespec64 *tv)
      {
              long slack;
              int divfactor = 1000;
      
              if (tv->tv_sec < 0)
                      return 0;
      
  205         if (task_nice(current) > 0)
                      divfactor = divfactor / 5;
      
  205         if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
                      return MAX_SLACK;
      
  191         slack = tv->tv_nsec / divfactor;
              slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
      
              if (slack > MAX_SLACK)
                      return MAX_SLACK;
      
              return slack;
      }
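       
       /*
        * Worked examples of the 0.1% rule above, assuming the default
        * divfactor of 1000: a 2 s timeout yields a 2 ms slack and a 10 ms
        * timeout yields 10 us, while anything beyond 100 s hits the 100 ms
        * MAX_SLACK cap.  A task with a positive nice value uses a divfactor
        * of 200, i.e. a 0.5% slack.
        */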
      
      u64 select_estimate_accuracy(struct timespec64 *tv)
      {
              u64 ret;
              struct timespec64 now;
      
              /*
               * Realtime tasks get a slack of 0 for obvious reasons.
               */
      
  219         if (rt_task(current))
                      return 0;
      
  206         ktime_get_ts64(&now);
              now = timespec64_sub(*tv, now);
  205         ret = __estimate_accuracy(&now);
  219         if (ret < current->timer_slack_ns)
   54                 return current->timer_slack_ns;
              return ret;
      }
      
      
      
      struct poll_table_page {
              struct poll_table_page * next;
              struct poll_table_entry * entry;
              struct poll_table_entry entries[0];
      };
      
      #define POLL_TABLE_FULL(table) \
              ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
      
      /*
       * Ok, Peter made a complicated, but straightforward multiple_wait() function.
       * I have rewritten this, taking some shortcuts: This code may not be easy to
       * follow, but it should be free of race-conditions, and it's practical. If you
       * understand what I'm doing here, then you understand how the linux
       * sleep/wakeup mechanism works.
       *
       * Two very simple procedures, poll_wait() and poll_freewait() make all the
       * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
       * as all select/poll functions have to call it to add an entry to the
       * poll table.
       */
      static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                             poll_table *p);
      
      void poll_initwait(struct poll_wqueues *pwq)
      {
  403         init_poll_funcptr(&pwq->pt, __pollwait);
              pwq->polling_task = current;
              pwq->triggered = 0;
              pwq->error = 0;
              pwq->table = NULL;
              pwq->inline_index = 0;
      }
      EXPORT_SYMBOL(poll_initwait);
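       
       /*
        * Sketch of the driver side of the contract described above: a ->poll
        * method calls poll_wait() to register on its wait queue (poll_wait()
        * itself never blocks) and returns the events that are ready right
        * now.  The wait queue, flag and function names are illustrative.
        */
       #if 0  /* illustrative only, not built */
       static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
       static bool example_data_ready;
       
       static unsigned int example_poll(struct file *file, poll_table *wait)
       {
               unsigned int mask = 0;
       
               poll_wait(file, &example_waitq, wait);
               if (example_data_ready)
                       mask |= POLLIN | POLLRDNORM;
               return mask;
       }
       #endif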
      
      static void free_poll_entry(struct poll_table_entry *entry)
      {
              remove_wait_queue(entry->wait_address, &entry->wait);
              fput(entry->filp);
      }
      
      void poll_freewait(struct poll_wqueues *pwq)
      {
  264         struct poll_table_page * p = pwq->table;
              int i;
  159         for (i = 0; i < pwq->inline_index; i++)
  159                 free_poll_entry(pwq->inline_entries + i);
  264         while (p) {
                      struct poll_table_entry * entry;
                      struct poll_table_page *old;
      
    6                 entry = p->entry;
                      do {
    6                         entry--;
                              free_poll_entry(entry);
                      } while (entry > p->entries);
                      old = p;
    6                 p = p->next;
                      free_page((unsigned long) old);
              }
      }
      EXPORT_SYMBOL(poll_freewait);
      
      static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
      {
    9         struct poll_table_page *table = p->table;
      
  255         if (p->inline_index < N_INLINE_POLL_ENTRIES)
  255                 return p->inline_entries + p->inline_index++;
      
    3         if (!table || POLL_TABLE_FULL(table)) {
                      struct poll_table_page *new_table;
      
    9                 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
                      if (!new_table) {
                              p->error = -ENOMEM;
                              return NULL;
                      }
    9                 new_table->entry = new_table->entries;
                      new_table->next = table;
                      p->table = new_table;
                      table = new_table;
              }
      
    9         return table->entry++;
      }
      
      static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
      {
              struct poll_wqueues *pwq = wait->private;
              DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
      
              /*
               * Although this function is called under waitqueue lock, LOCK
               * doesn't imply write barrier and the users expect write
               * barrier semantics on wakeup functions.  The following
               * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
               * and is paired with smp_store_mb() in poll_schedule_timeout.
               */
              smp_wmb();
              pwq->triggered = 1;
      
              /*
               * Perform the default wake up operation using a dummy
               * waitqueue.
               *
               * TODO: This is hacky but there currently is no interface to
               * pass in @sync.  @sync is scheduled to be removed and once
               * that happens, wake_up_process() can be used directly.
               */
   71         return default_wake_function(&dummy_wait, mode, sync, key);
      }
      
      static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
      {
              struct poll_table_entry *entry;
      
              entry = container_of(wait, struct poll_table_entry, wait);
   73         if (key && !((unsigned long)key & entry->key))
                      return 0;
   73         return __pollwake(wait, mode, sync, key);
      }
      
      /* Add a new entry */
      static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                      poll_table *p)
      {
              struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
  255         struct poll_table_entry *entry = poll_get_entry(pwq);
  255         if (!entry)
                      return;
  255         entry->filp = get_file(filp);
              entry->wait_address = wait_address;
              entry->key = p->_key;
              init_waitqueue_func_entry(&entry->wait, pollwake);
              entry->wait.private = pwq;
              add_wait_queue(wait_address, &entry->wait);
      }
      
      int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                                ktime_t *expires, unsigned long slack)
      {
              int rc = -EINTR;
      
  263         set_current_state(state);
              if (!pwq->triggered)
  263                 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
  214         __set_current_state(TASK_RUNNING);
      
              /*
               * Prepare for the next iteration.
               *
               * The following smp_store_mb() serves two purposes.  First, it's
               * the counterpart rmb of the wmb in pollwake() such that data
               * written before wake up is always visible after wake up.
               * Second, the full barrier guarantees that triggered clearing
               * doesn't pass event check of the next iteration.  Note that
               * this problem doesn't exist for the first iteration as
               * add_wait_queue() has full barrier semantics.
               */
              smp_store_mb(pwq->triggered, 0);
      
              return rc;
      }
      EXPORT_SYMBOL(poll_schedule_timeout);
      
      /**
       * poll_select_set_timeout - helper function to setup the timeout value
       * @to:                pointer to timespec64 variable for the final timeout
       * @sec:        seconds (from user space)
       * @nsec:        nanoseconds (from user space)
       *
        * Note: we do not use a timespec for the user space value here. That
        * way we can use the function for timeval and compat interfaces as well.
       *
       * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
       */
  315 int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
      {
              struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
      
  422         if (!timespec64_valid(&ts))
                      return -EINVAL;
      
              /* Optimize for the zero timeout value here */
  418         if (!sec && !nsec) {
  105                 to->tv_sec = to->tv_nsec = 0;
              } else {
  315                 ktime_get_ts64(to);
  315                 *to = timespec64_add_safe(*to, ts);
              }
  105         return 0;
   10 }
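       
       /*
        * For illustration: a relative 1.5 s user timeout becomes an absolute
        * expiry via poll_select_set_timeout(&end_time, 1, 500 * NSEC_PER_MSEC),
        * a zero timeout leaves end_time at 0/0 (which the pollers below treat
        * as "do not block"), and non-normalized input such as nsec >=
        * NSEC_PER_SEC is rejected with -EINVAL.
        */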
      
      static int poll_select_copy_remaining(struct timespec64 *end_time,
                                            void __user *p,
                                            int timeval, int ret)
      {
              struct timespec64 rts64;
              struct timespec rts;
              struct timeval rtv;
      
  209         if (!p)
                      return ret;
      
  168         if (current->personality & STICKY_TIMEOUTS)
                      goto sticky;
      
              /* No update for zero timeout */
  167         if (!end_time->tv_sec && !end_time->tv_nsec)
                      return ret;
      
  120         ktime_get_ts64(&rts64);
   27         rts64 = timespec64_sub(*end_time, rts64);
              if (rts64.tv_sec < 0)
   98                 rts64.tv_sec = rts64.tv_nsec = 0;
      
  120         rts = timespec64_to_timespec(rts64);
      
              if (timeval) {
                      if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
                              memset(&rtv, 0, sizeof(rtv));
    3                 rtv.tv_sec = rts64.tv_sec;
                      rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
      
                      if (!copy_to_user(p, &rtv, sizeof(rtv)))
                              return ret;
      
  117         } else if (!copy_to_user(p, &rts, sizeof(rts)))
                      return ret;
      
              /*
               * If an application puts its timeval in read-only memory, we
               * don't want the Linux-specific update to the timeval to
               * cause a fault after the select has completed
               * successfully. However, because we're not updating the
               * timeval, we can't restart the system call.
               */
      
      sticky:
  209         if (ret == -ERESTARTNOHAND)
                      ret = -EINTR;
              return ret;
      }
      
      /*
       * Scalable version of the fd_set.
       */
      
      typedef struct {
              unsigned long *in, *out, *ex;
              unsigned long *res_in, *res_out, *res_ex;
      } fd_set_bits;
      
      /*
       * How many longwords for "nr" bits?
       */
      #define FDS_BITPERLONG        (8*sizeof(long))
      #define FDS_LONGS(nr)        (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
      #define FDS_BYTES(nr)        (FDS_LONGS(nr)*sizeof(long))
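       
       /*
        * For illustration, on a 64-bit kernel FDS_BITPERLONG is 64, so
        * FDS_LONGS(65) = 2 and FDS_BYTES(65) = 16: a 65-fd set needs two
        * longwords (16 bytes) per bitmap.
        */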
      
      /*
       * We do a VERIFY_WRITE here even though we are only reading this time:
        * we'll write to it eventually.
       *
       * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
       */
      static inline
      int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
      {
              nr = FDS_BYTES(nr);
              if (ufdset)
  194                 return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;
      
  195         memset(fdset, 0, nr);
              return 0;
      }
      
      static inline unsigned long __must_check
      set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
      {
  116         if (ufdset)
                      return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
              return 0;
      }
      
      static inline
      void zero_fd_set(unsigned long nr, unsigned long *fdset)
      {
   10         memset(fdset, 0, FDS_BYTES(nr));
      }
      
      #define FDS_IN(fds, n)                (fds->in + n)
      #define FDS_OUT(fds, n)                (fds->out + n)
      #define FDS_EX(fds, n)                (fds->ex + n)
      
      #define BITS(fds, n)        (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
      
      static int max_select_fd(unsigned long n, fd_set_bits *fds)
      {
              unsigned long *open_fds;
              unsigned long set;
              int max;
              struct fdtable *fdt;
      
              /* handle last in-complete long-word first */
              set = ~(~0UL << (n & (BITS_PER_LONG-1)));
              n /= BITS_PER_LONG;
  210         fdt = files_fdtable(current->files);
  210         open_fds = fdt->open_fds + n;
              max = 0;
              if (set) {
    5                 set &= BITS(fds, n);
                      if (set) {
    4                         if (!(set & ~*open_fds))
                                      goto get_max;
                              return -EBADF;
                      }
              }
  209         while (n) {
  186                 open_fds--;
                      n--;
  186                 set = BITS(fds, n);
                      if (!set)
                              continue;
  180                 if (set & ~*open_fds)
                              return -EBADF;
  168                 if (max)
                              continue;
      get_max:
                      do {
  171                         max++;
                              set >>= 1;
                      } while (set);
  171                 max += n * BITS_PER_LONG;
              }
      
              return max;
      }
      
      #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
      #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
      #define POLLEX_SET (POLLPRI)
      
      static inline void wait_key_set(poll_table *wait, unsigned long in,
                                      unsigned long out, unsigned long bit,
                                      unsigned int ll_flag)
      {
  165         wait->_key = POLLEX_SET | ll_flag;
  170         if (in & bit)
    9                 wait->_key |= POLLIN_SET;
  170         if (out & bit)
    5                 wait->_key |= POLLOUT_SET;
      }
      
      static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
      {
              ktime_t expire, *to = NULL;
              struct poll_wqueues table;
              poll_table *wait;
              int retval, i, timed_out = 0;
              u64 slack = 0;
  210         unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
              unsigned long busy_start = 0;
      
  210         rcu_read_lock();
  210         retval = max_select_fd(n, fds);
  210         rcu_read_unlock();
      
              if (retval < 0)
                      return retval;
              n = retval;
      
  196         poll_initwait(&table);
              wait = &table.pt;
  156         if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
   38                 wait->_qproc = NULL;
                      timed_out = 1;
              }
      
              if (end_time && !timed_out)
  118                 slack = select_estimate_accuracy(end_time);
      
              retval = 0;
              for (;;) {
                      unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                      bool can_busy_loop = false;
      
  199                 inp = fds->in; outp = fds->out; exp = fds->ex;
                      rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
      
  164                 for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                              unsigned long in, out, ex, all_bits, bit = 1, mask, j;
                              unsigned long res_in = 0, res_out = 0, res_ex = 0;
      
  171                         in = *inp++; out = *outp++; ex = *exp++;
                              all_bits = in | out | ex;
                              if (all_bits == 0) {
    3                                 i += BITS_PER_LONG;
                                      continue;
                              }
      
  171                         for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                                      struct fd f;
  171                                 if (i >= n)
                                              break;
  171                                 if (!(bit & all_bits))
                                              continue;
  171                                 f = fdget(i);
                                      if (f.file) {
                                              const struct file_operations *f_op;
                                              f_op = f.file->f_op;
  164                                         mask = DEFAULT_POLLMASK;
                                              if (f_op->poll) {
  170                                                 wait_key_set(wait, in, out,
                                                                   bit, busy_flag);
  170                                                 mask = (*f_op->poll)(f.file, wait);
                                              }
  171                                         fdput(f);
  171                                         if ((mask & POLLIN_SET) && (in & bit)) {
    7                                                 res_in |= bit;
  162                                                 retval++;
                                                      wait->_qproc = NULL;
                                              }
  171                                         if ((mask & POLLOUT_SET) && (out & bit)) {
    5                                                 res_out |= bit;
                                                      retval++;
                                                      wait->_qproc = NULL;
                                              }
  171                                         if ((mask & POLLEX_SET) && (ex & bit)) {
    2                                                 res_ex |= bit;
                                                      retval++;
                                                      wait->_qproc = NULL;
                                              }
                                              /* got something, stop busy polling */
  171                                         if (retval) {
                                                      can_busy_loop = false;
                                                      busy_flag = 0;
      
                                              /*
                                               * only remember a returned
                                               * POLL_BUSY_LOOP if we asked for it
                                               */
  162                                         } else if (busy_flag & mask)
                                                      can_busy_loop = true;
      
                                      }
                              }
  164                         if (res_in)
    5                                 *rinp = res_in;
  164                         if (res_out)
    3                                 *routp = res_out;
  164                         if (res_ex)
    2                                 *rexp = res_ex;
  164                         cond_resched();
                      }
  192                 wait->_qproc = NULL;
  151                 if (retval || timed_out || signal_pending(current))
                              break;
  148                 if (table.error) {
                              retval = table.error;
                              break;
                      }
      
                      /* only if found POLL_BUSY_LOOP sockets && not out of time */
  148                 if (can_busy_loop && !need_resched()) {
                              if (!busy_start) {
                                      busy_start = busy_loop_current_time();
                                      continue;
                              }
                              if (!busy_loop_timeout(busy_start))
                                      continue;
                      }
                      busy_flag = 0;
      
                      /*
                       * If this is the first loop and we have a timeout
                       * given, then we convert to ktime_t and set the to
                       * pointer to the expiry value.
                       */
  148                 if (end_time && !to) {
  114                         expire = timespec64_to_ktime(*end_time);
                              to = &expire;
                      }
      
  148                 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                                 to, slack))
                              timed_out = 1;
              }
      
  120         poll_freewait(&table);
      
  133         return retval;
      }
      
      /*
       * We can actually return ERESTARTSYS instead of EINTR, but I'd
       * like to be certain this leads to no problems. So I return
       * EINTR just for safety.
       *
        * Update: ERESTARTSYS breaks at least the xview clock binary, so
        * I'm trying ERESTARTNOHAND which restarts only when you want to.
       */
      int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                                 fd_set __user *exp, struct timespec64 *end_time)
      {
              fd_set_bits fds;
              void *bits;
              int ret, max_fds;
              size_t size, alloc_size;
              struct fdtable *fdt;
              /* Allocate small arguments on the stack to save memory and be faster */
              long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
      
              ret = -EINVAL;
  202         if (n < 0)
                      goto out_nofds;
      
              /* max_fds can increase, so grab it once to avoid race */
  201         rcu_read_lock();
  201         fdt = files_fdtable(current->files);
  201         max_fds = fdt->max_fds;
  201         rcu_read_unlock();
              if (n > max_fds)
                      n = max_fds;
      
              /*
               * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
               * since we used fdset we need to allocate memory in units of
               * long-words. 
               */
              size = FDS_BYTES(n);
              bits = stack_fds;
              if (size > sizeof(stack_fds) / 6) {
                      /* Not enough space in on-stack array; must use kmalloc */
                      ret = -ENOMEM;
                      if (size > (SIZE_MAX / 6))
                              goto out_nofds;
      
                      alloc_size = 6 * size;
                      bits = kvmalloc(alloc_size, GFP_KERNEL);
                      if (!bits)
                              goto out_nofds;
              }
  201         fds.in      = bits;
              fds.out     = bits +   size;
              fds.ex      = bits + 2*size;
              fds.res_in  = bits + 3*size;
              fds.res_out = bits + 4*size;
              fds.res_ex  = bits + 5*size;
      
  201         if ((ret = get_fd_set(n, inp, fds.in)) ||
  200             (ret = get_fd_set(n, outp, fds.out)) ||
  200             (ret = get_fd_set(n, exp, fds.ex)))
                      goto out;
  200         zero_fd_set(n, fds.res_in);
              zero_fd_set(n, fds.res_out);
              zero_fd_set(n, fds.res_ex);
      
              ret = do_select(n, &fds, end_time);
      
              if (ret < 0)
                      goto out;
  117         if (!ret) {
                      ret = -ERESTARTNOHAND;
  109                 if (signal_pending(current))
                              goto out;
                      ret = 0;
              }
      
  116         if (set_fd_set(n, inp, fds.res_in) ||
  115             set_fd_set(n, outp, fds.res_out) ||
  113             set_fd_set(n, exp, fds.res_ex))
                      ret = -EFAULT;
      
      out:
  130         if (bits != stack_fds)
                      kvfree(bits);
      out_nofds:
  131         return ret;
      }
      
   15 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                      fd_set __user *, exp, struct timeval __user *, tvp)
      {
              struct timespec64 end_time, *to = NULL;
              struct timeval tv;
              int ret;
      
              if (tvp) {
   10                 if (copy_from_user(&tv, tvp, sizeof(tv)))
                              return -EFAULT;
      
                      to = &end_time;
                      if (poll_select_set_timeout(to,
                                      tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
    9                                 (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                              return -EINVAL;
              }
      
   12         ret = core_sys_select(n, inp, outp, exp, to);
              ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
      
   11         return ret;
      }
      
      static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
                             fd_set __user *exp, struct timespec __user *tsp,
                             const sigset_t __user *sigmask, size_t sigsetsize)
      {
              sigset_t ksigmask, sigsaved;
              struct timespec ts;
              struct timespec64 ts64, end_time, *to = NULL;
              int ret;
      
  192         if (tsp) {
  160                 if (copy_from_user(&ts, tsp, sizeof(ts)))
                              return -EFAULT;
  159                 ts64 = timespec_to_timespec64(ts);
      
                      to = &end_time;
                      if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
                              return -EINVAL;
              }
      
  160         if (sigmask) {
                      /* XXX: Don't preclude handling different sized sigset_t's.  */
    7                 if (sigsetsize != sizeof(sigset_t))
                              return -EINVAL;
    6                 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
                              return -EFAULT;
      
    6                 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
                      sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
              }
      
  184         ret = core_sys_select(n, inp, outp, exp, to);
              ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
      
              if (ret == -ERESTARTNOHAND) {
                      /*
                       * Don't restore the signal mask yet. Let do_signal() deliver
                       * the signal on the way back to userspace, before the signal
                       * mask is restored.
                       */
                      if (sigmask) {
    2                         memcpy(&current->saved_sigmask, &sigsaved,
                                              sizeof(sigsaved));
                              set_restore_sigmask();
                      }
              } else if (sigmask)
  126                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
      
              return ret;
      }
      
      /*
       * Most architectures can't handle 7-argument syscalls. So we provide a
       * 6-argument version where the sixth argument is a pointer to a structure
       * which has a pointer to the sigset_t itself followed by a size_t containing
       * the sigset size.
       */
  195 SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
                      fd_set __user *, exp, struct timespec __user *, tsp,
                      void __user *, sig)
      {
              size_t sigsetsize = 0;
              sigset_t __user *up = NULL;
      
              if (sig) {
   12                 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
                          || __get_user(up, (sigset_t __user * __user *)sig)
    9                     || __get_user(sigsetsize,
                                      (size_t __user *)(sig+sizeof(void *))))
                              return -EFAULT;
              }
      
  192         return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
      }
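       
       /*
        * The sixth argument decoded above is expected to have the layout
        * sketched below (field names illustrative); a libc pselect() wrapper
        * typically builds it on the caller's stack:
        *
        *      struct {
        *              const sigset_t __user *ss;      pointer to the mask
        *              size_t ss_len;                  must be sizeof(sigset_t)
        *      };
        */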
      
      #ifdef __ARCH_WANT_SYS_OLD_SELECT
      struct sel_arg_struct {
              unsigned long n;
              fd_set __user *inp, *outp, *exp;
              struct timeval __user *tvp;
      };
      
      SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
      {
              struct sel_arg_struct a;
      
              if (copy_from_user(&a, arg, sizeof(a)))
                      return -EFAULT;
              return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
      }
      #endif
      
      struct poll_list {
              struct poll_list *next;
              int len;
              struct pollfd entries[0];
      };
      
      #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
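       
       /*
        * For illustration, with 4 KiB pages, an 8-byte struct pollfd and a
        * 16-byte struct poll_list header this works out to (4096 - 16) / 8 =
        * 510 pollfds per chained page; the exact value is architecture
        * dependent.
        */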
      
      /*
       * Fish for pollable events on the pollfd->fd file descriptor. We're only
       * interested in events matching the pollfd->events mask, and the result
       * matching that mask is both recorded in pollfd->revents and returned. The
       * pwait poll_table will be used by the fd-provided poll handler for waiting,
       * if pwait->_qproc is non-NULL.
       */
      static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                           bool *can_busy_poll,
                                           unsigned int busy_flag)
      {
              unsigned int mask;
              int fd;
      
              mask = 0;
  175         fd = pollfd->fd;
              if (fd >= 0) {
  170                 struct fd f = fdget(fd);
                      mask = POLLNVAL;
                      if (f.file) {
                              mask = DEFAULT_POLLMASK;
  169                         if (f.file->f_op->poll) {
  162                                 pwait->_key = pollfd->events|POLLERR|POLLHUP;
                                      pwait->_key |= busy_flag;
                                      mask = f.file->f_op->poll(f.file, pwait);
  161                                 if (mask & busy_flag)
                                              *can_busy_poll = true;
                              }
                              /* Mask out unneeded events. */
  168                         mask &= pollfd->events | POLLERR | POLLHUP;
  166                         fdput(f);
                      }
              }
  173         pollfd->revents = mask;
      
              return mask;
      }
      
      static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                         struct timespec64 *end_time)
      {
              poll_table* pt = &wait->pt;
              ktime_t expire, *to = NULL;
              int timed_out = 0, count = 0;
              u64 slack = 0;
              unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
              unsigned long busy_start = 0;
      
              /* Optimise the no-wait case */
  208         if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
   51                 pt->_qproc = NULL;
                      timed_out = 1;
              }
      
              if (end_time && !timed_out)
   89                 slack = select_estimate_accuracy(end_time);
      
              for (;;) {
                      struct poll_list *walk;
                      bool can_busy_loop = false;
      
  194                 for (walk = list; walk != NULL; walk = walk->next) {
                              struct pollfd * pfd, * pfd_end;
      
  210                         pfd = walk->entries;
                              pfd_end = pfd + walk->len;
  173                         for (; pfd != pfd_end; pfd++) {
                                      /*
                                       * Fish for events. If we found one, record it
                                       * and kill poll_table->_qproc, so we don't
                                       * needlessly register any other waiters after
                                       * this. They'll get immediately deregistered
                                       * when we break out and return.
                                       */
  175                                 if (do_pollfd(pfd, pt, &can_busy_loop,
                                                    busy_flag)) {
   74                                         count++;
                                              pt->_qproc = NULL;
                                              /* found something, stop busy polling */
                                              busy_flag = 0;
                                              can_busy_loop = false;
                                      }
                              }
                      }
                      /*
                       * All waiters have already been registered, so don't provide
                       * a poll_table->_qproc to them on the next loop iteration.
                       */
  191                 pt->_qproc = NULL;
                      if (!count) {
  153                         count = wait->error;
                              if (signal_pending(current))
                                      count = -EINTR;
                      }
  190                 if (count || timed_out)
                              break;
      
                      /* only if found POLL_BUSY_LOOP sockets && not out of time */
  116                 if (can_busy_loop && !need_resched()) {
                              if (!busy_start) {
                                      busy_start = busy_loop_current_time();
                                      continue;
                              }
                              if (!busy_loop_timeout(busy_start))
                                      continue;
                      }
                      busy_flag = 0;
      
                      /*
                       * If this is the first loop and we have a timeout
                       * given, then we convert to ktime_t and set the to
                       * pointer to the expiry value.
                       */
  116                 if (end_time && !to) {
   67                         expire = timespec64_to_ktime(*end_time);
                              to = &expire;
                      }
      
  116                 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                              timed_out = 1;
              }
  145         return count;
      }
      
      #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
                              sizeof(struct pollfd))
      
      static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                      struct timespec64 *end_time)
      {
              struct poll_wqueues table;
               int err = -EFAULT, fdcount, len, size;
              /* Allocate small arguments on the stack to save memory and be
                 faster - use long to make sure the buffer is aligned properly
                 on 64 bit archs to avoid unaligned access */
              long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
              struct poll_list *const head = (struct poll_list *)stack_pps;
               struct poll_list *walk = head;
  212          unsigned long todo = nfds;
      
              if (nfds > rlimit(RLIMIT_NOFILE))
                      return -EINVAL;
      
  209         len = min_t(unsigned int, nfds, N_STACK_PPS);
              for (;;) {
  209                 walk->next = NULL;
                      walk->len = len;
                      if (!len)
                              break;
      
                      if (copy_from_user(walk->entries, ufds + nfds-todo,
  175                                         sizeof(struct pollfd) * walk->len))
                              goto out_fds;
      
  174                 todo -= walk->len;
                      if (!todo)
                              break;
      
   20                 len = min(todo, POLLFD_PER_PAGE);
                      size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
                      walk = walk->next = kmalloc(size, GFP_KERNEL);
                      if (!walk) {
                              err = -ENOMEM;
                              goto out_fds;
                      }
              }
      
  208         poll_initwait(&table);
  210         fdcount = do_poll(head, &table, end_time);
              poll_freewait(&table);
      
  144         for (walk = head; walk; walk = walk->next) {
  145                 struct pollfd *fds = walk->entries;
                      int j;
      
  131                 for (j = 0; j < walk->len; j++, ufds++)
  132                         if (__put_user(fds[j].revents, &ufds->revents))
                                      goto out_fds;
                }
      
              err = fdcount;
      out_fds:
  146         walk = head->next;
  149         while (walk) {
                      struct poll_list *pos = walk;
   16                 walk = walk->next;
                      kfree(pos);
              }
      
              return err;
      }
      
      static long do_restart_poll(struct restart_block *restart_block)
      {
    5         struct pollfd __user *ufds = restart_block->poll.ufds;
              int nfds = restart_block->poll.nfds;
              struct timespec64 *to = NULL, end_time;
              int ret;
      
              if (restart_block->poll.has_timeout) {
    1                 end_time.tv_sec = restart_block->poll.tv_sec;
                      end_time.tv_nsec = restart_block->poll.tv_nsec;
                      to = &end_time;
              }
      
    5         ret = do_sys_poll(ufds, nfds, to);
      
    4         if (ret == -EINTR) {
    2                 restart_block->fn = do_restart_poll;
                      ret = -ERESTART_RESTARTBLOCK;
              }
    5         return ret;
      }
      
  104 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                      int, timeout_msecs)
      {
              struct timespec64 end_time, *to = NULL;
              int ret;
      
              if (timeout_msecs >= 0) {
                      to = &end_time;
                      poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
   81                         NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
              }
      
   22         ret = do_sys_poll(ufds, nfds, to);
      
   60         if (ret == -EINTR) {
                      struct restart_block *restart_block;
      
    8                 restart_block = &current->restart_block;
                      restart_block->fn = do_restart_poll;
                      restart_block->poll.ufds = ufds;
                      restart_block->poll.nfds = nfds;
      
                      if (timeout_msecs >= 0) {
                              restart_block->poll.tv_sec = end_time.tv_sec;
                              restart_block->poll.tv_nsec = end_time.tv_nsec;
                              restart_block->poll.has_timeout = 1;
                      } else
                              restart_block->poll.has_timeout = 0;
      
                      ret = -ERESTART_RESTARTBLOCK;
              }
   68         return ret;
      }
      
  109 SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
                      struct timespec __user *, tsp, const sigset_t __user *, sigmask,
                      size_t, sigsetsize)
      {
              sigset_t ksigmask, sigsaved;
              struct timespec ts;
              struct timespec64 end_time, *to = NULL;
              int ret;
      
              if (tsp) {
   62                 if (copy_from_user(&ts, tsp, sizeof(ts)))
                              return -EFAULT;
      
                      to = &end_time;
   60                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
                              return -EINVAL;
              }
      
  105         if (sigmask) {
                      /* XXX: Don't preclude handling different sized sigset_t's.  */
   19                 if (sigsetsize != sizeof(sigset_t))
                              return -EINVAL;
   18                 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
                              return -EFAULT;
      
   18                 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
                      sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
              }
      
   86         ret = do_sys_poll(ufds, nfds, to);
      
              /* We can restart this syscall, usually */
              if (ret == -EINTR) {
                      /*
                       * Don't restore the signal mask yet. Let do_signal() deliver
                       * the signal on the way back to userspace, before the signal
                       * mask is restored.
                       */
                      if (sigmask) {
    2                         memcpy(&current->saved_sigmask, &sigsaved,
                                              sizeof(sigsaved));
                              set_restore_sigmask();
                      }
                      ret = -ERESTARTNOHAND;
              } else if (sigmask)
   10                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
      
   78         ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
      
   82         return ret;
      }
      
      #ifdef CONFIG_COMPAT
      #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
      
      static
      int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
                                            int timeval, int ret)
      {
              struct timespec ts;
      
   13         if (!p)
                      return ret;
      
    6         if (current->personality & STICKY_TIMEOUTS)
                      goto sticky;
      
              /* No update for zero timeout */
    6         if (!end_time->tv_sec && !end_time->tv_nsec)
                      return ret;
      
    3         ktime_get_ts(&ts);
    3         ts = timespec_sub(*end_time, ts);
              if (ts.tv_sec < 0)
                      ts.tv_sec = ts.tv_nsec = 0;
      
    3         if (timeval) {
                      struct compat_timeval rtv;
      
    3                 rtv.tv_sec = ts.tv_sec;
                      rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
      
                      if (!copy_to_user(p, &rtv, sizeof(rtv)))
                              return ret;
              } else {
                      struct compat_timespec rts;
      
                      rts.tv_sec = ts.tv_sec;
                      rts.tv_nsec = ts.tv_nsec;
      
                      if (!copy_to_user(p, &rts, sizeof(rts)))
    3                         return ret;
              }
              /*
               * If an application puts its timeval in read-only memory, we
               * don't want the Linux-specific update to the timeval to
               * cause a fault after the select has completed
               * successfully. However, because we're not updating the
               * timeval, we can't restart the system call.
               */
      
      sticky:
   13         if (ret == -ERESTARTNOHAND)
                      ret = -EINTR;
              return ret;
      }
      
      /*
        * Ooo, nasty.  Here we need to frob 32-bit unsigned longs into
        * 64-bit unsigned longs.
       */
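       /*
        * Roughly speaking (a sketch, relying on the generic compat_get_bitmap()
        * behaviour): on a 64-bit kernel the user's fd_set arrives as an array
        * of 32-bit compat words, and compat_get_bitmap() stitches each pair of
        * consecutive 32-bit words (lower word first) into one native 64-bit
        * word of *fdset; compat_put_bitmap() performs the reverse split on the
        * way back out.
        */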
      static
   10 int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                              unsigned long *fdset)
      {
   14         if (ufdset) {
    7                 return compat_get_bitmap(fdset, ufdset, nr);
              } else {
   10                 zero_fd_set(nr, fdset);
   10                 return 0;
              }
      }
      
      static
      int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                            unsigned long *fdset)
      {
    1         if (!ufdset)
                      return 0;
              return compat_put_bitmap(ufdset, fdset, nr);
      }
      
      
      /*
       * This is a virtual copy of sys_select from fs/select.c and probably
       * should be compared to it from time to time
       */
      
      /*
       * We can actually return ERESTARTSYS instead of EINTR, but I'd
       * like to be certain this leads to no problems. So I return
       * EINTR just for safety.
       *
       * Update: ERESTARTSYS breaks at least the xview clock binary, so
       * I'm trying ERESTARTNOHAND which restart only when you want to.
       */
      static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
              compat_ulong_t __user *outp, compat_ulong_t __user *exp,
              struct timespec *end_time)
      {
              fd_set_bits fds;
              void *bits;
              int size, max_fds, ret = -EINVAL;
              struct fdtable *fdt;
              long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
      
   17         if (n < 0)
                      goto out_nofds;
      
              /* max_fds can increase, so grab it once to avoid race */
   14         rcu_read_lock();
   14         fdt = files_fdtable(current->files);
   14         max_fds = fdt->max_fds;
   14         rcu_read_unlock();
              if (n > max_fds)
                      n = max_fds;
      
               /*
                * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
                * and since we use fdsets we need to allocate memory in units
                * of long-words.
                */
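               /*
                * Rough sizing example (assuming the usual 256-byte
                * SELECT_STACK_ALLOC on a 64-bit kernel): FDS_BYTES(n) rounds n
                * bits up to whole longs, so n == 320 needs 40 bytes per bitmap
                * and 6 * 40 == 240 bytes still fits in stack_fds, while larger
                * n falls through to the kmalloc() below.
                */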
              size = FDS_BYTES(n);
              bits = stack_fds;
              if (size > sizeof(stack_fds) / 6) {
                      bits = kmalloc(6 * size, GFP_KERNEL);
                      ret = -ENOMEM;
                      if (!bits)
                              goto out_nofds;
              }
   14         fds.in      = (unsigned long *)  bits;
              fds.out     = (unsigned long *) (bits +   size);
              fds.ex      = (unsigned long *) (bits + 2*size);
              fds.res_in  = (unsigned long *) (bits + 3*size);
              fds.res_out = (unsigned long *) (bits + 4*size);
              fds.res_ex  = (unsigned long *) (bits + 5*size);
      
              if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
   11             (ret = compat_get_fd_set(n, outp, fds.out)) ||
   10             (ret = compat_get_fd_set(n, exp, fds.ex)))
                      goto out;
   10         zero_fd_set(n, fds.res_in);
              zero_fd_set(n, fds.res_out);
              zero_fd_set(n, fds.res_ex);
      
              ret = do_select(n, &fds, end_time);
      
              if (ret < 0)
                      goto out;
    3         if (!ret) {
                      ret = -ERESTARTNOHAND;
    2                 if (signal_pending(current))
                              goto out;
                      ret = 0;
              }
      
    1         if (compat_set_fd_set(n, inp, fds.res_in) ||
    1             compat_set_fd_set(n, outp, fds.res_out) ||
    1             compat_set_fd_set(n, exp, fds.res_ex))
                      ret = -EFAULT;
      out:
    7         if (bits != stack_fds)
                      kfree(bits);
      out_nofds:
   10         return ret;
      }
      
   13 COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
              compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
              struct compat_timeval __user *, tvp)
      {
              struct timespec end_time, *to = NULL;
              struct compat_timeval tv;
              int ret;
      
              if (tvp) {
    4                 if (copy_from_user(&tv, tvp, sizeof(tv)))
                              return -EFAULT;
      
                      to = &end_time;
                      if (poll_select_set_timeout(to,
                                      tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
    4                                 (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                              return -EINVAL;
              }
      
   12         ret = compat_core_sys_select(n, inp, outp, exp, to);
              ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
      
    6         return ret;
      }
      
      struct compat_sel_arg_struct {
              compat_ulong_t n;
              compat_uptr_t inp;
              compat_uptr_t outp;
              compat_uptr_t exp;
              compat_uptr_t tvp;
      };
      
    7 COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
      {
              struct compat_sel_arg_struct a;
      
              if (copy_from_user(&a, arg, sizeof(a)))
                      return -EFAULT;
    6         return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
    5                                  compat_ptr(a.exp), compat_ptr(a.tvp));
      }
      
      static long do_compat_pselect(int n, compat_ulong_t __user *inp,
              compat_ulong_t __user *outp, compat_ulong_t __user *exp,
              struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
              compat_size_t sigsetsize)
      {
              compat_sigset_t ss32;
              sigset_t ksigmask, sigsaved;
              struct compat_timespec ts;
              struct timespec end_time, *to = NULL;
              int ret;
      
    4         if (tsp) {
    4                 if (copy_from_user(&ts, tsp, sizeof(ts)))
                              return -EFAULT;
      
                      to = &end_time;
                      if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
                              return -EINVAL;
              }
      
    3         if (sigmask) {
    1                 if (sigsetsize != sizeof(compat_sigset_t))
                              return -EINVAL;
                      if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
                              return -EFAULT;
                      sigset_from_compat(&ksigmask, &ss32);
      
                      sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
                      sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
              }
      
    5         ret = compat_core_sys_select(n, inp, outp, exp, to);
              ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
      
              if (ret == -ERESTARTNOHAND) {
                      /*
                       * Don't restore the signal mask yet. Let do_signal() deliver
                       * the signal on the way back to userspace, before the signal
                       * mask is restored.
                       */
                      if (sigmask) {
                              memcpy(&current->saved_sigmask, &sigsaved,
                                              sizeof(sigsaved));
                              set_restore_sigmask();
                      }
              } else if (sigmask)
    9                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
      
              return ret;
      }
      
   11 COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
              compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
              struct compat_timespec __user *, tsp, void __user *, sig)
      {
              compat_size_t sigsetsize = 0;
              compat_uptr_t up = 0;
      
              if (sig) {
    7                 if (!access_ok(VERIFY_READ, sig,
                                      sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
                                  __get_user(up, (compat_uptr_t __user *)sig) ||
    6                             __get_user(sigsetsize,
                                      (compat_size_t __user *)(sig+sizeof(up))))
                              return -EFAULT;
              }
   10         return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
                                       sigsetsize);
      }
      
    7 COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
              unsigned int,  nfds, struct compat_timespec __user *, tsp,
              const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
      {
              compat_sigset_t ss32;
              sigset_t ksigmask, sigsaved;
              struct compat_timespec ts;
              struct timespec end_time, *to = NULL;
              int ret;
      
              if (tsp) {
    4                 if (copy_from_user(&ts, tsp, sizeof(ts)))
                              return -EFAULT;
      
                      to = &end_time;
    4                 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
                              return -EINVAL;
              }
      
    5         if (sigmask) {
                      if (sigsetsize != sizeof(compat_sigset_t))
                              return -EINVAL;
                      if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
                              return -EFAULT;
                      sigset_from_compat(&ksigmask, &ss32);
      
                      sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
                      sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
              }
      
    5         ret = do_sys_poll(ufds, nfds, to);
      
              /* We can restart this syscall, usually */
              if (ret == -EINTR) {
                      /*
                       * Don't restore the signal mask yet. Let do_signal() deliver
                       * the signal on the way back to userspace, before the signal
                       * mask is restored.
                       */
                      if (sigmask) {
                              memcpy(&current->saved_sigmask, &sigsaved,
                                      sizeof(sigsaved));
                              set_restore_sigmask();
                      }
                      ret = -ERESTARTNOHAND;
              } else if (sigmask)
                      sigprocmask(SIG_SETMASK, &sigsaved, NULL);
      
    3         ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
      
    5         return ret;
      }
      #endif
      /*
       * INET                An implementation of the TCP/IP protocol suite for the LINUX
       *                operating system.  INET is implemented using the  BSD Socket
       *                interface as the means of communication with the user level.
       *
       *                Checksumming functions for IPv6
       *
       * Authors:        Jorge Cwik, <jorge@laser.satlink.net>
       *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
       *                Borrows very liberally from tcp.c and ip.c, see those
       *                files for more names.
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       */
      
      /*
       *        Fixes:
       *
       *        Ralf Baechle                        :        generic ipv6 checksum
       *        <ralf@waldorf-gmbh.de>
       */
      
      #ifndef _CHECKSUM_IPV6_H
      #define _CHECKSUM_IPV6_H
      
      #include <asm/types.h>
      #include <asm/byteorder.h>
      #include <net/ip.h>
      #include <asm/checksum.h>
      #include <linux/in6.h>
      #include <linux/tcp.h>
      #include <linux/ipv6.h>
      
      #ifndef _HAVE_ARCH_IPV6_CSUM
      __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                              const struct in6_addr *daddr,
                              __u32 len, __u8 proto, __wsum csum);
      #endif
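       /*
        * csum_ipv6_magic() folds the IPv6 pseudo-header (source address,
        * destination address, upper-layer length and next-header value, cf.
        * RFC 8200 section 8.1) together with a partial checksum @csum into a
        * final 16-bit checksum; the inline helpers below build on it for the
        * TCP and UDP cases.
        */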
      
      static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
      {
  117         return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                                  &ipv6_hdr(skb)->daddr,
                                                  skb->len, proto, 0));
      }
      
      static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto)
      {
              const struct ipv6hdr *iph = skb_gro_network_header(skb);
      
              return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
                                                  skb_gro_len(skb), proto, 0));
      }
      
      static __inline__ __sum16 tcp_v6_check(int len,
                                         const struct in6_addr *saddr,
                                         const struct in6_addr *daddr,
                                         __wsum base)
      {
              return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
      }
      
      static inline void __tcp_v6_send_check(struct sk_buff *skb,
                                             const struct in6_addr *saddr,
                                             const struct in6_addr *daddr)
      {
              struct tcphdr *th = tcp_hdr(skb);
      
              if (skb->ip_summed == CHECKSUM_PARTIAL) {
  262                 th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
                      skb->csum_start = skb_transport_header(skb) - skb->head;
                      skb->csum_offset = offsetof(struct tcphdr, check);
              } else {
    4                 th->check = tcp_v6_check(skb->len, saddr, daddr,
                                               csum_partial(th, th->doff << 2,
                                                            skb->csum));
              }
      }
      
      #if IS_ENABLED(CONFIG_IPV6)
      static inline void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
      {
  199         struct ipv6_pinfo *np = inet6_sk(sk);
      
  199         __tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr);
      }
      #endif
      
      static inline __sum16 udp_v6_check(int len,
                                         const struct in6_addr *saddr,
                                         const struct in6_addr *daddr,
                                         __wsum base)
      {
              return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
      }
      
      void udp6_set_csum(bool nocheck, struct sk_buff *skb,
                         const struct in6_addr *saddr,
                         const struct in6_addr *daddr, int len);
      
      int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
      #endif
      /* tnum: tracked (or tristate) numbers
       *
       * A tnum tracks knowledge about the bits of a value.  Each bit can be either
       * known (0 or 1), or unknown (x).  Arithmetic operations on tnums will
       * propagate the unknown bits such that the tnum result represents all the
       * possible results for possible values of the operands.
       */
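       /*
        * Example: a value known to be 8-byte aligned but otherwise arbitrary is
        * TNUM(0, ~7ULL) - the low three bits are known 0, every other bit is
        * unknown.  Adding two such tnums with tnum_add() still yields known-zero
        * low bits, which is the property the BPF verifier relies on when it
        * tracks the alignment of computed pointers.
        */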
      #include <linux/kernel.h>
      #include <linux/tnum.h>
      
      #define TNUM(_v, _m)        (struct tnum){.value = _v, .mask = _m}
      /* A completely unknown value */
      const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
      
      struct tnum tnum_const(u64 value)
      {
  586         return TNUM(value, 0);
      }
      
      struct tnum tnum_range(u64 min, u64 max)
      {
  388         u64 chi = min ^ max, delta;
              u8 bits = fls64(chi);
      
              /* special case, needed because 1ULL << 64 is undefined */
              if (bits > 63)
  251                 return tnum_unknown;
              /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
               * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
               *  constant min (since min == max).
               */
  352         delta = (1ULL << bits) - 1;
  388         return TNUM(min & ~delta, delta);
      }
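       /*
        * Worked example: tnum_range(4, 7) computes chi = 4 ^ 7 = 3, bits = 2,
        * delta = (1 << 2) - 1 = 3 and returns TNUM(4, 3): bit 2 is known 1,
        * bits 0-1 are unknown, which is exactly the set {4, 5, 6, 7}.
        */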
      
      struct tnum tnum_lshift(struct tnum a, u8 shift)
      {
   40         return TNUM(a.value << shift, a.mask << shift);
      }
      
      struct tnum tnum_rshift(struct tnum a, u8 shift)
      {
    1         return TNUM(a.value >> shift, a.mask >> shift);
      }
      
  288 struct tnum tnum_add(struct tnum a, struct tnum b)
      {
              u64 sm, sv, sigma, chi, mu;
      
   71         sm = a.mask + b.mask;
              sv = a.value + b.value;
              sigma = sm + sv;
              chi = sigma ^ sv;
              mu = chi | a.mask | b.mask;
              return TNUM(sv & ~mu, mu);
      }
      
  133 struct tnum tnum_sub(struct tnum a, struct tnum b)
      {
              u64 dv, alpha, beta, chi, mu;
      
              dv = a.value - b.value;
              alpha = dv + a.mask;
              beta = dv - b.mask;
              chi = alpha ^ beta;
              mu = chi | a.mask | b.mask;
              return TNUM(dv & ~mu, mu);
      }
      
   24 struct tnum tnum_and(struct tnum a, struct tnum b)
      {
              u64 alpha, beta, v;
      
              alpha = a.value | a.mask;
              beta = b.value | b.mask;
              v = a.value & b.value;
              return TNUM(v, alpha & beta & ~v);
      }
      
      struct tnum tnum_or(struct tnum a, struct tnum b)
      {
              u64 v, mu;
      
    4         v = a.value | b.value;
              mu = a.mask | b.mask;
              return TNUM(v, mu & ~v);
      }
      
      struct tnum tnum_xor(struct tnum a, struct tnum b)
      {
              u64 v, mu;
      
              v = a.value ^ b.value;
              mu = a.mask | b.mask;
              return TNUM(v & ~mu, mu);
      }
      
      /* half-multiply add: acc += (unknown * mask * value).
       * An intermediate step in the multiply algorithm.
       */
      static struct tnum hma(struct tnum acc, u64 value, u64 mask)
      {
   75         while (mask) {
   71                 if (mask & 1)
   71                         acc = tnum_add(acc, TNUM(0, value));
   71                 mask >>= 1;
                      value <<= 1;
              }
   75         return acc;
      }
      
   75 struct tnum tnum_mul(struct tnum a, struct tnum b)
      {
              struct tnum acc;
              u64 pi;
      
              pi = a.value * b.value;
   70         acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value);
   75         return hma(acc, b.mask, a.value);
      }
      
      /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
       * a 'known 0' - this will return a 'known 1' for that bit.
       */
      struct tnum tnum_intersect(struct tnum a, struct tnum b)
      {
              u64 v, mu;
      
  388         v = a.value | b.value;
              mu = a.mask & b.mask;
              return TNUM(v & ~mu, mu);
      }
      
  319 struct tnum tnum_cast(struct tnum a, u8 size)
      {
              a.value &= (1ULL << (size * 8)) - 1;
              a.mask &= (1ULL << (size * 8)) - 1;
              return a;
      }
      
      bool tnum_is_aligned(struct tnum a, u64 size)
      {
  179         if (!size)
                      return true;
  179         return !((a.value | a.mask) & (size - 1));
      }
      
   74 bool tnum_in(struct tnum a, struct tnum b)
      {
   74         if (b.mask & ~a.mask)
                      return false;
              b.value &= ~a.mask;
   74         return a.value == b.value;
      }
      
      int tnum_strn(char *str, size_t size, struct tnum a)
      {
   69         return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
      }
      EXPORT_SYMBOL_GPL(tnum_strn);
      
       /*
        * Render @a as a 64-char binary string, most significant bit first, with
        * 'x' marking unknown bits; output is truncated to fit @size and the
        * nominal length 64 is returned, snprintf()-style.
        */
       int tnum_sbin(char *str, size_t size, struct tnum a)
      {
              size_t n;
      
              for (n = 64; n; n--) {
                      if (n < size) {
                              if (a.mask & 1)
                                      str[n - 1] = 'x';
                              else if (a.value & 1)
                                      str[n - 1] = '1';
                              else
                                      str[n - 1] = '0';
                      }
                      a.mask >>= 1;
                      a.value >>= 1;
              }
              str[min(size - 1, (size_t)64)] = 0;
              return 64;
      }
      /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
       * Copyright (c) 2016,2017 Facebook
       *
       * This program is free software; you can redistribute it and/or
       * modify it under the terms of version 2 of the GNU General Public
       * License as published by the Free Software Foundation.
       *
       * This program is distributed in the hope that it will be useful, but
       * WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
       * General Public License for more details.
       */
      #include <linux/bpf.h>
      #include <linux/err.h>
      #include <linux/slab.h>
      #include <linux/mm.h>
      #include <linux/filter.h>
      #include <linux/perf_event.h>
      
      #include "map_in_map.h"
      
      #define ARRAY_CREATE_FLAG_MASK \
              (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
      
      static void bpf_array_free_percpu(struct bpf_array *array)
      {
              int i;
      
              for (i = 0; i < array->map.max_entries; i++) {
                      free_percpu(array->pptrs[i]);
                      cond_resched();
              }
      }
      
      static int bpf_array_alloc_percpu(struct bpf_array *array)
      {
              void __percpu *ptr;
              int i;
      
   17         for (i = 0; i < array->map.max_entries; i++) {
   17                 ptr = __alloc_percpu_gfp(array->elem_size, 8,
                                               GFP_USER | __GFP_NOWARN);
                      if (!ptr) {
                              bpf_array_free_percpu(array);
                              return -ENOMEM;
                      }
   17                 array->pptrs[i] = ptr;
                      cond_resched();
              }
      
              return 0;
      }
      
      /* Called from syscall */
      static struct bpf_map *array_map_alloc(union bpf_attr *attr)
      {
   43         bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
    3         int ret, numa_node = bpf_map_attr_numa_node(attr);
              u32 elem_size, index_mask, max_entries;
   43         bool unpriv = !capable(CAP_SYS_ADMIN);
              u64 cost, array_size, mask64;
              struct bpf_array *array;
      
              /* check sanity of attributes */
   43         if (attr->max_entries == 0 || attr->key_size != 4 ||
   41             attr->value_size == 0 ||
   40             attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
   21             (percpu && numa_node != NUMA_NO_NODE))
                      return ERR_PTR(-EINVAL);
      
   38         if (attr->value_size > KMALLOC_MAX_SIZE)
                      /* if value_size is bigger, the user space won't be able to
                       * access the elements.
                       */
                      return ERR_PTR(-E2BIG);
      
              elem_size = round_up(attr->value_size, 8);
      
              max_entries = attr->max_entries;
      
               /* On 32 bit archs roundup_pow_of_two() with a max_entries that
                * has the uppermost bit set in u32 space is undefined behavior
                * due to the resulting 1U << 32, so do it manually here in u64
                * space.
                */
              mask64 = fls_long(max_entries - 1);
              mask64 = 1ULL << mask64;
              mask64 -= 1;
      
              index_mask = mask64;
              if (unpriv) {
                      /* round up array size to nearest power of 2,
                       * since cpu will speculate within index_mask limits
                       */
    3                 max_entries = index_mask + 1;
                      /* Check for overflows. */
                      if (max_entries < attr->max_entries)
                              return ERR_PTR(-E2BIG);
              }
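               /*
                * Example for the index_mask computation above (hand-computed):
                * for attr->max_entries == 5, fls_long(4) == 3, so mask64 == 7
                * and index_mask == 7; an unprivileged creator then gets
                * max_entries rounded up to 8, so speculative accesses masked
                * with index_mask stay inside the allocation.
                */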
      
              array_size = sizeof(*array);
   36         if (percpu)
   19                 array_size += (u64) max_entries * sizeof(void *);
              else
   17                 array_size += (u64) max_entries * elem_size;
      
              /* make sure there is no u32 overflow later in round_up() */
              cost = array_size;
              if (cost >= U32_MAX - PAGE_SIZE)
                      return ERR_PTR(-ENOMEM);
              if (percpu) {
   18                 cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
                      if (cost >= U32_MAX - PAGE_SIZE)
                              return ERR_PTR(-ENOMEM);
              }
   34         cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
      
              ret = bpf_map_precharge_memlock(cost);
              if (ret < 0)
    1                 return ERR_PTR(ret);
      
              /* allocate all map elements and zero-initialize them */
   33         array = bpf_map_area_alloc(array_size, numa_node);
              if (!array)
                      return ERR_PTR(-ENOMEM);
   33         array->index_mask = index_mask;
              array->map.unpriv_array = unpriv;
      
              /* copy mandatory map attributes */
              array->map.map_type = attr->map_type;
              array->map.key_size = attr->key_size;
              array->map.value_size = attr->value_size;
              array->map.max_entries = attr->max_entries;
              array->map.map_flags = attr->map_flags;
              array->map.numa_node = numa_node;
              array->map.pages = cost;
              array->elem_size = elem_size;
      
   17         if (percpu && bpf_array_alloc_percpu(array)) {
                      bpf_map_area_free(array);
                      return ERR_PTR(-ENOMEM);
              }
      
   23         return &array->map;
      }
      
      /* Called from syscall or from eBPF program */
      static void *array_map_lookup_elem(struct bpf_map *map, void *key)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    7         u32 index = *(u32 *)key;
      
              if (unlikely(index >= array->map.max_entries))
                      return NULL;
      
    5         return array->value + array->elem_size * (index & array->index_mask);
    1 }
      
      /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
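       /*
        * In rough C terms (a sketch of what the emitted instructions compute):
        *
        *      index = *(u32 *)key;
        *      if (index >= map->max_entries)
        *              return NULL;
        *      return array->value + elem_size * (index & array->index_mask);
        *
        * where the index_mask AND is only emitted for unprivileged maps, to
        * bound speculative loads to the allocated array.
        */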
      static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              struct bpf_insn *insn = insn_buf;
              u32 elem_size = round_up(map->value_size, 8);
              const int ret = BPF_REG_0;
              const int map_ptr = BPF_REG_1;
              const int index = BPF_REG_2;
      
              *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
              *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
              if (map->unpriv_array) {
                      *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
                      *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
              } else {
                      *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
              }
      
              if (is_power_of_2(elem_size)) {
                      *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
              } else {
                      *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
              }
              *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
              *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
              *insn++ = BPF_MOV64_IMM(ret, 0);
              return insn - insn_buf;
      }
      
      /* Called from eBPF program */
      static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              u32 index = *(u32 *)key;
      
              if (unlikely(index >= array->map.max_entries))
                      return NULL;
      
              return this_cpu_ptr(array->pptrs[index & array->index_mask]);
      }
      
      int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    7         u32 index = *(u32 *)key;
              void __percpu *pptr;
              int cpu, off = 0;
              u32 size;
      
              if (unlikely(index >= array->map.max_entries))
                      return -ENOENT;
      
              /* per_cpu areas are zero-filled and bpf programs can only
               * access 'value_size' of them, so copying rounded areas
               * will not leak any kernel data
               */
    6         size = round_up(map->value_size, 8);
    6         rcu_read_lock();
    6         pptr = array->pptrs[index & array->index_mask];
    6         for_each_possible_cpu(cpu) {
    6                 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
    6                 off += size;
              }
    6         rcu_read_unlock();
    7         return 0;
      }
      
      /* Called from syscall */
      static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    6         u32 index = key ? *(u32 *)key : U32_MAX;
              u32 *next = (u32 *)next_key;
      
              if (index >= array->map.max_entries) {
    3                 *next = 0;
    3                 return 0;
              }
      
    3         if (index == array->map.max_entries - 1)
                      return -ENOENT;
      
    2         *next = index + 1;
    3         return 0;
      }
      
      /* Called from syscall or from eBPF program */
      static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
                                       u64 map_flags)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    4         u32 index = *(u32 *)key;
      
              if (unlikely(map_flags > BPF_EXIST))
                      /* unknown flags */
                      return -EINVAL;
      
    3         if (unlikely(index >= array->map.max_entries))
                      /* all elements were pre-allocated, cannot insert a new one */
                      return -E2BIG;
      
    2         if (unlikely(map_flags == BPF_NOEXIST))
                      /* all elements already exist */
                      return -EEXIST;
      
    1         if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
                      memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
                             value, map->value_size);
              else
    4                 memcpy(array->value +
                             array->elem_size * (index & array->index_mask),
                             value, map->value_size);
              return 0;
      }
      
      int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
                                  u64 map_flags)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    8         u32 index = *(u32 *)key;
              void __percpu *pptr;
              int cpu, off = 0;
              u32 size;
      
              if (unlikely(map_flags > BPF_EXIST))
                      /* unknown flags */
                      return -EINVAL;
      
    7         if (unlikely(index >= array->map.max_entries))
                      /* all elements were pre-allocated, cannot insert a new one */
                      return -E2BIG;
      
    6         if (unlikely(map_flags == BPF_NOEXIST))
                      /* all elements already exist */
                      return -EEXIST;
      
               /* the user space will provide round_up(value_size, 8) bytes that
                * will be copied into the per-cpu area. bpf programs can only
                * access value_size of it. During lookup the same extra bytes
                * will be returned, or zeros which were zero-filled by
                * percpu_alloc, so no kernel data leaks are possible.
                */
    5         size = round_up(map->value_size, 8);
    5         rcu_read_lock();
    5         pptr = array->pptrs[index & array->index_mask];
    5         for_each_possible_cpu(cpu) {
    5                 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
    5                 off += size;
              }
    5         rcu_read_unlock();
    8         return 0;
      }
      
      /* Called from syscall or from eBPF program */
      static int array_map_delete_elem(struct bpf_map *map, void *key)
      {
    1         return -EINVAL;
      }
      
      /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
      static void array_map_free(struct bpf_map *map)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
      
               /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
                * so the programs (there can be more than one that used this map)
                * were disconnected from events. Wait for outstanding programs to
                * complete, then free the array.
                */
              synchronize_rcu();
      
              if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
                      bpf_array_free_percpu(array);
      
              bpf_map_area_free(array);
      }
      
      const struct bpf_map_ops array_map_ops = {
              .map_alloc = array_map_alloc,
              .map_free = array_map_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = array_map_lookup_elem,
              .map_update_elem = array_map_update_elem,
              .map_delete_elem = array_map_delete_elem,
              .map_gen_lookup = array_map_gen_lookup,
      };
      
      const struct bpf_map_ops percpu_array_map_ops = {
              .map_alloc = array_map_alloc,
              .map_free = array_map_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = percpu_array_map_lookup_elem,
              .map_update_elem = array_map_update_elem,
              .map_delete_elem = array_map_delete_elem,
      };
      
      static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
      {
              /* only file descriptors can be stored in this type of map */
   21         if (attr->value_size != sizeof(u32))
                      return ERR_PTR(-EINVAL);
   18         return array_map_alloc(attr);
      }
      
      static void fd_array_map_free(struct bpf_map *map)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              int i;
      
              synchronize_rcu();
      
              /* make sure it's empty */
              for (i = 0; i < array->map.max_entries; i++)
                      BUG_ON(array->ptrs[i] != NULL);
      
              bpf_map_area_free(array);
      }
      
      static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
      {
              return NULL;
      }
      
      /* only called from syscall */
      int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
      {
              void **elem, *ptr;
              int ret =  0;
      
    5         if (!map->ops->map_fd_sys_lookup_elem)
                      return -ENOTSUPP;
      
    4         rcu_read_lock();
    4         elem = array_map_lookup_elem(map, key);
    3         if (elem && (ptr = READ_ONCE(*elem)))
    2                 *value = map->ops->map_fd_sys_lookup_elem(ptr);
              else
                      ret = -ENOENT;
    4         rcu_read_unlock();
      
    5         return ret;
      }
      
      /* only called from syscall */
      int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
                                       void *key, void *value, u64 map_flags)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              void *new_ptr, *old_ptr;
   24         u32 index = *(u32 *)key, ufd;
      
              if (map_flags != BPF_ANY)
                      return -EINVAL;
      
   23         if (index >= array->map.max_entries)
                      return -E2BIG;
      
   22         ufd = *(u32 *)value;
              new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
              if (IS_ERR(new_ptr))
   12                 return PTR_ERR(new_ptr);
      
   10         old_ptr = xchg(array->ptrs + index, new_ptr);
              if (old_ptr)
   24                 map->ops->map_fd_put_ptr(old_ptr);
      
              return 0;
      }
      
      static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              void *old_ptr;
    6         u32 index = *(u32 *)key;
      
              if (index >= array->map.max_entries)
                      return -E2BIG;
      
    5         old_ptr = xchg(array->ptrs + index, NULL);
              if (old_ptr) {
    3                 map->ops->map_fd_put_ptr(old_ptr);
    6                 return 0;
              } else {
                      return -ENOENT;
              }
      }
      
      static void *prog_fd_array_get_ptr(struct bpf_map *map,
                                         struct file *map_file, int fd)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
    1         struct bpf_prog *prog = bpf_prog_get(fd);
      
              if (IS_ERR(prog))
                      return prog;
      
    1         if (!bpf_prog_array_compatible(array, prog)) {
                      bpf_prog_put(prog);
                      return ERR_PTR(-EINVAL);
              }
      
              return prog;
      }
      
      static void prog_fd_array_put_ptr(void *ptr)
      {
              bpf_prog_put(ptr);
      }
      
      static u32 prog_fd_array_sys_lookup_elem(void *ptr)
      {
              return ((struct bpf_prog *)ptr)->aux->id;
      }
      
      /* decrement refcnt of all bpf_progs that are stored in this map */
      static void bpf_fd_array_map_clear(struct bpf_map *map)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              int i;
      
    1         for (i = 0; i < array->map.max_entries; i++)
    1                 fd_array_map_delete_elem(map, &i);
      }
      
      const struct bpf_map_ops prog_array_map_ops = {
              .map_alloc = fd_array_map_alloc,
              .map_free = fd_array_map_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = fd_array_map_lookup_elem,
              .map_delete_elem = fd_array_map_delete_elem,
              .map_fd_get_ptr = prog_fd_array_get_ptr,
              .map_fd_put_ptr = prog_fd_array_put_ptr,
              .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
              .map_release_uref = bpf_fd_array_map_clear,
      };
      
      static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
                                                         struct file *map_file)
      {
              struct bpf_event_entry *ee;
      
    5         ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
              if (ee) {
    5                 ee->event = perf_file->private_data;
                      ee->perf_file = perf_file;
                      ee->map_file = map_file;
              }
      
              return ee;
      }
      
      static void __bpf_event_entry_free(struct rcu_head *rcu)
      {
              struct bpf_event_entry *ee;
      
              ee = container_of(rcu, struct bpf_event_entry, rcu);
              fput(ee->perf_file);
              kfree(ee);
      }
      
      static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
      {
    3         call_rcu(&ee->rcu, __bpf_event_entry_free);
      }
      
      static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
                                               struct file *map_file, int fd)
      {
              struct bpf_event_entry *ee;
              struct perf_event *event;
              struct file *perf_file;
              u64 value;
      
    9         perf_file = perf_event_get(fd);
              if (IS_ERR(perf_file))
                      return perf_file;
      
              ee = ERR_PTR(-EOPNOTSUPP);
    6         event = perf_file->private_data;
              if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
                      goto err_out;
      
    5         ee = bpf_event_entry_gen(perf_file, map_file);
    9         if (ee)
                      return ee;
              ee = ERR_PTR(-ENOMEM);
      err_out:
    1         fput(perf_file);
              return ee;
      }
      
      static void perf_event_fd_array_put_ptr(void *ptr)
      {
    3         bpf_event_entry_free_rcu(ptr);
      }
      
      static void perf_event_fd_array_release(struct bpf_map *map,
                                              struct file *map_file)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              struct bpf_event_entry *ee;
              int i;
      
    6         rcu_read_lock();
    6         for (i = 0; i < array->map.max_entries; i++) {
    6                 ee = READ_ONCE(array->ptrs[i]);
                      if (ee && ee->map_file == map_file)
    2                         fd_array_map_delete_elem(map, &i);
              }
    6         rcu_read_unlock();
      }
      
      const struct bpf_map_ops perf_event_array_map_ops = {
              .map_alloc = fd_array_map_alloc,
              .map_free = fd_array_map_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = fd_array_map_lookup_elem,
              .map_delete_elem = fd_array_map_delete_elem,
              .map_fd_get_ptr = perf_event_fd_array_get_ptr,
              .map_fd_put_ptr = perf_event_fd_array_put_ptr,
              .map_release = perf_event_fd_array_release,
      };
      
      #ifdef CONFIG_CGROUPS
      static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
                                           struct file *map_file /* not used */,
                                           int fd)
      {
    1         return cgroup_get_from_fd(fd);
      }
      
      static void cgroup_fd_array_put_ptr(void *ptr)
      {
               /* cgroup_put() frees the cgrp after an RCU grace period */
              cgroup_put(ptr);
      }
      
      static void cgroup_fd_array_free(struct bpf_map *map)
      {
              bpf_fd_array_map_clear(map);
              fd_array_map_free(map);
      }
      
      const struct bpf_map_ops cgroup_array_map_ops = {
              .map_alloc = fd_array_map_alloc,
              .map_free = cgroup_fd_array_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = fd_array_map_lookup_elem,
              .map_delete_elem = fd_array_map_delete_elem,
              .map_fd_get_ptr = cgroup_fd_array_get_ptr,
              .map_fd_put_ptr = cgroup_fd_array_put_ptr,
      };
      #endif
      
      static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
      {
              struct bpf_map *map, *inner_map_meta;
      
    8         inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
              if (IS_ERR(inner_map_meta))
    1                 return inner_map_meta;
      
    7         map = fd_array_map_alloc(attr);
              if (IS_ERR(map)) {
    3                 bpf_map_meta_free(inner_map_meta);
    3                 return map;
              }
      
    4         map->inner_map_meta = inner_map_meta;
      
    4         return map;
      }
      
      static void array_of_map_free(struct bpf_map *map)
      {
              /* map->inner_map_meta is only accessed by syscall which
               * is protected by fdget/fdput.
               */
              bpf_map_meta_free(map->inner_map_meta);
              bpf_fd_array_map_clear(map);
              fd_array_map_free(map);
      }
      
      static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
      {
              struct bpf_map **inner_map = array_map_lookup_elem(map, key);
      
              if (!inner_map)
                      return NULL;
      
              return READ_ONCE(*inner_map);
      }
      
       /* emit BPF instructions equivalent to array_of_map_lookup_elem(); the
        * extra BPF_LDX_MEM(BPF_DW, ...) dereferences the array slot so that the
        * inner map pointer (or NULL) is returned rather than the slot address.
        */
       static u32 array_of_map_gen_lookup(struct bpf_map *map,
                                         struct bpf_insn *insn_buf)
      {
              struct bpf_array *array = container_of(map, struct bpf_array, map);
              u32 elem_size = round_up(map->value_size, 8);
              struct bpf_insn *insn = insn_buf;
              const int ret = BPF_REG_0;
              const int map_ptr = BPF_REG_1;
              const int index = BPF_REG_2;
      
              *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
              *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
              if (map->unpriv_array) {
                      *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
                      *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
              } else {
                      *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
              }
              if (is_power_of_2(elem_size))
                      *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
              else
                      *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
              *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
              *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
              *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
              *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
              *insn++ = BPF_MOV64_IMM(ret, 0);
      
              return insn - insn_buf;
      }
      
      const struct bpf_map_ops array_of_maps_map_ops = {
              .map_alloc = array_of_map_alloc,
              .map_free = array_of_map_free,
              .map_get_next_key = array_map_get_next_key,
              .map_lookup_elem = array_of_map_lookup_elem,
              .map_delete_elem = fd_array_map_delete_elem,
              .map_fd_get_ptr = bpf_map_fd_get_ptr,
              .map_fd_put_ptr = bpf_map_fd_put_ptr,
              .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
              .map_gen_lookup = array_of_map_gen_lookup,
      };
      /*
       * blk-mq scheduling framework
       *
       * Copyright (C) 2016 Jens Axboe
       */
      #include <linux/kernel.h>
      #include <linux/module.h>
      #include <linux/blk-mq.h>
      
      #include <trace/events/block.h>
      
      #include "blk.h"
      #include "blk-mq.h"
      #include "blk-mq-debugfs.h"
      #include "blk-mq-sched.h"
      #include "blk-mq-tag.h"
      #include "blk-wbt.h"
      
      void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                       void (*exit)(struct blk_mq_hw_ctx *))
      {
              struct blk_mq_hw_ctx *hctx;
              int i;
      
              queue_for_each_hw_ctx(q, hctx, i) {
                      if (exit && hctx->sched_data)
                              exit(hctx);
                      kfree(hctx->sched_data);
                      hctx->sched_data = NULL;
              }
      }
      EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
      
       /*
        * Look up (or create) the io_cq that links the request's io_context to
        * this queue and attach it to @rq, taking an io_context reference for
        * the lifetime of the request.
        */
       void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
      {
              struct request_queue *q = rq->q;
              struct io_context *ioc = rq_ioc(bio);
              struct io_cq *icq;
      
              spin_lock_irq(q->queue_lock);
              icq = ioc_lookup_icq(ioc, q);
              spin_unlock_irq(q->queue_lock);
      
              if (!icq) {
                      icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
                      if (!icq)
                              return;
              }
              get_io_context(icq->ioc);
              rq->elv.icq = icq;
      }
      
      /*
       * Mark a hardware queue as needing a restart. For shared queues, maintain
       * a count of how many hardware queues are marked for restart.
       */
      static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
      {
    2         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                      return;
      
    2         if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                      struct request_queue *q = hctx->queue;
      
                      if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                              atomic_inc(&q->shared_hctx_restart);
              } else
    2                 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
      }
      
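       /*
        * Clear the SCHED_RESTART flag for @hctx and, if the queue still has
        * pending requests, re-run it. Returns true if the hardware queue was
        * re-run.
        */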
      static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
      {
   43         if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                      return false;
      
              if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                      struct request_queue *q = hctx->queue;
      
                      if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                              atomic_dec(&q->shared_hctx_restart);
              } else
                      clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
      
              if (blk_mq_hctx_has_pending(hctx)) {
                      blk_mq_run_hw_queue(hctx, true);
                      return true;
              }
      
              return false;
      }
      
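       /*
        * Dispatch requests for @hctx: drain the hctx dispatch list first,
        * then pull requests from the I/O scheduler (or the software queues
        * when no scheduler is attached) for as long as the driver keeps
        * accepting them.
        */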
      void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
      {
 1223         struct request_queue *q = hctx->queue;
              struct elevator_queue *e = q->elevator;
 1223         const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
              bool do_sched_dispatch = true;
 1223         LIST_HEAD(rq_list);
      
              /* RCU or SRCU read lock is needed before checking quiesced flag */
 1223         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
 1215                 return;
      
 1223         hctx->run++;
      
              /*
               * If we have previous entries on our dispatch list, grab them first for
               * more fair dispatch.
               */
 1223         if (!list_empty_careful(&hctx->dispatch)) {
    2                 spin_lock(&hctx->lock);
                      if (!list_empty(&hctx->dispatch))
    2                         list_splice_init(&hctx->dispatch, &rq_list);
    2                 spin_unlock(&hctx->lock);
              }
      
              /*
               * Only ask the scheduler for requests, if we didn't have residual
               * requests from the dispatch list. This is to avoid the case where
               * we only ever dispatch a fraction of the requests available because
               * of low device queue depth. Once we pull requests out of the IO
               * scheduler, we can no longer merge or sort them. So it's best to
               * leave them there for as long as we can. Mark the hw queue as
               * needing a restart in that case.
               */
 1223         if (!list_empty(&rq_list)) {
    2                 blk_mq_sched_mark_restart_hctx(hctx);
    2                 do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
 1223         } else if (!has_sched_dispatch) {
                      blk_mq_flush_busy_ctxs(hctx, &rq_list);
                      blk_mq_dispatch_rq_list(q, &rq_list);
              }
      
              /*
               * We want to dispatch from the scheduler if there was nothing
               * on the dispatch list or we were able to dispatch from the
               * dispatch list.
               */
    2         if (do_sched_dispatch && has_sched_dispatch) {
                      do {
                              struct request *rq;
      
 1223                         rq = e->type->ops.mq.dispatch_request(hctx);
                              if (!rq)
                                      break;
 1223                         list_add(&rq->queuelist, &rq_list);
 1223                 } while (blk_mq_dispatch_rq_list(q, &rq_list));
              }
      }
      
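       /*
        * Ask the elevator whether @bio can be merged into an existing request
        * and perform the merge if so. On success, @merged_request may be set
        * to a request that became redundant and should be freed by the caller.
        */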
      bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                                  struct request **merged_request)
      {
              struct request *rq;
      
 1733         switch (elv_merge(q, &rq, bio)) {
              case ELEVATOR_BACK_MERGE:
   39                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 1731                         return false;
   39                 if (!bio_attempt_back_merge(q, rq, bio))
                              return false;
   39                 *merged_request = attempt_back_merge(q, rq);
                      if (!*merged_request)
   39                         elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                      return true;
              case ELEVATOR_FRONT_MERGE:
   13                 if (!blk_mq_sched_allow_merge(q, rq, bio))
                              return false;
   13                 if (!bio_attempt_front_merge(q, rq, bio))
                              return false;
   13                 *merged_request = attempt_front_merge(q, rq);
                      if (!*merged_request)
   13                         elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                      return true;
              default:
                      return false;
              }
      }
      EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
      
      /*
       * Reverse check our software queue for entries that we could potentially
       * merge with. Currently includes a hand-wavy stop count of 8, to not spend
       * too much time checking for merges.
       */
      static bool blk_mq_attempt_merge(struct request_queue *q,
                                       struct blk_mq_ctx *ctx, struct bio *bio)
      {
              struct request *rq;
              int checked = 8;
      
              lockdep_assert_held(&ctx->lock);
      
              list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
                      bool merged = false;
      
                      if (!checked--)
                              break;
      
                      if (!blk_rq_merge_ok(rq, bio))
                              continue;
      
                      switch (blk_try_merge(rq, bio)) {
                      case ELEVATOR_BACK_MERGE:
                              if (blk_mq_sched_allow_merge(q, rq, bio))
                                      merged = bio_attempt_back_merge(q, rq, bio);
                              break;
                      case ELEVATOR_FRONT_MERGE:
                              if (blk_mq_sched_allow_merge(q, rq, bio))
                                      merged = bio_attempt_front_merge(q, rq, bio);
                              break;
                      case ELEVATOR_DISCARD_MERGE:
                              merged = bio_attempt_discard_merge(q, rq, bio);
                              break;
                      default:
                              continue;
                      }
      
                      if (merged)
                              ctx->rq_merged++;
                      return merged;
              }
      
              return false;
      }
      
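       /*
        * Try to merge @bio into an already queued request: use the elevator's
        * bio_merge hook when one is provided, otherwise fall back to the
        * default per-software-queue merge above.
        */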
      bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
      {
 1733         struct elevator_queue *e = q->elevator;
              struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
              struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
              bool ret = false;
      
 1733         if (e && e->type->ops.mq.bio_merge) {
 1733                 blk_mq_put_ctx(ctx);
 1733                 return e->type->ops.mq.bio_merge(hctx, bio);
              }
      
              if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
                              !list_empty_careful(&ctx->rq_list)) {
                      /* default per sw-queue merge */
                      spin_lock(&ctx->lock);
                      ret = blk_mq_attempt_merge(q, ctx, bio);
                      spin_unlock(&ctx->lock);
              }
      
 1731         blk_mq_put_ctx(ctx);
              return ret;
      }
      
      bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
      {
 2338         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
      }
      EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
      
      void blk_mq_sched_request_inserted(struct request *rq)
      {
 2338         trace_block_rq_insert(rq->q, rq);
      }
      EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
      
      static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
                                             struct request *rq)
      {
              if (rq->tag == -1) {
 1423                 rq->rq_flags |= RQF_SORTED;
                      return false;
              }
      
              /*
               * If we already have a real request tag, send directly to
               * the dispatch list.
               */
              spin_lock(&hctx->lock);
              list_add(&rq->queuelist, &hctx->dispatch);
              spin_unlock(&hctx->lock);
              return true;
      }
      
      /**
       * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
       * @pos:    loop cursor.
       * @skip:   the list element that will not be examined. Iteration starts at
       *          @skip->next.
       * @head:   head of the list to examine. This list must have at least one
       *          element, namely @skip.
       * @member: name of the list_head structure within typeof(*pos).
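        *
        * Typically used under rcu_read_lock(), as in blk_mq_sched_restart()
        * below. A minimal usage sketch (do_something() is just a
        * placeholder):
        *
        *        rcu_read_lock();
        *        list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
        *                                   tag_set_list)
        *                do_something(q);
        *        rcu_read_unlock();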
       */
      #define list_for_each_entry_rcu_rr(pos, skip, head, member)                \
              for ((pos) = (skip);                                                \
                   (pos = (pos)->member.next != (head) ? list_entry_rcu(        \
                              (pos)->member.next, typeof(*pos), member) :        \
                    list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
                   (pos) != (skip); )
      
      /*
       * Called after a driver tag has been freed to check whether a hctx needs to
       * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
       * queues in a round-robin fashion if the tag set of @hctx is shared with other
       * hardware queues.
       */
      void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
      {
              struct blk_mq_tags *const tags = hctx->tags;
   43         struct blk_mq_tag_set *const set = hctx->queue->tag_set;
              struct request_queue *const queue = hctx->queue, *q;
              struct blk_mq_hw_ctx *hctx2;
              unsigned int i, j;
      
              if (set->flags & BLK_MQ_F_TAG_SHARED) {
                      /*
                       * If this is 0, then we know that no hardware queues
                       * have RESTART marked. We're done.
                       */
                      if (!atomic_read(&queue->shared_hctx_restart))
                              return;
      
                      rcu_read_lock();
                      list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
                                                 tag_set_list) {
                              queue_for_each_hw_ctx(q, hctx2, i)
                                      if (hctx2->tags == tags &&
                                          blk_mq_sched_restart_hctx(hctx2))
                                              goto done;
                      }
                      j = hctx->queue_num + 1;
                      for (i = 0; i < queue->nr_hw_queues; i++, j++) {
                              if (j == queue->nr_hw_queues)
                                      j = 0;
                              hctx2 = queue->queue_hw_ctx[j];
                              if (hctx2->tags == tags &&
                                  blk_mq_sched_restart_hctx(hctx2))
                                      break;
                      }
      done:
                      rcu_read_unlock();
              } else {
   43                 blk_mq_sched_restart_hctx(hctx);
              }
      }
      
      /*
       * Add flush/fua to the queue. If we fail getting a driver tag, then
       * punt to the requeue list. Requeue will re-invoke us from a context
       * that's safe to block from.
       */
      static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq, bool can_block)
      {
              if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
  544                 blk_insert_flush(rq);
                      blk_mq_run_hw_queue(hctx, true);
              } else
                      blk_mq_add_to_requeue_list(rq, false, true);
      }
      
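       /*
        * Insert @rq into the queue: flush/fua requests without a driver tag
        * take the flush machinery, requests that already hold a driver tag go
        * straight to the dispatch list, and everything else is handed to the
        * elevator (or the software queue when no elevator is attached).
        */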
      void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                       bool run_queue, bool async, bool can_block)
      {
 1593         struct request_queue *q = rq->q;
              struct elevator_queue *e = q->elevator;
              struct blk_mq_ctx *ctx = rq->mq_ctx;
              struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
      
 1593         if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
  544                 blk_mq_sched_insert_flush(hctx, rq, can_block);
                      return;
              }
      
 1423         if (e && blk_mq_sched_bypass_insert(hctx, rq))
                      goto run;
      
 1423         if (e && e->type->ops.mq.insert_requests) {
 1423                 LIST_HEAD(list);
      
 1423                 list_add(&rq->queuelist, &list);
 1423                 e->type->ops.mq.insert_requests(hctx, &list, at_head);
              } else {
                      spin_lock(&ctx->lock);
                      __blk_mq_insert_request(hctx, rq, at_head);
                      spin_unlock(&ctx->lock);
              }
      
      run:
 1591         if (run_queue)
 1423                 blk_mq_run_hw_queue(hctx, async);
      }
      
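       /*
        * Insert a list of requests (e.g. from a plug) for @ctx, routing them
        * through the elevator's insert_requests hook when an I/O scheduler is
        * attached, then kick the hardware queue.
        */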
      void blk_mq_sched_insert_requests(struct request_queue *q,
                                        struct blk_mq_ctx *ctx,
                                        struct list_head *list, bool run_queue_async)
      {
 1368         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
              struct elevator_queue *e = hctx->queue->elevator;
      
              if (e) {
                      struct request *rq, *next;
      
                      /*
                       * We bypass requests that already have a driver tag assigned,
                       * which should only be flushes. Flushes are only ever inserted
                       * as single requests, so we shouldn't ever hit the
                       * WARN_ON_ONCE() below (but let's handle it just in case).
                       */
 1368                 list_for_each_entry_safe(rq, next, list, queuelist) {
 1368                         if (WARN_ON_ONCE(rq->tag != -1)) {
                                      list_del_init(&rq->queuelist);
                                      blk_mq_sched_bypass_insert(hctx, rq);
                              }
                      }
              }
      
 1368         if (e && e->type->ops.mq.insert_requests)
 1368                 e->type->ops.mq.insert_requests(hctx, list, false);
              else
                      blk_mq_insert_requests(hctx, ctx, list);
      
 1368         blk_mq_run_hw_queue(hctx, run_queue_async);
      }
      
      static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                         struct blk_mq_hw_ctx *hctx,
                                         unsigned int hctx_idx)
      {
   27         if (hctx->sched_tags) {
   27                 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
                      blk_mq_free_rq_map(hctx->sched_tags);
                      hctx->sched_tags = NULL;
              }
      }
      
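       /*
        * Allocate the scheduler tag set and requests for one hardware queue,
        * sized by q->nr_requests.
        */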
      static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                         struct blk_mq_hw_ctx *hctx,
                                         unsigned int hctx_idx)
      {
              struct blk_mq_tag_set *set = q->tag_set;
              int ret;
      
   39         hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                                     set->reserved_tags);
              if (!hctx->sched_tags)
                      return -ENOMEM;
      
   40         ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
   40         if (ret)
                      blk_mq_sched_free_tags(set, hctx, hctx_idx);
      
              return ret;
      }
      
      static void blk_mq_sched_tags_teardown(struct request_queue *q)
      {
              struct blk_mq_tag_set *set = q->tag_set;
              struct blk_mq_hw_ctx *hctx;
              int i;
      
              queue_for_each_hw_ctx(q, hctx, i)
                      blk_mq_sched_free_tags(set, hctx, i);
      }
      
      int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                                 unsigned int hctx_idx)
      {
   39         struct elevator_queue *e = q->elevator;
              int ret;
      
              if (!e)
                      return 0;
      
              ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
              if (ret)
                      return ret;
      
              if (e->type->ops.mq.init_hctx) {
                      ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
                      if (ret) {
                              blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
                              return ret;
                      }
              }
      
              blk_mq_debugfs_register_sched_hctx(q, hctx);
      
   39         return 0;
      }
      
      void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                                  unsigned int hctx_idx)
      {
   27         struct elevator_queue *e = q->elevator;
      
              if (!e)
                      return;
      
   27         blk_mq_debugfs_unregister_sched_hctx(hctx);
      
              if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                      e->type->ops.mq.exit_hctx(hctx, hctx_idx);
                      hctx->sched_data = NULL;
              }
      
   27         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
      }
      
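       /*
        * Attach I/O scheduler @e to @q (@e may be NULL for "none"): allocate
        * per-hctx scheduler tags, call the elevator's init_sched and optional
        * init_hctx hooks, and register the debugfs attributes. Tears
        * everything back down on failure.
        */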
      int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
      {
              struct blk_mq_hw_ctx *hctx;
              struct elevator_queue *eq;
              unsigned int i;
              int ret;
      
   39         if (!e) {
                      q->elevator = NULL;
                      return 0;
              }
      
               /*
                * Default to twice the smaller of the hardware queue depth and
                * 128, since we don't split into sync/async like the old code
                * did. Additionally, this is a per-hw queue depth.
                */
   39         q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                         BLKDEV_MAX_RQ);
      
   40         queue_for_each_hw_ctx(q, hctx, i) {
                      ret = blk_mq_sched_alloc_tags(q, hctx, i);
                      if (ret)
                              goto err;
              }
      
   40         ret = e->ops.mq.init_sched(q, e);
              if (ret)
                      goto err;
      
   40         blk_mq_debugfs_register_sched(q);
      
   40         queue_for_each_hw_ctx(q, hctx, i) {
                      if (e->ops.mq.init_hctx) {
                              ret = e->ops.mq.init_hctx(hctx, i);
                              if (ret) {
                                      eq = q->elevator;
                                      blk_mq_exit_sched(q, eq);
                                      kobject_put(&eq->kobj);
                                      return ret;
                              }
                      }
   40                 blk_mq_debugfs_register_sched_hctx(q, hctx);
              }
      
              return 0;
      
      err:
              blk_mq_sched_tags_teardown(q);
              q->elevator = NULL;
   40         return ret;
      }
      
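       /*
        * Detach I/O scheduler @e from @q: tear down per-hctx scheduler state
        * and debugfs entries, call the elevator's exit_sched hook and free
        * the scheduler tags.
        */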
      void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
      {
              struct blk_mq_hw_ctx *hctx;
              unsigned int i;
      
              queue_for_each_hw_ctx(q, hctx, i) {
                      blk_mq_debugfs_unregister_sched_hctx(hctx);
                      if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                              e->type->ops.mq.exit_hctx(hctx, i);
                              hctx->sched_data = NULL;
                      }
              }
              blk_mq_debugfs_unregister_sched(q);
              if (e->type->ops.mq.exit_sched)
                      e->type->ops.mq.exit_sched(e);
              blk_mq_sched_tags_teardown(q);
              q->elevator = NULL;
      }
      
      int blk_mq_sched_init(struct request_queue *q)
      {
              int ret;
      
   39         mutex_lock(&q->sysfs_lock);
              ret = elevator_init(q, NULL);
              mutex_unlock(&q->sysfs_lock);
      
              return ret;
      }
      /*
       * Copyright (c) 2012 Taobao.
       * Written by Tao Ma <boyu.mt@taobao.com>
       *
       * This program is free software; you can redistribute it and/or modify it
       * under the terms of version 2.1 of the GNU Lesser General Public License
       * as published by the Free Software Foundation.
       *
       * This program is distributed in the hope that it will be useful,
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       * GNU General Public License for more details.
       */
      
      #include <linux/fiemap.h>
      
      #include "ext4_jbd2.h"
      #include "ext4.h"
      #include "xattr.h"
      #include "truncate.h"
      #include <trace/events/android_fs.h>
      
      #define EXT4_XATTR_SYSTEM_DATA        "data"
      #define EXT4_MIN_INLINE_DATA_SIZE        ((sizeof(__le32) * EXT4_N_BLOCKS))
      #define EXT4_INLINE_DOTDOT_OFFSET        2
      #define EXT4_INLINE_DOTDOT_SIZE                4
      
      static int ext4_get_inline_size(struct inode *inode)
      {
              if (EXT4_I(inode)->i_inline_off)
                      return EXT4_I(inode)->i_inline_size;
      
              return 0;
      }
      
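       /*
        * Compute how many bytes of inline data can live in the system.data
        * xattr, based on the free space left in the in-inode xattr area.
        */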
      static int get_max_inline_xattr_value_size(struct inode *inode,
                                                 struct ext4_iloc *iloc)
      {
              struct ext4_xattr_ibody_header *header;
              struct ext4_xattr_entry *entry;
              struct ext4_inode *raw_inode;
              int free, min_offs;
      
              min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
                              EXT4_GOOD_OLD_INODE_SIZE -
                              EXT4_I(inode)->i_extra_isize -
                              sizeof(struct ext4_xattr_ibody_header);
      
              /*
               * We need to subtract another sizeof(__u32) since an in-inode xattr
               * needs an empty 4 bytes to indicate the gap between the xattr entry
               * and the name/value pair.
               */
              if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                      return EXT4_XATTR_SIZE(min_offs -
                              EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
                              EXT4_XATTR_ROUND - sizeof(__u32));
      
              raw_inode = ext4_raw_inode(iloc);
              header = IHDR(inode, raw_inode);
              entry = IFIRST(header);
      
              /* Compute min_offs. */
              for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
                      if (!entry->e_value_inum && entry->e_value_size) {
                              size_t offs = le16_to_cpu(entry->e_value_offs);
                              if (offs < min_offs)
                                      min_offs = offs;
                      }
              }
              free = min_offs -
                      ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
      
              if (EXT4_I(inode)->i_inline_off) {
                      entry = (struct ext4_xattr_entry *)
                              ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
      
                      free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
                      goto out;
              }
      
              free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
      
              if (free > EXT4_XATTR_ROUND)
                      free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
              else
                      free = 0;
      
      out:
              return free;
      }
      
      /*
        * Get the maximum size we can now store in an inode.
        * If we can't find the space for an xattr entry, don't use the space
       * of the extents since we have no space to indicate the inline data.
       */
      int ext4_get_max_inline_size(struct inode *inode)
      {
              int error, max_inline_size;
              struct ext4_iloc iloc;
      
              if (EXT4_I(inode)->i_extra_isize == 0)
                      return 0;
      
              error = ext4_get_inode_loc(inode, &iloc);
              if (error) {
                      ext4_error_inode(inode, __func__, __LINE__, 0,
                                       "can't get inode location %lu",
                                       inode->i_ino);
                      return 0;
              }
      
              down_read(&EXT4_I(inode)->xattr_sem);
              max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
              up_read(&EXT4_I(inode)->xattr_sem);
      
              brelse(iloc.bh);
      
              if (!max_inline_size)
                      return 0;
      
              return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
      }
      
      /*
        * This function does not take xattr_sem, which is OK because it is
        * currently only used in a code path coming from ext4_iget(), before
        * the new inode has been unlocked.
       */
      int ext4_find_inline_data_nolock(struct inode *inode)
      {
              struct ext4_xattr_ibody_find is = {
                      .s = { .not_found = -ENODATA, },
              };
              struct ext4_xattr_info i = {
                      .name_index = EXT4_XATTR_INDEX_SYSTEM,
                      .name = EXT4_XATTR_SYSTEM_DATA,
              };
              int error;
      
              if (EXT4_I(inode)->i_extra_isize == 0)
                      return 0;
      
              error = ext4_get_inode_loc(inode, &is.iloc);
              if (error)
                      return error;
      
              error = ext4_xattr_ibody_find(inode, &i, &is);
              if (error)
                      goto out;
      
              if (!is.s.not_found) {
                      if (is.s.here->e_value_inum) {
                              EXT4_ERROR_INODE(inode, "inline data xattr refers "
                                               "to an external xattr inode");
                              error = -EFSCORRUPTED;
                              goto out;
                      }
                      EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                              (void *)ext4_raw_inode(&is.iloc));
                      EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
                                      le32_to_cpu(is.s.here->e_value_size);
                      ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
              }
      out:
              brelse(is.iloc.bh);
              return error;
      }
      
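       /*
        * Copy up to @len bytes of inline data into @buffer: first from
        * i_block, then from the system.data xattr value. Returns the number
        * of bytes actually copied.
        */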
      static int ext4_read_inline_data(struct inode *inode, void *buffer,
                                       unsigned int len,
                                       struct ext4_iloc *iloc)
      {
              struct ext4_xattr_entry *entry;
              struct ext4_xattr_ibody_header *header;
              int cp_len = 0;
              struct ext4_inode *raw_inode;
      
              if (!len)
                      return 0;
      
              BUG_ON(len > EXT4_I(inode)->i_inline_size);
      
              cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
                              len : EXT4_MIN_INLINE_DATA_SIZE;
      
              raw_inode = ext4_raw_inode(iloc);
              memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
      
              len -= cp_len;
              buffer += cp_len;
      
              if (!len)
                      goto out;
      
              header = IHDR(inode, raw_inode);
              entry = (struct ext4_xattr_entry *)((void *)raw_inode +
                                                  EXT4_I(inode)->i_inline_off);
              len = min_t(unsigned int, len,
                          (unsigned int)le32_to_cpu(entry->e_value_size));
      
              memcpy(buffer,
                     (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
              cp_len += len;
      
      out:
              return cp_len;
      }
      
      /*
       * write the buffer to the inline inode.
       * If 'create' is set, we don't need to do the extra copy in the xattr
       * value since it is already handled by ext4_xattr_ibody_inline_set.
       * That saves us one memcpy.
       */
      static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
                                         void *buffer, loff_t pos, unsigned int len)
      {
              struct ext4_xattr_entry *entry;
              struct ext4_xattr_ibody_header *header;
              struct ext4_inode *raw_inode;
              int cp_len = 0;
      
              if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                      return;
      
              BUG_ON(!EXT4_I(inode)->i_inline_off);
              BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
      
              raw_inode = ext4_raw_inode(iloc);
              buffer += pos;
      
              if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
                      cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
                               EXT4_MIN_INLINE_DATA_SIZE - pos : len;
                      memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
      
                      len -= cp_len;
                      buffer += cp_len;
                      pos += cp_len;
              }
      
              if (!len)
                      return;
      
              pos -= EXT4_MIN_INLINE_DATA_SIZE;
              header = IHDR(inode, raw_inode);
              entry = (struct ext4_xattr_entry *)((void *)raw_inode +
                                                  EXT4_I(inode)->i_inline_off);
      
              memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
                     buffer, len);
      }
      
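       /*
        * Create the system.data xattr so the inode can hold @len bytes of
        * inline data and switch the inode into inline-data mode.
        */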
      static int ext4_create_inline_data(handle_t *handle,
                                         struct inode *inode, unsigned len)
      {
              int error;
              void *value = NULL;
              struct ext4_xattr_ibody_find is = {
                      .s = { .not_found = -ENODATA, },
              };
              struct ext4_xattr_info i = {
                      .name_index = EXT4_XATTR_INDEX_SYSTEM,
                      .name = EXT4_XATTR_SYSTEM_DATA,
              };
      
              error = ext4_get_inode_loc(inode, &is.iloc);
              if (error)
                      return error;
      
              BUFFER_TRACE(is.iloc.bh, "get_write_access");
              error = ext4_journal_get_write_access(handle, is.iloc.bh);
              if (error)
                      goto out;
      
              if (len > EXT4_MIN_INLINE_DATA_SIZE) {
                      value = EXT4_ZERO_XATTR_VALUE;
                      len -= EXT4_MIN_INLINE_DATA_SIZE;
              } else {
                      value = "";
                      len = 0;
              }
      
               /* Insert the xattr entry. */
              i.value = value;
              i.value_len = len;
      
              error = ext4_xattr_ibody_find(inode, &i, &is);
              if (error)
                      goto out;
      
              BUG_ON(!is.s.not_found);
      
              error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
              if (error) {
                      if (error == -ENOSPC)
                              ext4_clear_inode_state(inode,
                                                     EXT4_STATE_MAY_INLINE_DATA);
                      goto out;
              }
      
              memset((void *)ext4_raw_inode(&is.iloc)->i_block,
                      0, EXT4_MIN_INLINE_DATA_SIZE);
      
              EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                            (void *)ext4_raw_inode(&is.iloc));
              EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
              ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
              ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
              get_bh(is.iloc.bh);
              error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
      
      out:
              brelse(is.iloc.bh);
              return error;
      }
      
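       /*
        * Enlarge the system.data xattr so the inode can hold @len bytes of
        * inline data; a no-op if the current inline area is already big
        * enough.
        */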
      static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
                                         unsigned int len)
      {
              int error;
              void *value = NULL;
              struct ext4_xattr_ibody_find is = {
                      .s = { .not_found = -ENODATA, },
              };
              struct ext4_xattr_info i = {
                      .name_index = EXT4_XATTR_INDEX_SYSTEM,
                      .name = EXT4_XATTR_SYSTEM_DATA,
              };
      
              /* If the old space is ok, write the data directly. */
              if (len <= EXT4_I(inode)->i_inline_size)
                      return 0;
      
              error = ext4_get_inode_loc(inode, &is.iloc);
              if (error)
                      return error;
      
              error = ext4_xattr_ibody_find(inode, &i, &is);
              if (error)
                      goto out;
      
              BUG_ON(is.s.not_found);
      
              len -= EXT4_MIN_INLINE_DATA_SIZE;
              value = kzalloc(len, GFP_NOFS);
              if (!value) {
                      error = -ENOMEM;
                      goto out;
              }
      
              error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
                                           value, len);
              if (error == -ENODATA)
                      goto out;
      
              BUFFER_TRACE(is.iloc.bh, "get_write_access");
              error = ext4_journal_get_write_access(handle, is.iloc.bh);
              if (error)
                      goto out;
      
               /* Update the xattr entry. */
              i.value = value;
              i.value_len = len;
      
              error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
              if (error)
                      goto out;
      
              EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                            (void *)ext4_raw_inode(&is.iloc));
              EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
                                      le32_to_cpu(is.s.here->e_value_size);
              ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
              get_bh(is.iloc.bh);
              error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
      
      out:
              kfree(value);
              brelse(is.iloc.bh);
              return error;
      }
      
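       /*
        * Make sure the inode can hold @len bytes of inline data, creating or
        * enlarging the system.data xattr as needed. Returns -ENOSPC when the
        * data can't fit inline.
        */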
      static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
                                          unsigned int len)
      {
              int ret, size, no_expand;
              struct ext4_inode_info *ei = EXT4_I(inode);
      
              if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
                      return -ENOSPC;
      
              size = ext4_get_max_inline_size(inode);
              if (size < len)
                      return -ENOSPC;
      
              ext4_write_lock_xattr(inode, &no_expand);
      
              if (ei->i_inline_off)
                      ret = ext4_update_inline_data(handle, inode, len);
              else
                      ret = ext4_create_inline_data(handle, inode, len);
      
              ext4_write_unlock_xattr(inode, &no_expand);
              return ret;
      }
      
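       /*
        * Remove the inline data (the system.data xattr and the i_block
        * contents) and switch the inode back to extents/blocks. Callers must
        * hold xattr_sem for writing.
        */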
      static int ext4_destroy_inline_data_nolock(handle_t *handle,
                                                 struct inode *inode)
      {
              struct ext4_inode_info *ei = EXT4_I(inode);
              struct ext4_xattr_ibody_find is = {
                      .s = { .not_found = 0, },
              };
              struct ext4_xattr_info i = {
                      .name_index = EXT4_XATTR_INDEX_SYSTEM,
                      .name = EXT4_XATTR_SYSTEM_DATA,
                      .value = NULL,
                      .value_len = 0,
              };
              int error;
      
              if (!ei->i_inline_off)
                      return 0;
      
              error = ext4_get_inode_loc(inode, &is.iloc);
              if (error)
                      return error;
      
              error = ext4_xattr_ibody_find(inode, &i, &is);
              if (error)
                      goto out;
      
              BUFFER_TRACE(is.iloc.bh, "get_write_access");
              error = ext4_journal_get_write_access(handle, is.iloc.bh);
              if (error)
                      goto out;
      
              error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
              if (error)
                      goto out;
      
              memset((void *)ext4_raw_inode(&is.iloc)->i_block,
                      0, EXT4_MIN_INLINE_DATA_SIZE);
              memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE);
      
              if (ext4_has_feature_extents(inode->i_sb)) {
                      if (S_ISDIR(inode->i_mode) ||
                          S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
                              ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                              ext4_ext_tree_init(handle, inode);
                      }
              }
              ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
      
              get_bh(is.iloc.bh);
              error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
      
              EXT4_I(inode)->i_inline_off = 0;
              EXT4_I(inode)->i_inline_size = 0;
              ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
      out:
              brelse(is.iloc.bh);
              if (error == -ENODATA)
                      error = 0;
              return error;
      }
      
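       /*
        * Fill @page (which must be the first page of the file) with the
        * inode's inline data and zero the remainder of the page.
        */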
      static int ext4_read_inline_page(struct inode *inode, struct page *page)
      {
              void *kaddr;
              int ret = 0;
              size_t len;
              struct ext4_iloc iloc;
      
              BUG_ON(!PageLocked(page));
              BUG_ON(!ext4_has_inline_data(inode));
              BUG_ON(page->index);
      
              if (!EXT4_I(inode)->i_inline_off) {
                      ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
                                   inode->i_ino);
                      goto out;
              }
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      goto out;
      
              len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
              kaddr = kmap_atomic(page);
              ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
              flush_dcache_page(page);
              kunmap_atomic(kaddr);
              zero_user_segment(page, len, PAGE_SIZE);
              SetPageUptodate(page);
              brelse(iloc.bh);
      
      out:
              return ret;
      }
      
      int ext4_readpage_inline(struct inode *inode, struct page *page)
      {
              int ret = 0;
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      up_read(&EXT4_I(inode)->xattr_sem);
                      return -EAGAIN;
              }
      
              if (trace_android_fs_dataread_start_enabled()) {
                      char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
      
                      path = android_fstrace_get_pathname(pathbuf,
                                                          MAX_TRACE_PATHBUF_LEN,
                                                          inode);
                      trace_android_fs_dataread_start(inode, page_offset(page),
                                                      PAGE_SIZE, current->pid,
                                                      path, current->comm);
              }
      
              /*
                * Current inline data can only exist in the first page,
                * so for all the other pages, just set them uptodate.
               */
              if (!page->index)
                      ret = ext4_read_inline_page(inode, page);
              else if (!PageUptodate(page)) {
                      zero_user_segment(page, 0, PAGE_SIZE);
                      SetPageUptodate(page);
              }
      
              trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
      
              up_read(&EXT4_I(inode)->xattr_sem);
      
              unlock_page(page);
              return ret >= 0 ? 0 : ret;
      }
      
      static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
                                                    struct inode *inode,
                                                    unsigned flags)
      {
              int ret, needed_blocks, no_expand;
              handle_t *handle = NULL;
              int retries = 0, sem_held = 0;
              struct page *page = NULL;
              unsigned from, to;
              struct ext4_iloc iloc;
      
              if (!ext4_has_inline_data(inode)) {
                      /*
                       * clear the flag so that no new write
                       * will trap here again.
                       */
                      ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                      return 0;
              }
      
              needed_blocks = ext4_writepage_trans_blocks(inode);
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
      retry:
              handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      handle = NULL;
                      goto out;
              }
      
               /*
                * We cannot recurse into the filesystem as the transaction is
                * already started.
                */
              flags |= AOP_FLAG_NOFS;
      
              page = grab_cache_page_write_begin(mapping, 0, flags);
              if (!page) {
                      ret = -ENOMEM;
                      goto out;
              }
      
              ext4_write_lock_xattr(inode, &no_expand);
              sem_held = 1;
               /* If someone has already done this for us, just exit. */
              if (!ext4_has_inline_data(inode)) {
                      ret = 0;
                      goto out;
              }
      
              from = 0;
              to = ext4_get_inline_size(inode);
              if (!PageUptodate(page)) {
                      ret = ext4_read_inline_page(inode, page);
                      if (ret < 0)
                              goto out;
              }
      
              ret = ext4_destroy_inline_data_nolock(handle, inode);
              if (ret)
                      goto out;
      
              if (ext4_should_dioread_nolock(inode)) {
                      ret = __block_write_begin(page, from, to,
                                                ext4_get_block_unwritten);
              } else
                      ret = __block_write_begin(page, from, to, ext4_get_block);
      
              if (!ret && ext4_should_journal_data(inode)) {
                      ret = ext4_walk_page_buffers(handle, page_buffers(page),
                                                   from, to, NULL,
                                                   do_journal_get_write_access);
              }
      
              if (ret) {
                      unlock_page(page);
                      put_page(page);
                      page = NULL;
                      ext4_orphan_add(handle, inode);
                      ext4_write_unlock_xattr(inode, &no_expand);
                      sem_held = 0;
                      ext4_journal_stop(handle);
                      handle = NULL;
                      ext4_truncate_failed_write(inode);
                      /*
                       * If truncate failed early the inode might
                       * still be on the orphan list; we need to
                       * make sure the inode is removed from the
                       * orphan list in that case.
                       */
                      if (inode->i_nlink)
                              ext4_orphan_del(NULL, inode);
              }
      
              if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                      goto retry;
      
              if (page)
                      block_commit_write(page, from, to);
      out:
              if (page) {
                      unlock_page(page);
                      put_page(page);
              }
              if (sem_held)
                      ext4_write_unlock_xattr(inode, &no_expand);
              if (handle)
                      ext4_journal_stop(handle);
              brelse(iloc.bh);
              return ret;
      }
      
      /*
       * Try to write data in the inode.
        * If the inode has inline data, check whether the new write can also
        * fit in the inode. If not, create the page and the handle, move the
        * data to the page, make it uptodate, and let the later code create an
        * extent for it.
       */
      int ext4_try_to_write_inline_data(struct address_space *mapping,
                                        struct inode *inode,
                                        loff_t pos, unsigned len,
                                        unsigned flags,
                                        struct page **pagep)
      {
              int ret;
              handle_t *handle;
              struct page *page;
              struct ext4_iloc iloc;
      
              if (pos + len > ext4_get_max_inline_size(inode))
                      goto convert;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
              /*
                * The write may end up stored in the inode itself,
                * so try to reserve the space in the inode first.
               */
              handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      handle = NULL;
                      goto out;
              }
      
              ret = ext4_prepare_inline_data(handle, inode, pos + len);
              if (ret && ret != -ENOSPC)
                      goto out;
      
               /* No space in the inline inode, so convert it to an extent. */
              if (ret == -ENOSPC) {
                      ext4_journal_stop(handle);
                      brelse(iloc.bh);
                      goto convert;
              }
      
              ret = ext4_journal_get_write_access(handle, iloc.bh);
              if (ret)
                      goto out;
      
              flags |= AOP_FLAG_NOFS;
      
              page = grab_cache_page_write_begin(mapping, 0, flags);
              if (!page) {
                      ret = -ENOMEM;
                      goto out;
              }
      
              *pagep = page;
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      ret = 0;
                      unlock_page(page);
                      put_page(page);
                      goto out_up_read;
              }
      
              if (!PageUptodate(page)) {
                      ret = ext4_read_inline_page(inode, page);
                      if (ret < 0) {
                              unlock_page(page);
                              put_page(page);
                              goto out_up_read;
                      }
              }
      
              ret = 1;
              handle = NULL;
      out_up_read:
              up_read(&EXT4_I(inode)->xattr_sem);
      out:
              if (handle && (ret != 1))
                      ext4_journal_stop(handle);
              brelse(iloc.bh);
              return ret;
      convert:
              return ext4_convert_inline_data_to_extent(mapping,
                                                        inode, flags);
      }
      
      int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
                                     unsigned copied, struct page *page)
      {
              int ret, no_expand;
              void *kaddr;
              struct ext4_iloc iloc;
      
              if (unlikely(copied < len)) {
                      if (!PageUptodate(page)) {
                              copied = 0;
                              goto out;
                      }
              }
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret) {
                      ext4_std_error(inode->i_sb, ret);
                      copied = 0;
                      goto out;
              }
      
              ext4_write_lock_xattr(inode, &no_expand);
              BUG_ON(!ext4_has_inline_data(inode));
      
              kaddr = kmap_atomic(page);
              ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
              kunmap_atomic(kaddr);
              SetPageUptodate(page);
               /* Clear page dirty so writepages won't write it out for us. */
              ClearPageDirty(page);
      
              ext4_write_unlock_xattr(inode, &no_expand);
              brelse(iloc.bh);
              mark_inode_dirty(inode);
      out:
              return copied;
      }
      
      struct buffer_head *
      ext4_journalled_write_inline_data(struct inode *inode,
                                        unsigned len,
                                        struct page *page)
      {
              int ret, no_expand;
              void *kaddr;
              struct ext4_iloc iloc;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret) {
                      ext4_std_error(inode->i_sb, ret);
                      return NULL;
              }
      
              ext4_write_lock_xattr(inode, &no_expand);
              kaddr = kmap_atomic(page);
              ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
              kunmap_atomic(kaddr);
              ext4_write_unlock_xattr(inode, &no_expand);
      
              return iloc.bh;
      }
      
      /*
       * Try to make the page cache and handle ready for the inline data case.
       * We can call this function in 2 cases:
       * 1. The inode is created and the first write exceeds inline size. We can
       *    clear the inode state safely.
        * 2. The inode has inline data, then we need to read the data, make it
        *    uptodate and dirty so that ext4_da_writepages can handle it. We
        *    don't need to start the journal since the file's metadata isn't
        *    changed now.
       */
      static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
                                                       struct inode *inode,
                                                       unsigned flags,
                                                       void **fsdata)
      {
              int ret = 0, inline_size;
              struct page *page;
      
              page = grab_cache_page_write_begin(mapping, 0, flags);
              if (!page)
                      return -ENOMEM;
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                      goto out;
              }
      
              inline_size = ext4_get_inline_size(inode);
      
              if (!PageUptodate(page)) {
                      ret = ext4_read_inline_page(inode, page);
                      if (ret < 0)
                              goto out;
              }
      
              ret = __block_write_begin(page, 0, inline_size,
                                        ext4_da_get_block_prep);
              if (ret) {
                      up_read(&EXT4_I(inode)->xattr_sem);
                      unlock_page(page);
                      put_page(page);
                      ext4_truncate_failed_write(inode);
                      return ret;
              }
      
              SetPageDirty(page);
              SetPageUptodate(page);
              ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
              *fsdata = (void *)CONVERT_INLINE_DATA;
      
      out:
              up_read(&EXT4_I(inode)->xattr_sem);
              if (page) {
                      unlock_page(page);
                      put_page(page);
              }
              return ret;
      }
      
      /*
       * Prepare the write for the inline data.
        * If the data can be written into the inode, we just read the page,
        * make it uptodate and start the journal.
        * Otherwise read the page and mark it dirty so that it can be handled
        * in writepages (the i_disksize update is left to the normal
        * ext4_da_write_end).
       */
      int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                          struct inode *inode,
                                          loff_t pos, unsigned len,
                                          unsigned flags,
                                          struct page **pagep,
                                          void **fsdata)
      {
              int ret, inline_size;
              handle_t *handle;
              struct page *page;
              struct ext4_iloc iloc;
              int retries = 0;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
      retry_journal:
              handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
              if (IS_ERR(handle)) {
                      ret = PTR_ERR(handle);
                      goto out;
              }
      
              inline_size = ext4_get_max_inline_size(inode);
      
              ret = -ENOSPC;
              if (inline_size >= pos + len) {
                      ret = ext4_prepare_inline_data(handle, inode, pos + len);
                      if (ret && ret != -ENOSPC)
                              goto out_journal;
              }
      
              /*
               * We cannot recurse into the filesystem as the transaction
               * is already started.
               */
              flags |= AOP_FLAG_NOFS;
      
              if (ret == -ENOSPC) {
                      ext4_journal_stop(handle);
                      ret = ext4_da_convert_inline_data_to_extent(mapping,
                                                                  inode,
                                                                  flags,
                                                                  fsdata);
                      if (ret == -ENOSPC &&
                          ext4_should_retry_alloc(inode->i_sb, &retries))
                              goto retry_journal;
                      goto out;
              }
      
              page = grab_cache_page_write_begin(mapping, 0, flags);
              if (!page) {
                      ret = -ENOMEM;
                      goto out_journal;
              }
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      ret = 0;
                      goto out_release_page;
              }
      
              if (!PageUptodate(page)) {
                      ret = ext4_read_inline_page(inode, page);
                      if (ret < 0)
                              goto out_release_page;
              }
              ret = ext4_journal_get_write_access(handle, iloc.bh);
              if (ret)
                      goto out_release_page;
      
              up_read(&EXT4_I(inode)->xattr_sem);
              *pagep = page;
              brelse(iloc.bh);
              return 1;
      out_release_page:
              up_read(&EXT4_I(inode)->xattr_sem);
              unlock_page(page);
              put_page(page);
      out_journal:
              ext4_journal_stop(handle);
      out:
              brelse(iloc.bh);
              return ret;
      }
      
      int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
                                        unsigned len, unsigned copied,
                                        struct page *page)
      {
              int ret;
      
              ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
              if (ret < 0) {
                      unlock_page(page);
                      put_page(page);
                      return ret;
              }
              copied = ret;
      
              /*
               * No need to use i_size_read() here, the i_size
               * cannot change under us because we hold i_mutex.
               *
               * But it's important to update i_size while still holding page lock:
               * page writeout could otherwise come in and zero beyond i_size.
               */
              if (pos+copied > inode->i_size)
                      i_size_write(inode, pos+copied);
              unlock_page(page);
              put_page(page);
      
              /*
               * Don't mark the inode dirty under page lock. First, it unnecessarily
               * makes the holding time of page lock longer. Second, it forces lock
               * ordering of page lock and transaction start for journaling
               * filesystems.
               */
              mark_inode_dirty(inode);
      
              return copied;
      }
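
/*
 * Illustrative sketch (not part of the build): how a delalloc write path
 * might pair the two helpers above.  The function name and surrounding
 * locking are assumptions for illustration only.  The begin helper returns
 * 1 when the write can be served from inline data (page locked, *pagep
 * set), 0 when the caller should fall back to the block-based path, and a
 * negative errno on failure.
 */
#if 0
static int example_da_write_begin(struct address_space *mapping,
                                  struct inode *inode, loff_t pos,
                                  unsigned len, unsigned flags,
                                  struct page **pagep, void **fsdata)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                int ret = ext4_da_write_inline_data_begin(mapping, inode,
                                                          pos, len, flags,
                                                          pagep, fsdata);
                if (ret < 0)
                        return ret;     /* hard error */
                if (ret == 1)
                        return 0;       /* data stays inline, page ready */
                /* ret == 0: fall through to the block-based path */
        }
        /* ... the regular block-based write_begin would continue here ... */
        return 0;
}
#endif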
      
      #ifdef INLINE_DIR_DEBUG
      void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
                                void *inline_start, int inline_size)
      {
              int offset;
              unsigned short de_len;
              struct ext4_dir_entry_2 *de = inline_start;
              void *dlimit = inline_start + inline_size;
      
              trace_printk("inode %lu\n", dir->i_ino);
              offset = 0;
              while ((void *)de < dlimit) {
                      de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
                      trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
                                   offset, de_len, de->name_len, de->name,
                                   de->name_len, le32_to_cpu(de->inode));
                      if (ext4_check_dir_entry(dir, NULL, de, bh,
                                               inline_start, inline_size, offset))
                              BUG();
      
                      offset += de_len;
                      de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
              }
      }
      #else
      #define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
      #endif
      
/*
 * Add a new entry into an inline dir.
 * It will return -ENOSPC if no space is available, and -EIO
 * or -EEXIST if the directory entry already exists.
 */
      static int ext4_add_dirent_to_inline(handle_t *handle,
                                           struct ext4_filename *fname,
                                           struct inode *dir,
                                           struct inode *inode,
                                           struct ext4_iloc *iloc,
                                           void *inline_start, int inline_size)
      {
              int                err;
              struct ext4_dir_entry_2 *de;
      
              err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
                                      inline_size, fname, &de);
              if (err)
                      return err;
      
              BUFFER_TRACE(iloc->bh, "get_write_access");
              err = ext4_journal_get_write_access(handle, iloc->bh);
              if (err)
                      return err;
              ext4_insert_dentry(inode, de, inline_size, fname);
      
              ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
      
              /*
               * XXX shouldn't update any times until successful
               * completion of syscall, but too many callers depend
               * on this.
               *
               * XXX similarly, too many callers depend on
               * ext4_new_inode() setting the times, but error
               * recovery deletes the inode, so the worst that can
               * happen is that the times are slightly out of date
               * and/or different from the directory change time.
               */
              dir->i_mtime = dir->i_ctime = current_time(dir);
              ext4_update_dx_flag(dir);
              dir->i_version++;
              return 1;
      }
      
      static void *ext4_get_inline_xattr_pos(struct inode *inode,
                                             struct ext4_iloc *iloc)
      {
              struct ext4_xattr_entry *entry;
              struct ext4_xattr_ibody_header *header;
      
              BUG_ON(!EXT4_I(inode)->i_inline_off);
      
              header = IHDR(inode, ext4_raw_inode(iloc));
              entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
                                                  EXT4_I(inode)->i_inline_off);
      
              return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
      }
      
      /* Set the final de to cover the whole block. */
      static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
      {
              struct ext4_dir_entry_2 *de, *prev_de;
              void *limit;
              int de_len;
      
              de = (struct ext4_dir_entry_2 *)de_buf;
              if (old_size) {
                      limit = de_buf + old_size;
                      do {
                              prev_de = de;
                              de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
                              de_buf += de_len;
                              de = (struct ext4_dir_entry_2 *)de_buf;
                      } while (de_buf < limit);
      
                      prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
                                                              old_size, new_size);
              } else {
                      /* this is just created, so create an empty entry. */
                      de->inode = 0;
                      de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
              }
      }
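
/*
 * Worked example for ext4_update_final_de() (illustrative numbers only):
 * if the inline area grows from old_size = 72 bytes to new_size = 132
 * bytes and the last entry's rec_len was 24, that rec_len becomes
 * 24 + 132 - 72 = 84, so the final entry once again covers the whole area.
 */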
      
      static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
                                        struct ext4_iloc *iloc)
      {
              int ret;
              int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
              int new_size = get_max_inline_xattr_value_size(dir, iloc);
      
              if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
                      return -ENOSPC;
      
              ret = ext4_update_inline_data(handle, dir,
                                            new_size + EXT4_MIN_INLINE_DATA_SIZE);
              if (ret)
                      return ret;
      
              ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
                                   EXT4_I(dir)->i_inline_size -
                                                      EXT4_MIN_INLINE_DATA_SIZE);
              dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
              return 0;
      }
      
      static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
                                           struct ext4_iloc *iloc,
                                           void *buf, int inline_size)
      {
              ext4_create_inline_data(handle, inode, inline_size);
              ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
              ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
      }
      
      static int ext4_finish_convert_inline_dir(handle_t *handle,
                                                struct inode *inode,
                                                struct buffer_head *dir_block,
                                                void *buf,
                                                int inline_size)
      {
              int err, csum_size = 0, header_size = 0;
              struct ext4_dir_entry_2 *de;
              struct ext4_dir_entry_tail *t;
              void *target = dir_block->b_data;
      
              /*
               * First create "." and ".." and then copy the dir information
               * back to the block.
               */
              de = (struct ext4_dir_entry_2 *)target;
              de = ext4_init_dot_dotdot(inode, de,
                      inode->i_sb->s_blocksize, csum_size,
                      le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
              header_size = (void *)de - target;
      
              memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
                      inline_size - EXT4_INLINE_DOTDOT_SIZE);
      
              if (ext4_has_metadata_csum(inode->i_sb))
                      csum_size = sizeof(struct ext4_dir_entry_tail);
      
              inode->i_size = inode->i_sb->s_blocksize;
              i_size_write(inode, inode->i_sb->s_blocksize);
              EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
              ext4_update_final_de(dir_block->b_data,
                              inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
                              inode->i_sb->s_blocksize - csum_size);
      
              if (csum_size) {
                      t = EXT4_DIRENT_TAIL(dir_block->b_data,
                                           inode->i_sb->s_blocksize);
                      initialize_dirent_tail(t, inode->i_sb->s_blocksize);
              }
              set_buffer_uptodate(dir_block);
              err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
              if (err)
                      return err;
              set_buffer_verified(dir_block);
              return ext4_mark_inode_dirty(handle, inode);
      }
      
      static int ext4_convert_inline_data_nolock(handle_t *handle,
                                                 struct inode *inode,
                                                 struct ext4_iloc *iloc)
      {
              int error;
              void *buf = NULL;
              struct buffer_head *data_bh = NULL;
              struct ext4_map_blocks map;
              int inline_size;
      
              inline_size = ext4_get_inline_size(inode);
              buf = kmalloc(inline_size, GFP_NOFS);
              if (!buf) {
                      error = -ENOMEM;
                      goto out;
              }
      
              error = ext4_read_inline_data(inode, buf, inline_size, iloc);
              if (error < 0)
                      goto out;
      
              /*
               * Make sure the inline directory entries pass checks before we try to
               * convert them, so that we avoid touching stuff that needs fsck.
               */
              if (S_ISDIR(inode->i_mode)) {
                      error = ext4_check_all_de(inode, iloc->bh,
                                              buf + EXT4_INLINE_DOTDOT_SIZE,
                                              inline_size - EXT4_INLINE_DOTDOT_SIZE);
                      if (error)
                              goto out;
              }
      
              error = ext4_destroy_inline_data_nolock(handle, inode);
              if (error)
                      goto out;
      
              map.m_lblk = 0;
              map.m_len = 1;
              map.m_flags = 0;
              error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
              if (error < 0)
                      goto out_restore;
              if (!(map.m_flags & EXT4_MAP_MAPPED)) {
                      error = -EIO;
                      goto out_restore;
              }
      
              data_bh = sb_getblk(inode->i_sb, map.m_pblk);
              if (!data_bh) {
                      error = -ENOMEM;
                      goto out_restore;
              }
      
              lock_buffer(data_bh);
              error = ext4_journal_get_create_access(handle, data_bh);
              if (error) {
                      unlock_buffer(data_bh);
                      error = -EIO;
                      goto out_restore;
              }
              memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
      
              if (!S_ISDIR(inode->i_mode)) {
                      memcpy(data_bh->b_data, buf, inline_size);
                      set_buffer_uptodate(data_bh);
                      error = ext4_handle_dirty_metadata(handle,
                                                         inode, data_bh);
              } else {
                      error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
                                                             buf, inline_size);
              }
      
              unlock_buffer(data_bh);
      out_restore:
              if (error)
                      ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
      
      out:
              brelse(data_bh);
              kfree(buf);
              return error;
      }
      
/*
 * Try to add the new entry to the inline data.
 * Return 1 if the entry was added to the inline area (possibly after
 * expanding it into the xattr space).  Otherwise the dir is converted to a
 * block-based layout and 0 is returned so the caller can add the entry to
 * the newly created block.
 */
      int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
                                    struct inode *dir, struct inode *inode)
      {
              int ret, inline_size, no_expand;
              void *inline_start;
              struct ext4_iloc iloc;
      
              ret = ext4_get_inode_loc(dir, &iloc);
              if (ret)
                      return ret;
      
              ext4_write_lock_xattr(dir, &no_expand);
              if (!ext4_has_inline_data(dir))
                      goto out;
      
              inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                                       EXT4_INLINE_DOTDOT_SIZE;
              inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
      
              ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
                                              inline_start, inline_size);
              if (ret != -ENOSPC)
                      goto out;
      
              /* check whether it can be inserted to inline xattr space. */
              inline_size = EXT4_I(dir)->i_inline_size -
                              EXT4_MIN_INLINE_DATA_SIZE;
              if (!inline_size) {
                /* Try to use the xattr space. */
                      ret = ext4_update_inline_dir(handle, dir, &iloc);
                      if (ret && ret != -ENOSPC)
                              goto out;
      
                      inline_size = EXT4_I(dir)->i_inline_size -
                                      EXT4_MIN_INLINE_DATA_SIZE;
              }
      
              if (inline_size) {
                      inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
      
                      ret = ext4_add_dirent_to_inline(handle, fname, dir,
                                                      inode, &iloc, inline_start,
                                                      inline_size);
      
                      if (ret != -ENOSPC)
                              goto out;
              }
      
              /*
               * The inline space is filled up, so create a new block for it.
               * As the extent tree will be created, we have to save the inline
               * dir first.
               */
              ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
      
      out:
              ext4_write_unlock_xattr(dir, &no_expand);
              ext4_mark_inode_dirty(handle, dir);
              brelse(iloc.bh);
              return ret;
      }
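
/*
 * Illustrative sketch (not part of the build): how a directory-entry add
 * path might use ext4_try_add_inline_entry().  The function name is an
 * assumption for illustration only.  A return of 1 means the entry was
 * placed in the inline area; 0 means the dir is (or has just been
 * converted to) block based and the entry still has to be added there.
 */
#if 0
static int example_add_entry(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir, struct inode *inode)
{
        if (ext4_has_inline_data(dir)) {
                int retval = ext4_try_add_inline_entry(handle, fname,
                                                       dir, inode);
                if (retval < 0)
                        return retval;          /* hard error */
                if (retval == 1)
                        return 0;               /* added inline */
                /* retval == 0: add the entry to a directory block */
        }
        /* ... the block-based add-entry logic would continue here ... */
        return 0;
}
#endif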
      
/*
 * This function fills a red-black tree with information from an
 * inlined dir.  It returns the number of directory entries loaded
 * into the tree.  If there is an error it is returned in err.
 */
      int htree_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data)
      {
              int err = 0, count = 0;
              unsigned int parent_ino;
              int pos;
              struct ext4_dir_entry_2 *de;
              struct inode *inode = file_inode(dir_file);
              int ret, inline_size = 0;
              struct ext4_iloc iloc;
              void *dir_buf = NULL;
              struct ext4_dir_entry_2 fake;
              struct fscrypt_str tmp_str;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      up_read(&EXT4_I(inode)->xattr_sem);
                      *has_inline_data = 0;
                      goto out;
              }
      
              inline_size = ext4_get_inline_size(inode);
              dir_buf = kmalloc(inline_size, GFP_NOFS);
              if (!dir_buf) {
                      ret = -ENOMEM;
                      up_read(&EXT4_I(inode)->xattr_sem);
                      goto out;
              }
      
              ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
              up_read(&EXT4_I(inode)->xattr_sem);
              if (ret < 0)
                      goto out;
      
              pos = 0;
              parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
              while (pos < inline_size) {
                /*
                 * As an inlined dir doesn't store any information about '.',
                 * and only the inode number of '..' is stored, we have to
                 * handle them differently.
                 */
                      if (pos == 0) {
                              fake.inode = cpu_to_le32(inode->i_ino);
                              fake.name_len = 1;
                              strcpy(fake.name, ".");
                              fake.rec_len = ext4_rec_len_to_disk(
                                                      EXT4_DIR_REC_LEN(fake.name_len),
                                                      inline_size);
                              ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
                              de = &fake;
                              pos = EXT4_INLINE_DOTDOT_OFFSET;
                      } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
                              fake.inode = cpu_to_le32(parent_ino);
                              fake.name_len = 2;
                              strcpy(fake.name, "..");
                              fake.rec_len = ext4_rec_len_to_disk(
                                                      EXT4_DIR_REC_LEN(fake.name_len),
                                                      inline_size);
                              ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
                              de = &fake;
                              pos = EXT4_INLINE_DOTDOT_SIZE;
                      } else {
                              de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
                              pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
                              if (ext4_check_dir_entry(inode, dir_file, de,
                                               iloc.bh, dir_buf,
                                               inline_size, pos)) {
                                      ret = count;
                                      goto out;
                              }
                      }
      
                      ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
                      if ((hinfo->hash < start_hash) ||
                          ((hinfo->hash == start_hash) &&
                           (hinfo->minor_hash < start_minor_hash)))
                              continue;
                      if (de->inode == 0)
                              continue;
                      tmp_str.name = de->name;
                      tmp_str.len = de->name_len;
                      err = ext4_htree_store_dirent(dir_file, hinfo->hash,
                                                    hinfo->minor_hash, de, &tmp_str);
                      if (err) {
                              count = err;
                              goto out;
                      }
                      count++;
              }
              ret = count;
      out:
              kfree(dir_buf);
              brelse(iloc.bh);
              return ret;
      }
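
/*
 * Illustrative sketch (not part of the build): how an htree fill routine
 * might call the helper above.  The function name is an assumption for
 * illustration only; *has_inline_data is cleared by the helper when the
 * dir turns out not to be inline, in which case the caller falls back to
 * reading directory blocks.
 */
#if 0
static int example_fill_tree(struct file *dir_file, struct inode *dir,
                             struct dx_hash_info *hinfo,
                             __u32 start_hash, __u32 start_minor_hash)
{
        if (ext4_has_inline_data(dir)) {
                int has_inline_data = 1;
                int count = htree_inlinedir_to_tree(dir_file, dir, 0, hinfo,
                                                    start_hash,
                                                    start_minor_hash,
                                                    &has_inline_data);
                if (has_inline_data)
                        return count;
                /* not inline after all; read the directory blocks */
        }
        /* ... the block-based htree fill would continue here ... */
        return 0;
}
#endif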
      
/*
 * This function is called when the volume is mkfsed with
 * dir_index disabled. In order to keep f_pos persistent
 * after we convert from an inlined dir to a block-based one,
 * we just pretend that we are a normal dir and return the
 * offsets as if '.' and '..' really took up space.
 */
      int ext4_read_inline_dir(struct file *file,
                               struct dir_context *ctx,
                               int *has_inline_data)
      {
              unsigned int offset, parent_ino;
              int i;
              struct ext4_dir_entry_2 *de;
              struct super_block *sb;
              struct inode *inode = file_inode(file);
              int ret, inline_size = 0;
              struct ext4_iloc iloc;
              void *dir_buf = NULL;
              int dotdot_offset, dotdot_size, extra_offset, extra_size;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      up_read(&EXT4_I(inode)->xattr_sem);
                      *has_inline_data = 0;
                      goto out;
              }
      
              inline_size = ext4_get_inline_size(inode);
              dir_buf = kmalloc(inline_size, GFP_NOFS);
              if (!dir_buf) {
                      ret = -ENOMEM;
                      up_read(&EXT4_I(inode)->xattr_sem);
                      goto out;
              }
      
              ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
              up_read(&EXT4_I(inode)->xattr_sem);
              if (ret < 0)
                      goto out;
      
              ret = 0;
              sb = inode->i_sb;
              parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
              offset = ctx->pos;
      
        /*
         * dotdot_offset and dotdot_size are the real offset and
         * size for ".." and "." as if the dir were block based, while
         * the real size they occupy inline is only EXT4_INLINE_DOTDOT_SIZE.
         * So we use extra_offset and extra_size to account for them
         * during the inline dir iteration.
         */
              dotdot_offset = EXT4_DIR_REC_LEN(1);
              dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
              extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
              extra_size = extra_offset + inline_size;
      
              /*
               * If the version has changed since the last call to
               * readdir(2), then we might be pointing to an invalid
               * dirent right now.  Scan from the start of the inline
               * dir to make sure.
               */
              if (file->f_version != inode->i_version) {
                      for (i = 0; i < extra_size && i < offset;) {
                              /*
                               * "." is with offset 0 and
                               * ".." is dotdot_offset.
                               */
                              if (!i) {
                                      i = dotdot_offset;
                                      continue;
                              } else if (i == dotdot_offset) {
                                      i = dotdot_size;
                                      continue;
                              }
                        /* For other entries, the real offset in
                         * the buf has to be adjusted accordingly.
                         */
                              de = (struct ext4_dir_entry_2 *)
                                      (dir_buf + i - extra_offset);
                              /* It's too expensive to do a full
                               * dirent test each time round this
                               * loop, but we do have to test at
                               * least that it is non-zero.  A
                               * failure will be detected in the
                               * dirent test below. */
                              if (ext4_rec_len_from_disk(de->rec_len, extra_size)
                                      < EXT4_DIR_REC_LEN(1))
                                      break;
                              i += ext4_rec_len_from_disk(de->rec_len,
                                                          extra_size);
                      }
                      offset = i;
                      ctx->pos = offset;
                      file->f_version = inode->i_version;
              }
      
              while (ctx->pos < extra_size) {
                      if (ctx->pos == 0) {
                              if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
                                      goto out;
                              ctx->pos = dotdot_offset;
                              continue;
                      }
      
                      if (ctx->pos == dotdot_offset) {
                              if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
                                      goto out;
                              ctx->pos = dotdot_size;
                              continue;
                      }
      
                      de = (struct ext4_dir_entry_2 *)
                              (dir_buf + ctx->pos - extra_offset);
                      if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
                                               extra_size, ctx->pos))
                              goto out;
                      if (le32_to_cpu(de->inode)) {
                              if (!dir_emit(ctx, de->name, de->name_len,
                                            le32_to_cpu(de->inode),
                                            get_dtype(sb, de->file_type)))
                                      goto out;
                      }
                      ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
              }
      out:
              kfree(dir_buf);
              brelse(iloc.bh);
              return ret;
      }
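
/*
 * Illustrative sketch (not part of the build): the ctx->pos translation
 * used in the loop above, factored into a hypothetical helper.  f_pos is
 * kept as if "." and ".." occupied real on-disk records, so positions past
 * the fake dot/dotdot entries are shifted back by extra_offset to index
 * into the inline buffer.
 */
#if 0
static struct ext4_dir_entry_2 *
example_inline_dirent_at(void *dir_buf, loff_t pos, int extra_offset)
{
        /* only valid for pos >= dotdot_size, i.e. past the fake entries */
        return (struct ext4_dir_entry_2 *)(dir_buf + pos - extra_offset);
}
#endif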
      
      struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                              struct ext4_dir_entry_2 **parent_de,
                                              int *retval)
      {
              struct ext4_iloc iloc;
      
              *retval = ext4_get_inode_loc(inode, &iloc);
              if (*retval)
                      return NULL;
      
              *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
      
              return iloc.bh;
      }
      
      /*
       * Try to create the inline data for the new dir.
       * If it succeeds, return 0, otherwise return the error.
       * In case of ENOSPC, the caller should create the normal disk layout dir.
       */
      int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
                                     struct inode *inode)
      {
              int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
              struct ext4_iloc iloc;
              struct ext4_dir_entry_2 *de;
      
              ret = ext4_get_inode_loc(inode, &iloc);
              if (ret)
                      return ret;
      
              ret = ext4_prepare_inline_data(handle, inode, inline_size);
              if (ret)
                      goto out;
      
        /*
         * For an inline dir, we only save the inode information for ".."
         * and create a fake dentry to cover the remaining space.
         */
              de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
              de->inode = cpu_to_le32(parent->i_ino);
              de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
              de->inode = 0;
              de->rec_len = ext4_rec_len_to_disk(
                                      inline_size - EXT4_INLINE_DOTDOT_SIZE,
                                      inline_size);
              set_nlink(inode, 2);
              inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
      out:
              brelse(iloc.bh);
              return ret;
      }
      
      struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                              struct ext4_filename *fname,
                                              struct ext4_dir_entry_2 **res_dir,
                                              int *has_inline_data)
      {
              int ret;
              struct ext4_iloc iloc;
              void *inline_start;
              int inline_size;
      
              if (ext4_get_inode_loc(dir, &iloc))
                      return NULL;
      
              down_read(&EXT4_I(dir)->xattr_sem);
              if (!ext4_has_inline_data(dir)) {
                      *has_inline_data = 0;
                      goto out;
              }
      
              inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                                      EXT4_INLINE_DOTDOT_SIZE;
              inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
              ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
                                    dir, fname, 0, res_dir);
              if (ret == 1)
                      goto out_find;
              if (ret < 0)
                      goto out;
      
              if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
                      goto out;
      
              inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
              inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
      
              ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
                                    dir, fname, 0, res_dir);
              if (ret == 1)
                      goto out_find;
      
      out:
              brelse(iloc.bh);
              iloc.bh = NULL;
      out_find:
              up_read(&EXT4_I(dir)->xattr_sem);
              return iloc.bh;
      }
      
      int ext4_delete_inline_entry(handle_t *handle,
                                   struct inode *dir,
                                   struct ext4_dir_entry_2 *de_del,
                                   struct buffer_head *bh,
                                   int *has_inline_data)
      {
              int err, inline_size, no_expand;
              struct ext4_iloc iloc;
              void *inline_start;
      
              err = ext4_get_inode_loc(dir, &iloc);
              if (err)
                      return err;
      
              ext4_write_lock_xattr(dir, &no_expand);
              if (!ext4_has_inline_data(dir)) {
                      *has_inline_data = 0;
                      goto out;
              }
      
              if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
                      EXT4_MIN_INLINE_DATA_SIZE) {
                      inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                              EXT4_INLINE_DOTDOT_SIZE;
                      inline_size = EXT4_MIN_INLINE_DATA_SIZE -
                                      EXT4_INLINE_DOTDOT_SIZE;
              } else {
                      inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
                      inline_size = ext4_get_inline_size(dir) -
                                      EXT4_MIN_INLINE_DATA_SIZE;
              }
      
              BUFFER_TRACE(bh, "get_write_access");
              err = ext4_journal_get_write_access(handle, bh);
              if (err)
                      goto out;
      
              err = ext4_generic_delete_entry(handle, dir, de_del, bh,
                                              inline_start, inline_size, 0);
              if (err)
                      goto out;
      
              ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
      out:
              ext4_write_unlock_xattr(dir, &no_expand);
              if (likely(err == 0))
                      err = ext4_mark_inode_dirty(handle, dir);
              brelse(iloc.bh);
              if (err != -ENOENT)
                      ext4_std_error(dir->i_sb, err);
              return err;
      }
      
      /*
       * Get the inline dentry at offset.
       */
      static inline struct ext4_dir_entry_2 *
      ext4_get_inline_entry(struct inode *inode,
                            struct ext4_iloc *iloc,
                            unsigned int offset,
                            void **inline_start,
                            int *inline_size)
      {
              void *inline_pos;
      
              BUG_ON(offset > ext4_get_inline_size(inode));
      
              if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
                      inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
                      *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
              } else {
                      inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
                      offset -= EXT4_MIN_INLINE_DATA_SIZE;
                      *inline_size = ext4_get_inline_size(inode) -
                                      EXT4_MIN_INLINE_DATA_SIZE;
              }
      
              if (inline_start)
                      *inline_start = inline_pos;
              return (struct ext4_dir_entry_2 *)(inline_pos + offset);
      }
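
/*
 * Worked example (illustrative numbers only): with
 * EXT4_MIN_INLINE_DATA_SIZE == 60 (the size of i_block), an offset of 40
 * resolves to i_block + 40 in the first region, while an offset of 100
 * resolves to byte 40 of the system.data xattr value returned by
 * ext4_get_inline_xattr_pos().
 */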
      
      bool empty_inline_dir(struct inode *dir, int *has_inline_data)
      {
              int err, inline_size;
              struct ext4_iloc iloc;
              size_t inline_len;
              void *inline_pos;
              unsigned int offset;
              struct ext4_dir_entry_2 *de;
              bool ret = true;
      
              err = ext4_get_inode_loc(dir, &iloc);
              if (err) {
                      EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
                                       err, dir->i_ino);
                      return true;
              }
      
              down_read(&EXT4_I(dir)->xattr_sem);
              if (!ext4_has_inline_data(dir)) {
                      *has_inline_data = 0;
                      goto out;
              }
      
              de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
              if (!le32_to_cpu(de->inode)) {
                      ext4_warning(dir->i_sb,
                                   "bad inline directory (dir #%lu) - no `..'",
                                   dir->i_ino);
                      ret = true;
                      goto out;
              }
      
              inline_len = ext4_get_inline_size(dir);
              offset = EXT4_INLINE_DOTDOT_SIZE;
              while (offset < inline_len) {
                      de = ext4_get_inline_entry(dir, &iloc, offset,
                                                 &inline_pos, &inline_size);
                      if (ext4_check_dir_entry(dir, NULL, de,
                                               iloc.bh, inline_pos,
                                               inline_size, offset)) {
                        ext4_warning(dir->i_sb,
                                     "bad inline directory (dir #%lu) - "
                                     "inode %u, rec_len %u, name_len %d, "
                                     "inline size %d",
                                     dir->i_ino, le32_to_cpu(de->inode),
                                     le16_to_cpu(de->rec_len), de->name_len,
                                     inline_size);
                              ret = true;
                              goto out;
                      }
                      if (le32_to_cpu(de->inode)) {
                              ret = false;
                              goto out;
                      }
                      offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
              }
      
      out:
              up_read(&EXT4_I(dir)->xattr_sem);
              brelse(iloc.bh);
              return ret;
      }
      
      int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
      {
              int ret, no_expand;
      
              ext4_write_lock_xattr(inode, &no_expand);
              ret = ext4_destroy_inline_data_nolock(handle, inode);
              ext4_write_unlock_xattr(inode, &no_expand);
      
              return ret;
      }
      
      int ext4_inline_data_fiemap(struct inode *inode,
                                  struct fiemap_extent_info *fieinfo,
                                  int *has_inline, __u64 start, __u64 len)
      {
              __u64 physical = 0;
              __u64 inline_len;
              __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
                      FIEMAP_EXTENT_LAST;
              int error = 0;
              struct ext4_iloc iloc;
      
              down_read(&EXT4_I(inode)->xattr_sem);
              if (!ext4_has_inline_data(inode)) {
                      *has_inline = 0;
                      goto out;
              }
              inline_len = min_t(size_t, ext4_get_inline_size(inode),
                                 i_size_read(inode));
              if (start >= inline_len)
                      goto out;
              if (start + len < inline_len)
                      inline_len = start + len;
              inline_len -= start;
      
              error = ext4_get_inode_loc(inode, &iloc);
              if (error)
                      goto out;
      
              physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
              physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
              physical += offsetof(struct ext4_inode, i_block);
      
              brelse(iloc.bh);
      out:
              up_read(&EXT4_I(inode)->xattr_sem);
              if (physical)
                      error = fiemap_fill_next_extent(fieinfo, start, physical,
                                                      inline_len, flags);
              return (error < 0 ? error : 0);
      }
      
      int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
      {
              handle_t *handle;
              int inline_size, value_len, needed_blocks, no_expand, err = 0;
              size_t i_size;
              void *value = NULL;
              struct ext4_xattr_ibody_find is = {
                      .s = { .not_found = -ENODATA, },
              };
              struct ext4_xattr_info i = {
                      .name_index = EXT4_XATTR_INDEX_SYSTEM,
                      .name = EXT4_XATTR_SYSTEM_DATA,
              };
      
      
              needed_blocks = ext4_writepage_trans_blocks(inode);
              handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
              if (IS_ERR(handle))
                      return PTR_ERR(handle);
      
              ext4_write_lock_xattr(inode, &no_expand);
              if (!ext4_has_inline_data(inode)) {
                      *has_inline = 0;
                      ext4_journal_stop(handle);
                      return 0;
              }
      
              if ((err = ext4_orphan_add(handle, inode)) != 0)
                      goto out;
      
              if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0)
                      goto out;
      
              down_write(&EXT4_I(inode)->i_data_sem);
              i_size = inode->i_size;
              inline_size = ext4_get_inline_size(inode);
              EXT4_I(inode)->i_disksize = i_size;
      
              if (i_size < inline_size) {
                      /* Clear the content in the xattr space. */
                      if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
                              if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
                                      goto out_error;
      
                              BUG_ON(is.s.not_found);
      
                              value_len = le32_to_cpu(is.s.here->e_value_size);
                              value = kmalloc(value_len, GFP_NOFS);
                              if (!value) {
                                      err = -ENOMEM;
                                      goto out_error;
                              }
      
                              err = ext4_xattr_ibody_get(inode, i.name_index,
                                                         i.name, value, value_len);
                              if (err <= 0)
                                      goto out_error;
      
                              i.value = value;
                              i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
                                              i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
                              err = ext4_xattr_ibody_inline_set(handle, inode,
                                                                &i, &is);
                              if (err)
                                      goto out_error;
                      }
      
                      /* Clear the content within i_blocks. */
                      if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
                              void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
                              memset(p + i_size, 0,
                                     EXT4_MIN_INLINE_DATA_SIZE - i_size);
                      }
      
                      EXT4_I(inode)->i_inline_size = i_size <
                                              EXT4_MIN_INLINE_DATA_SIZE ?
                                              EXT4_MIN_INLINE_DATA_SIZE : i_size;
              }
      
      out_error:
              up_write(&EXT4_I(inode)->i_data_sem);
      out:
              brelse(is.iloc.bh);
              ext4_write_unlock_xattr(inode, &no_expand);
              kfree(value);
              if (inode->i_nlink)
                      ext4_orphan_del(handle, inode);
      
              if (err == 0) {
                      inode->i_mtime = inode->i_ctime = current_time(inode);
                      err = ext4_mark_inode_dirty(handle, inode);
                      if (IS_SYNC(inode))
                              ext4_handle_sync(handle);
              }
              ext4_journal_stop(handle);
              return err;
      }
      
      int ext4_convert_inline_data(struct inode *inode)
      {
              int error, needed_blocks, no_expand;
              handle_t *handle;
              struct ext4_iloc iloc;
      
        if (!ext4_has_inline_data(inode)) {
                ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                return 0;
              }
      
              needed_blocks = ext4_writepage_trans_blocks(inode);
      
              iloc.bh = NULL;
              error = ext4_get_inode_loc(inode, &iloc);
              if (error)
                      return error;
      
              handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
              if (IS_ERR(handle)) {
                      error = PTR_ERR(handle);
                      goto out_free;
              }
      
              ext4_write_lock_xattr(inode, &no_expand);
              if (ext4_has_inline_data(inode))
                      error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
              ext4_write_unlock_xattr(inode, &no_expand);
              ext4_journal_stop(handle);
      out_free:
              brelse(iloc.bh);
              return error;
      }
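
/*
 * Illustrative sketch (not part of the build): callers that are about to
 * perform block-mapped operations typically force the conversion first.
 * The function name below is hypothetical; the pattern is simply "convert,
 * then continue on the block-based path".
 */
#if 0
static int example_prepare_block_operation(struct inode *inode)
{
        int ret = ext4_convert_inline_data(inode);

        if (ret)
                return ret;
        /* ... the inode no longer has inline data; operate on blocks ... */
        return 0;
}
#endif
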
      /* SPDX-License-Identifier: GPL-2.0 */
      #undef TRACE_SYSTEM
      #define TRACE_SYSTEM exceptions
      
      #if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
      #define _TRACE_PAGE_FAULT_H
      
      #include <linux/tracepoint.h>
      #include <asm/trace/common.h>
      
      extern int trace_pagefault_reg(void);
      extern void trace_pagefault_unreg(void);
      
      DECLARE_EVENT_CLASS(x86_exceptions,
      
              TP_PROTO(unsigned long address, struct pt_regs *regs,
                       unsigned long error_code),
      
              TP_ARGS(address, regs, error_code),
      
              TP_STRUCT__entry(
                      __field(                unsigned long, address        )
                      __field(                unsigned long, ip        )
                      __field(                unsigned long, error_code )
              ),
      
              TP_fast_assign(
                      __entry->address = address;
                      __entry->ip = regs->ip;
                      __entry->error_code = error_code;
              ),
      
              TP_printk("address=%pf ip=%pf error_code=0x%lx",
                        (void *)__entry->address, (void *)__entry->ip,
                        __entry->error_code) );
      
      #define DEFINE_PAGE_FAULT_EVENT(name)                                \
      DEFINE_EVENT_FN(x86_exceptions, name,                                \
              TP_PROTO(unsigned long address,        struct pt_regs *regs,        \
                       unsigned long error_code),                        \
              TP_ARGS(address, regs, error_code),                        \
              trace_pagefault_reg, trace_pagefault_unreg);
      
DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
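
/*
 * Illustrative sketch (not part of the build): each DEFINE_PAGE_FAULT_EVENT()
 * above generates a trace_<name>() call the fault-handling code can invoke.
 * The wrapper below and its arguments are assumptions for illustration only.
 */
#if 0
static void example_emit_page_fault_event(unsigned long address,
                                          struct pt_regs *regs,
                                          unsigned long error_code,
                                          bool user)
{
        if (user)
                trace_page_fault_user(address, regs, error_code);
        else
                trace_page_fault_kernel(address, regs, error_code);
}
#endif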
      
      #undef TRACE_INCLUDE_PATH
      #define TRACE_INCLUDE_PATH .
      #define TRACE_INCLUDE_FILE exceptions
      #endif /*  _TRACE_PAGE_FAULT_H */
      
      /* This part must be outside protection */
      #include <trace/define_trace.h>
      /* SPDX-License-Identifier: GPL-2.0 */
      /* Atomic operations usable in machine independent code */
      #ifndef _LINUX_ATOMIC_H
      #define _LINUX_ATOMIC_H
      #include <asm/atomic.h>
      #include <asm/barrier.h>
      
      /*
       * Relaxed variants of xchg, cmpxchg and some atomic operations.
       *
       * We support four variants:
       *
       * - Fully ordered: The default implementation, no suffix required.
       * - Acquire: Provides ACQUIRE semantics, _acquire suffix.
       * - Release: Provides RELEASE semantics, _release suffix.
       * - Relaxed: No ordering guarantees, _relaxed suffix.
       *
       * For compound atomics performing both a load and a store, ACQUIRE
       * semantics apply only to the load and RELEASE semantics only to the
       * store portion of the operation. Note that a failed cmpxchg_acquire
       * does -not- imply any memory ordering constraints.
       *
       * See Documentation/memory-barriers.txt for ACQUIRE/RELEASE definitions.
       */
      
      #ifndef atomic_read_acquire
      #define  atomic_read_acquire(v)                smp_load_acquire(&(v)->counter)
      #endif
      
      #ifndef atomic_set_release
      #define  atomic_set_release(v, i)        smp_store_release(&(v)->counter, (i))
      #endif
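
/*
 * Illustrative sketch (not part of the build): a minimal message-passing
 * pattern built on the two accessors above.  The data and flag variables
 * are assumptions for illustration only; the release store publishes
 * example_data, and the paired acquire load guarantees the consumer sees it.
 */
#if 0
static int example_data;
static atomic_t example_ready = ATOMIC_INIT(0);

static void example_producer(void)
{
        example_data = 42;                      /* plain store */
        atomic_set_release(&example_ready, 1);  /* publish example_data */
}

static int example_consumer(void)
{
        if (atomic_read_acquire(&example_ready))/* pairs with the release */
                return example_data;            /* guaranteed to see 42 */
        return -1;
}
#endif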
      
      /*
       * The idea here is to build acquire/release variants by adding explicit
       * barriers on top of the relaxed variant. In the case where the relaxed
       * variant is already fully ordered, no additional barriers are needed.
       *
       * Besides, if an arch has a special barrier for acquire/release, it could
       * implement its own __atomic_op_* and use the same framework for building
       * variants
       *
       * If an architecture overrides __atomic_op_acquire() it will probably want
       * to define smp_mb__after_spinlock().
       */
      #ifndef __atomic_op_acquire
      #define __atomic_op_acquire(op, args...)                                \
      ({                                                                        \
              typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);                \
              smp_mb__after_atomic();                                                \
              __ret;                                                                \
      })
      #endif
      
      #ifndef __atomic_op_release
      #define __atomic_op_release(op, args...)                                \
      ({                                                                        \
              smp_mb__before_atomic();                                        \
              op##_relaxed(args);                                                \
      })
      #endif
      
      #ifndef __atomic_op_fence
      #define __atomic_op_fence(op, args...)                                        \
      ({                                                                        \
              typeof(op##_relaxed(args)) __ret;                                \
              smp_mb__before_atomic();                                        \
              __ret = op##_relaxed(args);                                        \
              smp_mb__after_atomic();                                                \
              __ret;                                                                \
      })
      #endif
      
      /* atomic_add_return_relaxed */
      #ifndef atomic_add_return_relaxed
      #define  atomic_add_return_relaxed        atomic_add_return
      #define  atomic_add_return_acquire        atomic_add_return
      #define  atomic_add_return_release        atomic_add_return
      
      #else /* atomic_add_return_relaxed */
      
      #ifndef atomic_add_return_acquire
      #define  atomic_add_return_acquire(...)                                        \
              __atomic_op_acquire(atomic_add_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_add_return_release
      #define  atomic_add_return_release(...)                                        \
              __atomic_op_release(atomic_add_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_add_return
      #define  atomic_add_return(...)                                                \
              __atomic_op_fence(atomic_add_return, __VA_ARGS__)
      #endif
      #endif /* atomic_add_return_relaxed */
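
/*
 * Illustrative sketch (not part of the build): for an architecture that
 * provides only atomic_add_return_relaxed(), the fallbacks above build the
 * acquire form roughly as shown below.  The helper is an assumption about
 * what the macro expansion amounts to, for explanation only.
 */
#if 0
static inline int example_add_return_acquire(int i, atomic_t *v)
{
        int __ret = atomic_add_return_relaxed(i, v);    /* relaxed RMW */

        smp_mb__after_atomic();                 /* upgrade to ACQUIRE */
        return __ret;
}
#endif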
      
      /* atomic_inc_return_relaxed */
      #ifndef atomic_inc_return_relaxed
      #define  atomic_inc_return_relaxed        atomic_inc_return
      #define  atomic_inc_return_acquire        atomic_inc_return
      #define  atomic_inc_return_release        atomic_inc_return
      
      #else /* atomic_inc_return_relaxed */
      
      #ifndef atomic_inc_return_acquire
      #define  atomic_inc_return_acquire(...)                                        \
              __atomic_op_acquire(atomic_inc_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_inc_return_release
      #define  atomic_inc_return_release(...)                                        \
              __atomic_op_release(atomic_inc_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_inc_return
      #define  atomic_inc_return(...)                                                \
              __atomic_op_fence(atomic_inc_return, __VA_ARGS__)
      #endif
      #endif /* atomic_inc_return_relaxed */
      
      /* atomic_sub_return_relaxed */
      #ifndef atomic_sub_return_relaxed
      #define  atomic_sub_return_relaxed        atomic_sub_return
      #define  atomic_sub_return_acquire        atomic_sub_return
      #define  atomic_sub_return_release        atomic_sub_return
      
      #else /* atomic_sub_return_relaxed */
      
      #ifndef atomic_sub_return_acquire
      #define  atomic_sub_return_acquire(...)                                        \
              __atomic_op_acquire(atomic_sub_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_sub_return_release
      #define  atomic_sub_return_release(...)                                        \
              __atomic_op_release(atomic_sub_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_sub_return
      #define  atomic_sub_return(...)                                                \
              __atomic_op_fence(atomic_sub_return, __VA_ARGS__)
      #endif
      #endif /* atomic_sub_return_relaxed */
      
      /* atomic_dec_return_relaxed */
      #ifndef atomic_dec_return_relaxed
      #define  atomic_dec_return_relaxed        atomic_dec_return
      #define  atomic_dec_return_acquire        atomic_dec_return
      #define  atomic_dec_return_release        atomic_dec_return
      
      #else /* atomic_dec_return_relaxed */
      
      #ifndef atomic_dec_return_acquire
      #define  atomic_dec_return_acquire(...)                                        \
              __atomic_op_acquire(atomic_dec_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_dec_return_release
      #define  atomic_dec_return_release(...)                                        \
              __atomic_op_release(atomic_dec_return, __VA_ARGS__)
      #endif
      
      #ifndef atomic_dec_return
      #define  atomic_dec_return(...)                                                \
              __atomic_op_fence(atomic_dec_return, __VA_ARGS__)
      #endif
      #endif /* atomic_dec_return_relaxed */
      
      
      /* atomic_fetch_add_relaxed */
      #ifndef atomic_fetch_add_relaxed
      #define atomic_fetch_add_relaxed        atomic_fetch_add
      #define atomic_fetch_add_acquire        atomic_fetch_add
      #define atomic_fetch_add_release        atomic_fetch_add
      
      #else /* atomic_fetch_add_relaxed */
      
      #ifndef atomic_fetch_add_acquire
      #define atomic_fetch_add_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_add, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_add_release
      #define atomic_fetch_add_release(...)                                        \
              __atomic_op_release(atomic_fetch_add, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_add
      #define atomic_fetch_add(...)                                                \
              __atomic_op_fence(atomic_fetch_add, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_add_relaxed */
      
      /* atomic_fetch_inc_relaxed */
      #ifndef atomic_fetch_inc_relaxed
      
      #ifndef atomic_fetch_inc
      #define atomic_fetch_inc(v)                atomic_fetch_add(1, (v))
      #define atomic_fetch_inc_relaxed(v)        atomic_fetch_add_relaxed(1, (v))
      #define atomic_fetch_inc_acquire(v)        atomic_fetch_add_acquire(1, (v))
      #define atomic_fetch_inc_release(v)        atomic_fetch_add_release(1, (v))
      #else /* atomic_fetch_inc */
      #define atomic_fetch_inc_relaxed        atomic_fetch_inc
      #define atomic_fetch_inc_acquire        atomic_fetch_inc
      #define atomic_fetch_inc_release        atomic_fetch_inc
      #endif /* atomic_fetch_inc */
      
      #else /* atomic_fetch_inc_relaxed */
      
      #ifndef atomic_fetch_inc_acquire
      #define atomic_fetch_inc_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_inc, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_inc_release
      #define atomic_fetch_inc_release(...)                                        \
              __atomic_op_release(atomic_fetch_inc, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_inc
      #define atomic_fetch_inc(...)                                                \
              __atomic_op_fence(atomic_fetch_inc, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_inc_relaxed */
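
      /*
       * Note the second level of fallback above: an architecture that
       * defines neither atomic_fetch_inc() nor atomic_fetch_inc_relaxed()
       * gets all four orderings synthesized from the matching
       * atomic_fetch_add() variant with a constant of 1, so e.g.
       * atomic_fetch_inc_acquire(&v) ends up as
       * atomic_fetch_add_acquire(1, &v).  atomic_fetch_dec() further down
       * mirrors this on top of atomic_fetch_sub().
       */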
      
      /* atomic_fetch_sub_relaxed */
      #ifndef atomic_fetch_sub_relaxed
      #define atomic_fetch_sub_relaxed        atomic_fetch_sub
      #define atomic_fetch_sub_acquire        atomic_fetch_sub
      #define atomic_fetch_sub_release        atomic_fetch_sub
      
      #else /* atomic_fetch_sub_relaxed */
      
      #ifndef atomic_fetch_sub_acquire
      #define atomic_fetch_sub_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_sub, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_sub_release
      #define atomic_fetch_sub_release(...)                                        \
              __atomic_op_release(atomic_fetch_sub, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_sub
      #define atomic_fetch_sub(...)                                                \
              __atomic_op_fence(atomic_fetch_sub, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_sub_relaxed */
      
      /* atomic_fetch_dec_relaxed */
      #ifndef atomic_fetch_dec_relaxed
      
      #ifndef atomic_fetch_dec
      #define atomic_fetch_dec(v)                atomic_fetch_sub(1, (v))
      #define atomic_fetch_dec_relaxed(v)        atomic_fetch_sub_relaxed(1, (v))
      #define atomic_fetch_dec_acquire(v)        atomic_fetch_sub_acquire(1, (v))
      #define atomic_fetch_dec_release(v)        atomic_fetch_sub_release(1, (v))
      #else /* atomic_fetch_dec */
      #define atomic_fetch_dec_relaxed        atomic_fetch_dec
      #define atomic_fetch_dec_acquire        atomic_fetch_dec
      #define atomic_fetch_dec_release        atomic_fetch_dec
      #endif /* atomic_fetch_dec */
      
      #else /* atomic_fetch_dec_relaxed */
      
      #ifndef atomic_fetch_dec_acquire
      #define atomic_fetch_dec_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_dec, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_dec_release
      #define atomic_fetch_dec_release(...)                                        \
              __atomic_op_release(atomic_fetch_dec, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_dec
      #define atomic_fetch_dec(...)                                                \
              __atomic_op_fence(atomic_fetch_dec, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_dec_relaxed */
      
      /* atomic_fetch_or_relaxed */
      #ifndef atomic_fetch_or_relaxed
      #define atomic_fetch_or_relaxed        atomic_fetch_or
      #define atomic_fetch_or_acquire        atomic_fetch_or
      #define atomic_fetch_or_release        atomic_fetch_or
      
      #else /* atomic_fetch_or_relaxed */
      
      #ifndef atomic_fetch_or_acquire
      #define atomic_fetch_or_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_or, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_or_release
      #define atomic_fetch_or_release(...)                                        \
              __atomic_op_release(atomic_fetch_or, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_or
      #define atomic_fetch_or(...)                                                \
              __atomic_op_fence(atomic_fetch_or, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_or_relaxed */
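
      /*
       * The bitwise fetch ops lend themselves to lockless flag
       * manipulation: for instance, atomic_fetch_or(SOME_FLAG, &v) sets
       * SOME_FLAG and the returned old value tells the caller whether the
       * flag was already set (SOME_FLAG being a purely illustrative
       * constant).
       */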
      
      /* atomic_fetch_and_relaxed */
      #ifndef atomic_fetch_and_relaxed
      #define atomic_fetch_and_relaxed        atomic_fetch_and
      #define atomic_fetch_and_acquire        atomic_fetch_and
      #define atomic_fetch_and_release        atomic_fetch_and
      
      #else /* atomic_fetch_and_relaxed */
      
      #ifndef atomic_fetch_and_acquire
      #define atomic_fetch_and_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_and, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_and_release
      #define atomic_fetch_and_release(...)                                        \
              __atomic_op_release(atomic_fetch_and, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_and
      #define atomic_fetch_and(...)                                                \
              __atomic_op_fence(atomic_fetch_and, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_and_relaxed */
      
      #ifdef atomic_andnot
      /* atomic_fetch_andnot_relaxed */
      #ifndef atomic_fetch_andnot_relaxed
      #define atomic_fetch_andnot_relaxed        atomic_fetch_andnot
      #define atomic_fetch_andnot_acquire        atomic_fetch_andnot
      #define atomic_fetch_andnot_release        atomic_fetch_andnot
      
      #else /* atomic_fetch_andnot_relaxed */
      
      #ifndef atomic_fetch_andnot_acquire
      #define atomic_fetch_andnot_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_andnot, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_andnot_release
      #define atomic_fetch_andnot_release(...)                                        \
              __atomic_op_release(atomic_fetch_andnot, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_andnot
      #define atomic_fetch_andnot(...)                                                \
              __atomic_op_fence(atomic_fetch_andnot, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_andnot_relaxed */
      #endif /* atomic_andnot */
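
      /*
       * The ordering fallbacks for atomic_fetch_andnot() are only
       * generated when the architecture itself advertises atomic_andnot(),
       * hence the surrounding #ifdef; architectures without it are
       * expected to be covered by generic andnot fallbacks elsewhere.
       */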
      
      /* atomic_fetch_xor_relaxed */
      #ifndef atomic_fetch_xor_relaxed
      #define atomic_fetch_xor_relaxed        atomic_fetch_xor
      #define atomic_fetch_xor_acquire        atomic_fetch_xor
      #define atomic_fetch_xor_release        atomic_fetch_xor
      
      #else /* atomic_fetch_xor_relaxed */
      
      #ifndef atomic_fetch_xor_acquire
      #define atomic_fetch_xor_acquire(...)                                        \
              __atomic_op_acquire(atomic_fetch_xor, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_xor_release
      #define atomic_fetch_xor_release(...)                                        \
              __atomic_op_release(atomic_fetch_xor, __VA_ARGS__)
      #endif
      
      #ifndef atomic_fetch_xor
      #define atomic_fetch_xor(...)                                                \
              __atomic_op_fence(atomic_fetch_xor, __VA_ARGS__)
      #endif
      #endif /* atomic_fetch_xor_relaxed */
      
      
      /* atomic_xchg_relaxed */
      #ifndef atomic_xchg_relaxed
      #define  atomic_xchg_relaxed                atomic_xchg
      #define  atomic_xchg_acquire                atomic_xchg
      #define  atomic_xchg_release                atomic_xchg
      
      #else /* atomic_xchg_relaxed */
      
      #ifndef atomic_xchg_acquire
      #define  atomic_xchg_acquire(...)                                        \
              __atomic_op_acquire(atomic_xchg, __VA_ARGS__)
      #endif
      
      #ifndef atomic_xchg_release
      #define  atomic_xchg_release(...)                                        \
              __atomic_op_release(atomic_xchg, __VA_ARGS__)
      #endif
      
      #ifndef atomic_xchg
      #define  atomic_xchg(...)                                                \
              __atomic_op_fence(atomic_xchg, __VA_ARGS__)
      #endif
      #endif /* atomic_xchg_relaxed */
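
      /*
       * atomic_xchg() unconditionally installs a new value and hands back
       * the previous one; atomic_cmpxchg() below only performs the store
       * when the current value matches the expected one, and returns
       * whatever value was actually observed.  A typical compare-and-swap
       * retry loop looks roughly like this (illustrative sketch, SOME_FLAG
       * is a placeholder):
       *
       *        old = atomic_read(&v);
       *        for (;;) {
       *                new = old | SOME_FLAG;
       *                prev = atomic_cmpxchg(&v, old, new);
       *                if (prev == old)
       *                        break;
       *                old = prev;
       *        }
       */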
      
      /* atomic_cmpxchg_relaxed */
      #ifndef atomic_cmpxchg_relaxed
      #define  atomic_cmpxchg_relaxed                atomic_cmpxchg
      #define  atomic_cmpxchg_acquire                atomic_cmpxchg
      #define  atomic_cmpxchg_release                atomic_cmpxchg
      
      #else /* atomic_cmpxchg_relaxed */
      
      #ifndef atomic_cmpxchg_acquire
      #define  atomic_cmpxchg_acquire(...)                                        \
              __atomic_op_acquire(atomic_cmpxchg, __VA_ARGS__)