/*
       * sha512_base.h - core logic for SHA-512 implementations
       *
       * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation.
       */
      
      #include <crypto/internal/hash.h>
      #include <crypto/sha.h>
      #include <linux/crypto.h>
      #include <linux/module.h>
      
      #include <asm/unaligned.h>
      
      typedef void (sha512_block_fn)(struct sha512_state *sst, u8 const *src,
                                     int blocks);
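
/*
 * A sha512_block_fn is the architecture-specific compression step that gets
 * plugged into the helpers below: it must consume exactly @blocks full
 * SHA512_BLOCK_SIZE-byte (128-byte) blocks from @src and fold them into
 * sst->state.  The helpers handle all buffering, length counting and padding.
 */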
      
      static inline int sha384_base_init(struct shash_desc *desc)
      {
              struct sha512_state *sctx = shash_desc_ctx(desc);
      
	sctx->state[0] = SHA384_H0;
              sctx->state[1] = SHA384_H1;
              sctx->state[2] = SHA384_H2;
              sctx->state[3] = SHA384_H3;
              sctx->state[4] = SHA384_H4;
              sctx->state[5] = SHA384_H5;
              sctx->state[6] = SHA384_H6;
              sctx->state[7] = SHA384_H7;
              sctx->count[0] = sctx->count[1] = 0;
      
              return 0;
      }
      
      static inline int sha512_base_init(struct shash_desc *desc)
      {
              struct sha512_state *sctx = shash_desc_ctx(desc);
      
	sctx->state[0] = SHA512_H0;
              sctx->state[1] = SHA512_H1;
              sctx->state[2] = SHA512_H2;
              sctx->state[3] = SHA512_H3;
              sctx->state[4] = SHA512_H4;
              sctx->state[5] = SHA512_H5;
              sctx->state[6] = SHA512_H6;
              sctx->state[7] = SHA512_H7;
              sctx->count[0] = sctx->count[1] = 0;
      
              return 0;
      }
      
      static inline int sha512_base_do_update(struct shash_desc *desc,
                                              const u8 *data,
                                              unsigned int len,
                                              sha512_block_fn *block_fn)
      {
              struct sha512_state *sctx = shash_desc_ctx(desc);
	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;

	sctx->count[0] += len;
	if (sctx->count[0] < len)
		sctx->count[1]++;

	if (unlikely((partial + len) >= SHA512_BLOCK_SIZE)) {
		int blocks;

		if (partial) {
			int p = SHA512_BLOCK_SIZE - partial;

			memcpy(sctx->buf + partial, data, p);
			data += p;
			len -= p;

			block_fn(sctx, sctx->buf, 1);
		}

		blocks = len / SHA512_BLOCK_SIZE;
		len %= SHA512_BLOCK_SIZE;

		if (blocks) {
			block_fn(sctx, data, blocks);
			data += blocks * SHA512_BLOCK_SIZE;
		}
		partial = 0;
	}
	if (len)
		memcpy(sctx->buf + partial, data, len);
      
              return 0;
      }
      
      static inline int sha512_base_do_finalize(struct shash_desc *desc,
                                                sha512_block_fn *block_fn)
      {
              const int bit_offset = SHA512_BLOCK_SIZE - sizeof(__be64[2]);
              struct sha512_state *sctx = shash_desc_ctx(desc);
              __be64 *bits = (__be64 *)(sctx->buf + bit_offset);
              unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
      
              sctx->buf[partial++] = 0x80;
              if (partial > bit_offset) {
                      memset(sctx->buf + partial, 0x0, SHA512_BLOCK_SIZE - partial);
                      partial = 0;
      
                      block_fn(sctx, sctx->buf, 1);
              }
      
              memset(sctx->buf + partial, 0x0, bit_offset - partial);
              bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
              bits[1] = cpu_to_be64(sctx->count[0] << 3);
              block_fn(sctx, sctx->buf, 1);
      
              return 0;
      }
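
/*
 * The finalization above implements the standard SHA-384/512 padding: a 0x80
 * byte terminates the message, the buffer is zero-filled up to bit_offset
 * (the block size minus 16 bytes), and the total message length in bits is
 * stored as a 128-bit big-endian value in the last 16 bytes of the final
 * block.  count[1]:count[0] form a 128-bit byte counter, hence the "<< 3" and
 * ">> 61" shifts when converting bytes to bits.
 */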
      
      static inline int sha512_base_finish(struct shash_desc *desc, u8 *out)
      {
              unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
              struct sha512_state *sctx = shash_desc_ctx(desc);
              __be64 *digest = (__be64 *)out;
              int i;
      
              for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64))
                      put_unaligned_be64(sctx->state[i], digest++);
      
              *sctx = (struct sha512_state){};
              return 0;
      }
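
/*
 * Illustrative sketch (not part of the original header): a typical glue
 * driver wires these helpers into its shash callbacks roughly as below,
 * where my_sha512_transform is a hypothetical architecture-specific
 * sha512_block_fn.
 *
 *	static int my_sha512_update(struct shash_desc *desc, const u8 *data,
 *				    unsigned int len)
 *	{
 *		return sha512_base_do_update(desc, data, len,
 *					     my_sha512_transform);
 *	}
 *
 *	static int my_sha512_final(struct shash_desc *desc, u8 *out)
 *	{
 *		sha512_base_do_finalize(desc, my_sha512_transform);
 *		return sha512_base_finish(desc, out);
 *	}
 */
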
      /*
       * Implementation of the kernel access vector cache (AVC).
       *
       * Authors:  Stephen Smalley, <sds@epoch.ncsc.mil>
       *             James Morris <jmorris@redhat.com>
       *
       * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
       *        Replaced the avc_lock spinlock by RCU.
       *
       * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
       *
       *        This program is free software; you can redistribute it and/or modify
       *        it under the terms of the GNU General Public License version 2,
       *        as published by the Free Software Foundation.
       */
      #include <linux/types.h>
      #include <linux/stddef.h>
      #include <linux/kernel.h>
      #include <linux/slab.h>
      #include <linux/fs.h>
      #include <linux/dcache.h>
      #include <linux/init.h>
      #include <linux/skbuff.h>
      #include <linux/percpu.h>
      #include <linux/list.h>
      #include <net/sock.h>
      #include <linux/un.h>
      #include <net/af_unix.h>
      #include <linux/ip.h>
      #include <linux/audit.h>
      #include <linux/ipv6.h>
      #include <net/ipv6.h>
      #include "avc.h"
      #include "avc_ss.h"
      #include "classmap.h"
      
      #define AVC_CACHE_SLOTS                        512
      #define AVC_DEF_CACHE_THRESHOLD                512
      #define AVC_CACHE_RECLAIM                16
      
      #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
      #define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
      #else
      #define avc_cache_stats_incr(field)        do {} while (0)
      #endif
      
      struct avc_entry {
              u32                        ssid;
              u32                        tsid;
              u16                        tclass;
              struct av_decision        avd;
              struct avc_xperms_node        *xp_node;
      };
      
      struct avc_node {
              struct avc_entry        ae;
              struct hlist_node        list; /* anchored in avc_cache->slots[i] */
              struct rcu_head                rhead;
      };
      
      struct avc_xperms_decision_node {
              struct extended_perms_decision xpd;
              struct list_head xpd_list; /* list of extended_perms_decision */
      };
      
      struct avc_xperms_node {
              struct extended_perms xp;
              struct list_head xpd_head; /* list head of extended_perms_decision */
      };
      
      struct avc_cache {
              struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
              spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
              atomic_t                lru_hint;        /* LRU hint for reclaim scan */
              atomic_t                active_nodes;
              u32                        latest_notif;        /* latest revocation notification */
      };
      
      struct avc_callback_node {
              int (*callback) (u32 event);
              u32 events;
              struct avc_callback_node *next;
      };
      
/* Exported via selinuxfs */
      unsigned int avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
      
      #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
      DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
      #endif
      
      static struct avc_cache avc_cache;
      static struct avc_callback_node *avc_callbacks;
      static struct kmem_cache *avc_node_cachep;
      static struct kmem_cache *avc_xperms_data_cachep;
      static struct kmem_cache *avc_xperms_decision_cachep;
      static struct kmem_cache *avc_xperms_cachep;
      
      static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
      {
	return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1);
      }
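
/*
 * Worked example (illustrative only): avc_hash() above folds the
 * (ssid, tsid, tclass) triple into a cache slot index.  AVC_CACHE_SLOTS is a
 * power of two, so masking with (AVC_CACHE_SLOTS - 1) is equivalent to a
 * modulo, e.g. avc_hash(42, 7, 3) == (42 ^ 28 ^ 48) & 511 == 6.
 */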
      
      /**
       * avc_dump_av - Display an access vector in human-readable form.
 * @ab: the audit buffer
 * @tclass: target security class
       * @av: access vector
       */
      static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
      {
              const char **perms;
              int i, perm;
      
              if (av == 0) {
                      audit_log_format(ab, " null");
                      return;
              }
      
              BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
              perms = secclass_map[tclass-1].perms;
      
              audit_log_format(ab, " {");
              i = 0;
              perm = 1;
              while (i < (sizeof(av) * 8)) {
		if ((perm & av) && perms[i]) {
			audit_log_format(ab, " %s", perms[i]);
			av &= ~perm;
		}
		i++;
		perm <<= 1;
	}

	if (av)
		audit_log_format(ab, " 0x%x", av);

	audit_log_format(ab, " }");
      }
      
      /**
       * avc_dump_query - Display a SID pair and a class in human-readable form.
 * @ab: the audit buffer
 * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       */
      static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass)
      {
              int rc;
              char *scontext;
              u32 scontext_len;
      
              rc = security_sid_to_context(ssid, &scontext, &scontext_len);
              if (rc)
                      audit_log_format(ab, "ssid=%d", ssid);
              else {
		audit_log_format(ab, "scontext=%s", scontext);
		kfree(scontext);
	}

	rc = security_sid_to_context(tsid, &scontext, &scontext_len);
	if (rc)
		audit_log_format(ab, " tsid=%d", tsid);
	else {
		audit_log_format(ab, " tcontext=%s", scontext);
		kfree(scontext);
	}

	BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
	audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name);
      }
      
      /**
       * avc_init - Initialize the AVC.
       *
       * Initialize the access vector cache.
       */
      void __init avc_init(void)
      {
              int i;
      
              for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                      INIT_HLIST_HEAD(&avc_cache.slots[i]);
                      spin_lock_init(&avc_cache.slots_lock[i]);
              }
              atomic_set(&avc_cache.active_nodes, 0);
              atomic_set(&avc_cache.lru_hint, 0);
      
              avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
                                              sizeof(struct avc_xperms_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_decision_cachep = kmem_cache_create(
                                              "avc_xperms_decision_node",
                                              sizeof(struct avc_xperms_decision_node),
                                              0, SLAB_PANIC, NULL);
              avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
                                              sizeof(struct extended_perms_data),
                                              0, SLAB_PANIC, NULL);
      
              audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n");
      }
      
      int avc_get_hash_stats(char *page)
      {
              int i, chain_len, max_chain_len, slots_used;
              struct avc_node *node;
              struct hlist_head *head;
      
	rcu_read_lock();

	slots_used = 0;
	max_chain_len = 0;
	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
		head = &avc_cache.slots[i];
		if (!hlist_empty(head)) {
			slots_used++;
			chain_len = 0;
			hlist_for_each_entry_rcu(node, head, list)
				chain_len++;
			if (chain_len > max_chain_len)
				max_chain_len = chain_len;
		}
	}

	rcu_read_unlock();
      
              return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                               "longest chain: %d\n",
                               atomic_read(&avc_cache.active_nodes),
                               slots_used, AVC_CACHE_SLOTS, max_chain_len);
      }
      
      /*
 * Use a linked list for extended_perms_decision lookup because the list is
 * always small, i.e. fewer than 5 entries and typically just 1.
       */
      static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
                                              struct avc_xperms_node *xp_node)
      {
              struct avc_xperms_decision_node *xpd_node;
      
              list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) {
                      if (xpd_node->xpd.driver == driver)
                              return &xpd_node->xpd;
              }
              return NULL;
      }
      
      static inline unsigned int
      avc_xperms_has_perm(struct extended_perms_decision *xpd,
                                              u8 perm, u8 which)
      {
              unsigned int rc = 0;
      
              if ((which == XPERMS_ALLOWED) &&
                              (xpd->used & XPERMS_ALLOWED))
                      rc = security_xperm_test(xpd->allowed->p, perm);
              else if ((which == XPERMS_AUDITALLOW) &&
                              (xpd->used & XPERMS_AUDITALLOW))
                      rc = security_xperm_test(xpd->auditallow->p, perm);
              else if ((which == XPERMS_DONTAUDIT) &&
                              (xpd->used & XPERMS_DONTAUDIT))
                      rc = security_xperm_test(xpd->dontaudit->p, perm);
              return rc;
      }
      
      static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                      u8 driver, u8 perm)
      {
              struct extended_perms_decision *xpd;
              security_xperm_set(xp_node->xp.drivers.p, driver);
              xpd = avc_xperms_decision_lookup(driver, xp_node);
              if (xpd && xpd->allowed)
                      security_xperm_set(xpd->allowed->p, perm);
      }
      
      static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
      {
              struct extended_perms_decision *xpd;
      
              xpd = &xpd_node->xpd;
              if (xpd->allowed)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->allowed);
              if (xpd->auditallow)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow);
              if (xpd->dontaudit)
                      kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit);
              kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
      }
      
      static void avc_xperms_free(struct avc_xperms_node *xp_node)
      {
              struct avc_xperms_decision_node *xpd_node, *tmp;
      
	if (!xp_node)
		return;

	list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
		list_del(&xpd_node->xpd_list);
		avc_xperms_decision_free(xpd_node);
	}
	kmem_cache_free(avc_xperms_cachep, xp_node);
      }
      
      static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                              struct extended_perms_decision *src)
      {
              dest->driver = src->driver;
              dest->used = src->used;
              if (dest->used & XPERMS_ALLOWED)
                      memcpy(dest->allowed->p, src->allowed->p,
                                      sizeof(src->allowed->p));
              if (dest->used & XPERMS_AUDITALLOW)
                      memcpy(dest->auditallow->p, src->auditallow->p,
                                      sizeof(src->auditallow->p));
              if (dest->used & XPERMS_DONTAUDIT)
                      memcpy(dest->dontaudit->p, src->dontaudit->p,
                                      sizeof(src->dontaudit->p));
      }
      
      /*
       * similar to avc_copy_xperms_decision, but only copy decision
       * information relevant to this perm
       */
      static inline void avc_quick_copy_xperms_decision(u8 perm,
                              struct extended_perms_decision *dest,
                              struct extended_perms_decision *src)
      {
              /*
	 * compute the index of the u32, within the 256-bit (8 x u32) bitmap,
	 * that contains this command permission
               */
              u8 i = perm >> 5;
      
              dest->used = src->used;
              if (dest->used & XPERMS_ALLOWED)
                      dest->allowed->p[i] = src->allowed->p[i];
              if (dest->used & XPERMS_AUDITALLOW)
                      dest->auditallow->p[i] = src->auditallow->p[i];
              if (dest->used & XPERMS_DONTAUDIT)
                      dest->dontaudit->p[i] = src->dontaudit->p[i];
      }
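
/*
 * Worked example (illustrative only): for xperm 0x96 the index above is
 * 0x96 >> 5 == 4, so only word p[4] of each 256-bit bitmap is copied.  The
 * remaining words of the local decision are left uninitialized, which is
 * fine because the caller only tests the single bit belonging to this xperm.
 */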
      
      static struct avc_xperms_decision_node
                      *avc_xperms_decision_alloc(u8 which)
      {
              struct avc_xperms_decision_node *xpd_node;
              struct extended_perms_decision *xpd;
      
              xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
              if (!xpd_node)
                      return NULL;
      
              xpd = &xpd_node->xpd;
              if (which & XPERMS_ALLOWED) {
                      xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->allowed)
                              goto error;
              }
              if (which & XPERMS_AUDITALLOW) {
                      xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->auditallow)
                              goto error;
              }
              if (which & XPERMS_DONTAUDIT) {
                      xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
                                                      GFP_NOWAIT);
                      if (!xpd->dontaudit)
                              goto error;
              }
              return xpd_node;
      error:
              avc_xperms_decision_free(xpd_node);
              return NULL;
      }
      
      static int avc_add_xperms_decision(struct avc_node *node,
                              struct extended_perms_decision *src)
      {
              struct avc_xperms_decision_node *dest_xpd;
      
              node->ae.xp_node->xp.len++;
              dest_xpd = avc_xperms_decision_alloc(src->used);
              if (!dest_xpd)
                      return -ENOMEM;
              avc_copy_xperms_decision(&dest_xpd->xpd, src);
              list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
              return 0;
      }
      
      static struct avc_xperms_node *avc_xperms_alloc(void)
      {
              struct avc_xperms_node *xp_node;
      
              xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
              if (!xp_node)
                      return xp_node;
              INIT_LIST_HEAD(&xp_node->xpd_head);
              return xp_node;
      }
      
      static int avc_xperms_populate(struct avc_node *node,
                                      struct avc_xperms_node *src)
      {
              struct avc_xperms_node *dest;
              struct avc_xperms_decision_node *dest_xpd;
              struct avc_xperms_decision_node *src_xpd;
      
              if (src->xp.len == 0)
                      return 0;
              dest = avc_xperms_alloc();
              if (!dest)
                      return -ENOMEM;
      
              memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
              dest->xp.len = src->xp.len;
      
              /* for each source xpd allocate a destination xpd and copy */
              list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
                      dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used);
                      if (!dest_xpd)
                              goto error;
                      avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd);
                      list_add(&dest_xpd->xpd_list, &dest->xpd_head);
              }
              node->ae.xp_node = dest;
              return 0;
      error:
              avc_xperms_free(dest);
              return -ENOMEM;
      
      }
      
      static inline u32 avc_xperms_audit_required(u32 requested,
                                              struct av_decision *avd,
                                              struct extended_perms_decision *xpd,
                                              u8 perm,
                                              int result,
                                              u32 *deniedp)
      {
              u32 denied, audited;
      
              denied = requested & ~avd->allowed;
              if (unlikely(denied)) {
		audited = denied & avd->auditdeny;
		if (audited && xpd) {
			if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
				audited &= ~requested;
		}
	} else if (result) {
		audited = denied = requested;
	} else {
		audited = requested & avd->auditallow;
                      if (audited && xpd) {
                              if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                                      audited &= ~requested;
                      }
              }
      
              *deniedp = denied;
              return audited;
      }
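
/*
 * In summary: if any requested permission was denied, audit the denied bits
 * that policy flags with auditdeny, unless a dontaudit extended permission
 * matches; if the access check itself failed (result != 0), treat everything
 * requested as denied and audited; otherwise audit the granted bits flagged
 * with auditallow, again subject to the per-xperm decision.
 */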
      
      static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
                                      u32 requested, struct av_decision *avd,
                                      struct extended_perms_decision *xpd,
                                      u8 perm, int result,
                                      struct common_audit_data *ad)
      {
              u32 audited, denied;
      
	audited = avc_xperms_audit_required(
			requested, avd, xpd, perm, result, &denied);
	if (likely(!audited))
		return 0;
	return slow_avc_audit(ssid, tsid, tclass, requested,
			audited, denied, result, ad, 0);
      }
      
      static void avc_node_free(struct rcu_head *rhead)
      {
              struct avc_node *node = container_of(rhead, struct avc_node, rhead);
              avc_xperms_free(node->ae.xp_node);
              kmem_cache_free(avc_node_cachep, node);
              avc_cache_stats_incr(frees);
      }
      
      static void avc_node_delete(struct avc_node *node)
      {
	hlist_del_rcu(&node->list);
              call_rcu(&node->rhead, avc_node_free);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static void avc_node_kill(struct avc_node *node)
      {
	avc_xperms_free(node->ae.xp_node);
              kmem_cache_free(avc_node_cachep, node);
              avc_cache_stats_incr(frees);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static void avc_node_replace(struct avc_node *new, struct avc_node *old)
      {
	hlist_replace_rcu(&old->list, &new->list);
              call_rcu(&old->rhead, avc_node_free);
              atomic_dec(&avc_cache.active_nodes);
      }
      
      static inline int avc_reclaim_node(void)
      {
              struct avc_node *node;
              int hvalue, try, ecx;
              unsigned long flags;
              struct hlist_head *head;
              spinlock_t *lock;
      
	for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
		hvalue = atomic_inc_return(&avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1);
		head = &avc_cache.slots[hvalue];
		lock = &avc_cache.slots_lock[hvalue];

		if (!spin_trylock_irqsave(lock, flags))
			continue;

		rcu_read_lock();
		hlist_for_each_entry(node, head, list) {
			avc_node_delete(node);
			avc_cache_stats_incr(reclaims);
			ecx++;
			if (ecx >= AVC_CACHE_RECLAIM) {
				rcu_read_unlock();
				spin_unlock_irqrestore(lock, flags);
				goto out;
			}
		}
		rcu_read_unlock();
		spin_unlock_irqrestore(lock, flags);
              }
      out:
              return ecx;
      }
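
/*
 * Reclaim walks the slots starting from the rotating lru_hint, skips any
 * slot whose lock is contended, and deletes nodes until AVC_CACHE_RECLAIM
 * entries have been freed or every slot has been visited; it returns the
 * number of entries actually reclaimed.
 */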
      
static struct avc_node *avc_alloc_node(void)
{
	struct avc_node *node;

	node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
	if (!node)
		goto out;

	INIT_HLIST_NODE(&node->list);
	avc_cache_stats_incr(allocations);

	if (atomic_inc_return(&avc_cache.active_nodes) > avc_cache_threshold)
		avc_reclaim_node();

out:
	return node;
      }
      
      static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
      {
              node->ae.ssid = ssid;
              node->ae.tsid = tsid;
              node->ae.tclass = tclass;
              memcpy(&node->ae.avd, avd, sizeof(node->ae.avd));
      }
      
      static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
      {
              struct avc_node *node, *ret = NULL;
              int hvalue;
              struct hlist_head *head;
      
              hvalue = avc_hash(ssid, tsid, tclass);
              head = &avc_cache.slots[hvalue];
	hlist_for_each_entry_rcu(node, head, list) {
		if (ssid == node->ae.ssid &&
		    tclass == node->ae.tclass &&
		    tsid == node->ae.tsid) {
                              ret = node;
                              break;
                      }
              }
      
              return ret;
      }
      
      /**
       * avc_lookup - Look up an AVC entry.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       *
       * Look up an AVC entry that is valid for the
       * (@ssid, @tsid), interpreting the permissions
       * based on @tclass.  If a valid AVC entry exists,
       * then this function returns the avc_node.
       * Otherwise, this function returns NULL.
       */
      static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass)
      {
              struct avc_node *node;
      
	avc_cache_stats_incr(lookups);
	node = avc_search_node(ssid, tsid, tclass);

	if (node)
		return node;

	avc_cache_stats_incr(misses);
              return NULL;
      }
      
      static int avc_latest_notif_update(int seqno, int is_insert)
      {
              int ret = 0;
              static DEFINE_SPINLOCK(notif_lock);
              unsigned long flag;
      
	spin_lock_irqsave(&notif_lock, flag);
	if (is_insert) {
		if (seqno < avc_cache.latest_notif) {
			printk(KERN_WARNING "SELinux: avc:  seqno %d < latest_notif %d\n",
			       seqno, avc_cache.latest_notif);
			ret = -EAGAIN;
		}
	} else {
		if (seqno > avc_cache.latest_notif)
			avc_cache.latest_notif = seqno;
	}
	spin_unlock_irqrestore(&notif_lock, flag);
      
              return ret;
      }
      
      /**
       * avc_insert - Insert an AVC entry.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @avd: resulting av decision
       * @xp_node: resulting extended permissions
       *
       * Insert an AVC entry for the SID pair
       * (@ssid, @tsid) and class @tclass.
       * The access vectors and the sequence number are
       * normally provided by the security server in
       * response to a security_compute_av() call.  If the
       * sequence number @avd->seqno is not less than the latest
 * revocation notification, then the function copies
 * the access vectors into a cache entry and returns
 * the inserted avc_node.  Otherwise, this function returns NULL.
       */
      static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass,
                                      struct av_decision *avd,
                                      struct avc_xperms_node *xp_node)
      {
              struct avc_node *pos, *node = NULL;
              int hvalue;
              unsigned long flag;
      
	if (avc_latest_notif_update(avd->seqno, 1))
		goto out;

	node = avc_alloc_node();
	if (node) {
		struct hlist_head *head;
		spinlock_t *lock;
		int rc = 0;

		hvalue = avc_hash(ssid, tsid, tclass);
		avc_node_populate(node, ssid, tsid, tclass, avd);
		rc = avc_xperms_populate(node, xp_node);
		if (rc) {
			kmem_cache_free(avc_node_cachep, node);
			return NULL;
		}
		head = &avc_cache.slots[hvalue];
		lock = &avc_cache.slots_lock[hvalue];

		spin_lock_irqsave(lock, flag);
		hlist_for_each_entry(pos, head, list) {
			if (pos->ae.ssid == ssid &&
			    pos->ae.tsid == tsid &&
			    pos->ae.tclass == tclass) {
				avc_node_replace(node, pos);
				goto found;
			}
		}
		hlist_add_head_rcu(&node->list, head);
found:
		spin_unlock_irqrestore(lock, flag);
              }
      out:
              return node;
      }
      
      /**
 * avc_audit_pre_callback - SELinux-specific audit prefix data;
 * called by the generic audit code
       * @ab: the audit buffer
       * @a: audit_data
       */
      static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
      {
              struct common_audit_data *ad = a;
	audit_log_format(ab, "avc:  %s ",
			 ad->selinux_audit_data->denied ? "denied" : "granted");
	avc_dump_av(ab, ad->selinux_audit_data->tclass,
			ad->selinux_audit_data->audited);
	audit_log_format(ab, " for ");
      }
      
      /**
 * avc_audit_post_callback - SELinux-specific audit postfix data;
 * called by the generic audit code
       * @ab: the audit buffer
       * @a: audit_data
       */
      static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
      {
              struct common_audit_data *ad = a;
	audit_log_format(ab, " ");
	avc_dump_query(ab, ad->selinux_audit_data->ssid,
			   ad->selinux_audit_data->tsid,
			   ad->selinux_audit_data->tclass);
	if (ad->selinux_audit_data->denied) {
		audit_log_format(ab, " permissive=%u",
				 ad->selinux_audit_data->result ? 0 : 1);
	}
}
      
      /* This is the slow part of avc audit with big stack footprint */
      noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
                      u32 requested, u32 audited, u32 denied, int result,
                      struct common_audit_data *a,
                      unsigned flags)
      {
              struct common_audit_data stack_data;
              struct selinux_audit_data sad;
      
	if (!a) {
		a = &stack_data;
		a->type = LSM_AUDIT_DATA_NONE;
              }
      
              /*
               * When in a RCU walk do the audit on the RCU retry.  This is because
               * the collection of the dname in an inode audit message is not RCU
               * safe.  Note this may drop some audits when the situation changes
               * during retry. However this is logically just as if the operation
               * happened a little later.
               */
	if ((a->type == LSM_AUDIT_DATA_INODE) &&
	    (flags & MAY_NOT_BLOCK))
		return -ECHILD;

	sad.tclass = tclass;
	sad.requested = requested;
	sad.ssid = ssid;
	sad.tsid = tsid;
	sad.audited = audited;
	sad.denied = denied;
	sad.result = result;

	a->selinux_audit_data = &sad;

	common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);
	return 0;
      }
      
      /**
       * avc_add_callback - Register a callback for security events.
       * @callback: callback function
       * @events: security events
       *
       * Register a callback function for events in the set @events.
       * Returns %0 on success or -%ENOMEM if insufficient memory
       * exists to add the callback.
       */
      int __init avc_add_callback(int (*callback)(u32 event), u32 events)
      {
              struct avc_callback_node *c;
              int rc = 0;
      
              c = kmalloc(sizeof(*c), GFP_KERNEL);
              if (!c) {
                      rc = -ENOMEM;
                      goto out;
              }
      
              c->callback = callback;
              c->events = events;
              c->next = avc_callbacks;
              avc_callbacks = c;
      out:
              return rc;
      }
      
      /**
 * avc_update_node - Update an AVC entry
 * @event: updating event
 * @perms: permission mask bits
 * @ssid, @tsid, @tclass: identifier of an AVC entry
 * @seqno: sequence number when the decision was made
 * @xpd: extended_perms_decision to be added to the node
 *
 * If a valid AVC entry doesn't exist, this function returns -ENOENT.
 * If the node allocation fails, this function returns -ENOMEM.
 * Otherwise, this function updates the AVC entry.  The original AVC entry
 * is released later via RCU.
 */
      static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
                              u32 tsid, u16 tclass, u32 seqno,
                              struct extended_perms_decision *xpd,
                              u32 flags)
      {
              int hvalue, rc = 0;
              unsigned long flag;
              struct avc_node *pos, *node, *orig = NULL;
              struct hlist_head *head;
              spinlock_t *lock;
      
	node = avc_alloc_node();
              if (!node) {
                      rc = -ENOMEM;
                      goto out;
              }
      
              /* Lock the target slot */
	hvalue = avc_hash(ssid, tsid, tclass);
      
              head = &avc_cache.slots[hvalue];
              lock = &avc_cache.slots_lock[hvalue];
      
              spin_lock_irqsave(lock, flag);
      
	hlist_for_each_entry(pos, head, list) {
		if (ssid == pos->ae.ssid &&
		    tsid == pos->ae.tsid &&
		    tclass == pos->ae.tclass &&
		    seqno == pos->ae.avd.seqno) {
                              orig = pos;
                              break;
                      }
              }
      
              if (!orig) {
                      rc = -ENOENT;
		avc_node_kill(node);
                      goto out_unlock;
              }
      
              /*
               * Copy and replace original node.
               */
      
	avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);
      
              if (orig->ae.xp_node) {
                      rc = avc_xperms_populate(node, orig->ae.xp_node);
                      if (rc) {
                              kmem_cache_free(avc_node_cachep, node);
                              goto out_unlock;
                      }
              }
      
	switch (event) {
	case AVC_CALLBACK_GRANT:
		node->ae.avd.allowed |= perms;
                      if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                              avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
                      break;
              case AVC_CALLBACK_TRY_REVOKE:
              case AVC_CALLBACK_REVOKE:
                      node->ae.avd.allowed &= ~perms;
                      break;
              case AVC_CALLBACK_AUDITALLOW_ENABLE:
                      node->ae.avd.auditallow |= perms;
                      break;
              case AVC_CALLBACK_AUDITALLOW_DISABLE:
                      node->ae.avd.auditallow &= ~perms;
                      break;
              case AVC_CALLBACK_AUDITDENY_ENABLE:
                      node->ae.avd.auditdeny |= perms;
                      break;
              case AVC_CALLBACK_AUDITDENY_DISABLE:
                      node->ae.avd.auditdeny &= ~perms;
                      break;
              case AVC_CALLBACK_ADD_XPERMS:
                      avc_add_xperms_decision(node, xpd);
                      break;
              }
	avc_node_replace(node, orig);
out_unlock:
	spin_unlock_irqrestore(lock, flag);
out:
	return rc;
      }
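
/*
 * Note that avc_update_node() never modifies a live cache node in place: a
 * fresh node is allocated, populated from the original (including any
 * extended permissions), adjusted for the event, and then swapped in with
 * hlist_replace_rcu(), so readers under rcu_read_lock() always see either
 * the complete old entry or the complete new one.
 */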
      
      /**
       * avc_flush - Flush the cache
       */
      static void avc_flush(void)
{
	struct hlist_head *head;
	struct avc_node *node;
	spinlock_t *lock;
	unsigned long flag;
	int i;

	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
		head = &avc_cache.slots[i];
		lock = &avc_cache.slots_lock[i];

		spin_lock_irqsave(lock, flag);
		/*
		 * With preemptable RCU, the outer spinlock does not
		 * prevent RCU grace periods from ending.
		 */
		rcu_read_lock();
		hlist_for_each_entry(node, head, list)
			avc_node_delete(node);
		rcu_read_unlock();
		spin_unlock_irqrestore(lock, flag);
	}
}
      
      /**
       * avc_ss_reset - Flush the cache and revalidate migrated permissions.
       * @seqno: policy sequence number
       */
      int avc_ss_reset(u32 seqno)
      {
              struct avc_callback_node *c;
              int rc = 0, tmprc;
      
	avc_flush();

	for (c = avc_callbacks; c; c = c->next) {
		if (c->events & AVC_CALLBACK_RESET) {
			tmprc = c->callback(AVC_CALLBACK_RESET);
			/* save the first error encountered for the return
			   value and continue processing the callbacks */
			if (!rc)
				rc = tmprc;
		}
	}

	avc_latest_notif_update(seqno, 0);
              return rc;
      }
      
      /*
       * Slow-path helper function for avc_has_perm_noaudit,
       * when the avc_node lookup fails. We get called with
       * the RCU read lock held, and need to return with it
 * still held, but drop it for the security compute.
       *
       * Don't inline this, since it's the slow-path and just
       * results in a bigger stack frame.
       */
      static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid,
                               u16 tclass, struct av_decision *avd,
                               struct avc_xperms_node *xp_node)
      {
	rcu_read_unlock();
	INIT_LIST_HEAD(&xp_node->xpd_head);
	security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
	rcu_read_lock();
	return avc_insert(ssid, tsid, tclass, avd, xp_node);
      }
      
      static noinline int avc_denied(u32 ssid, u32 tsid,
                                      u16 tclass, u32 requested,
                                      u8 driver, u8 xperm, unsigned flags,
                                      struct av_decision *avd)
      {
	if (flags & AVC_STRICT)
		return -EACCES;

	if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE))
		return -EACCES;

	avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid,
				tsid, tclass, avd->seqno, NULL, flags);
	return 0;
      }
      
      /*
       * The avc extended permissions logic adds an additional 256 bits of
       * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions are not adequate,
 * as is the case with ioctls, then multiple sets may be chained together and the
       * driver field is used to specify which set contains the permission.
       */
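/*
 * Illustrative note (an assumption about the callers, not stated here): for
 * ioctl checks the caller is expected to pass bits 8-15 of the ioctl command
 * number as @driver and the low byte as @xperm, so each driver value selects
 * one 256-bit set of command permissions.
 */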
      int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                              u8 driver, u8 xperm, struct common_audit_data *ad)
      {
              struct avc_node *node;
              struct av_decision avd;
              u32 denied;
              struct extended_perms_decision local_xpd;
              struct extended_perms_decision *xpd = NULL;
              struct extended_perms_data allowed;
              struct extended_perms_data auditallow;
              struct extended_perms_data dontaudit;
              struct avc_xperms_node local_xp_node;
              struct avc_xperms_node *xp_node;
              int rc = 0, rc2;
      
              xp_node = &local_xp_node;
	BUG_ON(!requested);

	rcu_read_lock();

	node = avc_lookup(ssid, tsid, tclass);
	if (unlikely(!node)) {
		node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
	} else {
		memcpy(&avd, &node->ae.avd, sizeof(avd));
		xp_node = node->ae.xp_node;
	}
	/* if extended permissions are not defined, only consider av_decision */
	if (!xp_node || !xp_node->xp.len)
                      goto decision;
      
              local_xpd.allowed = &allowed;
              local_xpd.auditallow = &auditallow;
              local_xpd.dontaudit = &dontaudit;
      
              xpd = avc_xperms_decision_lookup(driver, xp_node);
              if (unlikely(!xpd)) {
                      /*
                       * Compute the extended_perms_decision only if the driver
                       * is flagged
                       */
                      if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
                              avd.allowed &= ~requested;
                              goto decision;
                      }
                      rcu_read_unlock();
                      security_compute_xperms_decision(ssid, tsid, tclass, driver,
                                                      &local_xpd);
                      rcu_read_lock();
                      avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm,
                                      ssid, tsid, tclass, avd.seqno, &local_xpd, 0);
              } else {
                      avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
              }
              xpd = &local_xpd;
      
              if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                      avd.allowed &= ~requested;
      
      decision:
	denied = requested & ~(avd.allowed);
	if (unlikely(denied))
		rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm,
				AVC_EXTENDED_PERMS, &avd);

	rcu_read_unlock();

	rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
			&avd, xpd, xperm, rc, ad);
              if (rc2)
                      return rc2;
              return rc;
      }
      
      /**
       * avc_has_perm_noaudit - Check permissions but perform no auditing.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @requested: requested permissions, interpreted based on @tclass
       * @flags:  AVC_STRICT or 0
       * @avd: access vector decisions
       *
       * Check the AVC to determine whether the @requested permissions are granted
       * for the SID pair (@ssid, @tsid), interpreting the permissions
       * based on @tclass, and call the security server on a cache miss to obtain
       * a new decision and add it to the cache.  Return a copy of the decisions
       * in @avd.  Return %0 if all @requested permissions are granted,
       * -%EACCES if any permissions are denied, or another -errno upon
       * other errors.  This function is typically called by avc_has_perm(),
       * but may also be called directly to separate permission checking from
       * auditing, e.g. in cases where a lock must be held for the check but
       * should be released for the auditing.
       */
      inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                               u16 tclass, u32 requested,
                               unsigned flags,
                               struct av_decision *avd)
      {
              struct avc_node *node;
              struct avc_xperms_node xp_node;
              int rc = 0;
              u32 denied;
      
	BUG_ON(!requested);

	rcu_read_lock();

	node = avc_lookup(ssid, tsid, tclass);
	if (unlikely(!node))
		node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node);
	else
		memcpy(avd, &node->ae.avd, sizeof(*avd));

	denied = requested & ~(avd->allowed);
	if (unlikely(denied))
		rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd);

	rcu_read_unlock();
              return rc;
      }
      
      /**
       * avc_has_perm - Check permissions and perform any appropriate auditing.
       * @ssid: source security identifier
       * @tsid: target security identifier
       * @tclass: target security class
       * @requested: requested permissions, interpreted based on @tclass
       * @auditdata: auxiliary audit data
       *
       * Check the AVC to determine whether the @requested permissions are granted
       * for the SID pair (@ssid, @tsid), interpreting the permissions
       * based on @tclass, and call the security server on a cache miss to obtain
       * a new decision and add it to the cache.  Audit the granting or denial of
       * permissions in accordance with the policy.  Return %0 if all @requested
       * permissions are granted, -%EACCES if any permissions are denied, or
       * another -errno upon other errors.
       */
      int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
                       u32 requested, struct common_audit_data *auditdata)
      {
              struct av_decision avd;
              int rc, rc2;
      
	rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);

	rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata, 0);
              if (rc2)
                      return rc2;
              return rc;
      }
      
      int avc_has_perm_flags(u32 ssid, u32 tsid, u16 tclass,
                             u32 requested, struct common_audit_data *auditdata,
                             int flags)
      {
              struct av_decision avd;
              int rc, rc2;
      
	rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);

	rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc,
			auditdata, flags);
              if (rc2)
                      return rc2;
              return rc;
      }
      
      u32 avc_policy_seqno(void)
      {
	return avc_cache.latest_notif;
      }
      
      void avc_disable(void)
      {
              /*
               * If you are looking at this because you have realized that we are
               * not destroying the avc_node_cachep it might be easy to fix, but
               * I don't know the memory barrier semantics well enough to know.  It's
               * possible that some other task dereferenced security_ops when
               * it still pointed to selinux operations.  If that is the case it's
               * possible that it is about to use the avc and is about to need the
               * avc_node_cachep.  I know I could wrap the security.c security_ops call
               * in an rcu_lock, but seriously, it's not worth it.  Instead I just flush
               * the cache and get that memory back.
               */
              if (avc_node_cachep) {
                      avc_flush();
                      /* kmem_cache_destroy(avc_node_cachep); */
              }
      }
      /*
       *  linux/fs/ext4/dir.c
       *
       * Copyright (C) 1992, 1993, 1994, 1995
       * Remy Card (card@masi.ibp.fr)
       * Laboratoire MASI - Institut Blaise Pascal
       * Universite Pierre et Marie Curie (Paris VI)
       *
       *  from
       *
       *  linux/fs/minix/dir.c
       *
       *  Copyright (C) 1991, 1992  Linus Torvalds
       *
       *  ext4 directory handling functions
       *
       *  Big-endian to little-endian byte-swapping/bitmaps by
       *        David S. Miller (davem@caip.rutgers.edu), 1995
       *
       * Hash Tree Directory indexing (c) 2001  Daniel Phillips
       *
       */
      
      #include <linux/fs.h>
      #include <linux/buffer_head.h>
      #include <linux/slab.h>
      #include "ext4.h"
      #include "xattr.h"
      
      static int ext4_dx_readdir(struct file *, struct dir_context *);
      
/*
 * is_dx_dir() - check if the given dir inode refers to an htree-indexed
 * directory (or a directory which could potentially get converted to use
 * htree indexing).
 *
 * Return 1 if it is a dx dir, 0 if not.
 */
      static int is_dx_dir(struct inode *inode)
      {
	struct super_block *sb = inode->i_sb;

	if (ext4_has_feature_dir_index(inode->i_sb) &&
	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
	     ext4_has_inline_data(inode)))
		return 1;

	return 0;
      }
      
      /*
       * Return 0 if the directory entry is OK, and 1 if there is a problem
       *
       * Note: this is the opposite of what ext2 and ext3 historically returned...
       *
       * bh passed here can be an inode block or a dir data block, depending
       * on the inode inline data flag.
       */
      int __ext4_check_dir_entry(const char *function, unsigned int line,
                                 struct inode *dir, struct file *filp,
                                 struct ext4_dir_entry_2 *de,
                                 struct buffer_head *bh, char *buf, int size,
                                 unsigned int offset)
      {
              const char *error_msg = NULL;
	const int rlen = ext4_rec_len_from_disk(de->rec_len,
						dir->i_sb->s_blocksize);

	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
		error_msg = "rec_len is smaller than minimal";
	else if (unlikely(rlen % 4 != 0))
		error_msg = "rec_len % 4 != 0";
	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
		error_msg = "rec_len is too small for name_len";
	else if (unlikely(((char *) de - buf) + rlen > size))
		error_msg = "directory entry overrun";
	else if (unlikely(le32_to_cpu(de->inode) >
			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
		error_msg = "inode out of bounds";
              else
                      return 0;
      
              if (filp)
                      ext4_error_file(filp, function, line, bh->b_blocknr,
                                      "bad entry in directory: %s - offset=%u, "
                                      "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                      error_msg, offset, le32_to_cpu(de->inode),
                                      rlen, de->name_len, size);
              else
                      ext4_error_inode(dir, function, line, bh->b_blocknr,
                                      "bad entry in directory: %s - offset=%u, "
                                      "inode=%u, rec_len=%d, name_len=%d, size=%d",
                                       error_msg, offset, le32_to_cpu(de->inode),
                                       rlen, de->name_len, size);
      
              return 1;
      }
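
/*
 * The checks above rely on the classic ext4 directory entry layout: a fixed
 * 8-byte header (inode, rec_len, name_len, file_type) followed by the name,
 * with rec_len a multiple of 4, so EXT4_DIR_REC_LEN(1) is the smallest legal
 * entry and no entry may extend past the end of the block or inline area of
 * length @size.
 */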
      
      static int ext4_readdir(struct file *file, struct dir_context *ctx)
      {
              unsigned int offset;
              int i;
              struct ext4_dir_entry_2 *de;
              int err;
	struct inode *inode = file_inode(file);
              struct super_block *sb = inode->i_sb;
              struct buffer_head *bh = NULL;
              struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
      
	if (IS_ENCRYPTED(inode)) {
                      err = fscrypt_get_encryption_info(inode);
                      if (err && err != -ENOKEY)
                              return err;
              }
      
	if (is_dx_dir(inode)) {
		err = ext4_dx_readdir(file, ctx);
                      if (err != ERR_BAD_DX_DIR) {
                              return err;
                      }
                      /*
                       * We don't set the inode dirty flag since it's not
                       * critical that it get flushed back to the disk.
                       */
                      ext4_clear_inode_flag(file_inode(file),
                                            EXT4_INODE_INDEX);
              }
      
              if (ext4_has_inline_data(inode)) {
                      int has_inline_data = 1;
                      err = ext4_read_inline_dir(file, ctx,
                                                 &has_inline_data);
                      if (has_inline_data)
                              return err;
              }
      
              if (IS_ENCRYPTED(inode)) {
                      err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
                      if (err < 0)
                              return err;
              }
      
              while (ctx->pos < inode->i_size) {
                      struct ext4_map_blocks map;
      
                      if (fatal_signal_pending(current)) {
                              err = -ERESTARTSYS;
                              goto errout;
                      }
                      cond_resched();
                      offset = ctx->pos & (sb->s_blocksize - 1);
                      map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                      map.m_len = 1;
                      err = ext4_map_blocks(NULL, inode, &map, 0);
                      if (err == 0) {
                              /* m_len should never be zero but let's avoid
                               * an infinite loop if it somehow is */
                              if (map.m_len == 0)
                                      map.m_len = 1;
                              ctx->pos += map.m_len * sb->s_blocksize;
                              continue;
                      }
                      if (err > 0) {
                              pgoff_t index = map.m_pblk >>
                                              (PAGE_SHIFT - inode->i_blkbits);
                              if (!ra_has_index(&file->f_ra, index))
                                      page_cache_sync_readahead(
                                              sb->s_bdev->bd_inode->i_mapping,
                                              &file->f_ra, file,
                                              index, 1);
                              file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
                              bh = ext4_bread(NULL, inode, map.m_lblk, 0);
                              if (IS_ERR(bh)) {
                                      err = PTR_ERR(bh);
                                      bh = NULL;
                                      goto errout;
                              }
                      }
      
                      if (!bh) {
                              /* corrupt size?  Maybe no more blocks to read */
                              if (ctx->pos > inode->i_blocks << 9)
                                      break;
                              ctx->pos += sb->s_blocksize - offset;
                              continue;
                      }
      
                      /* Check the checksum */
                      if (!buffer_verified(bh) &&
                          !ext4_dirent_csum_verify(inode,
                                      (struct ext4_dir_entry *)bh->b_data)) {
                              EXT4_ERROR_FILE(file, 0, "directory fails checksum "
                                              "at offset %llu",
                                              (unsigned long long)ctx->pos);
                              ctx->pos += sb->s_blocksize - offset;
                              brelse(bh);
                              bh = NULL;
                              continue;
                      }
                      set_buffer_verified(bh);
      
                      /* If the dir block has changed since the last call to
                       * readdir(2), then we might be pointing to an invalid
                       * dirent right now.  Scan from the start of the block
                       * to make sure. */
                      if (file->f_version != inode->i_version) {
                              for (i = 0; i < sb->s_blocksize && i < offset; ) {
                                      de = (struct ext4_dir_entry_2 *)
                                              (bh->b_data + i);
                                      /* It's too expensive to do a full
                                       * dirent test each time round this
                                       * loop, but we do have to test at
                                       * least that it is non-zero.  A
                                       * failure will be detected in the
                                       * dirent test below. */
                                      if (ext4_rec_len_from_disk(de->rec_len,
                                              sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                              break;
                                      i += ext4_rec_len_from_disk(de->rec_len,
                                                                  sb->s_blocksize);
                              }
                              offset = i;
                              ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
                                      | offset;
                              file->f_version = inode->i_version;
                      }
      
                      while (ctx->pos < inode->i_size
                             && offset < sb->s_blocksize) {
                              de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
                              if (ext4_check_dir_entry(inode, file, de, bh,
                                                       bh->b_data, bh->b_size,
                                                       offset)) {
                                      /*
                                       * On error, skip to the next block
                                       */
                                      ctx->pos = (ctx->pos |
                                                      (sb->s_blocksize - 1)) + 1;
                                      break;
                              }
                              offset += ext4_rec_len_from_disk(de->rec_len,
                                              sb->s_blocksize);
                              if (le32_to_cpu(de->inode)) {
                                      if (!IS_ENCRYPTED(inode)) {
                                              if (!dir_emit(ctx, de->name,
                                                  de->name_len,
                                                  le32_to_cpu(de->inode),
                                                  get_dtype(sb, de->file_type)))
                                                      goto done;
                                      } else {
                                              int save_len = fstr.len;
                                              struct fscrypt_str de_name =
                                                              FSTR_INIT(de->name,
                                                                      de->name_len);
      
                                              /* Directory is encrypted */
                                              err = fscrypt_fname_disk_to_usr(inode,
                                                      0, 0, &de_name, &fstr);
                                              de_name = fstr;
                                              fstr.len = save_len;
                                              if (err)
                                                      goto errout;
                                              if (!dir_emit(ctx,
                                                  de_name.name, de_name.len,
                                                  le32_to_cpu(de->inode),
                                                  get_dtype(sb, de->file_type)))
                                                      goto done;
                                      }
                              }
                              ctx->pos += ext4_rec_len_from_disk(de->rec_len,
                                                      sb->s_blocksize);
                      }
                      if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
                              goto done;
                      brelse(bh);
                      bh = NULL;
                      offset = 0;
              }
      done:
   43         err = 0;
      errout:
              fscrypt_fname_free_buffer(&fstr);
    1         brelse(bh);
              return err;
      }
      
      static inline int is_32bit_api(void)
      {
      #ifdef CONFIG_COMPAT
   56         return in_compat_syscall();
      #else
              return (BITS_PER_LONG == 32);
      #endif
      }
      
      /*
       * These functions convert from the major/minor hash to an f_pos
       * value for dx directories
       *
       * Upper layer (for example NFS) should specify FMODE_32BITHASH or
       * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
        * directly on both 32-bit and 64-bit nodes, in which case neither
       * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
       */
      static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
      {
              if ((filp->f_mode & FMODE_32BITHASH) ||
   46             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return major >> 1;
              else
   46                 return ((__u64)(major >> 1) << 32) | (__u64)minor;
      }
      
      static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
      {
   42         if ((filp->f_mode & FMODE_32BITHASH) ||
   44             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return (pos << 1) & 0xffffffff;
              else
   44                 return ((pos >> 32) << 1) & 0xffffffff;
      }
      
      static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
      {
   42         if ((filp->f_mode & FMODE_32BITHASH) ||
   44             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return 0;
              else
   44                 return pos & 0xffffffff;
      }
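
       /*
        * Illustrative sketch (added, not part of the original source): the
        * helpers above are meant to round-trip, except that hash2pos()
        * always drops the low bit of the major hash.  The hypothetical
        * helper below simply restates that invariant in code.
        */
       static inline bool ext4_hash_pos_roundtrip(struct file *filp,
                                                  __u32 major, __u32 minor)
       {
               loff_t pos = hash2pos(filp, major, minor);

               /* The low bit of the major hash is never recoverable from f_pos. */
               return pos2maj_hash(filp, pos) == (major & ~1U);
       }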
      
      /*
       * Return 32- or 64-bit end-of-file for dx directories
       */
      static inline loff_t ext4_get_htree_eof(struct file *filp)
      {
   48         if ((filp->f_mode & FMODE_32BITHASH) ||
   56             (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                      return EXT4_HTREE_EOF_32BIT;
              else
                      return EXT4_HTREE_EOF_64BIT;
      }
      
      
      /*
       * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
       * directories, where the "offset" is in terms of the filename hash
       * value instead of the byte offset.
       *
       * Because we may return a 64-bit hash that is well beyond offset limits,
       * we need to pass the max hash as the maximum allowable offset in
       * the htree directory case.
       *
       * For non-htree, ext4_llseek already chooses the proper max offset.
       */
      static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
      {
    9         struct inode *inode = file->f_mapping->host;
              int dx_dir = is_dx_dir(inode);
    9         loff_t htree_max = ext4_get_htree_eof(file);
      
    8         if (likely(dx_dir))
    8                 return generic_file_llseek_size(file, offset, whence,
                                                          htree_max, htree_max);
              else
    1                 return ext4_llseek(file, offset, whence);
      }
      
      /*
       * This structure holds the nodes of the red-black tree used to store
       * the directory entry in hash order.
       */
      struct fname {
              __u32                hash;
              __u32                minor_hash;
              struct rb_node        rb_hash;
               struct fname        *next;        /* chain of entries with the same hash */
              __u32                inode;
              __u8                name_len;
              __u8                file_type;
               char                name[0];        /* NUL-terminated entry name */
      };
      
      /*
        * This function implements a non-recursive way of freeing all of the
       * nodes in the red-black tree.
       */
      static void free_rb_tree_fname(struct rb_root *root)
      {
              struct fname *fname, *next;
      
   48         rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
                      while (fname) {
                              struct fname *old = fname;
    6                         fname = fname->next;
                              kfree(old);
                      }
      
   48         *root = RB_ROOT;
      }
      
      
      static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
                                                                 loff_t pos)
      {
              struct dir_private_info *p;
      
   42         p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
              if (!p)
                      return NULL;
   42         p->curr_hash = pos2maj_hash(filp, pos);
   42         p->curr_minor_hash = pos2min_hash(filp, pos);
              return p;
      }
      
      void ext4_htree_free_dir_info(struct dir_private_info *p)
      {
    2         free_rb_tree_fname(&p->root);
              kfree(p);
      }
      
      /*
       * Given a directory entry, enter it into the fname rb tree.
       *
       * When filename encryption is enabled, the dirent will hold the
       * encrypted filename, while the htree will hold decrypted filename.
        * The decrypted filename is passed in via the ent_name parameter.
       */
      int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                   __u32 minor_hash,
                                  struct ext4_dir_entry_2 *dirent,
                                  struct fscrypt_str *ent_name)
      {
              struct rb_node **p, *parent = NULL;
              struct fname *fname, *new_fn;
              struct dir_private_info *info;
              int len;
      
   44         info = dir_file->private_data;
              p = &info->root.rb_node;
      
              /* Create and allocate the fname structure */
              len = sizeof(struct fname) + ent_name->len + 1;
              new_fn = kzalloc(len, GFP_KERNEL);
              if (!new_fn)
                      return -ENOMEM;
   44         new_fn->hash = hash;
              new_fn->minor_hash = minor_hash;
              new_fn->inode = le32_to_cpu(dirent->inode);
              new_fn->name_len = ent_name->len;
              new_fn->file_type = dirent->file_type;
              memcpy(new_fn->name, ent_name->name, ent_name->len);
              new_fn->name[ent_name->len] = 0;
      
   43         while (*p) {
                      parent = *p;
                      fname = rb_entry(parent, struct fname, rb_hash);
      
                      /*
                       * If the hash and minor hash match up, then we put
                       * them on a linked list.  This rarely happens...
                       */
   43                 if ((new_fn->hash == fname->hash) &&
                          (new_fn->minor_hash == fname->minor_hash)) {
                              new_fn->next = fname->next;
                              fname->next = new_fn;
   44                         return 0;
                      }
      
   43                 if (new_fn->hash < fname->hash)
                              p = &(*p)->rb_left;
   43                 else if (new_fn->hash > fname->hash)
                              p = &(*p)->rb_right;
                      else if (new_fn->minor_hash < fname->minor_hash)
   17                         p = &(*p)->rb_left;
                      else /* if (new_fn->minor_hash > fname->minor_hash) */
   43                         p = &(*p)->rb_right;
              }
      
   44         rb_link_node(&new_fn->rb_hash, parent, p);
              rb_insert_color(&new_fn->rb_hash, &info->root);
              return 0;
   43 }
      
      
      
      /*
       * This is a helper function for ext4_dx_readdir.  It calls filldir
        * for all entries on the fname linked list.  (Normally there is only
       * one entry on the linked list, unless there are 62 bit hash collisions.)
       */
      static int call_filldir(struct file *file, struct dir_context *ctx,
                              struct fname *fname)
      {
   46         struct dir_private_info *info = file->private_data;
              struct inode *inode = file_inode(file);
              struct super_block *sb = inode->i_sb;
      
              if (!fname) {
                      ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
                               "called with null fname?!?", __func__, __LINE__,
                               inode->i_ino, current->comm);
                      return 0;
              }
   46         ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
   45         while (fname) {
                      if (!dir_emit(ctx, fname->name,
                                      fname->name_len,
   46                                 fname->inode,
   46                                 get_dtype(sb, fname->file_type))) {
    8                         info->extra_fname = fname;
                              return 1;
                      }
   40                 fname = fname->next;
              }
              return 0;
      }
      
      static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
      {
   48         struct dir_private_info *info = file->private_data;
              struct inode *inode = file_inode(file);
              struct fname *fname;
              int        ret;
      
    7         if (!info) {
   42                 info = ext4_htree_create_dir_info(file, ctx->pos);
                      if (!info)
                              return -ENOMEM;
                      file->private_data = info;
              }
      
   48         if (ctx->pos == ext4_get_htree_eof(file))
                      return 0;        /* EOF */
      
               /* Someone has messed with f_pos; reset the world */
   47         if (info->last_pos != ctx->pos) {
    2                 free_rb_tree_fname(&info->root);
                      info->curr_node = NULL;
                      info->extra_fname = NULL;
    2                 info->curr_hash = pos2maj_hash(file, ctx->pos);
    2                 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
              }
      
              /*
               * If there are any leftover names on the hash collision
               * chain, return them first.
               */
   45         if (info->extra_fname) {
    4                 if (call_filldir(file, ctx, info->extra_fname))
                              goto finished;
   44                 info->extra_fname = NULL;
                      goto next_node;
   44         } else if (!info->curr_node)
   44                 info->curr_node = rb_first(&info->root);
      
              while (1) {
                      /*
                       * Fill the rbtree if we have no more entries,
                       * or the inode has changed since we last read in the
                       * cached entries.
                       */
    2                 if ((!info->curr_node) ||
   39                     (file->f_version != inode->i_version)) {
   46                         info->curr_node = NULL;
                              free_rb_tree_fname(&info->root);
                              file->f_version = inode->i_version;
                              ret = ext4_htree_fill_tree(file, info->curr_hash,
                                                         info->curr_minor_hash,
                                                         &info->next_hash);
                              if (ret < 0)
                                      return ret;
   45                         if (ret == 0) {
    1                                 ctx->pos = ext4_get_htree_eof(file);
                                      break;
                              }
   44                         info->curr_node = rb_first(&info->root);
                      }
      
   44                 fname = rb_entry(info->curr_node, struct fname, rb_hash);
                      info->curr_hash = fname->hash;
                      info->curr_minor_hash = fname->minor_hash;
    7                 if (call_filldir(file, ctx, fname))
                              break;
              next_node:
   40                 info->curr_node = rb_next(info->curr_node);
                      if (info->curr_node) {
                              fname = rb_entry(info->curr_node, struct fname,
                                               rb_hash);
   39                         info->curr_hash = fname->hash;
                              info->curr_minor_hash = fname->minor_hash;
                      } else {
   34                         if (info->next_hash == ~0) {
   34                                 ctx->pos = ext4_get_htree_eof(file);
                                      break;
                              }
                              info->curr_hash = info->next_hash;
                              info->curr_minor_hash = 0;
                      }
              }
      finished:
   42         info->last_pos = ctx->pos;
              return 0;
      }
      
      static int ext4_dir_open(struct inode * inode, struct file * filp)
      {
   26         if (IS_ENCRYPTED(inode))
                      return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
              return 0;
      }
      
      static int ext4_release_dir(struct inode *inode, struct file *filp)
      {
    6         if (filp->private_data)
    2                 ext4_htree_free_dir_info(filp->private_data);
      
    5         return 0;
      }
      
      int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
                            int buf_size)
      {
              struct ext4_dir_entry_2 *de;
              int rlen;
              unsigned int offset = 0;
              char *top;
      
              de = (struct ext4_dir_entry_2 *)buf;
              top = buf + buf_size;
              while ((char *) de < top) {
                      if (ext4_check_dir_entry(dir, NULL, de, bh,
                                               buf, buf_size, offset))
                              return -EFSCORRUPTED;
                      rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
                      de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
                      offset += rlen;
              }
              if ((char *) de > top)
                      return -EFSCORRUPTED;
      
              return 0;
      }
      
      const struct file_operations ext4_dir_operations = {
              .llseek                = ext4_dir_llseek,
              .read                = generic_read_dir,
              .iterate_shared        = ext4_readdir,
              .unlocked_ioctl = ext4_ioctl,
      #ifdef CONFIG_COMPAT
              .compat_ioctl        = ext4_compat_ioctl,
      #endif
              .fsync                = ext4_sync_file,
              .open                = ext4_dir_open,
              .release        = ext4_release_dir,
      };
      /*
       * NET                Generic infrastructure for Network protocols.
       *
       * Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       */
      #ifndef _TIMEWAIT_SOCK_H
      #define _TIMEWAIT_SOCK_H
      
      #include <linux/slab.h>
      #include <linux/bug.h>
      #include <net/sock.h>
      
      struct timewait_sock_ops {
              struct kmem_cache        *twsk_slab;
              char                *twsk_slab_name;
              unsigned int        twsk_obj_size;
              int                (*twsk_unique)(struct sock *sk,
                                             struct sock *sktw, void *twp);
              void                (*twsk_destructor)(struct sock *sk);
      };
      
      static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
      {
   28         if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
                      return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
              return 0;
      }
      
      static inline void twsk_destructor(struct sock *sk)
      {
              if (sk->sk_prot->twsk_prot->twsk_destructor != NULL)
   28                 sk->sk_prot->twsk_prot->twsk_destructor(sk);
      }
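
       /*
        * For illustration (added commentary, not part of this header): a
        * transport protocol supplies its own ops table via the .twsk_prot
        * field of its struct proto.  TCP's table, for example, looks
        * roughly like the following (names are from net/ipv4/tcp_ipv4.c
        * and may differ between kernel versions):
        *
        *      static struct timewait_sock_ops tcp_timewait_sock_ops = {
        *              .twsk_obj_size   = sizeof(struct tcp_timewait_sock),
        *              .twsk_unique     = tcp_twsk_unique,
        *              .twsk_destructor = tcp_twsk_destructor,
        *      };
        */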
      
      #endif /* _TIMEWAIT_SOCK_H */
      
      /* (C) 1999-2001 Paul `Rusty' Russell
       * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
       * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
       *
       * This program is free software; you can redistribute it and/or modify
       * it under the terms of the GNU General Public License version 2 as
       * published by the Free Software Foundation.
       */
      
      #include <linux/types.h>
      #include <linux/ip.h>
      #include <linux/netfilter.h>
      #include <linux/module.h>
      #include <linux/skbuff.h>
      #include <linux/icmp.h>
      #include <linux/sysctl.h>
      #include <net/route.h>
      #include <net/ip.h>
      
      #include <linux/netfilter_ipv4.h>
      #include <net/netfilter/nf_conntrack.h>
      #include <net/netfilter/nf_conntrack_helper.h>
      #include <net/netfilter/nf_conntrack_l4proto.h>
      #include <net/netfilter/nf_conntrack_l3proto.h>
      #include <net/netfilter/nf_conntrack_zones.h>
      #include <net/netfilter/nf_conntrack_core.h>
      #include <net/netfilter/nf_conntrack_seqadj.h>
      #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
      #include <net/netfilter/nf_nat_helper.h>
      #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
      #include <net/netfilter/nf_log.h>
      
      static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
                                    struct nf_conntrack_tuple *tuple)
      {
              const __be32 *ap;
              __be32 _addrs[2];
  873         ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
                                      sizeof(u_int32_t) * 2, _addrs);
              if (ap == NULL)
                      return false;
      
  873         tuple->src.u3.ip = ap[0];
              tuple->dst.u3.ip = ap[1];
      
  873         return true;
      }
      
      static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
                                    const struct nf_conntrack_tuple *orig)
      {
  511         tuple->src.u3.ip = orig->dst.u3.ip;
              tuple->dst.u3.ip = orig->src.u3.ip;
      
              return true;
      }
      
      static void ipv4_print_tuple(struct seq_file *s,
                                  const struct nf_conntrack_tuple *tuple)
      {
              seq_printf(s, "src=%pI4 dst=%pI4 ",
                         &tuple->src.u3.ip, &tuple->dst.u3.ip);
      }
      
      static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
                                  unsigned int *dataoff, u_int8_t *protonum)
      {
              const struct iphdr *iph;
              struct iphdr _iph;
      
  895         iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
              if (iph == NULL)
                      return -NF_ACCEPT;
      
              /* Conntrack defragments packets, we might still see fragments
               * inside ICMP packets though. */
  895         if (iph->frag_off & htons(IP_OFFSET))
                      return -NF_ACCEPT;
      
  895         *dataoff = nhoff + (iph->ihl << 2);
              *protonum = iph->protocol;
      
              /* Check bogus IP headers */
              if (*dataoff > skb->len) {
                      pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
                               "nhoff %u, ihl %u, skblen %u\n",
                               nhoff, iph->ihl << 2, skb->len);
  895                 return -NF_ACCEPT;
              }
      
              return NF_ACCEPT;
      }
      
      static unsigned int ipv4_helper(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
      {
              struct nf_conn *ct;
              enum ip_conntrack_info ctinfo;
              const struct nf_conn_help *help;
              const struct nf_conntrack_helper *helper;
      
              /* This is where we call the helper: as the packet goes out. */
  842         ct = nf_ct_get(skb, &ctinfo);
  640         if (!ct || ctinfo == IP_CT_RELATED_REPLY)
  842                 return NF_ACCEPT;
      
  630         help = nfct_help(ct);
              if (!help)
                      return NF_ACCEPT;
      
              /* rcu_read_lock()ed by nf_hook_thresh */
              helper = rcu_dereference(help->helper);
              if (!helper)
                      return NF_ACCEPT;
      
              return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
                                  ct, ctinfo);
      }
      
      static unsigned int ipv4_confirm(void *priv,
                                       struct sk_buff *skb,
                                       const struct nf_hook_state *state)
      {
              struct nf_conn *ct;
              enum ip_conntrack_info ctinfo;
      
  842         ct = nf_ct_get(skb, &ctinfo);
  640         if (!ct || ctinfo == IP_CT_RELATED_REPLY)
                      goto out;
      
              /* adjust seqs for loopback traffic only in outgoing direction */
  630         if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
                  !nf_is_loopback_packet(skb)) {
                      if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
                              NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
                              return NF_DROP;
                      }
              }
      out:
              /* We've seen it coming out the other side: confirm it */
  842         return nf_conntrack_confirm(skb);
      }
      
      static unsigned int ipv4_conntrack_in(void *priv,
                                            struct sk_buff *skb,
                                            const struct nf_hook_state *state)
      {
              return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
      }
      
  895 static unsigned int ipv4_conntrack_local(void *priv,
                                               struct sk_buff *skb,
                                               const struct nf_hook_state *state)
      {
              /* root is playing with raw sockets. */
  904         if (skb->len < sizeof(struct iphdr) ||
  904             ip_hdrlen(skb) < sizeof(struct iphdr))
  896                 return NF_ACCEPT;
      
  896         if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */
                      return NF_ACCEPT;
      
  895         return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
      }
      
      /* Connection tracking may drop packets, but never alters them, so
         make it the first hook. */
      static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
              {
                      .hook                = ipv4_conntrack_in,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_PRE_ROUTING,
                      .priority        = NF_IP_PRI_CONNTRACK,
              },
              {
                      .hook                = ipv4_conntrack_local,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_LOCAL_OUT,
                      .priority        = NF_IP_PRI_CONNTRACK,
              },
              {
                      .hook                = ipv4_helper,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_POST_ROUTING,
                      .priority        = NF_IP_PRI_CONNTRACK_HELPER,
              },
              {
                      .hook                = ipv4_confirm,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_POST_ROUTING,
                      .priority        = NF_IP_PRI_CONNTRACK_CONFIRM,
              },
              {
                      .hook                = ipv4_helper,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_LOCAL_IN,
                      .priority        = NF_IP_PRI_CONNTRACK_HELPER,
              },
              {
                      .hook                = ipv4_confirm,
                      .pf                = NFPROTO_IPV4,
                      .hooknum        = NF_INET_LOCAL_IN,
                      .priority        = NF_IP_PRI_CONNTRACK_CONFIRM,
              },
      };
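
       /*
        * Note (added commentary): the hook priorities above implement the
        * ordering described in the comment before the array: the conntrack
        * hooks run early (NF_IP_PRI_CONNTRACK) at PRE_ROUTING/LOCAL_OUT,
        * while the helper and confirm hooks run late at POST_ROUTING and
        * LOCAL_IN, with NF_IP_PRI_CONNTRACK_CONFIRM last so a connection is
        * only confirmed once the packet has cleared every other hook.
        */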
      
      /* Fast function for those who don't want to parse /proc (and I don't
         blame them). */
      /* Reversing the socket's dst/src point of view gives us the reply
         mapping. */
      static int
      getorigdst(struct sock *sk, int optval, void __user *user, int *len)
      {
              const struct inet_sock *inet = inet_sk(sk);
              const struct nf_conntrack_tuple_hash *h;
              struct nf_conntrack_tuple tuple;
      
    3         memset(&tuple, 0, sizeof(tuple));
      
              lock_sock(sk);
              tuple.src.u3.ip = inet->inet_rcv_saddr;
              tuple.src.u.tcp.port = inet->inet_sport;
              tuple.dst.u3.ip = inet->inet_daddr;
              tuple.dst.u.tcp.port = inet->inet_dport;
              tuple.src.l3num = PF_INET;
              tuple.dst.protonum = sk->sk_protocol;
              release_sock(sk);
      
              /* We only do TCP and SCTP at the moment: is there a better way? */
    1         if (tuple.dst.protonum != IPPROTO_TCP &&
                  tuple.dst.protonum != IPPROTO_SCTP) {
                      pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
                      return -ENOPROTOOPT;
              }
      
    3         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                      pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
                               *len, sizeof(struct sockaddr_in));
                      return -EINVAL;
              }
      
    3         h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
              if (h) {
                      struct sockaddr_in sin;
                      struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
      
                      sin.sin_family = AF_INET;
                      sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.u.tcp.port;
                      sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                              .tuple.dst.u3.ip;
                      memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
      
                      pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
                               &sin.sin_addr.s_addr, ntohs(sin.sin_port));
                      nf_ct_put(ct);
                      if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                              return -EFAULT;
                      else
    3                         return 0;
              }
              pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
                       &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
                       &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
              return -ENOENT;
      }
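
       /*
        * Illustrative userspace sketch (added, not part of this file): a
        * transparent proxy that has accepted a redirected TCP connection
        * can recover the pre-NAT destination via this sockopt, e.g.:
        *
        *      #include <linux/netfilter_ipv4.h>   // defines SO_ORIGINAL_DST
        *
        *      struct sockaddr_in dst;
        *      socklen_t len = sizeof(dst);
        *
        *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
        *              printf("original dst %s:%u\n", inet_ntoa(dst.sin_addr),
        *                     ntohs(dst.sin_port));
        */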
      
      #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
      
      #include <linux/netfilter/nfnetlink.h>
      #include <linux/netfilter/nfnetlink_conntrack.h>
      
      static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
                                      const struct nf_conntrack_tuple *tuple)
      {
   13         if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
   13             nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
                      goto nla_put_failure;
              return 0;
      
      nla_put_failure:
              return -1;
      }
      
      static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
              [CTA_IP_V4_SRC]        = { .type = NLA_U32 },
              [CTA_IP_V4_DST]        = { .type = NLA_U32 },
      };
      
      static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
                                      struct nf_conntrack_tuple *t)
      {
    2         if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
                      return -EINVAL;
      
              t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
              t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
      
    2         return 0;
      }
      
      static int ipv4_nlattr_tuple_size(void)
      {
              return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);
      }
      #endif
      
      static struct nf_sockopt_ops so_getorigdst = {
              .pf                = PF_INET,
              .get_optmin        = SO_ORIGINAL_DST,
              .get_optmax        = SO_ORIGINAL_DST+1,
              .get                = getorigdst,
              .owner                = THIS_MODULE,
      };
      
      static int ipv4_init_net(struct net *net)
      {
  115         return 0;
      }
      
      struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
              .l3proto         = PF_INET,
              .name                 = "ipv4",
              .pkt_to_tuple         = ipv4_pkt_to_tuple,
              .invert_tuple         = ipv4_invert_tuple,
              .print_tuple         = ipv4_print_tuple,
              .get_l4proto         = ipv4_get_l4proto,
      #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
              .tuple_to_nlattr = ipv4_tuple_to_nlattr,
              .nlattr_tuple_size = ipv4_nlattr_tuple_size,
              .nlattr_to_tuple = ipv4_nlattr_to_tuple,
              .nla_policy         = ipv4_nla_policy,
      #endif
              .init_net         = ipv4_init_net,
              .me                 = THIS_MODULE,
      };
      
      module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
                        &nf_conntrack_htable_size, 0600);
      
      MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
      MODULE_ALIAS("ip_conntrack");
      MODULE_LICENSE("GPL");
      
      static int ipv4_net_init(struct net *net)
      {
              int ret = 0;
      
  114         ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
              if (ret < 0) {
                      pr_err("nf_conntrack_tcp4: pernet registration failed\n");
                      goto out_tcp;
              }
  115         ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
              if (ret < 0) {
                      pr_err("nf_conntrack_udp4: pernet registration failed\n");
                      goto out_udp;
              }
  115         ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
              if (ret < 0) {
                      pr_err("nf_conntrack_icmp4: pernet registration failed\n");
                      goto out_icmp;
              }
  115         ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: pernet registration failed\n");
                      goto out_ipv4;
              }
  115         return 0;
      out_ipv4:
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
      out_icmp:
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
      out_udp:
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
      out_tcp:
              return ret;
      }
      
      static void ipv4_net_exit(struct net *net)
      {
              nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
              nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
      }
      
      static struct pernet_operations ipv4_net_ops = {
              .init = ipv4_net_init,
              .exit = ipv4_net_exit,
      };
      
      static int __init nf_conntrack_l3proto_ipv4_init(void)
      {
              int ret = 0;
      
              need_conntrack();
              nf_defrag_ipv4_enable();
      
              ret = nf_register_sockopt(&so_getorigdst);
              if (ret < 0) {
                      pr_err("Unable to register netfilter socket option\n");
                      return ret;
              }
      
              ret = register_pernet_subsys(&ipv4_net_ops);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
                      goto cleanup_sockopt;
              }
      
              ret = nf_register_hooks(ipv4_conntrack_ops,
                                      ARRAY_SIZE(ipv4_conntrack_ops));
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register hooks.\n");
                      goto cleanup_pernet;
              }
      
              ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
                      goto cleanup_hooks;
              }
      
              ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
                      goto cleanup_tcp4;
              }
      
              ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
                      goto cleanup_udp4;
              }
      
              ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
              if (ret < 0) {
                      pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
                      goto cleanup_icmpv4;
              }
      
              return ret;
       cleanup_icmpv4:
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
       cleanup_udp4:
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
       cleanup_tcp4:
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
       cleanup_hooks:
              nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
       cleanup_pernet:
              unregister_pernet_subsys(&ipv4_net_ops);
       cleanup_sockopt:
              nf_unregister_sockopt(&so_getorigdst);
              return ret;
      }
      
      static void __exit nf_conntrack_l3proto_ipv4_fini(void)
      {
              synchronize_net();
              nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
              nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
              nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
              unregister_pernet_subsys(&ipv4_net_ops);
              nf_unregister_sockopt(&so_getorigdst);
      }
      
      module_init(nf_conntrack_l3proto_ipv4_init);
      module_exit(nf_conntrack_l3proto_ipv4_fini);
      /*
        *        Wrapper functions for 16-bit uid backward compatibility. All nicely tied
        *        together in the faint hope we can take them out in five years' time.
       */
      
      #include <linux/mm.h>
      #include <linux/mman.h>
      #include <linux/notifier.h>
      #include <linux/reboot.h>
      #include <linux/prctl.h>
      #include <linux/capability.h>
      #include <linux/init.h>
      #include <linux/highuid.h>
      #include <linux/security.h>
      #include <linux/syscalls.h>
      
      #include <asm/uaccess.h>
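
       /*
        * Note (added commentary): the conversion helpers used below come from
        * <linux/highuid.h>.  low2highuid()/low2highgid() widen a 16-bit id,
        * treating (old_uid_t)-1 as "leave unchanged", while high2lowuid()/
        * high2lowgid() substitute the configurable overflowuid/overflowgid
        * value whenever a 32-bit id does not fit in 16 bits.
        */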
      
    2 SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
      {
    2         return sys_chown(filename, low2highuid(user), low2highgid(group));
      }
      
    2 SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
      {
    2         return sys_lchown(filename, low2highuid(user), low2highgid(group));
      }
      
    4 SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
      {
    4         return sys_fchown(fd, low2highuid(user), low2highgid(group));
      }
      
    2 SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
      {
    2         return sys_setregid(low2highgid(rgid), low2highgid(egid));
      }
      
    2 SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
      {
    2         return sys_setgid(low2highgid(gid));
      }
      
    2 SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
      {
    2         return sys_setreuid(low2highuid(ruid), low2highuid(euid));
      }
      
    4 SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
      {
    4         return sys_setuid(low2highuid(uid));
      }
      
    2 SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
      {
    2         return sys_setresuid(low2highuid(ruid), low2highuid(euid),
                                       low2highuid(suid));
      }
      
    2 SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
      {
              const struct cred *cred = current_cred();
              int retval;
              old_uid_t ruid, euid, suid;
      
    2         ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
    2         euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
    2         suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
      
    2         if (!(retval   = put_user(ruid, ruidp)) &&
    1             !(retval   = put_user(euid, euidp)))
                      retval = put_user(suid, suidp);
      
    2         return retval;
      }
      
    3 SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
      {
    3         return sys_setresgid(low2highgid(rgid), low2highgid(egid),
                                       low2highgid(sgid));
      }
      
      
    3 SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
      {
              const struct cred *cred = current_cred();
              int retval;
              old_gid_t rgid, egid, sgid;
      
    3         rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
    3         egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
    3         sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
      
    3         if (!(retval   = put_user(rgid, rgidp)) &&
    1             !(retval   = put_user(egid, egidp)))
                      retval = put_user(sgid, sgidp);
      
    3         return retval;
      }
      
    3 SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
      {
    3         return sys_setfsuid(low2highuid(uid));
      }
      
    3 SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
      {
    3         return sys_setfsgid(low2highgid(gid));
      }
      
      static int groups16_to_user(old_gid_t __user *grouplist,
          struct group_info *group_info)
      {
    2         struct user_namespace *user_ns = current_user_ns();
              int i;
              old_gid_t group;
              kgid_t kgid;
      
              for (i = 0; i < group_info->ngroups; i++) {
    2                 kgid = group_info->gid[i];
    4                 group = high2lowgid(from_kgid_munged(user_ns, kgid));
    2                 if (put_user(group, grouplist+i))
                              return -EFAULT;
              }
      
              return 0;
      }
      
      static int groups16_from_user(struct group_info *group_info,
          old_gid_t __user *grouplist)
      {
    3         struct user_namespace *user_ns = current_user_ns();
              int i;
              old_gid_t group;
              kgid_t kgid;
      
              for (i = 0; i < group_info->ngroups; i++) {
    1                 if (get_user(group, grouplist+i))
                              return  -EFAULT;
      
                      kgid = make_kgid(user_ns, low2highgid(group));
                      if (!gid_valid(kgid))
                              return -EINVAL;
      
                      group_info->gid[i] = kgid;
              }
      
              return 0;
      }
      
    4 SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
      {
              const struct cred *cred = current_cred();
              int i;
      
              if (gidsetsize < 0)
                      return -EINVAL;
      
    4         i = cred->group_info->ngroups;
              if (gidsetsize) {
    2                 if (i > gidsetsize) {
                              i = -EINVAL;
                              goto out;
                      }
    4                 if (groups16_to_user(grouplist, cred->group_info)) {
                              i = -EFAULT;
                              goto out;
                      }
              }
      out:
              return i;
      }
      
    4 SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
      {
              struct group_info *group_info;
              int retval;
      
              if (!may_setgroups())
                      return -EPERM;
    4         if ((unsigned)gidsetsize > NGROUPS_MAX)
                      return -EINVAL;
      
    3         group_info = groups_alloc(gidsetsize);
              if (!group_info)
                      return -ENOMEM;
    3         retval = groups16_from_user(group_info, grouplist);
              if (retval) {
    1                 put_group_info(group_info);
                      return retval;
              }
      
              groups_sort(group_info);
              retval = set_current_groups(group_info);
              put_group_info(group_info);
      
    2         return retval;
      }
      
      SYSCALL_DEFINE0(getuid16)
      {
    1         return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
      }
      
      SYSCALL_DEFINE0(geteuid16)
      {
    2         return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
      }
      
      SYSCALL_DEFINE0(getgid16)
      {
    2         return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
      }
      
      SYSCALL_DEFINE0(getegid16)
      {
    1         return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
      }
      #ifndef _LINUX_SWAP_H
      #define _LINUX_SWAP_H
      
      #include <linux/spinlock.h>
      #include <linux/linkage.h>
      #include <linux/mmzone.h>
      #include <linux/list.h>
      #include <linux/memcontrol.h>
      #include <linux/sched.h>
      #include <linux/node.h>
      #include <linux/fs.h>
      #include <linux/atomic.h>
      #include <linux/page-flags.h>
      #include <asm/page.h>
      
      struct notifier_block;
      
      struct bio;
      
      #define SWAP_FLAG_PREFER        0x8000        /* set if swap priority specified */
      #define SWAP_FLAG_PRIO_MASK        0x7fff
      #define SWAP_FLAG_PRIO_SHIFT        0
      #define SWAP_FLAG_DISCARD        0x10000 /* enable discard for swap */
      #define SWAP_FLAG_DISCARD_ONCE        0x20000 /* discard swap area at swapon-time */
      #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */
      
      #define SWAP_FLAGS_VALID        (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                       SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                       SWAP_FLAG_DISCARD_PAGES)
      
      static inline int current_is_kswapd(void)
      {
    1         return current->flags & PF_KSWAPD;
      }
      
      /*
       * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
       * be swapped to.  The swap type and the offset into that swap type are
       * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
       * for the type means that the maximum number of swapcache pages is 27 bits
       * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
       * the type/offset into the pte as 5/27 as well.
       */
      #define MAX_SWAPFILES_SHIFT        5
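
       /*
        * Worked example (added commentary): with 5 type bits and a 32-bit
        * pgoff_t, a swap entry packs type/offset as 5/27, so at most
        * 2^27 (~134 million) swapcache pages can be addressed -- 512GB of
        * swap with 4KB pages.
        */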
      
      /*
       * Use some of the swap files numbers for other purposes. This
       * is a convenient way to hook into the VM to trigger special
       * actions on faults.
       */
      
      /*
       * NUMA node memory migration support
       */
      #ifdef CONFIG_MIGRATION
      #define SWP_MIGRATION_NUM 2
      #define SWP_MIGRATION_READ        (MAX_SWAPFILES + SWP_HWPOISON_NUM)
      #define SWP_MIGRATION_WRITE        (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
      #else
      #define SWP_MIGRATION_NUM 0
      #endif
      
      /*
       * Handling of hardware poisoned pages with memory corruption.
       */
      #ifdef CONFIG_MEMORY_FAILURE
      #define SWP_HWPOISON_NUM 1
      #define SWP_HWPOISON                MAX_SWAPFILES
      #else
      #define SWP_HWPOISON_NUM 0
      #endif
      
      #define MAX_SWAPFILES \
              ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
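
       /*
        * Worked value (added commentary): with CONFIG_MIGRATION and
        * CONFIG_MEMORY_FAILURE both enabled this evaluates to
        * 32 - 2 - 1 = 29 usable swap types.
        */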
      
      /*
       * Magic header for a swap area. The first part of the union is
       * what the swap magic looks like for the old (limited to 128MB)
       * swap area format, the second part of the union adds - in the
       * old reserved area - some extra information. Note that the first
       * kilobyte is reserved for boot loader or disk label stuff...
       *
       * Having the magic at the end of the PAGE_SIZE makes detecting swap
       * areas somewhat tricky on machines that support multiple page sizes.
       * For 2.5 we'll probably want to move the magic to just beyond the
       * bootbits...
       */
      union swap_header {
              struct {
                      char reserved[PAGE_SIZE - 10];
                      char magic[10];                        /* SWAP-SPACE or SWAPSPACE2 */
              } magic;
              struct {
                      char                bootbits[1024];        /* Space for disklabel etc. */
                      __u32                version;
                      __u32                last_page;
                      __u32                nr_badpages;
                      unsigned char        sws_uuid[16];
                      unsigned char        sws_volume[16];
                      __u32                padding[117];
                      __u32                badpages[1];
              } info;
      };
      
      /*
       * current->reclaim_state points to one of these when a task is running
       * memory reclaim
       */
      struct reclaim_state {
              unsigned long reclaimed_slab;
      };
      
      #ifdef __KERNEL__
      
      struct address_space;
      struct sysinfo;
      struct writeback_control;
      struct zone;
      
      /*
       * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
       * disk blocks.  A list of swap extents maps the entire swapfile.  (Where the
        * term `swapfile' refers to either a blockdevice or an IS_REG file.  Apart
        * from setup, they're handled identically.)
       *
       * We always assume that blocks are of size PAGE_SIZE.
       */
      struct swap_extent {
              struct list_head list;
              pgoff_t start_page;
              pgoff_t nr_pages;
              sector_t start_block;
      };
      
      /*
 * Max bad pages in the new format.
       */
      #define MAX_SWAP_BADPAGES \
              ((offsetof(union swap_header, magic.magic) - \
                offsetof(union swap_header, info.badpages)) / sizeof(int))
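
/*
 * Worked example (illustrative, assuming 4 KiB pages): info.badpages starts
 * at byte 1536 (1024 boot bits + 3 * 4 header words + 2 * 16-byte labels +
 * 117 * 4 padding words) and magic.magic starts at 4096 - 10 = 4086, so
 * MAX_SWAP_BADPAGES evaluates to (4086 - 1536) / 4 = 637 entries.
 */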
      
      enum {
              SWP_USED        = (1 << 0),        /* is slot in swap_info[] used? */
              SWP_WRITEOK        = (1 << 1),        /* ok to write to this swap?        */
              SWP_DISCARDABLE = (1 << 2),        /* blkdev support discard */
              SWP_DISCARDING        = (1 << 3),        /* now discarding a free cluster */
              SWP_SOLIDSTATE        = (1 << 4),        /* blkdev seeks are cheap */
              SWP_CONTINUED        = (1 << 5),        /* swap_map has count continuation */
        SWP_BLKDEV        = (1 << 6),        /* it's a block device */
              SWP_FILE        = (1 << 7),        /* set after swap_activate success */
              SWP_AREA_DISCARD = (1 << 8),        /* single-time swap area discards */
              SWP_PAGE_DISCARD = (1 << 9),        /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 10),        /* don't overwrite PG_writeback pages */
                                              /* add others here before... */
              SWP_SCANNING        = (1 << 11),        /* refcount in scan_swap_map */
      };
      
      #define SWAP_CLUSTER_MAX 32UL
      #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
      
      #define SWAP_MAP_MAX        0x3e        /* Max duplication count, in first swap_map */
      #define SWAP_MAP_BAD        0x3f        /* Note pageblock is bad, in first swap_map */
      #define SWAP_HAS_CACHE        0x40        /* Flag page is cached, in first swap_map */
      #define SWAP_CONT_MAX        0x7f        /* Max count, in each swap_map continuation */
      #define COUNT_CONTINUED        0x80        /* See swap_map continuation for full count */
      #define SWAP_MAP_SHMEM        0xbf        /* Owned by shmem/tmpfs, in first swap_map */
      
      /*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space that is SWAPFILE_CLUSTER pages long and naturally aligned on disk.
 * All free clusters are organized into a list; we fetch an entry from the
 * list to get a free cluster.
 *
 * The data field stores the next cluster if the cluster is free, or the
 * cluster usage counter otherwise. The flags field indicates whether a
 * cluster is free. This is protected by swap_info_struct.lock.
       */
      struct swap_cluster_info {
              unsigned int data:24;
              unsigned int flags:8;
      };
      #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
      #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
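
/*
 * Illustrative sketch (not from the original source): since "data" is a
 * 24-bit field, a free cluster can name a successor index up to 2^24 - 1,
 * while an allocated cluster reuses the same bits as a usage counter. The
 * helper names below are hypothetical.
 */
#if 0	/* sketch only, never compiled */
static inline bool example_cluster_is_free(struct swap_cluster_info *ci)
{
	return ci->flags & CLUSTER_FLAG_FREE;
}

static inline unsigned int example_cluster_data(struct swap_cluster_info *ci)
{
	return ci->data;	/* next free cluster index, or usage count */
}
#endif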
      
      /*
 * We assign a cluster to each CPU, so each CPU can allocate swap entries from
 * its own cluster and swap out sequentially. The purpose is to optimize
 * swapout throughput.
       */
      struct percpu_cluster {
              struct swap_cluster_info index; /* Current cluster index */
              unsigned int next; /* Likely next allocation offset */
      };
      
      struct swap_cluster_list {
              struct swap_cluster_info head;
              struct swap_cluster_info tail;
      };
      
      /*
       * The in-memory structure used to track swap areas.
       */
      struct swap_info_struct {
              unsigned long        flags;                /* SWP_USED etc: see above */
              signed short        prio;                /* swap priority of this type */
              struct plist_node list;                /* entry in swap_active_head */
              struct plist_node avail_list;        /* entry in swap_avail_head */
              signed char        type;                /* strange name for an index */
              unsigned int        max;                /* extent of the swap_map */
              unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
              struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
              struct swap_cluster_list free_clusters; /* free clusters list */
              unsigned int lowest_bit;        /* index of first free in swap_map */
              unsigned int highest_bit;        /* index of last free in swap_map */
              unsigned int pages;                /* total of usable pages of swap */
              unsigned int inuse_pages;        /* number of those currently in use */
              unsigned int cluster_next;        /* likely index for next allocation */
              unsigned int cluster_nr;        /* countdown to next cluster search */
              struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
              struct swap_extent *curr_swap_extent;
              struct swap_extent first_swap_extent;
              struct block_device *bdev;        /* swap device or bdev of swap file */
              struct file *swap_file;                /* seldom referenced */
              unsigned int old_block_size;        /* seldom referenced */
      #ifdef CONFIG_FRONTSWAP
              unsigned long *frontswap_map;        /* frontswap in-use, one bit per page */
              atomic_t frontswap_pages;        /* frontswap pages in-use counter */
      #endif
        spinlock_t lock;                /*
                                         * protect map scan related fields like
                                         * swap_map, lowest_bit, highest_bit,
                                         * inuse_pages, cluster_next,
                                         * cluster_nr, lowest_alloc,
                                         * highest_alloc and the free/discard
                                         * cluster lists. Other fields are only
                                         * changed at swapon/swapoff, so are
                                         * protected by swap_lock. Changing
                                         * flags requires holding both this
                                         * lock and swap_lock; if both locks
                                         * are needed, take swap_lock first.
                                         */
              struct work_struct discard_work; /* discard worker */
              struct swap_cluster_list discard_clusters; /* discard clusters list */
      };
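
/*
 * Illustrative sketch (not from the original source) of the lock ordering
 * rule documented above: when both the per-device lock and the global
 * swap_lock (defined in mm/swapfile.c) are needed, take swap_lock first.
 */
#if 0	/* sketch only, never compiled */
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	si->flags |= SWP_WRITEOK;
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
#endif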
      
      /* linux/mm/workingset.c */
      void *workingset_eviction(struct address_space *mapping, struct page *page);
      void workingset_refault(struct page *page, void *shadow);
      void workingset_activation(struct page *page);
      extern struct list_lru workingset_shadow_nodes;
      
      static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
      {
  443         return node->count & RADIX_TREE_COUNT_MASK;
      }
      
      static inline void workingset_node_pages_inc(struct radix_tree_node *node)
      {
 1052         node->count++;
      }
      
      static inline void workingset_node_pages_dec(struct radix_tree_node *node)
      {
  443         VM_WARN_ON_ONCE(!workingset_node_pages(node));
  443         node->count--;
      }
      
      static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
      {
              return node->count >> RADIX_TREE_COUNT_SHIFT;
      }
      
      static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
      {
              node->count += 1U << RADIX_TREE_COUNT_SHIFT;
      }
      
      static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
      {
              VM_WARN_ON_ONCE(!workingset_node_shadows(node));
              node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
      }
      
      /* linux/mm/page_alloc.c */
      extern unsigned long totalram_pages;
      extern unsigned long totalreserve_pages;
      extern unsigned long nr_free_buffer_pages(void);
      extern unsigned long nr_free_pagecache_pages(void);
      
      /* Definition of global_page_state not available yet */
      #define nr_free_pages() global_page_state(NR_FREE_PAGES)
      
      
      /* linux/mm/swap.c */
      extern void lru_cache_add(struct page *);
      extern void lru_cache_add_anon(struct page *page);
      extern void lru_cache_add_file(struct page *page);
      extern void lru_add_page_tail(struct page *page, struct page *page_tail,
                               struct lruvec *lruvec, struct list_head *head);
      extern void activate_page(struct page *);
      extern void mark_page_accessed(struct page *);
      extern void lru_add_drain(void);
      extern void lru_add_drain_cpu(int cpu);
      extern void lru_add_drain_all(void);
      extern void rotate_reclaimable_page(struct page *page);
      extern void deactivate_file_page(struct page *page);
      extern void deactivate_page(struct page *page);
      extern void swap_setup(void);
      
      extern void add_page_to_unevictable_list(struct page *page);
      
      extern void lru_cache_add_active_or_unevictable(struct page *page,
                                                      struct vm_area_struct *vma);
      
      /* linux/mm/vmscan.c */
      extern unsigned long zone_reclaimable_pages(struct zone *zone);
      extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
      extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                              gfp_t gfp_mask, nodemask_t *mask);
      extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
      extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                        unsigned long nr_pages,
                                                        gfp_t gfp_mask,
                                                        bool may_swap);
      extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                      gfp_t gfp_mask, bool noswap,
                                                      pg_data_t *pgdat,
                                                      unsigned long *nr_scanned);
      extern unsigned long shrink_all_memory(unsigned long nr_pages);
      extern int vm_swappiness;
      extern int remove_mapping(struct address_space *mapping, struct page *page);
      extern unsigned long vm_total_pages;
      
      #ifdef CONFIG_NUMA
      extern int node_reclaim_mode;
      extern int sysctl_min_unmapped_ratio;
      extern int sysctl_min_slab_ratio;
      extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
      #else
      #define node_reclaim_mode 0
      static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                      unsigned int order)
      {
              return 0;
      }
      #endif
      
      extern int page_evictable(struct page *page);
      extern void check_move_unevictable_pages(struct page **, int nr_pages);
      
      extern int kswapd_run(int nid);
      extern void kswapd_stop(int nid);
      
      #ifdef CONFIG_SWAP
      /* linux/mm/page_io.c */
      extern int swap_readpage(struct page *);
      extern int swap_writepage(struct page *page, struct writeback_control *wbc);
      extern void end_swap_bio_write(struct bio *bio);
      extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
              bio_end_io_t end_write_func);
      extern int swap_set_page_dirty(struct page *page);
      
      int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                      unsigned long nr_pages, sector_t start_block);
      int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                      sector_t *);
      
      /* linux/mm/swap_state.c */
      extern struct address_space swapper_spaces[];
      #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
      extern unsigned long total_swapcache_pages(void);
      extern void show_swap_cache_info(void);
      extern int add_to_swap(struct page *, struct list_head *list);
      extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
      extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
      extern void __delete_from_swap_cache(struct page *);
      extern void delete_from_swap_cache(struct page *);
      extern void free_page_and_swap_cache(struct page *);
      extern void free_pages_and_swap_cache(struct page **, int);
      extern struct page *lookup_swap_cache(swp_entry_t);
      extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
                              struct vm_area_struct *vma, unsigned long addr);
      extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
                              struct vm_area_struct *vma, unsigned long addr,
                              bool *new_page_allocated);
      extern struct page *swapin_readahead(swp_entry_t, gfp_t,
                              struct vm_area_struct *vma, unsigned long addr);
      
      /* linux/mm/swapfile.c */
      extern atomic_long_t nr_swap_pages;
      extern long total_swap_pages;
      
/* Swap 50% full? Release swapcache more aggressively. */
      static inline bool vm_swap_full(void)
      {
              return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
      }
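
/*
 * Worked example (illustrative): with total_swap_pages == 1000 and 400 free
 * entries left in nr_swap_pages, 400 * 2 < 1000, i.e. more than half of swap
 * is in use, so vm_swap_full() returns true.
 */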
      
      static inline long get_nr_swap_pages(void)
      {
    4         return atomic_long_read(&nr_swap_pages);
      }
      
      extern void si_swapinfo(struct sysinfo *);
      extern swp_entry_t get_swap_page(void);
      extern swp_entry_t get_swap_page_of_type(int);
      extern int add_swap_count_continuation(swp_entry_t, gfp_t);
      extern void swap_shmem_alloc(swp_entry_t);
      extern int swap_duplicate(swp_entry_t);
      extern int swapcache_prepare(swp_entry_t);
      extern void swap_free(swp_entry_t);
      extern void swapcache_free(swp_entry_t);
      extern int free_swap_and_cache(swp_entry_t);
      extern int swap_type_of(dev_t, sector_t, struct block_device **);
      extern unsigned int count_swap_pages(int, int);
      extern sector_t map_swap_page(struct page *, struct block_device **);
      extern sector_t swapdev_block(int, pgoff_t);
      extern int page_swapcount(struct page *);
      extern int swp_swapcount(swp_entry_t entry);
      extern struct swap_info_struct *page_swap_info(struct page *);
      extern bool reuse_swap_page(struct page *, int *);
      extern int try_to_free_swap(struct page *);
      struct backing_dev_info;
      
      #else /* CONFIG_SWAP */
      
      #define swap_address_space(entry)                (NULL)
      #define get_nr_swap_pages()                        0L
      #define total_swap_pages                        0L
      #define total_swapcache_pages()                        0UL
      #define vm_swap_full()                                0
      
      #define si_swapinfo(val) \
              do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/* only sparc cannot include linux/pagemap.h in this file,
 * so leave put_page and release_pages undeclared... */
      #define free_page_and_swap_cache(page) \
              put_page(page)
#define free_pages_and_swap_cache(pages, nr) \
        release_pages((pages), (nr), false)
      
      static inline void show_swap_cache_info(void)
      {
      }
      
      #define free_swap_and_cache(swp)        is_migration_entry(swp)
      #define swapcache_prepare(swp)                is_migration_entry(swp)
      
      static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
      {
              return 0;
      }
      
      static inline void swap_shmem_alloc(swp_entry_t swp)
      {
      }
      
      static inline int swap_duplicate(swp_entry_t swp)
      {
              return 0;
      }
      
      static inline void swap_free(swp_entry_t swp)
      {
      }
      
      static inline void swapcache_free(swp_entry_t swp)
      {
      }
      
      static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
                              struct vm_area_struct *vma, unsigned long addr)
      {
              return NULL;
      }
      
      static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
      {
              return 0;
      }
      
      static inline struct page *lookup_swap_cache(swp_entry_t swp)
      {
              return NULL;
      }
      
      static inline int add_to_swap(struct page *page, struct list_head *list)
      {
              return 0;
      }
      
      static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
                                                              gfp_t gfp_mask)
      {
              return -1;
      }
      
      static inline void __delete_from_swap_cache(struct page *page)
      {
      }
      
      static inline void delete_from_swap_cache(struct page *page)
      {
      }
      
      static inline int page_swapcount(struct page *page)
      {
              return 0;
      }
      
      static inline int swp_swapcount(swp_entry_t entry)
      {
              return 0;
      }
      
      #define reuse_swap_page(page, total_mapcount) \
              (page_trans_huge_mapcount(page, total_mapcount) == 1)
      
      static inline int try_to_free_swap(struct page *page)
      {
              return 0;
      }
      
      static inline swp_entry_t get_swap_page(void)
      {
              swp_entry_t entry;
              entry.val = 0;
              return entry;
      }
      
      #endif /* CONFIG_SWAP */
      
      #ifdef CONFIG_MEMCG
      static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
      {
              /* Cgroup2 doesn't have per-cgroup swappiness */
              if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                      return vm_swappiness;
      
              /* root ? */
              if (mem_cgroup_disabled() || !memcg->css.parent)
                      return vm_swappiness;
      
              return memcg->swappiness;
      }
      
      #else
      static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
      {
              return vm_swappiness;
      }
      #endif
      
      #ifdef CONFIG_MEMCG_SWAP
      extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
      extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
      extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
      extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
      extern bool mem_cgroup_swap_full(struct page *page);
      #else
      static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
      {
      }
      
      static inline int mem_cgroup_try_charge_swap(struct page *page,
                                                   swp_entry_t entry)
      {
              return 0;
      }
      
      static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
      {
      }
      
      static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
      {
    1         return get_nr_swap_pages();
      }
      
      static inline bool mem_cgroup_swap_full(struct page *page)
      {
              return vm_swap_full();
      }
      #endif
      
      #endif /* __KERNEL__*/
      #endif /* _LINUX_SWAP_H */
      /*
       *  UDPLITE     An implementation of the UDP-Lite protocol (RFC 3828).
       *
       *  Authors:    Gerrit Renker       <gerrit@erg.abdn.ac.uk>
       *
       *  Changes:
       *  Fixes:
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       */
      
      #define pr_fmt(fmt) "UDPLite: " fmt
      
      #include <linux/export.h>
      #include "udp_impl.h"
      
      struct udp_table         udplite_table __read_mostly;
      EXPORT_SYMBOL(udplite_table);
      
      static int udplite_rcv(struct sk_buff *skb)
      {
              return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
      }
      
      static void udplite_err(struct sk_buff *skb, u32 info)
      {
              __udp4_lib_err(skb, info, &udplite_table);
      }
      
      static const struct net_protocol udplite_protocol = {
              .handler        = udplite_rcv,
              .err_handler        = udplite_err,
              .no_policy        = 1,
              .netns_ok        = 1,
      };
      
      struct proto         udplite_prot = {
              .name                   = "UDP-Lite",
              .owner                   = THIS_MODULE,
              .close                   = udp_lib_close,
              .connect           = ip4_datagram_connect,
              .disconnect           = udp_disconnect,
              .ioctl                   = udp_ioctl,
              .init                   = udplite_sk_init,
              .destroy           = udp_destroy_sock,
              .setsockopt           = udp_setsockopt,
              .getsockopt           = udp_getsockopt,
              .sendmsg           = udp_sendmsg,
              .recvmsg           = udp_recvmsg,
              .sendpage           = udp_sendpage,
              .backlog_rcv           = __udp_queue_rcv_skb,
              .hash                   = udp_lib_hash,
              .unhash                   = udp_lib_unhash,
              .get_port           = udp_v4_get_port,
              .obj_size           = sizeof(struct udp_sock),
              .h.udp_table           = &udplite_table,
      #ifdef CONFIG_COMPAT
              .compat_setsockopt = compat_udp_setsockopt,
              .compat_getsockopt = compat_udp_getsockopt,
      #endif
      };
      EXPORT_SYMBOL(udplite_prot);
      
      static struct inet_protosw udplite4_protosw = {
              .type                =  SOCK_DGRAM,
              .protocol        =  IPPROTO_UDPLITE,
              .prot                =  &udplite_prot,
              .ops                =  &inet_dgram_ops,
              .flags                =  INET_PROTOSW_PERMANENT,
      };
      
      #ifdef CONFIG_PROC_FS
      
      static const struct file_operations udplite_afinfo_seq_fops = {
              .owner    = THIS_MODULE,
              .open     = udp_seq_open,
              .read     = seq_read,
              .llseek   = seq_lseek,
              .release  = seq_release_net
      };
      
      static struct udp_seq_afinfo udplite4_seq_afinfo = {
              .name                = "udplite",
              .family                = AF_INET,
              .udp_table         = &udplite_table,
              .seq_fops        = &udplite_afinfo_seq_fops,
              .seq_ops        = {
                      .show                = udp4_seq_show,
              },
      };
      
      static int __net_init udplite4_proc_init_net(struct net *net)
      {
  116         return udp_proc_register(net, &udplite4_seq_afinfo);
      }
      
      static void __net_exit udplite4_proc_exit_net(struct net *net)
      {
              udp_proc_unregister(net, &udplite4_seq_afinfo);
      }
      
      static struct pernet_operations udplite4_net_ops = {
              .init = udplite4_proc_init_net,
              .exit = udplite4_proc_exit_net,
      };
      
      static __init int udplite4_proc_init(void)
      {
              return register_pernet_subsys(&udplite4_net_ops);
      }
      #else
      static inline int udplite4_proc_init(void)
      {
              return 0;
      }
      #endif
      
      void __init udplite4_register(void)
      {
              udp_table_init(&udplite_table, "UDP-Lite");
              if (proto_register(&udplite_prot, 1))
                      goto out_register_err;
      
              if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0)
                      goto out_unregister_proto;
      
              inet_register_protosw(&udplite4_protosw);
      
              if (udplite4_proc_init())
                      pr_err("%s: Cannot register /proc!\n", __func__);
              return;
      
      out_unregister_proto:
              proto_unregister(&udplite_prot);
      out_register_err:
              pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
      }
      /* user-type.h: User-defined key type
       *
       * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
       * Written by David Howells (dhowells@redhat.com)
       *
       * This program is free software; you can redistribute it and/or
       * modify it under the terms of the GNU General Public License
       * as published by the Free Software Foundation; either version
       * 2 of the License, or (at your option) any later version.
       */
      
      #ifndef _KEYS_USER_TYPE_H
      #define _KEYS_USER_TYPE_H
      
      #include <linux/key.h>
      #include <linux/rcupdate.h>
      
      #ifdef CONFIG_KEYS
      
      /*****************************************************************************/
      /*
       * the payload for a key of type "user" or "logon"
       * - once filled in and attached to a key:
 *   - the payload struct is invariant and may not be changed, only replaced
       *   - the payload must be read with RCU procedures or with the key semaphore
       *     held
       *   - the payload may only be replaced with the key semaphore write-locked
       * - the key's data length is the size of the actual data, not including the
       *   payload wrapper
       */
      struct user_key_payload {
              struct rcu_head        rcu;                /* RCU destructor */
              unsigned short        datalen;        /* length of this data */
              char                data[0] __aligned(__alignof__(u64)); /* actual data */
      };
      
      extern struct key_type key_type_user;
      extern struct key_type key_type_logon;
      
      struct key_preparsed_payload;
      
      extern int user_preparse(struct key_preparsed_payload *prep);
      extern void user_free_preparse(struct key_preparsed_payload *prep);
      extern int user_update(struct key *key, struct key_preparsed_payload *prep);
      extern void user_revoke(struct key *key);
      extern void user_destroy(struct key *key);
      extern void user_describe(const struct key *user, struct seq_file *m);
      extern long user_read(const struct key *key,
                            char __user *buffer, size_t buflen);
      
      static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key)
      {
              return (struct user_key_payload *)dereference_key_rcu(key);
      }
      
      static inline struct user_key_payload *user_key_payload_locked(const struct key *key)
      {
    9         return (struct user_key_payload *)dereference_key_locked((struct key *)key);
      }
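
/*
 * Illustrative sketch (not from the original source): reading a user key's
 * payload under RCU, following the access rules described above. The helper
 * name is hypothetical; the key reference is assumed to be held by the
 * caller.
 */
#if 0	/* sketch only, never compiled */
static int example_user_key_datalen(const struct key *key)
{
	const struct user_key_payload *payload;
	int len;

	rcu_read_lock();
	payload = user_key_payload_rcu(key);
	len = payload ? payload->datalen : -ENOKEY;
	rcu_read_unlock();

	return len;
}
#endif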
      
      #endif /* CONFIG_KEYS */
      
      #endif /* _KEYS_USER_TYPE_H */
      /*
       *  Block device elevator/IO-scheduler.
       *
       *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
       *
       * 30042000 Jens Axboe <axboe@kernel.dk> :
       *
       * Split the elevator a bit so that it is possible to choose a different
       * one or even write a new "plug in". There are three pieces:
       * - elevator_fn, inserts a new request in the queue list
       * - elevator_merge_fn, decides whether a new buffer can be merged with
       *   an existing request
       * - elevator_dequeue_fn, called when a request is taken off the active list
       *
       * 20082000 Dave Jones <davej@suse.de> :
       * Removed tests for max-bomb-segments, which was breaking elvtune
       *  when run without -bN
       *
       * Jens:
       * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
       * - completely modularize elevator setup and teardown
       *
       */
      #include <linux/kernel.h>
      #include <linux/fs.h>
      #include <linux/blkdev.h>
      #include <linux/elevator.h>
      #include <linux/bio.h>
      #include <linux/module.h>
      #include <linux/slab.h>
      #include <linux/init.h>
      #include <linux/compiler.h>
      #include <linux/blktrace_api.h>
      #include <linux/hash.h>
      #include <linux/uaccess.h>
      #include <linux/pm_runtime.h>
      #include <linux/blk-cgroup.h>
      
      #include <trace/events/block.h>
      
      #include "blk.h"
      
      static DEFINE_SPINLOCK(elv_list_lock);
      static LIST_HEAD(elv_list);
      
      /*
       * Merge hash stuff.
       */
      #define rq_hash_key(rq)                (blk_rq_pos(rq) + blk_rq_sectors(rq))
      
      /*
 * Query the io scheduler to see if the bio being issued by the current
 * process may be merged with rq.
       */
      static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
      {
  223         struct request_queue *q = rq->q;
              struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_allow_bio_merge_fn)
  223                 return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
      
              return 1;
      }
      
      /*
       * can we safely merge with this request?
       */
  241 bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
      {
  241         if (!blk_rq_merge_ok(rq, bio))
                      return false;
      
  241         if (!elv_iosched_allow_bio_merge(rq, bio))
                      return false;
      
              return true;
      }
      EXPORT_SYMBOL(elv_bio_merge_ok);
      
      static struct elevator_type *elevator_find(const char *name)
      {
              struct elevator_type *e;
      
              list_for_each_entry(e, &elv_list, list) {
                      if (!strcmp(e->elevator_name, name))
                              return e;
              }
      
              return NULL;
      }
      
      static void elevator_put(struct elevator_type *e)
      {
              module_put(e->elevator_owner);
      }
      
      static struct elevator_type *elevator_get(const char *name, bool try_loading)
      {
              struct elevator_type *e;
      
              spin_lock(&elv_list_lock);
      
              e = elevator_find(name);
              if (!e && try_loading) {
                      spin_unlock(&elv_list_lock);
                      request_module("%s-iosched", name);
                      spin_lock(&elv_list_lock);
                      e = elevator_find(name);
              }
      
              if (e && !try_module_get(e->elevator_owner))
                      e = NULL;
      
              spin_unlock(&elv_list_lock);
      
              return e;
      }
      
      static char chosen_elevator[ELV_NAME_MAX];
      
      static int __init elevator_setup(char *str)
      {
              /*
               * Be backwards-compatible with previous kernels, so users
               * won't get the wrong elevator.
               */
              strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
              return 1;
      }
      
      __setup("elevator=", elevator_setup);
      
      /* called during boot to load the elevator chosen by the elevator param */
      void __init load_default_elevator_module(void)
      {
              struct elevator_type *e;
      
              if (!chosen_elevator[0])
                      return;
      
              spin_lock(&elv_list_lock);
              e = elevator_find(chosen_elevator);
              spin_unlock(&elv_list_lock);
      
              if (!e)
                      request_module("%s-iosched", chosen_elevator);
      }
      
      static struct kobj_type elv_ktype;
      
      struct elevator_queue *elevator_alloc(struct request_queue *q,
                                        struct elevator_type *e)
      {
              struct elevator_queue *eq;
      
              eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
              if (unlikely(!eq))
                      return NULL;
      
              eq->type = e;
              kobject_init(&eq->kobj, &elv_ktype);
              mutex_init(&eq->sysfs_lock);
              hash_init(eq->hash);
      
              return eq;
      }
      EXPORT_SYMBOL(elevator_alloc);
      
      static void elevator_release(struct kobject *kobj)
      {
              struct elevator_queue *e;
      
              e = container_of(kobj, struct elevator_queue, kobj);
              elevator_put(e->type);
              kfree(e);
      }
      
      int elevator_init(struct request_queue *q, char *name)
      {
              struct elevator_type *e = NULL;
              int err;
      
              /*
               * q->sysfs_lock must be held to provide mutual exclusion between
               * elevator_switch() and here.
               */
              lockdep_assert_held(&q->sysfs_lock);
      
              if (unlikely(q->elevator))
                      return 0;
      
              INIT_LIST_HEAD(&q->queue_head);
              q->last_merge = NULL;
              q->end_sector = 0;
              q->boundary_rq = NULL;
      
              if (name) {
                      e = elevator_get(name, true);
                      if (!e)
                              return -EINVAL;
              }
      
              /*
               * Use the default elevator specified by config boot param or
               * config option.  Don't try to load modules as we could be running
               * off async and request_module() isn't allowed from async.
               */
              if (!e && *chosen_elevator) {
                      e = elevator_get(chosen_elevator, false);
                      if (!e)
                              printk(KERN_ERR "I/O scheduler %s not found\n",
                                                              chosen_elevator);
              }
      
              if (!e) {
                      e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
                      if (!e) {
                              printk(KERN_ERR
                                      "Default I/O scheduler not found. " \
                                      "Using noop.\n");
                              e = elevator_get("noop", false);
                      }
              }
      
              err = e->ops.elevator_init_fn(q, e);
              if (err)
                      elevator_put(e);
              return err;
      }
      EXPORT_SYMBOL(elevator_init);
      
      void elevator_exit(struct elevator_queue *e)
      {
              mutex_lock(&e->sysfs_lock);
              if (e->type->ops.elevator_exit_fn)
                      e->type->ops.elevator_exit_fn(e);
              mutex_unlock(&e->sysfs_lock);
      
              kobject_put(&e->kobj);
      }
      EXPORT_SYMBOL(elevator_exit);
      
      static inline void __elv_rqhash_del(struct request *rq)
      {
 1105         hash_del(&rq->hash);
 1105         rq->cmd_flags &= ~REQ_HASHED;
      }
      
      static void elv_rqhash_del(struct request_queue *q, struct request *rq)
      {
 1106         if (ELV_ON_HASH(rq))
 1103                 __elv_rqhash_del(rq);
      }
      
      static void elv_rqhash_add(struct request_queue *q, struct request *rq)
      {
              struct elevator_queue *e = q->elevator;
      
 1127         BUG_ON(ELV_ON_HASH(rq));
 1127         hash_add(e->hash, &rq->hash, rq_hash_key(rq));
              rq->cmd_flags |= REQ_HASHED;
      }
      
      static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
      {
   13         __elv_rqhash_del(rq);
              elv_rqhash_add(q, rq);
      }
      
      static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
      {
              struct elevator_queue *e = q->elevator;
              struct hlist_node *next;
              struct request *rq;
      
 1189         hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
                      BUG_ON(!ELV_ON_HASH(rq));
      
  163                 if (unlikely(!rq_mergeable(rq))) {
    1                         __elv_rqhash_del(rq);
                              continue;
                      }
      
  163                 if (rq_hash_key(rq) == offset)
                              return rq;
              }
      
 1189         return NULL;
      }
      
      /*
       * RB-tree support functions for inserting/lookup/removal of requests
       * in a sorted RB tree.
       */
      void elv_rb_add(struct rb_root *root, struct request *rq)
      {
 1127         struct rb_node **p = &root->rb_node;
              struct rb_node *parent = NULL;
              struct request *__rq;
      
 1127         while (*p) {
                      parent = *p;
                      __rq = rb_entry(parent, struct request, rb_node);
      
  258                 if (blk_rq_pos(rq) < blk_rq_pos(__rq))
   28                         p = &(*p)->rb_left;
                      else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
  258                         p = &(*p)->rb_right;
              }
      
 1127         rb_link_node(&rq->rb_node, parent, p);
              rb_insert_color(&rq->rb_node, root);
      }
      EXPORT_SYMBOL(elv_rb_add);
      
      void elv_rb_del(struct rb_root *root, struct request *rq)
      {
 1106         BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
 1106         rb_erase(&rq->rb_node, root);
              RB_CLEAR_NODE(&rq->rb_node);
      }
      EXPORT_SYMBOL(elv_rb_del);
      
      struct request *elv_rb_find(struct rb_root *root, sector_t sector)
      {
 1022         struct rb_node *n = root->rb_node;
              struct request *rq;
      
   37         while (n) {
    1                 rq = rb_entry(n, struct request, rb_node);
      
   37                 if (sector < blk_rq_pos(rq))
   20                         n = n->rb_left;
   25                 else if (sector > blk_rq_pos(rq))
   25                         n = n->rb_right;
                      else
                              return rq;
              }
      
 1022         return NULL;
      }
      EXPORT_SYMBOL(elv_rb_find);
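
/*
 * Illustrative sketch (not from this file): an elevator typically keeps its
 * sorted requests in a per-queue rb_root and uses the helpers above for
 * position-ordered insertion and lookup. The structure and function names
 * below are hypothetical.
 */
#if 0	/* sketch only, never compiled */
struct example_elv_data {
	struct rb_root sort_list;
};

static void example_add_request(struct example_elv_data *ed,
				struct request *rq)
{
	elv_rb_add(&ed->sort_list, rq);
}

static struct request *example_find_request(struct example_elv_data *ed,
					     sector_t sector)
{
	return elv_rb_find(&ed->sort_list, sector);
}
#endif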
      
      /*
       * Insert rq into dispatch queue of q.  Queue lock must be held on
 * entry.  rq is sorted into the dispatch queue. To be used by
       * specific elevators.
       */
      void elv_dispatch_sort(struct request_queue *q, struct request *rq)
      {
              sector_t boundary;
              struct list_head *entry;
              int stop_flags;
      
 1106         if (q->last_merge == rq)
 1098                 q->last_merge = NULL;
      
 1106         elv_rqhash_del(q, rq);
      
 1106         q->nr_sorted--;
      
              boundary = q->end_sector;
              stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
              list_for_each_prev(entry, &q->queue_head) {
                      struct request *pos = list_entry_rq(entry);
      
                      if (req_op(rq) != req_op(pos))
                              break;
                      if (rq_data_dir(rq) != rq_data_dir(pos))
                              break;
                      if (pos->cmd_flags & stop_flags)
                              break;
                      if (blk_rq_pos(rq) >= boundary) {
                              if (blk_rq_pos(pos) < boundary)
                                      continue;
                      } else {
                              if (blk_rq_pos(pos) >= boundary)
                                      break;
                      }
                      if (blk_rq_pos(rq) >= blk_rq_pos(pos))
                              break;
              }
      
 1106         list_add(&rq->queuelist, entry);
 1106 }
      EXPORT_SYMBOL(elv_dispatch_sort);
      
      /*
       * Insert rq into dispatch queue of q.  Queue lock must be held on
       * entry.  rq is added to the back of the dispatch queue. To be used by
       * specific elevators.
       */
      void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
      {
              if (q->last_merge == rq)
                      q->last_merge = NULL;
      
              elv_rqhash_del(q, rq);
      
              q->nr_sorted--;
      
              q->end_sector = rq_end_sector(rq);
              q->boundary_rq = rq;
              list_add_tail(&rq->queuelist, &q->queue_head);
      }
      EXPORT_SYMBOL(elv_dispatch_add_tail);
      
      int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
      {
 1189         struct elevator_queue *e = q->elevator;
              struct request *__rq;
              int ret;
      
              /*
               * Levels of merges:
               *         nomerges:  No merges at all attempted
               *         noxmerges: Only simple one-hit cache try
               *         merges:           All merge tries attempted
               */
 1191         if (blk_queue_nomerges(q) || !bio_mergeable(bio))
 1191                 return ELEVATOR_NO_MERGE;
      
              /*
               * First try one-hit cache.
               */
  240         if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
   27                 ret = blk_try_merge(q->last_merge, bio);
                      if (ret != ELEVATOR_NO_MERGE) {
    9                         *req = q->last_merge;
                              return ret;
                      }
              }
      
 1189         if (blk_queue_noxmerges(q))
                      return ELEVATOR_NO_MERGE;
      
              /*
               * See if our hash lookup can find a potential backmerge.
               */
 1189         __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
   64         if (__rq && elv_bio_merge_ok(__rq, bio)) {
    4                 *req = __rq;
                      return ELEVATOR_BACK_MERGE;
              }
      
 1189         if (e->type->ops.elevator_merge_fn)
 1189                 return e->type->ops.elevator_merge_fn(q, req, bio);
      
              return ELEVATOR_NO_MERGE;
      }
      
      /*
       * Attempt to do an insertion back merge. Only check for the case where
       * we can append 'rq' to an existing request, so we can throw 'rq' away
       * afterwards.
       *
       * Returns true if we merged, false otherwise
       */
      static bool elv_attempt_insert_merge(struct request_queue *q,
                                           struct request *rq)
      {
              struct request *__rq;
              bool ret;
      
  677         if (blk_queue_nomerges(q))
                      return false;
      
              /*
               * First try one-hit cache.
               */
  677         if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
                      return true;
      
  677         if (blk_queue_noxmerges(q))
                      return false;
      
              ret = false;
              /*
               * See if our hash lookup can find a potential backmerge.
               */
              while (1) {
  677                 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
   82                 if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
                              break;
      
                      /* The merged request could be merged with others, try again */
                      ret = true;
                      rq = __rq;
              }
      
              return ret;
      }
      
      void elv_merged_request(struct request_queue *q, struct request *rq, int type)
      {
   11         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_merged_fn)
   11                 e->type->ops.elevator_merged_fn(q, rq, type);
      
   11         if (type == ELEVATOR_BACK_MERGE)
    9                 elv_rqhash_reposition(q, rq);
      
   11         q->last_merge = rq;
      }
      
      void elv_merge_requests(struct request_queue *q, struct request *rq,
                                   struct request *next)
      {
              struct elevator_queue *e = q->elevator;
    5         const int next_sorted = next->cmd_flags & REQ_SORTED;
      
              if (next_sorted && e->type->ops.elevator_merge_req_fn)
                      e->type->ops.elevator_merge_req_fn(q, rq, next);
      
    5         elv_rqhash_reposition(q, rq);
      
              if (next_sorted) {
                      elv_rqhash_del(q, next);
                      q->nr_sorted--;
              }
      
    5         q->last_merge = rq;
      }
      
      void elv_bio_merged(struct request_queue *q, struct request *rq,
                              struct bio *bio)
      {
   11         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_bio_merged_fn)
   11                 e->type->ops.elevator_bio_merged_fn(q, rq, bio);
   11 }
      
      #ifdef CONFIG_PM
      static void blk_pm_requeue_request(struct request *rq)
      {
              if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
                      rq->q->nr_pending--;
      }
      
      static void blk_pm_add_request(struct request_queue *q, struct request *rq)
      {
 1156         if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
 1127             (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
                      pm_request_resume(q->dev);
      }
      #else
      static inline void blk_pm_requeue_request(struct request *rq) {}
      static inline void blk_pm_add_request(struct request_queue *q,
                                            struct request *rq)
      {
      }
      #endif
      
      void elv_requeue_request(struct request_queue *q, struct request *rq)
      {
              /*
               * it already went through dequeue, we need to decrement the
               * in_flight count again
               */
              if (blk_account_rq(rq)) {
                      q->in_flight[rq_is_sync(rq)]--;
                      if (rq->cmd_flags & REQ_SORTED)
                              elv_deactivate_rq(q, rq);
              }
      
              rq->cmd_flags &= ~REQ_STARTED;
      
              blk_pm_requeue_request(rq);
      
              __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
      }
      
      void elv_drain_elevator(struct request_queue *q)
      {
              static int printed;
      
              lockdep_assert_held(q->queue_lock);
      
              while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
                      ;
              if (q->nr_sorted && printed++ < 10) {
                      printk(KERN_ERR "%s: forced dispatching is broken "
                             "(nr_sorted=%u), please report this\n",
                             q->elevator->type->elevator_name, q->nr_sorted);
              }
      }
      
      void __elv_add_request(struct request_queue *q, struct request *rq, int where)
      {
 1156         trace_block_rq_insert(q, rq);
      
 1156         blk_pm_add_request(q, rq);
      
 1155         rq->q = q;
      
              if (rq->cmd_flags & REQ_SOFTBARRIER) {
                      /* barriers are scheduling boundary, update end_sector */
                      if (rq->cmd_type == REQ_TYPE_FS) {
                              q->end_sector = rq_end_sector(rq);
                              q->boundary_rq = rq;
                      }
 1155         } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
  472                     (where == ELEVATOR_INSERT_SORT ||
                           where == ELEVATOR_INSERT_SORT_MERGE))
                      where = ELEVATOR_INSERT_BACK;
      
 1155         switch (where) {
              case ELEVATOR_INSERT_REQUEUE:
              case ELEVATOR_INSERT_FRONT:
                      rq->cmd_flags |= REQ_SOFTBARRIER;
                      list_add(&rq->queuelist, &q->queue_head);
                      break;
      
              case ELEVATOR_INSERT_BACK:
                      rq->cmd_flags |= REQ_SOFTBARRIER;
                      elv_drain_elevator(q);
                      list_add_tail(&rq->queuelist, &q->queue_head);
                      /*
                       * We kick the queue here for the following reasons.
                       * - The elevator might have returned NULL previously
                       *   to delay requests and returned them now.  As the
                       *   queue wasn't empty before this request, ll_rw_blk
                       *   won't run the queue on return, resulting in hang.
                       * - Usually, back inserted requests won't be merged
                       *   with anything.  There's no point in delaying queue
                       *   processing.
                       */
                      __blk_run_queue(q);
                      break;
      
              case ELEVATOR_INSERT_SORT_MERGE:
                      /*
                       * If we succeed in merging this request with one in the
                       * queue already, we are done - rq has now been freed,
                       * so no need to do anything further.
                       */
  677                 if (elv_attempt_insert_merge(q, rq))
                              break;
              case ELEVATOR_INSERT_SORT:
 1128                 BUG_ON(rq->cmd_type != REQ_TYPE_FS);
 1128                 rq->cmd_flags |= REQ_SORTED;
                      q->nr_sorted++;
 1128                 if (rq_mergeable(rq)) {
 1127                         elv_rqhash_add(q, rq);
  366                         if (!q->last_merge)
 1122                                 q->last_merge = rq;
                      }
      
                      /*
                       * Some ioscheds (cfq) run q->request_fn directly, so
                       * rq cannot be accessed after calling
                       * elevator_add_req_fn.
                       */
 1127                 q->elevator->type->ops.elevator_add_req_fn(q, rq);
                      break;
      
              case ELEVATOR_INSERT_FLUSH:
  472                 rq->cmd_flags |= REQ_SOFTBARRIER;
                      blk_insert_flush(rq);
                      break;
              default:
                      printk(KERN_ERR "%s: bad insertion point %d\n",
                             __func__, where);
                      BUG();
              }
 1151 }
      EXPORT_SYMBOL(__elv_add_request);
      
      void elv_add_request(struct request_queue *q, struct request *rq, int where)
      {
              unsigned long flags;
      
              spin_lock_irqsave(q->queue_lock, flags);
              __elv_add_request(q, rq, where);
              spin_unlock_irqrestore(q->queue_lock, flags);
      }
      EXPORT_SYMBOL(elv_add_request);
      
      struct request *elv_latter_request(struct request_queue *q, struct request *rq)
      {
    9         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_latter_req_fn)
    9                 return e->type->ops.elevator_latter_req_fn(q, rq);
              return NULL;
      }
      
      struct request *elv_former_request(struct request_queue *q, struct request *rq)
      {
    2         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_former_req_fn)
    2                 return e->type->ops.elevator_former_req_fn(q, rq);
              return NULL;
      }
      
      int elv_set_request(struct request_queue *q, struct request *rq,
                          struct bio *bio, gfp_t gfp_mask)
      {
 1190         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_set_req_fn)
 1190                 return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
              return 0;
      }
      
      void elv_put_request(struct request_queue *q, struct request *rq)
      {
    5         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_put_req_fn)
    5                 e->type->ops.elevator_put_req_fn(rq);
    5 }
      
      int elv_may_queue(struct request_queue *q, int op, int op_flags)
      {
 1217         struct elevator_queue *e = q->elevator;
      
              if (e->type->ops.elevator_may_queue_fn)
 1217                 return e->type->ops.elevator_may_queue_fn(q, op, op_flags);
      
              return ELV_MQUEUE_MAY;
      }
      
      void elv_completed_request(struct request_queue *q, struct request *rq)
      {
              struct elevator_queue *e = q->elevator;
      
              /*
               * request is released from the driver, io must be done
               */
    5         if (blk_account_rq(rq)) {
                      q->in_flight[rq_is_sync(rq)]--;
                      if ((rq->cmd_flags & REQ_SORTED) &&
                          e->type->ops.elevator_completed_req_fn)
                              e->type->ops.elevator_completed_req_fn(q, rq);
              }
    5 }
      
      #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
      
      static ssize_t
      elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
      {
              struct elv_fs_entry *entry = to_elv(attr);
              struct elevator_queue *e;
              ssize_t error;
      
              if (!entry->show)
                      return -EIO;
      
              e = container_of(kobj, struct elevator_queue, kobj);
              mutex_lock(&e->sysfs_lock);
              error = e->type ? entry->show(e, page) : -ENOENT;
              mutex_unlock(&e->sysfs_lock);
              return error;
      }
      
      static ssize_t
      elv_attr_store(struct kobject *kobj, struct attribute *attr,
                     const char *page, size_t length)
      {
              struct elv_fs_entry *entry = to_elv(attr);
              struct elevator_queue *e;
              ssize_t error;
      
              if (!entry->store)
                      return -EIO;
      
              e = container_of(kobj, struct elevator_queue, kobj);
              mutex_lock(&e->sysfs_lock);
              error = e->type ? entry->store(e, page, length) : -ENOENT;
              mutex_unlock(&e->sysfs_lock);
              return error;
      }
      
      static const struct sysfs_ops elv_sysfs_ops = {
              .show        = elv_attr_show,
              .store        = elv_attr_store,
      };
      
      static struct kobj_type elv_ktype = {
              .sysfs_ops        = &elv_sysfs_ops,
              .release        = elevator_release,
      };
      
      int elv_register_queue(struct request_queue *q)
      {
              struct elevator_queue *e = q->elevator;
              int error;
      
              error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
              if (!error) {
                      struct elv_fs_entry *attr = e->type->elevator_attrs;
                      if (attr) {
                              while (attr->attr.name) {
                                      if (sysfs_create_file(&e->kobj, &attr->attr))
                                              break;
                                      attr++;
                              }
                      }
                      kobject_uevent(&e->kobj, KOBJ_ADD);
                      e->registered = 1;
                      if (e->type->ops.elevator_registered_fn)
                              e->type->ops.elevator_registered_fn(q);
              }
              return error;
      }
      EXPORT_SYMBOL(elv_register_queue);
      
      void elv_unregister_queue(struct request_queue *q)
      {
              if (q) {
                      struct elevator_queue *e = q->elevator;
      
                      kobject_uevent(&e->kobj, KOBJ_REMOVE);
                      kobject_del(&e->kobj);
                      e->registered = 0;
              }
      }
      EXPORT_SYMBOL(elv_unregister_queue);
      
      int elv_register(struct elevator_type *e)
      {
              char *def = "";
      
              /* create icq_cache if requested */
              if (e->icq_size) {
                      if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
                          WARN_ON(e->icq_align < __alignof__(struct io_cq)))
                              return -EINVAL;
      
                      snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
                               "%s_io_cq", e->elevator_name);
                      e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
                                                       e->icq_align, 0, NULL);
                      if (!e->icq_cache)
                              return -ENOMEM;
              }
      
              /* register, don't allow duplicate names */
              spin_lock(&elv_list_lock);
              if (elevator_find(e->elevator_name)) {
                      spin_unlock(&elv_list_lock);
                      if (e->icq_cache)
                              kmem_cache_destroy(e->icq_cache);
                      return -EBUSY;
              }
              list_add_tail(&e->list, &elv_list);
              spin_unlock(&elv_list_lock);
      
              /* print pretty message */
              if (!strcmp(e->elevator_name, chosen_elevator) ||
                              (!*chosen_elevator &&
                               !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
                                      def = " (default)";
      
              printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
                                                                      def);
              return 0;
      }
      EXPORT_SYMBOL_GPL(elv_register);
      
      void elv_unregister(struct elevator_type *e)
      {
              /* unregister */
              spin_lock(&elv_list_lock);
              list_del_init(&e->list);
              spin_unlock(&elv_list_lock);
      
              /*
               * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
               * sure all RCU operations are complete before proceeding.
               */
              if (e->icq_cache) {
                      rcu_barrier();
                      kmem_cache_destroy(e->icq_cache);
                      e->icq_cache = NULL;
              }
      }
      EXPORT_SYMBOL_GPL(elv_unregister);
      
      /*
 * Switch to the new_e io scheduler.  Be careful not to introduce deadlocks -
 * we don't free the old io scheduler before we have allocated what we
 * need for the new one.  This way we have a chance of going back to the
 * old one if the new one fails init for some reason.
       */
      static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
      {
              struct elevator_queue *old = q->elevator;
              bool registered = old->registered;
              int err;
      
              /*
               * Turn on BYPASS and drain all requests w/ elevator private data.
               * Block layer doesn't call into a quiesced elevator - all requests
               * are directly put on the dispatch list without elevator data
               * using INSERT_BACK.  All requests have SOFTBARRIER set and no
               * merge happens either.
               */
              blk_queue_bypass_start(q);
      
              /* unregister and clear all auxiliary data of the old elevator */
              if (registered)
                      elv_unregister_queue(q);
      
              spin_lock_irq(q->queue_lock);
              ioc_clear_queue(q);
              spin_unlock_irq(q->queue_lock);
      
              /* allocate, init and register new elevator */
              err = new_e->ops.elevator_init_fn(q, new_e);
              if (err)
                      goto fail_init;
      
              if (registered) {
                      err = elv_register_queue(q);
                      if (err)
                              goto fail_register;
              }
      
              /* done, kill the old one and finish */
              elevator_exit(old);
              blk_queue_bypass_end(q);
      
              blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
      
              return 0;
      
      fail_register:
              elevator_exit(q->elevator);
      fail_init:
              /* switch failed, restore and re-register old elevator */
              q->elevator = old;
              elv_register_queue(q);
              blk_queue_bypass_end(q);
      
              return err;
      }
      
      /*
       * Switch this queue to the given IO scheduler.
       */
      static int __elevator_change(struct request_queue *q, const char *name)
      {
              char elevator_name[ELV_NAME_MAX];
              struct elevator_type *e;
      
              if (!q->elevator)
                      return -ENXIO;
      
              strlcpy(elevator_name, name, sizeof(elevator_name));
              e = elevator_get(strstrip(elevator_name), true);
              if (!e) {
                      printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
                      return -EINVAL;
              }
      
              if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
                      elevator_put(e);
                      return 0;
              }
      
              return elevator_switch(q, e);
      }
      
      int elevator_change(struct request_queue *q, const char *name)
      {
              int ret;
      
              /* Protect q->elevator from elevator_init() */
              mutex_lock(&q->sysfs_lock);
              ret = __elevator_change(q, name);
              mutex_unlock(&q->sysfs_lock);
      
              return ret;
      }
      EXPORT_SYMBOL(elevator_change);
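
/*
 * Illustrative usage sketch (not taken from this file): a block driver
 * that wants its queue to default to the noop scheduler could call,
 * after the queue has been set up:
 *
 *	elevator_change(q, "noop");
 *
 * The name is handled the same way as a write to the queue's sysfs
 * 'scheduler' attribute, since both paths end up in __elevator_change().
 */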
      
      ssize_t elv_iosched_store(struct request_queue *q, const char *name,
                                size_t count)
      {
              int ret;
      
              if (!q->elevator)
                      return count;
      
              ret = __elevator_change(q, name);
              if (!ret)
                      return count;
      
              printk(KERN_ERR "elevator: switch to %s failed\n", name);
              return ret;
      }
      
      ssize_t elv_iosched_show(struct request_queue *q, char *name)
      {
              struct elevator_queue *e = q->elevator;
              struct elevator_type *elv;
              struct elevator_type *__e;
              int len = 0;
      
              if (!q->elevator || !blk_queue_stackable(q))
                      return sprintf(name, "none\n");
      
              elv = e->type;
      
              spin_lock(&elv_list_lock);
              list_for_each_entry(__e, &elv_list, list) {
                      if (!strcmp(elv->elevator_name, __e->elevator_name))
                              len += sprintf(name+len, "[%s] ", elv->elevator_name);
                      else
                              len += sprintf(name+len, "%s ", __e->elevator_name);
              }
              spin_unlock(&elv_list_lock);
      
              len += sprintf(len+name, "\n");
              return len;
      }
      
      struct request *elv_rb_former_request(struct request_queue *q,
                                            struct request *rq)
      {
    2         struct rb_node *rbprev = rb_prev(&rq->rb_node);
      
              if (rbprev)
    2                 return rb_entry_rq(rbprev);
      
              return NULL;
      }
      EXPORT_SYMBOL(elv_rb_former_request);
      
      struct request *elv_rb_latter_request(struct request_queue *q,
                                            struct request *rq)
      {
    9         struct rb_node *rbnext = rb_next(&rq->rb_node);
      
              if (rbnext)
    9                 return rb_entry_rq(rbnext);
      
              return NULL;
      }
      EXPORT_SYMBOL(elv_rb_latter_request);
      /*
       * xsave/xrstor support.
       *
       * Author: Suresh Siddha <suresh.b.siddha@intel.com>
       */
      #include <linux/compat.h>
      #include <linux/cpu.h>
      #include <linux/mman.h>
      #include <linux/pkeys.h>
      
      #include <asm/fpu/api.h>
      #include <asm/fpu/internal.h>
      #include <asm/fpu/signal.h>
      #include <asm/fpu/regset.h>
      #include <asm/fpu/xstate.h>
      
      #include <asm/tlbflush.h>
      
      /*
       * Although we spell it out in here, the Processor Trace
       * xfeature is completely unused.  We use other mechanisms
       * to save/restore PT state in Linux.
       */
      static const char *xfeature_names[] =
      {
              "x87 floating point registers"        ,
              "SSE registers"                        ,
              "AVX registers"                        ,
              "MPX bounds registers"                ,
              "MPX CSR"                        ,
              "AVX-512 opmask"                ,
              "AVX-512 Hi256"                        ,
              "AVX-512 ZMM_Hi256"                ,
              "Processor Trace (unused)"        ,
              "Protection Keys User registers",
              "unknown xstate feature"        ,
      };
      
      /*
       * Mask of xstate features supported by the CPU and the kernel:
       */
      u64 xfeatures_mask __read_mostly;
      
      static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
      static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
      static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
      
      /*
       * The XSAVE area of kernel can be in standard or compacted format;
       * it is always in standard format for user mode. This is the user
       * mode standard format size used for signal and ptrace frames.
       */
      unsigned int fpu_user_xstate_size;
      
      /*
       * Clear all of the X86_FEATURE_* bits that are unavailable
       * when the CPU has no XSAVE support.
       */
      void fpu__xstate_clear_all_cpu_caps(void)
      {
              setup_clear_cpu_cap(X86_FEATURE_XSAVE);
              setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
              setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
              setup_clear_cpu_cap(X86_FEATURE_XSAVES);
              setup_clear_cpu_cap(X86_FEATURE_AVX);
              setup_clear_cpu_cap(X86_FEATURE_AVX2);
              setup_clear_cpu_cap(X86_FEATURE_AVX512F);
              setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
              setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
              setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
              setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
              setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
              setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
              setup_clear_cpu_cap(X86_FEATURE_MPX);
              setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
              setup_clear_cpu_cap(X86_FEATURE_PKU);
              setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
              setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
      }
      
      /*
       * Return whether the system supports a given xfeature.
       *
       * Also return the name of the (most advanced) feature that the caller requested:
       */
      int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
      {
              u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
      
              if (unlikely(feature_name)) {
                      long xfeature_idx, max_idx;
                      u64 xfeatures_print;
                      /*
                 * We use fls() here so that we can print the most advanced
                 * feature that was requested but is missing.  If a driver
                 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM", we'll
                 * print the missing AVX feature - this is the most
                 * informative message for users:
                       */
                      if (xfeatures_missing)
                              xfeatures_print = xfeatures_missing;
                      else
                              xfeatures_print = xfeatures_needed;
      
                      xfeature_idx = fls64(xfeatures_print)-1;
                      max_idx = ARRAY_SIZE(xfeature_names)-1;
                      xfeature_idx = min(xfeature_idx, max_idx);
      
                      *feature_name = xfeature_names[xfeature_idx];
              }
      
              if (xfeatures_missing)
                      return 0;
      
              return 1;
      }
      EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
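
/*
 * Illustrative usage sketch (not called from this file): a driver that
 * needs AVX state could probe for it and report the missing feature by
 * name, along these lines:
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name)) {
 *		pr_info("CPU feature '%s' is not supported\n", name);
 *		return -ENODEV;
 *	}
 */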
      
      static int xfeature_is_supervisor(int xfeature_nr)
      {
              /*
               * We currently do not support supervisor states, but if
               * we did, we could find out like this.
               *
         * SDM says: if state component 'i' is a user state component,
         * ECX[0] returns 0; if state component 'i' is a supervisor
         * state component, ECX[0] returns 1.
               */
              u32 eax, ebx, ecx, edx;
      
              cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
              return !!(ecx & 1);
      }
      
      static int xfeature_is_user(int xfeature_nr)
      {
              return !xfeature_is_supervisor(xfeature_nr);
      }
      
      /*
       * When executing XSAVEOPT (or other optimized XSAVE instructions), if
       * a processor implementation detects that an FPU state component is still
       * (or is again) in its initialized state, it may clear the corresponding
       * bit in the header.xfeatures field, and can skip the writeout of registers
       * to the corresponding memory layout.
       *
 * This means that when the bit is zero, the state component might still contain
 * some previous, non-initialized register state.
 *
 * Before writing xstate information to user-space we sanitize those components,
 * so that the memory layout of a feature is always in the init state when the
 * corresponding header bit is zero. This guarantees that user-space doesn't see
 * stale state in the memory layout during signal handling, debugging etc.
       */
      void fpstate_sanitize_xstate(struct fpu *fpu)
      {
              struct fxregs_state *fx = &fpu->state.fxsave;
              int feature_bit;
              u64 xfeatures;
      
   25         if (!use_xsaveopt())
                      return;
      
   25         xfeatures = fpu->state.xsave.header.xfeatures;
      
              /*
         * None of the feature bits are in the init state, so there is
         * nothing for us to do: the memory layout is already up to date.
               */
              if ((xfeatures & xfeatures_mask) == xfeatures_mask)
                      return;
      
              /*
               * FP is in init state
               */
   25         if (!(xfeatures & XFEATURE_MASK_FP)) {
   21                 fx->cwd = 0x37f;
                      fx->swd = 0;
                      fx->twd = 0;
                      fx->fop = 0;
                      fx->rip = 0;
                      fx->rdp = 0;
                      memset(&fx->st_space[0], 0, 128);
              }
      
              /*
               * SSE is in init state
               */
   25         if (!(xfeatures & XFEATURE_MASK_SSE))
                      memset(&fx->xmm_space[0], 0, 256);
      
              /*
               * First two features are FPU and SSE, which above we handled
               * in a special way already:
               */
              feature_bit = 0x2;
   25         xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
      
              /*
               * Update all the remaining memory layouts according to their
               * standard xstate layout, if their header bit is in the init
               * state:
               */
   25         while (xfeatures) {
   25                 if (xfeatures & 0x1) {
   25                         int offset = xstate_comp_offsets[feature_bit];
                              int size = xstate_sizes[feature_bit];
      
                              memcpy((void *)fx + offset,
                                     (void *)&init_fpstate.xsave + offset,
                                     size);
                      }
      
   25                 xfeatures >>= 1;
                      feature_bit++;
              }
      }
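
/*
 * Example of the above (illustrative): if the last XSAVEOPT found the AVX
 * registers in their init state, bit 2 (XFEATURE_YMM) of header.xfeatures
 * is clear even though XCR0 enables it.  The loop above then copies
 * xstate_sizes[2] bytes from init_fpstate.xsave at xstate_comp_offsets[2]
 * over the possibly stale YMM area, so user-space sees a clean init state.
 */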
      
      /*
       * Enable the extended processor state save/restore feature.
       * Called once per CPU onlining.
       */
      void fpu__init_cpu_xstate(void)
      {
              if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
                      return;
              /*
               * Make it clear that XSAVES supervisor states are not yet
               * implemented should anyone expect it to work by changing
               * bits in XFEATURE_MASK_* macros and XCR0.
               */
              WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
                      "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
      
              xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
      
              cr4_set_bits(X86_CR4_OSXSAVE);
              xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
      }
      
      /*
       * Note that in the future we will likely need a pair of
       * functions here: one for user xstates and the other for
       * system xstates.  For now, they are the same.
       */
      static int xfeature_enabled(enum xfeature xfeature)
      {
              return !!(xfeatures_mask & (1UL << xfeature));
      }
      
      /*
       * Record the offsets and sizes of various xstates contained
       * in the XSAVE state memory layout.
       */
      static void __init setup_xstate_features(void)
      {
              u32 eax, ebx, ecx, edx, i;
        /* start at the beginning of the "extended state" */
              unsigned int last_good_offset = offsetof(struct xregs_state,
                                                       extended_state_area);
              /*
         * The FP and SSE xstates are legacy states. They are always at
         * fixed offsets in the xsave area, in either compacted or
         * standard form.
               */
              xstate_offsets[0] = 0;
              xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
              xstate_offsets[1] = xstate_sizes[0];
              xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
      
              for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
                      if (!xfeature_enabled(i))
                              continue;
      
                      cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
      
                      /*
                 * If an xfeature is a supervisor state, the offset in
                 * EBX is invalid; we leave it as -1.
                       */
                      if (xfeature_is_user(i))
                              xstate_offsets[i] = ebx;
      
                      xstate_sizes[i] = eax;
                      /*
                       * In our xstate size checks, we assume that the
                       * highest-numbered xstate feature has the
                       * highest offset in the buffer.  Ensure it does.
                       */
                      WARN_ONCE(last_good_offset > xstate_offsets[i],
                              "x86/fpu: misordered xstate at %d\n", last_good_offset);
                      last_good_offset = xstate_offsets[i];
              }
      }
      
      static void __init print_xstate_feature(u64 xstate_mask)
      {
              const char *feature_name;
      
              if (cpu_has_xfeatures(xstate_mask, &feature_name))
                      pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
      }
      
      /*
       * Print out all the supported xstate features:
       */
      static void __init print_xstate_features(void)
      {
              print_xstate_feature(XFEATURE_MASK_FP);
              print_xstate_feature(XFEATURE_MASK_SSE);
              print_xstate_feature(XFEATURE_MASK_YMM);
              print_xstate_feature(XFEATURE_MASK_BNDREGS);
              print_xstate_feature(XFEATURE_MASK_BNDCSR);
              print_xstate_feature(XFEATURE_MASK_OPMASK);
              print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
              print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
              print_xstate_feature(XFEATURE_MASK_PKRU);
      }
      
      /*
 * This check is important because it is easy to get XFEATURE_*
 * confused with XFEATURE_MASK_*.
       */
      #define CHECK_XFEATURE(nr) do {                \
              WARN_ON(nr < FIRST_EXTENDED_XFEATURE);        \
              WARN_ON(nr >= XFEATURE_MAX);        \
      } while (0)
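
/*
 * For example (illustrative only): CHECK_XFEATURE(XFEATURE_PKRU) passes
 * the feature number 9 and is fine, while accidentally passing the *mask*
 * XFEATURE_MASK_PKRU (1 << 9 == 0x200) would trip the nr >= XFEATURE_MAX
 * warning above.
 */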
      
      /*
       * We could cache this like xstate_size[], but we only use
       * it here, so it would be a waste of space.
       */
      static int xfeature_is_aligned(int xfeature_nr)
      {
              u32 eax, ebx, ecx, edx;
      
              CHECK_XFEATURE(xfeature_nr);
              cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
              /*
               * The value returned by ECX[1] indicates the alignment
               * of state component 'i' when the compacted format
               * of the extended region of an XSAVE area is used:
               */
              return !!(ecx & 2);
      }
      
      /*
 * This function sets up the offsets and sizes of all extended states in
 * the xsave area. It supports both the standard format and the compacted
 * format of the xsave area.
       */
      static void __init setup_xstate_comp(void)
      {
              unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
              int i;
      
              /*
         * The FP and SSE xstates are legacy states. They are always at
         * fixed offsets in the xsave area, in either compacted or
         * standard form.
               */
              xstate_comp_offsets[0] = 0;
              xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
      
              if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
                      for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
                              if (xfeature_enabled(i)) {
                                      xstate_comp_offsets[i] = xstate_offsets[i];
                                      xstate_comp_sizes[i] = xstate_sizes[i];
                              }
                      }
                      return;
              }
      
              xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
                      FXSAVE_SIZE + XSAVE_HDR_SIZE;
      
              for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
                      if (xfeature_enabled(i))
                              xstate_comp_sizes[i] = xstate_sizes[i];
                      else
                              xstate_comp_sizes[i] = 0;
      
                      if (i > FIRST_EXTENDED_XFEATURE) {
                              xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
                                              + xstate_comp_sizes[i-1];
      
                              if (xfeature_is_aligned(i))
                                      xstate_comp_offsets[i] =
                                              ALIGN(xstate_comp_offsets[i], 64);
                      }
              }
      }
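
/*
 * Worked example (illustrative, assuming XSAVES is available and only
 * YMM and PKRU are enabled beyond FP/SSE): the first extended feature
 * starts at FXSAVE_SIZE + XSAVE_HDR_SIZE = 512 + 64 = 576.  YMM (256
 * bytes) therefore lives at compacted offset 576, and PKRU follows at
 * 576 + 256 = 832, rounded up to the next 64-byte boundary only if
 * CPUID reports ECX[1] set for PKRU.
 */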
      
      /*
       * Print out xstate component offsets and sizes
       */
      static void __init print_xstate_offset_size(void)
      {
              int i;
      
              for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
                      if (!xfeature_enabled(i))
                              continue;
                      pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
                               i, xstate_comp_offsets[i], i, xstate_sizes[i]);
              }
      }
      
      /*
       * setup the xstate image representing the init state
       */
      static void __init setup_init_fpu_buf(void)
      {
              static int on_boot_cpu __initdata = 1;
      
              WARN_ON_FPU(!on_boot_cpu);
              on_boot_cpu = 0;
      
              if (!boot_cpu_has(X86_FEATURE_XSAVE))
                      return;
      
              setup_xstate_features();
              print_xstate_features();
      
              if (boot_cpu_has(X86_FEATURE_XSAVES))
                      init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
      
              /*
         * Init all the feature states with header.xfeatures being 0x0
               */
              copy_kernel_to_xregs_booting(&init_fpstate.xsave);
      
              /*
               * Dump the init state again. This is to identify the init state
         * of any feature which is not represented by all zeros.
               */
              copy_xregs_to_kernel_booting(&init_fpstate.xsave);
      }
      
      static int xfeature_uncompacted_offset(int xfeature_nr)
      {
              u32 eax, ebx, ecx, edx;
      
              /*
               * Only XSAVES supports supervisor states and it uses compacted
               * format. Checking a supervisor state's uncompacted offset is
               * an error.
               */
              if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
                      WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
                      return -1;
              }
      
              CHECK_XFEATURE(xfeature_nr);
              cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
              return ebx;
      }
      
      static int xfeature_size(int xfeature_nr)
      {
              u32 eax, ebx, ecx, edx;
      
              CHECK_XFEATURE(xfeature_nr);
              cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
              return eax;
      }
      
      /*
       * 'XSAVES' implies two different things:
       * 1. saving of supervisor/system state
       * 2. using the compacted format
       *
       * Use this function when dealing with the compacted format so
       * that it is obvious which aspect of 'XSAVES' is being handled
       * by the calling code.
       */
      int using_compacted_format(void)
      {
    7         return boot_cpu_has(X86_FEATURE_XSAVES);
      }
      
      static void __xstate_dump_leaves(void)
      {
              int i;
              u32 eax, ebx, ecx, edx;
              static int should_dump = 1;
      
              if (!should_dump)
                      return;
              should_dump = 0;
              /*
               * Dump out a few leaves past the ones that we support
               * just in case there are some goodies up there
               */
              for (i = 0; i < XFEATURE_MAX + 10; i++) {
                      cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
                      pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                              XSTATE_CPUID, i, eax, ebx, ecx, edx);
              }
      }
      
      #define XSTATE_WARN_ON(x) do {                                                        \
              if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) {        \
                      __xstate_dump_leaves();                                                \
              }                                                                        \
      } while (0)
      
      #define XCHECK_SZ(sz, nr, nr_macro, __struct) do {                        \
              if ((nr == nr_macro) &&                                                \
                  WARN_ONCE(sz != sizeof(__struct),                                \
                      "%s: struct is %zu bytes, cpu state %d bytes\n",        \
                      __stringify(nr_macro), sizeof(__struct), sz)) {                \
                      __xstate_dump_leaves();                                        \
              }                                                                \
      } while (0)
      
      /*
       * We have a C struct for each 'xstate'.  We need to ensure
       * that our software representation matches what the CPU
       * tells us about the state's size.
       */
      static void check_xstate_against_struct(int nr)
      {
              /*
               * Ask the CPU for the size of the state.
               */
              int sz = xfeature_size(nr);
              /*
               * Match each CPU state with the corresponding software
               * structure.
               */
              XCHECK_SZ(sz, nr, XFEATURE_YMM,       struct ymmh_struct);
              XCHECK_SZ(sz, nr, XFEATURE_BNDREGS,   struct mpx_bndreg_state);
              XCHECK_SZ(sz, nr, XFEATURE_BNDCSR,    struct mpx_bndcsr_state);
              XCHECK_SZ(sz, nr, XFEATURE_OPMASK,    struct avx_512_opmask_state);
              XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
              XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM,  struct avx_512_hi16_state);
              XCHECK_SZ(sz, nr, XFEATURE_PKRU,      struct pkru_state);
      
              /*
               * Make *SURE* to add any feature numbers in below if
               * there are "holes" in the xsave state component
               * numbers.
               */
              if ((nr < XFEATURE_YMM) ||
                  (nr >= XFEATURE_MAX) ||
                  (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
                      WARN_ONCE(1, "no structure for xstate: %d\n", nr);
                      XSTATE_WARN_ON(1);
              }
      }
      
      /*
       * This essentially double-checks what the cpu told us about
       * how large the XSAVE buffer needs to be.  We are recalculating
       * it to be safe.
       */
      static void do_extra_xstate_size_checks(void)
      {
              int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
              int i;
      
              for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
                      if (!xfeature_enabled(i))
                              continue;
      
                      check_xstate_against_struct(i);
                      /*
                       * Supervisor state components can be managed only by
                       * XSAVES, which is compacted-format only.
                       */
                      if (!using_compacted_format())
                              XSTATE_WARN_ON(xfeature_is_supervisor(i));
      
                      /* Align from the end of the previous feature */
                      if (xfeature_is_aligned(i))
                              paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
                      /*
                       * The offset of a given state in the non-compacted
                       * format is given to us in a CPUID leaf.  We check
                       * them for being ordered (increasing offsets) in
                       * setup_xstate_features().
                       */
                      if (!using_compacted_format())
                              paranoid_xstate_size = xfeature_uncompacted_offset(i);
                      /*
                       * The compacted-format offset always depends on where
                       * the previous state ended.
                       */
                      paranoid_xstate_size += xfeature_size(i);
              }
              XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
      }
      
      
      /*
       * Get total size of enabled xstates in XCR0/xfeatures_mask.
       *
       * Note the SDM's wording here.  "sub-function 0" only enumerates
       * the size of the *user* states.  If we use it to size a buffer
       * that we use 'XSAVES' on, we could potentially overflow the
       * buffer because 'XSAVES' saves system states too.
       *
       * Note that we do not currently set any bits on IA32_XSS so
       * 'XCR0 | IA32_XSS == XCR0' for now.
       */
      static unsigned int __init get_xsaves_size(void)
      {
              unsigned int eax, ebx, ecx, edx;
              /*
               * - CPUID function 0DH, sub-function 1:
               *    EBX enumerates the size (in bytes) required by
               *    the XSAVES instruction for an XSAVE area
               *    containing all the state components
               *    corresponding to bits currently set in
               *    XCR0 | IA32_XSS.
               */
              cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
              return ebx;
      }
      
      static unsigned int __init get_xsave_size(void)
      {
              unsigned int eax, ebx, ecx, edx;
              /*
               * - CPUID function 0DH, sub-function 0:
               *    EBX enumerates the size (in bytes) required by
               *    the XSAVE instruction for an XSAVE area
               *    containing all the *user* state components
               *    corresponding to bits currently set in XCR0.
               */
              cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
              return ebx;
      }
      
      /*
       * Will the runtime-enumerated 'xstate_size' fit in the init
       * task's statically-allocated buffer?
       */
      static bool is_supported_xstate_size(unsigned int test_xstate_size)
      {
              if (test_xstate_size <= sizeof(union fpregs_state))
                      return true;
      
              pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
                              sizeof(union fpregs_state), test_xstate_size);
              return false;
      }
      
      static int init_xstate_size(void)
      {
              /* Recompute the context size for enabled features: */
              unsigned int possible_xstate_size;
              unsigned int xsave_size;
      
              xsave_size = get_xsave_size();
      
              if (boot_cpu_has(X86_FEATURE_XSAVES))
                      possible_xstate_size = get_xsaves_size();
              else
                      possible_xstate_size = xsave_size;
      
              /* Ensure we have the space to store all enabled: */
              if (!is_supported_xstate_size(possible_xstate_size))
                      return -EINVAL;
      
              /*
         * The size is OK; we are definitely going to use xsave, so
         * make it known to the world that we need more space.
               */
              fpu_kernel_xstate_size = possible_xstate_size;
              do_extra_xstate_size_checks();
      
              /*
               * User space is always in standard format.
               */
              fpu_user_xstate_size = xsave_size;
              return 0;
      }
      
      /*
       * We enabled the XSAVE hardware, but something went wrong and
       * we can not use it.  Disable it.
       */
      static void fpu__init_disable_system_xstate(void)
      {
              xfeatures_mask = 0;
              cr4_clear_bits(X86_CR4_OSXSAVE);
              fpu__xstate_clear_all_cpu_caps();
      }
      
      /*
       * Enable and initialize the xsave feature.
       * Called once per system bootup.
       */
      void __init fpu__init_system_xstate(void)
      {
              unsigned int eax, ebx, ecx, edx;
              static int on_boot_cpu __initdata = 1;
              int err;
      
              WARN_ON_FPU(!on_boot_cpu);
              on_boot_cpu = 0;
      
              if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                      pr_info("x86/fpu: Legacy x87 FPU detected.\n");
                      return;
              }
      
              if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
                      WARN_ON_FPU(1);
                      return;
              }
      
              cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
              xfeatures_mask = eax + ((u64)edx << 32);
      
              if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                      /*
                       * This indicates that something really unexpected happened
                       * with the enumeration.  Disable XSAVE and try to continue
                       * booting without it.  This is too early to BUG().
                       */
                      pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
                      goto out_disable;
              }
      
              xfeatures_mask &= fpu__get_supported_xfeatures_mask();
      
              /* Enable xstate instructions to be able to continue with initialization: */
              fpu__init_cpu_xstate();
              err = init_xstate_size();
              if (err)
                      goto out_disable;
      
              /*
               * Update info used for ptrace frames; use standard-format size and no
               * supervisor xstates:
               */
        update_regset_xstate_info(fpu_user_xstate_size,
                                  xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
      
              fpu__init_prepare_fx_sw_frame();
              setup_init_fpu_buf();
              setup_xstate_comp();
              print_xstate_offset_size();
      
              pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
                      xfeatures_mask,
                      fpu_kernel_xstate_size,
                      boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
              return;
      
      out_disable:
              /* something went wrong, try to boot without any XSAVE support */
              fpu__init_disable_system_xstate();
      }
      
      /*
       * Restore minimal FPU state after suspend:
       */
      void fpu__resume_cpu(void)
      {
              /*
               * Restore XCR0 on xsave capable CPUs:
               */
              if (boot_cpu_has(X86_FEATURE_XSAVE))
                      xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
      }
      
      /*
       * Given an xstate feature mask, calculate where in the xsave
       * buffer the state is.  Callers should ensure that the buffer
       * is valid.
       *
       * Note: does not work for compacted buffers.
       */
      void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
      {
              int feature_nr = fls64(xstate_feature_mask) - 1;
      
              if (!xfeature_enabled(feature_nr)) {
                      WARN_ON_FPU(1);
                      return NULL;
              }
      
              return (void *)xsave + xstate_comp_offsets[feature_nr];
      }
      /*
       * Given the xsave area and a state inside, this function returns the
       * address of the state.
       *
       * This is the API that is called to get xstate address in either
       * standard format or compacted format of xsave area.
       *
       * Note that if there is no data for the field in the xsave buffer
       * this will return NULL.
       *
       * Inputs:
 *        xsave: the thread's storage area for all FPU data
       *        xstate_feature: state which is defined in xsave.h (e.g.
       *        XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
       * Output:
       *        address of the state in the xsave area, or NULL if the
       *        field is not present in the xsave buffer.
       */
      void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
      {
              /*
               * Do we even *have* xsave state?
               */
              if (!boot_cpu_has(X86_FEATURE_XSAVE))
                      return NULL;
      
              /*
               * We should not ever be requesting features that we
         * have not enabled.  Remember that xfeatures_mask is
         * what we write to the XCR0 register.
               */
              WARN_ONCE(!(xfeatures_mask & xstate_feature),
                        "get of unsupported state");
              /*
         * This assumes that the last 'xsave*' instruction
         * requested that 'xstate_feature' be saved.
         * If it did not, we might be seeing an old value
         * of the field in the buffer.
               *
               * This can happen because the last 'xsave' did not
               * request that this feature be saved (unlikely)
               * or because the "init optimization" caused it
               * to not be saved.
               */
              if (!(xsave->header.xfeatures & xstate_feature))
                      return NULL;
      
              return __raw_xsave_addr(xsave, xstate_feature);
      }
      EXPORT_SYMBOL_GPL(get_xsave_addr);
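
/*
 * Illustrative usage sketch (assumes the last XSAVE* actually saved PKRU):
 *
 *	struct pkru_state *pk;
 *
 *	pk = get_xsave_addr(&fpu->state.xsave, XFEATURE_MASK_PKRU);
 *	if (pk)
 *		pr_debug("pkru: %x\n", pk->pkru);
 *
 * A NULL return means the feature is not present in the buffer (e.g. it
 * is still in its init state), not that the call failed.
 */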
      
      /*
       * This wraps up the common operations that need to occur when retrieving
       * data from xsave state.  It first ensures that the current task was
       * using the FPU and retrieves the data in to a buffer.  It then calculates
       * the offset of the requested field in the buffer.
       *
       * This function is safe to call whether the FPU is in use or not.
       *
       * Note that this only works on the current task.
       *
       * Inputs:
       *        @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
       *        XFEATURE_MASK_SSE, etc...)
       * Output:
       *        address of the state in the xsave area or NULL if the state
       *        is not present or is in its 'init state'.
       */
      const void *get_xsave_field_ptr(int xsave_state)
      {
              struct fpu *fpu = &current->thread.fpu;
      
              if (!fpu->fpstate_active)
                      return NULL;
              /*
               * fpu__save() takes the CPU's xstate registers
         * and saves them off to the fpu->state memory buffer.
               */
              fpu__save(fpu);
      
              return get_xsave_addr(&fpu->state.xsave, xsave_state);
      }
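
/*
 * Illustrative usage sketch, mirroring how the MPX code reads the bounds
 * configuration of the current task (NULL means BNDCSR is in its init
 * state or the FPU is not in use):
 *
 *	const struct mpx_bndcsr *bndcsr;
 *
 *	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 *	if (!bndcsr)
 *		return -EINVAL;
 */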
      
      #ifdef CONFIG_ARCH_HAS_PKEYS
      
      #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
      #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
      /*
       * This will go out and modify PKRU register to set the access
       * rights for @pkey to @init_val.
       */
      int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                      unsigned long init_val)
      {
              u32 old_pkru;
              int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
              u32 new_pkru_bits = 0;
      
              /*
               * This check implies XSAVE support.  OSPKE only gets
               * set if we enable XSAVE and we enable PKU in XCR0.
               */
              if (!boot_cpu_has(X86_FEATURE_OSPKE))
                      return -EINVAL;
      
              /* Set the bits we need in PKRU:  */
              if (init_val & PKEY_DISABLE_ACCESS)
                      new_pkru_bits |= PKRU_AD_BIT;
              if (init_val & PKEY_DISABLE_WRITE)
                      new_pkru_bits |= PKRU_WD_BIT;
      
              /* Shift the bits in to the correct place in PKRU for pkey: */
              new_pkru_bits <<= pkey_shift;
      
              /* Get old PKRU and mask off any old bits in place: */
              old_pkru = read_pkru();
              old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
      
              /* Write old part along with new part: */
              write_pkru(old_pkru | new_pkru_bits);
      
              return 0;
      }
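
/*
 * Worked example (illustrative): for pkey 3 with init_val PKEY_DISABLE_WRITE,
 * pkey_shift is 3 * PKRU_BITS_PER_PKEY = 6, so new_pkru_bits becomes
 * PKRU_WD_BIT << 6 = 0x80.  Only bits 6 and 7 of the old PKRU value are
 * cleared before the new bits are OR-ed back in, leaving the access rights
 * of all other pkeys untouched.
 */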
#endif /* CONFIG_ARCH_HAS_PKEYS */
      
      /*
       * This is similar to user_regset_copyout(), but will not add offset to
       * the source data pointer or increment pos, count, kbuf, and ubuf.
       */
      static inline int xstate_copyout(unsigned int pos, unsigned int count,
                                       void *kbuf, void __user *ubuf,
                                       const void *data, const int start_pos,
                                       const int end_pos)
      {
              if ((count == 0) || (pos < start_pos))
                      return 0;
      
              if (end_pos < 0 || pos < end_pos) {
                      unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
      
                      if (kbuf) {
                              memcpy(kbuf + pos, data, copy);
                      } else {
                              if (__copy_to_user(ubuf + pos, data, copy))
                                      return -EFAULT;
                      }
              }
              return 0;
      }
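
/*
 * Worked example (illustrative), matching how the helper is called below
 * with start_pos == 0 and end_pos == the ptrace buffer length: copying a
 * 64-byte state at offset 576 into an 832-byte buffer copies all 64 bytes
 * to offset 576; if the buffer were only 600 bytes long, just
 * min(64, 600 - 576) = 24 bytes would be copied, and an offset at or past
 * the end of the buffer copies nothing.
 */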
      
      /*
       * Convert from kernel XSAVES compacted format to standard format and copy
       * to a ptrace buffer. It supports partial copy but pos always starts from
       * zero. This is called from xstateregs_get() and there we check the CPU
       * has XSAVES.
       */
      int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
                              void __user *ubuf, struct xregs_state *xsave)
      {
              unsigned int offset, size;
              int ret, i;
              struct xstate_header header;
      
              /*
               * Currently copy_regset_to_user() starts from pos 0:
               */
              if (unlikely(pos != 0))
                      return -EFAULT;
      
              /*
               * The destination is a ptrace buffer; we put in only user xstates:
               */
              memset(&header, 0, sizeof(header));
              header.xfeatures = xsave->header.xfeatures;
              header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
      
              /*
               * Copy xregs_state->header:
               */
              offset = offsetof(struct xregs_state, header);
              size = sizeof(header);
      
              ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count);
      
              if (ret)
                      return ret;
      
              for (i = 0; i < XFEATURE_MAX; i++) {
                      /*
                       * Copy only in-use xstates:
                       */
                      if ((header.xfeatures >> i) & 1) {
                              void *src = __raw_xsave_addr(xsave, 1 << i);
      
                              offset = xstate_offsets[i];
                              size = xstate_sizes[i];
      
                              ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count);
      
                              if (ret)
                                      return ret;
      
                              if (offset + size >= count)
                                      break;
                      }
      
              }
      
              /*
               * Fill xsave->i387.sw_reserved value for ptrace frame:
               */
              offset = offsetof(struct fxregs_state, sw_reserved);
              size = sizeof(xstate_fx_sw_bytes);
      
              ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
      
              if (ret)
                      return ret;
      
              return 0;
      }
      
      /*
       * Convert from a ptrace standard-format buffer to kernel XSAVES format
       * and copy to the target thread. This is called from xstateregs_set() and
       * there we check the CPU has XSAVES and a whole standard-sized buffer
       * exists.
       */
      int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
                           struct xregs_state *xsave)
      {
              unsigned int offset, size;
              int i;
              u64 xfeatures;
              u64 allowed_features;
      
              offset = offsetof(struct xregs_state, header);
              size = sizeof(xfeatures);
      
              if (kbuf) {
                      memcpy(&xfeatures, kbuf + offset, size);
              } else {
                      if (__copy_from_user(&xfeatures, ubuf + offset, size))
                              return -EFAULT;
              }
      
              /*
               * Reject if the user sets any disabled or supervisor features:
               */
              allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
      
              if (xfeatures & ~allowed_features)
                      return -EINVAL;
      
              for (i = 0; i < XFEATURE_MAX; i++) {
                      u64 mask = ((u64)1 << i);
      
                      if (xfeatures & mask) {
                              void *dst = __raw_xsave_addr(xsave, 1 << i);
      
                              offset = xstate_offsets[i];
                              size = xstate_sizes[i];
      
                              if (kbuf) {
                                      memcpy(dst, kbuf + offset, size);
                              } else {
                                      if (__copy_from_user(dst, ubuf + offset, size))
                                              return -EFAULT;
                              }
                      }
              }
      
              /*
               * The state that came in from userspace was user-state only.
               * Mask all the user states out of 'xfeatures':
               */
              xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
      
              /*
               * Add back in the features that came in from userspace:
               */
              xsave->header.xfeatures |= xfeatures;
              xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xsave->header.xfeatures;
      
              return 0;
      }
      /*
       * INET                An implementation of the TCP/IP protocol suite for the LINUX
       *                operating system.  INET is implemented using the  BSD Socket
       *                interface as the means of communication with the user level.
       *
       *                IPv4 Forwarding Information Base: policy rules.
       *
       * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
       *                Thomas Graf <tgraf@suug.ch>
       *
       *                This program is free software; you can redistribute it and/or
       *                modify it under the terms of the GNU General Public License
       *                as published by the Free Software Foundation; either version
       *                2 of the License, or (at your option) any later version.
       *
       * Fixes:
       *                Rani Assaf        :        local_rule cannot be deleted
       *                Marc Boucher        :        routing by fwmark
       */
      
      #include <linux/types.h>
      #include <linux/kernel.h>
      #include <linux/netdevice.h>
      #include <linux/netlink.h>
      #include <linux/inetdevice.h>
      #include <linux/init.h>
      #include <linux/list.h>
      #include <linux/rcupdate.h>
      #include <linux/export.h>
      #include <net/ip.h>
      #include <net/route.h>
      #include <net/tcp.h>
      #include <net/ip_fib.h>
      #include <net/fib_rules.h>
      
      struct fib4_rule {
              struct fib_rule                common;
              u8                        dst_len;
              u8                        src_len;
              u8                        tos;
              __be32                        src;
              __be32                        srcmask;
              __be32                        dst;
              __be32                        dstmask;
      #ifdef CONFIG_IP_ROUTE_CLASSID
              u32                        tclassid;
      #endif
      };
      
      int __fib_lookup(struct net *net, struct flowi4 *flp,
                       struct fib_result *res, unsigned int flags)
      {
  318         struct fib_lookup_arg arg = {
                      .result = res,
                      .flags = flags,
              };
              int err;
      
              /* update flow if oif or iif point to device enslaved to l3mdev */
              l3mdev_update_flow(net, flowi4_to_flowi(flp));
      
              err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
      #ifdef CONFIG_IP_ROUTE_CLASSID
              if (arg.rule)
                      res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
              else
                      res->tclassid = 0;
      #endif
      
              if (err == -ESRCH)
                      err = -ENETUNREACH;
      
  290         return err;
      }
      EXPORT_SYMBOL_GPL(__fib_lookup);
      
  300 static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
                                  int flags, struct fib_lookup_arg *arg)
      {
              int err = -EAGAIN;
              struct fib_table *tbl;
              u32 tb_id;
      
  318         switch (rule->action) {
              case FR_ACT_TO_TBL:
                      break;
      
              case FR_ACT_UNREACHABLE:
                      return -ENETUNREACH;
      
              case FR_ACT_PROHIBIT:
                      return -EACCES;
      
              case FR_ACT_BLACKHOLE:
              default:
                      return -EINVAL;
              }
      
  300         rcu_read_lock();
      
  299         tb_id = fib_rule_get_table(rule, arg);
              tbl = fib_get_table(rule->fr_net, tb_id);
              if (tbl)
  296                 err = fib_table_lookup(tbl, &flp->u.ip4,
                                             (struct fib_result *)arg->result,
                                             arg->flags);
      
  282         rcu_read_unlock();
              return err;
      }
      
      static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
      {
  211         struct fib_result *result = (struct fib_result *) arg->result;
              struct net_device *dev = NULL;
      
              if (result->fi)
                      dev = result->fi->fib_dev;
      
              /* do not accept result if the route does
               * not meet the required prefix length
               */
  211         if (result->prefixlen <= rule->suppress_prefixlen)
                      goto suppress_route;
      
              /* do not accept result if the route uses a device
               * belonging to a forbidden interface group
               */
  211         if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
                      goto suppress_route;
      
              return false;
      
      suppress_route:
              if (!(arg->flags & FIB_LOOKUP_NOREF))
                      fib_info_put(result->fi);
              return true;
      }
      
  318 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
      {
              struct fib4_rule *r = (struct fib4_rule *) rule;
              struct flowi4 *fl4 = &fl->u.ip4;
  318         __be32 daddr = fl4->daddr;
  318         __be32 saddr = fl4->saddr;
      
              if (((saddr ^ r->src) & r->srcmask) ||
                  ((daddr ^ r->dst) & r->dstmask))
  318                 return 0;
      
  318         if (r->tos && (r->tos != fl4->flowi4_tos))
                      return 0;
      
              return 1;
      }
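
/*
 * Illustrative example of the mask test above: for a rule with
 * src 192.168.0.0 and src_len 16, srcmask is inet_make_mask(16), i.e.
 * htonl(0xffff0000).  A packet from 192.168.5.7 gives
 * (saddr ^ r->src) == 0.0.5.7, which is zeroed by the mask, so the
 * source check matches; 10.0.0.1 would leave high bits set and fail.
 */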
      
      static struct fib_table *fib_empty_table(struct net *net)
      {
              u32 id;
      
    6         for (id = 1; id <= RT_TABLE_MAX; id++)
    6                 if (!fib_get_table(net, id))
    6                         return fib_new_table(net, id);
              return NULL;
      }
      
      static int call_fib_rule_notifiers(struct net *net,
                                         enum fib_event_type event_type)
      {
              struct fib_notifier_info info;
      
              return call_fib_notifiers(net, event_type, &info);
      }
      
      static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
              FRA_GENERIC_POLICY,
              [FRA_FLOW]        = { .type = NLA_U32 },
      };
      
      static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                                     struct fib_rule_hdr *frh,
                                     struct nlattr **tb)
      {
   36         struct net *net = sock_net(skb->sk);
              int err = -EINVAL;
              struct fib4_rule *rule4 = (struct fib4_rule *) rule;
      
              if (frh->tos & ~IPTOS_TOS_MASK)
                      goto errout;
      
              /* split local/main if they are not already split */
   34         err = fib_unmerge(net);
              if (err)
                      goto errout;
      
   34         if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) {
   29                 if (rule->action == FR_ACT_TO_TBL) {
                              struct fib_table *table;
      
    6                         table = fib_empty_table(net);
                              if (!table) {
                                      err = -ENOBUFS;
                                      goto errout;
                              }
      
    6                         rule->table = table->tb_id;
                      }
              }
      
   34         if (frh->src_len)
    1                 rule4->src = nla_get_in_addr(tb[FRA_SRC]);
      
   33         if (frh->dst_len)
    1                 rule4->dst = nla_get_in_addr(tb[FRA_DST]);
      
      #ifdef CONFIG_IP_ROUTE_CLASSID
              if (tb[FRA_FLOW]) {
                      rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
                      if (rule4->tclassid)
                              net->ipv4.fib_num_tclassid_users++;
              }
      #endif
      
   34         rule4->src_len = frh->src_len;
   34         rule4->srcmask = inet_make_mask(rule4->src_len);
              rule4->dst_len = frh->dst_len;
   34         rule4->dstmask = inet_make_mask(rule4->dst_len);
              rule4->tos = frh->tos;
      
              net->ipv4.fib_has_custom_rules = true;
              call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
      
              err = 0;
      errout:
   36         return err;
      }
      
      static int fib4_rule_delete(struct fib_rule *rule)
      {
    2         struct net *net = rule->fr_net;
              int err;
      
              /* split local/main if they are not already split */
              err = fib_unmerge(net);
              if (err)
                      goto errout;
      
      #ifdef CONFIG_IP_ROUTE_CLASSID
              if (((struct fib4_rule *)rule)->tclassid)
                      net->ipv4.fib_num_tclassid_users--;
      #endif
    2         net->ipv4.fib_has_custom_rules = true;
              call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
      errout:
    2         return err;
      }
      
      static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                                   struct nlattr **tb)
      {
              struct fib4_rule *rule4 = (struct fib4_rule *) rule;
      
   39         if (frh->src_len && (rule4->src_len != frh->src_len))
   39                 return 0;
      
   39         if (frh->dst_len && (rule4->dst_len != frh->dst_len))
                      return 0;
      
   38         if (frh->tos && (rule4->tos != frh->tos))
                      return 0;
      
      #ifdef CONFIG_IP_ROUTE_CLASSID
              if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
                      return 0;
      #endif
      
   37         if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
                      return 0;
      
   35         if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
                      return 0;
      
              return 1;
      }
      
      static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                                struct fib_rule_hdr *frh)
      {
              struct fib4_rule *rule4 = (struct fib4_rule *) rule;
      
   46         frh->dst_len = rule4->dst_len;
              frh->src_len = rule4->src_len;
              frh->tos = rule4->tos;
      
              if ((rule4->dst_len &&
    2              nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
   46             (rule4->src_len &&
    7              nla_put_in_addr(skb, FRA_SRC, rule4->src)))
                      goto nla_put_failure;
      #ifdef CONFIG_IP_ROUTE_CLASSID
              if (rule4->tclassid &&
                  nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
                      goto nla_put_failure;
      #endif
   46         return 0;
      
      nla_put_failure:
              return -ENOBUFS;
      }
      
      static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
      {
              return nla_total_size(4) /* dst */
                     + nla_total_size(4) /* src */
   36                + nla_total_size(4); /* flow */
      }
      
      static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
      {
   36         rt_cache_flush(ops->fro_net);
      }
      
      static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
              .family                = AF_INET,
              .rule_size        = sizeof(struct fib4_rule),
              .addr_size        = sizeof(u32),
              .action                = fib4_rule_action,
              .suppress        = fib4_rule_suppress,
              .match                = fib4_rule_match,
              .configure        = fib4_rule_configure,
              .delete                = fib4_rule_delete,
              .compare        = fib4_rule_compare,
              .fill                = fib4_rule_fill,
              .nlmsg_payload        = fib4_rule_nlmsg_payload,
              .flush_cache        = fib4_rule_flush_cache,
              .nlgroup        = RTNLGRP_IPV4_RULE,
              .policy                = fib4_rule_policy,
              .owner                = THIS_MODULE,
      };
      
      static int fib_default_rules_init(struct fib_rules_ops *ops)
      {
              int err;
      
  116         err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
              if (err < 0)
                      return err;
  116         err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
              if (err < 0)
                      return err;
  116         err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
              if (err < 0)
                      return err;
              return 0;
      }
      
      int __net_init fib4_rules_init(struct net *net)
      {
              int err;
              struct fib_rules_ops *ops;
      
  116         ops = fib_rules_register(&fib4_rules_ops_template, net);
              if (IS_ERR(ops))
                      return PTR_ERR(ops);
      
  116         err = fib_default_rules_init(ops);
              if (err < 0)
                      goto fail;
  116         net->ipv4.rules_ops = ops;
              net->ipv4.fib_has_custom_rules = false;
  116         return 0;
      
      fail:
              /* also cleans all rules already added */
              fib_rules_unregister(ops);
              return err;
      }
      
      void __net_exit fib4_rules_exit(struct net *net)
      {
              fib_rules_unregister(net->ipv4.rules_ops);
      }
      /*
       * HID raw devices, giving access to raw HID events.
       *
       * In comparison to hiddev, this device does not process the
       * hid events at all (no parsing, no lookups). This lets applications
        * work on raw hid events as they want to, and avoids the need to
        * use transport-specific userspace libhid/libusb libraries.
       *
       *  Copyright (c) 2007-2014 Jiri Kosina
       */
      
      /*
       * This program is free software; you can redistribute it and/or modify it
       * under the terms and conditions of the GNU General Public License,
       * version 2, as published by the Free Software Foundation.
       *
       * You should have received a copy of the GNU General Public License along with
       * this program; if not, write to the Free Software Foundation, Inc.,
       * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
       */
      
      #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      
      #include <linux/fs.h>
      #include <linux/module.h>
      #include <linux/errno.h>
      #include <linux/kernel.h>
      #include <linux/init.h>
      #include <linux/cdev.h>
      #include <linux/poll.h>
      #include <linux/device.h>
      #include <linux/major.h>
      #include <linux/slab.h>
      #include <linux/hid.h>
      #include <linux/mutex.h>
      #include <linux/sched.h>
      #include <linux/string.h>
      
      #include <linux/hidraw.h>
      
      static int hidraw_major;
      static struct cdev hidraw_cdev;
      static struct class *hidraw_class;
      static struct hidraw *hidraw_table[HIDRAW_MAX_DEVICES];
      static DEFINE_MUTEX(minors_lock);
      
      static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
      {
              struct hidraw_list *list = file->private_data;
              int ret = 0, len;
              DECLARE_WAITQUEUE(wait, current);
      
              mutex_lock(&list->read_mutex);
      
              while (ret == 0) {
                      if (list->head == list->tail) {
                              add_wait_queue(&list->hidraw->wait, &wait);
                              set_current_state(TASK_INTERRUPTIBLE);
      
                              while (list->head == list->tail) {
                                      if (signal_pending(current)) {
                                              ret = -ERESTARTSYS;
                                              break;
                                      }
                                      if (!list->hidraw->exist) {
                                              ret = -EIO;
                                              break;
                                      }
                                      if (file->f_flags & O_NONBLOCK) {
                                              ret = -EAGAIN;
                                              break;
                                      }
      
                                      /* allow O_NONBLOCK to work well from other threads */
                                      mutex_unlock(&list->read_mutex);
                                      schedule();
                                      mutex_lock(&list->read_mutex);
                                      set_current_state(TASK_INTERRUPTIBLE);
                              }
      
                              set_current_state(TASK_RUNNING);
                              remove_wait_queue(&list->hidraw->wait, &wait);
                      }
      
                      if (ret)
                              goto out;
      
                      len = list->buffer[list->tail].len > count ?
                              count : list->buffer[list->tail].len;
      
                      if (list->buffer[list->tail].value) {
                              if (copy_to_user(buffer, list->buffer[list->tail].value, len)) {
                                      ret = -EFAULT;
                                      goto out;
                              }
                              ret = len;
                      }
      
                      kfree(list->buffer[list->tail].value);
                      list->buffer[list->tail].value = NULL;
                      list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1);
              }
      out:
              mutex_unlock(&list->read_mutex);
              return ret;
      }
      
      /*
       * The first byte of the report buffer is expected to be a report number.
       *
       * This function is to be called with the minors_lock mutex held.
       */
      static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, size_t count, unsigned char report_type)
      {
              unsigned int minor = iminor(file_inode(file));
              struct hid_device *dev;
              __u8 *buf;
              int ret = 0;
      
              if (!hidraw_table[minor] || !hidraw_table[minor]->exist) {
                      ret = -ENODEV;
                      goto out;
              }
      
              dev = hidraw_table[minor]->hid;
      
              if (count > HID_MAX_BUFFER_SIZE) {
                      hid_warn(dev, "pid %d passed too large report\n",
                               task_pid_nr(current));
                      ret = -EINVAL;
                      goto out;
              }
      
              if (count < 2) {
                      hid_warn(dev, "pid %d passed too short report\n",
                               task_pid_nr(current));
                      ret = -EINVAL;
                      goto out;
              }
      
              buf = memdup_user(buffer, count);
              if (IS_ERR(buf)) {
                      ret = PTR_ERR(buf);
                      goto out;
              }
      
              if ((report_type == HID_OUTPUT_REPORT) &&
                  !(dev->quirks & HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP)) {
                      ret = hid_hw_output_report(dev, buf, count);
                      /*
                       * compatibility with old implementation of USB-HID and I2C-HID:
                        * if the device does not support receiving output reports on
                        * an interrupt endpoint, fall back to the SET_REPORT HID command.
                       */
                      if (ret != -ENOSYS)
                              goto out_free;
              }
      
              ret = hid_hw_raw_request(dev, buf[0], buf, count, report_type,
                                      HID_REQ_SET_REPORT);
      
      out_free:
              kfree(buf);
      out:
              return ret;
      }
      
      static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
      {
              ssize_t ret;
              mutex_lock(&minors_lock);
              ret = hidraw_send_report(file, buffer, count, HID_OUTPUT_REPORT);
              mutex_unlock(&minors_lock);
              return ret;
      }
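
       /*
        * Illustrative userspace sketch (the device node name is hypothetical):
        * sending an output report through hidraw.  The first byte is the
        * report number (0x0 for devices that do not use numbered reports)
        * followed by the payload, which is exactly what hidraw_send_report()
        * above expects.
        *
        *	int fd = open("/dev/hidraw0", O_RDWR);
        *	unsigned char buf[5] = { 0x0, 0x01, 0x02, 0x03, 0x04 };
        *
        *	if (fd >= 0 && write(fd, buf, sizeof(buf)) < 0)
        *		perror("hidraw write");
        *
        * write() ends up in hidraw_write() -> hidraw_send_report() with
        * report_type == HID_OUTPUT_REPORT.
        */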
      
      
      /*
       * This function performs a Get_Report transfer over the control endpoint
       * per section 7.2.1 of the HID specification, version 1.1.  The first byte
        * of buffer is the report number to request, or 0x0 if the device does not
       * use numbered reports. The report_type parameter can be HID_FEATURE_REPORT
       * or HID_INPUT_REPORT.
       *
       * This function is to be called with the minors_lock mutex held.
       */
      static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t count, unsigned char report_type)
      {
              unsigned int minor = iminor(file_inode(file));
              struct hid_device *dev;
              __u8 *buf;
              int ret = 0, len;
              unsigned char report_number;
      
              if (!hidraw_table[minor] || !hidraw_table[minor]->exist) {
                      ret = -ENODEV;
                      goto out;
              }
      
              dev = hidraw_table[minor]->hid;
      
              if (!dev->ll_driver->raw_request) {
                      ret = -ENODEV;
                      goto out;
              }
      
              if (count > HID_MAX_BUFFER_SIZE) {
                      printk(KERN_WARNING "hidraw: pid %d passed too large report\n",
                                      task_pid_nr(current));
                      ret = -EINVAL;
                      goto out;
              }
      
              if (count < 2) {
                      printk(KERN_WARNING "hidraw: pid %d passed too short report\n",
                                      task_pid_nr(current));
                      ret = -EINVAL;
                      goto out;
              }
      
              buf = kmalloc(count * sizeof(__u8), GFP_KERNEL);
              if (!buf) {
                      ret = -ENOMEM;
                      goto out;
              }
      
              /*
               * Read the first byte from the user. This is the report number,
               * which is passed to hid_hw_raw_request().
               */
              if (copy_from_user(&report_number, buffer, 1)) {
                      ret = -EFAULT;
                      goto out_free;
              }
      
              ret = hid_hw_raw_request(dev, report_number, buf, count, report_type,
                                       HID_REQ_GET_REPORT);
      
              if (ret < 0)
                      goto out_free;
      
              len = (ret < count) ? ret : count;
      
              if (copy_to_user(buffer, buf, len)) {
                      ret = -EFAULT;
                      goto out_free;
              }
      
              ret = len;
      
      out_free:
              kfree(buf);
      out:
              return ret;
      }
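
       /*
        * Illustrative userspace sketch (buffer size and report number are
        * hypothetical): fetching a feature report with the HIDIOCGFEATURE
        * ioctl, which lands in hidraw_get_report() above with
        * report_type == HID_FEATURE_REPORT.  buf[0] selects the report to
        * request (0x0 for unnumbered reports).
        *
        *	unsigned char buf[64] = { 0x09 };
        *	int res = ioctl(fd, HIDIOCGFEATURE(sizeof(buf)), buf);
        *
        * On success res is the number of report bytes copied back into buf;
        * on failure it is -1 and errno is set.
        */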
      
      static unsigned int hidraw_poll(struct file *file, poll_table *wait)
      {
              struct hidraw_list *list = file->private_data;
      
              poll_wait(file, &list->hidraw->wait, wait);
              if (list->head != list->tail)
                      return POLLIN | POLLRDNORM;
              if (!list->hidraw->exist)
                      return POLLERR | POLLHUP;
              return 0;
      }
      
      static int hidraw_open(struct inode *inode, struct file *file)
      {
              unsigned int minor = iminor(inode);
              struct hidraw *dev;
              struct hidraw_list *list;
              unsigned long flags;
              int err = 0;
      
              if (!(list = kzalloc(sizeof(struct hidraw_list), GFP_KERNEL))) {
                      err = -ENOMEM;
                      goto out;
              }
      
              mutex_lock(&minors_lock);
              if (!hidraw_table[minor] || !hidraw_table[minor]->exist) {
                      err = -ENODEV;
                      goto out_unlock;
              }
      
              dev = hidraw_table[minor];
              if (!dev->open++) {
                      err = hid_hw_power(dev->hid, PM_HINT_FULLON);
                      if (err < 0) {
                              dev->open--;
                              goto out_unlock;
                      }
      
                      err = hid_hw_open(dev->hid);
                      if (err < 0) {
                              hid_hw_power(dev->hid, PM_HINT_NORMAL);
                              dev->open--;
                              goto out_unlock;
                      }
              }
      
              list->hidraw = hidraw_table[minor];
              mutex_init(&list->read_mutex);
              spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags);
              list_add_tail(&list->node, &hidraw_table[minor]->list);
              spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags);
              file->private_data = list;
      out_unlock:
              mutex_unlock(&minors_lock);
      out:
              if (err < 0)
                      kfree(list);
              return err;
      
      }
      
      static int hidraw_fasync(int fd, struct file *file, int on)
      {
              struct hidraw_list *list = file->private_data;
      
              return fasync_helper(fd, file, on, &list->fasync);
      }
      
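       /*
        * drop_ref - drop one reference on a hidraw device, with minors_lock
        * held.  When exists_bit is set (disconnect path) the device node is
        * destroyed and sleeping readers are woken up; otherwise one open file
        * reference is released.  Once the device is gone and the last user has
        * closed it, the minor is freed and the structure released; if the
        * device still exists, the last close powers the hardware back down.
        */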
      static void drop_ref(struct hidraw *hidraw, int exists_bit)
      {
              if (exists_bit) {
                      hidraw->exist = 0;
                      if (hidraw->open) {
                              hid_hw_close(hidraw->hid);
                              wake_up_interruptible(&hidraw->wait);
                      }
                      device_destroy(hidraw_class,
    4                                MKDEV(hidraw_major, hidraw->minor));
              } else {
                      --hidraw->open;
              }
    3         if (!hidraw->open) {
    3                 if (!hidraw->exist) {
    3                         hidraw_table[hidraw->minor] = NULL;
                              kfree(hidraw);
                      } else {
                              /* close device for last reader */
                              hid_hw_power(hidraw->hid, PM_HINT_NORMAL);
                              hid_hw_close(hidraw->hid);
                      }
              }
    3 }
      
      static int hidraw_release(struct inode * inode, struct file * file)
      {
              unsigned int minor = iminor(inode);
              struct hidraw_list *list = file->private_data;
              unsigned long flags;
      
              mutex_lock(&minors_lock);
      
              spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags);
              list_del(&list->node);
              spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags);
              kfree(list);
      
              drop_ref(hidraw_table[minor], 0);
      
              mutex_unlock(&minors_lock);
              return 0;
      }
      
      static long hidraw_ioctl(struct file *file, unsigned int cmd,
                                                              unsigned long arg)
      {
              struct inode *inode = file_inode(file);
              unsigned int minor = iminor(inode);
              long ret = 0;
              struct hidraw *dev;
              void __user *user_arg = (void __user*) arg;
      
              mutex_lock(&minors_lock);
              dev = hidraw_table[minor];
              if (!dev) {
                      ret = -ENODEV;
                      goto out;
              }
      
              switch (cmd) {
                      case HIDIOCGRDESCSIZE:
                              if (put_user(dev->hid->rsize, (int __user *)arg))
                                      ret = -EFAULT;
                              break;
      
                      case HIDIOCGRDESC:
                              {
                                      __u32 len;
      
                                      if (get_user(len, (int __user *)arg))
                                              ret = -EFAULT;
                                      else if (len > HID_MAX_DESCRIPTOR_SIZE - 1)
                                              ret = -EINVAL;
                                      else if (copy_to_user(user_arg + offsetof(
                                              struct hidraw_report_descriptor,
                                              value[0]),
                                              dev->hid->rdesc,
                                              min(dev->hid->rsize, len)))
                                              ret = -EFAULT;
                                      break;
                              }
                      case HIDIOCGRAWINFO:
                              {
                                      struct hidraw_devinfo dinfo;
      
                                      dinfo.bustype = dev->hid->bus;
                                      dinfo.vendor = dev->hid->vendor;
                                      dinfo.product = dev->hid->product;
                                      if (copy_to_user(user_arg, &dinfo, sizeof(dinfo)))
                                              ret = -EFAULT;
                                      break;
                              }
                      default:
                              {
                                      struct hid_device *hid = dev->hid;
                                      if (_IOC_TYPE(cmd) != 'H') {
                                              ret = -EINVAL;
                                              break;
                                      }
      
                                      if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) {
                                              int len = _IOC_SIZE(cmd);
                                              ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT);
                                              break;
                                      }
                                      if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) {
                                              int len = _IOC_SIZE(cmd);
                                              ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT);
                                              break;
                                      }
      
                                      /* Begin Read-only ioctls. */
                                      if (_IOC_DIR(cmd) != _IOC_READ) {
                                              ret = -EINVAL;
                                              break;
                                      }
      
                                      if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) {
                                              int len = strlen(hid->name) + 1;
                                              if (len > _IOC_SIZE(cmd))
                                                      len = _IOC_SIZE(cmd);
                                              ret = copy_to_user(user_arg, hid->name, len) ?
                                                      -EFAULT : len;
                                              break;
                                      }
      
                                      if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) {
                                              int len = strlen(hid->phys) + 1;
                                              if (len > _IOC_SIZE(cmd))
                                                      len = _IOC_SIZE(cmd);
                                              ret = copy_to_user(user_arg, hid->phys, len) ?
                                                      -EFAULT : len;
                                              break;
                                      }
                              }
      
                      ret = -ENOTTY;
              }
      out:
              mutex_unlock(&minors_lock);
              return ret;
      }
      
      static const struct file_operations hidraw_ops = {
              .owner =        THIS_MODULE,
              .read =         hidraw_read,
              .write =        hidraw_write,
              .poll =         hidraw_poll,
              .open =         hidraw_open,
              .release =      hidraw_release,
              .unlocked_ioctl = hidraw_ioctl,
              .fasync =        hidraw_fasync,
      #ifdef CONFIG_COMPAT
              .compat_ioctl   = hidraw_ioctl,
      #endif
              .llseek =        noop_llseek,
      };
      
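       /*
        * hidraw_report_event - queue an incoming report for every reader of
        * this device.  Each open file gets its own copy in its ring buffer
        * (the report is silently dropped for a reader whose buffer is full),
        * then poll/fasync waiters are notified.
        */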
      int hidraw_report_event(struct hid_device *hid, u8 *data, int len)
      {
              struct hidraw *dev = hid->hidraw;
              struct hidraw_list *list;
              int ret = 0;
              unsigned long flags;
      
              spin_lock_irqsave(&dev->list_lock, flags);
              list_for_each_entry(list, &dev->list, node) {
                      int new_head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1);
      
                      if (new_head == list->tail)
                              continue;
      
                      if (!(list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC))) {
                              ret = -ENOMEM;
                              break;
                      }
                      list->buffer[list->head].len = len;
                      list->head = new_head;
                      kill_fasync(&list->fasync, SIGIO, POLL_IN);
              }
              spin_unlock_irqrestore(&dev->list_lock, flags);
      
              wake_up_interruptible(&dev->wait);
              return ret;
      }
      EXPORT_SYMBOL_GPL(hidraw_report_event);
      
      int hidraw_connect(struct hid_device *hid)
      {
              int minor, result;
              struct hidraw *dev;
      
              /* we accept any HID device, all applications */
      
              dev = kzalloc(sizeof(struct hidraw), GFP_KERNEL);
              if (!dev)
                      return -ENOMEM;
      
              result = -EINVAL;
      
              mutex_lock(&minors_lock);
      
              for (minor = 0; minor < HIDRAW_MAX_DEVICES; minor++) {
                      if (hidraw_table[minor])
                              continue;
                      hidraw_table[minor] = dev;
                      result = 0;
                      break;
              }
      
              if (result) {
                      mutex_unlock(&minors_lock);
                      kfree(dev);
                      goto out;
              }
      
              dev->dev = device_create(hidraw_class, &hid->dev, MKDEV(hidraw_major, minor),
                                       NULL, "%s%d", "hidraw", minor);
      
              if (IS_ERR(dev->dev)) {
                      hidraw_table[minor] = NULL;
                      mutex_unlock(&minors_lock);
                      result = PTR_ERR(dev->dev);
                      kfree(dev);
                      goto out;
              }
      
              init_waitqueue_head(&dev->wait);
              spin_lock_init(&dev->list_lock);
              INIT_LIST_HEAD(&dev->list);
      
              dev->hid = hid;
              dev->minor = minor;
      
              dev->exist = 1;
              hid->hidraw = dev;
      
              mutex_unlock(&minors_lock);
      out:
              return result;
      
      }
      EXPORT_SYMBOL_GPL(hidraw_connect);
      
      void hidraw_disconnect(struct hid_device *hid)
      {
    4         struct hidraw *hidraw = hid->hidraw;
      
              mutex_lock(&minors_lock);
      
    4         drop_ref(hidraw, 1);
      
    3         mutex_unlock(&minors_lock);
      }
      EXPORT_SYMBOL_GPL(hidraw_disconnect);
      
      int __init hidraw_init(void)
      {
              int result;
              dev_t dev_id;
      
              result = alloc_chrdev_region(&dev_id, HIDRAW_FIRST_MINOR,
                              HIDRAW_MAX_DEVICES, "hidraw");
              if (result < 0) {
                      pr_warn("can't get major number\n");
                      goto out;
              }
      
              hidraw_major = MAJOR(dev_id);
      
              hidraw_class = class_create(THIS_MODULE, "hidraw");
              if (IS_ERR(hidraw_class)) {
                      result = PTR_ERR(hidraw_class);
                      goto error_cdev;
              }
      
              cdev_init(&hidraw_cdev, &hidraw_ops);
              result = cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES);
              if (result < 0)
                      goto error_class;
      
              printk(KERN_INFO "hidraw: raw HID events driver (C) Jiri Kosina\n");
      out:
              return result;
      
      error_class:
              class_destroy(hidraw_class);
      error_cdev:
              unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES);
              goto out;
      }
      
      void hidraw_exit(void)
      {
              dev_t dev_id = MKDEV(hidraw_major, 0);
      
              cdev_del(&hidraw_cdev);
              class_destroy(hidraw_class);
              unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES);
      
      }
      /*
       *  linux/fs/ext4/ialloc.c
       *
       * Copyright (C) 1992, 1993, 1994, 1995
       * Remy Card (card@masi.ibp.fr)
       * Laboratoire MASI - Institut Blaise Pascal
       * Universite Pierre et Marie Curie (Paris VI)
       *
       *  BSD ufs-inspired inode and directory allocation by
       *  Stephen Tweedie (sct@redhat.com), 1993
       *  Big-endian to little-endian byte-swapping/bitmaps by
       *        David S. Miller (davem@caip.rutgers.edu), 1995
       */
      
      #include <linux/time.h>
      #include <linux/fs.h>
      #include <linux/stat.h>
      #include <linux/string.h>
      #include <linux/quotaops.h>
      #include <linux/buffer_head.h>
      #include <linux/random.h>
      #include <linux/bitops.h>
      #include <linux/blkdev.h>
      #include <asm/byteorder.h>
      
      #include "ext4.h"
      #include "ext4_jbd2.h"
      #include "xattr.h"
      #include "acl.h"
      
      #include <trace/events/ext4.h>
      
      /*
       * ialloc.c contains the inodes allocation and deallocation routines
       */
      
      /*
       * The free inodes are managed by bitmaps.  A file system contains several
       * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
       * block for inodes, N blocks for the inode table and data blocks.
       *
       * The file system contains group descriptors which are located after the
       * super block.  Each descriptor contains the number of the bitmap block and
        * the free blocks count in the group.
       */
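
       /*
        * Worked example (hypothetical but typical geometry): with
        * EXT4_INODES_PER_GROUP(sb) == 8192, inode number 10000 lives in block
        * group (10000 - 1) / 8192 == 1, at bit (10000 - 1) % 8192 == 1807 of
        * that group's inode bitmap -- the same arithmetic ext4_free_inode()
        * uses below to locate the bit to clear.
        */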
      
      /*
       * To avoid calling the atomic setbit hundreds or thousands of times, we only
       * need to use it within a single byte (to ensure we get endianness right).
       * We can use memset for the rest of the bitmap as there are no other users.
       */
      void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
      {
              int i;
      
    4         if (start_bit >= end_bit)
                      return;
      
              ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
              for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
                      ext4_set_bit(i, bitmap);
              if (i < end_bit)
                      memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
      }
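
       /*
        * Worked example (illustrative numbers): ext4_mark_bitmap_end(100, 128,
        * bitmap) sets bits 100..103 individually with ext4_set_bit(), stopping
        * at the next byte boundary (bit 104), and then memset()s bytes 13..15
        * (bits 104..127) to 0xff in a single call.
        */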
      
      void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
      {
              if (uptodate) {
                      set_buffer_uptodate(bh);
                      set_bitmap_uptodate(bh);
              }
              unlock_buffer(bh);
              put_bh(bh);
      }
      
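       /*
        * ext4_validate_inode_bitmap - verify the checksum of a group's inode
        * bitmap the first time the buffer is used.  On a checksum mismatch the
        * group is flagged with EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, its free
        * inodes are subtracted from the global counter, and -EFSBADCRC is
        * returned; otherwise the buffer is marked verified.
        */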
      static int ext4_validate_inode_bitmap(struct super_block *sb,
                                            struct ext4_group_desc *desc,
                                            ext4_group_t block_group,
                                            struct buffer_head *bh)
      {
              ext4_fsblk_t        blk;
 1127         struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
              struct ext4_sb_info *sbi = EXT4_SB(sb);
      
 1127         if (buffer_verified(bh))
                      return 0;
              if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                      return -EFSCORRUPTED;
      
              ext4_lock_group(sb, block_group);
              if (buffer_verified(bh))
                      goto verified;
              blk = ext4_inode_bitmap(sb, desc);
              if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
                                                 EXT4_INODES_PER_GROUP(sb) / 8)) {
                      ext4_unlock_group(sb, block_group);
                      ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
                                 "inode_bitmap = %llu", block_group, blk);
                      grp = ext4_get_group_info(sb, block_group);
                      if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
                              int count;
                              count = ext4_free_inodes_count(sb, desc);
                              percpu_counter_sub(&sbi->s_freeinodes_counter,
                                                 count);
                      }
                      set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
                      return -EFSBADCRC;
              }
              set_buffer_verified(bh);
      verified:
              ext4_unlock_group(sb, block_group);
              return 0;
      }
      
      /*
       * Read the inode allocation bitmap for a given block_group, reading
       * into the specified slot in the superblock's bitmap cache.
       *
        * Return buffer_head of bitmap on success, or an ERR_PTR on failure.
       */
      static struct buffer_head *
      ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
      {
              struct ext4_group_desc *desc;
 1128         struct ext4_sb_info *sbi = EXT4_SB(sb);
              struct buffer_head *bh = NULL;
              ext4_fsblk_t bitmap_blk;
              int err;
      
              desc = ext4_get_group_desc(sb, block_group, NULL);
              if (!desc)
                      return ERR_PTR(-EFSCORRUPTED);
      
 1128         bitmap_blk = ext4_inode_bitmap(sb, desc);
              if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
 1128             (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                      ext4_error(sb, "Invalid inode bitmap blk %llu in "
                                 "block_group %u", bitmap_blk, block_group);
 1127                 return ERR_PTR(-EFSCORRUPTED);
              }
 1128         bh = sb_getblk(sb, bitmap_blk);
              if (unlikely(!bh)) {
                      ext4_error(sb, "Cannot read inode bitmap - "
                                  "block_group = %u, inode_bitmap = %llu",
                                  block_group, bitmap_blk);
                      return ERR_PTR(-EIO);
              }
 1127         if (bitmap_uptodate(bh))
                      goto verify;
      
              lock_buffer(bh);
              if (bitmap_uptodate(bh)) {
                      unlock_buffer(bh);
                      goto verify;
              }
      
              ext4_lock_group(sb, block_group);
              if (ext4_has_group_desc_csum(sb) &&
                  (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
                      if (block_group == 0) {
                              ext4_unlock_group(sb, block_group);
                              unlock_buffer(bh);
                              ext4_error(sb, "Inode bitmap for bg 0 marked "
                                         "uninitialized");
                              err = -EFSCORRUPTED;
                              goto out;
                      }
                      memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
                      ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
                                           sb->s_blocksize * 8, bh->b_data);
                      set_bitmap_uptodate(bh);
                      set_buffer_uptodate(bh);
                      set_buffer_verified(bh);
                      ext4_unlock_group(sb, block_group);
                      unlock_buffer(bh);
                      return bh;
              }
              ext4_unlock_group(sb, block_group);
      
              if (buffer_uptodate(bh)) {
                      /*
                         * if the group is not uninit and bh is uptodate,
                         * the bitmap is also uptodate
                       */
                      set_bitmap_uptodate(bh);
                      unlock_buffer(bh);
                      goto verify;
              }
              /*
               * submit the buffer_head for reading
               */
              trace_ext4_load_inode_bitmap(sb, block_group);
              bh->b_end_io = ext4_end_bitmap_read;
              get_bh(bh);
              submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
              wait_on_buffer(bh);
              if (!buffer_uptodate(bh)) {
                      put_bh(bh);
                      ext4_error(sb, "Cannot read inode bitmap - "
                                 "block_group = %u, inode_bitmap = %llu",
                                 block_group, bitmap_blk);
                      return ERR_PTR(-EIO);
              }
      
      verify:
 1127         err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
              if (err)
                      goto out;
              return bh;
      out:
              put_bh(bh);
              return ERR_PTR(err);
      }
      
      /*
       * NOTE! When we get the inode, we're the only people
       * that have access to it, and as such there are no
       * race conditions we have to worry about. The inode
       * is not on the hash-lists, and it cannot be reached
       * through the filesystem because the directory entry
       * has been deleted earlier.
       *
       * HOWEVER: we must make sure that we get no aliases,
       * which means that we have to call "clear_inode()"
       * _before_ we mark the inode not in use in the inode
       * bitmaps. Otherwise a newly created file might use
       * the same inode number (not actually the same pointer
       * though), and then we'd have two inodes sharing the
        * same inode number and space on the hard disk.
       */
      void ext4_free_inode(handle_t *handle, struct inode *inode)
      {
  198         struct super_block *sb = inode->i_sb;
              int is_directory;
              unsigned long ino;
              struct buffer_head *bitmap_bh = NULL;
              struct buffer_head *bh2;
              ext4_group_t block_group;
              unsigned long bit;
              struct ext4_group_desc *gdp;
              struct ext4_super_block *es;
              struct ext4_sb_info *sbi;
              int fatal = 0, err, count, cleared;
              struct ext4_group_info *grp;
      
              if (!sb) {
                      printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
                             "nonexistent device\n", __func__, __LINE__);
                      return;
              }
  198         if (atomic_read(&inode->i_count) > 1) {
                      ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
                               __func__, __LINE__, inode->i_ino,
                               atomic_read(&inode->i_count));
                      return;
              }
  198         if (inode->i_nlink) {
                      ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
                               __func__, __LINE__, inode->i_ino, inode->i_nlink);
                      return;
              }
  198         sbi = EXT4_SB(sb);
      
              ino = inode->i_ino;
              ext4_debug("freeing inode %lu\n", ino);
  198         trace_ext4_free_inode(inode);
      
              /*
               * Note: we must free any quota before locking the superblock,
               * as writing the quota to disk may need the lock as well.
               */
  198         dquot_initialize(inode);
              ext4_xattr_delete_inode(handle, inode);
              dquot_free_inode(inode);
              dquot_drop(inode);
      
              is_directory = S_ISDIR(inode->i_mode);
      
              /* Do this BEFORE marking the inode not in use or returning an error */
              ext4_clear_inode(inode);
      
  197         es = EXT4_SB(sb)->s_es;
              if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
                      ext4_error(sb, "reserved or nonexistent inode %lu", ino);
                      goto error_return;
              }
  197         block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
              bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
              bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
              /* Don't bother if the inode bitmap is corrupt. */
  196         grp = ext4_get_group_info(sb, block_group);
              if (IS_ERR(bitmap_bh)) {
                      fatal = PTR_ERR(bitmap_bh);
                      bitmap_bh = NULL;
                      goto error_return;
              }
  196         if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
                      fatal = -EFSCORRUPTED;
                      goto error_return;
              }
      
              BUFFER_TRACE(bitmap_bh, "get_write_access");
  196         fatal = ext4_journal_get_write_access(handle, bitmap_bh);
              if (fatal)
                      goto error_return;
      
              fatal = -ESRCH;
  196         gdp = ext4_get_group_desc(sb, block_group, &bh2);
              if (gdp) {
                      BUFFER_TRACE(bh2, "get_write_access");
  196                 fatal = ext4_journal_get_write_access(handle, bh2);
              }
  196         ext4_lock_group(sb, block_group);
  196         cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
  196         if (fatal || !cleared) {
                      ext4_unlock_group(sb, block_group);
                      goto out;
              }
      
              count = ext4_free_inodes_count(sb, gdp) + 1;
              ext4_free_inodes_set(sb, gdp, count);
              if (is_directory) {
   93                 count = ext4_used_dirs_count(sb, gdp) - 1;
                      ext4_used_dirs_set(sb, gdp, count);
                      percpu_counter_dec(&sbi->s_dirs_counter);
              }
              ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
  196                                    EXT4_INODES_PER_GROUP(sb) / 8);
              ext4_group_desc_csum_set(sb, block_group, gdp);
              ext4_unlock_group(sb, block_group);
      
              percpu_counter_inc(&sbi->s_freeinodes_counter);
              if (sbi->s_log_groups_per_flex) {
  196                 ext4_group_t f = ext4_flex_group(sbi, block_group);
      
                      atomic_inc(&sbi->s_flex_groups[f].free_inodes);
                      if (is_directory)
   93                         atomic_dec(&sbi->s_flex_groups[f].used_dirs);
              }
              BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
  196         fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
      out:
              if (cleared) {
                      BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
                      err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                      if (!fatal)
                              fatal = err;
              } else {
                      ext4_error(sb, "bit already cleared for inode %lu", ino);
                      if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
                              int count;
                              count = ext4_free_inodes_count(sb, gdp);
                              percpu_counter_sub(&sbi->s_freeinodes_counter,
                                                 count);
                      }
                      set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
              }
      
      error_return:
  196         brelse(bitmap_bh);
  196         ext4_std_error(sb, fatal);
      }
      
      struct orlov_stats {
              __u64 free_clusters;
              __u32 free_inodes;
              __u32 used_dirs;
      };
      
      /*
       * Helper function for Orlov's allocator; returns critical information
       * for a particular block group or flex_bg.  If flex_size is 1, then g
        * is a block group number; otherwise it is a flex_bg number.
       */
      static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
                                  int flex_size, struct orlov_stats *stats)
      {
              struct ext4_group_desc *desc;
  348         struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
      
  348         if (flex_size > 1) {
                      stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
                      stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
                      stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
  348                 return;
              }
      
              desc = ext4_get_group_desc(sb, g, NULL);
              if (desc) {
                      stats->free_inodes = ext4_free_inodes_count(sb, desc);
                      stats->free_clusters = ext4_free_group_clusters(sb, desc);
                      stats->used_dirs = ext4_used_dirs_count(sb, desc);
              } else {
                      stats->free_inodes = 0;
                      stats->free_clusters = 0;
                      stats->used_dirs = 0;
              }
      }
      
      /*
       * Orlov's allocator for directories.
       *
       * We always try to spread first-level directories.
       *
        * If there are block groups whose free inode and free block counts are
        * both not worse than average, we return the one with the smallest
        * directory count.  Otherwise we simply return a random group.
        *
        * For all other directories the rules are:
        *
        * It's OK to put a directory into a group unless
        * it has too many directories already (max_dirs) or
        * it has too few free inodes left (min_inodes) or
        * it has too few free clusters left (min_clusters).
        * The parent's group is preferred; if it doesn't satisfy these
        * conditions we search cyclically through the rest.  If none
        * of the groups look good we just look for a group with more
        * free inodes than average (starting at parent's group).
       */
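
       /*
        * Illustrative numbers (hypothetical filesystem): with inodes_per_group
        * == 8192, flex_size == 16, ngroups == 64, ndirs == 6400 and avefreei ==
        * 4000, the thresholds computed below are max_dirs = 6400/64 + 8192/16 =
        * 612 and min_inodes = 4000 - 8192*16/4, which is negative and therefore
        * clamped to 1; min_clusters is derived from avefreec the same way.
        */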
      
      static int find_group_orlov(struct super_block *sb, struct inode *parent,
                                  ext4_group_t *group, umode_t mode,
                                  const struct qstr *qstr)
      {
  348         ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
              struct ext4_sb_info *sbi = EXT4_SB(sb);
              ext4_group_t real_ngroups = ext4_get_groups_count(sb);
              int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
              unsigned int freei, avefreei, grp_free;
              ext4_fsblk_t freeb, avefreec;
              unsigned int ndirs;
              int max_dirs, min_inodes;
              ext4_grpblk_t min_clusters;
              ext4_group_t i, grp, g, ngroups;
              struct ext4_group_desc *desc;
              struct orlov_stats stats;
              int flex_size = ext4_flex_bg_size(sbi);
              struct dx_hash_info hinfo;
      
              ngroups = real_ngroups;
              if (flex_size > 1) {
  348                 ngroups = (real_ngroups + flex_size - 1) >>
                              sbi->s_log_groups_per_flex;
                      parent_group >>= sbi->s_log_groups_per_flex;
              }
      
  348         freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
              avefreei = freei / ngroups;
              freeb = EXT4_C2B(sbi,
                      percpu_counter_read_positive(&sbi->s_freeclusters_counter));
              avefreec = freeb;
              do_div(avefreec, ngroups);
              ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
      
              if (S_ISDIR(mode) &&
  348             ((parent == d_inode(sb->s_root)) ||
  347              (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
                      int best_ndir = inodes_per_group;
                      int ret = -1;
      
    2                 if (qstr) {
    2                         hinfo.hash_version = DX_HASH_HALF_MD4;
                              hinfo.seed = sbi->s_hash_seed;
                              ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
                              grp = hinfo.hash;
                      } else
                              grp = prandom_u32();
    2                 parent_group = (unsigned)grp % ngroups;
    2                 for (i = 0; i < ngroups; i++) {
    2                         g = (parent_group + i) % ngroups;
                              get_orlov_stats(sb, g, flex_size, &stats);
                              if (!stats.free_inodes)
                                      continue;
    2                         if (stats.used_dirs >= best_ndir)
                                      continue;
    2                         if (stats.free_inodes < avefreei)
                                      continue;
    1                         if (stats.free_clusters < avefreec)
                                      continue;
                              grp = g;
                              ret = 0;
    1                         best_ndir = stats.used_dirs;
                      }
    2                 if (ret)
                              goto fallback;
              found_flex_bg:
  345                 if (flex_size == 1) {
                              *group = grp;
                              return 0;
                      }
      
                      /*
                       * We pack inodes at the beginning of the flexgroup's
                       * inode tables.  Block allocation decisions will do
                       * something similar, although regular files will
                       * start at 2nd block group of the flexgroup.  See
                       * ext4_ext_find_goal() and ext4_find_near().
                       */
  345                 grp *= flex_size;
  345                 for (i = 0; i < flex_size; i++) {
  345                         if (grp+i >= real_ngroups)
                                      break;
  345                         desc = ext4_get_group_desc(sb, grp+i, NULL);
  345                         if (desc && ext4_free_inodes_count(sb, desc)) {
  345                                 *group = grp+i;
                                      return 0;
                              }
                      }
                      goto fallback;
              }
      
  346         max_dirs = ndirs / ngroups + inodes_per_group / 16;
              min_inodes = avefreei - inodes_per_group*flex_size / 4;
              if (min_inodes < 1)
                      min_inodes = 1;
              min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
      
              /*
               * Start looking in the flex group where we last allocated an
               * inode for this parent directory
               */
              if (EXT4_I(parent)->i_last_alloc_group != ~0) {
                      parent_group = EXT4_I(parent)->i_last_alloc_group;
  325                 if (flex_size > 1)
  325                         parent_group >>= sbi->s_log_groups_per_flex;
              }
      
  346         for (i = 0; i < ngroups; i++) {
  346                 grp = (parent_group + i) % ngroups;
                      get_orlov_stats(sb, grp, flex_size, &stats);
                      if (stats.used_dirs >= max_dirs)
                              continue;
  346                 if (stats.free_inodes < min_inodes)
                              continue;
  346                 if (stats.free_clusters < min_clusters)
                              continue;
                      goto found_flex_bg;
              }
      
      fallback:
              ngroups = real_ngroups;
   17         avefreei = freei / ngroups;
      fallback_retry:
              parent_group = EXT4_I(parent)->i_block_group;
    1         for (i = 0; i < ngroups; i++) {
   17                 grp = (parent_group + i) % ngroups;
                      desc = ext4_get_group_desc(sb, grp, NULL);
                      if (desc) {
   17                         grp_free = ext4_free_inodes_count(sb, desc);
   17                         if (grp_free && grp_free >= avefreei) {
   17                                 *group = grp;
  348                                 return 0;
                              }
                      }
              }
      
              if (avefreei) {
                      /*
                       * The free-inodes counter is approximate, and for really small
                       * filesystems the above test can fail to find any blockgroups
                       */
                      avefreei = 0;
                      goto fallback_retry;
              }
      
              return -1;
      }
      
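       /*
        * find_group_other - pick a block group for a new non-directory inode.
        * The parent's flex group (or plain block group) is tried first; when
        * flex groups are in use the fallback is the Orlov search, otherwise a
        * quadratic hash probe and finally a linear scan for any group with a
        * free inode.
        */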
      static int find_group_other(struct super_block *sb, struct inode *parent,
                                  ext4_group_t *group, umode_t mode)
      {
  721         ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
              ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
              struct ext4_group_desc *desc;
  721         int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
      
              /*
                * Try to place the inode in the same flex group as its
                * parent.  If we can't find space, use the Orlov algorithm to
                * find another flex group, and store that information in the
                * parent directory's inode so that future allocations use
                * that flex group.
               */
              if (flex_size > 1) {
                      int retry = 0;
      
              try_again:
                      parent_group &= ~(flex_size-1);
                      last = parent_group + flex_size;
                      if (last > ngroups)
                              last = ngroups;
  721                 for  (i = parent_group; i < last; i++) {
  721                         desc = ext4_get_group_desc(sb, i, NULL);
  721                         if (desc && ext4_free_inodes_count(sb, desc)) {
  721                                 *group = i;
                                      return 0;
                              }
                      }
                      if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
                              retry = 1;
                              parent_group = EXT4_I(parent)->i_last_alloc_group;
                              goto try_again;
                      }
                      /*
                       * If this didn't work, use the Orlov search algorithm
                       * to find a new flex group; we pass in the mode to
                       * avoid the topdir algorithms.
                       */
                      *group = parent_group + flex_size;
                      if (*group > ngroups)
                              *group = 0;
                      return find_group_orlov(sb, parent, group, mode, NULL);
              }
      
              /*
               * Try to place the inode in its parent directory
               */
              *group = parent_group;
              desc = ext4_get_group_desc(sb, *group, NULL);
              if (desc && ext4_free_inodes_count(sb, desc) &&
                  ext4_free_group_clusters(sb, desc))
                      return 0;
      
              /*
               * We're going to place this inode in a different blockgroup from its
               * parent.  We want to cause files in a common directory to all land in
               * the same blockgroup.  But we want files which are in a different
               * directory which shares a blockgroup with our parent to land in a
               * different blockgroup.
               *
               * So add our directory's i_ino into the starting point for the hash.
               */
              *group = (*group + parent->i_ino) % ngroups;
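               /*
                * For example (illustrative values only): with ngroups == 128,
                * a parent in group 10 and parent->i_ino == 1234, the starting
                * group becomes (10 + 1234) % 128 == 92, so directories that
                * share a block group but have different inode numbers fan out
                * to different starting points.
                */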
      
              /*
               * Use a quadratic hash to find a group with a free inode and some free
               * blocks.
               */
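               /*
                * The doubling step below probes offsets 1, 3, 7, 15, ... from
                * the hashed starting group (the running sum of i), wrapping
                * modulo ngroups, and gives up after roughly log2(ngroups)
                * probes.
                */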
              for (i = 1; i < ngroups; i <<= 1) {
                      *group += i;
                      if (*group >= ngroups)
                              *group -= ngroups;
                      desc = ext4_get_group_desc(sb, *group, NULL);
                      if (desc && ext4_free_inodes_count(sb, desc) &&
                          ext4_free_group_clusters(sb, desc))
                              return 0;
              }
      
              /*
               * That failed: try linear search for a free inode, even if that group
               * has no free blocks.
               */
              *group = parent_group;
              for (i = 0; i < ngroups; i++) {
                      if (++*group >= ngroups)
                              *group = 0;
                      desc = ext4_get_group_desc(sb, *group, NULL);
                      if (desc && ext4_free_inodes_count(sb, desc))
                              return 0;
              }
      
              return -1;
      }
      
      /*
        * In no-journal mode, if an inode has recently been deleted, we want
       * to avoid reusing it until we're reasonably sure the inode table
       * block has been written back to disk.  (Yes, these values are
       * somewhat arbitrary...)
       */
      #define RECENTCY_MIN        5
      #define RECENTCY_DIRTY        30
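       /*
        * For example, on a no-journal filesystem an inode whose dtime is only
        * 3 seconds old is skipped while its inode-table buffer is clean
        * (5-second window); if the buffer is still dirty the window grows to
        * 5 + 30 == 35 seconds.  (Illustrative timing; see recently_deleted()
        * below.)
        */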
      
      static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
      {
              struct ext4_group_desc        *gdp;
              struct ext4_inode        *raw_inode;
              struct buffer_head        *bh;
              unsigned long                dtime, now;
              int        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
              int        offset, ret = 0, recentcy = RECENTCY_MIN;
      
              gdp = ext4_get_group_desc(sb, group, NULL);
              if (unlikely(!gdp))
                      return 0;
      
 1037         bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
                             (ino / inodes_per_block));
 1037         if (unlikely(!bh) || !buffer_uptodate(bh))
                      /*
                       * If the block is not in the buffer cache, then it
                       * must have been written out.
                       */
                      goto out;
      
 1037         offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
              raw_inode = (struct ext4_inode *) (bh->b_data + offset);
              dtime = le32_to_cpu(raw_inode->i_dtime);
              now = get_seconds();
              if (buffer_dirty(bh))
                      recentcy += RECENTCY_DIRTY;
      
 1037         if (dtime && (dtime < now) && (now < dtime + recentcy))
                      ret = 1;
      out:
  960         brelse(bh);
              return ret;
      }
      
      /*
       * There are two policies for allocating an inode.  If the new inode is
       * a directory, then a forward search is made for a block group with both
        * free space and a low directory-to-inode ratio; if that fails, the
        * group with above-average free space that already contains the
        * fewest directories is chosen.
       *
       * For other inodes, search forward from the parent directory's block
       * group to find a free inode.
       */
      struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
                                     umode_t mode, const struct qstr *qstr,
                                     __u32 goal, uid_t *owner, int handle_type,
                                     unsigned int line_no, int nblocks)
      {
              struct super_block *sb;
              struct buffer_head *inode_bitmap_bh = NULL;
              struct buffer_head *group_desc_bh;
 1035         ext4_group_t ngroups, group = 0;
              unsigned long ino = 0;
              struct inode *inode;
              struct ext4_group_desc *gdp = NULL;
              struct ext4_inode_info *ei;
              struct ext4_sb_info *sbi;
              int ret2, err;
              struct inode *ret;
              ext4_group_t i;
              ext4_group_t flex_group;
              struct ext4_group_info *grp;
              int encrypt = 0;
      
              /* Cannot create files in a deleted directory */
 1035         if (!dir || !dir->i_nlink)
 1002                 return ERR_PTR(-EPERM);
      
 1035         if ((IS_ENCRYPTED(dir) ||
                   DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
                   (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
                      err = fscrypt_get_encryption_info(dir);
                      if (err)
                              return ERR_PTR(err);
                      if (!fscrypt_has_encryption_key(dir))
                              return ERR_PTR(-ENOKEY);
                      if (!handle)
                              nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
                      encrypt = 1;
              }
      
 1035         sb = dir->i_sb;
              ngroups = ext4_get_groups_count(sb);
 1035         trace_ext4_request_inode(dir, mode);
 1035         inode = new_inode(sb);
              if (!inode)
                      return ERR_PTR(-ENOMEM);
 1035         ei = EXT4_I(inode);
 1035         sbi = EXT4_SB(sb);
      
              /*
               * Initialize owners and quota early so that we don't have to account
                * for the quota-initialization worst case in the standard
                * inode-creation transaction.
               */
              if (owner) {
   22                 inode->i_mode = mode;
                      i_uid_write(inode, owner[0]);
                      i_gid_write(inode, owner[1]);
 1025         } else if (test_opt(sb, GRPID)) {
                      inode->i_mode = mode;
                      inode->i_uid = current_fsuid();
                      inode->i_gid = dir->i_gid;
              } else
 1025                 inode_init_owner(inode, dir, mode);
      
 1035         if (ext4_has_feature_project(sb) &&
                  ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
                      ei->i_projid = EXT4_I(dir)->i_projid;
              else
                      ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
      
 1035         err = dquot_initialize(inode);
              if (err)
                      goto out;
      
 1035         if (!goal)
 1025                 goal = sbi->s_inode_goal;
      
   22         if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
   22                 group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
                      ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
                      ret2 = 0;
                      goto got_group;
              }
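               /*
                * Illustrative arithmetic for the goal mapping above, assuming
                * EXT4_INODES_PER_GROUP(sb) == 8192: a goal of 10000 maps to
                * group (10000 - 1) / 8192 == 1 and ino (10000 - 1) % 8192 ==
                * 1807, so the bitmap search starts at that slot in group 1.
                */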
      
 1025         if (S_ISDIR(mode))
  348                 ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
              else
  721                 ret2 = find_group_other(sb, dir, &group, mode);
      
      got_group:
  348         EXT4_I(dir)->i_last_alloc_group = group;
              err = -ENOSPC;
              if (ret2 == -1)
                      goto out;
      
              /*
               * Normally we will only go through one pass of this loop,
               * unless we get unlucky and it turns out the group we selected
               * had its last inode grabbed by someone else.
               */
 1035         for (i = 0; i < ngroups; i++, ino = 0) {
                      err = -EIO;
      
 1035                 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                      if (!gdp)
                              goto out;
      
                      /*
                       * Check free inodes count before loading bitmap.
                       */
 1035                 if (ext4_free_inodes_count(sb, gdp) == 0) {
                              if (++group == ngroups)
                                      group = 0;
                              continue;
                      }
      
 1035                 grp = ext4_get_group_info(sb, group);
                      /* Skip groups with already-known suspicious inode tables */
                      if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
                              if (++group == ngroups)
                                      group = 0;
                              continue;
                      }
      
 1035                 brelse(inode_bitmap_bh);
 1035                 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                      /* Skip groups with suspicious inode tables */
 1035                 if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
 1035                     IS_ERR(inode_bitmap_bh)) {
                              inode_bitmap_bh = NULL;
                              if (++group == ngroups)
                                      group = 0;
                              continue;
                      }
      
      repeat_in_this_group:
                      ino = ext4_find_next_zero_bit((unsigned long *)
 1037                                               inode_bitmap_bh->b_data,
                                                    EXT4_INODES_PER_GROUP(sb), ino);
                      if (ino >= EXT4_INODES_PER_GROUP(sb))
                              goto next_group;
 1037                 if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
                              ext4_error(sb, "reserved inode found cleared - "
                                         "inode=%lu", ino + 1);
                              continue;
                      }
 1037                 if ((EXT4_SB(sb)->s_journal == NULL) &&
 1037                     recently_deleted(sb, group, ino)) {
 1030                         ino++;
                              goto next_inode;
                      }
 1009                 if (!handle) {
  999                         BUG_ON(nblocks <= 0);
  999                         handle = __ext4_journal_start_sb(dir->i_sb, line_no,
                                                               handle_type, nblocks,
                                                               0);
                              if (IS_ERR(handle)) {
                                      err = PTR_ERR(handle);
                                      ext4_std_error(sb, err);
                                      goto out;
                              }
                      }
                      BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 1009                 err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
                      if (err) {
                              ext4_std_error(sb, err);
                              goto out;
                      }
 1009                 ext4_lock_group(sb, group);
 1009                 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
                      ext4_unlock_group(sb, group);
                      ino++;                /* the inode bitmap is zero-based */
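                       /*
                        * e.g. finding free bit 11 yields ino == 12 here; the
                        * group's base offset is folded in later in this
                        * function when the final inode number is formed.
                        */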
                      if (!ret2)
                              goto got; /* we grabbed the inode! */
      next_inode:
 1030                 if (ino < EXT4_INODES_PER_GROUP(sb))
                              goto repeat_in_this_group;
      next_group:
                      if (++group == ngroups)
                              group = 0;
              }
              err = -ENOSPC;
              goto out;
      
      got:
              BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
 1009         err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
              if (err) {
                      ext4_std_error(sb, err);
                      goto out;
              }
      
              BUFFER_TRACE(group_desc_bh, "get_write_access");
 1009         err = ext4_journal_get_write_access(handle, group_desc_bh);
              if (err) {
                      ext4_std_error(sb, err);
                      goto out;
              }
      
              /* We may have to initialize the block bitmap if it isn't already */
 1009         if (ext4_has_group_desc_csum(sb) &&
 1009             gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                      struct buffer_head *block_bitmap_bh;
      
                      block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                      if (IS_ERR(block_bitmap_bh)) {
                              err = PTR_ERR(block_bitmap_bh);
                              goto out;
                      }
                      BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
                      err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                      if (err) {
                              brelse(block_bitmap_bh);
                              ext4_std_error(sb, err);
                              goto out;
                      }