// SPDX-License-Identifier: GPL-2.0 /* * Functions related to io context handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/sched/task.h> #include "blk.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ void get_io_context(struct io_context *ioc) { 35 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 35 atomic_long_inc(&ioc->refcount); } EXPORT_SYMBOL(get_io_context); static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); kmem_cache_free(icq->__rcu_icq_cache, icq); } /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; if (icq->flags & ICQ_EXITED) return; if (et->uses_mq && et->ops.mq.exit_icq) et->ops.mq.exit_icq(icq); else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) et->ops.sq.elevator_exit_icq_fn(icq); icq->flags |= ICQ_EXITED; } /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); list_del_init(&icq->q_node); /* * Both setting lookup hint to and clearing it from @icq are done * under queue_lock. If it's not pointing to @icq now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } /* * Slow path for ioc release in put_io_context(). Performs double-lock * dancing to unlink all icq's and then frees ioc. */ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); unsigned long flags; /* * Exiting icq may call into put_io_context() through elevator * which will trigger lockdep warning. The ioc's are guaranteed to * be different, use a different locking subclass here. Use * irqsave variant as there's no spin_lock_irq_nested(). */ spin_lock_irqsave_nested(&ioc->lock, flags, 1); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); struct request_queue *q = icq->q; if (spin_trylock(q->queue_lock)) { ioc_destroy_icq(icq); spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); cpu_relax(); spin_lock_irqsave_nested(&ioc->lock, flags, 1); } } spin_unlock_irqrestore(&ioc->lock, flags); kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context - put a reference of io_context * @ioc: io_context to put * * Decrement reference count of @ioc and release it if the count reaches * zero. 
*/ void put_io_context(struct io_context *ioc) { unsigned long flags; bool free_ioc = false; 36 if (ioc == NULL) return; 36 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); /* * Releasing ioc requires reverse order double locking and we may * already be holding a queue_lock. Do it asynchronously from wq. */ 36 if (atomic_long_dec_and_test(&ioc->refcount)) { 2 spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) queue_work(system_power_efficient_wq, &ioc->release_work); else free_ioc = true; 2 spin_unlock_irqrestore(&ioc->lock, flags); } if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL(put_io_context); /** * put_io_context_active - put active reference on ioc * @ioc: ioc of interest * * Undo get_io_context_active(). If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. */ 2 void put_io_context_active(struct io_context *ioc) { struct elevator_type *et; unsigned long flags; struct io_cq *icq; 3 if (!atomic_dec_and_test(&ioc->active_ref)) { 3 put_io_context(ioc); return; } /* * Need ioc lock to walk icq_list and q lock to exit icq. Perform * reverse double locking. Read comment in ioc_release_fn() for * explanation on the nested locking annotation. */ retry: 2 spin_lock_irqsave_nested(&ioc->lock, flags, 1); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; et = icq->q->elevator->type; if (et->uses_mq) { ioc_exit_icq(icq); } else { if (spin_trylock(icq->q->queue_lock)) { ioc_exit_icq(icq); spin_unlock(icq->q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); cpu_relax(); goto retry; } } } 2 spin_unlock_irqrestore(&ioc->lock, flags); put_io_context(ioc); } /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; 3 task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); atomic_dec(&ioc->nr_tasks); put_io_context_active(ioc); } static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } } /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { LIST_HEAD(icq_list); spin_lock_irq(q->queue_lock); list_splice_init(&q->icq_list, &icq_list); if (q->mq_ops) { spin_unlock_irq(q->queue_lock); __ioc_clear_queue(&icq_list); } else { __ioc_clear_queue(&icq_list); spin_unlock_irq(q->queue_lock); } } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; int ret; 1818 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) return -ENOMEM; /* initialize */ 1814 atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); /* * Try to install. ioc shouldn't be installed if someone else * already did or @task, which isn't %current, is exiting. Note * that we need to allow ioc creation on exiting %current as exit * path may issue IOs from e.g. exit_files(). 
The exit path is * responsible for not issuing IO after exit_io_context(). */ task_lock(task); if (!task->io_context && 1813 (task == current || !(task->flags & PF_EXITING))) 1811 task->io_context = ioc; else 8 kmem_cache_free(iocontext_cachep, ioc); ret = task->io_context ? 0 : -EBUSY; 1813 task_unlock(task); 1812 return ret; } /** * get_task_io_context - get io_context of a task * @task: task of interest * @gfp_flags: allocation flags, used if allocation is necessary * @node: allocation node, used if allocation is necessary * * Return io_context of @task. If it doesn't exist, it is created with * @gfp_flags and @node. The returned io_context has its reference count * incremented. * * This function always goes through task_lock() and it's better to use * %current->io_context + get_io_context() for %current. */ struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; 37 might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { 37 task_lock(task); ioc = task->io_context; if (likely(ioc)) { 35 get_io_context(ioc); task_unlock(task); 35 return ioc; } 35 task_unlock(task); } while (!create_task_io_context(task, gfp_flags, node)); return NULL; } EXPORT_SYMBOL(get_task_io_context); /** * ioc_lookup_icq - lookup io_cq from ioc * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) { struct io_cq *icq; lockdep_assert_held(q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, * both of which are protected with RCU. All removals are done * holding both q and ioc locks, and we're holding q lock - if we * find a icq which points to us, it's guaranteed to be valid. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); if (icq && icq->q == q) goto out; icq = radix_tree_lookup(&ioc->icq_tree, q->id); if (icq && icq->q == q) rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ else icq = NULL; out: rcu_read_unlock(); return icq; } EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq * @ioc: io_context of interest * @q: request_queue of interest * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. * * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, q->node); if (!icq) return NULL; if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } icq->ioc = ioc; icq->q = q; INIT_LIST_HEAD(&icq->q_node); INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ spin_lock_irq(q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); if (et->uses_mq && et->ops.mq.init_icq) et->ops.mq.init_icq(icq); else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) et->ops.sq.elevator_init_icq_fn(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } spin_unlock(&ioc->lock); spin_unlock_irq(q->queue_lock); radix_tree_preload_end(); return icq; } static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); return 0; } subsys_initcall(blk_ioc_init);
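A minimal usage sketch for the io_context reference API above, assuming a block-layer caller; the function name example_touch_ioc is hypothetical and only illustrates the get/put pairing documented in get_task_io_context() and put_io_context().

#include <linux/iocontext.h>
#include <linux/gfp.h>
#include <linux/numa.h>
#include <linux/sched.h>

static void example_touch_ioc(struct task_struct *task)
{
	struct io_context *ioc;

	/* allocates the io_context on first use, returns it with a reference held */
	ioc = get_task_io_context(task, GFP_KERNEL, NUMA_NO_NODE);
	if (!ioc)
		return;

	/* ... inspect or use @ioc here ... */

	/* drops the reference; icq teardown may be deferred to a workqueue */
	put_io_context(ioc);
}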
/*
 *  Generic Timer-queue
 *
 *  Manages a simple queue of timers, ordered by expiration time.
 *  Uses rbtrees for quick list adds and expiration.
 *
 *  NOTE: All of the following functions need to be serialized
 *  to avoid races. No locking is done by this library code.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>

/**
 * timerqueue_add - Adds timer to timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be added
 *
 * Adds the timer node to the timerqueue, sorted by the
 * node's expires value.
 */
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
	struct rb_node **p = &head->head.rb_node;
	struct rb_node *parent = NULL;
	struct timerqueue_node *ptr;

	/* Make sure we don't add nodes that are already added */
	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));

	while (*p) {
		parent = *p;
		ptr = rb_entry(parent, struct timerqueue_node, node);
		if (node->expires < ptr->expires)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}
	rb_link_node(&node->node, parent, p);
	rb_insert_color(&node->node, &head->head);

	if (!head->next || node->expires < head->next->expires) {
		head->next = node;
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(timerqueue_add);

/**
 * timerqueue_del - Removes a timer from the timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be removed
 *
 * Removes the timer node from the timerqueue.
 */
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));

	/* update next pointer */
	if (head->next == node) {
		struct rb_node *rbn = rb_next(&node->node);

		head->next = rb_entry_safe(rbn,
					   struct timerqueue_node, node);
	}
	rb_erase(&node->node, &head->head);
	RB_CLEAR_NODE(&node->node);
	return head->next != NULL;
}
EXPORT_SYMBOL_GPL(timerqueue_del);

/**
 * timerqueue_iterate_next - Returns the timer after the provided timer
 *
 * @node: Pointer to a timer.
 *
 * Provides the timer that is after the given node. This is used, when
 * necessary, to iterate through the list of timers in a timer list
 * without modifying the list.
 */
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
	struct rb_node *next;

	if (!node)
		return NULL;
	next = rb_next(&node->node);
	if (!next)
		return NULL;
	return container_of(next, struct timerqueue_node, node);
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
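A short usage sketch of the timerqueue API: a caller keeps deadlines ordered by expiry and asks for the earliest one. The my_deadline structure and helpers are invented for illustration; only the timerqueue_* calls are real, and the caller is responsible for its own locking and for calling timerqueue_init_head() once before use.

#include <linux/timerqueue.h>
#include <linux/ktime.h>
#include <linux/kernel.h>

struct my_deadline {
	struct timerqueue_node node;
	int id;
};

static struct timerqueue_head my_queue;	/* timerqueue_init_head(&my_queue) at init time */

static void my_deadline_add(struct my_deadline *d, ktime_t expires)
{
	timerqueue_init(&d->node);
	d->node.expires = expires;
	/* returns true if this node became the new earliest deadline */
	if (timerqueue_add(&my_queue, &d->node))
		pr_debug("deadline %d is now next to expire\n", d->id);
}

static struct my_deadline *my_next_deadline(void)
{
	struct timerqueue_node *next = timerqueue_getnext(&my_queue);

	return next ? container_of(next, struct my_deadline, node) : NULL;
}

static void my_deadline_del(struct my_deadline *d)
{
	timerqueue_del(&my_queue, &d->node);
}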
// SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/tracehook.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: send the notification if true * * Queue @work for task_work_run() below and notify the @task if @notify. * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task returns from kernel * mode or exits. * * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * * Note: there is no ordering guarantee on works queued here. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { struct callback_head *head; do { 2359 head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) 2359 return -ESRCH; 2359 work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); 2359 if (notify) 2359 set_notify_resume(task); return 0; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @func: identifies the work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; 5 if (likely(!task->task_works)) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { 2306 struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ 2306 raw_spin_lock_irq(&task->pi_lock); do { 2306 work = READ_ONCE(task->task_works); 2045 head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; 2306 } while (cmpxchg(&task->task_works, work, head) != work); 2306 raw_spin_unlock_irq(&task->pi_lock); if (!work) break; do { 2305 next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
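A hypothetical example of queuing work onto another task with task_work_add(): the callback runs in that task's context the next time it returns to user mode (or exits). The struct and function names are illustrative only; the API calls match the definitions above.

#include <linux/task_work.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/kernel.h>

struct my_deferred {
	struct callback_head cb;
	int cookie;
};

static void my_deferred_fn(struct callback_head *cb)
{
	struct my_deferred *d = container_of(cb, struct my_deferred, cb);

	pr_info("deferred work ran in pid %d, cookie=%d\n",
		task_pid_nr(current), d->cookie);
	kfree(d);
}

static int my_defer_to_task(struct task_struct *task, int cookie)
{
	struct my_deferred *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return -ENOMEM;

	d->cookie = cookie;
	init_task_work(&d->cb, my_deferred_fn);

	/* fails with -ESRCH once the task has run task_work_run() on exit */
	if (task_work_add(task, &d->cb, true)) {
		kfree(d);
		return -ESRCH;
	}
	return 0;
}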
/* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); if (ih == NULL) return -EINVAL; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); if (ip6 == NULL) return -EINVAL; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh 
== NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, struct in6_addr *addr, __be16 port, char *name1, char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, char *name1, char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * dump_common_audit_data - helper to dump common audit data * @a : common audit data * */ static void dump_common_audit_data(struct audit_buffer *ab, struct common_audit_data *a) { char comm[sizeof(current->comm)]; /* * To keep stack sizes in check force programers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); 129 audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: 12 audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; 25 audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { 25 audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; 1 audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { 1 audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } 1 audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; 6 audit_log_format(ab, " name="); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); inode = d_backing_inode(a->u.dentry); if (inode) { 5 audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; 3 inode = a->u.inode; dentry = d_find_alias(inode); if (dentry) { 2 audit_log_format(ab, " name="); audit_log_untrustedstring(ab, dentry->d_name.name); dput(dentry); } 3 audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, memcpy(comm, tsk->comm, sizeof(comm))); } } break; } case LSM_AUDIT_DATA_NET: 30 if (a->u.net->sk) { struct sock *sk = a->u.net->sk; struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; 30 switch (sk->sk_family) { 
case AF_INET: { struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } 30 switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } 30 if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); 129 audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; } /* switch (a->type) */ } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; 129 if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ 129 ab = audit_log_start(current->audit_context, GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; 129 if (pre_audit) 129 pre_audit(ab, a); 129 dump_common_audit_data(ab, a); if (post_audit) 129 post_audit(ab, a); 128 audit_log_end(ab); }
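A sketch of how an LSM might emit a record through common_lsm_audit(): the pre/post callbacks wrap LSM-specific text around the common fields printed by dump_common_audit_data(). The "mylsm" names are hypothetical; the callback signatures and the LSM_AUDIT_DATA_FILE setup follow the code above.

#include <linux/lsm_audit.h>
#include <linux/audit.h>
#include <linux/fs.h>

static void mylsm_pre_audit(struct audit_buffer *ab, void *data)
{
	audit_log_format(ab, "mylsm: denied");
}

static void mylsm_post_audit(struct audit_buffer *ab, void *data)
{
	struct common_audit_data *ad = data;

	audit_log_format(ab, " audited_type=%d", ad->type);
}

static void mylsm_report_file(struct file *file)
{
	struct common_audit_data ad;

	ad.type = LSM_AUDIT_DATA_FILE;
	ad.u.file = file;
	common_lsm_audit(&ad, mylsm_pre_audit, mylsm_post_audit);
}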
#include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. */ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); return 0; } static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) return 0; if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; 1772 nb = rcu_dereference_raw(*nl); 1500 while (nb && nr_to_call) { 1500 next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif 1500 ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; 1495 if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; 1471 nr_to_call--; } 1763 return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Currently always returns zero. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. 
*/ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * __atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See the comment for notifier_call_chain. * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret; 702 rcu_read_lock(); 702 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); 702 rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); NOKPROBE_SYMBOL(__atomic_notifier_call_chain); int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { 687 return __atomic_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Currently always returns zero. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain, only if not already * present in the chain. * Must be called in process context. * * Currently always returns zero. 
*/ int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; down_write(&nh->rwsem); ret = notifier_chain_cond_register(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); /** * __blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ 530 int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ 666 if (rcu_access_pointer(nh->head)) { 530 down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); 530 int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { 666 return __blocking_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Currently always returns zero. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. 
* All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); /** * __raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { 1039 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); } EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { 1039 return __raw_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); #ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Currently always returns zero. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). 
*/ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { return __srcu_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); #endif /* CONFIG_SRCU */ static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { 15 struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; 15 RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); 15 return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_all(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier);
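A minimal sketch of the notifier-chain API from both ends: a subscriber registers a notifier_block on a blocking chain, and a publisher walks the chain. The chain, event id, and handler are invented for illustration.

#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/kernel.h>

static BLOCKING_NOTIFIER_HEAD(my_event_chain);

static int my_event_handler(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	pr_info("my_event_chain: action=%lu\n", action);
	return NOTIFY_OK;	/* or NOTIFY_STOP to end the walk early */
}

static struct notifier_block my_event_nb = {
	.notifier_call	= my_event_handler,
	.priority	= 0,	/* higher priority handlers run first */
};

static int __init my_subscriber_init(void)
{
	return blocking_notifier_chain_register(&my_event_chain, &my_event_nb);
}

/* publisher side: signal all registered handlers */
static void my_publish(void *payload)
{
	blocking_notifier_call_chain(&my_event_chain, 1 /* event id */, payload);
}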
// SPDX-License-Identifier: GPL-2.0 /* * trace context switch * * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> * */ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <trace/events/sched.h> #include "trace.h" #define RECORD_CMDLINE 1 #define RECORD_TGID 2 static int sched_cmdline_ref; static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { int flags; 2224 flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); 2224 if (!flags) return; 2224 tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { int flags; 1030 flags = (RECORD_TGID * !!sched_tgid_ref) + (RECORD_CMDLINE * !!sched_cmdline_ref); 1030 if (!flags) return; 1030 tracing_record_taskinfo(current, flags); } static int tracing_sched_register(void) { int ret; ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { unregister_trace_sched_switch(probe_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(int ops) { bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref++; break; case RECORD_TGID: sched_tgid_ref++; break; } if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); switch (ops) { case RECORD_CMDLINE: sched_cmdline_ref--; break; case RECORD_TGID: sched_tgid_ref--; break; } if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(RECORD_CMDLINE); } void tracing_start_tgid_record(void) { tracing_start_sched_switch(RECORD_TGID); } void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); }
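A hypothetical bit of tracer glue showing how the reference-counted start/stop helpers above would typically be paired around a tracer's active window, so comm and TGID mappings are recorded while its sched events are enabled. These helpers are internal to the tracing subsystem and declared in kernel/trace/trace.h.

#include "trace.h"

static void my_tracer_start(void)
{
	tracing_start_cmdline_record();
	tracing_start_tgid_record();
}

static void my_tracer_stop(void)
{
	tracing_stop_tgid_record();
	tracing_stop_cmdline_record();
}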
/*
 * ratelimit.c - Do something with rate limit.
 *
 * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
 *
 * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
 * parameter. Now every user can use their own standalone ratelimit_state.
 *
 * This file is released under the GPLv2.
 */

#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>

/*
 * __ratelimit - rate limiting
 * @rs: ratelimit_state data
 * @func: name of calling function
 *
 * This enforces a rate limit: not more than @rs->burst callbacks
 * in every @rs->interval
 *
 * RETURNS:
 * 0 means callbacks will be suppressed.
 * 1 means go ahead and do it.
 */
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
	unsigned long flags;
	int ret;

	if (!rs->interval)
		return 1;

	/*
	 * If we contend on this state's lock then almost
	 * by definition we are too busy to print a message,
	 * in addition to the one that will be printed by
	 * the entity that is holding the lock already:
	 */
	if (!raw_spin_trylock_irqsave(&rs->lock, flags))
		return 0;

	if (!rs->begin)
		rs->begin = jiffies;

	if (time_is_before_jiffies(rs->begin + rs->interval)) {
		if (rs->missed) {
			if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
				printk_deferred(KERN_WARNING
						"%s: %d callbacks suppressed\n",
						func, rs->missed);
				rs->missed = 0;
			}
		}
		rs->begin   = jiffies;
		rs->printed = 0;
	}
	if (rs->burst && rs->burst > rs->printed) {
		rs->printed++;
		ret = 1;
	} else {
		rs->missed++;
		ret = 0;
	}
	raw_spin_unlock_irqrestore(&rs->lock, flags);

	return ret;
}
EXPORT_SYMBOL(___ratelimit);
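A typical call site, for illustration: allow at most 10 messages every 5 seconds from one spot. __ratelimit() is the wrapper macro that passes __func__ to ___ratelimit() above; my_noisy_path is a made-up name.

#include <linux/ratelimit.h>
#include <linux/printk.h>

static void my_noisy_path(int err)
{
	static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 10);

	if (__ratelimit(&my_rs))
		pr_warn("my_noisy_path: error %d\n", err);
	/* suppressed calls are counted and reported when the window resets */
}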
// SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; 205 if (task_nice(current) > 0) divfactor = divfactor / 5; 205 if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; 191 slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. */ 219 if (rt_task(current)) return 0; 206 ktime_get_ts64(&now); now = timespec64_sub(*tv, now); 205 ret = __estimate_accuracy(&now); 219 if (ret < current->timer_slack_ns) 54 return current->timer_slack_ns; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[0]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. 
*/ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { 403 init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { 264 struct poll_table_page * p = pwq->table; int i; 159 for (i = 0; i < pwq->inline_index; i++) 159 free_poll_entry(pwq->inline_entries + i); 264 while (p) { struct poll_table_entry * entry; struct poll_table_page *old; 6 entry = p->entry; do { 6 entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; 6 p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { 9 struct poll_table_page *table = p->table; 255 if (p->inline_index < N_INLINE_POLL_ENTRIES) 255 return p->inline_entries + p->inline_index++; 3 if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; 9 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } 9 new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } 9 return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ 71 return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); 73 if (key && !((unsigned long)key & entry->key)) return 0; 73 return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); 255 struct poll_table_entry *entry = poll_get_entry(pwq); 255 if (!entry) return; 255 entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; 263 set_current_state(state); if (!pwq->triggered) 263 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); 214 __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. 
* Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ 315 int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; 422 if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ 418 if (!sec && !nsec) { 105 to->tv_sec = to->tv_nsec = 0; } else { 315 ktime_get_ts64(to); 315 *to = timespec64_add_safe(*to, ts); } 105 return 0; 10 } static int poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { struct timespec64 rts64; struct timespec rts; struct timeval rtv; 209 if (!p) return ret; 168 if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ 167 if (!end_time->tv_sec && !end_time->tv_nsec) return ret; 120 ktime_get_ts64(&rts64); 27 rts64 = timespec64_sub(*end_time, rts64); if (rts64.tv_sec < 0) 98 rts64.tv_sec = rts64.tv_nsec = 0; 120 rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); 3 rtv.tv_sec = rts64.tv_sec; rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; 117 } else if (!copy_to_user(p, &rts, sizeof(rts))) return ret; /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: 209 if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * We do a VERIFY_WRITE here even though we are only reading this time: * we'll write to it eventually.. * * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) 194 return copy_from_user(fdset, ufdset, nr) ? 
-EFAULT : 0; 195 memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { 116 if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { 10 memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; 210 fdt = files_fdtable(current->files); 210 open_fds = fdt->open_fds + n; max = 0; if (set) { 5 set &= BITS(fds, n); if (set) { 4 if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } 209 while (n) { 186 open_fds--; n--; 186 set = BITS(fds, n); if (!set) continue; 180 if (set & ~*open_fds) return -EBADF; 168 if (max) continue; get_max: do { 171 max++; set >>= 1; } while (set); 171 max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR) #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, unsigned int ll_flag) { 165 wait->_key = POLLEX_SET | ll_flag; 170 if (in & bit) 9 wait->_key |= POLLIN_SET; 170 if (out & bit) 5 wait->_key |= POLLOUT_SET; } static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; 210 unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; 210 rcu_read_lock(); 210 retval = max_select_fd(n, fds); 210 rcu_read_unlock(); if (retval < 0) return retval; n = retval; 196 poll_initwait(&table); wait = &table.pt; 156 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 38 wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) 118 slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; 199 inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 164 for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, mask, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; 171 in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { 3 i += BITS_PER_LONG; continue; } 171 for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; 171 if (i >= n) break; 171 if (!(bit & all_bits)) continue; 171 f = fdget(i); if (f.file) { const struct file_operations *f_op; f_op = f.file->f_op; 164 mask = DEFAULT_POLLMASK; if (f_op->poll) { 170 wait_key_set(wait, in, out, bit, busy_flag); 170 mask = (*f_op->poll)(f.file, wait); } 171 fdput(f); 171 if ((mask & POLLIN_SET) && (in & bit)) { 7 res_in |= bit; 162 retval++; wait->_qproc = NULL; } 171 if ((mask & POLLOUT_SET) && (out & bit)) { 5 res_out |= bit; retval++; wait->_qproc = NULL; } 171 if ((mask & POLLEX_SET) && (ex & bit)) { 2 res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ 171 if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ 162 } else if (busy_flag & mask) can_busy_loop = true; } } 164 if (res_in) 5 *rinp = res_in; 164 if (res_out) 3 *routp = res_out; 164 if (res_ex) 2 *rexp = res_ex; 164 cond_resched(); } 192 wait->_qproc = NULL; 151 if (retval || timed_out || signal_pending(current)) break; 148 if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ 148 if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ 148 if (end_time && !to) { 114 expire = timespec64_to_ktime(*end_time); to = &expire; } 148 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } 120 poll_freewait(&table); 133 return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; 202 if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ 201 rcu_read_lock(); 201 fdt = files_fdtable(current->files); 201 max_fds = fdt->max_fds; 201 rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } 201 fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; 201 if ((ret = get_fd_set(n, inp, fds.in)) || 200 (ret = get_fd_set(n, outp, fds.out)) || 200 (ret = get_fd_set(n, exp, fds.ex))) goto out; 200 zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; 117 if (!ret) { ret = -ERESTARTNOHAND; 109 if (signal_pending(current)) goto out; ret = 0; } 116 if (set_fd_set(n, inp, fds.res_in) || 115 set_fd_set(n, outp, fds.res_out) || 113 set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: 130 if (bits != stack_fds) kvfree(bits); out_nofds: 131 return ret; } 15 SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; if (tvp) { 10 if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), 9 (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } 12 ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); 11 return ret; } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts; struct timespec64 ts64, end_time, *to = NULL; int ret; 192 if (tsp) { 160 if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; 159 ts64 = timespec_to_timespec64(ts); to = &end_time; if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } 160 if (sigmask) { /* XXX: Don't preclude handling different sized sigset_t's. */ 7 if (sigsetsize != sizeof(sigset_t)) return -EINVAL; 6 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; 6 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } 184 ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); if (ret == -ERESTARTNOHAND) { /* * Don't restore the signal mask yet. Let do_signal() deliver * the signal on the way back to userspace, before the signal * mask is restored. 
*/ if (sigmask) { 2 memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); set_restore_sigmask(); } } else if (sigmask) 126 sigprocmask(SIG_SETMASK, &sigsaved, NULL); return ret; } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ 195 SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timespec __user *, tsp, void __user *, sig) { size_t sigsetsize = 0; sigset_t __user *up = NULL; if (sig) { 12 if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) || __get_user(up, (sigset_t __user * __user *)sig) 9 || __get_user(sigsetsize, (size_t __user *)(sig+sizeof(void *)))) return -EFAULT; } 192 return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); } #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; int len; struct pollfd entries[0]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, unsigned int busy_flag) { unsigned int mask; int fd; mask = 0; 175 fd = pollfd->fd; if (fd >= 0) { 170 struct fd f = fdget(fd); mask = POLLNVAL; if (f.file) { mask = DEFAULT_POLLMASK; 169 if (f.file->f_op->poll) { 162 pwait->_key = pollfd->events|POLLERR|POLLHUP; pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); 161 if (mask & busy_flag) *can_busy_poll = true; } /* Mask out unneeded events. */ 168 mask &= pollfd->events | POLLERR | POLLHUP; 166 fdput(f); } } 173 pollfd->revents = mask; return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ 208 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 51 pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) 89 slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; 194 for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; 210 pfd = walk->entries; pfd_end = pfd + walk->len; 173 for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. 
*/ 175 if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { 74 count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ 191 pt->_qproc = NULL; if (!count) { 153 count = wait->error; if (signal_pending(current)) count = -EINTR; } 190 if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ 116 if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ 116 if (end_time && !to) { 67 expire = timespec64_to_ktime(*end_time); to = &expire; } 116 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } 145 return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; 212 unsigned long todo = nfds; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; 209 len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { 209 walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, 175 sizeof(struct pollfd) * walk->len)) goto out_fds; 174 todo -= walk->len; if (!todo) break; 20 len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; walk = walk->next = kmalloc(size, GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } 208 poll_initwait(&table); 210 fdcount = do_poll(head, &table, end_time); poll_freewait(&table); 144 for (walk = head; walk; walk = walk->next) { 145 struct pollfd *fds = walk->entries; int j; 131 for (j = 0; j < walk->len; j++, ufds++) 132 if (__put_user(fds[j].revents, &ufds->revents)) goto out_fds; } err = fdcount; out_fds: 146 walk = head->next; 149 while (walk) { struct poll_list *pos = walk; 16 walk = walk->next; kfree(pos); } return err; } static long do_restart_poll(struct restart_block *restart_block) { 5 struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { 1 end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } 5 ret = do_sys_poll(ufds, nfds, to); 4 if (ret == -EINTR) { 2 restart_block->fn = do_restart_poll; ret = -ERESTART_RESTARTBLOCK; } 5 return ret; } 104 SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, 81 NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } 22 ret = do_sys_poll(ufds, nfds, to); 60 if (ret == -EINTR) { struct restart_block *restart_block; 8 restart_block = &current->restart_block; restart_block->fn = 
do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = -ERESTART_RESTARTBLOCK; } 68 return ret; } 109 SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts; struct timespec64 end_time, *to = NULL; int ret; if (tsp) { 62 if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; to = &end_time; 60 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } 105 if (sigmask) { /* XXX: Don't preclude handling different sized sigset_t's. */ 19 if (sigsetsize != sizeof(sigset_t)) return -EINVAL; 18 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; 18 sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } 86 ret = do_sys_poll(ufds, nfds, to); /* We can restart this syscall, usually */ if (ret == -EINTR) { /* * Don't restore the signal mask yet. Let do_signal() deliver * the signal on the way back to userspace, before the signal * mask is restored. */ if (sigmask) { 2 memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) 10 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 78 ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); 82 return ret; } #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, int timeval, int ret) { struct timespec ts; 13 if (!p) return ret; 6 if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ 6 if (!end_time->tv_sec && !end_time->tv_nsec) return ret; 3 ktime_get_ts(&ts); 3 ts = timespec_sub(*end_time, ts); if (ts.tv_sec < 0) ts.tv_sec = ts.tv_nsec = 0; 3 if (timeval) { struct compat_timeval rtv; 3 rtv.tv_sec = ts.tv_sec; rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { struct compat_timespec rts; rts.tv_sec = ts.tv_sec; rts.tv_nsec = ts.tv_nsec; if (!copy_to_user(p, &rts, sizeof(rts))) 3 return ret; } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: 13 if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static 10 int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { 14 if (ufdset) { 7 return compat_get_bitmap(fdset, ufdset, nr); } else { 10 zero_fd_set(nr, fdset); 10 return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { 1 if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. 
* * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; 17 if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ 14 rcu_read_lock(); 14 fdt = files_fdtable(current->files); 14 max_fds = fdt->max_fds; 14 rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc(6 * size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } 14 fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || 11 (ret = compat_get_fd_set(n, outp, fds.out)) || 10 (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; 10 zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; 3 if (!ret) { ret = -ERESTARTNOHAND; 2 if (signal_pending(current)) goto out; ret = 0; } 1 if (compat_set_fd_set(n, inp, fds.res_in) || 1 compat_set_fd_set(n, outp, fds.res_out) || 1 compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: 7 if (bits != stack_fds) kfree(bits); out_nofds: 10 return ret; } 13 COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timeval __user *, tvp) { struct timespec end_time, *to = NULL; struct compat_timeval tv; int ret; if (tvp) { 4 if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), 4 (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } 12 ret = compat_core_sys_select(n, inp, outp, exp, to); ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); 6 return ret; } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; 7 COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; 6 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), 5 compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; struct compat_timespec ts; struct timespec end_time, *to = NULL; int ret; 4 if (tsp) { 4 if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } 3 if (sigmask) { 1 if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (copy_from_user(&ss32, sigmask, sizeof(ss32))) return -EFAULT; sigset_from_compat(&ksigmask, &ss32); 
sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } 5 ret = compat_core_sys_select(n, inp, outp, exp, to); ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); if (ret == -ERESTARTNOHAND) { /* * Don't restore the signal mask yet. Let do_signal() deliver * the signal on the way back to userspace, before the signal * mask is restored. */ if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); set_restore_sigmask(); } } else if (sigmask) 9 sigprocmask(SIG_SETMASK, &sigsaved, NULL); return ret; } 11 COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timespec __user *, tsp, void __user *, sig) { compat_size_t sigsetsize = 0; compat_uptr_t up = 0; if (sig) { 7 if (!access_ok(VERIFY_READ, sig, sizeof(compat_uptr_t)+sizeof(compat_size_t)) || __get_user(up, (compat_uptr_t __user *)sig) || 6 __get_user(sigsetsize, (compat_size_t __user *)(sig+sizeof(up)))) return -EFAULT; } 10 return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), sigsetsize); } 7 COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct compat_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; struct compat_timespec ts; struct timespec end_time, *to = NULL; int ret; if (tsp) { 4 if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; to = &end_time; 4 if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } 5 if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (copy_from_user(&ss32, sigmask, sizeof(ss32))) return -EFAULT; sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } 5 ret = do_sys_poll(ufds, nfds, to); /* We can restart this syscall, usually */ if (ret == -EINTR) { /* * Don't restore the signal mask yet. Let do_signal() deliver * the signal on the way back to userspace, before the signal * mask is restored. */ if (sigmask) { memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved)); set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); 3 ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); 5 return ret; } #endif
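/*
 * The ppoll()/pselect6() implementations above are easiest to follow from
 * the caller's side. Below is a minimal, self-contained userspace sketch
 * (not part of the file above) that drives sys_ppoll with a temporarily
 * replaced signal mask and a relative timeout, which is exactly the
 * combination the saved_sigmask/restore logic above has to handle. The
 * helper name wait_readable() and the 2-second timeout are illustrative
 * assumptions, not kernel or libc API.
 */
#define _GNU_SOURCE
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	struct timespec ts = {
		.tv_sec  = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000L,
	};
	sigset_t mask;

	/*
	 * Block everything except SIGINT while we sleep; ppoll() installs
	 * this mask atomically and the kernel restores the original mask
	 * before returning, as implemented in the syscall above.
	 */
	sigfillset(&mask);
	sigdelset(&mask, SIGINT);

	return ppoll(&pfd, 1, &ts, &mask);
}

int main(void)
{
	int n = wait_readable(STDIN_FILENO, 2000);

	if (n < 0)
		perror("ppoll");
	else if (n == 0)
		printf("timed out\n");
	else
		printf("stdin is readable\n");
	return n < 0;
}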
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IPv6 * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ /* * Fixes: * * Ralf Baechle : generic ipv6 checksum * <ralf@waldorf-gmbh.de> */ #ifndef _CHECKSUM_IPV6_H #define _CHECKSUM_IPV6_H #include <asm/types.h> #include <asm/byteorder.h> #include <net/ip.h> #include <asm/checksum.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/ipv6.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum); #endif static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) { 117 return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, proto, 0)); } static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, skb_gro_len(skb), proto, 0)); } static __inline__ __sum16 tcp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline void __tcp_v6_send_check(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { 262 th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { 4 th->check = tcp_v6_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum)); } } #if IS_ENABLED(CONFIG_IPV6) static inline void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { 199 struct ipv6_pinfo *np = inet6_sk(sk); 199 __tcp_v6_send_check(skb, &np->saddr, &sk->sk_v6_daddr); } #endif static inline __sum16 udp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len); int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif
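/*
 * For orientation, this is what csum_ipv6_magic() produces, written as a
 * portable userspace sketch rather than the arch-optimized kernel helper:
 * a 16-bit one's-complement sum over the IPv6 pseudo-header (source and
 * destination address, upper-layer length, next header) folded together
 * with the payload checksum. tcp_v6_check()/udp_v6_check() above are thin
 * wrappers that pass IPPROTO_TCP/IPPROTO_UDP. The names sum16() and
 * ipv6_l4_checksum() below are illustrative, not kernel API.
 */
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>		/* struct in6_addr */

static uint64_t sum16(const void *data, size_t len, uint64_t sum)
{
	const uint8_t *p = data;

	while (len > 1) {		/* big-endian 16-bit words */
		sum += ((uint64_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte is zero-padded */
		sum += (uint64_t)p[0] << 8;
	return sum;
}

static uint16_t ipv6_l4_checksum(const struct in6_addr *saddr,
				 const struct in6_addr *daddr,
				 uint8_t proto,
				 const void *payload, uint32_t len)
{
	uint64_t sum = 0;

	sum = sum16(saddr, sizeof(*saddr), sum);
	sum = sum16(daddr, sizeof(*daddr), sum);
	sum += len;		/* 32-bit upper-layer packet length */
	sum += proto;		/* 3 zero bytes + next header */
	sum = sum16(payload, len, sum);

	while (sum >> 16)	/* end-around carry fold */
		sum = (sum & 0xffff) + (sum >> 16);

	/* Caller stores the complement in the packet in network byte order. */
	return (uint16_t)~sum;
}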
/* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either * known (0 or 1), or unknown (x). Arithmetic operations on tnums will * propagate the unknown bits such that the tnum result represents all the * possible results for possible values of the operands. */ #include <linux/kernel.h> #include <linux/tnum.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; struct tnum tnum_const(u64 value) { 586 return TNUM(value, 0); } struct tnum tnum_range(u64 min, u64 max) { 388 u64 chi = min ^ max, delta; u8 bits = fls64(chi); /* special case, needed because 1ULL << 64 is undefined */ if (bits > 63) 251 return tnum_unknown; /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return * constant min (since min == max). */ 352 delta = (1ULL << bits) - 1; 388 return TNUM(min & ~delta, delta); } struct tnum tnum_lshift(struct tnum a, u8 shift) { 40 return TNUM(a.value << shift, a.mask << shift); } struct tnum tnum_rshift(struct tnum a, u8 shift) { 1 return TNUM(a.value >> shift, a.mask >> shift); } 288 struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; 71 sm = a.mask + b.mask; sv = a.value + b.value; sigma = sm + sv; chi = sigma ^ sv; mu = chi | a.mask | b.mask; return TNUM(sv & ~mu, mu); } 133 struct tnum tnum_sub(struct tnum a, struct tnum b) { u64 dv, alpha, beta, chi, mu; dv = a.value - b.value; alpha = dv + a.mask; beta = dv - b.mask; chi = alpha ^ beta; mu = chi | a.mask | b.mask; return TNUM(dv & ~mu, mu); } 24 struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } struct tnum tnum_or(struct tnum a, struct tnum b) { u64 v, mu; 4 v = a.value | b.value; mu = a.mask | b.mask; return TNUM(v, mu & ~v); } struct tnum tnum_xor(struct tnum a, struct tnum b) { u64 v, mu; v = a.value ^ b.value; mu = a.mask | b.mask; return TNUM(v & ~mu, mu); } /* half-multiply add: acc += (unknown * mask * value). * An intermediate step in the multiply algorithm. */ static struct tnum hma(struct tnum acc, u64 value, u64 mask) { 75 while (mask) { 71 if (mask & 1) 71 acc = tnum_add(acc, TNUM(0, value)); 71 mask >>= 1; value <<= 1; } 75 return acc; } 75 struct tnum tnum_mul(struct tnum a, struct tnum b) { struct tnum acc; u64 pi; pi = a.value * b.value; 70 acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); 75 return hma(acc, b.mask, a.value); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. 
*/ struct tnum tnum_intersect(struct tnum a, struct tnum b) { u64 v, mu; 388 v = a.value | b.value; mu = a.mask & b.mask; return TNUM(v & ~mu, mu); } 319 struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; a.mask &= (1ULL << (size * 8)) - 1; return a; } bool tnum_is_aligned(struct tnum a, u64 size) { 179 if (!size) return true; 179 return !((a.value | a.mask) & (size - 1)); } 74 bool tnum_in(struct tnum a, struct tnum b) { 74 if (b.mask & ~a.mask) return false; b.value &= ~a.mask; 74 return a.value == b.value; } int tnum_strn(char *str, size_t size, struct tnum a) { 69 return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); } EXPORT_SYMBOL_GPL(tnum_strn); int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; for (n = 64; n; n--) { if (n < size) { if (a.mask & 1) str[n - 1] = 'x'; else if (a.value & 1) str[n - 1] = '1'; else str[n - 1] = '0'; } a.mask >>= 1; a.value >>= 1; } str[min(size - 1, (size_t)64)] = 0; return 64; }
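/*
 * A worked example of the arithmetic above, as a standalone userspace
 * sketch: struct utnum mirrors struct tnum and utnum_add() repeats the
 * tnum_add() algorithm verbatim, purely for experimentation (these names
 * are illustrative, not kernel API). Adding a value whose low bit is
 * unknown (0 or 1) to the constant 1 yields (0; 0x3), i.e. {0,1,2,3}:
 * a sound, though not tight, over-approximation of the true result set
 * {1,2}.
 */
#include <stdio.h>
#include <stdint.h>

struct utnum { uint64_t value; uint64_t mask; };

#define UTNUM(v, m) ((struct utnum){ .value = (v), .mask = (m) })

static struct utnum utnum_add(struct utnum a, struct utnum b)
{
	uint64_t sm = a.mask + b.mask;
	uint64_t sv = a.value + b.value;
	uint64_t sigma = sm + sv;
	uint64_t chi = sigma ^ sv;
	uint64_t mu = chi | a.mask | b.mask;

	return UTNUM(sv & ~mu, mu);
}

int main(void)
{
	struct utnum a = UTNUM(0, 1);	/* bit 0 unknown: 0 or 1 */
	struct utnum b = UTNUM(1, 0);	/* the constant 1 */
	struct utnum r = utnum_add(a, b);

	/* Same format as tnum_strn(); prints "(0; 0x3)". */
	printf("(%#llx; %#llx)\n",
	       (unsigned long long)r.value, (unsigned long long)r.mask);
	return 0;
}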
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) static void bpf_array_free_percpu(struct bpf_array *array) { int i; for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); cond_resched(); } } static int bpf_array_alloc_percpu(struct bpf_array *array) { void __percpu *ptr; int i; 17 for (i = 0; i < array->map.max_entries; i++) { 17 ptr = __alloc_percpu_gfp(array->elem_size, 8, GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; } 17 array->pptrs[i] = ptr; cond_resched(); } return 0; } /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { 43 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 3 int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; 43 bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; struct bpf_array *array; /* check sanity of attributes */ 43 if (attr->max_entries == 0 || attr->key_size != 4 || 41 attr->value_size == 0 || 40 attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || 21 (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); 38 if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ return ERR_PTR(-E2BIG); elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; /* On 32 bit archs roundup_pow_of_two() with max_entries that has * upper most bit set in u32 space is undefined behavior due to * resulting 1U << 32, so do it manually here in u64 space. */ mask64 = fls_long(max_entries - 1); mask64 = 1ULL << mask64; mask64 -= 1; index_mask = mask64; if (unpriv) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ 3 max_entries = index_mask + 1; /* Check for overflows. 
*/ if (max_entries < attr->max_entries) return ERR_PTR(-E2BIG); } array_size = sizeof(*array); 36 if (percpu) 19 array_size += (u64) max_entries * sizeof(void *); else 17 array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ cost = array_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); if (percpu) { 18 cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); } 34 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_precharge_memlock(cost); if (ret < 0) 1 return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ 33 array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); 33 array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; array->map.map_flags = attr->map_flags; array->map.numa_node = numa_node; array->map.pages = cost; array->elem_size = elem_size; 17 if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } 23 return &array->map; } /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); 7 u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; 5 return array->value + array->elem_size * (index & array->index_mask); 1 } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (map->unpriv_array) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); } *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); 7 u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(index >= array->map.max_entries)) return -ENOENT; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ 6 size = round_up(map->value_size, 8); 6 rcu_read_lock(); 6 pptr = array->pptrs[index & array->index_mask]; 6 
for_each_possible_cpu(cpu) { 6 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); 6 off += size; } 6 rcu_read_unlock(); 7 return 0; } /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); 6 u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { 3 *next = 0; 3 return 0; } 3 if (index == array->map.max_entries - 1) return -ENOENT; 2 *next = index + 1; 3 return 0; } /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); 4 u32 index = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; 3 if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; 2 if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; 1 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else 4 memcpy(array->value + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); 8 u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; 7 if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; 6 if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ 5 size = round_up(map->value_size, 8); 5 rcu_read_lock(); 5 pptr = array->pptrs[index & array->index_mask]; 5 for_each_possible_cpu(cpu) { 5 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); 5 off += size; } 5 rcu_read_unlock(); 8 return 0; } /* Called from syscall or from eBPF program */ static int array_map_delete_elem(struct bpf_map *map, void *key) { 1 return -EINVAL; } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. 
Wait for outstanding programs to complete * and free the array */ synchronize_rcu(); if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); bpf_map_area_free(array); } const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, }; const struct bpf_map_ops percpu_array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, }; static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ 21 if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); 18 return array_map_alloc(attr); } static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; synchronize_rcu(); /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **elem, *ptr; int ret = 0; 5 if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; 4 rcu_read_lock(); 4 elem = array_map_lookup_elem(map, key); 3 if (elem && (ptr = READ_ONCE(*elem))) 2 *value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; 4 rcu_read_unlock(); 5 return ret; } /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; 24 u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) return -EINVAL; 23 if (index >= array->map.max_entries) return -E2BIG; 22 ufd = *(u32 *)value; new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) 12 return PTR_ERR(new_ptr); 10 old_ptr = xchg(array->ptrs + index, new_ptr); if (old_ptr) 24 map->ops->map_fd_put_ptr(old_ptr); return 0; } static int fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; 6 u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; 5 old_ptr = xchg(array->ptrs + index, NULL); if (old_ptr) { 3 map->ops->map_fd_put_ptr(old_ptr); 6 return 0; } else { return -ENOENT; } } static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); 1 struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; 1 if (!bpf_prog_array_compatible(array, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } return prog; } static void prog_fd_array_put_ptr(void *ptr) { bpf_prog_put(ptr); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) { return ((struct bpf_prog *)ptr)->aux->id; } /* decrement refcnt of all bpf_progs that are stored in this map */ static void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; 1 for (i = 0; i < array->map.max_entries; i++) 1 
fd_array_map_delete_elem(map, &i); } const struct bpf_map_ops prog_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { struct bpf_event_entry *ee; 5 ee = kzalloc(sizeof(*ee), GFP_ATOMIC); if (ee) { 5 ee->event = perf_file->private_data; ee->perf_file = perf_file; ee->map_file = map_file; } return ee; } static void __bpf_event_entry_free(struct rcu_head *rcu) { struct bpf_event_entry *ee; ee = container_of(rcu, struct bpf_event_entry, rcu); fput(ee->perf_file); kfree(ee); } static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) { 3 call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; u64 value; 9 perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; ee = ERR_PTR(-EOPNOTSUPP); 6 event = perf_file->private_data; if (perf_event_read_local(event, &value) == -EOPNOTSUPP) goto err_out; 5 ee = bpf_event_entry_gen(perf_file, map_file); 9 if (ee) return ee; ee = ERR_PTR(-ENOMEM); err_out: 1 fput(perf_file); return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { 3 bpf_event_entry_free_rcu(ptr); } static void perf_event_fd_array_release(struct bpf_map *map, struct file *map_file) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_event_entry *ee; int i; 6 rcu_read_lock(); 6 for (i = 0; i < array->map.max_entries; i++) { 6 ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) 2 fd_array_map_delete_elem(map, &i); } 6 rcu_read_unlock(); } const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, }; #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) { 1 return cgroup_get_from_fd(fd); } static void cgroup_fd_array_put_ptr(void *ptr) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); } static void cgroup_fd_array_free(struct bpf_map *map) { bpf_fd_array_map_clear(map); fd_array_map_free(map); } const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; 8 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) 1 return inner_map_meta; 7 map = fd_array_map_alloc(attr); if (IS_ERR(map)) { 3 bpf_map_meta_free(inner_map_meta); 3 return map; } 4 map->inner_map_meta = inner_map_meta; 4 return 
map; } static void array_of_map_free(struct bpf_map *map) { /* map->inner_map_meta is only accessed by syscall which * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); bpf_fd_array_map_clear(map); fd_array_map_free(map); } static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = array_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (map->unpriv_array) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, };
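/*
 * The array map above is driven from userspace through the bpf(2) syscall.
 * Below is a minimal sketch (no libbpf, minimal error handling, and it may
 * need root/CAP_SYS_ADMIN depending on sysctl settings) that creates a
 * BPF_MAP_TYPE_ARRAY and updates, then reads back, one slot. It relies only
 * on the uapi definitions in <linux/bpf.h>.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 3;
	__u64 val = 42, out = 0;
	int map_fd;

	/* array_map_alloc() above insists on key_size == 4 and allocates
	 * max_entries * round_up(value_size, 8) bytes up front. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = 4;
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* All elements pre-exist, so BPF_NOEXIST fails with EEXIST
	 * (see array_map_update_elem() above); BPF_ANY always works. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		perror("BPF_MAP_UPDATE_ELEM");

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&out;
	if (!sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
		printf("slot %u = %llu\n", key, (unsigned long long)out);

	close(map_fd);
	return 0;
}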
/* * blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blk-mq.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (exit && hctx->sched_data) exit(hctx); kfree(hctx->sched_data); hctx->sched_data = NULL; } } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq; spin_lock_irq(q->queue_lock); icq = ioc_lookup_icq(ioc, q); spin_unlock_irq(q->queue_lock); if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); if (!icq) return; } get_io_context(icq->ioc); rq->elv.icq = icq; } /* * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. */ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { 2 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; 2 if (hctx->flags & BLK_MQ_F_TAG_SHARED) { struct request_queue *q = hctx->queue; if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) atomic_inc(&q->shared_hctx_restart); } else 2 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { 43 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return false; if (hctx->flags & BLK_MQ_F_TAG_SHARED) { struct request_queue *q = hctx->queue; if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) atomic_dec(&q->shared_hctx_restart); } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); return true; } return false; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { 1223 struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; 1223 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; bool do_sched_dispatch = true; 1223 LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ 1223 if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) 1215 return; 1223 hctx->run++; /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ 1223 if (!list_empty_careful(&hctx->dispatch)) { 2 spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) 2 list_splice_init(&hctx->dispatch, &rq_list); 2 spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. 
*/ 1223 if (!list_empty(&rq_list)) { 2 blk_mq_sched_mark_restart_hctx(hctx); 2 do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); 1223 } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* * We want to dispatch from the scheduler if there was nothing * on the dispatch list or we were able to dispatch from the * dispatch list. */ 2 if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; 1223 rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) break; 1223 list_add(&rq->queuelist, &rq_list); 1223 } while (blk_mq_dispatch_rq_list(q, &rq_list)); } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request) { struct request *rq; 1733 switch (elv_merge(q, &rq, bio)) { case ELEVATOR_BACK_MERGE: 39 if (!blk_mq_sched_allow_merge(q, rq, bio)) 1731 return false; 39 if (!bio_attempt_back_merge(q, rq, bio)) return false; 39 *merged_request = attempt_back_merge(q, rq); if (!*merged_request) 39 elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); return true; case ELEVATOR_FRONT_MERGE: 13 if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; 13 if (!bio_attempt_front_merge(q, rq, bio)) return false; 13 *merged_request = attempt_front_merge(q, rq); if (!*merged_request) 13 elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; default: return false; } } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); /* * Reverse check our software queue for entries that we could potentially * merge with. Currently includes a hand-wavy stop count of 8, to not spend * too much time checking for merges. */ static bool blk_mq_attempt_merge(struct request_queue *q, struct blk_mq_ctx *ctx, struct bio *bio) { struct request *rq; int checked = 8; lockdep_assert_held(&ctx->lock); list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { bool merged = false; if (!checked--) break; if (!blk_rq_merge_ok(rq, bio)) continue; switch (blk_try_merge(rq, bio)) { case ELEVATOR_BACK_MERGE: if (blk_mq_sched_allow_merge(q, rq, bio)) merged = bio_attempt_back_merge(q, rq, bio); break; case ELEVATOR_FRONT_MERGE: if (blk_mq_sched_allow_merge(q, rq, bio)) merged = bio_attempt_front_merge(q, rq, bio); break; case ELEVATOR_DISCARD_MERGE: merged = bio_attempt_discard_merge(q, rq, bio); break; default: continue; } if (merged) ctx->rq_merged++; return merged; } return false; } bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { 1733 struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); bool ret = false; 1733 if (e && e->type->ops.mq.bio_merge) { 1733 blk_mq_put_ctx(ctx); 1733 return e->type->ops.mq.bio_merge(hctx, bio); } if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && !list_empty_careful(&ctx->rq_list)) { /* default per sw-queue merge */ spin_lock(&ctx->lock); ret = blk_mq_attempt_merge(q, ctx, bio); spin_unlock(&ctx->lock); } 1731 blk_mq_put_ctx(ctx); return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) { 2338 return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { 2338 trace_block_rq_insert(rq->q, rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) { if (rq->tag == -1) { 1423 rq->rq_flags |= RQF_SORTED; return false; } /* * If we already have a real request tag, send 
directly to * the dispatch list. */ spin_lock(&hctx->lock); list_add(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); return true; } /** * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list * @pos: loop cursor. * @skip: the list element that will not be examined. Iteration starts at * @skip->next. * @head: head of the list to examine. This list must have at least one * element, namely @skip. * @member: name of the list_head structure within typeof(*pos). */ #define list_for_each_entry_rcu_rr(pos, skip, head, member) \ for ((pos) = (skip); \ (pos = (pos)->member.next != (head) ? list_entry_rcu( \ (pos)->member.next, typeof(*pos), member) : \ list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ (pos) != (skip); ) /* * Called after a driver tag has been freed to check whether a hctx needs to * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware * queues in a round-robin fashion if the tag set of @hctx is shared with other * hardware queues. */ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) { struct blk_mq_tags *const tags = hctx->tags; 43 struct blk_mq_tag_set *const set = hctx->queue->tag_set; struct request_queue *const queue = hctx->queue, *q; struct blk_mq_hw_ctx *hctx2; unsigned int i, j; if (set->flags & BLK_MQ_F_TAG_SHARED) { /* * If this is 0, then we know that no hardware queues * have RESTART marked. We're done. */ if (!atomic_read(&queue->shared_hctx_restart)) return; rcu_read_lock(); list_for_each_entry_rcu_rr(q, queue, &set->tag_list, tag_set_list) { queue_for_each_hw_ctx(q, hctx2, i) if (hctx2->tags == tags && blk_mq_sched_restart_hctx(hctx2)) goto done; } j = hctx->queue_num + 1; for (i = 0; i < queue->nr_hw_queues; i++, j++) { if (j == queue->nr_hw_queues) j = 0; hctx2 = queue->queue_hw_ctx[j]; if (hctx2->tags == tags && blk_mq_sched_restart_hctx(hctx2)) break; } done: rcu_read_unlock(); } else { 43 blk_mq_sched_restart_hctx(hctx); } } /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. Requeue will re-invoke us from a context * that's safe to block from. 
*/ static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, struct request *rq, bool can_block) { if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { 544 blk_insert_flush(rq); blk_mq_run_hw_queue(hctx, true); } else blk_mq_add_to_requeue_list(rq, false, true); } void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { 1593 struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 1593 if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { 544 blk_mq_sched_insert_flush(hctx, rq, can_block); return; } 1423 if (e && blk_mq_sched_bypass_insert(hctx, rq)) goto run; 1423 if (e && e->type->ops.mq.insert_requests) { 1423 LIST_HEAD(list); 1423 list_add(&rq->queuelist, &list); 1423 e->type->ops.mq.insert_requests(hctx, &list, at_head); } else { spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); } run: 1591 if (run_queue) 1423 blk_mq_run_hw_queue(hctx, async); } void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { 1368 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; if (e) { struct request *rq, *next; /* * We bypass requests that already have a driver tag assigned, * which should only be flushes. Flushes are only ever inserted * as single requests, so we shouldn't ever hit the * WARN_ON_ONCE() below (but let's handle it just in case). */ 1368 list_for_each_entry_safe(rq, next, list, queuelist) { 1368 if (WARN_ON_ONCE(rq->tag != -1)) { list_del_init(&rq->queuelist); blk_mq_sched_bypass_insert(hctx, rq); } } } 1368 if (e && e->type->ops.mq.insert_requests) 1368 e->type->ops.mq.insert_requests(hctx, list, false); else blk_mq_insert_requests(hctx, ctx, list); 1368 blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { 27 if (hctx->sched_tags) { 27 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } } static int blk_mq_sched_alloc_tags(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; int ret; 39 hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, set->reserved_tags); if (!hctx->sched_tags) return -ENOMEM; 40 ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); 40 if (ret) blk_mq_sched_free_tags(set, hctx, hctx_idx); return ret; } static void blk_mq_sched_tags_teardown(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_sched_free_tags(set, hctx, i); } int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { 39 struct elevator_queue *e = q->elevator; int ret; if (!e) return 0; ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx); if (ret) return ret; if (e->type->ops.mq.init_hctx) { ret = e->type->ops.mq.init_hctx(hctx, hctx_idx); if (ret) { blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); return ret; } } blk_mq_debugfs_register_sched_hctx(q, hctx); 39 return 0; } void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { 27 struct elevator_queue *e = q->elevator; if (!e) return; 27 
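/* Teardown mirrors blk_mq_sched_init_hctx() in reverse: drop the debugfs entries first, let the elevator release its per-hctx data, then free the scheduler tags. */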
blk_mq_debugfs_unregister_sched_hctx(hctx); if (e->type->ops.mq.exit_hctx && hctx->sched_data) { e->type->ops.mq.exit_hctx(hctx, hctx_idx); hctx->sched_data = NULL; } blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); } int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned int i; int ret; if (!e) { q->elevator = NULL; return 0; } /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. * Additionally, this is a per-hw queue depth. */ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) goto err; } ret = e->ops.mq.init_sched(q, e); if (ret) goto err; blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.mq.init_hctx) { ret = e->ops.mq.init_hctx(hctx, i); if (ret) { eq = q->elevator; blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } blk_mq_debugfs_register_sched_hctx(q, hctx); } return 0; err: blk_mq_sched_tags_teardown(q); q->elevator = NULL; return ret; } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); if (e->type->ops.mq.exit_hctx && hctx->sched_data) { e->type->ops.mq.exit_hctx(hctx, i); hctx->sched_data = NULL; } } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.mq.exit_sched) e->type->ops.mq.exit_sched(e); blk_mq_sched_tags_teardown(q); q->elevator = NULL; } int blk_mq_sched_init(struct request_queue *q) { int ret; mutex_lock(&q->sysfs_lock); ret = elevator_init(q, NULL); mutex_unlock(&q->sysfs_lock); return ret; }
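/*
 * Editor's note: the following is a small, self-contained userspace sketch,
 * not kernel code, added to illustrate the round-robin restart scan that
 * blk_mq_sched_restart() performs when a hardware queue shares its tag set:
 * start at the queue just after the one that freed a tag, wrap around, and
 * stop at the first queue that actually had its restart mark set. The
 * struct fake_hctx type, the needs_restart flag and the fake_* helpers are
 * inventions for this illustration only; the cross-queue RCU list walk and
 * the tags-matching check of the real function are deliberately omitted.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_hctx {
	unsigned int queue_num;
	bool needs_restart;	/* stands in for the RESTART state bit */
};

/* Clear the restart mark and report whether this queue needed a kick. */
static bool fake_restart_hctx(struct fake_hctx *hctx)
{
	if (!hctx->needs_restart)
		return false;
	hctx->needs_restart = false;
	return true;
}

/*
 * Round-robin scan starting just after @from, mirroring the
 * "j = hctx->queue_num + 1" loop in blk_mq_sched_restart().
 */
static int fake_restart_round_robin(struct fake_hctx *hctxs,
				    unsigned int nr_hw_queues,
				    unsigned int from)
{
	unsigned int i, j = from + 1;

	for (i = 0; i < nr_hw_queues; i++, j++) {
		if (j == nr_hw_queues)
			j = 0;
		if (fake_restart_hctx(&hctxs[j]))
			return (int)j;	/* first queue that was restarted */
	}
	return -1;	/* no queue had the restart mark set */
}

int main(void)
{
	struct fake_hctx q[4] = {
		{ .queue_num = 0 }, { .queue_num = 1 },
		{ .queue_num = 2, .needs_restart = true }, { .queue_num = 3 },
	};

	/* A tag was freed on queue 3; the scan wraps to 0, 1, then hits 2. */
	printf("restarted hctx %d\n", fake_restart_round_robin(q, 4, 3));
	return 0;
}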
/* * Copyright (c) 2012 Taobao. * Written by Tao Ma <boyu.mt@taobao.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include <linux/fiemap.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" #include <trace/events/android_fs.h> #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; return 0; } static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; int free, min_offs; min_offs = EXT4_SB(inode->i_sb)->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - EXT4_I(inode)->i_extra_isize - sizeof(struct ext4_xattr_ibody_header); /* * We need to subtract another sizeof(__u32) since an in-inode xattr * needs an empty 4 bytes to indicate the gap between the xattr entry * and the name/value pair. */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return EXT4_XATTR_SIZE(min_offs - EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - EXT4_XATTR_ROUND - sizeof(__u32)); raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); /* Compute min_offs. */ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); if (EXT4_I(inode)->i_inline_off) { entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); if (free > EXT4_XATTR_ROUND) free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); else free = 0; out: return free; } /* * Get the maximum size we now can store in an inode. * If we can't find the space for a xattr entry, don't use the space * of the extents since we have no space to indicate the inline data. 
*/ int ext4_get_max_inline_size(struct inode *inode) { int error, max_inline_size; struct ext4_iloc iloc; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode(inode, __func__, __LINE__, 0, "can't get inode location %lu", inode->i_ino); return 0; } down_read(&EXT4_I(inode)->xattr_sem); max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); up_read(&EXT4_I(inode)->xattr_sem); brelse(iloc.bh); if (!max_inline_size) return 0; return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before * the new inode has been unlocked */ int ext4_find_inline_data_nolock(struct inode *inode) { struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; int error; if (EXT4_I(inode)->i_extra_isize == 0) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; if (!is.s.not_found) { if (is.s.here->e_value_inum) { EXT4_ERROR_INODE(inode, "inline data xattr refers " "to an external xattr inode"); error = -EFSCORRUPTED; goto out; } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } out: brelse(is.iloc.bh); return error; } static int ext4_read_inline_data(struct inode *inode, void *buffer, unsigned int len, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; int cp_len = 0; struct ext4_inode *raw_inode; if (!len) return 0; BUG_ON(len > EXT4_I(inode)->i_inline_size); cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? len : EXT4_MIN_INLINE_DATA_SIZE; raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); len -= cp_len; buffer += cp_len; if (!len) goto out; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); len = min_t(unsigned int, len, (unsigned int)le32_to_cpu(entry->e_value_size)); memcpy(buffer, (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); cp_len += len; out: return cp_len; } /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr * value since it is already handled by ext4_xattr_ibody_inline_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int cp_len = 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); raw_inode = ext4_raw_inode(iloc); buffer += pos; if (pos < EXT4_MIN_INLINE_DATA_SIZE) { cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? 
EXT4_MIN_INLINE_DATA_SIZE - pos : len; memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); len -= cp_len; buffer += cp_len; pos += cp_len; } if (!len) return; pos -= EXT4_MIN_INLINE_DATA_SIZE; header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, buffer, len); } static int ext4_create_inline_data(handle_t *handle, struct inode *inode, unsigned len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; if (len > EXT4_MIN_INLINE_DATA_SIZE) { value = EXT4_ZERO_XATTR_VALUE; len -= EXT4_MIN_INLINE_DATA_SIZE; } else { value = ""; len = 0; } /* Insert the the xttr entry. */ i.value = value; i.value_len = len; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(!is.s.not_found); error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: brelse(is.iloc.bh); return error; } static int ext4_update_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int error; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; /* If the old space is ok, write the data directly. */ if (len <= EXT4_I(inode)->i_inline_size) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUG_ON(is.s.not_found); len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); if (!value) { error = -ENOMEM; goto out; } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); if (error == -ENODATA) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; /* Update the xttr entry. 
*/ i.value = value; i.value_len = len; error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + le32_to_cpu(is.s.here->e_value_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); out: kfree(value); brelse(is.iloc.bh); return error; } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, unsigned int len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return -ENOSPC; size = ext4_get_max_inline_size(inode); if (size < len) return -ENOSPC; ext4_write_lock_xattr(inode, &no_expand); if (ei->i_inline_off) ret = ext4_update_inline_data(handle, inode, len); else ret = ext4_create_inline_data(handle, inode, len); ext4_write_unlock_xattr(inode, &no_expand); return ret; } static int ext4_destroy_inline_data_nolock(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_xattr_ibody_find is = { .s = { .not_found = 0, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, .value = NULL, .value_len = 0, }; int error; if (!ei->i_inline_off) return 0; error = ext4_get_inode_loc(inode, &is.iloc); if (error) return error; error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (error) goto out; memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE); if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); EXT4_I(inode)->i_inline_off = 0; EXT4_I(inode)->i_inline_size = 0; ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); out: brelse(is.iloc.bh); if (error == -ENODATA) error = 0; return error; } static int ext4_read_inline_page(struct inode *inode, struct page *page) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; BUG_ON(!PageLocked(page)); BUG_ON(!ext4_has_inline_data(inode)); BUG_ON(page->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", inode->i_ino); goto out; } ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); kaddr = kmap_atomic(page); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); flush_dcache_page(page); kunmap_atomic(kaddr); zero_user_segment(page, len, PAGE_SIZE); SetPageUptodate(page); brelse(iloc.bh); out: return ret; } int ext4_readpage_inline(struct inode *inode, struct page *page) { int ret = 0; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); return -EAGAIN; } if (trace_android_fs_dataread_start_enabled()) { char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; path = android_fstrace_get_pathname(pathbuf, MAX_TRACE_PATHBUF_LEN, inode); 
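/* Resolve a printable path for the inode into pathbuf so that the dataread trace event emitted below can attribute the read to a specific file and the calling task. */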
trace_android_fs_dataread_start(inode, page_offset(page), PAGE_SIZE, current->pid, path, current->comm); } /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ if (!page->index) ret = ext4_read_inline_page(inode, page); else if (!PageUptodate(page)) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); return ret >= 0 ? 0 : ret; } static int ext4_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags) { int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; struct page *page = NULL; unsigned from, to; struct ext4_iloc iloc; if (!ext4_has_inline_data(inode)) { /* * clear the flag so that no new write * will trap here again. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } /* We cannot recurse into the filesystem as the transaction is already * started */ flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out; } ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; /* If some one has already done this for us, just exit. */ if (!ext4_has_inline_data(inode)) { ret = 0; goto out; } from = 0; to = ext4_get_inline_size(inode); if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out; } ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; if (ext4_should_dioread_nolock(inode)) { ret = __block_write_begin(page, from, to, ext4_get_block_unwritten); } else ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, page_buffers(page), from, to, NULL, do_journal_get_write_access); } if (ret) { unlock_page(page); put_page(page); page = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; ext4_journal_stop(handle); handle = NULL; ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (page) block_commit_write(page, from, to); out: if (page) { unlock_page(page); put_page(page); } if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) ext4_journal_stop(handle); brelse(iloc.bh); return ret; } /* * Try to write data in the inode. * If the inode has inline data, check whether the new write can be * in the inode also. If not, create the page the handle, move the data * to the page make it update and let the later codes create extent for it. 
*/ int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep) { int ret; handle_t *handle; struct page *page; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) goto convert; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; /* * The possible write could happen in the inode, * so try to reserve the space in inode first. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; goto out; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out; /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); brelse(iloc.bh); goto convert; } ret = ext4_journal_get_write_access(handle, iloc.bh); if (ret) goto out; flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out; } *pagep = page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; unlock_page(page); put_page(page); goto out_up_read; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) { unlock_page(page); put_page(page); goto out_up_read; } } ret = 1; handle = NULL; out_up_read: up_read(&EXT4_I(inode)->xattr_sem); out: if (handle && (ret != 1)) ext4_journal_stop(handle); brelse(iloc.bh); return ret; convert: return ext4_convert_inline_data_to_extent(mapping, inode, flags); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { int ret, no_expand; void *kaddr; struct ext4_iloc iloc; if (unlikely(copied < len)) { if (!PageUptodate(page)) { copied = 0; goto out; } } ret = ext4_get_inode_loc(inode, &iloc); if (ret) { ext4_std_error(inode->i_sb, ret); copied = 0; goto out; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, len); kunmap_atomic(kaddr); SetPageUptodate(page); /* clear page dirty so that writepages wouldn't work for us. */ ClearPageDirty(page); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); mark_inode_dirty(inode); out: return copied; } struct buffer_head * ext4_journalled_write_inline_data(struct inode *inode, unsigned len, struct page *page) { int ret, no_expand; void *kaddr; struct ext4_iloc iloc; ret = ext4_get_inode_loc(inode, &iloc); if (ret) { ext4_std_error(inode->i_sb, ret); return NULL; } ext4_write_lock_xattr(inode, &no_expand); kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, 0, len); kunmap_atomic(kaddr); ext4_write_unlock_xattr(inode, &no_expand); return iloc.bh; } /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: * 1. The inode is created and the first write exceeds inline size. We can * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't * need to start the journal since the file's metatdata isn't changed now. 
*/ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, unsigned flags, void **fsdata) { int ret = 0, inline_size; struct page *page; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) return -ENOMEM; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); goto out; } inline_size = ext4_get_inline_size(inode); if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out; } ret = __block_write_begin(page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); put_page(page); ext4_truncate_failed_write(inode); return ret; } SetPageDirty(page); SetPageUptodate(page); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); if (page) { unlock_page(page); put_page(page); } return ret; } /* * Prepare the write for the inline data. * If the the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the * normal ext4_da_write_end). */ int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret, inline_size; handle_t *handle; struct page *page; struct ext4_iloc iloc; int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } inline_size = ext4_get_max_inline_size(inode); ret = -ENOSPC; if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) goto out_journal; } /* * We cannot recurse into the filesystem as the transaction * is already started. */ flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; goto out; } page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; goto out_release_page; } if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); if (ret < 0) goto out_release_page; } ret = ext4_journal_get_write_access(handle, iloc.bh); if (ret) goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); put_page(page); out_journal: ext4_journal_stop(handle); out: brelse(iloc.bh); return ret; } int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { int ret; ret = ext4_write_inline_data_end(inode, pos, len, copied, page); if (ret < 0) { unlock_page(page); put_page(page); return ret; } copied = ret; /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. 
*/ if (pos+copied > inode->i_size) i_size_write(inode, pos+copied); unlock_page(page); put_page(page); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */ mark_inode_dirty(inode); return copied; } #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) { int offset; unsigned short de_len; struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; trace_printk("inode %lu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, inline_start, inline_size, offset)) BUG(); offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } } #else #define ext4_show_inline_dir(dir, bh, inline_start, inline_size) #endif /* * Add a new entry into a inline dir. * It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { int err; struct ext4_dir_entry_2 *de; err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; ext4_insert_dentry(inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; return 1; } static void *ext4_get_inline_xattr_pos(struct inode *inode, struct ext4_iloc *iloc) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; BUG_ON(!EXT4_I(inode)->i_inline_off); header = IHDR(inode, ext4_raw_inode(iloc)); entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off); return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); } /* Set the final de to cover the whole block. */ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; int de_len; de = (struct ext4_dir_entry_2 *)de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; de = (struct ext4_dir_entry_2 *)de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - old_size, new_size); } else { /* this is just created, so create an empty entry. 
*/ de->inode = 0; de->rec_len = ext4_rec_len_to_disk(new_size, new_size); } } static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, struct ext4_iloc *iloc) { int ret; int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, new_size + EXT4_MIN_INLINE_DATA_SIZE); if (ret) return ret; ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE); dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; return 0; } static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc, void *buf, int inline_size) { ext4_create_inline_data(handle, inode, inline_size); ext4_write_inline_data(inode, iloc, buf, 0, inline_size); ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } static int ext4_finish_convert_inline_dir(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, void *buf, int inline_size) { int err, csum_size = 0, header_size = 0; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; void *target = dir_block->b_data; /* * First create "." and ".." and then copy the dir information * back to the block. */ de = (struct ext4_dir_entry_2 *)target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); header_size = (void *)de - target; memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; i_size_write(inode, inode->i_sb->s_blocksize); EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; ext4_update_final_de(dir_block->b_data, inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, inode->i_sb->s_blocksize - csum_size); if (csum_size) { t = EXT4_DIRENT_TAIL(dir_block->b_data, inode->i_sb->s_blocksize); initialize_dirent_tail(t, inode->i_sb->s_blocksize); } set_buffer_uptodate(dir_block); err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) return err; set_buffer_verified(dir_block); return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int error; void *buf = NULL; struct buffer_head *data_bh = NULL; struct ext4_map_blocks map; int inline_size; inline_size = ext4_get_inline_size(inode); buf = kmalloc(inline_size, GFP_NOFS); if (!buf) { error = -ENOMEM; goto out; } error = ext4_read_inline_data(inode, buf, inline_size, iloc); if (error < 0) goto out; /* * Make sure the inline directory entries pass checks before we try to * convert them, so that we avoid touching stuff that needs fsck. 
*/ if (S_ISDIR(inode->i_mode)) { error = ext4_check_all_de(inode, iloc->bh, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); if (error) goto out; } error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; map.m_lblk = 0; map.m_len = 1; map.m_flags = 0; error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); if (error < 0) goto out_restore; if (!(map.m_flags & EXT4_MAP_MAPPED)) { error = -EIO; goto out_restore; } data_bh = sb_getblk(inode->i_sb, map.m_pblk); if (!data_bh) { error = -ENOMEM; goto out_restore; } lock_buffer(data_bh); error = ext4_journal_get_create_access(handle, data_bh); if (error) { unlock_buffer(data_bh); error = -EIO; goto out_restore; } memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { error = ext4_finish_convert_inline_dir(handle, inode, data_bh, buf, inline_size); } unlock_buffer(data_bh); out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); out: brelse(data_bh); kfree(buf); return error; } /* * Try to add the new entry to the inline data. * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { int ret, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; ret = ext4_get_inode_loc(dir, &iloc); if (ret) return ret; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) goto out; inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; /* check whether it can be inserted to inline xattr space. */ inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; if (!inline_size) { /* Try to use the xattr space.*/ ret = ext4_update_inline_dir(handle, dir, &iloc); if (ret && ret != -ENOSPC) goto out; inline_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; } /* * The inline space is filled up, so create a new block for it. * As the extent tree will be created, we have to save the inline * dir first. */ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); out: ext4_write_unlock_xattr(dir, &no_expand); ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); return ret; } /* * This function fills a red-black tree with information from an * inlined dir. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. 
*/ int htree_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data) { int err = 0, count = 0; unsigned int parent_ino; int pos; struct ext4_dir_entry_2 *de; struct inode *inode = file_inode(dir_file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; pos = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); while (pos < inline_size) { /* * As inlined dir doesn't store any information about '.' and * only the inode number of '..' is stored, we have to handle * them differently. */ if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( EXT4_DIR_REC_LEN(fake.name_len), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( EXT4_DIR_REC_LEN(fake.name_len), inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; } else { de = (struct ext4_dir_entry_2 *)(dir_buf + pos); pos += ext4_rec_len_from_disk(de->rec_len, inline_size); if (ext4_check_dir_entry(inode, dir_file, de, iloc.bh, dir_buf, inline_size, pos)) { ret = count; goto out; } } ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { count = err; goto out; } count++; } ret = count; out: kfree(dir_buf); brelse(iloc.bh); return ret; } /* * So this function is called when the volume is mkfsed with * dir_index disabled. In order to keep f_pos persistent * after we convert from an inlined dir to a blocked based, * we just pretend that we are a normal dir and return the * offset as if '.' and '..' really take place. 
* */ int ext4_read_inline_dir(struct file *file, struct dir_context *ctx, int *has_inline_data) { unsigned int offset, parent_ino; int i; struct ext4_dir_entry_2 *de; struct super_block *sb; struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { up_read(&EXT4_I(inode)->xattr_sem); *has_inline_data = 0; goto out; } inline_size = ext4_get_inline_size(inode); dir_buf = kmalloc(inline_size, GFP_NOFS); if (!dir_buf) { ret = -ENOMEM; up_read(&EXT4_I(inode)->xattr_sem); goto out; } ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); up_read(&EXT4_I(inode)->xattr_sem); if (ret < 0) goto out; ret = 0; sb = inode->i_sb; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and * size for ".." and "." if the dir is block based while * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ dotdot_offset = EXT4_DIR_REC_LEN(1); dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; /* * If the version has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ if (file->f_version != inode->i_version) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and * ".." is dotdot_offset. */ if (!i) { i = dotdot_offset; continue; } else if (i == dotdot_offset) { i = dotdot_size; continue; } /* for other entry, the real offset in * the buf has to be tuned accordingly. */ de = (struct ext4_dir_entry_2 *) (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); } offset = i; ctx->pos = offset; file->f_version = inode->i_version; } while (ctx->pos < extra_size) { if (ctx->pos == 0) { if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; ctx->pos = dotdot_offset; continue; } if (ctx->pos == dotdot_offset) { if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) goto out; ctx->pos = dotdot_size; continue; } de = (struct ext4_dir_entry_2 *) (dir_buf + ctx->pos - extra_offset); if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, extra_size, ctx->pos)) goto out; if (le32_to_cpu(de->inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto out; } ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); brelse(iloc.bh); return ret; } struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) { struct ext4_iloc iloc; *retval = ext4_get_inode_loc(inode, &iloc); if (*retval) return NULL; *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; return iloc.bh; } /* * Try to create the inline data for the new dir. * If it succeeds, return 0, otherwise return the error. 
* In case of ENOSPC, the caller should create the normal disk layout dir. */ int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode) { int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; struct ext4_iloc iloc; struct ext4_dir_entry_2 *de; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; ret = ext4_prepare_inline_data(handle, inode, inline_size); if (ret) goto out; /* * For inline dir, we only save the inode information for the ".." * and create a fake dentry to cover the left space. */ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; de->inode = cpu_to_le32(parent->i_ino); de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); de->inode = 0; de->rec_len = ext4_rec_len_to_disk( inline_size - EXT4_INLINE_DOTDOT_SIZE, inline_size); set_nlink(inode, 2); inode->i_size = EXT4_I(inode)->i_disksize = inline_size; out: brelse(iloc.bh); return ret; } struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { int ret; struct ext4_iloc iloc; void *inline_start; int inline_size; if (ext4_get_inode_loc(dir, &iloc)) return NULL; down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) goto out; if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; ret = ext4_search_dir(iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: brelse(iloc.bh); iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data) { int err, inline_size, no_expand; struct ext4_iloc iloc; void *inline_start; err = ext4_get_inode_loc(dir, &iloc); if (err) return err; ext4_write_lock_xattr(dir, &no_expand); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < EXT4_MIN_INLINE_DATA_SIZE) { inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; } else { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) goto out; err = ext4_generic_delete_entry(handle, dir, de_del, bh, inline_start, inline_size, 0); if (err) goto out; ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); out: ext4_write_unlock_xattr(dir, &no_expand); if (likely(err == 0)) err = ext4_mark_inode_dirty(handle, dir); brelse(iloc.bh); if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Get the inline dentry at offset. 
*/ static inline struct ext4_dir_entry_2 * ext4_get_inline_entry(struct inode *inode, struct ext4_iloc *iloc, unsigned int offset, void **inline_start, int *inline_size) { void *inline_pos; BUG_ON(offset > ext4_get_inline_size(inode)); if (offset < EXT4_MIN_INLINE_DATA_SIZE) { inline_pos = (void *)ext4_raw_inode(iloc)->i_block; *inline_size = EXT4_MIN_INLINE_DATA_SIZE; } else { inline_pos = ext4_get_inline_xattr_pos(inode, iloc); offset -= EXT4_MIN_INLINE_DATA_SIZE; *inline_size = ext4_get_inline_size(inode) - EXT4_MIN_INLINE_DATA_SIZE; } if (inline_start) *inline_start = inline_pos; return (struct ext4_dir_entry_2 *)(inline_pos + offset); } bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; bool ret = true; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); return true; } down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); ret = true; goto out; } inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); ret = true; goto out; } if (le32_to_cpu(de->inode)) { ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); } out: up_read(&EXT4_I(dir)->xattr_sem); brelse(iloc.bh); return ret; } int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) { int ret, no_expand; ext4_write_lock_xattr(inode, &no_expand); ret = ext4_destroy_inline_data_nolock(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); return ret; } int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len) { __u64 physical = 0; __u64 inline_len; __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | FIEMAP_EXTENT_LAST; int error = 0; struct ext4_iloc iloc; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { *has_inline = 0; goto out; } inline_len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); if (start >= inline_len) goto out; if (start + len < inline_len) inline_len = start + len; inline_len -= start; error = ext4_get_inode_loc(inode, &iloc); if (error) goto out; physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); if (physical) error = fiemap_fill_next_extent(fieinfo, start, physical, inline_len, flags); return (error < 0 ? 
error : 0); } int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; int inline_size, value_len, needed_blocks, no_expand, err = 0; size_t i_size; void *value = NULL; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_info i = { .name_index = EXT4_XATTR_INDEX_SYSTEM, .name = EXT4_XATTR_SYSTEM_DATA, }; needed_blocks = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_write_lock_xattr(inode, &no_expand); if (!ext4_has_inline_data(inode)) { *has_inline = 0; ext4_journal_stop(handle); return 0; } if ((err = ext4_orphan_add(handle, inode)) != 0) goto out; if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0) goto out; down_write(&EXT4_I(inode)->i_data_sem); i_size = inode->i_size; inline_size = ext4_get_inline_size(inode); EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; BUG_ON(is.s.not_found); value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); if (!value) { err = -ENOMEM; goto out_error; } err = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, value_len); if (err <= 0) goto out_error; i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; err = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); if (err) goto out_error; } /* Clear the content within i_blocks. */ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; memset(p + i_size, 0, EXT4_MIN_INLINE_DATA_SIZE - i_size); } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? EXT4_MIN_INLINE_DATA_SIZE : i_size; } out_error: up_write(&EXT4_I(inode)->i_data_sem); out: brelse(is.iloc.bh); ext4_write_unlock_xattr(inode, &no_expand); kfree(value); if (inode->i_nlink) ext4_orphan_del(handle, inode); if (err == 0) { inode->i_mtime = inode->i_ctime = current_time(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_journal_stop(handle); return err; } int ext4_convert_inline_data(struct inode *inode) { int error, needed_blocks, no_expand; handle_t *handle; struct ext4_iloc iloc; 430 if (!ext4_has_inline_data(inode)) { 430 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 430 return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_free; } ext4_write_lock_xattr(inode, &no_expand); if (ext4_has_inline_data(inode)) error = ext4_convert_inline_data_nolock(handle, inode, &iloc); ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); out_free: brelse(iloc.bh); return error; }
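/*
 * Editor's note: a self-contained userspace sketch, not ext4 code, of the
 * split that ext4_write_inline_data() above performs: the bytes of a write
 * that land below EXT4_MIN_INLINE_DATA_SIZE go into the inode's i_block
 * array, and the remainder continues in the value of the system.data xattr.
 * The two static buffers stand in for the raw inode and the xattr value,
 * and FAKE_MIN_INLINE_DATA_SIZE assumes EXT4_N_BLOCKS == 15 (60 bytes), as
 * in the EXT4_MIN_INLINE_DATA_SIZE definition near the top of this file.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define FAKE_MIN_INLINE_DATA_SIZE	(sizeof(uint32_t) * 15)	/* 60 bytes */

static char fake_i_block[FAKE_MIN_INLINE_DATA_SIZE];
static char fake_xattr_value[256];

/* Copy @len bytes of @buf to logical offset @pos of the inline data. */
static void fake_write_inline(const char *buf, size_t pos, size_t len)
{
	if (pos < FAKE_MIN_INLINE_DATA_SIZE) {
		/* Portion that still fits inside the i_block area. */
		size_t cp = (pos + len > FAKE_MIN_INLINE_DATA_SIZE) ?
				FAKE_MIN_INLINE_DATA_SIZE - pos : len;

		memcpy(fake_i_block + pos, buf, cp);
		buf += cp;
		pos += cp;
		len -= cp;
	}
	if (!len)
		return;
	/* Remainder continues in the xattr value, offset past i_block. */
	memcpy(fake_xattr_value + (pos - FAKE_MIN_INLINE_DATA_SIZE), buf, len);
}

int main(void)
{
	char payload[80];

	memset(payload, 'A', sizeof(payload));
	/* A write of 80 bytes at offset 50: 10 land in i_block, 70 in xattr. */
	fake_write_inline(payload, 50, sizeof(payload));
	printf("i_block tail: %c, xattr[69]: %c\n",
	       fake_i_block[59], fake_xattr_value[69]);
	return 0;
}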
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM exceptions

#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGE_FAULT_H

#include <linux/tracepoint.h>
#include <asm/trace/common.h>

extern int trace_pagefault_reg(void);
extern void trace_pagefault_unreg(void);

DECLARE_EVENT_CLASS(x86_exceptions,

	TP_PROTO(unsigned long address, struct pt_regs *regs,
		 unsigned long error_code),

	TP_ARGS(address, regs, error_code),

	TP_STRUCT__entry(
		__field(	unsigned long, address		)
		__field(	unsigned long, ip		)
		__field(	unsigned long, error_code	)
	),

	TP_fast_assign(
		__entry->address = address;
		__entry->ip = regs->ip;
		__entry->error_code = error_code;
	),

	TP_printk("address=%pf ip=%pf error_code=0x%lx",
		  (void *)__entry->address, (void *)__entry->ip,
		  __entry->error_code) );

#define DEFINE_PAGE_FAULT_EVENT(name)				\
DEFINE_EVENT_FN(x86_exceptions, name,				\
	TP_PROTO(unsigned long address, struct pt_regs *regs,	\
		 unsigned long error_code),			\
	TP_ARGS(address, regs, error_code),			\
	trace_pagefault_reg, trace_pagefault_unreg);

DEFINE_PAGE_FAULT_EVENT(page_fault_user);
DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE exceptions
#endif /* _TRACE_PAGE_FAULT_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
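/*
 * Editor's note: a minimal userspace sketch, not part of this header,
 * showing one way the two page-fault events defined above might be
 * consumed. Because TRACE_SYSTEM is "exceptions", the events are expected
 * under events/exceptions/ in tracefs; the mount point used here is an
 * assumption (tracefs is commonly available at /sys/kernel/debug/tracing),
 * and enabling events normally requires root. The reader loop runs until
 * interrupted, since trace_pipe blocks waiting for new events.
 */
#include <stdio.h>
#include <string.h>

#define TRACEFS "/sys/kernel/debug/tracing"	/* assumed mount point */

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[512];
	FILE *pipe;

	/* Enable the user page-fault event defined by DEFINE_PAGE_FAULT_EVENT(). */
	if (write_str(TRACEFS "/events/exceptions/page_fault_user/enable", "1")) {
		perror("enable page_fault_user");
		return 1;
	}

	/* Each line follows the TP_printk() format: address, ip, error_code. */
	pipe = fopen(TRACEFS "/trace_pipe", "r");
	if (!pipe) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(line, sizeof(line), pipe))
		if (strstr(line, "page_fault_user"))
			fputs(line, stdout);

	fclose(pipe);
	return 0;
}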
/* SPDX-License-Identifier: GPL-2.0 */ /* Atomic operations usable in machine independent code */ #ifndef _LINUX_ATOMIC_H #define _LINUX_ATOMIC_H #include <asm/atomic.h> #include <asm/barrier.h> /* * Relaxed variants of xchg, cmpxchg and some atomic operations. * * We support four variants: * * - Fully ordered: The default implementation, no suffix required. * - Acquire: Provides ACQUIRE semantics, _acquire suffix. * - Release: Provides RELEASE semantics, _release suffix. * - Relaxed: No ordering guarantees, _relaxed suffix. * * For compound atomics performing both a load and a store, ACQUIRE * semantics apply only to the load and RELEASE semantics only to the * store portion of the operation. Note that a failed cmpxchg_acquire * does -not- imply any memory ordering constraints. * * See Documentation/memory-barriers.txt for ACQUIRE/RELEASE definitions. */ #ifndef atomic_read_acquire #define atomic_read_acquire(v) smp_load_acquire(&(v)->counter) #endif #ifndef atomic_set_release #define atomic_set_release(v, i) smp_store_release(&(v)->counter, (i)) #endif /* * The idea here is to build acquire/release variants by adding explicit * barriers on top of the relaxed variant. In the case where the relaxed * variant is already fully ordered, no additional barriers are needed. * * Besides, if an arch has a special barrier for acquire/release, it could * implement its own __atomic_op_* and use the same framework for building * variants * * If an architecture overrides __atomic_op_acquire() it will probably want * to define smp_mb__after_spinlock(). */ #ifndef __atomic_op_acquire #define __atomic_op_acquire(op, args...) \ ({ \ typeof(op##_relaxed(args)) __ret = op##_relaxed(args); \ smp_mb__after_atomic(); \ __ret; \ }) #endif #ifndef __atomic_op_release #define __atomic_op_release(op, args...) \ ({ \ smp_mb__before_atomic(); \ op##_relaxed(args); \ }) #endif #ifndef __atomic_op_fence #define __atomic_op_fence(op, args...) \ ({ \ typeof(op##_relaxed(args)) __ret; \ smp_mb__before_atomic(); \ __ret = op##_relaxed(args); \ smp_mb__after_atomic(); \ __ret; \ }) #endif /* atomic_add_return_relaxed */ #ifndef atomic_add_return_relaxed #define atomic_add_return_relaxed atomic_add_return #define atomic_add_return_acquire atomic_add_return #define atomic_add_return_release atomic_add_return #else /* atomic_add_return_relaxed */ #ifndef atomic_add_return_acquire #define atomic_add_return_acquire(...) \ __atomic_op_acquire(atomic_add_return, __VA_ARGS__) #endif #ifndef atomic_add_return_release #define atomic_add_return_release(...) \ __atomic_op_release(atomic_add_return, __VA_ARGS__) #endif #ifndef atomic_add_return #define atomic_add_return(...) \ __atomic_op_fence(atomic_add_return, __VA_ARGS__) #endif #endif /* atomic_add_return_relaxed */ /* atomic_inc_return_relaxed */ #ifndef atomic_inc_return_relaxed #define atomic_inc_return_relaxed atomic_inc_return #define atomic_inc_return_acquire atomic_inc_return #define atomic_inc_return_release atomic_inc_return #else /* atomic_inc_return_relaxed */ #ifndef atomic_inc_return_acquire #define atomic_inc_return_acquire(...) \ __atomic_op_acquire(atomic_inc_return, __VA_ARGS__) #endif #ifndef atomic_inc_return_release #define atomic_inc_return_release(...) \ __atomic_op_release(atomic_inc_return, __VA_ARGS__) #endif #ifndef atomic_inc_return #define atomic_inc_return(...) 
#ifndef atomic_inc_return
#define atomic_inc_return(...)						\
	__atomic_op_fence(atomic_inc_return, __VA_ARGS__)
#endif
#endif /* atomic_inc_return_relaxed */

/* atomic_sub_return_relaxed */
#ifndef atomic_sub_return_relaxed
#define atomic_sub_return_relaxed	atomic_sub_return
#define atomic_sub_return_acquire	atomic_sub_return
#define atomic_sub_return_release	atomic_sub_return

#else /* atomic_sub_return_relaxed */

#ifndef atomic_sub_return_acquire
#define atomic_sub_return_acquire(...)					\
	__atomic_op_acquire(atomic_sub_return, __VA_ARGS__)
#endif

#ifndef atomic_sub_return_release
#define atomic_sub_return_release(...)					\
	__atomic_op_release(atomic_sub_return, __VA_ARGS__)
#endif

#ifndef atomic_sub_return
#define atomic_sub_return(...)						\
	__atomic_op_fence(atomic_sub_return, __VA_ARGS__)
#endif
#endif /* atomic_sub_return_relaxed */

/* atomic_dec_return_relaxed */
#ifndef atomic_dec_return_relaxed
#define atomic_dec_return_relaxed	atomic_dec_return
#define atomic_dec_return_acquire	atomic_dec_return
#define atomic_dec_return_release	atomic_dec_return

#else /* atomic_dec_return_relaxed */

#ifndef atomic_dec_return_acquire
#define atomic_dec_return_acquire(...)					\
	__atomic_op_acquire(atomic_dec_return, __VA_ARGS__)
#endif

#ifndef atomic_dec_return_release
#define atomic_dec_return_release(...)					\
	__atomic_op_release(atomic_dec_return, __VA_ARGS__)
#endif

#ifndef atomic_dec_return
#define atomic_dec_return(...)						\
	__atomic_op_fence(atomic_dec_return, __VA_ARGS__)
#endif
#endif /* atomic_dec_return_relaxed */

/* atomic_fetch_add_relaxed */
#ifndef atomic_fetch_add_relaxed
#define atomic_fetch_add_relaxed	atomic_fetch_add
#define atomic_fetch_add_acquire	atomic_fetch_add
#define atomic_fetch_add_release	atomic_fetch_add

#else /* atomic_fetch_add_relaxed */

#ifndef atomic_fetch_add_acquire
#define atomic_fetch_add_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_add, __VA_ARGS__)
#endif

#ifndef atomic_fetch_add_release
#define atomic_fetch_add_release(...)					\
	__atomic_op_release(atomic_fetch_add, __VA_ARGS__)
#endif

#ifndef atomic_fetch_add
#define atomic_fetch_add(...)						\
	__atomic_op_fence(atomic_fetch_add, __VA_ARGS__)
#endif
#endif /* atomic_fetch_add_relaxed */

/* atomic_fetch_inc_relaxed */
#ifndef atomic_fetch_inc_relaxed

#ifndef atomic_fetch_inc
#define atomic_fetch_inc(v)		atomic_fetch_add(1, (v))
#define atomic_fetch_inc_relaxed(v)	atomic_fetch_add_relaxed(1, (v))
#define atomic_fetch_inc_acquire(v)	atomic_fetch_add_acquire(1, (v))
#define atomic_fetch_inc_release(v)	atomic_fetch_add_release(1, (v))
#else /* atomic_fetch_inc */
#define atomic_fetch_inc_relaxed	atomic_fetch_inc
#define atomic_fetch_inc_acquire	atomic_fetch_inc
#define atomic_fetch_inc_release	atomic_fetch_inc
#endif /* atomic_fetch_inc */

#else /* atomic_fetch_inc_relaxed */

#ifndef atomic_fetch_inc_acquire
#define atomic_fetch_inc_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_inc, __VA_ARGS__)
#endif

#ifndef atomic_fetch_inc_release
#define atomic_fetch_inc_release(...)					\
	__atomic_op_release(atomic_fetch_inc, __VA_ARGS__)
#endif

#ifndef atomic_fetch_inc
#define atomic_fetch_inc(...)						\
	__atomic_op_fence(atomic_fetch_inc, __VA_ARGS__)
#endif
#endif /* atomic_fetch_inc_relaxed */
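/*
 * Illustrative sketch, not part of the original header: the atomic_fetch_*()
 * operations return the value the counter held *before* the update, which is
 * what makes "who got there first" patterns work.  Assuming a caller-defined
 * counter and setup function (both hypothetical):
 *
 *	static atomic_t once = ATOMIC_INIT(0);
 *
 *	if (atomic_fetch_inc(&once) == 0)
 *		do_first_time_setup();	(only the first incrementer sees 0)
 */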
/* atomic_fetch_sub_relaxed */
#ifndef atomic_fetch_sub_relaxed
#define atomic_fetch_sub_relaxed	atomic_fetch_sub
#define atomic_fetch_sub_acquire	atomic_fetch_sub
#define atomic_fetch_sub_release	atomic_fetch_sub

#else /* atomic_fetch_sub_relaxed */

#ifndef atomic_fetch_sub_acquire
#define atomic_fetch_sub_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_sub, __VA_ARGS__)
#endif

#ifndef atomic_fetch_sub_release
#define atomic_fetch_sub_release(...)					\
	__atomic_op_release(atomic_fetch_sub, __VA_ARGS__)
#endif

#ifndef atomic_fetch_sub
#define atomic_fetch_sub(...)						\
	__atomic_op_fence(atomic_fetch_sub, __VA_ARGS__)
#endif
#endif /* atomic_fetch_sub_relaxed */

/* atomic_fetch_dec_relaxed */
#ifndef atomic_fetch_dec_relaxed

#ifndef atomic_fetch_dec
#define atomic_fetch_dec(v)		atomic_fetch_sub(1, (v))
#define atomic_fetch_dec_relaxed(v)	atomic_fetch_sub_relaxed(1, (v))
#define atomic_fetch_dec_acquire(v)	atomic_fetch_sub_acquire(1, (v))
#define atomic_fetch_dec_release(v)	atomic_fetch_sub_release(1, (v))
#else /* atomic_fetch_dec */
#define atomic_fetch_dec_relaxed	atomic_fetch_dec
#define atomic_fetch_dec_acquire	atomic_fetch_dec
#define atomic_fetch_dec_release	atomic_fetch_dec
#endif /* atomic_fetch_dec */

#else /* atomic_fetch_dec_relaxed */

#ifndef atomic_fetch_dec_acquire
#define atomic_fetch_dec_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_dec, __VA_ARGS__)
#endif

#ifndef atomic_fetch_dec_release
#define atomic_fetch_dec_release(...)					\
	__atomic_op_release(atomic_fetch_dec, __VA_ARGS__)
#endif

#ifndef atomic_fetch_dec
#define atomic_fetch_dec(...)						\
	__atomic_op_fence(atomic_fetch_dec, __VA_ARGS__)
#endif
#endif /* atomic_fetch_dec_relaxed */

/* atomic_fetch_or_relaxed */
#ifndef atomic_fetch_or_relaxed
#define atomic_fetch_or_relaxed		atomic_fetch_or
#define atomic_fetch_or_acquire		atomic_fetch_or
#define atomic_fetch_or_release		atomic_fetch_or

#else /* atomic_fetch_or_relaxed */

#ifndef atomic_fetch_or_acquire
#define atomic_fetch_or_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_or, __VA_ARGS__)
#endif

#ifndef atomic_fetch_or_release
#define atomic_fetch_or_release(...)					\
	__atomic_op_release(atomic_fetch_or, __VA_ARGS__)
#endif

#ifndef atomic_fetch_or
#define atomic_fetch_or(...)						\
	__atomic_op_fence(atomic_fetch_or, __VA_ARGS__)
#endif
#endif /* atomic_fetch_or_relaxed */

/* atomic_fetch_and_relaxed */
#ifndef atomic_fetch_and_relaxed
#define atomic_fetch_and_relaxed	atomic_fetch_and
#define atomic_fetch_and_acquire	atomic_fetch_and
#define atomic_fetch_and_release	atomic_fetch_and

#else /* atomic_fetch_and_relaxed */

#ifndef atomic_fetch_and_acquire
#define atomic_fetch_and_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_and, __VA_ARGS__)
#endif

#ifndef atomic_fetch_and_release
#define atomic_fetch_and_release(...)					\
	__atomic_op_release(atomic_fetch_and, __VA_ARGS__)
#endif

#ifndef atomic_fetch_and
#define atomic_fetch_and(...)						\
	__atomic_op_fence(atomic_fetch_and, __VA_ARGS__)
#endif
#endif /* atomic_fetch_and_relaxed */

#ifdef atomic_andnot
/* atomic_fetch_andnot_relaxed */
#ifndef atomic_fetch_andnot_relaxed
#define atomic_fetch_andnot_relaxed	atomic_fetch_andnot
#define atomic_fetch_andnot_acquire	atomic_fetch_andnot
#define atomic_fetch_andnot_release	atomic_fetch_andnot

#else /* atomic_fetch_andnot_relaxed */

#ifndef atomic_fetch_andnot_acquire
#define atomic_fetch_andnot_acquire(...)				\
	__atomic_op_acquire(atomic_fetch_andnot, __VA_ARGS__)
#endif

#ifndef atomic_fetch_andnot_release
#define atomic_fetch_andnot_release(...)				\
	__atomic_op_release(atomic_fetch_andnot, __VA_ARGS__)
#endif

#ifndef atomic_fetch_andnot
#define atomic_fetch_andnot(...)					\
	__atomic_op_fence(atomic_fetch_andnot, __VA_ARGS__)
#endif
#endif /* atomic_fetch_andnot_relaxed */
#endif /* atomic_andnot */
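/*
 * Illustrative sketch, not part of the original header: atomic_fetch_andnot()
 * is only generated when the architecture defines atomic_andnot(), hence the
 * surrounding #ifdef above.  Where it exists it clears bits and hands back
 * the old value ('flags' and FLAG_PENDING are hypothetical names):
 *
 *	static atomic_t flags = ATOMIC_INIT(0);
 *	int old;
 *
 *	atomic_fetch_or(FLAG_PENDING, &flags);			set the bit
 *	old = atomic_fetch_andnot(FLAG_PENDING, &flags);	clear it; 'old'
 *								says whether it
 *								was set before
 */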
/* atomic_fetch_xor_relaxed */
#ifndef atomic_fetch_xor_relaxed
#define atomic_fetch_xor_relaxed	atomic_fetch_xor
#define atomic_fetch_xor_acquire	atomic_fetch_xor
#define atomic_fetch_xor_release	atomic_fetch_xor

#else /* atomic_fetch_xor_relaxed */

#ifndef atomic_fetch_xor_acquire
#define atomic_fetch_xor_acquire(...)					\
	__atomic_op_acquire(atomic_fetch_xor, __VA_ARGS__)
#endif

#ifndef atomic_fetch_xor_release
#define atomic_fetch_xor_release(...)					\
	__atomic_op_release(atomic_fetch_xor, __VA_ARGS__)
#endif

#ifndef atomic_fetch_xor
#define atomic_fetch_xor(...)						\
	__atomic_op_fence(atomic_fetch_xor, __VA_ARGS__)
#endif
#endif /* atomic_fetch_xor_relaxed */

/* atomic_xchg_relaxed */
#ifndef atomic_xchg_relaxed
#define atomic_xchg_relaxed		atomic_xchg
#define atomic_xchg_acquire		atomic_xchg
#define atomic_xchg_release		atomic_xchg

#else /* atomic_xchg_relaxed */

#ifndef atomic_xchg_acquire
#define atomic_xchg_acquire(...)					\
	__atomic_op_acquire(atomic_xchg, __VA_ARGS__)
#endif

#ifndef atomic_xchg_release
#define atomic_xchg_release(...)					\
	__atomic_op_release(atomic_xchg, __VA_ARGS__)
#endif

#ifndef atomic_xchg
#define atomic_xchg(...)						\
	__atomic_op_fence(atomic_xchg, __VA_ARGS__)
#endif
#endif /* atomic_xchg_relaxed */

/* atomic_cmpxchg_relaxed */
#ifndef atomic_cmpxchg_relaxed
#define atomic_cmpxchg_relaxed		atomic_cmpxchg
#define atomic_cmpxchg_acquire		atomic_cmpxchg
#define atomic_cmpxchg_release		atomic_cmpxchg

#else /* atomic_cmpxchg_relaxed */

#ifndef atomic_cmpxchg_acquire
#define atomic_cmpx