/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 * GPL v2
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible inline void enter_from_user_mode(void)
{
        CT_WARN_ON(ct_state() != CONTEXT_USER);
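        /* Tell context tracking (and thus RCU) that we have left user mode. */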
        user_exit_irqoff();
}
#else
static inline void enter_from_user_mode(void) {}
#endif

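/*
 * Report the syscall number and its first four arguments to the audit
 * subsystem, using the register layout of the reported audit arch.
 */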
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
        if (arch == AUDIT_ARCH_X86_64) {
                audit_syscall_entry(regs->orig_ax, regs->di,
                                    regs->si, regs->dx, regs->r10);
        } else
#endif
        {
                audit_syscall_entry(regs->orig_ax, regs->bx,
                                    regs->cx, regs->dx, regs->si);
        }
}

/*
 * Returns the syscall nr to run (which should match regs->orig_ax) or -1
 * to skip the syscall.
 */
static long syscall_trace_enter(struct pt_regs *regs)
{
        u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;

        struct thread_info *ti = current_thread_info();
        unsigned long ret = 0;
        bool emulated = false;
        u32 work;

        if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                BUG_ON(regs != task_pt_regs(current));

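        /*
         * Snapshot the syscall-entry work bits once; the flags word can
         * change concurrently, and the checks below operate on this copy.
         */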
        work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;

        if (unlikely(work & _TIF_SYSCALL_EMU))
                emulated = true;

        if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
            tracehook_report_syscall_entry(regs))
                return -1L;

        if (emulated)
                return -1L;

#ifdef CONFIG_SECCOMP
        /*
         * Do seccomp after ptrace, to catch any tracer changes.
         */
        if (work & _TIF_SECCOMP) {
                struct seccomp_data sd;

                sd.arch = arch;
                sd.nr = regs->orig_ax;
                sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
                if (arch == AUDIT_ARCH_X86_64) {
                        sd.args[0] = regs->di;
                        sd.args[1] = regs->si;
                        sd.args[2] = regs->dx;
                        sd.args[3] = regs->r10;
                        sd.args[4] = regs->r8;
                        sd.args[5] = regs->r9;
                } else
#endif
                {
                        sd.args[0] = regs->bx;
                        sd.args[1] = regs->cx;
                        sd.args[2] = regs->dx;
                        sd.args[3] = regs->si;
                        sd.args[4] = regs->di;
                        sd.args[5] = regs->bp;
                }

                ret = __secure_computing(&sd);
                if (ret == -1)
                        return ret;
        }
#endif

        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->orig_ax);

        do_audit_syscall_entry(regs, arch);

        /*
         * ret is -1 if seccomp asked us to skip the syscall; otherwise run
         * whatever is now in orig_ax (a tracer may have changed it).
         */
        return ret ?: regs->orig_ax;
}

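/*
 * Work that must be finished, with IRQs off, before we can return to user
 * mode.  Any of these flags sends us through exit_to_usermode_loop().
 */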
#define EXIT_TO_USERMODE_LOOP_FLAGS                                \
        (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |        \
         _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)

static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
        /*
         * In order to return to user mode, we need to have IRQs off with
         * none of EXIT_TO_USERMODE_LOOP_FLAGS set.  Several of these flags
         * can be set at any time on preemptable kernels if we have IRQs on,
         * so we need to loop.  Disabling preemption wouldn't help: doing the
         * work to clear some of the flags can sleep.
         */
        while (true) {
                /* We have work to do. */
                local_irq_enable();

                if (cached_flags & _TIF_NEED_RESCHED)
                        schedule();

                if (cached_flags & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (cached_flags & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                /* deal with pending signal delivery */
                if (cached_flags & _TIF_SIGPENDING)
                        do_signal(regs);

                if (cached_flags & _TIF_NOTIFY_RESUME) {
                        clear_thread_flag(TIF_NOTIFY_RESUME);
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                if (cached_flags & _TIF_USER_RETURN_NOTIFY)
                        fire_user_return_notifiers();

                /* Disable IRQs and retry */
                local_irq_disable();

                cached_flags = READ_ONCE(current_thread_info()->flags);

                if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
                        break;
        }
}

/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
        struct thread_info *ti = current_thread_info();
        u32 cached_flags;

        /* Catch a set_fs() imbalance: the address limit must be USER_DS here. */
        addr_limit_user_check();

        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();

        cached_flags = READ_ONCE(ti->flags);

        if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
                exit_to_usermode_loop(regs, cached_flags);

#ifdef CONFIG_COMPAT
        /*
         * Compat syscalls set TS_COMPAT.  Make sure we clear it before
         * returning to user mode.  We need to clear it *after* signal
         * handling, because syscall restart has a fixup for compat
         * syscalls.  The fixup is exercised by the ptrace_syscall_32
         * selftest.
         *
         * We also need to clear TS_I386_REGS_POKED: the 32-bit tracer
         * special case only applies after poking regs and before the
         * very next return to user mode.
         */
        ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif

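        /* Tell context tracking (and thus RCU) that we are entering user mode. */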
        user_enter_irqoff();
}

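/*
 * One-time syscall-exit work (audit, tracepoints, ptrace reporting,
 * single-step), handled in syscall_slow_exit_work().
 */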
#define SYSCALL_EXIT_WORK_FLAGS                                \
        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |        \
         _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
{
        bool step;

        audit_syscall_exit(regs);

        if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, regs->ax);

        /*
         * If TIF_SYSCALL_EMU is set, we only get here because of
         * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
         * We already reported this syscall instruction in
         * syscall_trace_enter().
         */
        step = unlikely(
                (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
                == _TIF_SINGLESTEP);
        if (step || cached_flags & _TIF_SYSCALL_TRACE)
                tracehook_report_syscall_exit(regs, step);
}

/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 */
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
        struct thread_info *ti = current_thread_info();
        u32 cached_flags = READ_ONCE(ti->flags);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
            WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
                local_irq_enable();

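        /*
         * With CONFIG_DEBUG_RSEQ, check that the syscall was not issued
         * from inside an rseq critical section; otherwise this is a no-op.
         */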
        rseq_syscall(regs);

        /*
         * First do one-time work.  If these work items are enabled, we
         * want to run them exactly once per syscall exit with IRQs on.
         */
        if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
                syscall_slow_exit_work(regs, cached_flags);

        local_irq_disable();
        prepare_exit_to_usermode(regs);
}

#ifdef CONFIG_X86_64
__no_sanitize_memory
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
        struct thread_info *ti;

        enter_from_user_mode();
        local_irq_enable();
        ti = current_thread_info();
        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
                nr = syscall_trace_enter(regs);

        /*
         * NB: Native and x32 syscalls are dispatched from the same
         * table.  The only functional difference is the x32 bit in
         * regs->orig_ax, which changes the behavior of some syscalls.
         */
        nr &= __SYSCALL_MASK;
        if (likely(nr < NR_syscalls)) {
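                /*
                 * Clamp the table index so that speculation past the bounds
                 * check cannot read beyond sys_call_table (Spectre v1).
                 */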
                nr = array_index_nospec(nr, NR_syscalls);
                regs->ax = sys_call_table[nr](regs);
        }

        syscall_return_slowpath(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Does a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.  Does
 * all entry and exit work and returns with IRQs off.  This function is
 * extremely hot in workloads that use it, and it's usually called from
 * do_fast_syscall_32, so forcibly inline it to improve performance.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
        struct thread_info *ti = current_thread_info();
        unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
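        /*
         * Mark this task as running a compat syscall so in_ia32_syscall()
         * and the signal/restart code see the 32-bit ABI; cleared again in
         * prepare_exit_to_usermode().
         */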
        ti->status |= TS_COMPAT;
#endif

        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
                /*
                 * Subtlety here: if ptrace pokes something larger than
                 * 2^32-1 into orig_ax, this truncates it.  This may or
                 * may not be necessary, but it matches the old asm
                 * behavior.
                 */
                nr = syscall_trace_enter(regs);
        }

        if (likely(nr < IA32_NR_syscalls)) {
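                /*
                 * As in the 64-bit path, clamp the index so Spectre-v1
                 * speculation cannot run off the end of the table.
                 */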
                nr = array_index_nospec(nr, IA32_NR_syscalls);
#ifdef CONFIG_IA32_EMULATION
                regs->ax = ia32_sys_call_table[nr](regs);
#else
                /*
                 * It's possible that a 32-bit syscall implementation
                 * takes a 64-bit parameter but nonetheless assumes that
                 * the high bits are zero.  Make sure we zero-extend all
                 * of the args.
                 */
                regs->ax = ia32_sys_call_table[nr](
                        (unsigned int)regs->bx, (unsigned int)regs->cx,
                        (unsigned int)regs->dx, (unsigned int)regs->si,
                        (unsigned int)regs->di, (unsigned int)regs->bp);
#endif /* CONFIG_IA32_EMULATION */
        }

        syscall_return_slowpath(regs);
}

/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
        enter_from_user_mode();
        local_irq_enable();
        do_syscall_32_irqs_on(regs);
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
        /*
         * Called using the internal vDSO SYSENTER/SYSCALL32 calling
         * convention.  Adjust regs so it looks like we entered using int80.
         */

        unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
                vdso_image_32.sym_int80_landing_pad;

        /*
         * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
         * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
         * Fix it up.
         */
        regs->ip = landing_pad;

        enter_from_user_mode();

        local_irq_enable();

        /*
         * Fetch EBP from where the vDSO stashed it.  The sixth syscall
         * argument normally lives in EBP, but the SYSENTER/SYSCALL32 stub
         * in the vDSO clobbers EBP, so it saved the original value at the
         * top of the user stack (regs->sp) before entering the kernel.
         */
        if (
#ifdef CONFIG_X86_64
                /*
                 * Micro-optimization: the pointer we're following is explicitly
                 * 32 bits, so it can't be out of range.
                 */
                __get_user(*(u32 *)&regs->bp,
                            (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
                get_user(*(u32 *)&regs->bp,
                         (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
                ) {

                /* User code screwed up. */
                local_irq_disable();
                regs->ax = -EFAULT;
                prepare_exit_to_usermode(regs);
                return 0;        /* Keep it simple: use IRET. */
        }

        /* Now this is just like a normal syscall. */
        do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
        /*
         * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
         * SYSRETL is available on all 64-bit CPUs, so we don't need to
         * bother with SYSEXIT.
         *
         * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
         * because the ECX fixup above will ensure that this is essentially
         * never the case.
         */
        return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
                regs->ip == landing_pad &&
                (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
        /*
         * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
         *
         * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
         * because the ECX fixup above will ensure that this is essentially
         * never the case.
         *
         * We don't allow syscalls at all from VM86 mode, but we still
         * need to check VM, because we might be returning from sys_vm86.
         */
        return static_cpu_has(X86_FEATURE_SEP) &&
                regs->cs == __USER_CS && regs->ss == __USER_DS &&
                regs->ip == landing_pad &&
                (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
#endif