unwind: Add deferred user space unwinding API

Add unwind_user_deferred() which allows callers to schedule task work to
unwind the user space stack before returning to user space.  This solves
several problems for its callers:

  - Ensure the unwind happens in task context even if the caller is
    running in interrupt context.

  - Only do the unwind once, even if requested multiple times, whether
    by the same caller or by multiple callers.

  - Create a "context cookie" which allows trace post-processing to
    correlate kernel unwinds/traces with the user unwind.
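
For example, a tracer can register its callback once at init time and
then request deferred unwinds from its kernel-side hooks.  A minimal
usage sketch (the tracer names here -- trace_cb, trace_unwind_cb() and
trace_kernel_event() -- are illustrative only):

	static struct unwind_callback trace_cb;

	static void trace_unwind_cb(struct unwind_stacktrace *trace,
				    u64 ctx_cookie, void *data)
	{
		/*
		 * Runs from task work before the return to user space:
		 * record trace->entries[0 .. trace->nr - 1], keyed by
		 * @ctx_cookie.
		 */
	}

	/* tracer init: */
	ret = unwind_user_register(&trace_cb, trace_unwind_cb);

	/* tracing hook (task or irq context): */
	u64 cookie;

	if (!unwind_user_deferred(&trace_cb, &cookie, NULL)) {
		/* tag the kernel-side trace record with @cookie */
		trace_kernel_event(cookie);
	}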

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fc61d02..468b08f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
 #include <linux/kmsan.h>
+#include <linux/unwind_user_deferred.h>
 
 #include <asm/entry-common.h>
 
@@ -112,6 +113,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 	CT_WARN_ON(__ct_state() != CT_STATE_USER);
 	user_exit_irqoff();
 
+	unwind_enter_from_user_mode();
+
 	instrumentation_begin();
 	kmsan_unpoison_entry_regs(regs);
 	trace_hardirqs_off_finish();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 66b311f..984c107 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@
 #include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
+#include <linux/unwind_user_deferred_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -1603,6 +1604,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_UNWIND_USER
+	struct unwind_task_info		unwind_task_info;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/unwind_user_deferred.h b/include/linux/unwind_user_deferred.h
new file mode 100644
index 0000000..a0cbe64
--- /dev/null
+++ b/include/linux/unwind_user_deferred.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_H
+#define _LINUX_UNWIND_USER_DEFERRED_H
+
+#include <linux/unwind_user.h>
+#include <linux/unwind_user_deferred_types.h>
+
+#ifdef CONFIG_UNWIND_USER
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func);
+int unwind_user_unregister(struct unwind_callback *callback);
+
+int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data);
+
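+/* Per-CPU entry counter; combined with the CPU id to form the context cookie */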
+DECLARE_PER_CPU(u64, unwind_ctx_ctr);
+
+static __always_inline void unwind_enter_from_user_mode(void)
+{
+	current->unwind_task_info.ctx_cookie = 0;
+}
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+static inline int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_user_unregister(struct unwind_callback *callback) { return -ENOSYS; }
+static inline int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data) { return -ENOSYS; }
+static inline void unwind_enter_from_user_mode(void) {}
+
+#endif /* !CONFIG_UNWIND_USER */
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_user_deferred_types.h b/include/linux/unwind_user_deferred_types.h
new file mode 100644
index 0000000..b5d1a6d
--- /dev/null
+++ b/include/linux/unwind_user_deferred_types.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+
+#include <linux/unwind_user_types.h>
+
+typedef void (*unwind_callback_t)(struct unwind_stacktrace *trace,
+				  u64 ctx_cookie, void *data);
+
+struct unwind_callback {
+	unwind_callback_t		func;
+	unsigned int			idx;	/* slot in the global callbacks array */
+	bool				enabled;
+};
+
+#define UNWIND_MAX_CALLBACKS 4
+
+struct unwind_task_info {
+	u64			ctx_cookie;	/* context of last entry from user space */
+	unsigned int		work_pending;	/* task work already queued? */
+	u32			pending_callbacks;	/* bitmask of scheduled callbacks */
+	u64			last_cookies[UNWIND_MAX_CALLBACKS];	/* last cookie per callback */
+	void			*privs[UNWIND_MAX_CALLBACKS];	/* caller data per callback */
+	unsigned long		*entries;	/* unwind entry buffer */
+	u64			cached_cookie;	/* cookie of the cached unwind */
+	unsigned int		cached_nr;	/* number of cached entries */
+	struct callback_head	work;
+};
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9700157..9a053c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,7 @@
 #include <linux/pidfs.h>
 #include <linux/tick.h>
 #include <linux/sframe.h>
+#include <linux/unwind_user_deferred.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -978,6 +979,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
@@ -2371,6 +2373,8 @@ __latent_entropy struct task_struct *copy_process(
 	p->bpf_ctx = NULL;
 #endif
 
+	unwind_task_init(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index f70380d..1460381 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_UNWIND_USER)		+= user.o
+obj-$(CONFIG_UNWIND_USER)		+= user.o deferred.o
 obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME)	+= sframe.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 0000000..ef7499f
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ *
+ * Copyright (C) 2024 Josh Poimboeuf <jpoimboe@kernel.org>
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sframe.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/task_work.h>
+#include <linux/unwind_user_deferred.h>
+
+#define UNWIND_MAX_ENTRIES 512
+
+DEFINE_STATIC_SRCU(callbacks_srcu);
+static DEFINE_MUTEX(callbacks_mutex);
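+/* Registered callbacks; modified under callbacks_mutex, iterated under SRCU */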
+static struct unwind_callback __rcu *callbacks[UNWIND_MAX_CALLBACKS];
+
+/* Counter for entries from user space */
+DEFINE_PER_CPU(u64, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier which allows post-processing to
+ * correlate kernel trace(s) with user unwinds.  The high 16 bits are the CPU
+ * id; the lower 48 bits are a per-CPU entry counter.
+ */
+static u64 ctx_to_cookie(u64 cpu, u64 ctx)
+{
+	BUILD_BUG_ON(NR_CPUS > 65535);
+	return (ctx & ((1ULL << 48) - 1)) | (cpu << 48);
+}
+
+/*
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The @callback must have previously been registered with
+ * unwind_user_register().
+ *
+ * The @ctx_cookie output is a unique identifier which will also be passed to
+ * the callback function.  It can be used to stitch kernel and user traces
+ * together in post-processing.
+ *
+ * If there are multiple calls to this function for a given @callback, the
+ * cookie will usually be the same and the callback will only be called once.
+ *
+ * The only exception is when the task has migrated to another CPU, *and* this
+ * is called while the task work is running (or has already run).  Then a new
+ * cookie will be generated and the callback will be called again for the new
+ * cookie.
+ */
+int unwind_user_deferred(struct unwind_callback *callback, u64 *ctx_cookie, void *data)
+{
+	struct unwind_task_info *info = &current->unwind_task_info;
+	int idx = callback->idx;
+	u64 cookie;
+
+	if (WARN_ON_ONCE(in_nmi()))
+		return -EINVAL;
+
+	if (!current->mm)
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	/* Read under the guard: an IRQ caller may also update ctx_cookie */
+	cookie = info->ctx_cookie;
+
+	/*
+	 * If this is the first call since the most recent entry from user
+	 * space, initialize the task context cookie.
+	 */
+	if (!cookie) {
+		u64 cpu = raw_smp_processor_id();
+		u64 ctx_ctr;
+
+		ctx_ctr = __this_cpu_inc_return(unwind_ctx_ctr);
+		cookie = ctx_to_cookie(cpu, ctx_ctr);
+		info->ctx_cookie = cookie;
+
+	} else {
+		if (info->pending_callbacks & (1 << idx)) {
+			/* callback already scheduled */
+			goto done;
+		}
+
+		if (cookie == info->last_cookies[idx]) {
+			/* callback already called */
+			goto done;
+		}
+	}
+
+	info->pending_callbacks |= (1 << idx);
+	info->privs[idx] = data;
+	info->last_cookies[idx] = cookie;
+
+	if (!info->work_pending) {
+		/* task_work_add() can only fail if the task is exiting */
+		if (task_work_add(current, &info->work, TWA_RESUME))
+			return -ESRCH;
+		info->work_pending = 1;
+	}
+
+done:
+	if (ctx_cookie)
+		*ctx_cookie = cookie;
+	return 0;
+}
+
+static void unwind_user_task_work(struct callback_head *head)
+{
+	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct task_struct *task = container_of(info, struct task_struct, unwind_task_info);
+	void *privs[UNWIND_MAX_CALLBACKS];
+	struct unwind_stacktrace trace;
+	unsigned long pending;
+	u64 cookie = 0;
+	int i;
+
+	BUILD_BUG_ON(UNWIND_MAX_CALLBACKS > 32);
+
+	if (WARN_ON_ONCE(task != current))
+		return;
+
+	if (WARN_ON_ONCE(!info->ctx_cookie || !info->pending_callbacks || !info->work_pending))
+		return;
+
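+	/* Snapshot and clear the pending state atomically vs. IRQ callers */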
+	scoped_guard(irqsave) {
+		pending = info->pending_callbacks;
+		cookie = info->ctx_cookie;
+
+		info->work_pending = 0;
+		info->pending_callbacks = 0;
+		memcpy(privs, info->privs, sizeof(void *) * UNWIND_MAX_CALLBACKS);
+	}
+
+	if (!info->entries) {
+		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+					      GFP_KERNEL);
+		if (!info->entries)
+			return;
+	}
+
+	trace.entries = info->entries;
+
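+	/* Reuse the unwind if another callback already did it for this entry */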
+	if (cookie == info->cached_cookie) {
+		trace.nr = info->cached_nr;
+	} else {
+		trace.nr = 0;
+		unwind_user(&trace, UNWIND_MAX_ENTRIES);
+		info->cached_cookie = cookie;
+		info->cached_nr = trace.nr;
+	}
+
+	guard(srcu)(&callbacks_srcu);
+
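+	/* Invoke each scheduled callback that is still registered and enabled */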
+	for_each_set_bit(i, &pending, UNWIND_MAX_CALLBACKS) {
+		struct unwind_callback *callback;
+
+		callback = srcu_dereference(callbacks[i], &callbacks_srcu);
+
+		if (callback && callback->enabled)
+			callback->func(&trace, cookie, privs[i]);
+	}
+}
+
+int unwind_user_register(struct unwind_callback *callback, unwind_callback_t func)
+{
+	guard(mutex)(&callbacks_mutex);
+
+	for (int i = 0; i < UNWIND_MAX_CALLBACKS; i++) {
+		if (rcu_dereference_protected(callbacks[i],
+					      lockdep_is_held(&callbacks_mutex)))
+			continue;
+
+		callback->func = func;
+		callback->idx = i;
+		callback->enabled = true;
+
+		rcu_assign_pointer(callbacks[i], callback);
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+int unwind_user_unregister(struct unwind_callback *callback)
+{
+	callback->enabled = false;
+
+	scoped_guard(mutex, &callbacks_mutex)
+		rcu_assign_pointer(callbacks[callback->idx], NULL);
+
+	/*
+	 * Wait for any in-flight unwind_user_task_work() invocations to
+	 * finish with @callback before the caller can free it.
+	 */
+	synchronize_srcu(&callbacks_srcu);
+
+	return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_task_info;
+
+	memset(info, 0, sizeof(*info));
+	init_task_work(&info->work, unwind_user_task_work);
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_task_info;
+
+	kfree(info->entries);
+}