| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Landlock - Cross-thread ruleset enforcement |
| * |
| * Copyright © 2025 Google LLC |
| */ |
| |
| #include <linux/atomic.h> |
| #include <linux/cleanup.h> |
| #include <linux/completion.h> |
| #include <linux/cred.h> |
| #include <linux/errno.h> |
| #include <linux/overflow.h> |
| #include <linux/rcupdate.h> |
| #include <linux/sched.h> |
| #include <linux/sched/signal.h> |
| #include <linux/sched/task.h> |
| #include <linux/slab.h> |
| #include <linux/task_work.h> |
| |
| #include "cred.h" |
| #include "tsync.h" |
| |
| /* |
| * Shared state between multiple threads which are enforcing Landlock rulesets |
| * in lockstep with each other. |
| */ |
| struct tsync_shared_context { |
| /* The old and tentative new creds of the calling thread. */ |
| const struct cred *old_cred; |
| const struct cred *new_cred; |
| |
| /* True if sibling tasks need to set the no_new_privs flag. */ |
| bool set_no_new_privs; |
| |
| /* An error encountered in the preparation step, or 0. */ |
| atomic_t preparation_error; |
| |
| /* |
| * Barrier after preparation step in restrict_one_thread. |
| * The calling thread waits for completion. |
| * |
| * Re-initialized on every round of looking for newly spawned threads. |
| */ |
| atomic_t num_preparing; |
| struct completion all_prepared; |
| |
| /* Sibling threads wait for completion. */ |
| struct completion ready_to_commit; |
| |
| /* |
| * Barrier after commit step (used by syscall impl to wait for |
| * completion). |
| */ |
| atomic_t num_unfinished; |
| struct completion all_finished; |
| }; |
| |
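| /* |
| * Per-thread work item, linking the task that will run the task_work to the |
| * shared context of the ongoing enforcement operation. |
| */ |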
| struct tsync_work { |
| struct callback_head work; |
| struct task_struct *task; |
| struct tsync_shared_context *shared_ctx; |
| }; |
| |
| /* |
| * restrict_one_thread - update a thread's Landlock domain in lockstep with the |
| * other threads in the same process |
| * |
| * This function runs concurrently in all other threads of the same process |
| * (that is, all threads except the one that called landlock_restrict_self()). |
| * The concurrently running invocations of restrict_one_thread() coordinate |
| * through the shared ctx object and do their work in lockstep to implement |
| * all-or-nothing semantics for enforcing the new Landlock domain. |
| * |
| * Afterwards, depending on whether an error occurred, all threads either |
| * commit or abort the prepared credentials. The commit operation cannot fail |
| * anymore. |
| */ |
| static void restrict_one_thread(struct tsync_shared_context *ctx) |
| { |
| int err; |
| struct cred *cred = NULL; |
| |
| if (current_cred() == ctx->old_cred) { |
| /* |
| * Switch out old_cred with new_cred, if possible. |
| * |
| * In the common case, where all threads initially point to the |
| * same struct cred, this optimization avoids creating separate |
| * redundant credentials objects for each, which would all have |
| * the same contents. |
| * |
| * Note: We are intentionally dropping the const qualifier |
| * here, because it is required by commit_creds() and |
| * abort_creds(). |
| */ |
| cred = (struct cred *)get_cred(ctx->new_cred); |
| } else { |
| /* Else, prepare new creds and populate them. */ |
| cred = prepare_creds(); |
| |
| if (!cred) { |
| atomic_set(&ctx->preparation_error, -ENOMEM); |
| |
| /* |
| * Even on error, we need to adhere to the protocol and |
| * coordinate with concurrently running invocations. |
| */ |
| if (atomic_dec_return(&ctx->num_preparing) == 0) |
| complete_all(&ctx->all_prepared); |
| |
| goto out; |
| } |
| |
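| /* Copy the Landlock state from the calling thread's new creds. */ |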
| landlock_cred_copy(landlock_cred(cred), |
| landlock_cred(ctx->new_cred)); |
| } |
| |
| /* |
| * Barrier: Wait until all threads are done preparing. |
| * After this point, we can have no more failures. |
| */ |
| if (atomic_dec_return(&ctx->num_preparing) == 0) |
| complete_all(&ctx->all_prepared); |
| |
| /* |
| * Wait for signal from calling thread that it's safe to read the |
| * preparation error now and we are ready to commit (or abort). |
| */ |
| wait_for_completion(&ctx->ready_to_commit); |
| |
| /* Abort the commit if any of the other threads had an error. */ |
| err = atomic_read(&ctx->preparation_error); |
| if (err) { |
| abort_creds(cred); |
| goto out; |
| } |
| |
| /* |
| * Make sure that all sibling tasks fulfill the no_new_privs |
| * prerequisite. (This is in line with Seccomp's |
| * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c) |
| */ |
| if (ctx->set_no_new_privs) |
| task_set_no_new_privs(current); |
| |
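| /* Commit the prepared credentials; this can no longer fail. */ |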
| commit_creds(cred); |
| |
| out: |
| /* Notify the calling thread once all threads are done. */ |
| if (atomic_dec_return(&ctx->num_unfinished) == 0) |
| complete_all(&ctx->all_finished); |
| } |
| |
| /* |
| * restrict_one_thread_callback - task_work callback for restricting a thread |
| * |
| * Calls restrict_one_thread() with the shared struct tsync_shared_context. |
| */ |
| static void restrict_one_thread_callback(struct callback_head *work) |
| { |
| struct tsync_work *ctx = container_of(work, struct tsync_work, work); |
| |
| restrict_one_thread(ctx->shared_ctx); |
| } |
| |
| /* |
| * struct tsync_works - a growable array of per-task contexts |
| * |
| * The zero-initialized struct represents the empty array. |
| */ |
| struct tsync_works { |
| struct tsync_work **works; |
| size_t size; |
| size_t capacity; |
| }; |
| |
| /* |
| * tsync_works_provide - provides a preallocated tsync_work for the given task |
| * |
| * This also stores a task pointer in the context and increments the reference |
| * count of the task. |
| * |
| * This function may fail in the case where we did not preallocate sufficient |
| * capacity. This can legitimately happen if new threads get started after we |
| * grew the capacity. |
| * |
| * Return: A pointer to the preallocated context struct with task filled in, or |
| * NULL if preallocated context structs ran out. |
| */ |
| static struct tsync_work *tsync_works_provide(struct tsync_works *s, |
| struct task_struct *task) |
| { |
| struct tsync_work *ctx; |
| |
| if (s->size >= s->capacity) |
| return NULL; |
| |
| ctx = s->works[s->size]; |
| s->size++; |
| |
| ctx->task = get_task_struct(task); |
| return ctx; |
| } |
| |
| /** |
| * tsync_works_trim - Put the last tsync_work element |
| * |
| * @s: TSYNC works to trim. |
| * |
| * Put the last task and decrement the size of @s. |
| * |
| * This helper does not cancel a running task_work, but just resets the last |
| * element to zero. |
| */ |
| static void tsync_works_trim(struct tsync_works *s) |
| { |
| struct tsync_work *ctx; |
| |
| if (WARN_ON_ONCE(s->size == 0)) |
| return; |
| |
| ctx = s->works[s->size - 1]; |
| |
| /* |
| * For consistency, remove the task from ctx so that it does not look |
| * like we handed it a task_work. |
| */ |
| put_task_struct(ctx->task); |
| *ctx = (typeof(*ctx)){}; |
| |
| /* |
| * Cancel the tsync_works_provide() change to recycle the reserved |
| * memory for the next thread, if any. This also ensures that |
| * cancel_tsync_works() and tsync_works_release() do not see any NULL |
| * task pointers. |
| */ |
| s->size--; |
| } |
| |
| /* |
| * tsync_works_grow_by - preallocates space for n more contexts in s |
| * |
| * On a successful return, the subsequent n calls to tsync_works_provide() are |
| * guaranteed to succeed. (size + n <= capacity) |
| * |
| * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM |
| * on allocation errors, -EOVERFLOW in case of integer overflow. |
| */ |
| static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags) |
| { |
| size_t i; |
| size_t new_capacity; |
| struct tsync_work **works; |
| struct tsync_work *work; |
| |
| if (check_add_overflow(s->size, n, &new_capacity)) |
| return -EOVERFLOW; |
| |
| /* No need to reallocate if s already has sufficient capacity. */ |
| if (new_capacity <= s->capacity) |
| return 0; |
| |
| works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]), |
| flags); |
| if (!works) |
| return -ENOMEM; |
| |
| s->works = works; |
| |
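| /* |
| * Preallocate one work item per additional slot so that subsequent |
| * tsync_works_provide() calls are guaranteed to succeed. |
| */ |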
| for (i = s->capacity; i < new_capacity; i++) { |
| work = kzalloc(sizeof(*work), flags); |
| if (!work) { |
| /* |
| * Leave the object in a consistent state, |
| * but return an error. |
| */ |
| s->capacity = i; |
| return -ENOMEM; |
| } |
| s->works[i] = work; |
| } |
| s->capacity = new_capacity; |
| return 0; |
| } |
| |
| /* |
| * tsync_works_contains_task - checks for presence of task in s |
| */ |
| static bool tsync_works_contains_task(const struct tsync_works *s, |
| const struct task_struct *task) |
| { |
| size_t i; |
| |
| for (i = 0; i < s->size; i++) |
| if (s->works[i]->task == task) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * tsync_works_release - frees memory held by s and drops all task references |
| * |
| * This does not free s itself, only the data structures held by it. |
| */ |
| static void tsync_works_release(struct tsync_works *s) |
| { |
| size_t i; |
| |
| for (i = 0; i < s->size; i++) { |
| if (WARN_ON_ONCE(!s->works[i]->task)) |
| continue; |
| |
| put_task_struct(s->works[i]->task); |
| } |
| |
| for (i = 0; i < s->capacity; i++) |
| kfree(s->works[i]); |
| |
| kfree(s->works); |
| s->works = NULL; |
| s->size = 0; |
| s->capacity = 0; |
| } |
| |
| /* |
| * count_additional_threads - counts the sibling threads that are not in works |
| */ |
| static size_t count_additional_threads(const struct tsync_works *works) |
| { |
| const struct task_struct *caller, *thread; |
| size_t n = 0; |
| |
| caller = current; |
| |
| guard(rcu)(); |
| |
| for_each_thread(caller, thread) { |
| /* Skip current, since it is initiating the sync. */ |
| if (thread == caller) |
| continue; |
| |
| /* Skip exited threads. */ |
| if (thread->flags & PF_EXITING) |
| continue; |
| |
| /* Skip threads that we have already seen. */ |
| if (tsync_works_contains_task(works, thread)) |
| continue; |
| |
| n++; |
| } |
| return n; |
| } |
| |
| /* |
| * schedule_task_work - adds task_work for all eligible sibling threads |
| * which have not been scheduled yet |
| * |
| * For each added task_work, atomically increments shared_ctx->num_preparing and |
| * shared_ctx->num_unfinished. |
| * |
| * Return: True if at least one eligible sibling thread was found, false |
| * otherwise. |
| */ |
| static bool schedule_task_work(struct tsync_works *works, |
| struct tsync_shared_context *shared_ctx) |
| { |
| int err; |
| const struct task_struct *caller; |
| struct task_struct *thread; |
| struct tsync_work *ctx; |
| bool found_more_threads = false; |
| |
| caller = current; |
| |
| guard(rcu)(); |
| |
| for_each_thread(caller, thread) { |
| /* Skip current, since it is initiating the sync. */ |
| if (thread == caller) |
| continue; |
| |
| /* Skip exited threads. */ |
| if (thread->flags & PF_EXITING) |
| continue; |
| |
| /* Skip threads that we already looked at. */ |
| if (tsync_works_contains_task(works, thread)) |
| continue; |
| |
| /* |
| * We found a sibling thread that is not doing its task_work |
| * yet, and which might spawn new threads before our task work |
| * runs, so we need at least one more round in the outer loop. |
| */ |
| found_more_threads = true; |
| |
| ctx = tsync_works_provide(works, thread); |
| if (!ctx) { |
| /* |
| * We ran out of preallocated contexts -- we need to |
| * try again with this thread at a later time! |
| * found_more_threads is already true at this point. |
| */ |
| break; |
| } |
| |
| ctx->shared_ctx = shared_ctx; |
| |
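| /* Account for this sibling thread in both barriers before queuing. */ |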
| atomic_inc(&shared_ctx->num_preparing); |
| atomic_inc(&shared_ctx->num_unfinished); |
| |
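| /* Queue the work as a pseudo-signal so the thread runs it promptly. */ |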
| init_task_work(&ctx->work, restrict_one_thread_callback); |
| err = task_work_add(thread, &ctx->work, TWA_SIGNAL); |
| if (unlikely(err)) { |
| /* |
| * task_work_add() only fails if the task is about to |
| * exit. We checked that earlier, but it can happen as |
| * a race. Resume without setting an error, as the |
| * task is probably gone in the next loop iteration. |
| */ |
| tsync_works_trim(works); |
| |
| atomic_dec(&shared_ctx->num_preparing); |
| atomic_dec(&shared_ctx->num_unfinished); |
| } |
| } |
| |
| return found_more_threads; |
| } |
| |
| /* |
| * cancel_tsync_works - cancel all task works that can still be canceled |
| * |
| * Task works can be canceled as long as they are still queued and have not |
| * started running. For each canceled task work, we decrement |
| * shared_ctx->num_preparing and shared_ctx->num_unfinished and complete the |
| * two completions if needed, as if the task work had never been scheduled. |
| */ |
| static void cancel_tsync_works(const struct tsync_works *works, |
| struct tsync_shared_context *shared_ctx) |
| { |
| size_t i; |
| |
| for (i = 0; i < works->size; i++) { |
| if (WARN_ON_ONCE(!works->works[i]->task)) |
| continue; |
| |
| if (!task_work_cancel(works->works[i]->task, |
| &works->works[i]->work)) |
| continue; |
| |
| /* After dequeueing, act as if the task work had executed. */ |
| |
| if (atomic_dec_return(&shared_ctx->num_preparing) == 0) |
| complete_all(&shared_ctx->all_prepared); |
| |
| if (atomic_dec_return(&shared_ctx->num_unfinished) == 0) |
| complete_all(&shared_ctx->all_finished); |
| } |
| } |
| |
| /* |
| * landlock_restrict_sibling_threads - enables a Landlock policy for all |
| * sibling threads |
| */ |
| int landlock_restrict_sibling_threads(const struct cred *old_cred, |
| const struct cred *new_cred) |
| { |
| int err; |
| struct tsync_shared_context shared_ctx; |
| struct tsync_works works = {}; |
| size_t newly_discovered_threads; |
| bool found_more_threads; |
| |
| atomic_set(&shared_ctx.preparation_error, 0); |
| init_completion(&shared_ctx.all_prepared); |
| init_completion(&shared_ctx.ready_to_commit); |
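| /* |
| * num_unfinished starts at 1 for the calling thread so that the counter |
| * cannot drop to 0 before all task works are scheduled. It is |
| * decremented again after the scheduling loop below. |
| */ |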
| atomic_set(&shared_ctx.num_unfinished, 1); |
| init_completion(&shared_ctx.all_finished); |
| shared_ctx.old_cred = old_cred; |
| shared_ctx.new_cred = new_cred; |
| shared_ctx.set_no_new_privs = task_no_new_privs(current); |
| |
| /* |
| * Serialize concurrent TSYNC operations to prevent deadlocks when |
| * multiple threads call landlock_restrict_self() simultaneously. |
| * If the lock is already held, we gracefully yield by restarting the |
| * syscall. This allows the current thread to process pending |
| * task_works before retrying. |
| */ |
| if (!down_write_trylock(¤t->signal->exec_update_lock)) |
| return restart_syscall(); |
| |
| /* |
| * We schedule a pseudo-signal task_work for each of the calling task's |
| * sibling threads. In the task work, each thread: |
| * |
| * 1) runs prepare_creds() and writes back the error to |
| * shared_ctx.preparation_error, if needed. |
| * |
| * 2) signals that it's done with prepare_creds() to the calling task. |
| * (completion "all_prepared"). |
| * |
| * 3) waits for the completion "ready_to_commit". This is sent by the |
| * calling task after ensuring that all sibling threads are done |
| * with the "preparation" stage. |
| * |
| * After this barrier is reached, it's safe to read |
| * shared_ctx.preparation_error. |
| * |
| * 4) reads shared_ctx.preparation_error and then either does |
| * commit_creds() or abort_creds(). |
| * |
| * 5) signals that it's done altogether (barrier synchronization |
| * "all_finished"). |
| * |
| * Unlike seccomp, which modifies sibling tasks directly, we do not |
| * need to acquire the cred_guard_mutex and sighand->siglock: |
| * |
| * - Since, in our case, each thread exchanges its own struct cred itself |
| * through the credentials API, no locks are needed for that. |
| * - Our for_each_thread() loops are protected by RCU. |
| * - We do not acquire a lock to keep the list of sibling threads |
| * stable between our for_each_thread loops. If the list of |
| * available sibling threads changes between these for_each_thread |
| * loops, we make up for that by continuing to look for threads until |
| * they are all discovered and have entered their task_work, where |
| * they are unable to spawn new threads. |
| */ |
| do { |
| /* In RCU read-lock, count the threads we need. */ |
| newly_discovered_threads = count_additional_threads(&works); |
| |
| if (newly_discovered_threads == 0) |
| break; /* done */ |
| |
| err = tsync_works_grow_by(&works, newly_discovered_threads, |
| GFP_KERNEL_ACCOUNT); |
| if (err) { |
| atomic_set(&shared_ctx.preparation_error, err); |
| break; |
| } |
| |
| /* |
| * The "all_prepared" barrier is local to this loop body, i.e. to |
| * this use of for_each_thread(). We can reset it on each loop |
| * iteration because all previous loop iterations are already done |
| * with it. |
| * |
| * num_preparing is initialized to 1 so that the counter cannot |
| * drop to 0 and complete the barrier before all task works are |
| * registered. We decrement it at the end of the loop body. |
| */ |
| atomic_set(&shared_ctx.num_preparing, 1); |
| reinit_completion(&shared_ctx.all_prepared); |
| |
| /* |
| * In RCU read-lock, schedule task work on newly discovered |
| * sibling tasks. |
| */ |
| found_more_threads = schedule_task_work(&works, &shared_ctx); |
| |
| /* |
| * Decrement num_preparing for the calling thread, undoing the |
| * initialization to 1 a few lines above. |
| */ |
| if (atomic_dec_return(&shared_ctx.num_preparing) > 0) { |
| if (wait_for_completion_interruptible( |
| &shared_ctx.all_prepared)) { |
| /* |
| * In case of interruption, we need to retry |
| * the system call. |
| */ |
| atomic_set(&shared_ctx.preparation_error, |
| -ERESTARTNOINTR); |
| |
| /* |
| * Opportunistic improvement: try to cancel task |
| * works for tasks that did not start running |
| * yet. We do not have a guarantee that it |
| * cancels any of the enqueued task works |
| * because task_work_run() might already have |
| * dequeued them. |
| */ |
| cancel_tsync_works(&works, &shared_ctx); |
| |
| /* |
| * Break the loop with error. The cleanup code |
| * after the loop unblocks the remaining |
| * task_works. |
| */ |
| break; |
| } |
| } |
| } while (found_more_threads && |
| !atomic_read(&shared_ctx.preparation_error)); |
| |
| /* |
| * We now have either (a) all sibling threads blocked in their task_work |
| * in the "prepared" state, or (b) the preparation error is set. Ask all |
| * threads to commit (or abort). |
| */ |
| complete_all(&shared_ctx.ready_to_commit); |
| |
| /* |
| * Decrement num_unfinished for the calling thread, undoing the |
| * initialization to 1 at the beginning. |
| */ |
| if (atomic_dec_return(&shared_ctx.num_unfinished) > 0) |
| wait_for_completion(&shared_ctx.all_finished); |
| |
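| /* |
| * All task works have finished (or were canceled), so it is now safe to |
| * drop the task references and free the preallocated work items. |
| */ |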
| tsync_works_release(&works); |
| up_write(¤t->signal->exec_update_lock); |
| return atomic_read(&shared_ctx.preparation_error); |
| } |