futex: Rework SET_SLOTS

Let SET_SLOTS take precedence over default scaling; once the user sets
a size, stick with it.

Notably, doing SET_SLOTS 0 will cause fph->hash_mask to be 0, which
will cause __futex_hash() to return the global hash buckets. Once in
this state it is impossible to recover, so reject any further
SET_SLOTS with -EBUSY.
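
For illustration, roughly (assuming the PR_FUTEX_HASH prctl() option
that futex_hash_prctl() is wired up to elsewhere in this series; not
part of this patch):

	/* Drop back to the global hash; fph->hash_mask becomes 0. */
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 0);

	/* Any further resize attempt is now rejected. */
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16); /* fails, EBUSY */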

Also, let prctl() users wait for / retry the rehash, such that a
successful return from prctl() means the new size is in effect.
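
Roughly, from userspace (again assuming the PR_FUTEX_HASH prctl()
wiring from this series; the slot values are only illustrative):

	#include <sys/prctl.h>

	/* Blocks until the 8 slot hash is installed, not merely queued. */
	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 8) == 0) {
		/*
		 * The custom size is now in effect and wins over default
		 * scaling; creating more threads will not grow it.
		 */
		int slots = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0);
		/* slots == 8 */
	}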

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 7516542..4b9a4fb0 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -61,6 +61,8 @@ struct futex_private_hash {
 	rcuref_t	users;
 	unsigned int	hash_mask;
 	struct rcu_head	rcu;
+	void		*mm;
+	bool		custom;
 	struct futex_hash_bucket queues[];
 };
 
@@ -192,12 +194,6 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
 
 	fph = mm->futex_phash;
 	if (fph) {
-		if (fph->hash_mask >= new->hash_mask) {
-			/* It was increased again while we were waiting */
-			kvfree(new);
-			return true;
-		}
-
 		if (rcuref_read(&fph->users) != 0) {
 			mm->futex_phash_new = new;
 			return false;
@@ -207,6 +203,7 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
 	}
 	rcu_assign_pointer(mm->futex_phash, new);
 	kvfree_rcu(fph, rcu);
+	wake_up_var(mm);
 	return true;
 }
 
@@ -258,11 +255,8 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
-	/*
-	 * Ignore the result; the DEAD state is picked up
-	 * when rcuref_get() starts failing.
-	 */
-	bool __maybe_unused ignore = rcuref_put(&fph->users);
+	if (rcuref_put(&fph->users))
+		wake_up_var(fph->mm);
 }
 
 struct futex_hash_bucket *futex_hash(union futex_key *key)
@@ -1402,71 +1396,124 @@ void futex_hash_free(struct mm_struct *mm)
 	kvfree(mm->futex_phash);
 }
 
-static int futex_hash_allocate(unsigned int hash_slots)
+static bool futex_pivot_pending(struct mm_struct *mm)
 {
-	struct futex_private_hash *fph, *hb_tofree = NULL;
+	guard(rcu)();
+
+	if (!mm->futex_phash_new)
+		return true;
+
+	return !rcuref_read(&rcu_dereference(mm->futex_phash)->users);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+			    struct futex_private_hash *b)
+{
+	/* user provided always wins */
+	if (!a->custom && b->custom)
+		return true;
+	if (a->custom && !b->custom)
+		return false;
+
+	/* zero-sized hash wins */
+	if (!b->hash_mask)
+		return true;
+	if (!a->hash_mask)
+		return false;
+
+	/* keep the biggest */
+	if (a->hash_mask < b->hash_mask)
+		return true;
+	if (a->hash_mask > b->hash_mask)
+		return false;
+
+	return false; /* equal */
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, bool custom)
+{
 	struct mm_struct *mm = current->mm;
-	size_t alloc_size;
+	struct futex_private_hash *fph;
 	int i;
 
-	if (hash_slots == 0)
-		hash_slots = 16;
-	hash_slots = clamp(hash_slots, 2, futex_hashmask + 1);
-	if (!is_power_of_2(hash_slots))
-		hash_slots = rounddown_pow_of_two(hash_slots);
+	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+		return -EINVAL;
 
-	if (unlikely(check_mul_overflow(hash_slots, sizeof(struct futex_hash_bucket),
-					&alloc_size)))
-		return -ENOMEM;
+	/*
+	 * Once we've dropped back to the global hash there is no way back.
+	 */
+	scoped_guard(rcu) {
+		fph = rcu_dereference(mm->futex_phash);
+		if (fph && !fph->hash_mask) {
+			if (custom)
+				return -EBUSY;
+			return 0;
+		}
+	}
 
-	if (unlikely(check_add_overflow(alloc_size, sizeof(struct futex_private_hash),
-					&alloc_size)))
-		return -ENOMEM;
-
-	fph = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT);
 	if (!fph)
 		return -ENOMEM;
 
 	rcuref_init(&fph->users, 1);
-	fph->hash_mask = hash_slots - 1;
+	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+	fph->custom = custom;
+	fph->mm = mm;
 
 	for (i = 0; i < hash_slots; i++)
 		futex_hash_bucket_init(&fph->queues[i], fph);
 
-	scoped_guard(mutex, &mm->futex_hash_lock) {
-		if (mm->futex_phash && !mm->futex_phash_new) {
-			/*
-			 * If we have an existing hash, but do not yet have
-			 * allocated a replacement hash, drop the initial
-			 * reference on the existing hash.
-			 *
-			 * Ignore the return value; removal is serialized by
-			 * mm->futex_hash_lock which we currently hold.
-			 */
-			futex_private_hash_put(mm->futex_phash);
-		}
-
-		if (mm->futex_phash_new) {
-			/*
-			 * If we already have a replacement hash pending;
-			 * keep the larger hash.
-			 */
-			if (mm->futex_phash_new->hash_mask <= fph->hash_mask) {
-				hb_tofree = mm->futex_phash_new;
-			} else {
-				hb_tofree = fph;
-				fph = mm->futex_phash_new;
-			}
-			mm->futex_phash_new = NULL;
-		}
-
+	if (custom) {
 		/*
-		 * Will set mm->futex_phash_new on failure;
-		 * futex_get_private_hash() will try again.
+		 * Only let prctl() wait / retry; don't unduly delay clone().
 		 */
-		__futex_pivot_hash(mm, fph);
+again:
+		wait_var_event(mm, futex_pivot_pending(mm));
 	}
-	kvfree(hb_tofree);
+
+	scoped_guard(mutex, &mm->futex_hash_lock) {
+		struct futex_private_hash *free __free(kvfree) = NULL;
+		struct futex_private_hash *cur, *new;
+
+		cur = mm->futex_phash;
+		new = mm->futex_phash_new;
+		mm->futex_phash_new = NULL;
+
+		if (fph) {
+			if (cur && !new) {
+				/*
+				 * If we have an existing hash but have not yet
+				 * allocated a replacement hash, drop the initial
+				 * reference on the existing hash.
+				 */
+				futex_private_hash_put(cur);
+			}
+
+			if (new) {
+				/*
+				 * Two updates raced; throw out the lesser one.
+				 */
+				if (futex_hash_less(new, fph)) {
+					free = new;
+					new = fph;
+				} else {
+					free = fph;
+				}
+			} else {
+				new = fph;
+			}
+			fph = NULL;
+		}
+
+		if (new) {
+			/*
+			 * Will set mm->futex_phash_new on failure;
+			 * futex_get_private_hash() will try again.
+			 */
+			if (!__futex_pivot_hash(mm, new) && custom)
+				goto again;
+		}
+	}
 	return 0;
 }
 
@@ -1479,10 +1526,17 @@ int futex_hash_allocate_default(void)
 		return 0;
 
 	scoped_guard(rcu) {
-		threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus());
+		threads = min_t(unsigned int,
+				get_nr_threads(current),
+				num_online_cpus());
+
 		fph = rcu_dereference(current->mm->futex_phash);
-		if (fph)
+		if (fph) {
+			if (fph->custom)
+				return 0;
+
 			current_buckets = fph->hash_mask + 1;
+		}
 	}
 
 	/*
@@ -1495,7 +1549,7 @@ int futex_hash_allocate_default(void)
 	if (current_buckets >= buckets)
 		return 0;
 
-	return futex_hash_allocate(buckets);
+	return futex_hash_allocate(buckets, false);
 }
 
 static int futex_hash_get_slots(void)
@@ -1511,7 +1565,7 @@ static int futex_hash_get_slots(void)
 
 #else
 
-static int futex_hash_allocate(unsigned int hash_slots)
+static int futex_hash_allocate(unsigned int hash_slots, bool custom)
 {
 	return 0;
 }
@@ -1528,7 +1582,7 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3)
 
 	switch (arg2) {
 	case PR_FUTEX_HASH_SET_SLOTS:
-		ret = futex_hash_allocate(arg3);
+		ret = futex_hash_allocate(arg3, true);
 		break;
 
 	case PR_FUTEX_HASH_GET_SLOTS: