Merge tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Three small fixes/updates for the locking core code:

   - Plug a task_struct reference leak in the percpu rwsem
     implementation.

   - Document the refcount interaction with PID_MAX_LIMIT

   - Improve the 'invalid wait context' data dump in lockdep so it
     contains all the information required to decode the problem"

* tag 'locking-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/lockdep: Improve 'invalid wait context' splat
  locking/refcount: Document interaction with PID_MAX_LIMIT
  locking/percpu-rwsem: Fix a task_struct refcount
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0ac50cf6..0e3ee25 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -38,11 +38,24 @@
  * atomic operations, then the count will continue to edge closer to 0. If it
  * reaches a value of 1 before /any/ of the threads reset it to the saturated
  * value, then a concurrent refcount_dec_and_test() may erroneously free the
- * underlying object. Given the precise timing details involved with the
- * round-robin scheduling of each thread manipulating the refcount and the need
- * to hit the race multiple times in succession, there doesn't appear to be a
- * practical avenue of attack even if using refcount_add() operations with
- * larger increments.
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ *     0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
  *
  * Memory ordering
  * ===============
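The arithmetic in the comment above can be sanity-checked outside the kernel. The standalone sketch below is not part of the patch; it hard-codes the kernel defaults as assumptions (REFCOUNT_SATURATED is INT_MIN / 2, i.e. 0xc0000000 as a u32, and PID_MAX_LIMIT defaults to 4*1024*1024) and simply re-derives the per-task budget of 256 references:

/*
 * Standalone sketch, not part of the patch: re-derive the "256 references
 * per refcounting operation" bound from the constants the comment relies on.
 */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	const unsigned long long saturated = (unsigned int)(INT_MIN / 2);	/* 0xc0000000 */
	const unsigned long long pid_max_limit = 4ULL * 1024 * 1024;		/* 0x400000 */

	/* Distance from the saturation value to the wrap back past UINT_MAX. */
	const unsigned long long headroom = (UINT_MAX + 1ULL) - saturated;	/* 0x40000000 */

	printf("headroom = %#llx, per-task budget = %llu\n",
	       headroom, headroom / pid_max_limit);				/* prints 256 */
	return 0;
}

Only refcounting operations that add more than that per-task budget in one step could, in principle, walk a saturated count out of the saturation range, which is the residual case the comment dismisses as impractical.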
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1511690..ac10db6 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3952,10 +3952,36 @@
 	return ret;
 }
 
+static inline short task_wait_context(struct task_struct *curr)
+{
+	/*
+	 * Set appropriate wait type for the context; for IRQs we have to take
+	 * into account force_irqthread as that is implied by PREEMPT_RT.
+	 */
+	if (curr->hardirq_context) {
+		/*
+		 * Check if force_irqthreads will run us threaded.
+		 */
+		if (curr->hardirq_threaded || curr->irq_config)
+			return LD_WAIT_CONFIG;
+
+		return LD_WAIT_SPIN;
+	} else if (curr->softirq_context) {
+		/*
+		 * Softirqs are always threaded.
+		 */
+		return LD_WAIT_CONFIG;
+	}
+
+	return LD_WAIT_MAX;
+}
+
 static int
 print_lock_invalid_wait_context(struct task_struct *curr,
 				struct held_lock *hlock)
 {
+	short curr_inner;
+
 	if (!debug_locks_off())
 		return 0;
 	if (debug_locks_silent)
@@ -3971,6 +3997,10 @@
 	print_lock(hlock);
 
 	pr_warn("other info that might help us debug this:\n");
+
+	curr_inner = task_wait_context(curr);
+	pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
 	lockdep_print_held_locks(curr);
 
 	pr_warn("stack backtrace:\n");
@@ -4017,26 +4047,7 @@
 	}
 	depth++;
 
-	/*
-	 * Set appropriate wait type for the context; for IRQs we have to take
-	 * into account force_irqthread as that is implied by PREEMPT_RT.
-	 */
-	if (curr->hardirq_context) {
-		/*
-		 * Check if force_irqthreads will run us threaded.
-		 */
-		if (curr->hardirq_threaded || curr->irq_config)
-			curr_inner = LD_WAIT_CONFIG;
-		else
-			curr_inner = LD_WAIT_SPIN;
-	} else if (curr->softirq_context) {
-		/*
-		 * Softirqs are always threaded.
-		 */
-		curr_inner = LD_WAIT_CONFIG;
-	} else {
-		curr_inner = LD_WAIT_MAX;
-	}
+	curr_inner = task_wait_context(curr);
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a008a1b..8bbafe3 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -118,14 +118,15 @@
 				      unsigned int mode, int wake_flags,
 				      void *key)
 {
-	struct task_struct *p = get_task_struct(wq_entry->private);
 	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
 	struct percpu_rw_semaphore *sem = key;
+	struct task_struct *p;
 
 	/* concurrent against percpu_down_write(), can get stolen */
 	if (!__percpu_rwsem_trylock(sem, reader))
 		return 1;
 
+	p = get_task_struct(wq_entry->private);
 	list_del_init(&wq_entry->entry);
 	smp_store_release(&wq_entry->private, NULL);
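For context on the percpu-rwsem change: get_task_struct() used to run before __percpu_rwsem_trylock(), so the early "return 1" on a stolen lock left the function holding an extra task_struct reference that was never put. The userspace sketch below uses hypothetical names and simplified refcounting, not the kernel API; it only models why taking the reference after the trylock balances the count. In the real percpu_rwsem_wake_function() the reference is handed over to the woken task and dropped there.

/*
 * Userspace sketch, hypothetical names: the buggy shape takes a reference
 * before a trylock that can bail out, so every "stolen" wakeup leaks one
 * reference; the fixed shape takes it only once the trylock has succeeded.
 */
#include <stdbool.h>
#include <stdio.h>

struct task { int refcount; };

static void get_task(struct task *t) { t->refcount++; }
static void put_task(struct task *t) { t->refcount--; }

/* Stand-in for __percpu_rwsem_trylock(): pretend the lock got stolen. */
static bool trylock_stolen(void) { return false; }

static int wake_buggy(struct task *t)
{
	get_task(t);
	if (!trylock_stolen())
		return 1;	/* bails out without a matching put: leak */
	put_task(t);		/* the balancing put only happens on success */
	return 0;
}

static int wake_fixed(struct task *t)
{
	if (!trylock_stolen())
		return 1;	/* nothing taken yet, nothing to undo */
	get_task(t);
	put_task(t);
	return 0;
}

int main(void)
{
	struct task t = { .refcount = 1 };

	wake_buggy(&t);
	printf("buggy: refcount = %d\n", t.refcount);	/* 2 -> leaked */

	t.refcount = 1;
	wake_fixed(&t);
	printf("fixed: refcount = %d\n", t.refcount);	/* 1 -> balanced */
	return 0;
}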