futex: Implement PING!

Add PING, a PI New Generation futex, which lets a SCHED_OTHER pending owner
have its lock stolen by any task that tries to take it before the pending
owner does.

In other words, with this PI futex, SCHED_OTHER is not fair.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 7e2744e..2187462 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,8 @@
 #define FUTEX_WAIT_REQUEUE_PI	11
 #define FUTEX_CMP_REQUEUE_PI	12
 #define FUTEX_LOCK_PI2		13
+#define FUTEX_LOCK_PING		14
+#define FUTEX_UNLOCK_PING	15
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 30c2afa..f55bec5 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -443,8 +443,9 @@ extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bi
 extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
 			 u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
 
-extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, bool do_ping);
 
-extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time,
+			 int trylock, bool do_ping);
 
 #endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 7808068..2fd23cd 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -915,7 +915,8 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
  *
  * Also serves as futex trylock_pi()'ing, and due semantics.
  */
-int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock,
+		  bool do_ping)
 {
 	struct hrtimer_sleeper timeout, *to;
 	struct task_struct *exiting;
@@ -1032,7 +1033,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 		 * such that futex_unlock_pi() is guaranteed to observe the waiter when
 		 * it sees the futex_q::pi_state.
 		 */
-		ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+		ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q, do_ping);
 		raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
 
 		if (ret) {
@@ -1065,7 +1066,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 		 *
 		 * What could possibly go wrong...
 		 */
-		if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+		if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, do_ping))
 			ret = 0;
 
 		/*
@@ -1130,7 +1131,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, bool do_ping)
 {
 	u32 curval, uval, vpid = task_pid_vnr(current);
 	union futex_key key = FUTEX_KEY_INIT;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index d818b4d..878c7a84 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -642,7 +642,7 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
 
 			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
 							this->rt_waiter,
-							this->task);
+							this->task, false);
 
 			if (ret == 1) {
 				/*
@@ -862,7 +862,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		/*
 		 * See futex_unlock_pi()'s cleanup: comment.
 		 */
-		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter, false))
 			ret = 0;
 
 		futex_q_lockptr_lock(&q);
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 84ce452..b0e4ac3 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -114,6 +114,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 {
 	unsigned int flags = futex_to_flags(op);
 	int cmd = op & FUTEX_CMD_MASK;
+	bool do_ping = false;
 
 	if (flags & FLAGS_CLOCKRT) {
 		if (cmd != FUTEX_WAIT_BITSET &&
@@ -139,15 +140,21 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
 	case FUTEX_WAKE_OP:
 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+	case FUTEX_LOCK_PING:
+		do_ping = true;
+		fallthrough;
 	case FUTEX_LOCK_PI:
 		flags |= FLAGS_CLOCKRT;
 		fallthrough;
 	case FUTEX_LOCK_PI2:
-		return futex_lock_pi(uaddr, flags, timeout, 0);
+		return futex_lock_pi(uaddr, flags, timeout, 0, do_ping);
+	case FUTEX_UNLOCK_PING:
+		do_ping = true;
+		fallthrough;
 	case FUTEX_UNLOCK_PI:
-		return futex_unlock_pi(uaddr, flags);
+		return futex_unlock_pi(uaddr, flags, do_ping);
 	case FUTEX_TRYLOCK_PI:
-		return futex_lock_pi(uaddr, flags, NULL, 1);
+		return futex_lock_pi(uaddr, flags, NULL, 1, do_ping);
 	case FUTEX_WAIT_REQUEUE_PI:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
@@ -203,7 +210,7 @@ const char * __futex_cmds[] =
 	"FUTEX_CMP_REQUEUE", "FUTEX_WAKE_OP", "FUTEX_LOCK_PI",
 	"FUTEX_UNLOCK_PI", "FUTEX_TRYLOCK_PI", "FUTEX_WAIT_BITSET",
 	"FUTEX_WAKE_BITSET", "FUTEX_WAIT_REQUEUE_PI", "FUTEX_CMP_REQUEUE_PI",
-	"FUTEX_LOCK_PI2", NULL
+	"FUTEX_LOCK_PI2", "FUTEX_LOCK_PING", "FUTEX_UNLOCK_PING", NULL
 };
 
 void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
@@ -234,6 +241,8 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
 				switch(cmd) {
 				case FUTEX_LOCK_PI:
 				case FUTEX_UNLOCK_PI:
+				case FUTEX_LOCK_PING:
+				case FUTEX_UNLOCK_PING:
 					seq_buf_printf(s, " tid: %d",
 						       val & FUTEX_TID_MASK);
 
@@ -254,7 +263,7 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
 			}
 			continue;
 		case 1:
-			if (cmd <= FUTEX_LOCK_PI2)
+			if (cmd < ARRAY_SIZE(__futex_cmds) - 1)
 				seq_buf_printf(s, ", %s", __futex_cmds[cmd]);
 			else
 				seq_buf_puts(s, ", UNKNOWN");
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index c80902e..d589a54 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -428,12 +428,17 @@ static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
 }
 
 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
-				  struct rt_mutex_waiter *top_waiter)
+				  struct rt_mutex_waiter *top_waiter, bool do_ping)
 {
+	bool ret;
+
 	if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
 		return true;
 
-#ifdef RT_MUTEX_BUILD_SPINLOCKS
+#ifndef RT_MUTEX_BUILD_SPINLOCKS
+	if (!do_ping)
+		return false;
+#endif
 	/*
 	 * Note that RT tasks are excluded from same priority (lateral)
 	 * steals to prevent the introduction of an unbounded latency.
@@ -441,10 +446,10 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
 	if (rt_or_dl_prio(waiter->tree.prio))
 		return false;
 
-	return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
-#else
-	return false;
-#endif
+	ret = rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+	if (ret)
+		trace_printk("LOCK STOLEN!\n");
+	return ret;
 }
 
 #define __node_2_waiter(node) \
@@ -1085,7 +1090,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
  */
 static int __sched
 try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
-		     struct rt_mutex_waiter *waiter)
+		     struct rt_mutex_waiter *waiter, bool do_ping)
 {
 	lockdep_assert_held(&lock->wait_lock);
 
@@ -1126,7 +1131,7 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
 		 * If waiter is the highest priority waiter of @lock,
 		 * or allowed to steal it, take it over.
 		 */
-		if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+		if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter, do_ping)) {
 			/*
 			 * We can acquire the lock. Remove the waiter from the
 			 * lock waiters tree.
@@ -1147,7 +1152,7 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
 		if (rt_mutex_has_waiters(lock)) {
 			/* Check whether the trylock can steal it. */
 			if (!rt_mutex_steal(task_to_waiter(task),
-					    rt_mutex_top_waiter(lock)))
+					    rt_mutex_top_waiter(lock), do_ping))
 				return 0;
 
 			/*
@@ -1357,7 +1362,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
 
 static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
 {
-	int ret = try_to_take_rt_mutex(lock, current, NULL);
+	int ret = try_to_take_rt_mutex(lock, current, NULL, false);
 
 	/*
 	 * try_to_take_rt_mutex() sets the lock waiters bit
@@ -1616,7 +1621,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 	lockevent_inc(rtmutex_slow_block);
 	for (;;) {
 		/* Try to acquire the lock: */
-		if (try_to_take_rt_mutex(lock, current, waiter)) {
+		if (try_to_take_rt_mutex(lock, current, waiter, false)) {
 			lockevent_inc(rtmutex_slow_acq3);
 			break;
 		}
@@ -1703,7 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 	lockevent_inc(rtmutex_slowlock);
 
 	/* Try to acquire the lock again: */
-	if (try_to_take_rt_mutex(lock, current, NULL)) {
+	if (try_to_take_rt_mutex(lock, current, NULL, false)) {
 		if (build_ww_mutex() && ww_ctx) {
 			__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
 			ww_mutex_lock_acquired(ww, ww_ctx);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 59dbd29..793696d 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -311,13 +311,13 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
 int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 					struct rt_mutex_waiter *waiter,
 					struct task_struct *task,
-					struct wake_q_head *wake_q)
+					struct wake_q_head *wake_q, bool do_ping)
 {
 	int ret;
 
 	lockdep_assert_held(&lock->wait_lock);
 
-	if (try_to_take_rt_mutex(lock, task, NULL))
+	if (try_to_take_rt_mutex(lock, task, NULL, do_ping))
 		return 1;
 
 	/* We enforce deadlock detection for futexes */
@@ -358,13 +358,13 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
  */
 int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 				      struct rt_mutex_waiter *waiter,
-				      struct task_struct *task)
+				      struct task_struct *task, bool do_ping)
 {
 	int ret;
 	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irq(&lock->wait_lock);
-	ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q);
+	ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q, do_ping);
 	if (unlikely(ret))
 		remove_waiter(lock, waiter);
 	preempt_disable();
@@ -433,7 +433,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
  * Special API call for PI-futex support
  */
 bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
-					 struct rt_mutex_waiter *waiter)
+					 struct rt_mutex_waiter *waiter, bool do_ping)
 {
 	bool cleanup = false;
 
@@ -449,7 +449,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
 	 * failed the trylock, we're still not owner and we need to remove
 	 * ourselves.
 	 */
-	try_to_take_rt_mutex(lock, current, waiter);
+	try_to_take_rt_mutex(lock, current, waiter, do_ping);
 	/*
 	 * Unless we're the owner; we're still enqueued on the wait_list.
 	 * So check if we became owner, if not, take us off the wait_list.
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index cf6ddd1..fb7f1da 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -84,15 +84,15 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
 extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 				     struct rt_mutex_waiter *waiter,
 				     struct task_struct *task,
-				     struct wake_q_head *);
+				     struct wake_q_head *, bool do_ping);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 				     struct rt_mutex_waiter *waiter,
-				     struct task_struct *task);
+				     struct task_struct *task, bool do_ping);
 extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
 			       struct hrtimer_sleeper *to,
 			       struct rt_mutex_waiter *waiter);
 extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
-				 struct rt_mutex_waiter *waiter);
+				 struct rt_mutex_waiter *waiter, bool do_ping);
 
 extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
 extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);