futex: Implement PING! Add PING, a "PI New Generation" futex, in which SCHED_OTHER pending owners can have their lock stolen by any task that tries to take it before the pending owner does. In other words, with this PI futex, SCHED_OTHER is not fair. Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 7e2744e..2187462 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h
@@ -22,6 +22,8 @@ #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_LOCK_PI2 13 +#define FUTEX_LOCK_PING 14 +#define FUTEX_UNLOCK_PING 15 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 30c2afa..f55bec5 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h
@@ -443,8 +443,9 @@ extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bi extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); -extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); +extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, bool do_ping); -extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); +extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, + int trylock, bool do_ping); #endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 7808068..2fd23cd 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c
@@ -915,7 +915,8 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) * * Also serves as futex trylock_pi()'ing, and due semantics. */ -int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) +int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock, + bool do_ping) { struct hrtimer_sleeper timeout, *to; struct task_struct *exiting; @@ -1032,7 +1033,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl * such that futex_unlock_pi() is guaranteed to observe the waiter when * it sees the futex_q::pi_state. */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q, do_ping); raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); if (ret) { @@ -1065,7 +1066,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl * * What could possibly go wrong... */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, do_ping)) ret = 0; /* @@ -1130,7 +1131,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, bool do_ping) { u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index d818b4d..878c7a84 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c
@@ -642,7 +642,7 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1, ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, - this->task); + this->task, false); if (ret == 1) { /* @@ -862,7 +862,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* * See futex_unlock_pi()'s cleanup: comment. */ - if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter, false)) ret = 0; futex_q_lockptr_lock(&q);
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 84ce452..b0e4ac3 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c
@@ -114,6 +114,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; + bool do_ping = false; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && @@ -139,15 +140,21 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); + case FUTEX_LOCK_PING: + do_ping = true; + fallthrough; case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: - return futex_lock_pi(uaddr, flags, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0, do_ping); + case FUTEX_UNLOCK_PING: + do_ping = true; + fallthrough; case FUTEX_UNLOCK_PI: - return futex_unlock_pi(uaddr, flags); + return futex_unlock_pi(uaddr, flags, do_ping); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, NULL, 1); + return futex_lock_pi(uaddr, flags, NULL, 1, do_ping); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, @@ -203,7 +210,7 @@ const char * __futex_cmds[] = "FUTEX_CMP_REQUEUE", "FUTEX_WAKE_OP", "FUTEX_LOCK_PI", "FUTEX_UNLOCK_PI", "FUTEX_TRYLOCK_PI", "FUTEX_WAIT_BITSET", "FUTEX_WAKE_BITSET", "FUTEX_WAIT_REQUEUE_PI", "FUTEX_CMP_REQUEUE_PI", - "FUTEX_LOCK_PI2", NULL + "FUTEX_LOCK_PI2", "FUTEX_LOCK_PING", "FUTEX_UNLOCK_PING", NULL }; void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, @@ -234,6 +241,8 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, switch(cmd) { case FUTEX_LOCK_PI: case FUTEX_UNLOCK_PI: + case FUTEX_LOCK_PING: + case FUTEX_UNLOCK_PING: seq_buf_printf(s, " tid: %d", val & FUTEX_TID_MASK); @@ -254,7 +263,7 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, } continue; case 1: - if (cmd <= FUTEX_LOCK_PI2) + if (cmd < ARRAY_SIZE(__futex_cmds) 
- 1) seq_buf_printf(s, ", %s", __futex_cmds[cmd]); else seq_buf_puts(s, ", UNKNOWN");
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c80902e..d589a54 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c
@@ -428,12 +428,17 @@ static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, } static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, - struct rt_mutex_waiter *top_waiter) + struct rt_mutex_waiter *top_waiter, bool do_ping) { + bool ret; + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) return true; -#ifdef RT_MUTEX_BUILD_SPINLOCKS +#ifndef RT_MUTEX_BUILD_SPINLOCKS + if (!do_ping) + return false; +#endif /* * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. @@ -441,10 +446,10 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, if (rt_or_dl_prio(waiter->tree.prio)) return false; - return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); -#else - return false; -#endif + ret = rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); + if (ret) + trace_printk("LOCK STOLEN!\n"); + return ret; } #define __node_2_waiter(node) \ @@ -1085,7 +1090,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, */ static int __sched try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, bool do_ping) { lockdep_assert_held(&lock->wait_lock); @@ -1126,7 +1131,7 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, * If waiter is the highest priority waiter of @lock, * or allowed to steal it, take it over. */ - if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) { + if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter, do_ping)) { /* * We can acquire the lock. Remove the waiter from the * lock waiters tree. @@ -1147,7 +1152,7 @@ try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, if (rt_mutex_has_waiters(lock)) { /* Check whether the trylock can steal it. 
*/ if (!rt_mutex_steal(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + rt_mutex_top_waiter(lock), do_ping)) return 0; /* @@ -1357,7 +1362,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) { - int ret = try_to_take_rt_mutex(lock, current, NULL); + int ret = try_to_take_rt_mutex(lock, current, NULL, false); /* * try_to_take_rt_mutex() sets the lock waiters bit @@ -1616,7 +1621,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) { + if (try_to_take_rt_mutex(lock, current, waiter, false)) { lockevent_inc(rtmutex_slow_acq3); break; } @@ -1703,7 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { + if (try_to_take_rt_mutex(lock, current, NULL, false)) { if (build_ww_mutex() && ww_ctx) { __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 59dbd29..793696d 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c
@@ -311,13 +311,13 @@ void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock) int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *wake_q) + struct wake_q_head *wake_q, bool do_ping) { int ret; lockdep_assert_held(&lock->wait_lock); - if (try_to_take_rt_mutex(lock, task, NULL)) + if (try_to_take_rt_mutex(lock, task, NULL, do_ping)) return 1; /* We enforce deadlock detection for futexes */ @@ -358,13 +358,13 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, */ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task) + struct task_struct *task, bool do_ping) { int ret; DEFINE_WAKE_Q(wake_q); raw_spin_lock_irq(&lock->wait_lock); - ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q, do_ping); if (unlikely(ret)) remove_waiter(lock, waiter); preempt_disable(); @@ -433,7 +433,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, * Special API call for PI-futex support */ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, bool do_ping) { bool cleanup = false; @@ -449,7 +449,7 @@ bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, * failed the trylock, we're still not owner and we need to remove * ourselves. */ - try_to_take_rt_mutex(lock, current, waiter); + try_to_take_rt_mutex(lock, current, waiter, do_ping); /* * Unless we're the owner; we're still enqueued on the wait_list. * So check if we became owner, if not, take us off the wait_list.
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index cf6ddd1..fb7f1da 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h
@@ -84,15 +84,15 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *); + struct wake_q_head *, bool do_ping); extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, - struct task_struct *task); + struct task_struct *task, bool do_ping); extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, - struct rt_mutex_waiter *waiter); + struct rt_mutex_waiter *waiter, bool do_ping); extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);