|  | From: Oleg Nesterov <oleg@redhat.com> | 
|  | Date: Tue, 14 Jul 2015 14:26:34 +0200 | 
|  | Subject: signal/x86: Delay calling signals in atomic | 
|  |  | 
|  | On x86_64 we must disable preemption before we enable interrupts | 
|  | for stack faults, int3 and debugging, because the current task is using | 
|  | a per CPU debug stack defined by the IST. If we schedule out, another task | 
|  | can come in and use the same stack and cause the stack to be corrupted | 
|  | and crash the kernel on return. | 
|  |  | 
|  | When CONFIG_PREEMPT_RT_FULL is enabled, spin_locks become mutexes, and | 
|  | one of these is the spin lock used in signal handling. | 
|  |  | 
|  | Some of the debug code (int3) causes do_trap() to send a signal. | 
|  | This function takes a spin lock that has been converted to a mutex | 
|  | and can therefore sleep. If this happens, the stack corruption | 
|  | described above is possible. | 
|  |  | 
|  | Instead of sending the signal right away, for PREEMPT_RT and x86_64, | 
|  | the signal information is stored in the task's task_struct and | 
|  | TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume | 
|  | code will send the signal when preemption is enabled. | 
|  |  | 
|  | [ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT_FULL to | 
|  | ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ] | 
|  |  | 
|  |  | 
|  | Signed-off-by: Oleg Nesterov <oleg@redhat.com> | 
|  | Signed-off-by: Steven Rostedt <rostedt@goodmis.org> | 
|  | Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | 
|  | --- | 
|  |  | 
|  | arch/x86/entry/common.c       |    7 +++++++ | 
|  | arch/x86/include/asm/signal.h |   13 +++++++++++++ | 
|  | include/linux/sched.h         |    4 ++++ | 
|  | kernel/signal.c               |   37 +++++++++++++++++++++++++++++++++++-- | 
|  | 4 files changed, 59 insertions(+), 2 deletions(-) | 
|  |  | 
|  | --- a/arch/x86/entry/common.c | 
|  | +++ b/arch/x86/entry/common.c | 
|  | @@ -151,6 +151,13 @@ static void exit_to_usermode_loop(struct | 
|  | if (cached_flags & _TIF_NEED_RESCHED) | 
|  | schedule(); | 
|  |  | 
|  | +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND | 
|  | +		if (unlikely(current->forced_info.si_signo)) { | 
|  | +			struct task_struct *t = current; | 
|  | +			force_sig_info(t->forced_info.si_signo, &t->forced_info, t); | 
|  | +			t->forced_info.si_signo = 0; | 
|  | +		} | 
|  | +#endif | 
|  | if (cached_flags & _TIF_UPROBE) | 
|  | uprobe_notify_resume(regs); | 
|  |  | 
|  | --- a/arch/x86/include/asm/signal.h | 
|  | +++ b/arch/x86/include/asm/signal.h | 
|  | @@ -28,6 +28,19 @@ typedef struct { | 
|  | #define SA_IA32_ABI	0x02000000u | 
|  | #define SA_X32_ABI	0x01000000u | 
|  |  | 
|  | +/* | 
|  | + * Because some traps use the IST stack, we must keep preemption | 
|  | + * disabled while calling do_trap(), but do_trap() may call | 
|  | + * force_sig_info() which will grab the signal spin_locks for the | 
|  | + * task, which in PREEMPT_RT_FULL are mutexes.  By defining | 
|  | + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set | 
|  | + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the | 
|  | + * trap. | 
|  | + */ | 
|  | +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) | 
|  | +#define ARCH_RT_DELAYS_SIGNAL_SEND | 
|  | +#endif | 
|  | + | 
|  | #ifndef CONFIG_COMPAT | 
|  | typedef sigset_t compat_sigset_t; | 
|  | #endif | 
|  | --- a/include/linux/sched.h | 
|  | +++ b/include/linux/sched.h | 
|  | @@ -805,6 +805,10 @@ struct task_struct { | 
|  | /* Restored if set_restore_sigmask() was used: */ | 
|  | sigset_t			saved_sigmask; | 
|  | struct sigpending		pending; | 
|  | +#ifdef CONFIG_PREEMPT_RT_FULL | 
|  | +	/* TODO: move me into ->restart_block ? */ | 
|  | +	struct				siginfo forced_info; | 
|  | +#endif | 
|  | unsigned long			sas_ss_sp; | 
|  | size_t				sas_ss_size; | 
|  | unsigned int			sas_ss_flags; | 
|  | --- a/kernel/signal.c | 
|  | +++ b/kernel/signal.c | 
|  | @@ -1185,8 +1185,8 @@ int do_send_sig_info(int sig, struct sig | 
|  | * We don't want to have recursive SIGSEGV's etc, for example, | 
|  | * that is why we also clear SIGNAL_UNKILLABLE. | 
|  | */ | 
|  | -int | 
|  | -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 
|  | +static int | 
|  | +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 
|  | { | 
|  | unsigned long int flags; | 
|  | int ret, blocked, ignored; | 
|  | @@ -1215,6 +1215,39 @@ force_sig_info(int sig, struct siginfo * | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 
|  | +{ | 
|  | +/* | 
|  | + * On some archs, PREEMPT_RT has to delay sending a signal from a trap | 
|  | + * since it can not enable preemption, and the signal code's spin_locks | 
|  | + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will | 
|  | + * send the signal on exit of the trap. | 
|  | + */ | 
|  | +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND | 
|  | +	if (in_atomic()) { | 
|  | +		if (WARN_ON_ONCE(t != current)) | 
|  | +			return 0; | 
|  | +		if (WARN_ON_ONCE(t->forced_info.si_signo)) | 
|  | +			return 0; | 
|  | + | 
|  | +		if (is_si_special(info)) { | 
|  | +			WARN_ON_ONCE(info != SEND_SIG_PRIV); | 
|  | +			t->forced_info.si_signo = sig; | 
|  | +			t->forced_info.si_errno = 0; | 
|  | +			t->forced_info.si_code = SI_KERNEL; | 
|  | +			t->forced_info.si_pid = 0; | 
|  | +			t->forced_info.si_uid = 0; | 
|  | +		} else { | 
|  | +			t->forced_info = *info; | 
|  | +		} | 
|  | + | 
|  | +		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | 
|  | +		return 0; | 
|  | +	} | 
|  | +#endif | 
|  | +	return do_force_sig_info(sig, info, t); | 
|  | +} | 
|  | + | 
|  | /* | 
|  | * Nuke all other threads in the group. | 
|  | */ |