ARM: entry: avoid 'badr' for setting the link register

Setting the link register directly before performing a jump instruction
is not equivalent to using the bl instruction: it bypasses the
return stack, which results in the wrong return path being predicted,
hurting performance. So let's switch to ordinary bl sequences where
we can.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9243ea8..47e3200 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -504,8 +504,8 @@
 	@ r2 = PC value for the following instruction (:= regs->ARM_pc)
 	@ r4 = PC value for the faulting instruction
 	@ lr = 32-bit undefined instruction function
-	badr	lr, __und_usr_fault_32
-	b	call_fpe
+	bl	call_fpe
+	b	__und_usr_fault_32
 
 __und_usr_thumb:
 	@ Thumb instruction
@@ -541,11 +541,9 @@
 	add	r2, r2, #2			@ r2 is PC + 2, make it PC + 4
 	str	r2, [sp, #S_PC]			@ it's a 2x16bit instr, update
 	orr	r0, r0, r5, lsl #16
-	badr	lr, __und_usr_fault_32
 	@ r0 = the two 16-bit Thumb instructions which caused the exception
 	@ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc)
 	@ r4 = PC value for the first 16-bit Thumb instruction
-	@ lr = 32bit undefined instruction function
 
 #if __LINUX_ARM_ARCH__ < 7
 /* If the target arch was overridden, change it back: */
@@ -608,7 +606,8 @@
 #ifdef CONFIG_NEON
 	get_thread_info r10			@ get current thread
 	adr	r6, .LCneon_thumb_opcodes
-	b	2f
+	bl	2f
+	b	__und_usr_fault_32
 #endif
 call_fpe:
 	get_thread_info r10			@ get current thread
@@ -738,8 +737,8 @@
 __und_usr_fault_16:
 	mov	r1, #2
 1:	mov	r0, sp
-	badr	lr, ret_from_exception
-	b	__und_fault
+	bl	__und_fault
+	b	ret_from_exception
 ENDPROC(__und_usr_fault_32)
 ENDPROC(__und_usr_fault_16)