| From 10ea3b21cac6cb31c3b15043bebad42700e1f367 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Mon, 7 Mar 2022 20:03:21 -0500 |
| Subject: riscv: Fixed misaligned memory access. Fixed pointer comparison. |
| |
| From: Michael T. Kloos <michael@michaelkloos.com> |
| |
| [ Upstream commit 9d1f0ec9f71780e69ceb9d91697600c747d6e02e ] |
| |
| Rewrote the RISC-V memmove() assembly implementation. The |
| previous implementation did not check memory alignment and it |
| compared 2 pointers with a signed comparison. The misaligned |
| memory access would cause the kernel to crash on systems that |
| did not emulate it in firmware and did not support it in hardware. |
| Firmware emulation is slow and may not exist. The RISC-V spec |
| does not guarantee that support for misaligned memory accesses |
| will exist. It should not be depended on. |
| |
| This patch now checks for XLEN granularity of co-alignment between |
| the pointers. Failing that, copying is done by loading from the 2 |
| contiguous and naturally aligned XLEN memory locations containing |
| the overlapping XLEN sized data to be copied. The data is shifted |
| into the correct place and bitwise OR'ed together on each |
| iteration. The result is then stored into the corresponding |
| naturally aligned XLEN sized location in the destination. For |
| unaligned data at the terminations of the regions to be copied |
| or for copies less than (2 * XLEN) in size, byte copy is used. |
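| |
| (For reference only, not part of the patch: a minimal C sketch of the |
| shift-and-OR technique described above, forward direction, assuming |
| little-endian and that src is not register-aligned; all names are |
| illustrative.) |
| |
| 	#include <stdint.h> |
| 	#include <stddef.h> |
| |
| 	/* |
| 	 * Copy n bytes forward when dest is already register-aligned but |
| 	 * src is not co-aligned with it, so off below is never 0 and the |
| 	 * shifts never reach the full register width.  Loads only touch |
| 	 * naturally aligned words, mirroring the aligned loads in the |
| 	 * assembly. |
| 	 */ |
| 	static void misaligned_word_copy(unsigned long *dest, |
| 					 const unsigned char *src, size_t n) |
| 	{ |
| 		size_t off = (uintptr_t)src & (sizeof(long) - 1); |
| 		const unsigned long *s = (const unsigned long *)(src - off); |
| 		unsigned int shift = off * 8;                   /* bits to drop */ |
| 		unsigned int ishift = 8 * sizeof(long) - shift; /* inverse shift */ |
| 		unsigned long lo = *s++; |
| |
| 		for (size_t i = 0; i < n / sizeof(long); i++) { |
| 			unsigned long hi = *s++; |
| 			dest[i] = (lo >> shift) | (hi << ishift); |
| 			lo = hi; |
| 		} |
| 		/* The remaining n % sizeof(long) bytes are byte-copied. */ |
| 	} |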
| |
| This patch also now uses unsigned comparison for the pointers and |
| migrates to the newer assembler annotations from the now deprecated |
| ones. |
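| |
| (Again for illustration only, with made-up example addresses: on RV64, |
| an address with the top bit set compares as negative under the old |
| slt-based check, which could pick the wrong copy direction; bltu |
| compares the pointers as unsigned.  In C terms:) |
| |
| 	#include <stdio.h> |
| |
| 	int main(void) |
| 	{ |
| 		unsigned long src  = 0xffffffc000000000UL; /* top bit set   */ |
| 		unsigned long dest = 0x0000003fffff0000UL; /* top bit clear */ |
| |
| 		/* Signed view (what slt computes): src looks "smaller". */ |
| 		printf("signed:   %d\n", (long)src < (long)dest); /* prints 1 */ |
| |
| 		/* Unsigned view (what bltu computes): correct ordering. */ |
| 		printf("unsigned: %d\n", src < dest);             /* prints 0 */ |
| 		return 0; |
| 	} |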
| |
| Signed-off-by: Michael T. Kloos <michael@michaelkloos.com> |
| Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| arch/riscv/lib/memmove.S | 368 +++++++++++++++++++++++++++++++++------ |
| 1 file changed, 310 insertions(+), 58 deletions(-) |
| |
| diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S |
| index 07d1d2152ba5..e0609e1f0864 100644 |
| --- a/arch/riscv/lib/memmove.S |
| +++ b/arch/riscv/lib/memmove.S |
| @@ -1,64 +1,316 @@ |
| -/* SPDX-License-Identifier: GPL-2.0 */ |
| +/* SPDX-License-Identifier: GPL-2.0-only */ |
| +/* |
| + * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com> |
| + */ |
| |
| #include <linux/linkage.h> |
| #include <asm/asm.h> |
| |
| -ENTRY(__memmove) |
| -WEAK(memmove) |
| - move t0, a0 |
| - move t1, a1 |
| - |
| - beq a0, a1, exit_memcpy |
| - beqz a2, exit_memcpy |
| - srli t2, a2, 0x2 |
| - |
| - slt t3, a0, a1 |
| - beqz t3, do_reverse |
| - |
| - andi a2, a2, 0x3 |
| - li t4, 1 |
| - beqz t2, byte_copy |
| - |
| -word_copy: |
| - lw t3, 0(a1) |
| - addi t2, t2, -1 |
| - addi a1, a1, 4 |
| - sw t3, 0(a0) |
| - addi a0, a0, 4 |
| - bnez t2, word_copy |
| - beqz a2, exit_memcpy |
| - j byte_copy |
| - |
| -do_reverse: |
| - add a0, a0, a2 |
| - add a1, a1, a2 |
| - andi a2, a2, 0x3 |
| - li t4, -1 |
| - beqz t2, reverse_byte_copy |
| - |
| -reverse_word_copy: |
| - addi a1, a1, -4 |
| - addi t2, t2, -1 |
| - lw t3, 0(a1) |
| - addi a0, a0, -4 |
| - sw t3, 0(a0) |
| - bnez t2, reverse_word_copy |
| - beqz a2, exit_memcpy |
| - |
| -reverse_byte_copy: |
| - addi a0, a0, -1 |
| - addi a1, a1, -1 |
| +SYM_FUNC_START(__memmove) |
| +SYM_FUNC_START_WEAK(memmove) |
| + /* |
| + * Returns |
| + * a0 - dest |
| + * |
| + * Parameters |
| + * a0 - Inclusive first byte of dest |
| + * a1 - Inclusive first byte of src |
| + * a2 - Length of copy n |
| + * |
| + * Because the return matches the parameter register a0, |
| + * we will not clobber or modify that register. |
| + * |
| + * Note: This currently only works on little-endian. |
| + * To port to big-endian, reverse the direction of shifts |
| + * in the 2 misaligned fixup copy loops. |
| + */ |
| |
| + /* Return if nothing to do */ |
| + beq a0, a1, return_from_memmove |
| + beqz a2, return_from_memmove |
| + |
| + /* |
| + * Register Uses |
| + * Forward Copy: a1 - Index counter of src |
| + * Reverse Copy: a4 - Index counter of src |
| + * Forward Copy: t3 - Index counter of dest |
| + * Reverse Copy: t4 - Index counter of dest |
| + * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest |
| + * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest |
| + * Both Copy Modes: t0 - Link / Temporary for load-store |
| + * Both Copy Modes: t1 - Temporary for load-store |
| + * Both Copy Modes: t2 - Temporary for load-store |
| + * Both Copy Modes: a5 - dest to src alignment offset |
| +	 * Both Copy Modes: a6 - Shift amount |
| +	 * Both Copy Modes: a7 - Inverse Shift amount |
| + * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops |
| + */ |
| + |
| + /* |
| + * Solve for some register values now. |
| + * Byte copy does not need t5 or t6. |
| + */ |
| + mv t3, a0 |
| + add t4, a0, a2 |
| + add a4, a1, a2 |
| + |
| + /* |
| + * Byte copy if copying less than (2 * SZREG) bytes. This can |
| + * cause problems with the bulk copy implementation and is |
| + * small enough not to bother. |
| + */ |
| + andi t0, a2, -(2 * SZREG) |
| + beqz t0, byte_copy |
| + |
| + /* |
| + * Now solve for t5 and t6. |
| + */ |
| + andi t5, t3, -SZREG |
| + andi t6, t4, -SZREG |
| + /* |
| + * If dest(Register t3) rounded down to the nearest naturally |
| + * aligned SZREG address, does not equal dest, then add SZREG |
| + * to find the low-bound of SZREG alignment in the dest memory |
| + * region. Note that this could overshoot the dest memory |
| + * region if n is less than SZREG. This is one reason why |
| + * we always byte copy if n is less than SZREG. |
| + * Otherwise, dest is already naturally aligned to SZREG. |
| + */ |
| + beq t5, t3, 1f |
| + addi t5, t5, SZREG |
| + 1: |
| + |
| + /* |
| + * If the dest and src are co-aligned to SZREG, then there is |
| + * no need for the full rigmarole of a full misaligned fixup copy. |
| + * Instead, do a simpler co-aligned copy. |
| + */ |
| + xor t0, a0, a1 |
| + andi t1, t0, (SZREG - 1) |
| + beqz t1, coaligned_copy |
| + /* Fall through to misaligned fixup copy */ |
| + |
| +misaligned_fixup_copy: |
| + bltu a1, a0, misaligned_fixup_copy_reverse |
| + |
| +misaligned_fixup_copy_forward: |
| + jal t0, byte_copy_until_aligned_forward |
| + |
| + andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */ |
| + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ |
| + sub a5, a1, t3 /* Find the difference between src and dest */ |
| + andi a1, a1, -SZREG /* Align the src pointer */ |
| +	addi	a2, t6, SZREG /* The other breakpoint for the unrolled loop */ |
| + |
| + /* |
| + * Compute The Inverse Shift |
| + * a7 = XLEN - a6 = XLEN + -a6 |
| + * 2s complement negation to find the negative: -a6 = ~a6 + 1 |
| + * Add that to XLEN. XLEN = SZREG * 8. |
| + */ |
| + not a7, a6 |
| + addi a7, a7, (SZREG * 8 + 1) |
| + |
| + /* |
| + * Fix Misalignment Copy Loop - Forward |
| + * load_val0 = load_ptr[0]; |
| + * do { |
| + * load_val1 = load_ptr[1]; |
| + * store_ptr += 2; |
| + * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7}); |
| + * |
| + * if (store_ptr == {a2}) |
| + * break; |
| + * |
| + * load_val0 = load_ptr[2]; |
| + * load_ptr += 2; |
| + * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7}); |
| + * |
| + * } while (store_ptr != store_ptr_end); |
| + * store_ptr = store_ptr_end; |
| + */ |
| + |
| + REG_L t0, (0 * SZREG)(a1) |
| + 1: |
| + REG_L t1, (1 * SZREG)(a1) |
| + addi t3, t3, (2 * SZREG) |
| + srl t0, t0, a6 |
| + sll t2, t1, a7 |
| + or t2, t0, t2 |
| + REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3) |
| + |
| + beq t3, a2, 2f |
| + |
| + REG_L t0, (2 * SZREG)(a1) |
| + addi a1, a1, (2 * SZREG) |
| + srl t1, t1, a6 |
| + sll t2, t0, a7 |
| + or t2, t1, t2 |
| + REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3) |
| + |
| + bne t3, t6, 1b |
| + 2: |
| + mv t3, t6 /* Fix the dest pointer in case the loop was broken */ |
| + |
| + add a1, t3, a5 /* Restore the src pointer */ |
| + j byte_copy_forward /* Copy any remaining bytes */ |
| + |
| +misaligned_fixup_copy_reverse: |
| + jal t0, byte_copy_until_aligned_reverse |
| + |
| + andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */ |
| + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ |
| + sub a5, a4, t4 /* Find the difference between src and dest */ |
| + andi a4, a4, -SZREG /* Align the src pointer */ |
| +	addi	a2, t5, -SZREG /* The other breakpoint for the unrolled loop */ |
| + |
| + /* |
| + * Compute The Inverse Shift |
| + * a7 = XLEN - a6 = XLEN + -a6 |
| + * 2s complement negation to find the negative: -a6 = ~a6 + 1 |
| + * Add that to XLEN. XLEN = SZREG * 8. |
| + */ |
| + not a7, a6 |
| + addi a7, a7, (SZREG * 8 + 1) |
| + |
| + /* |
| + * Fix Misalignment Copy Loop - Reverse |
| + * load_val1 = load_ptr[0]; |
| + * do { |
| + * load_val0 = load_ptr[-1]; |
| + * store_ptr -= 2; |
| + * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7}); |
| + * |
| + * if (store_ptr == {a2}) |
| + * break; |
| + * |
| + * load_val1 = load_ptr[-2]; |
| + * load_ptr -= 2; |
| + * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7}); |
| + * |
| + * } while (store_ptr != store_ptr_end); |
| + * store_ptr = store_ptr_end; |
| + */ |
| + |
| + REG_L t1, ( 0 * SZREG)(a4) |
| + 1: |
| + REG_L t0, (-1 * SZREG)(a4) |
| + addi t4, t4, (-2 * SZREG) |
| + sll t1, t1, a7 |
| + srl t2, t0, a6 |
| + or t2, t1, t2 |
| + REG_S t2, ( 1 * SZREG)(t4) |
| + |
| + beq t4, a2, 2f |
| + |
| + REG_L t1, (-2 * SZREG)(a4) |
| + addi a4, a4, (-2 * SZREG) |
| + sll t0, t0, a7 |
| + srl t2, t1, a6 |
| + or t2, t0, t2 |
| + REG_S t2, ( 0 * SZREG)(t4) |
| + |
| + bne t4, t5, 1b |
| + 2: |
| + mv t4, t5 /* Fix the dest pointer in case the loop was broken */ |
| + |
| + add a4, t4, a5 /* Restore the src pointer */ |
| + j byte_copy_reverse /* Copy any remaining bytes */ |
| + |
| +/* |
| + * Simple copy loops for SZREG co-aligned memory locations. |
| + * These also make calls to do byte copies for any unaligned |
| + * data at their terminations. |
| + */ |
| +coaligned_copy: |
| + bltu a1, a0, coaligned_copy_reverse |
| + |
| +coaligned_copy_forward: |
| + jal t0, byte_copy_until_aligned_forward |
| + |
| + 1: |
| + REG_L t1, ( 0 * SZREG)(a1) |
| + addi a1, a1, SZREG |
| + addi t3, t3, SZREG |
| + REG_S t1, (-1 * SZREG)(t3) |
| + bne t3, t6, 1b |
| + |
| + j byte_copy_forward /* Copy any remaining bytes */ |
| + |
| +coaligned_copy_reverse: |
| + jal t0, byte_copy_until_aligned_reverse |
| + |
| + 1: |
| + REG_L t1, (-1 * SZREG)(a4) |
| + addi a4, a4, -SZREG |
| + addi t4, t4, -SZREG |
| + REG_S t1, ( 0 * SZREG)(t4) |
| + bne t4, t5, 1b |
| + |
| + j byte_copy_reverse /* Copy any remaining bytes */ |
| + |
| +/* |
| + * These are basically sub-functions within the function. They |
| + * are used to byte copy until the dest pointer is in alignment. |
| + * At which point, a bulk copy method can be used by the |
| + * calling code. These work on the same registers as the bulk |
| + * copy loops. Therefore, the register values can be picked |
| + * up from where they were left and we avoid code duplication |
| + * without any overhead except the call in and return jumps. |
| + */ |
| +byte_copy_until_aligned_forward: |
| + beq t3, t5, 2f |
| + 1: |
| + lb t1, 0(a1) |
| + addi a1, a1, 1 |
| + addi t3, t3, 1 |
| + sb t1, -1(t3) |
| + bne t3, t5, 1b |
| + 2: |
| + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ |
| + |
| +byte_copy_until_aligned_reverse: |
| + beq t4, t6, 2f |
| + 1: |
| + lb t1, -1(a4) |
| + addi a4, a4, -1 |
| + addi t4, t4, -1 |
| + sb t1, 0(t4) |
| + bne t4, t6, 1b |
| + 2: |
| + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ |
| + |
| +/* |
| + * Simple byte copy loops. |
| + * These will byte copy until they reach the end of data to copy. |
| + * At that point, they will call to return from memmove. |
| + */ |
| byte_copy: |
| - lb t3, 0(a1) |
| - addi a2, a2, -1 |
| - sb t3, 0(a0) |
| - add a1, a1, t4 |
| - add a0, a0, t4 |
| - bnez a2, byte_copy |
| - |
| -exit_memcpy: |
| - move a0, t0 |
| - move a1, t1 |
| - ret |
| -END(__memmove) |
| + bltu a1, a0, byte_copy_reverse |
| + |
| +byte_copy_forward: |
| + beq t3, t4, 2f |
| + 1: |
| + lb t1, 0(a1) |
| + addi a1, a1, 1 |
| + addi t3, t3, 1 |
| + sb t1, -1(t3) |
| + bne t3, t4, 1b |
| + 2: |
| + ret |
| + |
| +byte_copy_reverse: |
| + beq t4, t3, 2f |
| + 1: |
| + lb t1, -1(a4) |
| + addi a4, a4, -1 |
| + addi t4, t4, -1 |
| + sb t1, 0(t4) |
| + bne t4, t3, 1b |
| + 2: |
| + |
| +return_from_memmove: |
| + ret |
| + |
| +SYM_FUNC_END(memmove) |
| +SYM_FUNC_END(__memmove) |
| -- |
| 2.35.1 |
| |