| /* SPDX-License-Identifier: GPL-2.0-or-later */ | 
 | # | 
 | # Accelerated chacha20 implementation for ppc64le. | 
 | # | 
 | # Copyright 2023- IBM Corp. All rights reserved | 
 | # | 
 | #=================================================================================== | 
 | # Written by Danny Tsen <dtsen@us.ibm.com> | 
 | # | 
 | # chacha_p10le_8x(u32 *state, byte *dst, const byte *src, | 
 | #				 size_t len, int nrounds); | 
 | # | 
| # Each iteration of the round loop performs a double round (8 quarter rounds). | 
| # One quarter round on the words (a, b, c, d): | 
| # 1.  a += b; d ^= a; d <<<= 16; | 
| # 2.  c += d; b ^= c; b <<<= 12; | 
| # 3.  a += b; d ^= a; d <<<= 8; | 
| # 4.  c += d; b ^= c; b <<<= 7; | 
 | # | 
 | # row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16 | 
 | # row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12 | 
 | # row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8 | 
 | # row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7 | 
 | # | 
| # 4 blocks (a, b, c, d) are processed in parallel; vector register i holds word i of all four blocks: | 
 | # | 
 | # a0 b0 c0 d0 | 
 | # a1 b1 c1 d1 | 
 | # ... | 
 | # a4 b4 c4 d4 | 
 | # ... | 
 | # a8 b8 c8 d8 | 
 | # ... | 
 | # a12 b12 c12 d12 | 
 | # a13 ... | 
 | # a14 ... | 
 | # a15 b15 c15 d15 | 
 | # | 
 | # Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) | 
| # Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14) | 
 | # | 
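| # For reference, one ChaCha double round on the 16-word state x[0..15] in | 
| # rough C form (illustration only, not part of the build; rol32() as in | 
| # <linux/bitops.h>): | 
| # | 
| #	#define QR(a, b, c, d) do {			\ | 
| #		a += b; d ^= a; d = rol32(d, 16);	\ | 
| #		c += d; b ^= c; b = rol32(b, 12);	\ | 
| #		a += b; d ^= a; d = rol32(d, 8);	\ | 
| #		c += d; b ^= c; b = rol32(b, 7);	\ | 
| #	} while (0) | 
| # | 
| #	/* column round */ | 
| #	QR(x[0], x[4],  x[8], x[12]); QR(x[1], x[5],  x[9], x[13]); | 
| #	QR(x[2], x[6], x[10], x[14]); QR(x[3], x[7], x[11], x[15]); | 
| #	/* diagonal round */ | 
| #	QR(x[0], x[5], x[10], x[15]); QR(x[1], x[6], x[11], x[12]); | 
| #	QR(x[2], x[7],  x[8], x[13]); QR(x[3], x[4],  x[9], x[14]); | 
| # | 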
 |  | 
 | #include <asm/ppc_asm.h> | 
 | #include <asm/asm-offsets.h> | 
 | #include <asm/asm-compat.h> | 
 | #include <linux/linkage.h> | 
 |  | 
 | .machine	"any" | 
 | .text | 
 |  | 
 | .macro	SAVE_GPR GPR OFFSET FRAME | 
 | 	std	\GPR,\OFFSET(\FRAME) | 
 | .endm | 
 |  | 
 | .macro	SAVE_VRS VRS OFFSET FRAME | 
 | 	li	16, \OFFSET | 
 | 	stvx	\VRS, 16, \FRAME | 
 | .endm | 
 |  | 
 | .macro	SAVE_VSX VSX OFFSET FRAME | 
 | 	li	16, \OFFSET | 
 | 	stxvx	\VSX, 16, \FRAME | 
 | .endm | 
 |  | 
 | .macro	RESTORE_GPR GPR OFFSET FRAME | 
 | 	ld	\GPR,\OFFSET(\FRAME) | 
 | .endm | 
 |  | 
 | .macro	RESTORE_VRS VRS OFFSET FRAME | 
 | 	li	16, \OFFSET | 
 | 	lvx	\VRS, 16, \FRAME | 
 | .endm | 
 |  | 
 | .macro	RESTORE_VSX VSX OFFSET FRAME | 
 | 	li	16, \OFFSET | 
 | 	lxvx	\VSX, 16, \FRAME | 
 | .endm | 
 |  | 
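| # Note: SAVE_VRS/SAVE_VSX and RESTORE_VRS/RESTORE_VSX use r16 as a scratch | 
| # index register.  SAVE_REGS stores r16 with the other non-volatile GPRs | 
| # before any vector saves, and RESTORE_REGS reloads it only after the | 
| # vector restores, so the clobber is harmless. | 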
 | .macro SAVE_REGS | 
 | 	mflr 0 | 
 | 	std 0, 16(1) | 
 | 	stdu 1,-752(1) | 
 |  | 
 | 	SAVE_GPR 14, 112, 1 | 
 | 	SAVE_GPR 15, 120, 1 | 
 | 	SAVE_GPR 16, 128, 1 | 
 | 	SAVE_GPR 17, 136, 1 | 
 | 	SAVE_GPR 18, 144, 1 | 
 | 	SAVE_GPR 19, 152, 1 | 
 | 	SAVE_GPR 20, 160, 1 | 
 | 	SAVE_GPR 21, 168, 1 | 
 | 	SAVE_GPR 22, 176, 1 | 
 | 	SAVE_GPR 23, 184, 1 | 
 | 	SAVE_GPR 24, 192, 1 | 
 | 	SAVE_GPR 25, 200, 1 | 
 | 	SAVE_GPR 26, 208, 1 | 
 | 	SAVE_GPR 27, 216, 1 | 
 | 	SAVE_GPR 28, 224, 1 | 
 | 	SAVE_GPR 29, 232, 1 | 
 | 	SAVE_GPR 30, 240, 1 | 
 | 	SAVE_GPR 31, 248, 1 | 
 |  | 
 | 	addi	9, 1, 256 | 
 | 	SAVE_VRS 20, 0, 9 | 
 | 	SAVE_VRS 21, 16, 9 | 
 | 	SAVE_VRS 22, 32, 9 | 
 | 	SAVE_VRS 23, 48, 9 | 
 | 	SAVE_VRS 24, 64, 9 | 
 | 	SAVE_VRS 25, 80, 9 | 
 | 	SAVE_VRS 26, 96, 9 | 
 | 	SAVE_VRS 27, 112, 9 | 
 | 	SAVE_VRS 28, 128, 9 | 
 | 	SAVE_VRS 29, 144, 9 | 
 | 	SAVE_VRS 30, 160, 9 | 
 | 	SAVE_VRS 31, 176, 9 | 
 |  | 
 | 	SAVE_VSX 14, 192, 9 | 
 | 	SAVE_VSX 15, 208, 9 | 
 | 	SAVE_VSX 16, 224, 9 | 
 | 	SAVE_VSX 17, 240, 9 | 
 | 	SAVE_VSX 18, 256, 9 | 
 | 	SAVE_VSX 19, 272, 9 | 
 | 	SAVE_VSX 20, 288, 9 | 
 | 	SAVE_VSX 21, 304, 9 | 
 | 	SAVE_VSX 22, 320, 9 | 
 | 	SAVE_VSX 23, 336, 9 | 
 | 	SAVE_VSX 24, 352, 9 | 
 | 	SAVE_VSX 25, 368, 9 | 
 | 	SAVE_VSX 26, 384, 9 | 
 | 	SAVE_VSX 27, 400, 9 | 
 | 	SAVE_VSX 28, 416, 9 | 
 | 	SAVE_VSX 29, 432, 9 | 
 | 	SAVE_VSX 30, 448, 9 | 
 | 	SAVE_VSX 31, 464, 9 | 
 | .endm # SAVE_REGS | 
 |  | 
 | .macro RESTORE_REGS | 
 | 	addi	9, 1, 256 | 
 | 	RESTORE_VRS 20, 0, 9 | 
 | 	RESTORE_VRS 21, 16, 9 | 
 | 	RESTORE_VRS 22, 32, 9 | 
 | 	RESTORE_VRS 23, 48, 9 | 
 | 	RESTORE_VRS 24, 64, 9 | 
 | 	RESTORE_VRS 25, 80, 9 | 
 | 	RESTORE_VRS 26, 96, 9 | 
 | 	RESTORE_VRS 27, 112, 9 | 
 | 	RESTORE_VRS 28, 128, 9 | 
 | 	RESTORE_VRS 29, 144, 9 | 
 | 	RESTORE_VRS 30, 160, 9 | 
 | 	RESTORE_VRS 31, 176, 9 | 
 |  | 
 | 	RESTORE_VSX 14, 192, 9 | 
 | 	RESTORE_VSX 15, 208, 9 | 
 | 	RESTORE_VSX 16, 224, 9 | 
 | 	RESTORE_VSX 17, 240, 9 | 
 | 	RESTORE_VSX 18, 256, 9 | 
 | 	RESTORE_VSX 19, 272, 9 | 
 | 	RESTORE_VSX 20, 288, 9 | 
 | 	RESTORE_VSX 21, 304, 9 | 
 | 	RESTORE_VSX 22, 320, 9 | 
 | 	RESTORE_VSX 23, 336, 9 | 
 | 	RESTORE_VSX 24, 352, 9 | 
 | 	RESTORE_VSX 25, 368, 9 | 
 | 	RESTORE_VSX 26, 384, 9 | 
 | 	RESTORE_VSX 27, 400, 9 | 
 | 	RESTORE_VSX 28, 416, 9 | 
 | 	RESTORE_VSX 29, 432, 9 | 
 | 	RESTORE_VSX 30, 448, 9 | 
 | 	RESTORE_VSX 31, 464, 9 | 
 |  | 
 | 	RESTORE_GPR 14, 112, 1 | 
 | 	RESTORE_GPR 15, 120, 1 | 
 | 	RESTORE_GPR 16, 128, 1 | 
 | 	RESTORE_GPR 17, 136, 1 | 
 | 	RESTORE_GPR 18, 144, 1 | 
 | 	RESTORE_GPR 19, 152, 1 | 
 | 	RESTORE_GPR 20, 160, 1 | 
 | 	RESTORE_GPR 21, 168, 1 | 
 | 	RESTORE_GPR 22, 176, 1 | 
 | 	RESTORE_GPR 23, 184, 1 | 
 | 	RESTORE_GPR 24, 192, 1 | 
 | 	RESTORE_GPR 25, 200, 1 | 
 | 	RESTORE_GPR 26, 208, 1 | 
 | 	RESTORE_GPR 27, 216, 1 | 
 | 	RESTORE_GPR 28, 224, 1 | 
 | 	RESTORE_GPR 29, 232, 1 | 
 | 	RESTORE_GPR 30, 240, 1 | 
 | 	RESTORE_GPR 31, 248, 1 | 
 |  | 
 | 	addi    1, 1, 752 | 
 | 	ld 0, 16(1) | 
 | 	mtlr 0 | 
 | .endm # RESTORE_REGS | 
 |  | 
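| # QT_loop_8x: one double round over 8 blocks.  Blocks 0-3 live in v0-v15 | 
| # and blocks 4-7 in v16-v31.  Since all 32 VRs hold state, the rotate | 
| # helpers are parked in vs20-vs23 and swapped into v25/v28 as needed via | 
| # xxlor, with vs0 as the temporary; vpermxor with the permx masks performs | 
| # the "xor, then rotate by 16 (or 8)" step in a single instruction. | 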
 | .macro QT_loop_8x | 
 | 	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 20, 20 | 
 | 	vadduwm 0, 0, 4 | 
 | 	vadduwm 1, 1, 5 | 
 | 	vadduwm 2, 2, 6 | 
 | 	vadduwm 3, 3, 7 | 
 | 	  vadduwm 16, 16, 20 | 
 | 	  vadduwm 17, 17, 21 | 
 | 	  vadduwm 18, 18, 22 | 
 | 	  vadduwm 19, 19, 23 | 
 |  | 
 | 	  vpermxor 12, 12, 0, 25 | 
 | 	  vpermxor 13, 13, 1, 25 | 
 | 	  vpermxor 14, 14, 2, 25 | 
 | 	  vpermxor 15, 15, 3, 25 | 
 | 	  vpermxor 28, 28, 16, 25 | 
 | 	  vpermxor 29, 29, 17, 25 | 
 | 	  vpermxor 30, 30, 18, 25 | 
 | 	  vpermxor 31, 31, 19, 25 | 
 | 	xxlor	32+25, 0, 0 | 
 | 	vadduwm 8, 8, 12 | 
 | 	vadduwm 9, 9, 13 | 
 | 	vadduwm 10, 10, 14 | 
 | 	vadduwm 11, 11, 15 | 
 | 	  vadduwm 24, 24, 28 | 
 | 	  vadduwm 25, 25, 29 | 
 | 	  vadduwm 26, 26, 30 | 
 | 	  vadduwm 27, 27, 31 | 
 | 	vxor 4, 4, 8 | 
 | 	vxor 5, 5, 9 | 
 | 	vxor 6, 6, 10 | 
 | 	vxor 7, 7, 11 | 
 | 	  vxor 20, 20, 24 | 
 | 	  vxor 21, 21, 25 | 
 | 	  vxor 22, 22, 26 | 
 | 	  vxor 23, 23, 27 | 
 |  | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 21, 21 | 
 | 	vrlw 4, 4, 25  # | 
 | 	vrlw 5, 5, 25 | 
 | 	vrlw 6, 6, 25 | 
 | 	vrlw 7, 7, 25 | 
 | 	  vrlw 20, 20, 25  # | 
 | 	  vrlw 21, 21, 25 | 
 | 	  vrlw 22, 22, 25 | 
 | 	  vrlw 23, 23, 25 | 
 | 	xxlor	32+25, 0, 0 | 
 | 	vadduwm 0, 0, 4 | 
 | 	vadduwm 1, 1, 5 | 
 | 	vadduwm 2, 2, 6 | 
 | 	vadduwm 3, 3, 7 | 
 | 	  vadduwm 16, 16, 20 | 
 | 	  vadduwm 17, 17, 21 | 
 | 	  vadduwm 18, 18, 22 | 
 | 	  vadduwm 19, 19, 23 | 
 |  | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 22, 22 | 
 | 	  vpermxor 12, 12, 0, 25 | 
 | 	  vpermxor 13, 13, 1, 25 | 
 | 	  vpermxor 14, 14, 2, 25 | 
 | 	  vpermxor 15, 15, 3, 25 | 
 | 	  vpermxor 28, 28, 16, 25 | 
 | 	  vpermxor 29, 29, 17, 25 | 
 | 	  vpermxor 30, 30, 18, 25 | 
 | 	  vpermxor 31, 31, 19, 25 | 
 | 	xxlor	32+25, 0, 0 | 
 | 	vadduwm 8, 8, 12 | 
 | 	vadduwm 9, 9, 13 | 
 | 	vadduwm 10, 10, 14 | 
 | 	vadduwm 11, 11, 15 | 
 | 	  vadduwm 24, 24, 28 | 
 | 	  vadduwm 25, 25, 29 | 
 | 	  vadduwm 26, 26, 30 | 
 | 	  vadduwm 27, 27, 31 | 
 | 	xxlor	0, 32+28, 32+28 | 
 | 	xxlor	32+28, 23, 23 | 
 | 	vxor 4, 4, 8 | 
 | 	vxor 5, 5, 9 | 
 | 	vxor 6, 6, 10 | 
 | 	vxor 7, 7, 11 | 
 | 	  vxor 20, 20, 24 | 
 | 	  vxor 21, 21, 25 | 
 | 	  vxor 22, 22, 26 | 
 | 	  vxor 23, 23, 27 | 
 | 	vrlw 4, 4, 28  # | 
 | 	vrlw 5, 5, 28 | 
 | 	vrlw 6, 6, 28 | 
 | 	vrlw 7, 7, 28 | 
 | 	  vrlw 20, 20, 28  # | 
 | 	  vrlw 21, 21, 28 | 
 | 	  vrlw 22, 22, 28 | 
 | 	  vrlw 23, 23, 28 | 
 | 	xxlor	32+28, 0, 0 | 
 |  | 
 | 	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14) | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 20, 20 | 
 | 	vadduwm 0, 0, 5 | 
 | 	vadduwm 1, 1, 6 | 
 | 	vadduwm 2, 2, 7 | 
 | 	vadduwm 3, 3, 4 | 
 | 	  vadduwm 16, 16, 21 | 
 | 	  vadduwm 17, 17, 22 | 
 | 	  vadduwm 18, 18, 23 | 
 | 	  vadduwm 19, 19, 20 | 
 |  | 
 | 	  vpermxor 15, 15, 0, 25 | 
 | 	  vpermxor 12, 12, 1, 25 | 
 | 	  vpermxor 13, 13, 2, 25 | 
 | 	  vpermxor 14, 14, 3, 25 | 
 | 	  vpermxor 31, 31, 16, 25 | 
 | 	  vpermxor 28, 28, 17, 25 | 
 | 	  vpermxor 29, 29, 18, 25 | 
 | 	  vpermxor 30, 30, 19, 25 | 
 |  | 
 | 	xxlor	32+25, 0, 0 | 
 | 	vadduwm 10, 10, 15 | 
 | 	vadduwm 11, 11, 12 | 
 | 	vadduwm 8, 8, 13 | 
 | 	vadduwm 9, 9, 14 | 
 | 	  vadduwm 26, 26, 31 | 
 | 	  vadduwm 27, 27, 28 | 
 | 	  vadduwm 24, 24, 29 | 
 | 	  vadduwm 25, 25, 30 | 
 | 	vxor 5, 5, 10 | 
 | 	vxor 6, 6, 11 | 
 | 	vxor 7, 7, 8 | 
 | 	vxor 4, 4, 9 | 
 | 	  vxor 21, 21, 26 | 
 | 	  vxor 22, 22, 27 | 
 | 	  vxor 23, 23, 24 | 
 | 	  vxor 20, 20, 25 | 
 |  | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 21, 21 | 
 | 	vrlw 5, 5, 25 | 
 | 	vrlw 6, 6, 25 | 
 | 	vrlw 7, 7, 25 | 
 | 	vrlw 4, 4, 25 | 
 | 	  vrlw 21, 21, 25 | 
 | 	  vrlw 22, 22, 25 | 
 | 	  vrlw 23, 23, 25 | 
 | 	  vrlw 20, 20, 25 | 
 | 	xxlor	32+25, 0, 0 | 
 |  | 
 | 	vadduwm 0, 0, 5 | 
 | 	vadduwm 1, 1, 6 | 
 | 	vadduwm 2, 2, 7 | 
 | 	vadduwm 3, 3, 4 | 
 | 	  vadduwm 16, 16, 21 | 
 | 	  vadduwm 17, 17, 22 | 
 | 	  vadduwm 18, 18, 23 | 
 | 	  vadduwm 19, 19, 20 | 
 |  | 
 | 	xxlor	0, 32+25, 32+25 | 
 | 	xxlor	32+25, 22, 22 | 
 | 	  vpermxor 15, 15, 0, 25 | 
 | 	  vpermxor 12, 12, 1, 25 | 
 | 	  vpermxor 13, 13, 2, 25 | 
 | 	  vpermxor 14, 14, 3, 25 | 
 | 	  vpermxor 31, 31, 16, 25 | 
 | 	  vpermxor 28, 28, 17, 25 | 
 | 	  vpermxor 29, 29, 18, 25 | 
 | 	  vpermxor 30, 30, 19, 25 | 
 | 	xxlor	32+25, 0, 0 | 
 |  | 
 | 	vadduwm 10, 10, 15 | 
 | 	vadduwm 11, 11, 12 | 
 | 	vadduwm 8, 8, 13 | 
 | 	vadduwm 9, 9, 14 | 
 | 	  vadduwm 26, 26, 31 | 
 | 	  vadduwm 27, 27, 28 | 
 | 	  vadduwm 24, 24, 29 | 
 | 	  vadduwm 25, 25, 30 | 
 |  | 
 | 	xxlor	0, 32+28, 32+28 | 
 | 	xxlor	32+28, 23, 23 | 
 | 	vxor 5, 5, 10 | 
 | 	vxor 6, 6, 11 | 
 | 	vxor 7, 7, 8 | 
 | 	vxor 4, 4, 9 | 
 | 	  vxor 21, 21, 26 | 
 | 	  vxor 22, 22, 27 | 
 | 	  vxor 23, 23, 24 | 
 | 	  vxor 20, 20, 25 | 
 | 	vrlw 5, 5, 28 | 
 | 	vrlw 6, 6, 28 | 
 | 	vrlw 7, 7, 28 | 
 | 	vrlw 4, 4, 28 | 
 | 	  vrlw 21, 21, 28 | 
 | 	  vrlw 22, 22, 28 | 
 | 	  vrlw 23, 23, 28 | 
 | 	  vrlw 20, 20, 28 | 
 | 	xxlor	32+28, 0, 0 | 
 | .endm | 
 |  | 
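| # QT_loop_4x: one double round over 4 blocks (v0-v15 only).  Here the | 
| # rotate helpers stay resident in v20-v23 (permx mask for 16, rotate | 
| # amount 12, permx mask for 8, rotate amount 7), so no xxlor spilling | 
| # is needed. | 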
 | .macro QT_loop_4x | 
 | 	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) | 
 | 	vadduwm 0, 0, 4 | 
 | 	vadduwm 1, 1, 5 | 
 | 	vadduwm 2, 2, 6 | 
 | 	vadduwm 3, 3, 7 | 
 | 	  vpermxor 12, 12, 0, 20 | 
 | 	  vpermxor 13, 13, 1, 20 | 
 | 	  vpermxor 14, 14, 2, 20 | 
 | 	  vpermxor 15, 15, 3, 20 | 
 | 	vadduwm 8, 8, 12 | 
 | 	vadduwm 9, 9, 13 | 
 | 	vadduwm 10, 10, 14 | 
 | 	vadduwm 11, 11, 15 | 
 | 	vxor 4, 4, 8 | 
 | 	vxor 5, 5, 9 | 
 | 	vxor 6, 6, 10 | 
 | 	vxor 7, 7, 11 | 
 | 	vrlw 4, 4, 21 | 
 | 	vrlw 5, 5, 21 | 
 | 	vrlw 6, 6, 21 | 
 | 	vrlw 7, 7, 21 | 
 | 	vadduwm 0, 0, 4 | 
 | 	vadduwm 1, 1, 5 | 
 | 	vadduwm 2, 2, 6 | 
 | 	vadduwm 3, 3, 7 | 
 | 	  vpermxor 12, 12, 0, 22 | 
 | 	  vpermxor 13, 13, 1, 22 | 
 | 	  vpermxor 14, 14, 2, 22 | 
 | 	  vpermxor 15, 15, 3, 22 | 
 | 	vadduwm 8, 8, 12 | 
 | 	vadduwm 9, 9, 13 | 
 | 	vadduwm 10, 10, 14 | 
 | 	vadduwm 11, 11, 15 | 
 | 	vxor 4, 4, 8 | 
 | 	vxor 5, 5, 9 | 
 | 	vxor 6, 6, 10 | 
 | 	vxor 7, 7, 11 | 
 | 	vrlw 4, 4, 23 | 
 | 	vrlw 5, 5, 23 | 
 | 	vrlw 6, 6, 23 | 
 | 	vrlw 7, 7, 23 | 
 |  | 
 | 	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14) | 
 | 	vadduwm 0, 0, 5 | 
 | 	vadduwm 1, 1, 6 | 
 | 	vadduwm 2, 2, 7 | 
 | 	vadduwm 3, 3, 4 | 
 | 	  vpermxor 15, 15, 0, 20 | 
 | 	  vpermxor 12, 12, 1, 20 | 
 | 	  vpermxor 13, 13, 2, 20 | 
 | 	  vpermxor 14, 14, 3, 20 | 
 | 	vadduwm 10, 10, 15 | 
 | 	vadduwm 11, 11, 12 | 
 | 	vadduwm 8, 8, 13 | 
 | 	vadduwm 9, 9, 14 | 
 | 	vxor 5, 5, 10 | 
 | 	vxor 6, 6, 11 | 
 | 	vxor 7, 7, 8 | 
 | 	vxor 4, 4, 9 | 
 | 	vrlw 5, 5, 21 | 
 | 	vrlw 6, 6, 21 | 
 | 	vrlw 7, 7, 21 | 
 | 	vrlw 4, 4, 21 | 
 | 	vadduwm 0, 0, 5 | 
 | 	vadduwm 1, 1, 6 | 
 | 	vadduwm 2, 2, 7 | 
 | 	vadduwm 3, 3, 4 | 
 | 	  vpermxor 15, 15, 0, 22 | 
 | 	  vpermxor 12, 12, 1, 22 | 
 | 	  vpermxor 13, 13, 2, 22 | 
 | 	  vpermxor 14, 14, 3, 22 | 
 | 	vadduwm 10, 10, 15 | 
 | 	vadduwm 11, 11, 12 | 
 | 	vadduwm 8, 8, 13 | 
 | 	vadduwm 9, 9, 14 | 
 | 	vxor 5, 5, 10 | 
 | 	vxor 6, 6, 11 | 
 | 	vxor 7, 7, 8 | 
 | 	vxor 4, 4, 9 | 
 | 	vrlw 5, 5, 23 | 
 | 	vrlw 6, 6, 23 | 
 | 	vrlw 7, 7, 23 | 
 | 	vrlw 4, 4, 23 | 
 | .endm | 
 |  | 
 | # Transpose | 
 | .macro TP_4x a0 a1 a2 a3 | 
 | 	xxmrghw  10, 32+\a0, 32+\a1	# a0, a1, b0, b1 | 
 | 	xxmrghw  11, 32+\a2, 32+\a3	# a2, a3, b2, b3 | 
 | 	xxmrglw  12, 32+\a0, 32+\a1	# c0, c1, d0, d1 | 
 | 	xxmrglw  13, 32+\a2, 32+\a3	# c2, c3, d2, d3 | 
 | 	xxpermdi	32+\a0, 10, 11, 0	# a0, a1, a2, a3 | 
 | 	xxpermdi	32+\a1, 10, 11, 3	# b0, b1, b2, b3 | 
 | 	xxpermdi	32+\a2, 12, 13, 0	# c0, c1, c2, c3 | 
 | 	xxpermdi	32+\a3, 12, 13, 3	# d0, d1, d2, d3 | 
 | .endm | 
 |  | 
 | # key stream = working state + state | 
 | .macro Add_state S | 
 | 	vadduwm \S+0, \S+0, 16-\S | 
 | 	vadduwm \S+4, \S+4, 17-\S | 
 | 	vadduwm \S+8, \S+8, 18-\S | 
 | 	vadduwm \S+12, \S+12, 19-\S | 
 |  | 
 | 	vadduwm \S+1, \S+1, 16-\S | 
 | 	vadduwm \S+5, \S+5, 17-\S | 
 | 	vadduwm \S+9, \S+9, 18-\S | 
 | 	vadduwm \S+13, \S+13, 19-\S | 
 |  | 
 | 	vadduwm \S+2, \S+2, 16-\S | 
 | 	vadduwm \S+6, \S+6, 17-\S | 
 | 	vadduwm \S+10, \S+10, 18-\S | 
 | 	vadduwm \S+14, \S+14, 19-\S | 
 |  | 
 | 	vadduwm	\S+3, \S+3, 16-\S | 
 | 	vadduwm	\S+7, \S+7, 17-\S | 
 | 	vadduwm	\S+11, \S+11, 18-\S | 
 | 	vadduwm	\S+15, \S+15, 19-\S | 
 | .endm | 
 |  | 
| # | 
| # Write 256 bytes: load 256 bytes from src + offset (r5 + r14), XOR them | 
| # with the key stream held in VSX regs \S+32 .. \S+47, and store the | 
| # result to dst + offset (r4 + r14). | 
| # | 
 | .macro Write_256 S | 
 | 	add 9, 14, 5 | 
 | 	add 16, 14, 4 | 
 | 	lxvw4x 0, 0, 9 | 
 | 	lxvw4x 1, 17, 9 | 
 | 	lxvw4x 2, 18, 9 | 
 | 	lxvw4x 3, 19, 9 | 
 | 	lxvw4x 4, 20, 9 | 
 | 	lxvw4x 5, 21, 9 | 
 | 	lxvw4x 6, 22, 9 | 
 | 	lxvw4x 7, 23, 9 | 
 | 	lxvw4x 8, 24, 9 | 
 | 	lxvw4x 9, 25, 9 | 
 | 	lxvw4x 10, 26, 9 | 
 | 	lxvw4x 11, 27, 9 | 
 | 	lxvw4x 12, 28, 9 | 
 | 	lxvw4x 13, 29, 9 | 
 | 	lxvw4x 14, 30, 9 | 
 | 	lxvw4x 15, 31, 9 | 
 |  | 
 | 	xxlxor \S+32, \S+32, 0 | 
 | 	xxlxor \S+36, \S+36, 1 | 
 | 	xxlxor \S+40, \S+40, 2 | 
 | 	xxlxor \S+44, \S+44, 3 | 
 | 	xxlxor \S+33, \S+33, 4 | 
 | 	xxlxor \S+37, \S+37, 5 | 
 | 	xxlxor \S+41, \S+41, 6 | 
 | 	xxlxor \S+45, \S+45, 7 | 
 | 	xxlxor \S+34, \S+34, 8 | 
 | 	xxlxor \S+38, \S+38, 9 | 
 | 	xxlxor \S+42, \S+42, 10 | 
 | 	xxlxor \S+46, \S+46, 11 | 
 | 	xxlxor \S+35, \S+35, 12 | 
 | 	xxlxor \S+39, \S+39, 13 | 
 | 	xxlxor \S+43, \S+43, 14 | 
 | 	xxlxor \S+47, \S+47, 15 | 
 |  | 
 | 	stxvw4x \S+32, 0, 16 | 
 | 	stxvw4x \S+36, 17, 16 | 
 | 	stxvw4x \S+40, 18, 16 | 
 | 	stxvw4x \S+44, 19, 16 | 
 |  | 
 | 	stxvw4x \S+33, 20, 16 | 
 | 	stxvw4x \S+37, 21, 16 | 
 | 	stxvw4x \S+41, 22, 16 | 
 | 	stxvw4x \S+45, 23, 16 | 
 |  | 
 | 	stxvw4x \S+34, 24, 16 | 
 | 	stxvw4x \S+38, 25, 16 | 
 | 	stxvw4x \S+42, 26, 16 | 
 | 	stxvw4x \S+46, 27, 16 | 
 |  | 
 | 	stxvw4x \S+35, 28, 16 | 
 | 	stxvw4x \S+39, 29, 16 | 
 | 	stxvw4x \S+43, 30, 16 | 
 | 	stxvw4x \S+47, 31, 16 | 
 |  | 
 | .endm | 
 |  | 
 | # | 
| # chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds); | 
 | # | 
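| # Arguments (ELFv2 ABI): r3 = state, r4 = dst, r5 = src, r6 = len in bytes, | 
| # r7 = nrounds.  Only whole 256-byte chunks are processed; any remainder of | 
| # less than 256 bytes is left for the caller. | 
| # | 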
 | SYM_FUNC_START(chacha_p10le_8x) | 
 | .align 5 | 
 | 	cmpdi	6, 0 | 
 | 	ble	Out_no_chacha | 
 |  | 
 | 	SAVE_REGS | 
 |  | 
	# r17 - r31: byte offsets 16, 32, ..., 240, mainly for the Write_256 macro. | 
 | 	li	17, 16 | 
 | 	li	18, 32 | 
 | 	li	19, 48 | 
 | 	li	20, 64 | 
 | 	li	21, 80 | 
 | 	li	22, 96 | 
 | 	li	23, 112 | 
 | 	li	24, 128 | 
 | 	li	25, 144 | 
 | 	li	26, 160 | 
 | 	li	27, 176 | 
 | 	li	28, 192 | 
 | 	li	29, 208 | 
 | 	li	30, 224 | 
 | 	li	31, 240 | 
 |  | 
 | 	mr 15, 6			# len | 
 | 	li 14, 0			# offset to inp and outp | 
 |  | 
	lxvw4x	48, 0, 3		#  vr16, constants | 
 | 	lxvw4x	49, 17, 3		#  vr17, key 1 | 
 | 	lxvw4x	50, 18, 3		#  vr18, key 2 | 
 | 	lxvw4x	51, 19, 3		#  vr19, counter, nonce | 
 |  | 
 | 	# create (0, 1, 2, 3) counters | 
 | 	vspltisw 0, 0 | 
 | 	vspltisw 1, 1 | 
 | 	vspltisw 2, 2 | 
 | 	vspltisw 3, 3 | 
 | 	vmrghw	4, 0, 1 | 
 | 	vmrglw	5, 2, 3 | 
 | 	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3) | 
 |  | 
 | 	vspltisw 21, 12 | 
 | 	vspltisw 23, 7 | 
 |  | 
 | 	addis	11, 2, permx@toc@ha | 
 | 	addi	11, 11, permx@toc@l | 
 | 	lxvw4x	32+20, 0, 11 | 
 | 	lxvw4x	32+22, 17, 11 | 
 |  | 
	sradi	8, 7, 1			# r8 = nrounds / 2 (double-round count) | 
 |  | 
 | 	mtctr 8 | 
 |  | 
 | 	# save constants to vsx | 
 | 	xxlor	16, 48, 48 | 
 | 	xxlor	17, 49, 49 | 
 | 	xxlor	18, 50, 50 | 
 | 	xxlor	19, 51, 51 | 
 |  | 
 | 	vspltisw 25, 4 | 
 | 	vspltisw 26, 8 | 
 |  | 
 | 	xxlor	25, 32+26, 32+26 | 
 | 	xxlor	24, 32+25, 32+25 | 
 |  | 
 | 	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4) | 
 | 	xxlor	30, 32+30, 32+30 | 
 | 	xxlor	31, 32+31, 32+31 | 
 |  | 
 | 	xxlor	20, 32+20, 32+20 | 
 | 	xxlor	21, 32+21, 32+21 | 
 | 	xxlor	22, 32+22, 32+22 | 
 | 	xxlor	23, 32+23, 32+23 | 
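	# Loop-invariant data now lives in VSX: vs16-vs19 = input state rows, | 
	# vs20-vs23 = rotate helpers, vs24 = (4,4,4,4), vs25 = (8,8,8,8), | 
	# vs30 = (0,1,2,3) and vs31 = (4,5,6,7), the counter lane offsets | 
	# for the two 4-block halves. | 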
 |  | 
 | 	cmpdi	6, 512 | 
 | 	blt	Loop_last | 
 |  | 
 | Loop_8x: | 
 | 	xxspltw  32+0, 16, 0 | 
 | 	xxspltw  32+1, 16, 1 | 
 | 	xxspltw  32+2, 16, 2 | 
 | 	xxspltw  32+3, 16, 3 | 
 |  | 
 | 	xxspltw  32+4, 17, 0 | 
 | 	xxspltw  32+5, 17, 1 | 
 | 	xxspltw  32+6, 17, 2 | 
 | 	xxspltw  32+7, 17, 3 | 
 | 	xxspltw  32+8, 18, 0 | 
 | 	xxspltw  32+9, 18, 1 | 
 | 	xxspltw  32+10, 18, 2 | 
 | 	xxspltw  32+11, 18, 3 | 
 | 	xxspltw  32+12, 19, 0 | 
 | 	xxspltw  32+13, 19, 1 | 
 | 	xxspltw  32+14, 19, 2 | 
 | 	xxspltw  32+15, 19, 3 | 
 | 	vadduwm	12, 12, 30	# increase counter | 
 |  | 
 | 	xxspltw  32+16, 16, 0 | 
 | 	xxspltw  32+17, 16, 1 | 
 | 	xxspltw  32+18, 16, 2 | 
 | 	xxspltw  32+19, 16, 3 | 
 |  | 
 | 	xxspltw  32+20, 17, 0 | 
 | 	xxspltw  32+21, 17, 1 | 
 | 	xxspltw  32+22, 17, 2 | 
 | 	xxspltw  32+23, 17, 3 | 
 | 	xxspltw  32+24, 18, 0 | 
 | 	xxspltw  32+25, 18, 1 | 
 | 	xxspltw  32+26, 18, 2 | 
 | 	xxspltw  32+27, 18, 3 | 
 | 	xxspltw  32+28, 19, 0 | 
 | 	xxspltw  32+29, 19, 1 | 
 | 	vadduwm	28, 28, 31	# increase counter | 
 | 	xxspltw  32+30, 19, 2 | 
 | 	xxspltw  32+31, 19, 3 | 
 |  | 
 | .align 5 | 
 | quarter_loop_8x: | 
 | 	QT_loop_8x | 
 |  | 
 | 	bdnz	quarter_loop_8x | 
 |  | 
 | 	xxlor	0, 32+30, 32+30 | 
 | 	xxlor	32+30, 30, 30 | 
 | 	vadduwm	12, 12, 30 | 
 | 	xxlor	32+30, 0, 0 | 
 | 	TP_4x 0, 1, 2, 3 | 
 | 	TP_4x 4, 5, 6, 7 | 
 | 	TP_4x 8, 9, 10, 11 | 
 | 	TP_4x 12, 13, 14, 15 | 
 |  | 
 | 	xxlor	0, 48, 48 | 
 | 	xxlor	1, 49, 49 | 
 | 	xxlor	2, 50, 50 | 
 | 	xxlor	3, 51, 51 | 
 | 	xxlor	48, 16, 16 | 
 | 	xxlor	49, 17, 17 | 
 | 	xxlor	50, 18, 18 | 
 | 	xxlor	51, 19, 19 | 
 | 	Add_state 0 | 
 | 	xxlor	48, 0, 0 | 
 | 	xxlor	49, 1, 1 | 
 | 	xxlor	50, 2, 2 | 
 | 	xxlor	51, 3, 3 | 
 | 	Write_256 0 | 
 | 	addi	14, 14, 256	# offset +=256 | 
 | 	addi	15, 15, -256	# len -=256 | 
 |  | 
 | 	xxlor	5, 32+31, 32+31 | 
 | 	xxlor	32+31, 31, 31 | 
 | 	vadduwm	28, 28, 31 | 
 | 	xxlor	32+31, 5, 5 | 
 | 	TP_4x 16+0, 16+1, 16+2, 16+3 | 
 | 	TP_4x 16+4, 16+5, 16+6, 16+7 | 
 | 	TP_4x 16+8, 16+9, 16+10, 16+11 | 
 | 	TP_4x 16+12, 16+13, 16+14, 16+15 | 
 |  | 
 | 	xxlor	32, 16, 16 | 
 | 	xxlor	33, 17, 17 | 
 | 	xxlor	34, 18, 18 | 
 | 	xxlor	35, 19, 19 | 
 | 	Add_state 16 | 
 | 	Write_256 16 | 
 | 	addi	14, 14, 256	# offset +=256 | 
	addi	15, 15, -256	# len -= 256 | 
 |  | 
	xxlor	32+24, 24, 24	# v24 = (4, 4, 4, 4) | 
	xxlor	32+25, 25, 25	# v25 = (8, 8, 8, 8) | 
	xxlor	32+30, 30, 30 | 
	vadduwm	30, 30, 25	# advance counter lanes by 8 (8 blocks consumed) | 
	vadduwm	31, 30, 24	# second-half counter lanes = first half + 4 | 
	xxlor	30, 32+30, 32+30 | 
	xxlor	31, 32+31, 32+31 | 
 |  | 
 | 	cmpdi	15, 0 | 
 | 	beq	Out_loop | 
 |  | 
 | 	cmpdi	15, 512 | 
 | 	blt	Loop_last | 
 |  | 
 | 	mtctr 8 | 
 | 	b Loop_8x | 
 |  | 
 | Loop_last: | 
	lxvw4x	48, 0, 3		#  vr16, constants | 
 | 	lxvw4x	49, 17, 3		#  vr17, key 1 | 
 | 	lxvw4x	50, 18, 3		#  vr18, key 2 | 
 | 	lxvw4x	51, 19, 3		#  vr19, counter, nonce | 
 |  | 
 | 	vspltisw 21, 12 | 
 | 	vspltisw 23, 7 | 
 | 	addis	11, 2, permx@toc@ha | 
 | 	addi	11, 11, permx@toc@l | 
 | 	lxvw4x	32+20, 0, 11 | 
 | 	lxvw4x	32+22, 17, 11 | 
 |  | 
	sradi	8, 7, 1			# r8 = nrounds / 2 (double-round count) | 
 | 	mtctr 8 | 
 |  | 
 | Loop_4x: | 
 | 	vspltw  0, 16, 0 | 
 | 	vspltw  1, 16, 1 | 
 | 	vspltw  2, 16, 2 | 
 | 	vspltw  3, 16, 3 | 
 |  | 
 | 	vspltw  4, 17, 0 | 
 | 	vspltw  5, 17, 1 | 
 | 	vspltw  6, 17, 2 | 
 | 	vspltw  7, 17, 3 | 
 | 	vspltw  8, 18, 0 | 
 | 	vspltw  9, 18, 1 | 
 | 	vspltw  10, 18, 2 | 
 | 	vspltw  11, 18, 3 | 
 | 	vspltw  12, 19, 0 | 
 | 	vadduwm	12, 12, 30	# increase counter | 
 | 	vspltw  13, 19, 1 | 
 | 	vspltw  14, 19, 2 | 
 | 	vspltw  15, 19, 3 | 
 |  | 
 | .align 5 | 
 | quarter_loop: | 
 | 	QT_loop_4x | 
 |  | 
 | 	bdnz	quarter_loop | 
 |  | 
 | 	vadduwm	12, 12, 30 | 
 | 	TP_4x 0, 1, 2, 3 | 
 | 	TP_4x 4, 5, 6, 7 | 
 | 	TP_4x 8, 9, 10, 11 | 
 | 	TP_4x 12, 13, 14, 15 | 
 |  | 
 | 	Add_state 0 | 
 | 	Write_256 0 | 
 | 	addi	14, 14, 256	# offset += 256 | 
	addi	15, 15, -256	# len -= 256 | 
 |  | 
 | 	# Update state counter | 
 | 	vspltisw 25, 4 | 
 | 	vadduwm	30, 30, 25 | 
 |  | 
 | 	cmpdi	15, 0 | 
 | 	beq	Out_loop | 
 | 	cmpdi	15, 256 | 
 | 	blt	Out_loop | 
 |  | 
 | 	mtctr 8 | 
 | 	b Loop_4x | 
 |  | 
 | Out_loop: | 
 | 	RESTORE_REGS | 
 | 	blr | 
 |  | 
 | Out_no_chacha: | 
 | 	li	3, 0 | 
 | 	blr | 
 | SYM_FUNC_END(chacha_p10le_8x) | 
 |  | 
 | SYM_DATA_START_LOCAL(PERMX) | 
 | .align 5 | 
 | permx: | 
| .long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd	# vpermxor mask: xor + rotate each word left by 16 | 
| .long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc	# vpermxor mask: xor + rotate each word left by 8 | 
 | SYM_DATA_END(PERMX) |