|  | /* SPDX-License-Identifier: GPL-2.0 */ | 
|  | /* | 
|  | * Original implementation written by Andy Polyakov, @dot-asm. | 
|  | * This is an adaptation of the original code for kernel use. | 
|  | * | 
|  | * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. | 
|  | */ | 
|  |  | 
|  | #include <linux/linkage.h> | 
|  | #include <asm/nospec-insn.h> | 
|  | #include <asm/fpu-insn.h> | 
|  |  | 
|  | /* Alias for the stack pointer register used by both routines below. */ | 
|  | #define SP	%r15 | 
|  | /* Size in bytes (160) of the stack frame chacha20_vx allocates for itself. */ | 
|  | #define FRAME	(16 * 8 + 4 * 8) | 
|  |  | 
|  | .data | 
|  | .balign	32 | 
|  |  | 
|  | /* Constant table shared by chacha20_vx_4x and chacha20_vx, by byte offset: */ | 
|  | /*   0x00        the four sigma words (first row of the state) */ | 
|  | /*   0x10..0x3f  vectors (1,0,0,0), (2,0,0,0), (3,0,0,0): counter increments */ | 
|  | /*   0x40        permutation that reverses the bytes of each 32-bit word */ | 
|  | /*   0x50        lane offsets 0,1,2,3 (loaded as CTR by chacha20_vx_4x) */ | 
|  | /*   0x60..0x9f  each sigma word replicated into all four lanes */ | 
|  | SYM_DATA_START_LOCAL(sigma) | 
|  | .long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral | 
|  | .long	1,0,0,0	/* +1 on the first word */ | 
|  | .long	2,0,0,0	/* +2 on the first word */ | 
|  | .long	3,0,0,0	/* +3 on the first word */ | 
|  | .long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap | 
|  |  | 
|  | .long	0,1,2,3	/* per-lane block counter offsets */ | 
|  | .long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma | 
|  | .long	0x3320646e,0x3320646e,0x3320646e,0x3320646e | 
|  | .long	0x79622d32,0x79622d32,0x79622d32,0x79622d32 | 
|  | .long	0x6b206574,0x6b206574,0x6b206574,0x6b206574 | 
|  | SYM_DATA_END(sigma) | 
|  |  | 
|  | .previous	/* return to the section that was current before .data */ | 
|  |  | 
|  | /* NOTE(review): GEN_BR_THUNK is defined outside this file (asm/nospec-insn.h); */ | 
|  | /* presumably it emits the thunk that the BR_EX %r14 returns below use - confirm. */ | 
|  | GEN_BR_THUNK %r14 | 
|  |  | 
|  | .text	/* the two routines below are assembled into .text */ | 
|  |  | 
|  | ############################################################################# | 
|  | # void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len, | 
|  | #		      const u32 *key, const u32 *counter) | 
|  |  | 
|  | /* Argument registers, in the order of the prototype comment above. */ | 
|  | #define	OUT		%r2 | 
|  | #define	INP		%r3 | 
|  | #define	LEN		%r4 | 
|  | #define	KEY		%r5 | 
|  | #define	COUNTER		%r6 | 
|  |  | 
|  | /* BEPERM: byte swap permutation; CTR: lane offsets 0,1,2,3 (both from sigma). */ | 
|  | #define BEPERM		%v31 | 
|  | #define CTR		%v26 | 
|  |  | 
|  | /* Input state rows: K0 = sigma, K1/K2 = the 32 key bytes, K3 = counter block. */ | 
|  | #define K0		%v16 | 
|  | #define K1		%v17 | 
|  | #define K2		%v18 | 
|  | #define K3		%v19 | 
|  |  | 
|  | /* Working state in "smashed" form: XAi holds word i of row A for four */ | 
|  | /* blocks at once (one block per 32-bit lane); likewise XBi, XCi and XDi. */ | 
|  | #define XA0		%v0 | 
|  | #define XA1		%v1 | 
|  | #define XA2		%v2 | 
|  | #define XA3		%v3 | 
|  |  | 
|  | #define XB0		%v4 | 
|  | #define XB1		%v5 | 
|  | #define XB2		%v6 | 
|  | #define XB3		%v7 | 
|  |  | 
|  | #define XC0		%v8 | 
|  | #define XC1		%v9 | 
|  | #define XC2		%v10 | 
|  | #define XC3		%v11 | 
|  |  | 
|  | #define XD0		%v12 | 
|  | #define XD1		%v13 | 
|  | #define XD2		%v14 | 
|  | #define XD3		%v15 | 
|  |  | 
|  | /* Temporaries: transpose scratch and the 64 input bytes of a block. */ | 
|  | #define XT0		%v27 | 
|  | #define XT1		%v28 | 
|  | #define XT2		%v29 | 
|  | #define XT3		%v30 | 
|  |  | 
|  | /* chacha20_vx_4x: XOR LEN bytes at INP with ChaCha20 keystream and store */ | 
|  | /* them at OUT. Four 64-byte blocks are computed at once, one per 32-bit */ | 
|  | /* lane; lane i uses the first counter word plus i. KEY and COUNTER memory */ | 
|  | /* are only read. At most four blocks (256 bytes) are produced per call, */ | 
|  | /* and a final partial block is handled byte by byte at .Ltail_4x. */ | 
|  | /* Clobbers %r0, %r1, %r5 and the vector registers named above; %r6 and */ | 
|  | /* %r7 are saved and restored. */ | 
|  | /* NOTE(review): the first block is stored without checking LEN, and */ | 
|  | /* LEN == 64 would enter .Ltail_4x with LEN == 0, where brct wraps around. */ | 
|  | /* Presumably callers always pass LEN > 64 - confirm against the callers. */ | 
|  | SYM_FUNC_START(chacha20_vx_4x) | 
|  | stmg	%r6,%r7,6*8(SP)	/* save %r6/%r7, both are overwritten below */ | 
|  |  | 
|  | larl	%r7,sigma	/* %r7 = address of the constant table */ | 
|  | lhi	%r0,10	/* .Loop_4x runs 10 times */ | 
|  | lhi	%r1,0	/* NOTE(review): %r1 is set again at .Ltail_4x before its only use */ | 
|  |  | 
|  | VL	K0,0,,%r7		# load sigma | 
|  | VL	K1,0,,KEY		# load key | 
|  | VL	K2,16,,KEY | 
|  | VL	K3,0,,COUNTER		# load counter | 
|  |  | 
|  | VL	BEPERM,0x40,,%r7	/* byte swap permutation */ | 
|  | VL	CTR,0x50,,%r7	/* lane offsets 0,1,2,3 */ | 
|  |  | 
|  | VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma | 
|  |  | 
|  | VREPF	XB0,K1,0		# smash the key | 
|  | VREPF	XB1,K1,1 | 
|  | VREPF	XB2,K1,2 | 
|  | VREPF	XB3,K1,3 | 
|  |  | 
|  | VREPF	XD0,K3,0	/* smash the counter block */ | 
|  | VREPF	XD1,K3,1 | 
|  | VREPF	XD2,K3,2 | 
|  | VREPF	XD3,K3,3 | 
|  | VAF	XD0,XD0,CTR	/* lane i gets first counter word + i */ | 
|  |  | 
|  | VREPF	XC0,K2,0	/* smash the second half of the key */ | 
|  | VREPF	XC1,K2,1 | 
|  | VREPF	XC2,K2,2 | 
|  | VREPF	XC3,K2,3 | 
|  |  | 
|  | .Loop_4x: | 
|  | /* Column round on (XAi,XBi,XCi,XDi): a += b; d ^= a; d <<<= 16 */ | 
|  | VAF	XA0,XA0,XB0 | 
|  | VX	XD0,XD0,XA0 | 
|  | VERLLF	XD0,XD0,16 | 
|  |  | 
|  | VAF	XA1,XA1,XB1 | 
|  | VX	XD1,XD1,XA1 | 
|  | VERLLF	XD1,XD1,16 | 
|  |  | 
|  | VAF	XA2,XA2,XB2 | 
|  | VX	XD2,XD2,XA2 | 
|  | VERLLF	XD2,XD2,16 | 
|  |  | 
|  | VAF	XA3,XA3,XB3 | 
|  | VX	XD3,XD3,XA3 | 
|  | VERLLF	XD3,XD3,16 | 
|  |  | 
|  | /* c += d; b ^= c; b <<<= 12 */ | 
|  | VAF	XC0,XC0,XD0 | 
|  | VX	XB0,XB0,XC0 | 
|  | VERLLF	XB0,XB0,12 | 
|  |  | 
|  | VAF	XC1,XC1,XD1 | 
|  | VX	XB1,XB1,XC1 | 
|  | VERLLF	XB1,XB1,12 | 
|  |  | 
|  | VAF	XC2,XC2,XD2 | 
|  | VX	XB2,XB2,XC2 | 
|  | VERLLF	XB2,XB2,12 | 
|  |  | 
|  | VAF	XC3,XC3,XD3 | 
|  | VX	XB3,XB3,XC3 | 
|  | VERLLF	XB3,XB3,12 | 
|  |  | 
|  | /* a += b; d ^= a; d <<<= 8 */ | 
|  | VAF	XA0,XA0,XB0 | 
|  | VX	XD0,XD0,XA0 | 
|  | VERLLF	XD0,XD0,8 | 
|  |  | 
|  | VAF	XA1,XA1,XB1 | 
|  | VX	XD1,XD1,XA1 | 
|  | VERLLF	XD1,XD1,8 | 
|  |  | 
|  | VAF	XA2,XA2,XB2 | 
|  | VX	XD2,XD2,XA2 | 
|  | VERLLF	XD2,XD2,8 | 
|  |  | 
|  | VAF	XA3,XA3,XB3 | 
|  | VX	XD3,XD3,XA3 | 
|  | VERLLF	XD3,XD3,8 | 
|  |  | 
|  | /* c += d; b ^= c; b <<<= 7 */ | 
|  | VAF	XC0,XC0,XD0 | 
|  | VX	XB0,XB0,XC0 | 
|  | VERLLF	XB0,XB0,7 | 
|  |  | 
|  | VAF	XC1,XC1,XD1 | 
|  | VX	XB1,XB1,XC1 | 
|  | VERLLF	XB1,XB1,7 | 
|  |  | 
|  | VAF	XC2,XC2,XD2 | 
|  | VX	XB2,XB2,XC2 | 
|  | VERLLF	XB2,XB2,7 | 
|  |  | 
|  | VAF	XC3,XC3,XD3 | 
|  | VX	XB3,XB3,XC3 | 
|  | VERLLF	XB3,XB3,7 | 
|  |  | 
|  | /* Diagonal round on (XA0,XB1,XC2,XD3), (XA1,XB2,XC3,XD0), */ | 
|  | /* (XA2,XB3,XC0,XD1), (XA3,XB0,XC1,XD2): same four steps as above. */ | 
|  | VAF	XA0,XA0,XB1 | 
|  | VX	XD3,XD3,XA0 | 
|  | VERLLF	XD3,XD3,16 | 
|  |  | 
|  | VAF	XA1,XA1,XB2 | 
|  | VX	XD0,XD0,XA1 | 
|  | VERLLF	XD0,XD0,16 | 
|  |  | 
|  | VAF	XA2,XA2,XB3 | 
|  | VX	XD1,XD1,XA2 | 
|  | VERLLF	XD1,XD1,16 | 
|  |  | 
|  | VAF	XA3,XA3,XB0 | 
|  | VX	XD2,XD2,XA3 | 
|  | VERLLF	XD2,XD2,16 | 
|  |  | 
|  | VAF	XC2,XC2,XD3 | 
|  | VX	XB1,XB1,XC2 | 
|  | VERLLF	XB1,XB1,12 | 
|  |  | 
|  | VAF	XC3,XC3,XD0 | 
|  | VX	XB2,XB2,XC3 | 
|  | VERLLF	XB2,XB2,12 | 
|  |  | 
|  | VAF	XC0,XC0,XD1 | 
|  | VX	XB3,XB3,XC0 | 
|  | VERLLF	XB3,XB3,12 | 
|  |  | 
|  | VAF	XC1,XC1,XD2 | 
|  | VX	XB0,XB0,XC1 | 
|  | VERLLF	XB0,XB0,12 | 
|  |  | 
|  | VAF	XA0,XA0,XB1 | 
|  | VX	XD3,XD3,XA0 | 
|  | VERLLF	XD3,XD3,8 | 
|  |  | 
|  | VAF	XA1,XA1,XB2 | 
|  | VX	XD0,XD0,XA1 | 
|  | VERLLF	XD0,XD0,8 | 
|  |  | 
|  | VAF	XA2,XA2,XB3 | 
|  | VX	XD1,XD1,XA2 | 
|  | VERLLF	XD1,XD1,8 | 
|  |  | 
|  | VAF	XA3,XA3,XB0 | 
|  | VX	XD2,XD2,XA3 | 
|  | VERLLF	XD2,XD2,8 | 
|  |  | 
|  | VAF	XC2,XC2,XD3 | 
|  | VX	XB1,XB1,XC2 | 
|  | VERLLF	XB1,XB1,7 | 
|  |  | 
|  | VAF	XC3,XC3,XD0 | 
|  | VX	XB2,XB2,XC3 | 
|  | VERLLF	XB2,XB2,7 | 
|  |  | 
|  | VAF	XC0,XC0,XD1 | 
|  | VX	XB3,XB3,XC0 | 
|  | VERLLF	XB3,XB3,7 | 
|  |  | 
|  | VAF	XC1,XC1,XD2 | 
|  | VX	XB0,XB0,XC1 | 
|  | VERLLF	XB0,XB0,7 | 
|  | brct	%r0,.Loop_4x	/* decrement %r0, loop while it is non-zero */ | 
|  |  | 
|  | VAF	XD0,XD0,CTR	/* add the lane offsets that were part of the input state */ | 
|  |  | 
|  | VMRHF	XT0,XA0,XA1		# transpose data | 
|  | VMRHF	XT1,XA2,XA3 | 
|  | VMRLF	XT2,XA0,XA1 | 
|  | VMRLF	XT3,XA2,XA3 | 
|  | VPDI	XA0,XT0,XT1,0b0000 | 
|  | VPDI	XA1,XT0,XT1,0b0101 | 
|  | VPDI	XA2,XT2,XT3,0b0000 | 
|  | VPDI	XA3,XT2,XT3,0b0101 | 
|  |  | 
|  | VMRHF	XT0,XB0,XB1 | 
|  | VMRHF	XT1,XB2,XB3 | 
|  | VMRLF	XT2,XB0,XB1 | 
|  | VMRLF	XT3,XB2,XB3 | 
|  | VPDI	XB0,XT0,XT1,0b0000 | 
|  | VPDI	XB1,XT0,XT1,0b0101 | 
|  | VPDI	XB2,XT2,XT3,0b0000 | 
|  | VPDI	XB3,XT2,XT3,0b0101 | 
|  |  | 
|  | VMRHF	XT0,XC0,XC1 | 
|  | VMRHF	XT1,XC2,XC3 | 
|  | VMRLF	XT2,XC0,XC1 | 
|  | VMRLF	XT3,XC2,XC3 | 
|  | VPDI	XC0,XT0,XT1,0b0000 | 
|  | VPDI	XC1,XT0,XT1,0b0101 | 
|  | VPDI	XC2,XT2,XT3,0b0000 | 
|  | VPDI	XC3,XT2,XT3,0b0101 | 
|  |  | 
|  | VMRHF	XT0,XD0,XD1 | 
|  | VMRHF	XT1,XD2,XD3 | 
|  | VMRLF	XT2,XD0,XD1 | 
|  | VMRLF	XT3,XD2,XD3 | 
|  | VPDI	XD0,XT0,XT1,0b0000 | 
|  | VPDI	XD1,XT0,XT1,0b0101 | 
|  | VPDI	XD2,XT2,XT3,0b0000 | 
|  | VPDI	XD3,XT2,XT3,0b0101 | 
|  |  | 
|  | /* After the transpose XAn,XBn,XCn,XDn are the four rows of block n. */ | 
|  | /* Block 0: add the input state rows, then byte-swap every word. */ | 
|  | VAF	XA0,XA0,K0 | 
|  | VAF	XB0,XB0,K1 | 
|  | VAF	XC0,XC0,K2 | 
|  | VAF	XD0,XD0,K3 | 
|  |  | 
|  | VPERM	XA0,XA0,XA0,BEPERM | 
|  | VPERM	XB0,XB0,XB0,BEPERM | 
|  | VPERM	XC0,XC0,XC0,BEPERM | 
|  | VPERM	XD0,XD0,XD0,BEPERM | 
|  |  | 
|  | VLM	XT0,XT3,0,INP,0	/* 64 input bytes (no LEN check for the first block) */ | 
|  |  | 
|  | VX	XT0,XT0,XA0 | 
|  | VX	XT1,XT1,XB0 | 
|  | VX	XT2,XT2,XC0 | 
|  | VX	XT3,XT3,XD0 | 
|  |  | 
|  | VSTM	XT0,XT3,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40	/* 64 bytes done */ | 
|  |  | 
|  | /* Block 1: finished keystream goes to XA0..XD0, the registers .Ltail_4x reads. */ | 
|  | VAF	XA0,XA1,K0 | 
|  | VAF	XB0,XB1,K1 | 
|  | VAF	XC0,XC1,K2 | 
|  | VAF	XD0,XD1,K3 | 
|  |  | 
|  | VPERM	XA0,XA0,XA0,BEPERM | 
|  | VPERM	XB0,XB0,XB0,BEPERM | 
|  | VPERM	XC0,XC0,XC0,BEPERM | 
|  | VPERM	XD0,XD0,XD0,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_4x	/* fewer than 64 bytes left: byte-wise tail */ | 
|  |  | 
|  | VLM	XT0,XT3,0,INP,0 | 
|  |  | 
|  | VX	XT0,XT0,XA0 | 
|  | VX	XT1,XT1,XB0 | 
|  | VX	XT2,XT2,XC0 | 
|  | VX	XT3,XT3,XD0 | 
|  |  | 
|  | VSTM	XT0,XT3,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_4x	/* nothing left */ | 
|  |  | 
|  | /* Block 2 */ | 
|  | VAF	XA0,XA2,K0 | 
|  | VAF	XB0,XB2,K1 | 
|  | VAF	XC0,XC2,K2 | 
|  | VAF	XD0,XD2,K3 | 
|  |  | 
|  | VPERM	XA0,XA0,XA0,BEPERM | 
|  | VPERM	XB0,XB0,XB0,BEPERM | 
|  | VPERM	XC0,XC0,XC0,BEPERM | 
|  | VPERM	XD0,XD0,XD0,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_4x | 
|  |  | 
|  | VLM	XT0,XT3,0,INP,0 | 
|  |  | 
|  | VX	XT0,XT0,XA0 | 
|  | VX	XT1,XT1,XB0 | 
|  | VX	XT2,XT2,XC0 | 
|  | VX	XT3,XT3,XD0 | 
|  |  | 
|  | VSTM	XT0,XT3,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_4x | 
|  |  | 
|  | /* Block 3, the last one this routine produces */ | 
|  | VAF	XA0,XA3,K0 | 
|  | VAF	XB0,XB3,K1 | 
|  | VAF	XC0,XC3,K2 | 
|  | VAF	XD0,XD3,K3 | 
|  |  | 
|  | VPERM	XA0,XA0,XA0,BEPERM | 
|  | VPERM	XB0,XB0,XB0,BEPERM | 
|  | VPERM	XC0,XC0,XC0,BEPERM | 
|  | VPERM	XD0,XD0,XD0,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_4x | 
|  |  | 
|  | VLM	XT0,XT3,0,INP,0 | 
|  |  | 
|  | VX	XT0,XT0,XA0 | 
|  | VX	XT1,XT1,XB0 | 
|  | VX	XT2,XT2,XC0 | 
|  | VX	XT3,XT3,XD0 | 
|  |  | 
|  | VSTM	XT0,XT3,0,OUT,0 | 
|  |  | 
|  | .Ldone_4x: | 
|  | lmg	%r6,%r7,6*8(SP)	/* restore the registers saved on entry */ | 
|  | BR_EX	%r14 | 
|  |  | 
|  | /* Tail: XA0..XD0 hold one keystream block and fewer than 64 bytes remain. */ | 
|  | /* The block is spilled to 64(SP)..127(SP) and XORed in one byte at a time. */ | 
|  | /* No frame is allocated here, so that memory belongs to the frame SP */ | 
|  | /* pointed at on entry - presumably the caller-provided save area (confirm). */ | 
|  | .Ltail_4x: | 
|  | VLR	XT0,XC0 | 
|  | VLR	XT1,XD0 | 
|  |  | 
|  | VST	XA0,8*8+0x00,,SP | 
|  | VST	XB0,8*8+0x10,,SP | 
|  | VST	XT0,8*8+0x20,,SP | 
|  | VST	XT1,8*8+0x30,,SP | 
|  |  | 
|  | lghi	%r1,0	/* byte index */ | 
|  |  | 
|  | .Loop_tail_4x: | 
|  | llgc	%r5,0(%r1,INP)	/* input byte (KEY register reused as scratch) */ | 
|  | llgc	%r6,8*8(%r1,SP)	/* keystream byte (COUNTER register reused as scratch) */ | 
|  | xr	%r6,%r5 | 
|  | stc	%r6,0(%r1,OUT) | 
|  | la	%r1,1(%r1) | 
|  | brct	LEN,.Loop_tail_4x	/* one pass per remaining byte; LEN must be non-zero on entry */ | 
|  |  | 
|  | lmg	%r6,%r7,6*8(SP) | 
|  | BR_EX	%r14 | 
|  | SYM_FUNC_END(chacha20_vx_4x) | 
|  |  | 
|  | /* Drop the aliases that are defined again below for chacha20_vx. */ | 
|  | /* CTR and the XA/XB/XC/XD/XT names are not undefined here. */ | 
|  | #undef	OUT | 
|  | #undef	INP | 
|  | #undef	LEN | 
|  | #undef	KEY | 
|  | #undef	COUNTER | 
|  |  | 
|  | #undef BEPERM | 
|  |  | 
|  | /* K0..K3 are mapped to different vector registers for chacha20_vx. */ | 
|  | #undef K0 | 
|  | #undef K1 | 
|  | #undef K2 | 
|  | #undef K3 | 
|  |  | 
|  |  | 
|  | ############################################################################# | 
|  | # void chacha20_vx(u8 *out, const u8 *inp, size_t len, | 
|  | #		   const u32 *key, const u32 *counter) | 
|  |  | 
|  | /* Argument registers, in the order of the prototype comment above. */ | 
|  | #define	OUT		%r2 | 
|  | #define	INP		%r3 | 
|  | #define	LEN		%r4 | 
|  | #define	KEY		%r5 | 
|  | #define	COUNTER		%r6 | 
|  |  | 
|  | /* Byte swap permutation loaded from the sigma table. */ | 
|  | #define BEPERM		%v31 | 
|  |  | 
|  | /* Input state rows: K0 = sigma, K1/K2 = key, K3 = counter block. */ | 
|  | /* K0 shares %v27 with T0 below, so it is reloaded after T0 has been used. */ | 
|  | #define K0		%v27 | 
|  | #define K1		%v24 | 
|  | #define K2		%v25 | 
|  | #define K3		%v26 | 
|  |  | 
|  | /* Six independent blocks; An,Bn,Cn,Dn are the four 16-byte rows of block n. */ | 
|  | #define A0		%v0 | 
|  | #define B0		%v1 | 
|  | #define C0		%v2 | 
|  | #define D0		%v3 | 
|  |  | 
|  | #define A1		%v4 | 
|  | #define B1		%v5 | 
|  | #define C1		%v6 | 
|  | #define D1		%v7 | 
|  |  | 
|  | #define A2		%v8 | 
|  | #define B2		%v9 | 
|  | #define C2		%v10 | 
|  | #define D2		%v11 | 
|  |  | 
|  | #define A3		%v12 | 
|  | #define B3		%v13 | 
|  | #define C3		%v14 | 
|  | #define D3		%v15 | 
|  |  | 
|  | #define A4		%v16 | 
|  | #define B4		%v17 | 
|  | #define C4		%v18 | 
|  | #define D4		%v19 | 
|  |  | 
|  | #define A5		%v20 | 
|  | #define B5		%v21 | 
|  | #define C5		%v22 | 
|  | #define D5		%v23 | 
|  |  | 
|  | /* T1..T3 first hold the increments 1,2,3 from sigma, later saved counter */ | 
|  | /* rows; T0..T3 also receive the 64 input bytes of the first block. */ | 
|  | #define T0		%v27 | 
|  | #define T1		%v28 | 
|  | #define T2		%v29 | 
|  | #define T3		%v30 | 
|  |  | 
|  | /* chacha20_vx: XOR LEN bytes at INP with ChaCha20 keystream and store them */ | 
|  | /* at OUT. LEN <= 256 is handed to chacha20_vx_4x before any register is */ | 
|  | /* saved or SP is changed. Otherwise every pass of .Loop_outer_vx computes */ | 
|  | /* six blocks with the first counter word offset by 0..5, consumes up to */ | 
|  | /* 384 bytes and then advances the in-register counter row K3 by 6. */ | 
|  | /* KEY and COUNTER memory are only read. A final partial block is handled */ | 
|  | /* byte by byte at .Ltail_vx. Clobbers %r0, %r1, %r5 and the vector */ | 
|  | /* registers named above; %r6 and %r7 are saved and restored. */ | 
|  | SYM_FUNC_START(chacha20_vx) | 
|  | clgfi	LEN,256 | 
|  | jle	chacha20_vx_4x	/* short input: use the 4-block routine instead */ | 
|  | stmg	%r6,%r7,6*8(SP)	/* save %r6/%r7, both are overwritten below */ | 
|  |  | 
|  | lghi	%r1,-FRAME | 
|  | lgr	%r0,SP	/* keep the old SP for the back-chain */ | 
|  | la	SP,0(%r1,SP)	/* allocate FRAME bytes */ | 
|  | stg	%r0,0(SP)		# back-chain | 
|  |  | 
|  | larl	%r7,sigma	/* %r7 = address of the constant table */ | 
|  | lhi	%r0,10	/* .Loop_vx runs 10 times */ | 
|  |  | 
|  | VLM	K1,K2,0,KEY,0		# load key | 
|  | VL	K3,0,,COUNTER		# load counter | 
|  |  | 
|  | VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ... | 
|  |  | 
|  | .Loop_outer_vx: | 
|  | /* Set up six copies of the state; only the D rows differ. */ | 
|  | VLR	A0,K0 | 
|  | VLR	B0,K1 | 
|  | VLR	A1,K0 | 
|  | VLR	B1,K1 | 
|  | VLR	A2,K0 | 
|  | VLR	B2,K1 | 
|  | VLR	A3,K0 | 
|  | VLR	B3,K1 | 
|  | VLR	A4,K0 | 
|  | VLR	B4,K1 | 
|  | VLR	A5,K0 | 
|  | VLR	B5,K1 | 
|  |  | 
|  | VLR	D0,K3 | 
|  | VAF	D1,K3,T1		# K[3]+1 | 
|  | VAF	D2,K3,T2		# K[3]+2 | 
|  | VAF	D3,K3,T3		# K[3]+3 | 
|  | VAF	D4,D2,T2		# K[3]+4 | 
|  | VAF	D5,D2,T3		# K[3]+5 | 
|  |  | 
|  | VLR	C0,K2 | 
|  | VLR	C1,K2 | 
|  | VLR	C2,K2 | 
|  | VLR	C3,K2 | 
|  | VLR	C4,K2 | 
|  | VLR	C5,K2 | 
|  |  | 
|  | /* Keep the input D rows of blocks 1..3 for the final addition; this */ | 
|  | /* overwrites the increments, which are reloaded from sigma later. */ | 
|  | VLR	T1,D1 | 
|  | VLR	T2,D2 | 
|  | VLR	T3,D3 | 
|  |  | 
|  | .Loop_vx: | 
|  | /* Column round on all six blocks: a += b; d ^= a; d <<<= 16 */ | 
|  | VAF	A0,A0,B0 | 
|  | VAF	A1,A1,B1 | 
|  | VAF	A2,A2,B2 | 
|  | VAF	A3,A3,B3 | 
|  | VAF	A4,A4,B4 | 
|  | VAF	A5,A5,B5 | 
|  | VX	D0,D0,A0 | 
|  | VX	D1,D1,A1 | 
|  | VX	D2,D2,A2 | 
|  | VX	D3,D3,A3 | 
|  | VX	D4,D4,A4 | 
|  | VX	D5,D5,A5 | 
|  | VERLLF	D0,D0,16 | 
|  | VERLLF	D1,D1,16 | 
|  | VERLLF	D2,D2,16 | 
|  | VERLLF	D3,D3,16 | 
|  | VERLLF	D4,D4,16 | 
|  | VERLLF	D5,D5,16 | 
|  |  | 
|  | /* c += d; b ^= c; b <<<= 12 */ | 
|  | VAF	C0,C0,D0 | 
|  | VAF	C1,C1,D1 | 
|  | VAF	C2,C2,D2 | 
|  | VAF	C3,C3,D3 | 
|  | VAF	C4,C4,D4 | 
|  | VAF	C5,C5,D5 | 
|  | VX	B0,B0,C0 | 
|  | VX	B1,B1,C1 | 
|  | VX	B2,B2,C2 | 
|  | VX	B3,B3,C3 | 
|  | VX	B4,B4,C4 | 
|  | VX	B5,B5,C5 | 
|  | VERLLF	B0,B0,12 | 
|  | VERLLF	B1,B1,12 | 
|  | VERLLF	B2,B2,12 | 
|  | VERLLF	B3,B3,12 | 
|  | VERLLF	B4,B4,12 | 
|  | VERLLF	B5,B5,12 | 
|  |  | 
|  | /* a += b; d ^= a; d <<<= 8 */ | 
|  | VAF	A0,A0,B0 | 
|  | VAF	A1,A1,B1 | 
|  | VAF	A2,A2,B2 | 
|  | VAF	A3,A3,B3 | 
|  | VAF	A4,A4,B4 | 
|  | VAF	A5,A5,B5 | 
|  | VX	D0,D0,A0 | 
|  | VX	D1,D1,A1 | 
|  | VX	D2,D2,A2 | 
|  | VX	D3,D3,A3 | 
|  | VX	D4,D4,A4 | 
|  | VX	D5,D5,A5 | 
|  | VERLLF	D0,D0,8 | 
|  | VERLLF	D1,D1,8 | 
|  | VERLLF	D2,D2,8 | 
|  | VERLLF	D3,D3,8 | 
|  | VERLLF	D4,D4,8 | 
|  | VERLLF	D5,D5,8 | 
|  |  | 
|  | /* c += d; b ^= c; b <<<= 7 */ | 
|  | VAF	C0,C0,D0 | 
|  | VAF	C1,C1,D1 | 
|  | VAF	C2,C2,D2 | 
|  | VAF	C3,C3,D3 | 
|  | VAF	C4,C4,D4 | 
|  | VAF	C5,C5,D5 | 
|  | VX	B0,B0,C0 | 
|  | VX	B1,B1,C1 | 
|  | VX	B2,B2,C2 | 
|  | VX	B3,B3,C3 | 
|  | VX	B4,B4,C4 | 
|  | VX	B5,B5,C5 | 
|  | VERLLF	B0,B0,7 | 
|  | VERLLF	B1,B1,7 | 
|  | VERLLF	B2,B2,7 | 
|  | VERLLF	B3,B3,7 | 
|  | VERLLF	B4,B4,7 | 
|  | VERLLF	B5,B5,7 | 
|  |  | 
|  | /* Rotate rows B, C, D by 4, 8, 12 bytes so the diagonals line up as columns. */ | 
|  | VSLDB	C0,C0,C0,8 | 
|  | VSLDB	C1,C1,C1,8 | 
|  | VSLDB	C2,C2,C2,8 | 
|  | VSLDB	C3,C3,C3,8 | 
|  | VSLDB	C4,C4,C4,8 | 
|  | VSLDB	C5,C5,C5,8 | 
|  | VSLDB	B0,B0,B0,4 | 
|  | VSLDB	B1,B1,B1,4 | 
|  | VSLDB	B2,B2,B2,4 | 
|  | VSLDB	B3,B3,B3,4 | 
|  | VSLDB	B4,B4,B4,4 | 
|  | VSLDB	B5,B5,B5,4 | 
|  | VSLDB	D0,D0,D0,12 | 
|  | VSLDB	D1,D1,D1,12 | 
|  | VSLDB	D2,D2,D2,12 | 
|  | VSLDB	D3,D3,D3,12 | 
|  | VSLDB	D4,D4,D4,12 | 
|  | VSLDB	D5,D5,D5,12 | 
|  |  | 
|  | /* Diagonal round: the same four steps on the rotated rows. */ | 
|  | VAF	A0,A0,B0 | 
|  | VAF	A1,A1,B1 | 
|  | VAF	A2,A2,B2 | 
|  | VAF	A3,A3,B3 | 
|  | VAF	A4,A4,B4 | 
|  | VAF	A5,A5,B5 | 
|  | VX	D0,D0,A0 | 
|  | VX	D1,D1,A1 | 
|  | VX	D2,D2,A2 | 
|  | VX	D3,D3,A3 | 
|  | VX	D4,D4,A4 | 
|  | VX	D5,D5,A5 | 
|  | VERLLF	D0,D0,16 | 
|  | VERLLF	D1,D1,16 | 
|  | VERLLF	D2,D2,16 | 
|  | VERLLF	D3,D3,16 | 
|  | VERLLF	D4,D4,16 | 
|  | VERLLF	D5,D5,16 | 
|  |  | 
|  | VAF	C0,C0,D0 | 
|  | VAF	C1,C1,D1 | 
|  | VAF	C2,C2,D2 | 
|  | VAF	C3,C3,D3 | 
|  | VAF	C4,C4,D4 | 
|  | VAF	C5,C5,D5 | 
|  | VX	B0,B0,C0 | 
|  | VX	B1,B1,C1 | 
|  | VX	B2,B2,C2 | 
|  | VX	B3,B3,C3 | 
|  | VX	B4,B4,C4 | 
|  | VX	B5,B5,C5 | 
|  | VERLLF	B0,B0,12 | 
|  | VERLLF	B1,B1,12 | 
|  | VERLLF	B2,B2,12 | 
|  | VERLLF	B3,B3,12 | 
|  | VERLLF	B4,B4,12 | 
|  | VERLLF	B5,B5,12 | 
|  |  | 
|  | VAF	A0,A0,B0 | 
|  | VAF	A1,A1,B1 | 
|  | VAF	A2,A2,B2 | 
|  | VAF	A3,A3,B3 | 
|  | VAF	A4,A4,B4 | 
|  | VAF	A5,A5,B5 | 
|  | VX	D0,D0,A0 | 
|  | VX	D1,D1,A1 | 
|  | VX	D2,D2,A2 | 
|  | VX	D3,D3,A3 | 
|  | VX	D4,D4,A4 | 
|  | VX	D5,D5,A5 | 
|  | VERLLF	D0,D0,8 | 
|  | VERLLF	D1,D1,8 | 
|  | VERLLF	D2,D2,8 | 
|  | VERLLF	D3,D3,8 | 
|  | VERLLF	D4,D4,8 | 
|  | VERLLF	D5,D5,8 | 
|  |  | 
|  | VAF	C0,C0,D0 | 
|  | VAF	C1,C1,D1 | 
|  | VAF	C2,C2,D2 | 
|  | VAF	C3,C3,D3 | 
|  | VAF	C4,C4,D4 | 
|  | VAF	C5,C5,D5 | 
|  | VX	B0,B0,C0 | 
|  | VX	B1,B1,C1 | 
|  | VX	B2,B2,C2 | 
|  | VX	B3,B3,C3 | 
|  | VX	B4,B4,C4 | 
|  | VX	B5,B5,C5 | 
|  | VERLLF	B0,B0,7 | 
|  | VERLLF	B1,B1,7 | 
|  | VERLLF	B2,B2,7 | 
|  | VERLLF	B3,B3,7 | 
|  | VERLLF	B4,B4,7 | 
|  | VERLLF	B5,B5,7 | 
|  |  | 
|  | /* Undo the row rotation (B by 12, C by 8, D by 4 bytes). */ | 
|  | VSLDB	C0,C0,C0,8 | 
|  | VSLDB	C1,C1,C1,8 | 
|  | VSLDB	C2,C2,C2,8 | 
|  | VSLDB	C3,C3,C3,8 | 
|  | VSLDB	C4,C4,C4,8 | 
|  | VSLDB	C5,C5,C5,8 | 
|  | VSLDB	B0,B0,B0,12 | 
|  | VSLDB	B1,B1,B1,12 | 
|  | VSLDB	B2,B2,B2,12 | 
|  | VSLDB	B3,B3,B3,12 | 
|  | VSLDB	B4,B4,B4,12 | 
|  | VSLDB	B5,B5,B5,12 | 
|  | VSLDB	D0,D0,D0,4 | 
|  | VSLDB	D1,D1,D1,4 | 
|  | VSLDB	D2,D2,D2,4 | 
|  | VSLDB	D3,D3,D3,4 | 
|  | VSLDB	D4,D4,D4,4 | 
|  | VSLDB	D5,D5,D5,4 | 
|  | brct	%r0,.Loop_vx	/* decrement %r0, loop while it is non-zero */ | 
|  |  | 
|  | /* Block 0: add the input state. A1 and D1 are completed here as well */ | 
|  | /* because K0 (= T0) and T1 are overwritten by the input load below. */ | 
|  | VAF	A0,A0,K0 | 
|  | VAF	B0,B0,K1 | 
|  | VAF	C0,C0,K2 | 
|  | VAF	D0,D0,K3 | 
|  | VAF	A1,A1,K0 | 
|  | VAF	D1,D1,T1		# +K[3]+1 | 
|  |  | 
|  | VPERM	A0,A0,A0,BEPERM | 
|  | VPERM	B0,B0,B0,BEPERM | 
|  | VPERM	C0,C0,C0,BEPERM | 
|  | VPERM	D0,D0,D0,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx	/* fewer than 64 bytes left: byte-wise tail */ | 
|  |  | 
|  | VAF	D2,D2,T2		# +K[3]+2 | 
|  | VAF	D3,D3,T3		# +K[3]+3 | 
|  | VLM	T0,T3,0,INP,0	/* 64 input bytes; overwrites K0 and T1..T3 */ | 
|  |  | 
|  | VX	A0,A0,T0 | 
|  | VX	B0,B0,T1 | 
|  | VX	C0,C0,T2 | 
|  | VX	D0,D0,T3 | 
|  |  | 
|  | VLM	K0,T3,0,%r7,4		# re-load sigma and increments | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_vx	/* nothing left */ | 
|  |  | 
|  | /* Block 1: A1 and D1 were completed above; result goes to A0..D0. */ | 
|  | VAF	B1,B1,K1 | 
|  | VAF	C1,C1,K2 | 
|  |  | 
|  | VPERM	A0,A1,A1,BEPERM | 
|  | VPERM	B0,B1,B1,BEPERM | 
|  | VPERM	C0,C1,C1,BEPERM | 
|  | VPERM	D0,D1,D1,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx | 
|  |  | 
|  | VLM	A1,D1,0,INP,0	/* A1..D1 are free now and take the input bytes */ | 
|  |  | 
|  | VX	A0,A0,A1 | 
|  | VX	B0,B0,B1 | 
|  | VX	C0,C0,C1 | 
|  | VX	D0,D0,D1 | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_vx | 
|  |  | 
|  | /* Block 2: D2 already had its input row added before the first store. */ | 
|  | VAF	A2,A2,K0 | 
|  | VAF	B2,B2,K1 | 
|  | VAF	C2,C2,K2 | 
|  |  | 
|  | VPERM	A0,A2,A2,BEPERM | 
|  | VPERM	B0,B2,B2,BEPERM | 
|  | VPERM	C0,C2,C2,BEPERM | 
|  | VPERM	D0,D2,D2,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx | 
|  |  | 
|  | VLM	A1,D1,0,INP,0 | 
|  |  | 
|  | VX	A0,A0,A1 | 
|  | VX	B0,B0,B1 | 
|  | VX	C0,C0,C1 | 
|  | VX	D0,D0,D1 | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_vx | 
|  |  | 
|  | /* Block 3: D3 already had its input row added. D2 is free from here on */ | 
|  | /* and is reused to rebuild the counter rows for blocks 4 and 5. */ | 
|  | VAF	A3,A3,K0 | 
|  | VAF	B3,B3,K1 | 
|  | VAF	C3,C3,K2 | 
|  | VAF	D2,K3,T3		# K[3]+3 | 
|  |  | 
|  | VPERM	A0,A3,A3,BEPERM | 
|  | VPERM	B0,B3,B3,BEPERM | 
|  | VPERM	C0,C3,C3,BEPERM | 
|  | VPERM	D0,D3,D3,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx | 
|  |  | 
|  | VAF	D3,D2,T1		# K[3]+4 | 
|  | VLM	A1,D1,0,INP,0 | 
|  |  | 
|  | VX	A0,A0,A1 | 
|  | VX	B0,B0,B1 | 
|  | VX	C0,C0,C1 | 
|  | VX	D0,D0,D1 | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_vx | 
|  |  | 
|  | /* Block 4; K3 is also advanced by 6 here for the next outer pass. */ | 
|  | VAF	A4,A4,K0 | 
|  | VAF	B4,B4,K1 | 
|  | VAF	C4,C4,K2 | 
|  | VAF	D4,D4,D3		# +K[3]+4 | 
|  | VAF	D3,D3,T1		# K[3]+5 | 
|  | VAF	K3,D2,T3		# K[3]+=6 | 
|  |  | 
|  | VPERM	A0,A4,A4,BEPERM | 
|  | VPERM	B0,B4,B4,BEPERM | 
|  | VPERM	C0,C4,C4,BEPERM | 
|  | VPERM	D0,D4,D4,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx | 
|  |  | 
|  | VLM	A1,D1,0,INP,0 | 
|  |  | 
|  | VX	A0,A0,A1 | 
|  | VX	B0,B0,B1 | 
|  | VX	C0,C0,C1 | 
|  | VX	D0,D0,D1 | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | aghi	LEN,-0x40 | 
|  | je	.Ldone_vx | 
|  |  | 
|  | /* Block 5 */ | 
|  | VAF	A5,A5,K0 | 
|  | VAF	B5,B5,K1 | 
|  | VAF	C5,C5,K2 | 
|  | VAF	D5,D5,D3		# +K[3]+5 | 
|  |  | 
|  | VPERM	A0,A5,A5,BEPERM | 
|  | VPERM	B0,B5,B5,BEPERM | 
|  | VPERM	C0,C5,C5,BEPERM | 
|  | VPERM	D0,D5,D5,BEPERM | 
|  |  | 
|  | clgfi	LEN,0x40 | 
|  | jl	.Ltail_vx | 
|  |  | 
|  | VLM	A1,D1,0,INP,0 | 
|  |  | 
|  | VX	A0,A0,A1 | 
|  | VX	B0,B0,B1 | 
|  | VX	C0,C0,C1 | 
|  | VX	D0,D0,D1 | 
|  |  | 
|  | VSTM	A0,D0,0,OUT,0 | 
|  |  | 
|  | la	INP,0x40(INP) | 
|  | la	OUT,0x40(OUT) | 
|  | lhi	%r0,10	/* reset the .Loop_vx count for the next six blocks */ | 
|  | aghi	LEN,-0x40 | 
|  | jne	.Loop_outer_vx	/* more input left */ | 
|  |  | 
|  | .Ldone_vx: | 
|  | lmg	%r6,%r7,FRAME+6*8(SP)	/* same slots stmg used before SP was lowered */ | 
|  | la	SP,FRAME(SP)	/* release the frame */ | 
|  | BR_EX	%r14 | 
|  |  | 
|  | /* Tail: A0..D0 hold one keystream block and 1..63 bytes remain. The block */ | 
|  | /* is spilled to 64(SP) in the frame allocated above and XORed bytewise. */ | 
|  | .Ltail_vx: | 
|  | VSTM	A0,D0,8*8,SP,3 | 
|  | lghi	%r1,0	/* byte index */ | 
|  |  | 
|  | .Loop_tail_vx: | 
|  | llgc	%r5,0(%r1,INP)	/* input byte (KEY register reused as scratch) */ | 
|  | llgc	%r6,8*8(%r1,SP)	/* keystream byte (COUNTER register reused as scratch) */ | 
|  | xr	%r6,%r5 | 
|  | stc	%r6,0(%r1,OUT) | 
|  | la	%r1,1(%r1) | 
|  | brct	LEN,.Loop_tail_vx	/* one pass per remaining byte */ | 
|  |  | 
|  | lmg	%r6,%r7,FRAME+6*8(SP) | 
|  | la	SP,FRAME(SP) | 
|  | BR_EX	%r14 | 
|  | SYM_FUNC_END(chacha20_vx) | 
|  |  | 
|  | .previous	/* swap back to the section that was current before .text */ | 