/* SPDX-License-Identifier: GPL-2.0 */
/*
* Template for memcpy and copy_user with SIMD.
*
* $4: 8-byte misalignment of src when dest is 8-byte aligned
* $5: 32-byte misalignment of src when dest is 32-byte aligned
* $7: SIMD status
*     0: not in a SIMD loop
*     1: in the aligned SIMD loop
*     2: in the misaligned (simd_u) SIMD loop
* $16: current dest pointer, clobbered
* $17: current src pointer, clobbered
* $18: bytes left to copy
*
*/
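/*
 * FIXUP_LDST() and the $out label are assumed to be provided by the
 * file that includes this template (e.g. a plain load/store wrapper
 * for memcpy and an exception-table wrapper for copy_user).
 */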
#define NC_STORE_THRESHOLD 2048 /* SIMD copies longer than this use non-cacheable stores */
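/*
 * Spill $f1/$f2 to a 32-byte-aligned slot on the stack for the aligned
 * SIMD loops and record SIMD status 1 in $7; RESTORE_SIMD_REGS undoes
 * this and clears $7.
 */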
#define SAVE_SIMD_REGS \
ldi $sp, -0x60($sp); \
addl $sp, 0x1f, $23; \
bic $23, 0x1f, $23; \
vstd $f1, 0($23); \
vstd $f2, 0x20($23); \
ldi $7, 1
#define RESTORE_SIMD_REGS \
addl $sp, 0x1f, $23; \
bic $23, 0x1f, $23; \
vldd $f1, 0($23); \
vldd $f2, 0x20($23); \
ldi $sp, 0x60($sp); \
bis $31, $31, $7
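/*
 * As above, but for the misaligned-src SIMD loops, which additionally
 * use $f3/$f4/$f5; SIMD status 2 is recorded in $7.
 */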
#define SAVE_SIMD_U_REGS \
ldi $sp, -0xc0($sp); \
addl $sp, 0x1f, $23; \
bic $23, 0x1f, $23; \
vstd $f1, 0($23); \
vstd $f2, 0x20($23); \
vstd $f4, 0x40($23); \
vstd $f5, 0x60($23); \
vstd $f3, 0x80($23); \
ldi $7, 2
#define RESTORE_SIMD_U_REGS \
addl $sp, 0x1f, $23; \
bic $23, 0x1f, $23; \
vldd $f1, 0($23); \
vldd $f2, 0x20($23); \
vldd $f4, 0x40($23); \
vldd $f5, 0x60($23); \
vldd $f3, 0x80($23); \
ldi $sp, 0xc0($sp); \
bis $31, $31, $7
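/*
 * Entry: nothing to do for lengths <= 0; otherwise copy single bytes
 * until dest is 8-byte aligned.
 */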
ble $18, $out
and $16, 7, $1
beq $1, $dest_aligned_8
$byte_loop_head:
FIXUP_LDST( ldbu $2, 0($17) )
FIXUP_LDST( stb $2, 0($16) )
subl $18, 1, $18
addl $17, 1, $17
addl $16, 1, $16
ble $18, $out
and $16, 7, $1
bne $1, $byte_loop_head
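/*
 * dest is now 8-byte aligned: record the 8-byte misalignment of src in
 * $4, take the short-copy tail for fewer than 16 bytes, and otherwise
 * bring dest up to 32-byte alignment one quadword at a time before
 * entering the SIMD loops (skipped when fewer than 64 bytes remain).
 */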
$dest_aligned_8:
and $17, 7, $4
cmplt $18, 16, $1
bne $1, $quad_loop_end
and $16, 31, $1
beq $1, $dest_aligned_32
cmplt $18, 64, $1
bne $1, $simd_end
bne $4, $quad_u_loop_head
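/* aligned src: copy quadwords until dest is 32-byte aligned */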
$quad_loop_head:
FIXUP_LDST( ldl $2, 0($17) )
FIXUP_LDST( stl $2, 0($16) )
addl $16, 8, $16
addl $17, 8, $17
subl $18, 8, $18
and $16, 31, $1
beq $1, $dest_aligned_32
br $31, $quad_loop_head
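/*
 * dest is 32-byte aligned and at least 64 bytes remain: pick the
 * aligned or misaligned-src SIMD path based on $5.
 */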
$dest_aligned_32:
cmplt $18, 64, $1
bne $1, $simd_end
and $17, 31, $5
bne $5, $prep_simd_u_loop
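/*
 * src is also 32-byte aligned: save the SIMD registers and choose
 * cacheable or non-cacheable stores based on NC_STORE_THRESHOLD.
 */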
$prep_simd_loop:
SAVE_SIMD_REGS
ldi $1, NC_STORE_THRESHOLD($31)
cmple $18, $1, $1
bne $1, $simd_loop
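/* large copy: move 64 bytes per iteration using non-temporal (_nc) stores */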
.align 4
$simd_loop_nc:
FIXUP_LDST( vldd $f1, 0($17) )
FIXUP_LDST( vldd $f2, 32($17) )
FIXUP_LDST( vstd_nc $f1, 0($16) )
FIXUP_LDST( vstd_nc $f2, 32($16) )
subl $18, 64, $18
addl $17, 64, $17
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_loop_nc
memb # required for _nc store instructions
br $31, $simd_loop_end
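/* move 64 bytes per iteration using ordinary cacheable stores */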
.align 4
$simd_loop:
FIXUP_LDST( vldd $f1, 0($17) )
FIXUP_LDST( vldd $f2, 32($17) )
FIXUP_LDST( vstd $f1, 0($16) )
FIXUP_LDST( vstd $f2, 32($16) )
subl $18, 64, $18
addl $17, 64, $17
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_loop
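/* copy one final 32-byte block if at least 32 bytes remain */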
$simd_loop_end:
cmplt $18, 32, $1
bne $1, $no_more_simd
FIXUP_LDST( vldd $f1, 0($17) )
FIXUP_LDST( vstd $f1, 0($16) )
subl $18, 32, $18
addl $17, 32, $17
addl $16, 32, $16
$no_more_simd:
RESTORE_SIMD_REGS
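/* fewer than 64 bytes remain: finish with 16-byte, quadword and byte copies */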
$simd_end:
ble $18, $out
cmplt $18, 16, $1
bne $1, $quad_loop_end
bne $4, $prep_quad_u_loop_tail
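/* aligned src: copy 16 bytes per iteration while at least 16 bytes remain */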
.align 4
$quad_loop_tail:
FIXUP_LDST( ldl $2, 0($17) )
FIXUP_LDST( ldl $3, 8($17) )
FIXUP_LDST( stl $2, 0($16) )
FIXUP_LDST( stl $3, 8($16) )
subl $18, 16, $18
addl $17, 16, $17
addl $16, 16, $16
cmplt $18, 16, $1
beq $1, $quad_loop_tail
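/* fewer than 16 bytes remain: move at most one final quadword, then single bytes */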
$quad_loop_end:
ble $18, $out
cmplt $18, 8, $1
bne $1, $byte_loop_tail
bne $4, $move_one_quad_u
$move_one_quad:
FIXUP_LDST( ldl $2, 0($17) )
FIXUP_LDST( stl $2, 0($16) )
subl $18, 8, $18
addl $17, 8, $17
addl $16, 8, $16
ble $18, $out
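/* copy the trailing bytes one at a time */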
.align 3
$byte_loop_tail:
FIXUP_LDST( ldbu $2, 0($17) )
FIXUP_LDST( stb $2, 0($16) )
subl $18, 1, $18
addl $17, 1, $17
addl $16, 1, $16
bgt $18, $byte_loop_tail
br $31, $out
/* the paths below handle a src that is misaligned with respect to dest */
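/*
 * Assemble each aligned quadword for dest from two unaligned loads
 * merged with extll/exthl, until dest is 32-byte aligned.
 */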
$quad_u_loop_head:
FIXUP_LDST( ldl_u $2, 0($17) )
FIXUP_LDST( ldl_u $3, 7($17) )
extll $2, $4, $2
exthl $3, $4, $3
bis $2, $3, $2
FIXUP_LDST( stl $2, 0($16) )
addl $16, 8, $16
addl $17, 8, $17
subl $18, 8, $18
and $16, 31, $1
beq $1, $dest_aligned_32
br $31, $quad_u_loop_head
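/*
 * src is not 32-byte aligned: $3 points to the 32-byte-aligned src
 * base, $f1/$f2 hold shift counts derived from the misalignment in $5
 * (in the form srlow/sllow expect), and $f4 is preloaded with the
 * first aligned block.  Cacheable vs non-cacheable stores are again
 * chosen based on NC_STORE_THRESHOLD.
 */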
$prep_simd_u_loop:
SAVE_SIMD_U_REGS
andnot $17, 31, $3
ldi $2, 256($31)
sll $5, 3, $1
subl $2, $1, $2
sll $1, 29, $1
sll $2, 29, $2
ifmovd $1, $f1
ifmovd $2, $f2
FIXUP_LDST( vldd $f4, 0($3) )
ldi $1, NC_STORE_THRESHOLD($31)
cmple $18, $1, $1
bne $1, $simd_u_loop
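/*
 * Each iteration loads the next two aligned 32-byte blocks, shifts and
 * merges them with the block carried from the previous iteration
 * (srlow/sllow/vlogfc) to realign the data, and writes 64 bytes with
 * non-temporal stores.
 */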
.align 4
$simd_u_loop_nc:
FIXUP_LDST( vldd $f5, 32($3) )
srlow $f4, $f1, $f4
sllow $f5, $f2, $f3
vlogfc $f3, $f4, $f31, $f3
FIXUP_LDST( vstd_nc $f3, 0($16) )
FIXUP_LDST( vldd $f4, 64($3) )
srlow $f5, $f1, $f5
sllow $f4, $f2, $f3
vlogfc $f5, $f3, $f31, $f5
FIXUP_LDST( vstd_nc $f5, 32($16) )
subl $18, 64, $18
addl $3, 64, $3
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_u_loop_nc
memb # required for _nc store instructions
br $31, $simd_u_loop_end
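/* same as $simd_u_loop_nc, but with ordinary cacheable stores */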
.align 4
$simd_u_loop:
FIXUP_LDST( vldd $f5, 32($3) )
srlow $f4, $f1, $f4
sllow $f5, $f2, $f3
vlogfc $f4, $f3, $f31, $f3
FIXUP_LDST( vstd $f3, 0($16) )
FIXUP_LDST( vldd $f4, 64($3) )
srlow $f5, $f1, $f5
sllow $f4, $f2, $f3
vlogfc $f5, $f3, $f31, $f3
FIXUP_LDST( vstd $f3, 32($16) )
subl $18, 64, $18
addl $3, 64, $3
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_u_loop
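/*
 * Emit one final realigned 32-byte block if at least 32 bytes remain,
 * then restore the SIMD registers and rebuild the real src pointer
 * from the aligned base in $3 and the misalignment in $5.
 */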
$simd_u_loop_end:
cmplt $18, 32, $1
bne $1, $no_more_simd_u
FIXUP_LDST( vldd $f5, 32($3) )
srlow $f4, $f1, $f4
sllow $f5, $f2, $f3
vlogfc $f4, $f3, $f31, $f3
FIXUP_LDST( vstd $f3, 0($16) )
subl $18, 32, $18
addl $3, 32, $3
addl $16, 32, $16
$no_more_simd_u:
RESTORE_SIMD_U_REGS
bis $3, $5, $17
br $31, $simd_end
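/*
 * 16-byte tail for a misaligned src: keep one ldl_u result live across
 * iterations and merge quadword pairs with extll/exthl.
 */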
$prep_quad_u_loop_tail:
FIXUP_LDST( ldl_u $2, 0($17) )
.align 4
$quad_u_loop_tail:
FIXUP_LDST( ldl_u $3, 8($17) )
extll $2, $4, $22
exthl $3, $4, $23
bis $22, $23, $22
FIXUP_LDST( stl $22, 0($16) )
FIXUP_LDST( ldl_u $2, 16($17) )
extll $3, $4, $24
exthl $2, $4, $25
bis $24, $25, $24
FIXUP_LDST( stl $24, 8($16) )
subl $18, 16, $18
addl $17, 16, $17
addl $16, 16, $16
cmplt $18, 16, $1
beq $1, $quad_u_loop_tail
br $31, $quad_loop_end
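/* move the final quadword from a misaligned src, then finish byte by byte */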
$move_one_quad_u:
FIXUP_LDST( ldl_u $2, 0($17) )
FIXUP_LDST( ldl_u $3, 8($17) )
extll $2, $4, $22
exthl $3, $4, $23
bis $22, $23, $22
FIXUP_LDST( stl $22, 0($16) )
subl $18, 8, $18
addl $17, 8, $17
addl $16, 8, $16
ble $18, $out
br $31, $byte_loop_tail