// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

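/* CPU feature availability, determined once in crc32_mod_init_arch(). */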
static DEFINE_STATIC_KEY_FALSE(have_crc32);
static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

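/*
 * Per crc-pclmul-template.h, this declares the SSE, AVX2, and AVX-512
 * implementations of the lsb-first (bit-reflected) CRC-32 and defines the
 * crc32_lsb_pclmul static call used to dispatch among them.
 */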
DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

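/*
 * crc32_le(): CRC-32 with the reflected generator polynomial 0xedb88320.
 * When the buffer is long enough, PCLMULQDQ is available, and the FPU is
 * usable in this context, CRC_PCLMUL() computes the CRC via the static call
 * and returns; otherwise execution falls through to the generic table-based
 * crc32_le_base().
 */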
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

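/*
 * The crc32 instruction form used for the bulk of the scalar loop in
 * crc32c_arch(): 8 bytes at a time (crc32q) on x86_64, 4 bytes at a time
 * (crc32l) on 32-bit.  The "%q0" modifier prints the 64-bit name of the
 * register holding crc, since crc32q requires a 64-bit destination; the
 * destination's upper 32 bits are ignored on input and zeroed on output.
 */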
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Only use a carryless-multiply implementation of crc32c when the buffer is
 * at least 512 bytes; for shorter buffers, the FPU state save/restore done by
 * kernel_fpu_begin()/kernel_fpu_end() costs more than the speedup is worth.
 */
#define CRC32C_PCLMUL_BREAKEVEN	512

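/*
 * Implemented in assembly; CRCs the data in three independent streams and
 * combines the results using PCLMULQDQ.  See the comment in crc32c_arch().
 */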
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
		 * major upgrade.)  For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
		 * is needed anyway for crc32_le(), so we just reuse it here.
		 */
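		/*
		 * The combination step relies on CRCs being linear over
		 * GF(2): for the raw CRC without the usual bit inversions,
		 * crc(A || B) = (crc(A) * x^(8*len(B)) mod G(x)) ^ crc(B).
		 * Joining two streams therefore costs one multiplication by
		 * a precomputed power of x modulo G(x), which is exactly the
		 * operation PCLMULQDQ accelerates.
		 */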
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
						       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions.  Just issue a single stream of
	 * CRC32 instructions inline.  While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams, which
	 * would cost a carryless multiplication or table-based shift per
	 * stream and isn't worthwhile for these short lengths.
	 */
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

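	/* Then consume the remaining len % sizeof(unsigned long) bytes. */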
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

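/*
 * One-time CPU feature detection: X86_FEATURE_XMM4_2 is SSE4.2, which
 * provides the crc32 instruction; PCLMULQDQ enables the carryless-multiply
 * implementations; and when VPCLMULQDQ and a usable AVX-512 implementation
 * are present, the crc32_lsb_pclmul static call is updated to the widest
 * available vector variant.
 */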
#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

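/*
 * Report which CRC-32 variants are accelerated on this CPU: the crc32
 * instruction covers CRC-32C, and PCLMULQDQ covers the 0xedb88320 polynomial
 * used by crc32_le().
 */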
static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}