| From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| Date: Thu, 30 Nov 2017 13:40:10 +0100 |
| Subject: [PATCH] crypto: limit more FPU-enabled sections |
| MIME-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| These crypto drivers use SSE/AVX/… for their crypto work and in order to |
| do so in the kernel they need to enable the "FPU" in kernel mode, which |
| disables preemption. |
| There are two problems with the way they are used: |
| - the while loop which processes X bytes may create latency spikes and |
| should be avoided or limited. |
| - the cipher-walk-next part may allocate/free memory and may use |
| kmap_atomic(). |
| |
| The whole kernel_fpu_begin()/end() processing probably isn't that cheap, |
| so it most likely makes sense to process as much data as possible in one |
| go. The new kernel_fpu_resched() ends the FPU section and schedules only |
| if a reschedule is pending. On PREEMPT_RT_FULL the new *_fpu_end_rt() |
| helpers additionally end the FPU section before the scalar tail is |
| processed. |
| |
| We should probably measure the performance of these ciphers in pure SW |
| mode and with these optimisations to see whether it makes sense to keep |
| them for RT. |
| |
| kernel_fpu_resched() makes the code more preemptible, which might hurt |
| performance. |
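| |
| For illustration only (not part of this patch; CHUNK_BYTES and |
| simd_process_chunk() are made-up placeholders), the intended usage |
| pattern for a bulk loop looks roughly like this: |
| |
|   static void bulk_crypt(u8 *buf, unsigned int nbytes) |
|   { |
|   	kernel_fpu_begin(); |
|   	while (nbytes >= CHUNK_BYTES) { |
|   		/* |
|   		 * If a reschedule is pending, kernel_fpu_resched() ends |
|   		 * the FPU section, calls cond_resched() and re-enables |
|   		 * the FPU afterwards. |
|   		 */ |
|   		kernel_fpu_resched(); |
|   		simd_process_chunk(buf); |
|   		buf    += CHUNK_BYTES; |
|   		nbytes -= CHUNK_BYTES; |
|   	} |
|   	kernel_fpu_end(); |
|   } |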
| |
| Cc: stable-rt@vger.kernel.org |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| arch/x86/crypto/camellia_aesni_avx2_glue.c | 20 ++++++++++++++++++++ |
| arch/x86/crypto/camellia_aesni_avx_glue.c | 19 +++++++++++++++++++ |
| arch/x86/crypto/cast6_avx_glue.c | 24 +++++++++++++++++++----- |
| arch/x86/crypto/chacha20_glue.c | 9 +++++---- |
| arch/x86/crypto/serpent_avx2_glue.c | 19 +++++++++++++++++++ |
| arch/x86/crypto/serpent_avx_glue.c | 23 +++++++++++++++++++---- |
| arch/x86/crypto/serpent_sse2_glue.c | 23 +++++++++++++++++++---- |
| arch/x86/crypto/twofish_avx_glue.c | 27 +++++++++++++++++++++++++-- |
| arch/x86/include/asm/fpu/api.h | 1 + |
| arch/x86/kernel/fpu/core.c | 12 ++++++++++++ |
| 10 files changed, 158 insertions(+), 19 deletions(-) |
| |
| --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c |
| +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c |
| @@ -206,6 +206,20 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void camellia_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + camellia_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| +#else |
| +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = CAMELLIA_BLOCK_SIZE; |
| @@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, |
| } |
| |
| if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; |
| } |
| |
| while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; |
| } |
| + camellia_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| camellia_enc_blk(ctx->ctx, srcdst, srcdst); |
| @@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, |
| } |
| |
| if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; |
| } |
| |
| while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; |
| } |
| + camellia_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| camellia_dec_blk(ctx->ctx, srcdst, srcdst); |
| --- a/arch/x86/crypto/camellia_aesni_avx_glue.c |
| +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c |
| @@ -210,6 +210,21 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void camellia_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + camellia_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = CAMELLIA_BLOCK_SIZE; |
| @@ -225,10 +240,12 @@ static void encrypt_callback(void *priv, |
| } |
| |
| while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; |
| } |
| + camellia_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| camellia_enc_blk(ctx->ctx, srcdst, srcdst); |
| @@ -249,10 +266,12 @@ static void decrypt_callback(void *priv, |
| } |
| |
| while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; |
| nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; |
| } |
| + camellia_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| camellia_dec_blk(ctx->ctx, srcdst, srcdst); |
| --- a/arch/x86/crypto/cast6_avx_glue.c |
| +++ b/arch/x86/crypto/cast6_avx_glue.c |
| @@ -205,19 +205,33 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void cast6_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + cast6_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void cast6_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = CAST6_BLOCK_SIZE; |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
| cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst); |
| + cast6_fpu_end_rt(ctx); |
| return; |
| } |
| - |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| __cast6_encrypt(ctx->ctx, srcdst, srcdst); |
| } |
| @@ -228,10 +242,10 @@ static void decrypt_callback(void *priv, |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
| cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst); |
| + cast6_fpu_end_rt(ctx); |
| return; |
| } |
| |
| --- a/arch/x86/crypto/chacha20_glue.c |
| +++ b/arch/x86/crypto/chacha20_glue.c |
| @@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher |
| |
| crypto_chacha20_init(state, ctx, walk.iv); |
| |
| - kernel_fpu_begin(); |
| - |
| while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { |
| + kernel_fpu_begin(); |
| + |
| chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, |
| rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); |
| + kernel_fpu_end(); |
| err = skcipher_walk_done(&walk, |
| walk.nbytes % CHACHA20_BLOCK_SIZE); |
| } |
| |
| if (walk.nbytes) { |
| + kernel_fpu_begin(); |
| chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, |
| walk.nbytes); |
| + kernel_fpu_end(); |
| err = skcipher_walk_done(&walk, 0); |
| } |
| |
| - kernel_fpu_end(); |
| - |
| return err; |
| } |
| |
| --- a/arch/x86/crypto/serpent_avx2_glue.c |
| +++ b/arch/x86/crypto/serpent_avx2_glue.c |
| @@ -184,6 +184,21 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + serpent_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = SERPENT_BLOCK_SIZE; |
| @@ -199,10 +214,12 @@ static void encrypt_callback(void *priv, |
| } |
| |
| while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * SERPENT_PARALLEL_BLOCKS; |
| nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; |
| } |
| + serpent_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| __serpent_encrypt(ctx->ctx, srcdst, srcdst); |
| @@ -223,10 +240,12 @@ static void decrypt_callback(void *priv, |
| } |
| |
| while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { |
| + kernel_fpu_resched(); |
| serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); |
| srcdst += bsize * SERPENT_PARALLEL_BLOCKS; |
| nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; |
| } |
| + serpent_fpu_end_rt(ctx); |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| __serpent_decrypt(ctx->ctx, srcdst, srcdst); |
| --- a/arch/x86/crypto/serpent_avx_glue.c |
| +++ b/arch/x86/crypto/serpent_avx_glue.c |
| @@ -218,16 +218,31 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + serpent_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = SERPENT_BLOCK_SIZE; |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); |
| + serpent_fpu_end_rt(ctx); |
| return; |
| } |
| |
| @@ -241,10 +256,10 @@ static void decrypt_callback(void *priv, |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); |
| + serpent_fpu_end_rt(ctx); |
| return; |
| } |
| |
| --- a/arch/x86/crypto/serpent_sse2_glue.c |
| +++ b/arch/x86/crypto/serpent_sse2_glue.c |
| @@ -187,16 +187,31 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + serpent_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = SERPENT_BLOCK_SIZE; |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); |
| + serpent_fpu_end_rt(ctx); |
| return; |
| } |
| |
| @@ -210,10 +225,10 @@ static void decrypt_callback(void *priv, |
| struct crypt_priv *ctx = priv; |
| int i; |
| |
| - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| - |
| if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
| + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
| serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); |
| + serpent_fpu_end_rt(ctx); |
| return; |
| } |
| |
| --- a/arch/x86/crypto/twofish_avx_glue.c |
| +++ b/arch/x86/crypto/twofish_avx_glue.c |
| @@ -218,6 +218,21 @@ struct crypt_priv { |
| bool fpu_enabled; |
| }; |
| |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +static void twofish_fpu_end_rt(struct crypt_priv *ctx) |
| +{ |
| + bool fpu_enabled = ctx->fpu_enabled; |
| + |
| + if (!fpu_enabled) |
| + return; |
| + twofish_fpu_end(fpu_enabled); |
| + ctx->fpu_enabled = false; |
| +} |
| + |
| +#else |
| +static void twofish_fpu_end_rt(struct crypt_priv *ctx) { } |
| +#endif |
| + |
| static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
| { |
| const unsigned int bsize = TF_BLOCK_SIZE; |
| @@ -228,12 +243,16 @@ static void encrypt_callback(void *priv, |
| |
| if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { |
| twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); |
| + twofish_fpu_end_rt(ctx); |
| return; |
| } |
| |
| - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) |
| + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) { |
| + kernel_fpu_resched(); |
| twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); |
| + } |
| |
| + twofish_fpu_end_rt(ctx); |
| nbytes %= bsize * 3; |
| |
| for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) |
| @@ -250,11 +269,15 @@ static void decrypt_callback(void *priv, |
| |
| if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { |
| twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); |
| + twofish_fpu_end_rt(ctx); |
| return; |
| } |
| |
| - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) |
| + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) { |
| + kernel_fpu_resched(); |
| twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); |
| + } |
| + twofish_fpu_end_rt(ctx); |
| |
| nbytes %= bsize * 3; |
| |
| --- a/arch/x86/include/asm/fpu/api.h |
| +++ b/arch/x86/include/asm/fpu/api.h |
| @@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void); |
| extern void __kernel_fpu_end(void); |
| extern void kernel_fpu_begin(void); |
| extern void kernel_fpu_end(void); |
| +extern void kernel_fpu_resched(void); |
| extern bool irq_fpu_usable(void); |
| |
| /* |
| --- a/arch/x86/kernel/fpu/core.c |
| +++ b/arch/x86/kernel/fpu/core.c |
| @@ -137,6 +137,18 @@ void kernel_fpu_end(void) |
| } |
| EXPORT_SYMBOL_GPL(kernel_fpu_end); |
| |
| +void kernel_fpu_resched(void) |
| +{ |
| + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); |
| + |
| + if (should_resched(PREEMPT_OFFSET)) { |
| + kernel_fpu_end(); |
| + cond_resched(); |
| + kernel_fpu_begin(); |
| + } |
| +} |
| +EXPORT_SYMBOL_GPL(kernel_fpu_resched); |
| + |
| /* |
| * Save the FPU state (mark it for reload if necessary): |
| * |