patches/old/lib-min_heap-avoid-indirect-function-call-by-providing-default-swap.patch - pub/scm/linux/kernel/git/akpm/25-new - Git at Google

 From: Kuan-Wei Chiu <visitorckw@gmail.com>
 Subject: lib min_heap: avoid indirect function call by providing default swap
 Date: Sun, 20 Oct 2024 12:01:53 +0800

 The non-inline min heap API can result in an indirect function call to the
 custom swap function.  This becomes particularly costly when
 CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are
 expensive in this case.

 To address this, copy the code from lib/sort.c and provide a default
 builtin swap implementation that performs element swaps based on the
 element size.  This change allows most users to avoid the overhead of
 indirect function calls, improving efficiency.

 Link: https://lkml.kernel.org/r/20241020040200.939973-4-visitorckw@gmail.com
 Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
 Cc: Adrian Hunter <adrian.hunter@intel.com>
 Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
 Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
 Cc: Coly Li <colyli@suse.de>
 Cc: Ian Rogers <irogers@google.com>
 Cc: Ingo Molnar <mingo@redhat.com>
 Cc: Jiri Olsa <jolsa@kernel.org>
 Cc: Jonathan Corbet <corbet@lwn.net>
 Cc: Kent Overstreet <kent.overstreet@linux.dev>
 Cc: "Liang, Kan" <kan.liang@linux.intel.com>
 Cc: Mark Rutland <mark.rutland@arm.com>
 Cc: Matthew Sakai <msakai@redhat.com>
 Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
 Cc: Namhyung Kim <namhyung@kernel.org>
 Cc: Peter Zijlstra <peterz@infradead.org>
 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 ---

  include/linux/min_heap.h |  159 ++++++++++++++++++++++++++++++++++++-
  1 file changed, 156 insertions(+), 3 deletions(-)

 --- a/include/linux/min_heap.h~lib-min_heap-avoid-indirect-function-call-by-providing-default-swap
 +++ a/include/linux/min_heap.h
 @@ -39,6 +39,147 @@ struct min_heap_callbacks {
  };

  /**
 + * is_aligned - is this pointer & size okay for word-wide copying?
 + * @base: pointer to data
 + * @size: size of each element
 + * @align: required alignment (typically 4 or 8)
 + *
 + * Returns true if elements can be copied using word loads and stores.
 + * The size must be a multiple of the alignment, and the base address must
 + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
 + *
 + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 + * to "if ((a | b) & mask)", so we do that by hand.
 + */
 +__attribute_const__ __always_inline
 +static bool is_aligned(const void *base, size_t size, unsigned char align)
 +{
 +	unsigned char lsbits = (unsigned char)size;
 +
 +	(void)base;
 +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 +	lsbits |= (unsigned char)(uintptr_t)base;
 +#endif
 +	return (lsbits & (align - 1)) == 0;
 +}
 +
 +/**
 + * swap_words_32 - swap two elements in 32-bit chunks
 + * @a: pointer to the first element to swap
 + * @b: pointer to the second element to swap
 + * @n: element size (must be a multiple of 4)
 + *
 + * Exchange the two objects in memory.  This exploits base+index addressing,
 + * which basically all CPUs have, to minimize loop overhead computations.
 + *
 + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
 + * bottom of the loop, even though the zero flag is still valid from the
 + * subtract (since the intervening mov instructions don't alter the flags).
 + * Gcc 8.1.0 doesn't have that problem.
 + */
 +static __always_inline
 +void swap_words_32(void *a, void *b, size_t n)
 +{
 +	do {
 +		u32 t = *(u32 *)(a + (n -= 4));
 +		*(u32 *)(a + n) = *(u32 *)(b + n);
 +		*(u32 *)(b + n) = t;
 +	} while (n);
 +}
 +
 +/**
 + * swap_words_64 - swap two elements in 64-bit chunks
 + * @a: pointer to the first element to swap
 + * @b: pointer to the second element to swap
 + * @n: element size (must be a multiple of 8)
 + *
 + * Exchange the two objects in memory.  This exploits base+index
 + * addressing, which basically all CPUs have, to minimize loop overhead
 + * computations.
 + *
 + * We'd like to use 64-bit loads if possible.  If they're not, emulating
 + * one requires base+index+4 addressing which x86 has but most other
 + * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
 + * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
 + * x32 ABI).  Are there any cases the kernel needs to worry about?
 + */
 +static __always_inline
 +void swap_words_64(void *a, void *b, size_t n)
 +{
 +	do {
 +#ifdef CONFIG_64BIT
 +		u64 t = *(u64 *)(a + (n -= 8));
 +		*(u64 *)(a + n) = *(u64 *)(b + n);
 +		*(u64 *)(b + n) = t;
 +#else
 +		/* Use two 32-bit transfers to avoid base+index+4 addressing */
 +		u32 t = *(u32 *)(a + (n -= 4));
 +		*(u32 *)(a + n) = *(u32 *)(b + n);
 +		*(u32 *)(b + n) = t;
 +
 +		t = *(u32 *)(a + (n -= 4));
 +		*(u32 *)(a + n) = *(u32 *)(b + n);
 +		*(u32 *)(b + n) = t;
 +#endif
 +	} while (n);
 +}
 +
 +/**
 + * swap_bytes - swap two elements a byte at a time
 + * @a: pointer to the first element to swap
 + * @b: pointer to the second element to swap
 + * @n: element size
 + *
 + * This is the fallback if alignment doesn't allow using larger chunks.
 + */
 +static __always_inline
 +void swap_bytes(void *a, void *b, size_t n)
 +{
 +	do {
 +		char t = ((char *)a)[--n];
 +		((char *)a)[n] = ((char *)b)[n];
 +		((char *)b)[n] = t;
 +	} while (n);
 +}
 +
 +/*
 + * The values are arbitrary as long as they can't be confused with
 + * a pointer, but small integers make for the smallest compare
 + * instructions.
 + */
 +#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
 +#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
 +#define SWAP_BYTES    ((void (*)(void *, void *, void *))2)
 +
 +/*
 + * Selects the appropriate swap function based on the element size.
 + */
 +static __always_inline
 +void *select_swap_func(const void *base, size_t size)
 +{
 +	if (is_aligned(base, size, 8))
 +		return SWAP_WORDS_64;
 +	else if (is_aligned(base, size, 4))
 +		return SWAP_WORDS_32;
 +	else
 +		return SWAP_BYTES;
 +}
 +
 +static __always_inline
 +void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
 +	     void *priv)
 +{
 +	if (swap_func == SWAP_WORDS_64)
 +		swap_words_64(a, b, size);
 +	else if (swap_func == SWAP_WORDS_32)
 +		swap_words_32(a, b, size);
 +	else if (swap_func == SWAP_BYTES)
 +		swap_bytes(a, b, size);
 +	else
 +		swap_func(a, b, priv);
 +}
 +
 +/**
   * parent - given the offset of the child, find the offset of the parent.
   * @i: the offset of the heap element whose parent is sought.  Non-zero.
   * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
 @@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_hea
  {
  	const unsigned long lsbit = elem_size & -elem_size;
  	void *data = heap->data;
 +	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
  	/* pre-scale counters for performance */
  	size_t a = pos * elem_size;
  	size_t b, c, d;
  	size_t n = heap->nr * elem_size;

 +	if (!swp)
 +		swp = select_swap_func(data, elem_size);
 +
  	/* Find the sift-down path all the way to the leaves. */
  	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
  		b = func->less(data + c, data + d, args) ? c : d;
 @@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_hea
  	c = b;
  	while (b != a) {
  		b = parent(b, lsbit, elem_size);
 -		func->swp(data + b, data + c, args);
 +		do_swap(data + b, data + c, elem_size, swp, args);
  	}
  }

 @@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_
  {
  	const unsigned long lsbit = elem_size & -elem_size;
  	void *data = heap->data;
 +	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
  	/* pre-scale counters for performance */
  	size_t a = idx * elem_size, b;

 +	if (!swp)
 +		swp = select_swap_func(data, elem_size);
 +
  	while (a) {
  		b = parent(a, lsbit, elem_size);
  		if (func->less(data + b, data + a, args))
  			break;
 -		func->swp(data + a, data + b, args);
 +		do_swap(data + a, data + b, elem_size, swp, args);
  		a = b;
  	}
  }
 @@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char
  			   const struct min_heap_callbacks *func, void *args)
  {
  	void *data = heap->data;
 +	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;

  	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
  		return false;

 +	if (!swp)
 +		swp = select_swap_func(data, elem_size);
 +
  	/* Place last element at the root (position 0) and then sift down. */
  	heap->nr--;
  	if (idx == heap->nr)
  		return true;
 -	func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
 +	do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
  	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
  	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);

 _
	From: Kuan-Wei Chiu <visitorckw@gmail.com>
	Subject: lib min_heap: avoid indirect function call by providing default swap
	Date: Sun, 20 Oct 2024 12:01:53 +0800

	The non-inline min heap API can result in an indirect function call to the
	custom swap function. This becomes particularly costly when
	CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are
	expensive in this case.

	To address this, copy the code from lib/sort.c and provide a default
	builtin swap implementation that performs element swaps based on the
	element size. This change allows most users to avoid the overhead of
	indirect function calls, improving efficiency.

	Link: https://lkml.kernel.org/r/20241020040200.939973-4-visitorckw@gmail.com
	Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
	Cc: Adrian Hunter <adrian.hunter@intel.com>
	Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
	Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
	Cc: Coly Li <colyli@suse.de>
	Cc: Ian Rogers <irogers@google.com>
	Cc: Ingo Molnar <mingo@redhat.com>
	Cc: Jiri Olsa <jolsa@kernel.org>
	Cc: Jonathan Corbet <corbet@lwn.net>
	Cc: Kent Overstreet <kent.overstreet@linux.dev>
	Cc: "Liang, Kan" <kan.liang@linux.intel.com>
	Cc: Mark Rutland <mark.rutland@arm.com>
	Cc: Matthew Sakai <msakai@redhat.com>
	Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
	Cc: Namhyung Kim <namhyung@kernel.org>
	Cc: Peter Zijlstra <peterz@infradead.org>
	Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
	---

	include/linux/min_heap.h \| 159 ++++++++++++++++++++++++++++++++++++-
	1 file changed, 156 insertions(+), 3 deletions(-)

	--- a/include/linux/min_heap.h~lib-min_heap-avoid-indirect-function-call-by-providing-default-swap
	+++ a/include/linux/min_heap.h
	@@ -39,6 +39,147 @@ struct min_heap_callbacks {
	};

	/**
	+ * is_aligned - is this pointer & size okay for word-wide copying?
	+ * @base: pointer to data
	+ * @size: size of each element
	+ * @align: required alignment (typically 4 or 8)
	+ *
	+ * Returns true if elements can be copied using word loads and stores.
	+ * The size must be a multiple of the alignment, and the base address must
	+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
	+ *
	+ * For some reason, gcc doesn't know to optimize "if (a & mask \|\| b & mask)"
	+ * to "if ((a \| b) & mask)", so we do that by hand.
	+ */
	+__attribute_const__ __always_inline
	+static bool is_aligned(const void *base, size_t size, unsigned char align)
	+{
	+ unsigned char lsbits = (unsigned char)size;
	+
	+ (void)base;
	+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	+ lsbits \|= (unsigned char)(uintptr_t)base;
	+#endif
	+ return (lsbits & (align - 1)) == 0;
	+}
	+
	+/**
	+ * swap_words_32 - swap two elements in 32-bit chunks
	+ * @a: pointer to the first element to swap
	+ * @b: pointer to the second element to swap
	+ * @n: element size (must be a multiple of 4)
	+ *
	+ * Exchange the two objects in memory. This exploits base+index addressing,
	+ * which basically all CPUs have, to minimize loop overhead computations.
	+ *
	+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
	+ * bottom of the loop, even though the zero flag is still valid from the
	+ * subtract (since the intervening mov instructions don't alter the flags).
	+ * Gcc 8.1.0 doesn't have that problem.
	+ */
	+static __always_inline
	+void swap_words_32(void a, void b, size_t n)
	+{
	+ do {
	+ u32 t = (u32 )(a + (n -= 4));
	+ (u32 )(a + n) = (u32 )(b + n);
	+ (u32 )(b + n) = t;
	+ } while (n);
	+}
	+
	+/**
	+ * swap_words_64 - swap two elements in 64-bit chunks
	+ * @a: pointer to the first element to swap
	+ * @b: pointer to the second element to swap
	+ * @n: element size (must be a multiple of 8)
	+ *
	+ * Exchange the two objects in memory. This exploits base+index
	+ * addressing, which basically all CPUs have, to minimize loop overhead
	+ * computations.
	+ *
	+ * We'd like to use 64-bit loads if possible. If they're not, emulating
	+ * one requires base+index+4 addressing which x86 has but most other
	+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
	+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
	+ * x32 ABI). Are there any cases the kernel needs to worry about?
	+ */
	+static __always_inline
	+void swap_words_64(void a, void b, size_t n)
	+{
	+ do {
	+#ifdef CONFIG_64BIT
	+ u64 t = (u64 )(a + (n -= 8));
	+ (u64 )(a + n) = (u64 )(b + n);
	+ (u64 )(b + n) = t;
	+#else
	+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
	+ u32 t = (u32 )(a + (n -= 4));
	+ (u32 )(a + n) = (u32 )(b + n);
	+ (u32 )(b + n) = t;
	+
	+ t = (u32 )(a + (n -= 4));
	+ (u32 )(a + n) = (u32 )(b + n);
	+ (u32 )(b + n) = t;
	+#endif
	+ } while (n);
	+}
	+
	+/**
	+ * swap_bytes - swap two elements a byte at a time
	+ * @a: pointer to the first element to swap
	+ * @b: pointer to the second element to swap
	+ * @n: element size
	+ *
	+ * This is the fallback if alignment doesn't allow using larger chunks.
	+ */
	+static __always_inline
	+void swap_bytes(void a, void b, size_t n)
	+{
	+ do {
	+ char t = ((char *)a)[--n];
	+ ((char )a)[n] = ((char )b)[n];
	+ ((char *)b)[n] = t;
	+ } while (n);
	+}
	+
	+/*
	+ * The values are arbitrary as long as they can't be confused with
	+ * a pointer, but small integers make for the smallest compare
	+ * instructions.
	+ */
	+#define SWAP_WORDS_64 ((void ()(void , void , void ))0)
	+#define SWAP_WORDS_32 ((void ()(void , void , void ))1)
	+#define SWAP_BYTES ((void ()(void , void , void ))2)
	+
	+/*
	+ * Selects the appropriate swap function based on the element size.
	+ */
	+static __always_inline
	+void select_swap_func(const void base, size_t size)
	+{
	+ if (is_aligned(base, size, 8))
	+ return SWAP_WORDS_64;
	+ else if (is_aligned(base, size, 4))
	+ return SWAP_WORDS_32;
	+ else
	+ return SWAP_BYTES;
	+}
	+
	+static __always_inline
	+void do_swap(void a, void b, size_t size, void (swap_func)(void lhs, void rhs, void args),
	+ void *priv)
	+{
	+ if (swap_func == SWAP_WORDS_64)
	+ swap_words_64(a, b, size);
	+ else if (swap_func == SWAP_WORDS_32)
	+ swap_words_32(a, b, size);
	+ else if (swap_func == SWAP_BYTES)
	+ swap_bytes(a, b, size);
	+ else
	+ swap_func(a, b, priv);
	+}
	+
	+/**
	* parent - given the offset of the child, find the offset of the parent.
	* @i: the offset of the heap element whose parent is sought. Non-zero.
	* @lsbit: a precomputed 1-bit mask, equal to "size & -size"
	@@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_hea
	{
	const unsigned long lsbit = elem_size & -elem_size;
	void *data = heap->data;
	+ void (swp)(void lhs, void rhs, void args) = func->swp;
	/* pre-scale counters for performance */
	size_t a = pos * elem_size;
	size_t b, c, d;
	size_t n = heap->nr * elem_size;

	+ if (!swp)
	+ swp = select_swap_func(data, elem_size);
	+
	/* Find the sift-down path all the way to the leaves. */
	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
	b = func->less(data + c, data + d, args) ? c : d;
	@@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_hea
	c = b;
	while (b != a) {
	b = parent(b, lsbit, elem_size);
	- func->swp(data + b, data + c, args);
	+ do_swap(data + b, data + c, elem_size, swp, args);
	}
	}

	@@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_
	{
	const unsigned long lsbit = elem_size & -elem_size;
	void *data = heap->data;
	+ void (swp)(void lhs, void rhs, void args) = func->swp;
	/* pre-scale counters for performance */
	size_t a = idx * elem_size, b;

	+ if (!swp)
	+ swp = select_swap_func(data, elem_size);
	+
	while (a) {
	b = parent(a, lsbit, elem_size);
	if (func->less(data + b, data + a, args))
	break;
	- func->swp(data + a, data + b, args);
	+ do_swap(data + a, data + b, elem_size, swp, args);
	a = b;
	}
	}
	@@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char
	const struct min_heap_callbacks func, void args)
	{
	void *data = heap->data;
	+ void (swp)(void lhs, void rhs, void args) = func->swp;

	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
	return false;

	+ if (!swp)
	+ swp = select_swap_func(data, elem_size);
	+
	/* Place last element at the root (position 0) and then sift down. */
	heap->nr--;
	if (idx == heap->nr)
	return true;
	- func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
	+ do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);

	_