!2318 objtool: Enable Stack Validation and ORC Generation for ARM64 Kernel Builds

Merge Pull Request from: @popxpo 
 
This pull request introduces objtool support for ARM64 kernel builds, enabling stack validation.

It covers most of the work required to enable objtool on ARM64.

The primary advantage of objtool is its ability to generate ORC stack metadata at build time. This metadata enables reliable stack unwinding at runtime, as described in the [Linux kernel documentation](https://kernel.org/doc/html/next/x86/orc-unwinder.html).
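
As a quick sanity check that the metadata is actually emitted, the built image can be inspected for the ORC sections. This is only a sketch based on the x86 implementation, where the data lives in the `.orc_unwind` and `.orc_unwind_ip` ELF sections; the section names used by the ARM64 port may differ:

```
# Assumption: ORC data is emitted into x86-style .orc_unwind / .orc_unwind_ip sections
readelf -S vmlinux | grep -i orc
```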

However, objtool has revealed issues in certain files that do not adhere to its requirements. To resolve them, this submission includes modifications that bring those files into compliance: where annotations are sufficient, annotations have been added; where no immediate solution was found, the files are temporarily excluded from checking ([see commit](https://gitee.com/popxpo/kernel/commit/7b938607595fd105066f0c851b8a8c1f37a91a69)).

To use objtool's stack validation, the STACK_VALIDATION option must be enabled. When it is enabled, objtool validates the kernel code during the build and issues warnings for code that does not conform to its rules.
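
For reference, a minimal configuration sketch is shown below. It assumes the ARM64 port reuses the existing STACK_VALIDATION and UNWINDER_ORC symbols as x86 does; the exact symbols wired up by this series may differ:

```
# Sketch only: enable the assumed config symbols before building
./scripts/config --enable STACK_VALIDATION --enable UNWINDER_ORC
make olddefconfig
```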

objtool analyzes each .o object file and determines the stack state at every instruction address in it. If a file does not conform to objtool's rules, warnings like the following are issued:

```
./objtool check arch/arm64/kernel/cpu-park.o
arch/arm64/kernel/cpu-park.o: warning: objtool: .text+0x0: unreachable instruction
``` 
 
Link: https://gitee.com/openeuler/kernel/pulls/2318

Reviewed-by: Xu Kuohai <xukuohai@huawei.com> 
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com> 
diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index 9df29a9..d7e8750 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -324,6 +324,89 @@
 
 	# echo 123 244 1 >> set_event_pid
 
+5.5 Stack filters
+---------------------
+
+Trace events can be filtered by their call stacks.  There may be
+various paths that trigger a trace event, but often only some of
+them are of interest.  Once a stack filter is set, the call stack of the
+corresponding event is compared with the stack filter.  An event whose
+call stack matches will appear in the trace output and the rest will
+be discarded.
+
+5.5.1 Expression syntax
+------------------------
+
+Stack filters have the following form (in a regular-expression-like style)::
+  '!'?function('/'(function|'**'))*
+
+In the expression, '!' negates the filter and '**' matches any
+call path (possibly empty).  The top of the call stack will be
+``stack_filter_match``, so the call path will look like
+``**/stack_filter_match``; we therefore recommend adding '**' at the end
+of a stack filter unless you know the implementation details of the tracing system.
+
+The bottom of the call stack can be ignored, so anything matched by
+``work_pending/do_notify_resume/schedule/__schedule/**`` is also
+matched by ``do_notify_resume/schedule/__schedule/**``.
+
+A call stack is matched successfully if the following conditions are
+met simultaneously:
+[1] It matches at least one positive stack filter.
+[2] It does not match any negative stack filter.
+If no positive filters are set, condition [1] does not need to be satisfied.
+
+5.5.2 Setting stack filters
+---------------------------
+
+Stack filters are added by echoing expressions into the 'stack_filter' file
+of a given event, and cleared by echoing 0 or a blank string to the same file.
+
+Some usage examples:
+Set up a kprobe::
+  # echo 1 > /sys/kernel/tracing/options/stacktrace
+  # echo 'p alloc_pages' > /sys/kernel/tracing/kprobe_events
+
+The call stack contains ``do_sys_openat2``::
+  # echo 'do_sys_openat2/**' > \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+
+The call stack doesn't contain ``do_sys_openat2``::
+  # echo '!do_sys_openat2/**' > \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+
+The call stack contains ``do_sys_openat2`` or ``do_translation_fault``,
+but not ``el0_sync_handler``::
+  # echo 'do_sys_openat2/**' > \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+  # echo 'do_translation_fault/**' >> \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+  # echo '!el0_sync_handler/**' >> \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+
+The call stack contains ``el0_sync_handler -> el0_da``::
+  # echo 'el0_sync_handler/el0_da/**' > \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+
+The call stack contains ``el0_sync_handler -> ... -> do_page_fault``::
+  # echo 'el0_sync_handler/**/do_page_fault/**' > \
+  /sys/kernel/tracing/events/kprobes/p_alloc_pages_0/stack_filter
+
+Enable the kprobe event and check the trace log::
+  # echo 1 > /sys/kernel/tracing/events/kprobes/enable
+  # cat /sys/kernel/tracing/trace
+
+Another example::
+  # cd /sys/kernel/tracing/events/sched/sched_switch
+  # echo \
+  'work_pending/do_notify_resume/schedule/__schedule/**' > stack_filter
+  # echo \
+  '!ret_from_fork/**/kthread/worker_thread/schedule/**' >> stack_filter
+  # cat stack_filter
+
+Disable the stack filter::
+  # echo 0 > stack_filter
+  # echo > stack_filter
 
 6. Event triggers
 =================
diff --git a/arch/arm/boot/compressed/kaslr.c b/arch/arm/boot/compressed/kaslr.c
index 3b37c04..0a08ba0 100644
--- a/arch/arm/boot/compressed/kaslr.c
+++ b/arch/arm/boot/compressed/kaslr.c
@@ -15,7 +15,25 @@
 #include <linux/pgtable.h>
 
 #include CONFIG_UNCOMPRESS_INCLUDE
+#define puthex32(val)	__puthex32(#val, (val))
+static noinline void __puthex32(const char *name, u32 val)
+{
+	int i;
 
+	while (*name)
+		putc(*name++);
+	putc(':');
+	for (i = 28; i >= 0; i -= 4) {
+		char c = (val >> i) & 0xf;
+
+		if (c < 10)
+			putc(c + '0');
+		else
+			putc(c + 'a' - 10);
+	}
+	putc('\r');
+	putc('\n');
+}
 struct regions {
 	u32 pa_start;
 	u32 pa_end;
@@ -84,27 +102,46 @@
 	return ret;
 }
 
-static bool regions_intersect(u32 s1, u32 e1, u32 s2, u32 e2)
+#define ALIGN2MDOWN(x) ((x) & ~(SZ_2M - 1))
+#define ALIGN2MUP(x) (((x) & ~(SZ_2M - 1)) + ((((x) & 0x1FFFFF) != 0) << 21))
+#define BITINDEX(x) (((x) & 0xfff00000) >> 21)
+
+
+static u32 set_occupied_region(const void *fdt, struct regions *regions,
+			       u32 start, u32 size, u32 *bitmap)
 {
-	return e1 >= s2 && e2 >= s1;
+	u32 pa, img_start, img_end, i, ret = 0;
+
+	if (regions->image_size <= start
+		&& start - regions->image_size > regions->pa_start)
+		img_start = ALIGN2MUP(start - regions->image_size);
+	else
+		img_start = regions->pa_start;
+	img_end = min((u64)start + (u64)size, (u64)U32_MAX);
+	i = BITINDEX(img_start - regions->pa_start);
+	for (pa = img_start; pa < img_end; pa += SZ_2M, i++) {
+		/* set 'occupied' bit */
+		bitmap[i >> 5] |= BIT(i & 0x1f);
+		++ret;
+	}
+	return ret;
 }
 
-static bool intersects_reserved_region(const void *fdt, u32 start,
-				       u32 end, struct regions *regions)
+static u32 set_fdt_reserved_region(const void *fdt, struct regions *regions,
+				   u32 *bitmap)
 {
-	int subnode, len, i;
+	int subnode, len, i, ret = 0;
 	u64 base, size;
 
 	/* check for overlap with /memreserve/ entries */
 	for (i = 0; i < fdt_num_mem_rsv(fdt); i++) {
 		if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0)
 			continue;
-		if (regions_intersect(start, end, base, base + size))
-			return true;
+		ret += set_occupied_region(fdt, regions, base, size, bitmap);
 	}
 
 	if (regions->reserved_mem < 0)
-		return false;
+		return ret;
 
 	/* check for overlap with static reservations in /reserved-memory */
 	for (subnode = fdt_first_subnode(fdt, regions->reserved_mem);
@@ -134,47 +171,35 @@
 			if (base >= regions->pa_end)
 				continue;
 
-			if (regions_intersect(start, end, base,
-					      min(base + size, (u64)U32_MAX)))
-				return true;
+			ret += set_occupied_region(fdt, regions, base,
+					size, bitmap);
 		}
 	}
-	return false;
+	return ret;
 }
 
-static bool intersects_occupied_region(const void *fdt, u32 start,
-				       u32 end, struct regions *regions)
+static noinline u32 set_occupied_regions(const void *fdt,
+					 struct regions *regions, u32 *bitmap)
 {
-	if (regions_intersect(start, end, regions->zimage_start,
-			      regions->zimage_start + regions->zimage_size))
-		return true;
+	int ret = (ALIGN2MDOWN(regions->pa_end) -
+				ALIGN2MUP(regions->pa_start)) / SZ_2M;
 
-	if (regions_intersect(start, end, regions->initrd_start,
-			      regions->initrd_start + regions->initrd_size))
-		return true;
-
-	if (regions_intersect(start, end, regions->dtb_start,
-			      regions->dtb_start + regions->dtb_size))
-		return true;
-
-	return intersects_reserved_region(fdt, start, end, regions);
-}
-
-static u32 count_suitable_regions(const void *fdt, struct regions *regions,
-				  u32 *bitmap)
-{
-	u32 pa, i = 0, ret = 0;
-
-	for (pa = regions->pa_start; pa < regions->pa_end; pa += SZ_2M, i++) {
-		if (!intersects_occupied_region(fdt, pa,
-						pa + regions->image_size,
-						regions)) {
-			ret++;
-		} else {
-			/* set 'occupied' bit */
-			bitmap[i >> 5] |= BIT(i & 0x1f);
-		}
+	if (regions->pa_end -
+			ALIGN2MDOWN(regions->pa_end) > regions->image_size) {
+		/* still possible for an image */
+		ret++;
 	}
+	if (regions->pa_start & (SZ_2M - 1)) {
+		/* still possible for an image start */
+		ret++;
+	}
+	ret -= set_occupied_region(fdt, regions, regions->zimage_start,
+			regions->zimage_size, bitmap);
+	ret -= set_occupied_region(fdt, regions, regions->initrd_start,
+			regions->initrd_size, bitmap);
+	ret -= set_occupied_region(fdt, regions, regions->dtb_start,
+			regions->dtb_size, bitmap);
+	ret -= set_fdt_reserved_region(fdt, regions, bitmap);
 	return ret;
 }
 
@@ -230,7 +255,7 @@
 
 	get_cell_sizes(fdt, 0, &address_cells, &size_cells);
 
-	while(mem_node >= 0) {
+	while (mem_node >= 0) {
 		/*
 		 * Now find the 'reg' property of the /memory node, and iterate over
 		 * the base/size pairs.
@@ -239,6 +264,7 @@
 		reg = fdt_getprop(fdt, mem_node, "reg", &len);
 		while (len >= 4 * (address_cells + size_cells)) {
 			u64 base, size;
+
 			base = fdt32_to_cpu(reg[0]);
 			if (address_cells == 2)
 				base = (base << 32) | fdt32_to_cpu(reg[1]);
@@ -301,25 +327,8 @@
 	return NULL;
 }
 
-static void __puthex32(const char *name, u32 val)
-{
-	int i;
 
-	while (*name)
-		putc(*name++);
-	putc(':');
-	for (i = 28; i >= 0; i -= 4) {
-		char c = (val >> i) & 0xf;
 
-		if (c < 10)
-			putc(c + '0');
-		else
-			putc(c + 'a' - 10);
-	}
-	putc('\r');
-	putc('\n');
-}
-#define puthex32(val)	__puthex32(#val, (val))
 
 u32 kaslr_early_init(u32 *kaslr_offset, u32 image_base, u32 image_size,
 		     u32 seed, u32 zimage_start, const void *fdt,
@@ -452,9 +461,10 @@
 	 * until we counted enough iterations, and return the offset we ended
 	 * up at.
 	 */
-	count = count_suitable_regions(fdt, &regions, bitmap);
-	puthex32(count);
+	count = set_occupied_regions(fdt, &regions, bitmap);
 
+
+	puthex32(count);
 	num = ((u16)seed * count) >> 16;
 	puthex32(num);
 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index d9fbc48..2a64b09 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -43,6 +43,7 @@
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
+void clear_page_nt(void *page);
 
 static inline void clear_page(void *page)
 {
@@ -54,6 +55,15 @@
 			   : "cc", "memory", "rax", "rcx");
 }
 
+static inline void clear_page_nocache(void *page)
+{
+	alternative_call(clear_page,
+			   clear_page_nt, X86_FEATURE_XMM2,
+			   "=D" (page),
+			   "0" (page)
+			   : "cc", "memory", "rax", "rcx");
+}
+
 void copy_page(void *to, void *from);
 
 void copy_page_nocache(void *to, void *from);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd1..7c16a4b 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -50,3 +50,24 @@
 	ret
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+SYM_FUNC_START(clear_page_nt)
+	xorl %eax,%eax
+	movl $4096/64,%ecx
+	.p2align 4
+.nt_loop:
+	decl %ecx
+	movnti %rax,(%rdi)
+	movnti %rax,8(%rdi)
+	movnti %rax,16(%rdi)
+	movnti %rax,24(%rdi)
+	movnti %rax,32(%rdi)
+	movnti %rax,40(%rdi)
+	movnti %rax,48(%rdi)
+	movnti %rax,56(%rdi)
+	leaq 64(%rdi),%rdi
+	jnz .nt_loop
+	sfence
+	ret
+SYM_FUNC_END(clear_page_nt)
+EXPORT_SYMBOL_GPL(clear_page_nt)
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 01ce3f4..cc7e94b 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -152,6 +152,16 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called softdog.
 
+config WATCHDOG_CFS
+	tristate "CFS starvation detector"
+	help
+	  Support for monitoring CFS threads that are continuously
+	  preempted by high-priority threads.
+
+	  Warnings are rate-limited, so if the detector appears to stop
+	  reporting after a while, the alarm limit may simply have been
+	  reached.
+
 config SOFT_WATCHDOG_PRETIMEOUT
 	bool "Software watchdog pretimeout governor support"
 	depends on SOFT_WATCHDOG && WATCHDOG_PRETIMEOUT_GOV
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 071a2e5..2f208c8 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -227,3 +227,4 @@
 obj-$(CONFIG_RAVE_SP_WATCHDOG) += rave-sp-wdt.o
 obj-$(CONFIG_STPMIC1_WATCHDOG) += stpmic1_wdt.o
 obj-$(CONFIG_SL28CPLD_WATCHDOG) += sl28cpld_wdt.o
+obj-$(CONFIG_WATCHDOG_CFS) += watchcfs.o
diff --git a/drivers/watchdog/watchcfs.c b/drivers/watchdog/watchcfs.c
new file mode 100644
index 0000000..c87e429
--- /dev/null
+++ b/drivers/watchdog/watchcfs.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Watchcfs is a module that monitors CFS threads
+ * being continuously preempted by high-priority
+ * threads (i.e. CFS task starvation).
+ *
+ * Copyright(c) 2023 Zhenhao Guo <1641955581@qq.com>
+ */
+
+#define pr_fmt(fmt) "watchcfs: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/smpboot.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/hrtimer.h>
+#include <linux/moduleparam.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/isolation.h>
+#include <linux/workqueue.h>
+#include <linux/cpuhotplug.h>
+
+static bool __read_mostly watchcfs_enabled = true;
+static uint __read_mostly watchcfs_thresh = 10;
+static u64 __read_mostly sample_period;
+static uint __read_mostly sample_interval = 5;
+static struct cpumask __read_mostly watchcfs_cpumask;
+static struct cpumask __read_mostly watchcfs_allowed_mask;
+static unsigned long *watchcfs_cpumask_bits = cpumask_bits(&watchcfs_cpumask);
+static int watchcfs_cpuhp_state;
+
+static DEFINE_MUTEX(watchcfs_mutex);
+static DEFINE_PER_CPU(unsigned long, watchcfs_touch_ts);
+static DEFINE_PER_CPU(struct hrtimer, watchcfs_hrtimer);
+static DEFINE_PER_CPU(struct work_struct, watchcfs_work);
+static DEFINE_PER_CPU(bool, work_done) = true;
+
+/* Returns seconds, approximately. */
+static unsigned long get_timestamp(void)
+{
+	return local_clock() >> 30LL;  /* 2^30 ~= 10^9 */
+}
+
+static void set_sample_period(void)
+{
+	/*
+	 * convert watchcfs_thresh from seconds to ns
+	 * the divide by sample_interval is to give hrtimer several chances to increment
+	 */
+	sample_period = watchcfs_thresh * ((u64)NSEC_PER_SEC / sample_interval);
+}
+
+static void __touch_watchcfs(void)
+{
+	__this_cpu_write(watchcfs_touch_ts, get_timestamp());
+}
+
+static void watchcfs_work_handler(struct work_struct *data)
+{
+	__touch_watchcfs();
+	__this_cpu_write(work_done, true);
+}
+
+static int is_soft_starve(unsigned long touch_ts)
+{
+	unsigned long now = get_timestamp();
+
+	if (time_after(now, touch_ts + watchcfs_thresh))
+		return now - touch_ts;
+	return 0;
+}
+
+static enum hrtimer_restart watchcfs_timer_fn(struct hrtimer *hrtimer)
+{
+	int duration;
+	unsigned long touch_ts = __this_cpu_read(watchcfs_touch_ts);
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60 * HZ, 5);
+
+	if (__this_cpu_read(work_done)) {
+		__this_cpu_write(work_done, false);
+		queue_work_on(smp_processor_id(), system_wq, this_cpu_ptr(&watchcfs_work));
+	}
+	/* .. and repeat */
+	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+	duration = is_soft_starve(touch_ts);
+	if (unlikely(duration)) {
+		/* Start period for the next softstarve warning. */
+		__touch_watchcfs();
+		if (__ratelimit(&ratelimit)) {
+			pr_emerg("BUG: soft starve - CPU#%d stuck for %us! [%s:%d]\n",
+				smp_processor_id(), duration,
+				current->comm, task_pid_nr(current));
+			dump_stack();
+		}
+	}
+
+	return HRTIMER_RESTART;
+}
+
+static int softstarve_stop_fn(void *data)
+{
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchcfs_hrtimer);
+
+	hrtimer_cancel(hrtimer);
+	return 0;
+}
+
+static void softstarve_stop_all(void)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &watchcfs_allowed_mask)
+		smp_call_on_cpu(cpu, softstarve_stop_fn, NULL, false);
+	cpumask_clear(&watchcfs_allowed_mask);
+}
+
+static int softstarve_start_fn(void *data)
+{
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchcfs_hrtimer);
+	struct work_struct *work = this_cpu_ptr(&watchcfs_work);
+
+	INIT_WORK(work, watchcfs_work_handler);
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = watchcfs_timer_fn;
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+				HRTIMER_MODE_REL_PINNED);
+	/* Initialize timestamp */
+	__touch_watchcfs();
+	return 0;
+}
+
+static void softstarve_start_all(void)
+{
+	int cpu;
+
+	cpumask_copy(&watchcfs_allowed_mask, &watchcfs_cpumask);
+	for_each_cpu(cpu, &watchcfs_allowed_mask)
+		smp_call_on_cpu(cpu, softstarve_start_fn, NULL, false);
+}
+
+static void starve_detector_reconfigure(void)
+{
+	cpus_read_lock();
+	softstarve_stop_all();
+	set_sample_period();
+	if (watchcfs_enabled && watchcfs_thresh)
+		softstarve_start_all();
+	cpus_read_unlock();
+}
+
+/* Handling CPU online situation */
+static int watchcfs_cpu_online(unsigned int cpu)
+{
+	return softstarve_start_fn(NULL);
+}
+
+/* Handling CPU offline situation */
+static int watchcfs_cpu_offline(unsigned int cpu)
+{
+	return softstarve_stop_fn(NULL);
+}
+
+/* Propagate any changes to the watchcfs module */
+static void watchcfs_update(void)
+{
+	cpumask_and(&watchcfs_cpumask, &watchcfs_cpumask, cpu_possible_mask);
+	starve_detector_reconfigure();
+}
+
+static int param_set_common_bool(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	bool old, *param = kp->arg;
+
+	mutex_lock(&watchcfs_mutex);
+	old = READ_ONCE(*param);
+	ret = param_set_bool(val, kp);
+	if (!ret && old != READ_ONCE(*param))
+		watchcfs_update();
+	mutex_unlock(&watchcfs_mutex);
+	return ret;
+}
+
+static int param_set_common_uint(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	uint old, new;
+	uint *param = kp->arg;
+
+	mutex_lock(&watchcfs_mutex);
+	ret = kstrtouint(val, 0, &new);
+	if (!ret && new == 0) {
+		pr_err("Please enter a number greater than 0.\n");
+		mutex_unlock(&watchcfs_mutex);
+		return -EINVAL;
+	}
+	old = READ_ONCE(*param);
+	ret = param_set_uint(val, kp);
+	if (!ret && old != READ_ONCE(*param))
+		watchcfs_update();
+	mutex_unlock(&watchcfs_mutex);
+	return ret;
+}
+
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long new, old;
+	unsigned long *cpumask = *(unsigned long **)kp->arg;
+
+	mutex_lock(&watchcfs_mutex);
+	old = READ_ONCE(*cpumask);
+	ret = kstrtoul(val, 0, &new);
+	if (!ret && old != new) {
+		*cpumask = new;
+		watchcfs_update();
+	}
+	mutex_unlock(&watchcfs_mutex);
+	return ret;
+}
+
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+	unsigned long *cpumask = *(unsigned long **)kp->arg;
+
+	return scnprintf(buffer, PAGE_SIZE, "%lu\n", *cpumask);
+}
+
+static const struct kernel_param_ops watchcfs_enabled_param_ops = {
+	.set = param_set_common_bool,
+	.get = param_get_bool
+};
+module_param_cb(watchcfs_enabled, &watchcfs_enabled_param_ops, &watchcfs_enabled, 0644);
+MODULE_PARM_DESC(watchcfs_enabled, "Enable watchcfs.");
+
+static const struct kernel_param_ops thresh_param_ops = {
+	.set = param_set_common_uint,
+	.get = param_get_uint
+};
+module_param_cb(watchcfs_thresh, &thresh_param_ops, &watchcfs_thresh, 0644);
+MODULE_PARM_DESC(watchcfs_thresh, "Threshold of watchcfs.");
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+	.set = param_set_common_uint,
+	.get = param_get_uint
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &sample_interval, 0644);
+MODULE_PARM_DESC(sample_interval, "Sampling interval of watchcfs. sample_period = watchcfs_thresh / sample_interval");
+
+static const struct kernel_param_ops cpumask_param_ops = {
+	.set = param_set_cpumask,
+	.get = param_get_cpumask
+};
+module_param_cb(watchcfs_cpumask_bits, &cpumask_param_ops, &watchcfs_cpumask_bits, 0644);
+MODULE_PARM_DESC(watchcfs_cpumask_bits, "CPU mask of watchcfs.");
+
+static int __init starve_detector_init(void)
+{
+	if (!(watchcfs_enabled && watchcfs_thresh))
+		return 0;
+
+	set_sample_period();
+	cpumask_copy(&watchcfs_cpumask, housekeeping_cpumask(HK_FLAG_TIMER));
+	cpumask_copy(&watchcfs_allowed_mask, &watchcfs_cpumask);
+	watchcfs_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "watchcfs:online",
+					watchcfs_cpu_online, watchcfs_cpu_offline);
+	if (watchcfs_cpuhp_state < 0) {
+		pr_err("Failed to register 'dyn' cpuhp callbacks in %s()", __func__);
+		return watchcfs_cpuhp_state;
+	}
+	return 0;
+}
+module_init(starve_detector_init);
+
+static void __exit starve_detector_exit(void)
+{
+	cpuhp_remove_state(watchcfs_cpuhp_state);
+}
+module_exit(starve_detector_exit);
+
+MODULE_AUTHOR("Zhenhao Guo");
+MODULE_DESCRIPTION("A module to monitor cfs task starvation");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5c3df92..55d5cf8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -564,6 +564,13 @@
 	return __alloc_pages_node(nid, gfp_mask, order);
 }
 
+#ifdef CONFIG_KZEROD
+struct page *alloc_prezeroed_page(unsigned int order, unsigned int cpuid);
+unsigned long kzerod_get_zeroed_size(unsigned int order);
+void drain_zerod_page(unsigned int order);
+void kzerod_enable_order(unsigned int order);
+#endif
+
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index c3b75b4..6e8c222 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -274,6 +274,12 @@
 alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
 					unsigned long vaddr)
 {
+#ifdef CONFIG_KZEROD
+	struct page *prezerod_page = alloc_prezeroed_page(0, smp_processor_id());
+
+	if (prezerod_page)
+		return prezerod_page;
+#endif
 	return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ae5847..35afee1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -82,12 +82,19 @@
 	 */
 	union {
 		struct {	/* Page cache and anonymous pages */
+#if defined(CONFIG_KZEROD) && !defined(__GENKSYMS__)
+			union {
+				struct list_head lru;
+				struct llist_node kzerod_node;
+			};
+#else
 			/**
 			 * @lru: Pageout list, eg. active_list protected by
 			 * lruvec->lru_lock.  Sometimes used as a generic list
 			 * by the page owner.
 			 */
 			struct list_head lru;
+#endif
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
 			pgoff_t index;		/* Our offset within mapping. */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 409385b..f9928ab 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -186,6 +186,18 @@
 
 struct event_filter;
 
+#define STACK_FILTER_ADDR_MAP_SIZE 31
+
+struct stack_filter_addr_map {
+	struct hlist_head map[STACK_FILTER_ADDR_MAP_SIZE];
+	spinlock_t lock;
+};
+
+struct event_stack_filter {
+	struct list_head filters;
+	struct stack_filter_addr_map *addr_map;
+};
+
 enum trace_reg {
 	TRACE_REG_REGISTER,
 	TRACE_REG_UNREGISTER,
@@ -376,6 +388,7 @@
 	EVENT_FILE_FL_TRIGGER_COND_BIT,
 	EVENT_FILE_FL_PID_FILTER_BIT,
 	EVENT_FILE_FL_WAS_ENABLED_BIT,
+	EVENT_FILE_FL_STACK_FILTER_BIT,
 };
 
 extern struct trace_event_file *trace_get_event_file(const char *instance,
@@ -527,12 +540,16 @@
 	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
 	EVENT_FILE_FL_PID_FILTER	= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
 	EVENT_FILE_FL_WAS_ENABLED	= (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
+	EVENT_FILE_FL_STACK_FILTER	= (1 << EVENT_FILE_FL_STACK_FILTER_BIT),
 };
 
 struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
+#ifdef CONFIG_TRACE_EVENT_STACK_FILTER
+	struct event_stack_filter __rcu	*stack_filter;
+#endif
 	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
@@ -596,6 +613,27 @@
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
+#ifdef CONFIG_TRACE_EVENT_STACK_FILTER
+extern int stack_filter_match(struct event_stack_filter *stack_filter);
+
+static inline struct event_stack_filter *
+get_stack_filter(struct trace_event_file *file)
+{
+	return rcu_dereference(file->stack_filter);
+}
+#else
+static inline int stack_filter_match(struct event_stack_filter *stack_filter)
+{
+	return 1;
+}
+
+static inline struct event_stack_filter *
+get_stack_filter(struct trace_event_file *file)
+{
+	return NULL;
+}
+#endif
+
 extern enum event_trigger_type
 event_triggers_call(struct trace_event_file *file, void *rec,
 		    struct ring_buffer_event *event);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9682ceb..8a3a392a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -932,6 +932,14 @@
 
           If unsure, say N.
 
+config TRACE_EVENT_STACK_FILTER
+	bool "Enable call stack filter for trace events"
+	default n
+	depends on STACKTRACE
+	help
+	  This option enables the call stack filter for trace events.
+	  See Documentation/trace/events.rst for details.
+
 endif # FTRACE
 
 endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 8ab4d42..3288c46 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -77,6 +77,9 @@
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+ifeq ($(CONFIG_TRACE_EVENT_STACK_FILTER),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_events_stack_filter.o
+endif
 obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
 obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
 obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b3a0ee2..0c5bdf8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2782,7 +2782,7 @@
 	*current_rb = trace_file->tr->array_buffer.buffer;
 
 	if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
-	     (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+	    (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED | EVENT_FILE_FL_STACK_FILTER)) &&
 	    (entry = this_cpu_read(trace_buffered_event))) {
 		/* Try to use the per cpu buffer first */
 		val = this_cpu_inc_return(trace_buffered_event_cnt);
@@ -2837,6 +2837,11 @@
 	     !filter_match_preds(file->filter, fbuffer->entry)))
 		return;
 
+	if (IS_ENABLED(CONFIG_TRACE_EVENT_STACK_FILTER) &&
+	    unlikely(file->flags & EVENT_FILE_FL_STACK_FILTER) &&
+	    !stack_filter_match(get_stack_filter(file)))
+		return;
+
 	event = &fbuffer->trace_file->event_call->event;
 
 	spin_lock_irqsave(&tracepoint_iter_lock, flags);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c0596e2..78c7d5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1512,6 +1512,7 @@
 
 	if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
 				    EVENT_FILE_FL_FILTERED |
+				    EVENT_FILE_FL_STACK_FILTER |
 				    EVENT_FILE_FL_PID_FILTER))))
 		return false;
 
@@ -1522,6 +1523,11 @@
 	    !filter_match_preds(file->filter, entry))
 		goto discard;
 
+	if (IS_ENABLED(CONFIG_TRACE_EVENT_STACK_FILTER) &&
+	    (file->flags & EVENT_FILE_FL_STACK_FILTER) &&
+	    !stack_filter_match(get_stack_filter(file)))
+		goto discard;
+
 	if ((file->flags & EVENT_FILE_FL_PID_FILTER) &&
 	    trace_event_ignore_this_pid(file))
 		goto discard;
@@ -1694,6 +1700,10 @@
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
+#ifdef CONFIG_TRACE_EVENT_STACK_FILTER
+extern const struct file_operations event_stack_filter_fops;
+#endif
+
 extern const struct file_operations event_trigger_fops;
 extern const struct file_operations event_hist_fops;
 extern const struct file_operations event_hist_debug_fops;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f4b11f6..f9e604f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2211,6 +2211,11 @@
 		trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
 				  file, &ftrace_event_filter_fops);
 
+#ifdef CONFIG_TRACE_EVENT_STACK_FILTER
+		trace_create_file("stack_filter", TRACE_MODE_WRITE, file->dir,
+				  file, &event_stack_filter_fops);
+#endif
+
 		trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
 				  file, &event_trigger_fops);
 	}
diff --git a/kernel/trace/trace_events_stack_filter.c b/kernel/trace/trace_events_stack_filter.c
new file mode 100644
index 0000000..9e79d478
--- /dev/null
+++ b/kernel/trace/trace_events_stack_filter.c
@@ -0,0 +1,819 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include "trace.h"
+
+#define TP_BUF_SIZE 1023	/* trace parser buf size */
+#define CS_BUF_SIZE 64		/* call stack buf size */
+
+#define MAX_SF_LEN 64		/* max stack filter length */
+#define DSTARS_ADDR 1		/* '**' wildcard */
+
+#define list_length(head) ({ \
+	int __len = 0; \
+	struct list_head *__pos; \
+	list_for_each(__pos, head) \
+		__len++; \
+	__len; \
+})
+
+#define ADDR_MAP_HASH(key) \
+	(((key) >> 2) % STACK_FILTER_ADDR_MAP_SIZE)
+
+struct function_address {
+	struct list_head list;
+	size_t addr;			/* some addresses may represent wildcards */
+};
+
+struct stack_filter {
+	struct list_head list;
+	char *string;			/* original string */
+	struct list_head addrs;		/* function addresses */
+	bool neg;			/* negate the filter */
+};
+
+struct addr_map_node {
+	struct hlist_node node;
+	unsigned long key;
+	unsigned long value;
+};
+
+static inline void
+function_address_list_clear(struct list_head *faddrs)
+{
+	struct function_address *faddr, *tmp;
+
+	list_for_each_entry_safe(faddr, tmp, faddrs, list) {
+		list_del(&faddr->list);
+		kfree(faddr);
+	}
+}
+
+static inline int
+function_address_list_copy(struct list_head *copy, struct list_head *faddrs)
+{
+	struct function_address *faddr, *new_faddr;
+
+	INIT_LIST_HEAD(copy);
+	list_for_each_entry_reverse(faddr, faddrs, list) {
+		new_faddr = kmalloc(sizeof(*new_faddr), GFP_KERNEL);
+		if (!new_faddr) {
+			function_address_list_clear(copy);
+			return -ENOMEM;
+		}
+		new_faddr->addr = faddr->addr;
+		list_add(&new_faddr->list, copy);
+	}
+	return 0;
+}
+
+static inline void
+stack_filter_init(struct stack_filter *filter)
+{
+	INIT_LIST_HEAD(&filter->addrs);
+}
+
+static inline struct stack_filter *
+stack_filter_new(void)
+{
+	struct stack_filter *filter;
+
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!filter)
+		return NULL;
+
+	stack_filter_init(filter);
+	return filter;
+}
+
+static inline void
+stack_filter_free(struct stack_filter *filter)
+{
+	struct function_address *faddr, *tmp;
+
+	list_for_each_entry_safe(faddr, tmp, &filter->addrs, list) {
+		list_del(&faddr->list);
+		kfree(faddr);
+	}
+
+	kfree(filter->string);
+	kfree(filter);
+}
+
+static inline int
+stack_filter_copy(struct stack_filter *copy, struct stack_filter *filter)
+{
+	int ret = 0;
+
+	copy->string = kstrdup(filter->string, GFP_KERNEL);
+	if (!copy->string)
+		return -ENOMEM;
+
+	ret = function_address_list_copy(&copy->addrs, &filter->addrs);
+	if (ret < 0) {
+		kfree(copy->string);
+		return ret;
+	}
+
+	copy->neg = filter->neg;
+	return 0;
+}
+
+static inline void
+stack_filter_list_clear(struct list_head *filters)
+{
+	struct stack_filter *filter, *tmp;
+
+	list_for_each_entry_safe(filter, tmp, filters, list) {
+		list_del(&filter->list);
+		stack_filter_free(filter);
+	}
+}
+
+static inline int
+stack_filter_list_copy(struct list_head *copy, struct list_head *filters)
+{
+	int ret = 0;
+	struct stack_filter *filter, *new_filter;
+
+	/* merge initialization with copy */
+	INIT_LIST_HEAD(copy);
+	list_for_each_entry_reverse(filter, filters, list) {
+		new_filter = kmalloc(sizeof(*new_filter), GFP_KERNEL);
+		if (!new_filter) {
+			ret = -ENOMEM;
+			goto bad;
+		}
+
+		ret = stack_filter_copy(new_filter, filter);
+		if (ret < 0)
+			goto bad;
+
+		list_add(&new_filter->list, copy);
+	}
+	return 0;
+
+ bad:
+	stack_filter_list_clear(copy);
+	return ret;
+}
+
+static inline void
+stack_filter_enable(struct trace_event_file *file)
+{
+	unsigned long old_flags = file->flags;
+
+	file->flags |= EVENT_FILE_FL_STACK_FILTER;
+	if (file->flags != old_flags)
+		trace_buffered_event_enable();
+}
+
+static inline void
+stack_filter_disable(struct trace_event_file *file)
+{
+	unsigned long old_flags = file->flags;
+
+	file->flags &= ~EVENT_FILE_FL_STACK_FILTER;
+	if (file->flags != old_flags)
+		trace_buffered_event_disable();
+}
+
+static inline void
+addr_map_init(struct stack_filter_addr_map *addr_map)
+{
+	int i;
+
+	for (i = 0; i < STACK_FILTER_ADDR_MAP_SIZE; i++)
+		INIT_HLIST_HEAD(&addr_map->map[i]);
+	spin_lock_init(&addr_map->lock);
+}
+
+/*
+ * Typically, the number of functions in the call stack of a trace event
+ * is not large, so we use a simple hash table to store the mapping,
+ * without limiting its cache size.
+ */
+static inline int
+addr_map_insert(struct stack_filter_addr_map *addr_map, unsigned long key, unsigned long value)
+{
+	struct addr_map_node *node;
+	int idx, ret = 0;
+	unsigned long flags;
+
+	idx = ADDR_MAP_HASH(key);
+	spin_lock_irqsave(&addr_map->lock, flags);
+
+	hlist_for_each_entry(node, &addr_map->map[idx], node) {
+		/* new value is always the same as the old here... maybe */
+		if (node->key == key)
+			goto out;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_ATOMIC);
+	if (!node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	node->key = key;
+	node->value = value;
+
+	hlist_add_head_rcu(&node->node, &addr_map->map[idx]);
+
+ out:
+	spin_unlock_irqrestore(&addr_map->lock, flags);
+	return ret;
+}
+
+static inline unsigned long
+addr_map_get(struct stack_filter_addr_map *addr_map, unsigned long key)
+{
+	struct addr_map_node *node;
+	int idx;
+	unsigned long ret = 0; /* value can't be 0 */
+
+	idx = ADDR_MAP_HASH(key);
+	/* nested critical section, not necessary in fact */
+	rcu_read_lock_sched();
+
+	hlist_for_each_entry_rcu(node, &addr_map->map[idx], node) {
+		if (node->key == key) {
+			ret = node->value;
+			goto out;
+		}
+	}
+
+ out:
+	rcu_read_unlock_sched();
+	return ret;
+}
+
+/* require holding event_mutex */
+static inline void
+addr_map_clear(struct hlist_head *addr_map)
+{
+	int i;
+	struct addr_map_node *node;
+	struct hlist_node *tmp;
+
+	for (i = 0; i < STACK_FILTER_ADDR_MAP_SIZE; i++) {
+		hlist_for_each_entry_safe(node, tmp, &addr_map[i], node) {
+			hlist_del(&node->node);
+			kfree(node);
+		}
+	}
+}
+
+static inline void
+addr_map_free(struct stack_filter_addr_map *addr_map)
+{
+	addr_map_clear(addr_map->map);
+	kfree(addr_map);
+}
+
+static inline void
+event_stack_filter_init(struct event_stack_filter *esf)
+{
+	INIT_LIST_HEAD(&esf->filters);
+
+	/* addr_map should be pre-allocated, just init it here */
+	addr_map_init(esf->addr_map);
+}
+
+static inline struct event_stack_filter *
+event_stack_filter_new(void)
+{
+	struct event_stack_filter *esf;
+
+	esf = kmalloc(sizeof(*esf), GFP_KERNEL);
+	if (!esf)
+		return NULL;
+
+	esf->addr_map = kmalloc(sizeof(*esf->addr_map), GFP_KERNEL);
+	if (!esf->addr_map) {
+		kfree(esf);
+		return NULL;
+	}
+
+	event_stack_filter_init(esf);
+	return esf;
+}
+
+static inline void
+event_stack_filter_free(struct event_stack_filter *esf, bool free_addr_map)
+{
+	stack_filter_list_clear(&esf->filters);
+
+	/*
+	 * addr_map may be passed to a new event_stack_filter,
+	 * in this situation, we cannot free it.
+	 */
+	if (free_addr_map)
+		addr_map_free(esf->addr_map);
+
+	kfree(esf);
+}
+
+/* require holding event_mutex */
+static inline int
+event_stack_filter_copy(struct event_stack_filter *copy,
+			struct event_stack_filter *esf)
+{
+	int ret;
+
+	ret = stack_filter_list_copy(&copy->filters, &esf->filters);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Do not deep-copy addr_map here, to speed up the copy.
+	 * Be careful about this when using or freeing addr_map.
+	 */
+	copy->addr_map = esf->addr_map;
+	return 0;
+}
+
+/*
+ * require holding event_mutex
+ * combine new and copy
+ */
+static inline struct event_stack_filter *
+event_stack_filter_clone(struct event_stack_filter *esf)
+{
+	struct event_stack_filter *copy;
+
+	copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+	if (!copy)
+		return NULL;
+
+	if (event_stack_filter_copy(copy, esf) < 0) {
+		kfree(copy);
+		return NULL;
+	}
+
+	return copy;
+}
+
+/*
+ * parse a string with the form below:
+ *   '!'?function(/(function|'**'))*
+ * where:
+ *   '!' negates the filter
+ *   '**' matches any function call path
+ * e.g.
+ *   [1] work_pending/do_notify_resume/schedule/__schedule/'**'
+ *   [2] '**'/kthread/kcompactd/schedule_timeout/schedule/'**'
+ *   [3] !el0_sync/el0_sync_handler/'**'/invoke_syscall/'**'/schedule/'**'
+ *   [4] !ret_from_fork/'**'/kthread/worker_thread/schedule/'**'
+ * Please remove '' around '**' if you want to use it.
+ *
+ * The full call path will end at stack_filter_match function,
+ * like
+ *   work_pending/do_notify_resume/schedule/__schedule/\
+ *   trace_event_raw_event_sched_switch/trace_event_buffer_commit/stack_filter_match.
+ *
+ * We recommend that you use '**' at the end of the string,
+ * because it will match any function call path,
+ * so that you don't have to know the deeper call path.
+ *
+ * Call paths that match example [1] also match
+ *   schedule/__schedule/'**' or '**'/schedule/__schedule/'**',
+ * because we match call stacks, not the full path, to speed up filtering.
+ * Function calls at the bottom of stack will be ignored.
+ *
+ * We convert symbols to their addresses here to avoid
+ * changing stacktrace addresses to their names at runtime,
+ * which would greatly slow down the function call.
+ * The downside is that we can't handle '*' wildcard.
+ */
+static int
+stack_filter_parse(struct stack_filter *filter, char *buf)
+{
+	char *p = buf;
+	char name[NAME_MAX + 1];
+	struct function_address *faddr, *tmp;
+	size_t addr;
+	int i, len = 0, ret = 0;
+
+	if (*p == '!') {
+		filter->neg = true;
+		p++;
+	}
+	if (*p == '\0')
+		return -EINVAL;
+
+	while (*p) {
+		i = 0;
+		while (*p && *p != '/') {
+			name[i++] = *(p++);
+			if (i > NAME_MAX) {
+				ret = -EINVAL;
+				goto bad;
+			}
+		}
+		name[i] = '\0';
+
+		while (*p == '/')
+			p++;
+
+		if (!strcmp(name, "**")) {
+			/* wildcard '**' */
+			addr = DSTARS_ADDR;
+		} else {
+			/* function name (maybe empty) */
+			addr = kallsyms_lookup_name(name);
+			if (!addr) {
+				ret = -EINVAL;
+				goto bad;
+			}
+		}
+
+		/* remove repetitive '**' */
+		if (addr == DSTARS_ADDR && !list_empty(&filter->addrs)) {
+			faddr = list_first_entry(&filter->addrs, struct function_address, list);
+
+			if (faddr->addr == DSTARS_ADDR)
+				continue;
+		}
+
+		if (++len > MAX_SF_LEN) {
+			ret = -EINVAL;
+			goto bad;
+		}
+
+		faddr = kzalloc(sizeof(*faddr), GFP_KERNEL);
+		if (!faddr) {
+			ret = -ENOMEM;
+			goto bad;
+		}
+
+		faddr->addr = addr;
+		list_add(&faddr->list, &filter->addrs);
+	}
+
+	if (list_empty(&filter->addrs))
+		return -EINVAL;
+
+	/* save original string as well */
+	filter->string = kstrdup(buf, GFP_KERNEL);
+	if (!filter->string) {
+		ret = -ENOMEM;
+		goto bad;
+	}
+
+	return ret;
+
+ bad:
+	list_for_each_entry_safe(faddr, tmp, &filter->addrs, list) {
+		list_del(&faddr->list);
+		kfree(faddr);
+	}
+	return ret;
+}
+
+static bool
+__stack_filter_match_one(struct stack_filter *filter,
+			 unsigned long *buf, int num_entries, bool *dp)
+{
+	int num_faddrs, i, j;
+	bool ok;
+	struct function_address *faddr;
+
+	num_faddrs = list_length(&filter->addrs);
+
+#define pos(i, j) ((i) * (num_faddrs + 1) + (j))
+
+	/* dynamic programming */
+	dp[pos(0, 0)] = true;
+	ok = false;
+
+	for (i = 0; i <= num_entries; i++) {
+		faddr = list_entry(&filter->addrs, struct function_address, list);
+		for (j = 1; j <= num_faddrs; j++) {
+			faddr = list_next_entry(faddr, list);
+			dp[pos(i, j)] = false;
+
+			if (faddr->addr == DSTARS_ADDR) {
+				dp[pos(i, j)] = dp[pos(i, j - 1)];
+				if (i > 0)
+					dp[pos(i, j)] |= dp[pos(i - 1, j)];
+			} else if (i > 0 && buf[i - 1] == faddr->addr)
+				dp[pos(i, j)] = dp[pos(i - 1, j - 1)];
+		}
+
+		if (dp[pos(i, num_faddrs)]) {
+			ok = true;
+			break;
+		}
+	}
+
+#undef pos
+
+	return ok;
+}
+
+/* return 0 on error */
+static inline unsigned long
+addr_remove_offset(struct event_stack_filter *esf, unsigned long addr)
+{
+	unsigned long new_addr;
+	char name[KSYM_NAME_LEN];
+
+	/*
+	 * This operation is very slow,
+	 * so we use a small cache to optimize it.
+	 */
+	new_addr = addr_map_get(esf->addr_map, addr);
+	if (new_addr)
+		return new_addr;
+
+	if (lookup_symbol_name(addr, name) < 0)
+		return 0;
+
+	new_addr = kallsyms_lookup_name(name);
+	if (!new_addr)
+		return 0;
+
+	if (addr_map_insert(esf->addr_map, addr, new_addr) < 0)
+		return 0;
+
+	return new_addr;
+}
+
+/*
+ * return 1 on matching and 0 otherwise.
+ *
+ * A call path is matched successfully if the following conditions are met simultaneously:
+ * [1] It matches at least one positive stack filter.
+ * [2] It doesn't match any negative stack filter.
+ * If no positive filters are set, condition [1] does not need to be satisfied.
+ */
+int stack_filter_match(struct event_stack_filter *esf)
+{
+	int i, num_entries, num_faddrs;
+	int size, maxsize;
+	bool hasp, okp, *dp;
+	struct stack_filter *filter;
+	unsigned long buf[CS_BUF_SIZE], new_addr;
+	struct list_head *stack_filters;
+
+	/*
+	 * We have already been inside rcu_read_lock_sched critical section.
+	 * It's safe to visit esf.
+	 */
+	if (!esf)
+		return 1;
+
+	stack_filters = &esf->filters;
+	if (list_empty(stack_filters))
+		return 1;
+
+	num_entries = stack_trace_save(buf, CS_BUF_SIZE, 0);
+
+	for (i = num_entries - 1; i >= 0; i--) {
+		/*
+		 * buf[i] contains addr of a symbol plus an offset.
+		 * We should remove the offset here.
+		 */
+		new_addr = addr_remove_offset(esf, buf[i]);
+		if (new_addr)
+			buf[i] = new_addr;
+	}
+
+	/* pre allocate memory for dp */
+	maxsize = 0;
+	list_for_each_entry(filter, stack_filters, list) {
+		num_faddrs = list_length(&filter->addrs);
+		size = (num_entries + 1) * (num_faddrs + 1);
+
+		if (size > maxsize)
+			maxsize = size;
+	}
+
+	dp = kmalloc(maxsize, GFP_ATOMIC);
+	if (!dp)
+		return 0;
+
+	hasp = 0; okp = 0;
+	list_for_each_entry(filter, stack_filters, list) {
+		if (!filter->neg) {
+			hasp = 1;
+			if (__stack_filter_match_one(filter, buf, num_entries, dp)) {
+				okp = 1;
+				break;
+			}
+		}
+	}
+	if (hasp && !okp)
+		goto bad_match;
+
+	list_for_each_entry(filter, stack_filters, list) {
+		if (filter->neg && __stack_filter_match_one(filter, buf, num_entries, dp))
+			goto bad_match;
+	}
+
+	kfree(dp);
+	return 1;
+
+ bad_match:
+	kfree(dp);
+	return 0;
+}
+
+/*
+ * use seq_file APIs to read from stack_filters
+ */
+static void *sf_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_event_file *file;
+	loff_t n = *pos;
+
+	mutex_lock(&event_mutex);
+	file = m->private;
+
+	if (!file->stack_filter)
+		return NULL;
+
+	return seq_list_start(&file->stack_filter->filters, n);
+}
+
+static void *sf_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_event_file *file = m->private;
+
+	return seq_list_next(v, &file->stack_filter->filters, pos);
+}
+
+static void sf_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&event_mutex);
+}
+
+static int sf_show(struct seq_file *m, void *v)
+{
+	struct stack_filter *filter = v;
+
+	seq_printf(m, "%s\n", filter->string);
+	return 0;
+}
+
+const struct seq_operations stack_filter_seq_ops = {
+	.start = sf_start,
+	.stop = sf_stop,
+	.next = sf_next,
+	.show = sf_show,
+};
+
+/*
+ * operations for stack_filter file
+ * not for 'struct event_stack_filter'
+ */
+static ssize_t
+event_stack_filter_write(struct file *filp, const char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	struct trace_event_file *event_file;
+	struct trace_parser parser;
+	struct stack_filter *filter;
+	struct event_stack_filter *esf, *old;
+	int read, ret;
+
+	filter = stack_filter_new();
+	if (!filter)
+		return -ENOMEM;
+
+	if (trace_parser_get_init(&parser, TP_BUF_SIZE + 1)) {
+		kfree(filter);
+		return -ENOMEM;
+	}
+
+	read = trace_get_user(&parser, ubuf, cnt, ppos);
+
+	if (read >= 0 && trace_parser_loaded(&parser)) {
+		/*
+		 * e.g. use 'echo 0 > stack_filter' to disable the stack filter.
+		 * Most data structures have already been cleared in event_stack_filter_open;
+		 * just do some checks here to avoid reporting an error.
+		 */
+		if (!strcmp(strstrip(parser.buffer), "0")) {
+			kfree(filter);
+			trace_parser_put(&parser);
+
+			event_file = event_file_data(filp);
+			if (!rcu_dereference(event_file->stack_filter))
+				return read;
+
+			/* maybe use append mode or something else */
+			return -EINVAL;
+		}
+
+		ret = stack_filter_parse(filter, parser.buffer);
+		if (ret < 0) {
+			kfree(filter);
+			trace_parser_put(&parser);
+			return ret;
+		}
+	} else {
+		kfree(filter);
+		goto out;
+	}
+
+	mutex_lock(&event_mutex);
+	event_file = event_file_data(filp);
+
+	if (event_file->stack_filter) {
+		/*
+		 * Copy the old and replace it with the new one to follow rcu rules.
+		 * It doesn't cost much time since this function is rarely called,
+		 * and it keeps the code simple.
+		 *
+		 * We didn't use a separate rcu for stack_filter->filters
+		 * since its elements cannot be deleted one by one.
+		 */
+		esf = event_stack_filter_clone(event_file->stack_filter);
+		if (!esf) {
+			mutex_unlock(&event_mutex);
+			stack_filter_free(filter);
+			goto out;
+		}
+		list_add_tail(&filter->list, &esf->filters);
+
+		old = event_file->stack_filter;
+		rcu_assign_pointer(event_file->stack_filter, esf);
+
+		/* make sure old esf is not being used */
+		tracepoint_synchronize_unregister();
+		event_stack_filter_free(old, false);
+
+	} else {
+		esf = event_stack_filter_new();
+		if (!esf) {
+			mutex_unlock(&event_mutex);
+			stack_filter_free(filter);
+			goto out;
+		}
+		list_add_tail(&filter->list, &esf->filters);
+
+		rcu_assign_pointer(event_file->stack_filter, esf);
+		tracepoint_synchronize_unregister();
+
+		stack_filter_enable(event_file);
+	}
+
+	mutex_unlock(&event_mutex);
+
+ out:
+	trace_parser_put(&parser);
+	return read;
+}
+
+static int event_stack_filter_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct trace_event_file *event_file;
+	struct event_stack_filter *esf;
+	struct seq_file *seq;
+
+	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	if (ret)
+		return ret;
+
+	mutex_lock(&event_mutex);
+
+	event_file = inode->i_private;
+	if (!event_file) {
+		mutex_unlock(&event_mutex);
+		return -ENODEV;
+	}
+
+	if ((filp->f_mode & FMODE_WRITE) && (filp->f_flags & O_TRUNC)) {
+		stack_filter_disable(event_file);
+
+		if (event_file->stack_filter) {
+			esf = event_file->stack_filter;
+			RCU_INIT_POINTER(event_file->stack_filter, NULL);
+
+			/* wait until esf is not being used */
+			tracepoint_synchronize_unregister();
+			event_stack_filter_free(esf, true);
+		}
+	}
+
+	ret = seq_open(filp, &stack_filter_seq_ops);
+	if (!ret) {
+		seq = filp->private_data;
+		seq->private = inode->i_private;
+	}
+
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+const struct file_operations event_stack_filter_fops = {
+	.open = event_stack_filter_open,
+	.read = seq_read,
+	.write = event_stack_filter_write,
+	.llseek = tracing_lseek,
+	.release = seq_release,
+};
diff --git a/mm/Kconfig b/mm/Kconfig
index 5107934..2fe8f60 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -815,6 +815,19 @@
 	  lifetime of the system until these kthreads finish the
 	  initialisation.
 
+config KZEROD
+	bool "Support kzerod (EXPERIMENTAL)"
+	default n
+	depends on NEED_MULTIPLE_NODES
+	depends on X86 || ARM64
+	help
+	  Support the kzerod kernel thread, which pre-zeroes free pages.
+
+	  This is marked experimental because it is a new feature. It
+	  interacts heavily with the anonymous page fault path (including
+	  anonymous THP) and pins some unmovable system memory, which
+	  may cause unforeseen issues.
+
 config PAGE_IDLE_FLAG
 	bool
 	select PAGE_EXTENSION if !64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 366d9f6..d423847 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
+obj-$(CONFIG_KZEROD) += kzerod.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d720c43..5b37e84e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -201,7 +201,13 @@
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 	} else
 		ret = -EINVAL;
-
+#ifdef CONFIG_KZEROD
+	if (sysfs_streq(buf, "always") || sysfs_streq(buf, "madvise")) {
+		kzerod_enable_order(HPAGE_PMD_ORDER);
+	} else if (sysfs_streq(buf, "never")) {
+		drain_zerod_page(HPAGE_PMD_ORDER);
+	}
+#endif
 	if (ret > 0) {
 		int err = start_stop_khugepaged();
 		if (err)
@@ -592,7 +598,7 @@
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
-			struct page *page, gfp_t gfp)
+			struct page *page, gfp_t gfp, bool prezeroed)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	pgtable_t pgtable;
@@ -615,7 +621,12 @@
 		goto release;
 	}
 
+#ifdef CONFIG_KZEROD
+	if (!prezeroed)
+		clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
+#else
 	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
+#endif
 	/*
 	 * The memory barrier inside __SetPageUptodate makes sure that
 	 * clear_huge_page writes become visible before the set_pmd_at()
@@ -728,6 +739,7 @@
 	gfp_t gfp;
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	bool prezeroed = false;
 
 	if (!transhuge_vma_suitable(vma, haddr))
 		return VM_FAULT_FALLBACK;
@@ -774,13 +786,21 @@
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
+#ifdef CONFIG_KZEROD
+	page = alloc_prezeroed_page(HPAGE_PMD_ORDER, smp_processor_id());
+	if (page)
+		prezeroed = true;
+	else
+		page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+#else
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+#endif
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
 	prep_transhuge_page(page);
-	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+	return __do_huge_pmd_anonymous_page(vmf, page, gfp, prezeroed);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/kzerod.c b/mm/kzerod.c
new file mode 100644
index 0000000..f0ca132
--- /dev/null
+++ b/mm/kzerod.c
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel zero pages daemon for anonymous memory
+ */
+#include <uapi/linux/sched/types.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/kfifo.h>
+#include <linux/llist.h>
+#include <linux/atomic.h>
+#include <linux/mmzone.h>
+
+#define KZEROD_CPUID_SIZE 16
+#define GFP_MASK                                                               \
+	((GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NORETRY | __GFP_NOWARN)
+#define GFP_THP_MASK (GFP_TRANSHUGE_LIGHT | __GFP_NORETRY)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define KZEROD_POOL_SIZE 2
+/* Default thp water mark. */
+static unsigned short thp_wmark_per_numa = 20;
+#else
+#define KZEROD_POOL_SIZE 1
+#endif
+DECLARE_WAIT_QUEUE_HEAD(kzerod_wait);
+DEFINE_SPINLOCK(kfifo_lock);
+static unsigned short wmark_per_numa = 5;
+static unsigned int kzerod_order_map;
+static bool kzerod_enabled;
+static struct task_struct *task_kzerod;
+struct kfifo task_queue;
+static struct kzerod_node **kzerod_pool;
+static struct kzerod_numa_node **kzerod_per_numa;
+struct kzerod_node {
+	unsigned int wmark_high;
+	unsigned int wmark_low;
+	spinlock_t lock;
+	atomic_t cur_nr;
+	bool processing;
+	unsigned short cpuid;
+	struct llist_head zerod_pages;
+};
+
+struct kzerod_numa_node {
+	unsigned short cpu_nr;
+	atomic_t cur;
+	struct kzerod_node **per_cpu_node;
+};
+
+static inline unsigned int kzerod_get_idx(unsigned int order)
+{
+	switch (order) {
+	case 0:
+		return 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	case HPAGE_PMD_ORDER:
+		return 1;
+#endif
+	}
+	return -1;
+}
+
+static inline unsigned int kzerod_get_order(unsigned int idx)
+{
+	switch (idx) {
+	case 0:
+		return 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	case 1:
+		return HPAGE_PMD_ORDER;
+#endif
+	}
+	return -1;
+}
+
+static inline struct kzerod_node *kzerod_get_node(unsigned int order,
+						  unsigned int cpuid)
+{
+	unsigned int idx;
+	struct kzerod_numa_node *node =
+		&kzerod_per_numa[kzerod_get_idx(order)][cpu_to_node(cpuid)];
+
+	idx = ((unsigned int)atomic_inc_return(&node->cur)) % node->cpu_nr;
+	return node->per_cpu_node[idx];
+}
+
+void kzerod_enable_order(unsigned int order)
+{
+	kzerod_order_map |= 1U << order;
+}
+
+static inline void kzerod_disable_order(unsigned int order)
+{
+	kzerod_order_map &= ~(1U << order);
+}
+
+static inline bool kzerod_check_order(unsigned int order)
+{
+	return (kzerod_order_map & (1U << order)) != 0;
+}
+
+/* Get kzerod size by order. */
+unsigned long kzerod_get_zeroed_size(unsigned int order)
+{
+	int cpuid;
+	unsigned long ret = 0;
+	unsigned int idx = kzerod_get_idx(order);
+	unsigned int nr = 1 << order;
+
+	if (!kzerod_enabled)
+		return 0;
+	for (cpuid = 0; cpuid < nr_cpu_ids; cpuid++)
+		ret += atomic_read(&kzerod_pool[idx][cpuid].cur_nr) * nr;
+	return ret;
+}
+
+/* Update water mark for a kerod node. */
+static inline void kzerod_update_wmark(unsigned int order, unsigned int cpuid)
+{
+	int idx = kzerod_get_idx(order);
+	int node = cpu_to_node(cpuid);
+	struct kzerod_node *zn = &kzerod_pool[idx][cpuid];
+	unsigned long long free_pages = NODE_DATA(node)->node_present_pages;
+	/* `cpus_nr` means online cpu number in this numa node. */
+	unsigned int cpus_nr = cpumask_weight(cpumask_of_node(node));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (order == HPAGE_PMD_ORDER)
+		zn->wmark_high = free_pages / (1 << order) *
+				 thp_wmark_per_numa / cpus_nr / 100;
+	else
+		zn->wmark_high = free_pages * wmark_per_numa / cpus_nr / 100;
+#else
+	zn->wmark_high = free_pages * wmark_per_numa / cpus_nr / 100;
+#endif
+
+	zn->wmark_low = zn->wmark_high >> 2;
+	pr_debug(
+		"%s: Update wmark for cpu: %d, order: %d, cur node free pages nr(4K): %llu, %d < %d",
+		__FILE__, cpuid, order, free_pages, zn->wmark_low,
+		zn->wmark_high);
+}
+
+/* Alloc a prezeroed page by order and cpuid. */
+struct page *alloc_prezeroed_page(unsigned int order, unsigned int cpuid)
+{
+	int ret;
+	unsigned int task = 0;
+	unsigned long _flag;
+	struct page *page = NULL;
+	struct kzerod_node *zn;
+	struct llist_node *lnode;
+
+	if (unlikely(!kzerod_enabled))
+		return NULL;
+	zn = kzerod_get_node(order, cpuid);
+	/* Fast fail if get lock failed. */
+	spin_lock_irqsave(&zn->lock, _flag);
+	if (!llist_empty(&zn->zerod_pages)) {
+		lnode = llist_del_first(&zn->zerod_pages);
+		spin_unlock_irqrestore(&zn->lock, _flag);
+		page = llist_entry(lnode, struct page, kzerod_node);
+		atomic_dec(&zn->cur_nr);
+	} else
+		spin_unlock_irqrestore(&zn->lock, _flag);
+
+	if (atomic_read(&zn->cur_nr) < zn->wmark_low && !zn->processing &&
+	    kzerod_check_order(order)) {
+		zn->processing = true;
+		task = (order << KZEROD_CPUID_SIZE) | zn->cpuid;
+		ret = kfifo_in_spinlocked(&task_queue, &task, sizeof(task),
+					  &kfifo_lock);
+		if (unlikely(ret != sizeof(task))) {
+			pr_err("%s: Write data failed: %d\n", __FILE__, task);
+			return NULL;
+		}
+		wake_up(&kzerod_wait);
+	}
+
+	return page;
+}
+
+/* Drain zero page for a order, also disable this order. */
+void drain_zerod_page(unsigned int order)
+{
+	unsigned int id;
+	unsigned long prev_zero;
+	struct page *page;
+	struct kzerod_node *zn;
+	struct llist_node *head;
+	struct page *last_page;
+	struct pglist_data *node;
+
+	kzerod_disable_order(order);
+	for (id = 0; id < nr_cpu_ids; id++) {
+		zn = &kzerod_pool[kzerod_get_idx(order)][id];
+		if (zn == NULL)
+			continue;
+		drained += atomic_read(&zn->cur_nr);
+		spin_lock(&zn->lock);
+		head = llist_del_all(&zn->zerod_pages);
+		spin_unlock(&zn->lock);
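+		/* Walk the detached list and return every prezeroed allocation to the buddy. */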
+		page = llist_entry(head, struct page, kzerod_node);
+		if (!member_address_is_nonnull(page, kzerod_node))
+			continue;
+		node = NODE_DATA(cpu_to_node(id));
+		do {
+			last_page = page;
+			page = llist_entry(last_page->kzerod_node.next,
+					   struct page, kzerod_node);
+			__free_pages(last_page, order);
+			atomic_dec(&zn->cur_nr);
+		} while (member_address_is_nonnull(page, kzerod_node));
+	}
+	pr_debug("%s: drained %lu pages for order %u\n", __FILE__,
+		 drained, order);
+}
+
+/* Drain all prezeroed pages for every supported order. */
+static void drain_all_zerod_page(void)
+{
+	unsigned int i, order;
+
+	for (i = 0; i < KZEROD_POOL_SIZE; i++) {
+		order = kzerod_get_order(i);
+		kzerod_disable_order(order);
+		drain_zerod_page(order);
+	}
+}
+
+/* Clear every subpage of the allocation, using non-temporal stores on x86_64. */
+static inline void kzerod_clear_page(struct page *page,
+				     unsigned int pages_per_huge_page)
+{
+	int i;
+	void *addr;
+
+	for (i = 0; i < pages_per_huge_page; i++) {
+		addr = kmap_atomic(page + i);
+		clear_page_nocache(addr);
+		kunmap_atomic(addr);
+	}
+}
+
+/* Refill one CPU's pool for the given order with freshly zeroed pages. */
+static int kzerod_zeroing(unsigned int order, unsigned int cpuid, gfp_t gfp)
+{
+	long nr = 0;
+	int ret = 0;
+	struct page *page;
+	struct kzerod_node *zn;
+	unsigned int nodeid = cpu_to_node(cpuid);
+
+	if (!kzerod_enabled)
+		return -ENODEV;
+	zn = &kzerod_pool[kzerod_get_idx(order)][cpuid];
+	while (atomic_read(&zn->cur_nr) < zn->wmark_high) {
+		page = alloc_pages_node(nodeid, gfp, order);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		kzerod_clear_page(page, 1 << order);
+		nr += 1 << order;
+		/* No one else touches page->kzerod_node yet, so lock_page() is not needed. */
+		llist_add(&page->kzerod_node, &zn->zerod_pages);
+		atomic_inc(&zn->cur_nr);
+	}
+out:
+	zn->processing = false;
+	return ret;
+}
+
+/* Kernel zero page daemon function. */
+static int kzerod(void *p)
+{
+	int ret;
+	unsigned int task = 0;
+	static unsigned long prev_jiffies;
+	unsigned int prev_zero, cur_zero, cpuid, order;
+	struct kzerod_node *zn;
+
+	while (!kthread_should_stop()) {
+		/* Sleep (freezable) until work is queued or we are asked to stop. */
+		wait_event_freezable(kzerod_wait,
+				     !kfifo_is_empty(&task_queue) ||
+				     kthread_should_stop());
+		/* Process all kzerod tasks in kfifo. */
+		while (!kfifo_is_empty(&task_queue)) {
+			ret = kfifo_out(&task_queue, &task, sizeof(task));
+			if (unlikely(ret != sizeof(task))) {
+				pr_err("%s: Read data failed: %d\n", __FILE__,
+				       task);
+				return -ENODEV;
+			}
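+			/* Low bits carry the CPU id, high bits the page order. */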
+			cpuid = task & 0xFFFF;
+			order = (task >> KZEROD_CPUID_SIZE) & 0xFFFF;
+			if (unlikely(!kzerod_check_order(order)))
+				continue;
+			zn = &kzerod_pool[kzerod_get_idx(order)][cpuid];
+			prev_zero = atomic_read(&zn->cur_nr);
+			pr_debug("%s: zeroing for cpu %d, order %d\n",
+				     __FILE__, cpuid, order);
+			prev_jiffies = jiffies;
+			ret = kzerod_zeroing(order, cpuid,
+					     order == HPAGE_PMD_ORDER ?
+							   GFP_THP_MASK :
+							   GFP_MASK);
+			cur_zero = atomic_read(&zn->cur_nr);
+			pr_debug(
+				"%s: ret from cpu %d order %d: %s(%d) zeroed:%d->%d pages %ums\n",
+				__FILE__, cpuid, order,
+				ret ? "failed" : "finished", ret, prev_zero,
+				cur_zero,
+				jiffies_to_msecs(jiffies - prev_jiffies));
+			switch (ret) {
+			case 0:
+				continue;
+			case -ENOMEM:
+				pr_debug("%s: No mem left for node %d\n",
+					     __FILE__, cpu_to_node(cpuid));
+				continue;
+			case -ENODEV:
+				return -ENODEV;
+			}
+		}
+	}
+	return 0;
+}
+
+/* Free all memory allocated with kmalloc. */
+static void __kzerod_free(void)
+{
+	unsigned int idx, nid;
+
+	if (kfifo_initialized(&task_queue))
+		kfifo_free(&task_queue);
+
+	for (idx = 0; idx < KZEROD_POOL_SIZE; idx++) {
+		kfree(kzerod_pool[idx]);
+		if (!kzerod_per_numa[idx])
+			continue;
+		for (nid = 0; nid < nr_online_nodes; nid++)
+			kfree(kzerod_per_numa[idx][nid].per_cpu_node);
+		kfree(kzerod_per_numa[idx]);
+	}
+
+	kfree(kzerod_pool);
+	kfree(kzerod_per_numa);
+}
+
+/* Exit kzerod module. */
+static void __kzerod_exit(void)
+{
+	bool was_enabled = kzerod_enabled;
+
+	kzerod_enabled = false;
+	kzerod_order_map = 0;
+
+	if (task_kzerod)
+		kthread_stop(task_kzerod);
+
+	if (was_enabled)
+		drain_all_zerod_page();
+
+	__kzerod_free();
+}
+
+static void __exit kzerod_exit(void)
+{
+	__kzerod_exit();
+}
+
+static int kzerod_struct_init(void)
+{
+	int ret;
+	unsigned int idx, nid;
+
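+	/*
+	 * kzerod_pool[idx][cpu] holds the per-CPU pools for each supported
+	 * order; kzerod_per_numa[idx][nid] groups those pools by NUMA node.
+	 */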
+	/* kcalloc() so unallocated slots stay NULL for __kzerod_free(). */
+	kzerod_pool = kcalloc(KZEROD_POOL_SIZE, sizeof(void *), GFP_KERNEL);
+	if (!kzerod_pool)
+		return -ENOMEM;
+
+	kzerod_per_numa = kcalloc(KZEROD_POOL_SIZE, sizeof(void *), GFP_KERNEL);
+
+	if (!kzerod_per_numa) {
+		kfree(kzerod_pool);
+		return -ENOMEM;
+	}
+
+	for (idx = 0; idx < KZEROD_POOL_SIZE; idx++) {
+		/* Allocate `nr_cpu_ids` entries since some CPUs may be offline. */
+		kzerod_pool[idx] = kcalloc(nr_cpu_ids,
+					   sizeof(struct kzerod_node),
+					   GFP_KERNEL);
+		if (!kzerod_pool[idx]) {
+			ret = -ENOMEM;
+			goto kzerod_init_failed;
+		}
+
+		kzerod_per_numa[idx] = kcalloc(nr_online_nodes,
+					       sizeof(struct kzerod_numa_node),
+					       GFP_KERNEL);
+		if (!kzerod_per_numa[idx]) {
+			ret = -ENOMEM;
+			goto kzerod_init_failed;
+		}
+
+		for (nid = 0; nid < nr_online_nodes; nid++) {
+			kzerod_per_numa[idx][nid].per_cpu_node =
+				kcalloc(nr_cpu_ids, sizeof(void *), GFP_KERNEL);
+			if (!kzerod_per_numa[idx][nid].per_cpu_node) {
+				ret = -ENOMEM;
+				goto kzerod_init_failed;
+			}
+		}
+	}
+
+	ret = kfifo_alloc(&task_queue,
+			  nr_cpu_ids * KZEROD_POOL_SIZE * sizeof(unsigned int),
+			  GFP_KERNEL);
+	if (ret) {
+		ret = -ENOMEM;
+		goto kzerod_init_failed;
+	}
+
+	return 0;
+
+kzerod_init_failed:
+	pr_err("%s: Failed to alloc memory for kzerod\n", __FILE__);
+	__kzerod_free();
+	return ret;
+}
+
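+/*
+ * Initialise every per-CPU kzerod node, enable all supported orders and
+ * register each node with its NUMA-node group.
+ */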
+static void kzerod_struct_set(void)
+{
+	unsigned int idx, cpuid, order;
+	struct kzerod_node *node;
+	struct kzerod_numa_node *numa_node;
+
+	for (idx = 0; idx < KZEROD_POOL_SIZE; idx++) {
+		/* Enable current order. */
+		order = kzerod_get_order(idx);
+		kzerod_enable_order(order);
+		for (cpuid = 0; cpuid < nr_cpu_ids; cpuid++) {
+			node = &kzerod_pool[idx][cpuid];
+			node->cpuid = cpuid;
+			numa_node = &kzerod_per_numa[idx][cpu_to_node(cpuid)];
+			kzerod_update_wmark(order, cpuid);
+			init_llist_head(&node->zerod_pages);
+			spin_lock_init(&node->lock);
+			/* CPU hot-plug is not currently supported. */
+			numa_node->per_cpu_node[numa_node->cpu_nr] = node;
+			numa_node->cpu_nr++;
+		}
+	}
+}
+
+static int kzerod_thread_init(void)
+{
+	int ret;
+	struct sched_param param = { .sched_priority = 0 };
+
+	task_kzerod = kthread_run(kzerod, NULL, "kzerod");
+	if (IS_ERR(task_kzerod)) {
+		task_kzerod = NULL;
+		pr_err("%s: Failed to start kzerod\n", __FILE__);
+		return -ENODEV;
+	}
+
+	kzerod_enabled = true;
+	ret = sched_setscheduler(task_kzerod, SCHED_NORMAL, &param);
+	return ret;
+}
+
+/* Init kzerod module. */
+static int __init kzerod_init(void)
+{
+	int ret;
+
+	ret = kzerod_struct_init();
+	if (ret)
+		return ret;
+
+	kzerod_struct_set();
+
+	ret = kzerod_thread_init();
+	if (ret) {
+		__kzerod_exit();
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Enable or disable kzerod via the 'enabled' module parameter. */
+static int kzerod_set_enabled_param(const char *val,
+				    const struct kernel_param *kp)
+{
+	int error;
+	bool prev;
+	unsigned int i, order;
+
+	if (!task_kzerod) {
+		pr_err("%s: Can't enable, task_kzerod is not ready\n",
+		       __FILE__);
+		return -ENODEV;
+	}
+
+	prev = kzerod_enabled;
+	error = param_set_bool(val, kp);
+	if (error)
+		return error;
+	if (!prev && kzerod_enabled) {
+		for (i = 0; i < KZEROD_POOL_SIZE; i++) {
+			order = kzerod_get_order(i);
+			kzerod_enable_order(order);
+		}
+		pr_info("%s: enabled\n", __FILE__);
+	} else if (prev && !kzerod_enabled) {
+		drain_all_zerod_page();
+		pr_info("%s: disabled\n", __FILE__);
+	}
+	return error;
+}
+
+static const struct kernel_param_ops kzerod_enabled_param_ops = {
+	.set = kzerod_set_enabled_param,
+	.get = param_get_bool,
+};
+module_param_cb(enabled, &kzerod_enabled_param_ops, &kzerod_enabled, 0644);
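+/*
+ * The parameter is writable at runtime via sysfs, e.g. (assuming the object
+ * is built as kzerod.o): echo 1 > /sys/module/kzerod/parameters/enabled
+ */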
+
+/* Set the watermark for normal pages; all CPUs in a NUMA node share these pages. */
+static int kzerod_set_wmark_param(const char *val,
+				  const struct kernel_param *kp)
+{
+	int error;
+	unsigned short result;
+	unsigned int id;
+
+	if (!kzerod_enabled) {
+		pr_err("%s: Can't set, enable kzerod first\n", __FILE__);
+		return -ENODEV;
+	}
+	error = kstrtou16(val, 10, &result);
+	if (error)
+		return error;
+	if (result > 90 || (result + wmark_per_numa) > 90) {
+		pr_err("Invalid input: %d\n", result);
+		return -EINVAL;
+	}
+
+	error = param_set_ushort(val, kp);
+	if (error)
+		return error;
+
+	for (id = 0; id < nr_cpu_ids; id++)
+		kzerod_update_wmark(0, id);
+	return 0;
+}
+
+static const struct kernel_param_ops kzerod_wmark_param_ops = {
+	.set = kzerod_set_wmark_param,
+	.get = param_get_ushort,
+};
+module_param_cb(per_numa_water_mark, &kzerod_wmark_param_ops, &wmark_per_numa,
+		0644);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* Set the watermark for transparent huge pages; all CPUs in a NUMA node share these pages. */
+static int kzerod_set_thp_wmark_param(const char *val,
+				      const struct kernel_param *kp)
+{
+	int error;
+	unsigned short result;
+	unsigned int id;
+
+	if (!kzerod_enabled) {
+		pr_err("%s: Can't set, enable kzerod first\n", __FILE__);
+		return -ENODEV;
+	}
+	error = kstrtou16(val, 10, &result);
+	if (error)
+		return error;
+	if (result > 90 || (result + thp_wmark_per_numa) > 90) {
+		pr_err("Invalid input: %d\n", result);
+		return -EINVAL;
+	}
+
+	error = param_set_ushort(val, kp);
+	if (error)
+		return error;
+
+	for (id = 0; id < nr_cpu_ids; id++)
+		kzerod_update_wmark(HPAGE_PMD_ORDER, id);
+	return 0;
+}
+
+static const struct kernel_param_ops kzerod_thp_wmark_param_ops = {
+	.set = kzerod_set_thp_wmark_param,
+	.get = param_get_ushort,
+};
+module_param_cb(thp_per_numa_water_mark, &kzerod_thp_wmark_param_ops,
+		&thp_wmark_per_numa, 0644);
+#endif
+
+module_init(kzerod_init);
+module_exit(kzerod_exit);