seccomp: Benchmark shenanigans

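This introduces a terrible seccomp bypass, SECCOMP_FILTER_FLAG_BENCHMARK,
so we can do some more detailed benchmarking. If the filter that
seccomp_run_filters() starts from was attached with this flag, no BPF is
evaluated and SECCOMP_RET_ALLOW is returned. The benchmark selftest uses
this to estimate seccomp entry overhead separately from BPF overhead.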

Signed-off-by: Kees Cook <keescook@chromium.org>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4192369..7f45935 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -8,7 +8,8 @@
 					 SECCOMP_FILTER_FLAG_LOG | \
 					 SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
 					 SECCOMP_FILTER_FLAG_NEW_LISTENER | \
-					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
+					 SECCOMP_FILTER_FLAG_BENCHMARK)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c173545..56e58a2 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -23,6 +23,7 @@
 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH		(1UL << 4)
+#define SECCOMP_FILTER_FLAG_BENCHMARK		(1UL << 5)
 
 /*
  * All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184..10479fb 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -111,6 +111,7 @@ struct notification {
  *         outside of a lifetime-guarded section.  In general, this
  *         is only needed for handling filters shared across tasks.
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @benchmark: true if filter should be skipped and return "allow"
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
  * @notif: the struct that holds all notification related information
@@ -129,6 +130,7 @@ struct notification {
 struct seccomp_filter {
 	refcount_t usage;
 	bool log;
+	bool benchmark;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
 	struct notification *notif;
@@ -242,6 +244,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
 	return 0;
 }
 
+static noinline u32 __seccomp_benchmark(struct bpf_prog *prog,
+					const struct seccomp_data *sd)
+{
+	return SECCOMP_RET_ALLOW;	/* stub: @prog and @sd are unused */
+}
+
 /**
  * seccomp_run_filters - evaluates all seccomp filters against @sd
  * @sd: optional seccomp data to be passed to filters
@@ -264,6 +272,11 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	if (WARN_ON(f == NULL))
 		return SECCOMP_RET_KILL_PROCESS;
 
+	if (f->benchmark) {
+		*match = f;
+		return __seccomp_benchmark(f->prog, sd);
+	}
+
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
@@ -538,6 +551,9 @@ static long seccomp_attach_filter(unsigned int flags,
 	if (flags & SECCOMP_FILTER_FLAG_LOG)
 		filter->log = true;
 
+	if (flags & SECCOMP_FILTER_FLAG_BENCHMARK)
+		filter->benchmark = true;
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
index 5838c86..4b19029 100644
--- a/tools/testing/selftests/seccomp/seccomp_benchmark.c
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -8,6 +8,7 @@
 #include <stdlib.h>
 #include <time.h>
 #include <unistd.h>
+#include <errno.h>
 #include <linux/filter.h>
 #include <linux/seccomp.h>
 #include <sys/prctl.h>
@@ -57,6 +58,16 @@ unsigned long long calibrate(void)
 	}
 }
 
+#define SECCOMP_FILTER_FLAG_BENCHMARK	(1UL << 5) /* same value as uapi seccomp.h */
+
+#ifndef seccomp	/* thin wrapper that invokes __NR_seccomp via syscall() */
+int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	errno = 0;	/* clear any stale errno before the syscall */
+	return syscall(__NR_seccomp, op, flags, args);
+}
+#endif /* seccomp */
+
 int main(int argc, char *argv[])
 {
 	struct sock_filter filter[] = {
@@ -68,7 +79,7 @@ int main(int argc, char *argv[])
 	};
 	long ret;
 	unsigned long long samples;
-	unsigned long long native, filtered;
+	unsigned long long native, filter1, filter2, no_bpf;
 
 	if (argc > 1)
 		samples = strtoull(argv[1], NULL, 0);
@@ -77,23 +88,49 @@ int main(int argc, char *argv[])
 
 	printf("Benchmarking %llu samples...\n", samples);
 
+	/* Native call */
 	native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
 	printf("getpid native: %llu ns\n", native);
 
 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	assert(ret == 0);
 
+	/* One filter */
 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 	assert(ret == 0);
 
-	filtered = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
-	printf("getpid RET_ALLOW: %llu ns\n", filtered);
+	filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
 
-	printf("Estimated seccomp overhead per syscall: %llu ns\n",
-		filtered - native);
+	/* Two filters */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+	assert(ret == 0);
 
-	if (filtered == native)
-		printf("Trying running again with more samples.\n");
+	filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
+
+	/* Benchmark bypass filter */
+	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_BENCHMARK, &prog);
+	assert(ret == 0);
+
+	no_bpf = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid BPF-less allow: %llu ns\n", no_bpf);
+
+	/* Calculations (signed: timing noise can make a delta negative) */
+	printf("Estimated total seccomp overhead for 1 filter: %lld ns\n",
+		(long long)(filter1 - native));
+
+	printf("Estimated total seccomp overhead for 2 filters: %lld ns\n",
+		(long long)(filter2 - native));
+
+	printf("Estimated seccomp per-filter overhead: %lld ns\n",
+		(long long)(filter2 - filter1));
+
+	printf("Estimated seccomp entry overhead: %lld ns\n",
+		(long long)(filter1 - native - (filter2 - filter1)));
+
+	printf("Estimated BPF overhead per filter: %lld ns\n",
+		(long long)(filter1 - no_bpf));
 
 	return 0;
 }