Merge ../../devel/misc-tests
diff --git a/dump-vvar.c b/dump-vvar.c
index dce7b66..afb3244 100644
--- a/dump-vvar.c
+++ b/dump-vvar.c
@@ -3,12 +3,52 @@
 #include <string.h>
 #include <sys/mman.h>
 #include <unistd.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <err.h>
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+                       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static sigjmp_buf fail_jmp;	/* where handler() unwinds to on SIGBUS */
+
+static void handler(int sig, siginfo_t *si, void *ctx_void)
+{
+	write(2, "whoops!\n", 8);
+	siglongjmp(fail_jmp, 1);
+}
+
+/* Write page n (4096 bytes at base) to stdout, or 0xff filler if it faults. */
+static void dump_page(int n, const void *base)
+{
+	unsigned char data[4096];
+
+	sethandler(SIGBUS, handler, 0);
+	/* Save the mask: SIGBUS is blocked in the handler and must be unblocked. */
+	if (sigsetjmp(fail_jmp, 1)) {
+		fprintf(stderr, "Cannot read vvar page %d\n", n);
+		memset(data, 0xff, sizeof(data));
+	} else {
+		memcpy(data, base, sizeof(data));
+	}
+	write(1, data, sizeof(data));
+}
 
 int main()
 {
 	FILE *maps;
 	void *vvar_begin, *vvar_end;
 	int found_vvar = 0;
+	int npages;
 
 	maps = fopen("/proc/self/maps", "r");
 	char buf[1024];
@@ -26,23 +66,13 @@
 	}
 
 	sscanf(buf, "%p-%p", &vvar_begin, &vvar_end);
+	npages = ((char *)vvar_end - (char *)vvar_begin) / 4096;;
 
-	fprintf(stderr, "vvar mapping is at 0x%lx to 0x%lx\n",
-		(unsigned long)vvar_begin, (unsigned long)vvar_end);
+	fprintf(stderr, "vvar mapping is %d pages (0x%lx - 0x%lx)\n",
+		npages, (unsigned long)vvar_begin, (unsigned long)vvar_end);
 
-	maps = fopen("/proc/self/maps", "r");
-	mremap(vvar_begin, vvar_end-vvar_begin, vvar_end-vvar_begin, MREMAP_FIXED | MREMAP_MAYMOVE, 0x0badc0de0000);
-
-	mremap(vvar_begin - 4096, 4096, 4096, MREMAP_FIXED | MREMAP_MAYMOVE, 0x0badc0de0000 - 4096);
-
-	while (fgets(buf, 1024, maps)) {
-		fprintf(stderr, "%s", buf);
-	}
-
-	vvar_end = (void*)0x0badc0de0000 + (vvar_end - vvar_begin);
-	vvar_begin = (void*)0x0badc0de0000;
-
-	write(1, vvar_begin, vvar_end - vvar_begin);
+	for (int i = 0; i < npages; i++)
+		dump_page(i, (char *)vvar_begin + i * 4096);
 
 	mprotect(vvar_begin, vvar_end - vvar_begin, PROT_READ | PROT_WRITE);
 
diff --git a/tight_loop/perf_self_monitor.c b/tight_loop/perf_self_monitor.c
index 51c3338..c5a8a44 100644
--- a/tight_loop/perf_self_monitor.c
+++ b/tight_loop/perf_self_monitor.c
@@ -17,7 +17,7 @@
 
 struct psm_counter *psm_counter_create(void)
 {
-	struct psm_counter *counter = malloc(sizeof(struct psm_counter));;
+	struct psm_counter *counter = malloc(sizeof(struct psm_counter));
 
 	struct perf_event_attr attr;
 	memset(&attr, 0, sizeof(attr));
@@ -82,6 +82,7 @@
 
 typedef bool (*psm_sample_fn)(uint64_t *count,
 			      const struct psm_counter *counter,
+			      int reps,
 			      void *opaque);
 
 /*
@@ -146,40 +147,46 @@
 }
 
 bool psm_atomic_sample_empty(uint64_t *count,
-			     const struct psm_counter *ctr, void *opaque)
+			     const struct psm_counter *ctr,
+			     int reps, void *opaque)
 {
 	struct psm_atomic duration = psm_atomic_start(ctr);
 	return psm_atomic_elapsed(count, &duration, ctr);
 }
 
 bool psm_atomic_sample_enosys(uint64_t *count,
-			      const struct psm_counter *ctr, void *opaque)
+			      const struct psm_counter *ctr,
+			      int reps, void *opaque)
 {
 	struct psm_atomic duration = psm_atomic_start(ctr);
 #ifdef __x86_64__
 	unsigned long rax;
-	rax = 0xbfffffff;
-	asm volatile ("syscall" : "+a" (rax) : : "rcx", "r11");
+	for (int i = 0; i < reps; i++) {
+		rax = 0xbfffffff;
+		asm volatile ("syscall" : "+a" (rax) : : "rcx", "r11");
+	}
 #else
-	syscall(0x3fffffff);
+	for (int i = 0; i < reps; i++)
+		syscall(0x3fffffff);
 #endif
 	return psm_atomic_elapsed(count, &duration, ctr);
 }
 
-bool psm_atomic_sample_bad_prctl_x1000(uint64_t *count,
-			      const struct psm_counter *ctr, void *opaque)
+bool psm_atomic_sample_bad_prctl(uint64_t *count,
+				 const struct psm_counter *ctr,
+				 int reps, void *opaque)
 {
 	struct psm_atomic duration = psm_atomic_start(ctr);
 #ifdef __x86_64__
 	unsigned long rax;
 	register unsigned long rdi asm("rdi");
-	for (int i = 0; i < 1000; i++) {
+	for (int i = 0; i < reps; i++) {
 		rax = SYS_prctl;
 		rdi = -1;
 		asm volatile ("syscall" : "+a" (rax), "+r" (rdi) : : "rcx", "r11");
 	}
 #else
-	for (int i = 0; i < 1000; i++)
+	for (int i = 0; i < reps; i++)
 		syscall(SYS_prctl, -1);
 #endif
 	return psm_atomic_elapsed(count, &duration, ctr);
@@ -198,7 +205,8 @@
 }
 
 uint64_t psm_integer_quantile(const struct psm_counter *ctr, psm_sample_fn fn,
-			      void *opaque, size_t q, size_t n)
+			      int reps, void *opaque,
+			      size_t q, size_t n)
 {
 	if (q >= n)
 		abort();
@@ -206,7 +214,7 @@
 	uint64_t *array = calloc(sizeof(uint64_t), n);
 	for (size_t i = 0; i < n; ) {
 		uint64_t sample;
-		if (!fn(&sample, ctr, opaque))
+		if (!fn(&sample, ctr, reps, opaque))
 			continue;
 		array[i++] = sample;
 	}
@@ -218,7 +226,8 @@
 	return ret;
 }
 
-void psm_settle(const struct psm_counter *ctr, psm_sample_fn fn, void *opaque)
+uint64_t psm_settle(const struct psm_counter *ctr, psm_sample_fn fn,
+		    int reps, void *opaque)
 {
 	uint64_t val = UINT64_MAX;
 	int good_iters = 0;
@@ -226,7 +235,7 @@
 		uint64_t best = UINT64_MAX;
 		for (int inner = 0; inner < 10; ) {
 			uint64_t count;
-			if (!fn(&count, ctr, opaque))
+			if (!fn(&count, ctr, reps, opaque))
 				continue;  /* Rejected sample */
 			if (count < best)
 				best = count;
@@ -239,32 +248,114 @@
 		} else {
 			good_iters++;
 			if (good_iters == 10)
-				return;
+				return val;
 		}
 	}
+
+	return ~0ULL;
 }
 
+int reparray[] = {1, 200};
+
+struct psm_costestimate
+{
+	double baseline;
+	double per_rep;
+};
+
+struct pair
+{
+	int x;
+	double y;
+};
+
+/* Least-squares fit of sample = baseline + per_rep * reps; false on OOM. */
+bool psm_estimate_cost(struct psm_costestimate *est,
+		       const struct psm_counter *ctr,
+		       psm_sample_fn fn, void *opaque)
+{
+	int rounds = 50;
+	int scale = 1;
+	int steps = 100;
+	int n = rounds * steps;
+	int i;
+	uint64_t sample;
+
+	struct pair *array = calloc(n, sizeof(struct pair));
+	if (!array)
+		return false;
+
+	for (i = 0; i < n; i++)
+		array[i].x = (i % steps) * scale;	/* each rep count, rounds times */
+
+	/* Fisher-Yates shuffle: j must be allowed to equal i to be unbiased. */
+	for (i = n - 1; i >= 1; i--) {
+		int j = rand() % (i + 1);
+		int tmp = array[i].x;
+		array[i].x = array[j].x;
+		array[j].x = tmp;
+	}
+
+	/* Burn a big sample. */
+	fn(&sample, ctr, scale * steps * (rounds / 2), opaque);
+
+	/* Now get all the samples and accumulate. */
+	double sum_xy = 0.0, sum_x = 0.0, sum_xx = 0.0, sum_y = 0.0;
+	for (i = 0; i < n; i++) {
+		while (!fn(&sample, ctr, array[i].x, opaque))
+			;	/* retry rejected samples */
+		array[i].y = sample;
+
+		sum_x += array[i].x;
+		sum_xy += array[i].x * array[i].y;
+		sum_xx += (double)array[i].x * (double)array[i].x;
+		sum_y += array[i].y;
+	}
+
+	/* Calculate a simple linear regression. */
+	est->per_rep = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
+	est->baseline = (sum_y - est->per_rep * sum_x) / n;
+
+	free(array);
+	return true;
+}
+
 int main()
 {
 	struct psm_counter *ctr = psm_counter_create();
 
-	psm_settle(ctr, psm_atomic_sample_empty, NULL);
+	uint64_t baseline = psm_settle(ctr, psm_atomic_sample_empty, 1, NULL);
+	if (baseline == (uint64_t)~0ULL)
+		printf("Self-monitoring warm-up didn't settle down\n");
+	else
+		printf("Self-monitoring warmed up: overhead is %llu cycles\n",
+		       (unsigned long long)baseline);
 
-	uint64_t baseline =
-		psm_integer_quantile(ctr, psm_atomic_sample_empty,
-				     NULL, 250, 500);
-	printf("An empty sample takes %llu cycles\n",
-	       (unsigned long long)baseline);
-
-	for (int i = 0; i < 20; i++) {
-		uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_enosys, NULL, 250, 500) - baseline;
-		printf("1x ENOSYS: %llu\n", (unsigned long long)cost);
+	/*
+	for (int repidx = 0; repidx < 2; repidx++) {
+		int reps = reparray[repidx];
+		for (int i = 0; i < 10; i++) {
+			uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_enosys, reps, NULL, 250, 500) - baseline;
+			printf("%dx ENOSYS: %llu\n", reps, (unsigned long long)cost/reps);
+		}
 	}
 
-	for (int i = 0; i < 20; i++) {
-		uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_bad_prctl_x1000, NULL, 250, 500) - baseline;
-		printf("1000x bad prctl: %llu\n", (unsigned long long)cost/1000);
+	for (int repidx = 0; repidx < 2; repidx++) {
+		int reps = reparray[repidx];
+		for (int i = 0; i < 10; i++) {
+			uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_bad_prctl, reps, NULL, 250, 500) - baseline;
+			printf("%dx bad prctl: %llu\n", reps, (unsigned long long)cost/reps);
+		}
 	}
+	*/
+
+	struct psm_costestimate est;
+
+	psm_estimate_cost(&est, ctr, psm_atomic_sample_enosys, NULL);
+	printf("ENOSYS:\t\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+	psm_estimate_cost(&est, ctr, psm_atomic_sample_bad_prctl, NULL);
+	printf("bad prctl:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
 
 	psm_counter_destroy(ctr);
 }