Add how_bad_is_smap, which is a hack
diff --git a/tight_loop/how_bad_is_smap.c b/tight_loop/how_bad_is_smap.c
new file mode 100644
index 0000000..c2df0e8
--- /dev/null
+++ b/tight_loop/how_bad_is_smap.c
@@ -0,0 +1,383 @@
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <err.h>
+#include <fcntl.h>
+
+struct psm_counter {
+	int fd;
+	struct perf_event_mmap_page *metadata;
+};
+
+struct psm_counter *psm_counter_create(void)
+{
+	struct psm_counter *counter = malloc(sizeof(struct psm_counter));
+
+	struct perf_event_attr attr;
+	memset(&attr, 0, sizeof(attr));
+
+	attr.type = PERF_TYPE_HARDWARE;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.config = PERF_COUNT_HW_CPU_CYCLES;
+
+	counter->fd = syscall(
+		SYS_perf_event_open,
+		&attr,			/* attributes */
+		0,			/* monitor me */
+		-1,			/* all CPUs */
+		-1,			/* group leader */
+		PERF_FLAG_FD_CLOEXEC	/* flags */
+		);
+
+	if (counter->fd == -1)
+		err(1, "perf_event_open");
+
+	counter->metadata = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED,
+				 counter->fd, 0);
+	if (counter->metadata == MAP_FAILED) {
+		err(1, "mmap");
+	}
+
+	if (!counter->metadata->cap_user_rdpmc)
+		errx(1, "RDPMC not supported (cap_user_rdpmc == 0)\n");
+
+	if (!counter->metadata->index)
+		errx(1, "RDPMC not supported (no index assigned)\n");
+
+	return counter;
+}
+
+void psm_counter_destroy(struct psm_counter *counter)
+{
+	munmap(counter->metadata, PAGE_SIZE);
+	counter->metadata = MAP_FAILED;
+	close(counter->fd);
+	counter->fd = -1;
+}
+
+static inline void psm_barrier(void)
+{
+	asm volatile ("" : : : "memory");
+}
+
+static inline void psm_serialize(void)
+{
+	unsigned int eax = 0, ecx = 0;
+	asm volatile ("cpuid"
+		      : "+a" (eax), "+c" (ecx) : : "ebx", "edx", "memory");
+}
+
+static inline uint64_t psm_rdpmc(unsigned int ecx)
+{
+        unsigned int eax, edx;
+        asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
+	return (((uint64_t)edx << 32) | eax);
+}
+
+typedef bool (*psm_sample_fn)(uint64_t *count,
+			      const struct psm_counter *counter,
+			      int reps,
+			      void *opaque);
+
+/*
+ * psm_atomic is a mechanism for very precise counter measurement of
+ * very short operations.  A psm_atomic interval will fail if the
+ * process is preempted during the measurement interval.
+ */
+
+struct psm_atomic {
+	/* Copied from metadata in psm_atomic_start. */
+	uint32_t perf_lock;	/* If odd, this sample is bad. */
+	uint64_t perf_time_offset;
+
+	uint64_t initial_raw_count;
+
+	/*
+	 * This is here to improve code generation.  struct psm_atomic
+	 * is unlikely to be referenced by an escapted pointer and hence
+	 * be affected by a "memory" clobber.
+	 */
+	unsigned int rdpmc_ecx;
+};
+
+/*
+ * Starts measuring an uninterrupted duration.
+ */
+static inline struct psm_atomic psm_atomic_start(const struct psm_counter *ctr)
+{
+	struct psm_atomic state;
+	state.perf_lock = ctr->metadata->lock;
+	psm_barrier();
+	state.perf_time_offset = ctr->metadata->time_offset;
+	state.rdpmc_ecx = ctr->metadata->index - 1;
+	psm_barrier();  /* Do rdpmc last to reduce noise */
+	state.initial_raw_count = psm_rdpmc(state.rdpmc_ecx);
+	return state;
+}
+
+static inline bool psm_atomic_elapsed(uint64_t *elapsed_count,
+				      const struct psm_atomic *state,
+				      const struct psm_counter *ctr)
+{
+	/* Do the RDPMC first to reduce noise. */
+	uint64_t count_now = psm_rdpmc(state->rdpmc_ecx);
+	psm_barrier();  /* No, really, do it first. */
+	unsigned int shift = 64 - ctr->metadata->pmc_width;
+	count_now <<= shift;
+	uint64_t initial_count = state->initial_raw_count;
+	initial_count <<= shift;
+	*elapsed_count = (count_now - initial_count) >> shift;
+	psm_barrier();
+
+	if (ctr->metadata->time_offset != state->perf_time_offset)
+		return false;  /* We were interrupted. */
+
+	/* Now check the lock. */
+	psm_barrier();
+	if (ctr->metadata->lock != state->perf_lock || (state->perf_lock & 1))
+		return false;
+
+	return true;
+}
+
+bool psm_atomic_sample_empty(uint64_t *count,
+			     const struct psm_counter *ctr,
+			     int reps, void *opaque)
+{
+	struct psm_atomic duration = psm_atomic_start(ctr);
+	return psm_atomic_elapsed(count, &duration, ctr);
+}
+
+bool psm_atomic_sample_enosys(uint64_t *count,
+			      const struct psm_counter *ctr,
+			      int reps, void *opaque)
+{
+	struct psm_atomic duration = psm_atomic_start(ctr);
+#ifdef __x86_64__
+	unsigned long rax;
+	for (int i = 0; i < reps; i++) {
+		rax = 0xbfffffff;
+		asm volatile ("syscall" : "+a" (rax) : : "rcx", "r11");
+	}
+#else
+	for (int i = 0; i < reps; i++)
+		syscall(0x3fffffff);
+#endif
+	return psm_atomic_elapsed(count, &duration, ctr);
+}
+
+bool psm_atomic_sample_bad_prctl(uint64_t *count,
+				 const struct psm_counter *ctr,
+				 int reps, void *opaque)
+{
+	struct psm_atomic duration = psm_atomic_start(ctr);
+#ifdef __x86_64__
+	unsigned long rax;
+	register unsigned long rdi asm("rdi");
+	for (int i = 0; i < reps; i++) {
+		rax = SYS_prctl;
+		rdi = -1;
+		asm volatile ("syscall" : "+a" (rax), "+r" (rdi) : : "rcx", "r11");
+	}
+#else
+	for (int i = 0; i < reps; i++)
+		syscall(SYS_prctl, -1);
+#endif
+	return psm_atomic_elapsed(count, &duration, ctr);
+}
+
+bool psm_atomic_sample_pread_fd(uint64_t *count,
+			       const struct psm_counter *ctr,
+			       int reps, void *opaque)
+{
+	int fd = (int)(intptr_t)opaque;
+	char buf[16];
+
+	struct psm_atomic duration = psm_atomic_start(ctr);
+	for (int i = 0; i < reps; i++)
+		if (pread(fd, buf, sizeof(buf), 0) == -1)
+			err(1, "pread");
+	return psm_atomic_elapsed(count, &duration, ctr);
+}
+
+static int compare_ui64(const void *a_void, const void *b_void)
+{
+	const uint64_t a = *(uint64_t*)a_void;
+	const uint64_t b = *(uint64_t*)b_void;
+	if (a < b)
+		return -1;
+	else if (a > b)
+		return 1;
+	else
+		return 0;
+}
+
+uint64_t psm_integer_quantile(const struct psm_counter *ctr, psm_sample_fn fn,
+			      int reps, void *opaque,
+			      size_t q, size_t n)
+{
+	if (q >= n)
+		abort();
+
+	uint64_t *array = calloc(sizeof(uint64_t), n);
+	for (size_t i = 0; i < n; ) {
+		uint64_t sample;
+		if (!fn(&sample, ctr, reps, opaque))
+			continue;
+		array[i++] = sample;
+	}
+
+	qsort(array, n, sizeof(uint64_t), compare_ui64);
+
+	uint64_t ret = array[q];
+	free(array);
+	return ret;
+}
+
+uint64_t psm_settle(const struct psm_counter *ctr, psm_sample_fn fn,
+		    int reps, void *opaque)
+{
+	uint64_t val = UINT64_MAX;
+	int good_iters = 0;
+	for (int total_iters = 0; total_iters < 500; total_iters++) {
+		uint64_t best = UINT64_MAX;
+		for (int inner = 0; inner < 10; ) {
+			uint64_t count;
+			if (!fn(&count, ctr, reps, opaque))
+				continue;  /* Rejected sample */
+			if (count < best)
+				best = count;
+			inner++;
+		}
+
+		if (best != val) {
+			good_iters = 1;
+			val = best;
+		} else {
+			good_iters++;
+			if (good_iters == 10)
+				return val;
+		}
+	}
+
+	return ~0ULL;
+}
+
+int reparray[] = {1, 200};
+
+struct psm_costestimate
+{
+	double baseline;
+	double per_rep;
+};
+
+struct pair
+{
+	int x;
+	double y;
+};
+
+bool psm_estimate_cost(struct psm_costestimate *est,
+		       const struct psm_counter *ctr,
+		       psm_sample_fn fn, void *opaque)
+{
+	int rounds = 50;
+	int scale = 1;
+	int steps = 100;
+	int n = rounds * steps;
+	int i = 0;
+	uint64_t sample;
+
+	struct pair *array = calloc(sizeof(struct pair), n);
+	for (int r = 0; r < rounds; r++) {
+		for (int s = 0; s < steps; s++) {
+			array[i].x = s * scale;
+			i++;
+		}
+	}
+
+	/* Now randomly permute it. */
+	for (i = n - 1; i >= 1; i--) {
+		int j = rand() % i;
+		int tmp = array[i].x;
+		array[i].x = array[j].x;
+		array[j].x = tmp;
+	}
+
+	/* Burn a big sample. */
+	fn(&sample, ctr, scale * steps * (rounds / 2), opaque);
+
+	/* Now get all the samples and accumulate. */
+	double sum_xy = 0.0, sum_x = 0.0, sum_xx = 0.0, sum_y = 0.0;
+	for (i = 0; i < n; i++) {
+		while (!fn(&sample, ctr, array[i].x, opaque))
+			;
+		array[i].y = sample;
+
+		sum_x += array[i].x;
+		sum_xy += array[i].x * array[i].y;
+		sum_xx += (double)array[i].x * (double)array[i].x;
+		sum_y += array[i].y;
+	}
+
+	/* Calculate a simple linear regression. */
+	est->per_rep = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);
+	est->baseline = (sum_y - est->per_rep * sum_x) / n;
+
+	free(array);
+	return true;
+};
+
+int main()
+{
+	struct psm_counter *ctr = psm_counter_create();
+
+	uint64_t baseline = psm_settle(ctr, psm_atomic_sample_empty, 1, NULL);
+	if (baseline == (uint64_t)~0ULL)
+		printf("Self-monitoring warm-up didn't settle down\n");
+	else
+		printf("Self-monitoring warmed up: overhead is %llu cycles\n",
+		       (unsigned long long)baseline);
+
+	/*
+	for (int repidx = 0; repidx < 2; repidx++) {
+		int reps = reparray[repidx];
+		for (int i = 0; i < 10; i++) {
+			uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_enosys, reps, NULL, 250, 500) - baseline;
+			printf("%dx ENOSYS: %llu\n", reps, (unsigned long long)cost/reps);
+		}
+	}
+
+	for (int repidx = 0; repidx < 2; repidx++) {
+		int reps = reparray[repidx];
+		for (int i = 0; i < 10; i++) {
+			uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_bad_prctl, reps, NULL, 250, 500) - baseline;
+			printf("%dx bad prctl: %llu\n", reps, (unsigned long long)cost/reps);
+		}
+	}
+	*/
+
+	struct psm_costestimate est;
+
+	psm_estimate_cost(&est, ctr, psm_atomic_sample_enosys, NULL);
+	printf("ENOSYS:\t\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+	psm_estimate_cost(&est, ctr, psm_atomic_sample_bad_prctl, NULL);
+	printf("bad prctl:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+	int fd = open("/dev/zero", O_RDONLY);
+	if (fd == -1)
+		err(1, "open /dev/zero");
+	
+	psm_estimate_cost(&est, ctr, psm_atomic_sample_pread_fd, (void *)(intptr_t)fd);
+	printf("pread /dev/zero:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline);
+
+	psm_counter_destroy(ctr);
+}