| #include <linux/perf_event.h> |
| #include <sys/syscall.h> |
| #include <sys/mman.h> |
| #include <sys/user.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <stdbool.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <err.h> |
| |
/*
 * A single self-monitoring perf counter.  The counter is configured
 * (in psm_counter_create) to count CPU cycles for the calling process
 * and is read from user space via RDPMC using the mmapped metadata
 * page.
 */
struct psm_counter {
	int fd;					/* perf_event_open() fd */
	struct perf_event_mmap_page *metadata;	/* one mmapped metadata page */
};
| |
| struct psm_counter *psm_counter_create(void) |
| { |
| struct psm_counter *counter = malloc(sizeof(struct psm_counter)); |
| |
| struct perf_event_attr attr; |
| memset(&attr, 0, sizeof(attr)); |
| |
| attr.type = PERF_TYPE_HARDWARE; |
| attr.size = sizeof(struct perf_event_attr); |
| attr.config = PERF_COUNT_HW_CPU_CYCLES; |
| |
| counter->fd = syscall( |
| SYS_perf_event_open, |
| &attr, /* attributes */ |
| 0, /* monitor me */ |
| -1, /* all CPUs */ |
| -1, /* group leader */ |
| PERF_FLAG_FD_CLOEXEC /* flags */ |
| ); |
| |
| if (counter->fd == -1) |
| err(1, "perf_event_open"); |
| |
| counter->metadata = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, |
| counter->fd, 0); |
| if (counter->metadata == MAP_FAILED) { |
| err(1, "mmap"); |
| } |
| |
| if (!counter->metadata->cap_user_rdpmc) |
| errx(1, "RDPMC not supported (cap_user_rdpmc == 0)\n"); |
| |
| if (!counter->metadata->index) |
| errx(1, "RDPMC not supported (no index assigned)\n"); |
| |
| return counter; |
| } |
| |
| void psm_counter_destroy(struct psm_counter *counter) |
| { |
| munmap(counter->metadata, PAGE_SIZE); |
| counter->metadata = MAP_FAILED; |
| close(counter->fd); |
| counter->fd = -1; |
| } |
| |
/*
 * Compiler-only barrier: the "memory" clobber stops the compiler from
 * reordering or caching memory accesses across this point.  The empty
 * asm emits no instructions, so it is not a CPU fence.
 */
static inline void psm_barrier(void)
{
	asm volatile ("" : : : "memory");
}
| |
/*
 * Executes CPUID (leaf 0, subleaf 0), an architecturally serializing
 * instruction on x86, to force completion of all prior instructions.
 * All four CPUID output registers are declared clobbered.
 */
static inline void psm_serialize(void)
{
	unsigned int eax = 0, ecx = 0;
	asm volatile ("cpuid"
		      : "+a" (eax), "+c" (ecx) : : "ebx", "edx", "memory");
}
| |
/*
 * Reads hardware performance counter number `ecx` from user space via
 * RDPMC and reassembles EDX:EAX into a 64-bit value.  Only the low
 * pmc_width bits are meaningful; psm_atomic_elapsed() handles the
 * truncation/wrap.
 */
static inline uint64_t psm_rdpmc(unsigned int ecx)
{
	unsigned int eax, edx;
	asm volatile ("rdpmc" : "=a" (eax), "=d" (edx) : "c" (ecx));
	return (((uint64_t)edx << 32) | eax);
}
| |
/*
 * Sample function: runs the operation under test `reps` times and
 * stores the elapsed cycle count in *count.  Returns false when the
 * measurement was disturbed and the sample must be discarded/retried;
 * `opaque` is passed through for caller-specific state.
 */
typedef bool (*psm_sample_fn)(uint64_t *count,
			      const struct psm_counter *counter,
			      int reps,
			      void *opaque);
| |
| /* |
| * psm_atomic is a mechanism for very precise counter measurement of |
| * very short operations. A psm_atomic interval will fail if the |
| * process is preempted during the measurement interval. |
| */ |
| |
| struct psm_atomic { |
| /* Copied from metadata in psm_atomic_start. */ |
| uint32_t perf_lock; /* If odd, this sample is bad. */ |
| uint64_t perf_time_offset; |
| |
| uint64_t initial_raw_count; |
| |
| /* |
| * This is here to improve code generation. struct psm_atomic |
| * is unlikely to be referenced by an escapted pointer and hence |
| * be affected by a "memory" clobber. |
| */ |
| unsigned int rdpmc_ecx; |
| }; |
| |
| /* |
| * Starts measuring an uninterrupted duration. |
| */ |
| static inline struct psm_atomic psm_atomic_start(const struct psm_counter *ctr) |
| { |
| struct psm_atomic state; |
| state.perf_lock = ctr->metadata->lock; |
| psm_barrier(); |
| state.perf_time_offset = ctr->metadata->time_offset; |
| state.rdpmc_ecx = ctr->metadata->index - 1; |
| psm_barrier(); /* Do rdpmc last to reduce noise */ |
| state.initial_raw_count = psm_rdpmc(state.rdpmc_ecx); |
| return state; |
| } |
| |
/*
 * Ends the interval started by psm_atomic_start() and stores the
 * elapsed counter delta in *elapsed_count.
 *
 * Returns false if the measurement was disturbed (the perf metadata
 * changed under us), in which case *elapsed_count is meaningless and
 * the sample must be discarded.
 */
static inline bool psm_atomic_elapsed(uint64_t *elapsed_count,
				      const struct psm_atomic *state,
				      const struct psm_counter *ctr)
{
	/* Do the RDPMC first to reduce noise. */
	uint64_t count_now = psm_rdpmc(state->rdpmc_ecx);
	psm_barrier(); /* No, really, do it first. */
	/*
	 * The hardware counter is only pmc_width bits wide.  Shift both
	 * values up so the subtraction wraps at the counter width, then
	 * shift the delta back down.
	 */
	unsigned int shift = 64 - ctr->metadata->pmc_width;
	count_now <<= shift;
	uint64_t initial_count = state->initial_raw_count;
	initial_count <<= shift;
	*elapsed_count = (count_now - initial_count) >> shift;
	psm_barrier();

	if (ctr->metadata->time_offset != state->perf_time_offset)
		return false; /* We were interrupted. */

	/* Now check the lock. */
	psm_barrier();
	if (ctr->metadata->lock != state->perf_lock || (state->perf_lock & 1))
		return false;

	return true;
}
| |
| bool psm_atomic_sample_empty(uint64_t *count, |
| const struct psm_counter *ctr, |
| int reps, void *opaque) |
| { |
| struct psm_atomic duration = psm_atomic_start(ctr); |
| return psm_atomic_elapsed(count, &duration, ctr); |
| } |
| |
/*
 * Sample function: issues `reps` system calls with an out-of-range
 * syscall number (expected to fail, presumably with ENOSYS — confirm
 * per-arch), measuring raw kernel entry/exit cost.  On x86_64 the
 * SYSCALL instruction is inlined to avoid libc overhead; SYSCALL
 * clobbers rcx and r11, hence the clobber list.
 */
bool psm_atomic_sample_enosys(uint64_t *count,
			      const struct psm_counter *ctr,
			      int reps, void *opaque)
{
	struct psm_atomic duration = psm_atomic_start(ctr);
#ifdef __x86_64__
	unsigned long rax;
	for (int i = 0; i < reps; i++) {
		rax = 0xbfffffff;	/* bogus syscall nr; kernel writes error to rax */
		asm volatile ("syscall" : "+a" (rax) : : "rcx", "r11");
	}
#else
	for (int i = 0; i < reps; i++)
		syscall(0x3fffffff);
#endif
	return psm_atomic_elapsed(count, &duration, ctr);
}
| |
/*
 * Sample function: issues `reps` prctl(-1, ...) calls, which reach the
 * real syscall dispatch path but fail, measuring the cost of a valid
 * syscall number with a bad argument.  On x86_64 the SYSCALL
 * instruction is inlined with rdi pinned via a register variable;
 * SYSCALL clobbers rcx and r11.
 */
bool psm_atomic_sample_bad_prctl(uint64_t *count,
				 const struct psm_counter *ctr,
				 int reps, void *opaque)
{
	struct psm_atomic duration = psm_atomic_start(ctr);
#ifdef __x86_64__
	unsigned long rax;
	register unsigned long rdi asm("rdi");	/* first syscall argument */
	for (int i = 0; i < reps; i++) {
		rax = SYS_prctl;
		rdi = -1;	/* invalid prctl option */
		asm volatile ("syscall" : "+a" (rax), "+r" (rdi) : : "rcx", "r11");
	}
#else
	for (int i = 0; i < reps; i++)
		syscall(SYS_prctl, -1);
#endif
	return psm_atomic_elapsed(count, &duration, ctr);
}
| |
/*
 * qsort() comparator for uint64_t values, ascending.  Uses explicit
 * comparisons (not subtraction, which would truncate/wrap for 64-bit
 * values) and keeps the pointed-to data const-qualified.
 */
static int compare_ui64(const void *a_void, const void *b_void)
{
	const uint64_t a = *(const uint64_t *)a_void;
	const uint64_t b = *(const uint64_t *)b_void;

	if (a < b)
		return -1;
	else if (a > b)
		return 1;
	else
		return 0;
}
| |
| uint64_t psm_integer_quantile(const struct psm_counter *ctr, psm_sample_fn fn, |
| int reps, void *opaque, |
| size_t q, size_t n) |
| { |
| if (q >= n) |
| abort(); |
| |
| uint64_t *array = calloc(sizeof(uint64_t), n); |
| for (size_t i = 0; i < n; ) { |
| uint64_t sample; |
| if (!fn(&sample, ctr, reps, opaque)) |
| continue; |
| array[i++] = sample; |
| } |
| |
| qsort(array, n, sizeof(uint64_t), compare_ui64); |
| |
| uint64_t ret = array[q]; |
| free(array); |
| return ret; |
| } |
| |
| uint64_t psm_settle(const struct psm_counter *ctr, psm_sample_fn fn, |
| int reps, void *opaque) |
| { |
| uint64_t val = UINT64_MAX; |
| int good_iters = 0; |
| for (int total_iters = 0; total_iters < 500; total_iters++) { |
| uint64_t best = UINT64_MAX; |
| for (int inner = 0; inner < 10; ) { |
| uint64_t count; |
| if (!fn(&count, ctr, reps, opaque)) |
| continue; /* Rejected sample */ |
| if (count < best) |
| best = count; |
| inner++; |
| } |
| |
| if (best != val) { |
| good_iters = 1; |
| val = best; |
| } else { |
| good_iters++; |
| if (good_iters == 10) |
| return val; |
| } |
| } |
| |
| return ~0ULL; |
| } |
| |
/* Repetition counts used by the (currently commented-out) quantile
 * experiments in main(). */
int reparray[] = {1, 200};
| |
/* Result of psm_estimate_cost(): a linear model y = baseline + per_rep * x. */
struct psm_costestimate
{
	double baseline;	/* fixed measurement overhead, in counter units */
	double per_rep;		/* marginal cost of one repetition */
};
| |
/* One regression observation: x repetitions produced y counted cycles. */
struct pair
{
	int x;		/* repetition count requested */
	double y;	/* measured elapsed count */
};
| |
| bool psm_estimate_cost(struct psm_costestimate *est, |
| const struct psm_counter *ctr, |
| psm_sample_fn fn, void *opaque) |
| { |
| int rounds = 50; |
| int scale = 1; |
| int steps = 100; |
| int n = rounds * steps; |
| int i = 0; |
| uint64_t sample; |
| |
| struct pair *array = calloc(sizeof(struct pair), n); |
| for (int r = 0; r < rounds; r++) { |
| for (int s = 0; s < steps; s++) { |
| array[i].x = s * scale; |
| i++; |
| } |
| } |
| |
| /* Now randomly permute it. */ |
| for (i = n - 1; i >= 1; i--) { |
| int j = rand() % i; |
| int tmp = array[i].x; |
| array[i].x = array[j].x; |
| array[j].x = tmp; |
| } |
| |
| /* Burn a big sample. */ |
| fn(&sample, ctr, scale * steps * (rounds / 2), opaque); |
| |
| /* Now get all the samples and accumulate. */ |
| double sum_xy = 0.0, sum_x = 0.0, sum_xx = 0.0, sum_y = 0.0; |
| for (i = 0; i < n; i++) { |
| while (!fn(&sample, ctr, array[i].x, opaque)) |
| ; |
| array[i].y = sample; |
| |
| sum_x += array[i].x; |
| sum_xy += array[i].x * array[i].y; |
| sum_xx += (double)array[i].x * (double)array[i].x; |
| sum_y += array[i].y; |
| } |
| |
| /* Calculate a simple linear regression. */ |
| est->per_rep = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x); |
| est->baseline = (sum_y - est->per_rep * sum_x) / n; |
| |
| free(array); |
| return true; |
| }; |
| |
| int main() |
| { |
| struct psm_counter *ctr = psm_counter_create(); |
| |
| uint64_t baseline = psm_settle(ctr, psm_atomic_sample_empty, 1, NULL); |
| if (baseline == (uint64_t)~0ULL) |
| printf("Self-monitoring warm-up didn't settle down\n"); |
| else |
| printf("Self-monitoring warmed up: overhead is %llu cycles\n", |
| (unsigned long long)baseline); |
| |
| /* |
| for (int repidx = 0; repidx < 2; repidx++) { |
| int reps = reparray[repidx]; |
| for (int i = 0; i < 10; i++) { |
| uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_enosys, reps, NULL, 250, 500) - baseline; |
| printf("%dx ENOSYS: %llu\n", reps, (unsigned long long)cost/reps); |
| } |
| } |
| |
| for (int repidx = 0; repidx < 2; repidx++) { |
| int reps = reparray[repidx]; |
| for (int i = 0; i < 10; i++) { |
| uint64_t cost = psm_integer_quantile(ctr, psm_atomic_sample_bad_prctl, reps, NULL, 250, 500) - baseline; |
| printf("%dx bad prctl: %llu\n", reps, (unsigned long long)cost/reps); |
| } |
| } |
| */ |
| |
| struct psm_costestimate est; |
| |
| psm_estimate_cost(&est, ctr, psm_atomic_sample_enosys, NULL); |
| printf("ENOSYS:\t\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline); |
| |
| psm_estimate_cost(&est, ctr, psm_atomic_sample_bad_prctl, NULL); |
| printf("bad prctl:\t%.2f cycles (plus %.2f cycles self-monitoring overhead)\n", est.per_rep, est.baseline); |
| |
| psm_counter_destroy(ctr); |
| } |