Add support for more vmstat counters

Signed-off-by: Chris Mason <clm@fb.com>
diff --git a/simoop.c b/simoop.c
index 2069c08..4b61f3e 100644
--- a/simoop.c
+++ b/simoop.c
@@ -23,6 +23,8 @@
 #include <sys/mman.h>
 #include <libgen.h>
 #include <locale.h>
+#include <ctype.h>
+#include <limits.h>
 
 /* these are part of the histogram accounting */
 #define PLAT_BITS	8
@@ -71,6 +73,10 @@
 static int thinking_mem = 128 * 1024 * 1024;
 /* should we do a truncate and fsync after every write */
 static int funksync = 0;
+/* are we just appending bytes onto the ends of the working set files */
+static int append_mode = 0;
+/* randomize the write size */
+static int oddsizes = 0;
 
 /* -M how much memory we allocate to benchmark allocations */
 static int mmap_size = 64 * 1024 * 1024;
@@ -132,8 +138,130 @@
 	NULL,
 };
 
-char *option_string = "t:s:C:c:r:n:f:FR:T:m:W:M:w:i:D:";
+
+/* indexes into vmstat_labels[] and the vmstat_info arrays; keep in sync */
+enum {
+	ALLOCSTALLS,
+	VMSCAN_NR_WRITE,
+	TOTAL_VMSTATS,
+};
+
+char *vmstat_labels[] = {
+	"allocstall",		/* substring match: sums every allocstall* line */
+	"nr_vmscan_write",
+	NULL,
+};
+
+struct vmstat_info {
+	double instant_rate[TOTAL_VMSTATS];	/* counters saved after the previous report */
+	double last_rate[TOTAL_VMSTATS];	/* counters saved at startup / end of warmup */
+	double rate[TOTAL_VMSTATS];		/* latest totals read from /proc/vmstat */
+	struct stats stats[TOTAL_VMSTATS];	/* fed each reported rate for p50/p95/p99 */
+};
+
+/* remember the current counters as the baseline for the average rate */
+static void save_vmstat_rates(struct vmstat_info *vmstat_info)
+{
+	const double *cur = vmstat_info->rate;
+
+	memcpy(vmstat_info->last_rate, cur, sizeof(vmstat_info->last_rate));
+}
+
+/* remember the current counters as the baseline for the next instant rate */
+static void save_instant_vmstat_rates(struct vmstat_info *vmstat_info)
+{
+	const double *cur = vmstat_info->rate;
+
+	memcpy(vmstat_info->instant_rate, cur, sizeof(vmstat_info->instant_rate));
+}
+
+/*
+ * A simple, unoptimized fls64: 1-based index of the highest set bit,
+ * 0 when x is 0.  Only parse_size uses it, so speed does not matter.
+ */
+static int fls64(unsigned long long x)
+{
+	int i;
+
+	for (i = 0; i <64; i++)
+		if (x << i & (1ULL << 63))
+			return 64 - i;
+	return 64 - i;
+}
+
+/*
+ * Parse a decimal byte count with an optional one-letter binary suffix
+ * (b, k, m, g, t, p, e; any case).  Bad input prints an error and exits.
+ */
+unsigned long long parse_size(char *s)
+{
+	char c;
+	char *endptr;
+	unsigned long long mult = 1;
+	unsigned long long ret;
+
+	if (!s) {
+		fprintf(stderr, "size value is empty\n");
+		exit(1);
+	}
+	if (s[0] == '-') {
+		fprintf(stderr, "size value '%s' is less equal than 0\n", s);
+		exit(1);
+	}
+	errno = 0;	/* strtoull reports overflow only through errno */
+	ret = strtoull(s, &endptr, 10);
+	if (endptr == s) {
+		fprintf(stderr, "size value '%s' is invalid\n", s);
+		exit(1);
+	}
+	if (endptr[0] && endptr[1]) {
+		fprintf(stderr, "illegal suffix contains character '%c' in wrong position\n",
+			endptr[1]);
+		exit(1);
+	}
+	if (errno == ERANGE && ret == ULLONG_MAX) {
+		fprintf(stderr, "size value '%s' is too large for unsigned long long\n", s);
+		exit(1);
+	}
+	if (endptr[0]) {
+		c = tolower((unsigned char)endptr[0]);
+		switch (c) {
+		case 'e':
+			mult *= 1024;
+			/* fallthrough */
+		case 'p':
+			mult *= 1024;
+			/* fallthrough */
+		case 't':
+			mult *= 1024;
+			/* fallthrough */
+		case 'g':
+			mult *= 1024;
+			/* fallthrough */
+		case 'm':
+			mult *= 1024;
+			/* fallthrough */
+		case 'k':
+			mult *= 1024;
+			/* fallthrough */
+		case 'b':
+			break;
+		default:
+			fprintf(stderr, "unknown size descriptor '%c'\n", c);
+			exit(1);
+		}
+	}
+	/* mult is a power of two, so bit widths tell if ret * mult overflows */
+	if (fls64(ret) + fls64(mult) - 1 > 64) {
+		fprintf(stderr, "size value '%s' is too large for unsigned long long\n", s);
+		exit(1);
+	}
+	return ret * mult;
+}
+
+char *option_string = "t:s:C:c:r:n:f:FR:T:m:W:M:w:i:D:oa";
 static struct option long_options[] = {
+	{"appendmode", no_argument, 0, 'a'},	/* plain flag, like "a" in option_string */
 	{"mmapsize", required_argument, 0, 'M'},
 	{"filesize", required_argument, 0, 'f'},
 	{"numfiles", required_argument, 0, 'n'},
@@ -150,6 +278,7 @@
 	{"cputhreads", required_argument, 0, 'C'},
 	{"memory", required_argument, 0, 'm'},
 	{"funksync", no_argument, 0, 'F'},
+	{"oddsizes", no_argument, 0, 'o'},
 	{"help", no_argument, 0, HELP_LONG_OPT},
 	{0, 0, 0, 0}
 };
@@ -157,9 +286,10 @@
 static void print_usage(void)
 {
 	fprintf(stderr, "simoop usage:\n"
+		"\t-a (--appendmode): append onto working files\n"
 		"\t-t (--threads): worker threads (def: 16)\n"
-		"\t-m (--memory): memory in MB to allocate during think time in each worker (def 128)\n"
-		"\t-M (--mmapsize): amount in MB to mmap to time allocator (64MB)\n"
+		"\t-m (--memory): memory to allocate during think time in each worker (def 128m)\n"
+		"\t-M (--mmapsize): amount to mmap to time allocator (64M)\n"
 		"\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n"
 		"\t-w (--warmuptime): How long to warmup before resetting the stats (seconds, def: 60)\n"
 		"\t-i (--interval): Sleep time in seconds between latency reports (sec, def: 120\n"
@@ -167,13 +297,15 @@
 		"\t-c (--cputime): How long to think during each worker loop (seconds, def: 3)\n"
 		"\t-C (--cputhreads): How many threads do the cpu time loop (24)\n"
 		"\t-n (--numfiles): Number of files per directory tree (65536)\n"
-		"\t-f (--filesize): Size of each file in MB (64MB)\n"
-		"\t-R (--readsize): amount in MB to read from each file (2MB)\n"
-		"\t-W (--writesize): amount in MB to write to tmp files (2MB)\n"
+		"\t-f (--filesize): Size of each file (64M)\n"
+		"\t-R (--readsize): amount to read from each file (2M)\n"
+		"\t-W (--writesize): amount to write to tmp files (2M)\n"
 		"\t-T (--rwthreads): how many threads to read/write (8)\n"
 		"\t-D (--duthraeds): how many threads to scanning the working dirs (1)\n"
 		"\t-F (--funksync): should we fsync;truncate(0);fsync after writes\n"
+		"\t-o (--oddsizes): randomize sizes to unaligned values\n"
 		"\t dir1 [dir2 ... dirN]\n"
+		"\nall sizes are in bytes k,m,g,t modifiers can be used\n"
 	       );
 	exit(1);
 }
@@ -195,16 +327,17 @@
 			break;
 
 		switch(c) {
+		case 'a':
+			append_mode = 1;
+			break;
 		case 's':
 			found_sleeptime = atoi(optarg);
 			break;
 		case 'm':
-			thinking_mem = atoi(optarg);
-			thinking_mem *= 1024 *1024;
+			thinking_mem = parse_size(optarg);
 			break;
 		case 'M':
-			mmap_size = atoi(optarg);
-			mmap_size *= 1024 *1024;
+			mmap_size = parse_size(optarg);
 			break;
 		case 'c':
 			found_cputime = atoi(optarg);
@@ -228,19 +361,16 @@
 			funksync = 0;
 			break;
 		case 'f':
-			file_size = atoi(optarg);
-			file_size *= 1024 * 1024;
+			file_size = parse_size(optarg);
 			break;
 		case 'n':
 			num_files = atoi(optarg);
 			break;
 		case 'R':
-			read_size = atoi(optarg);
-			read_size *= 1024 * 1024;
+			read_size = parse_size(optarg);
 			break;
 		case 'W':
-			write_size = atoi(optarg);
-			write_size *= 1024 * 1024;
+			write_size = parse_size(optarg);
 			break;
 		case 'T':
 			rw_threads = atoi(optarg);
@@ -248,6 +378,9 @@
 		case 'D':
 			du_threads = atoi(optarg);
 			break;
+		case 'o':
+			oddsizes = 1;
+			break;
 		case '?':
 		case HELP_LONG_OPT:
 			print_usage();
@@ -281,6 +414,13 @@
 	if (found_cputime >= 0)
 		cputime = found_cputime * 1000000;
 
+	if (cputime == 0 || cpu_threads == 0) {
+		cputime = 0;
+		cpu_threads = 0;
+		if (!found_sleeptime)
+			sleeptime = 0;
+	}
+
 	if (optind < ac) {
 		fprintf(stderr, "Error Extra arguments '%s'\n", av[optind]);
 		exit(1);
@@ -610,6 +750,14 @@
 	return fd;
 }
 
+/* with -o return a pseudo-random value below sz; otherwise (or for 0) sz */
+static int randomize_size(int sz)
+{
+	if (!oddsizes || sz == 0)	/* rand() % 0 would be undefined */
+		return sz;
+	return rand() % sz;
+}
+
 /* helper for startup, do initial writes to a given fd */
 static void fill_one_file(int fd)
 {
@@ -617,6 +765,7 @@
 	int ret;
 	unsigned long long cur_size;
 	char *buf;
+	unsigned long long this_size = randomize_size(file_size);
 
 	ret = fstat(fd, &st);
 	if (ret < 0) {
@@ -625,8 +774,16 @@
 	}
 	cur_size = st.st_size;
 
-	if (cur_size >= file_size)
+	if (append_mode && oddsizes && this_size > 4096 && rand() % 2) {
+		this_size = this_size % 4096;
+	}
+
+	if (cur_size >= this_size) {
+		if (append_mode) {
+			ftruncate(fd, this_size);
+		}
 		return;
+	}
 
 	buf = malloc(BUF_SIZE);
 	if (!buf) {
@@ -634,21 +791,24 @@
 		exit(1);
 	}
 
-
 	memset(buf, 'a', BUF_SIZE);
-	while (cur_size < file_size) {
-		ret = write(fd, buf, BUF_SIZE);
+	while (cur_size < this_size) {
+		int this_write = this_size - cur_size;
+
+		if (this_write > BUF_SIZE)
+			this_write = BUF_SIZE;
+
+		ret = write(fd, buf, this_write);
 		if (ret < 0) {
 			perror("write");
 			exit(1);
 		}
-		if (ret < BUF_SIZE) {
+		if (ret < this_write) {
 			fprintf(stderr, "short write\n");
 			exit(1);
 		}
 		cur_size += ret;
 	}
-
 	free(buf);
 }
 
@@ -761,13 +921,23 @@
 	int fd;
 	int ret;
 	int i;
-	unsigned long write_bytes = write_size;
+	int write_bytes = randomize_size(write_size);
 	unsigned long long offset = 0;
 
-	fd = open_path(path, seq, RESULT_FILE, 0);
+	if (append_mode)
+		fd = open_path(path, seq, DATA_FILE, O_APPEND);
+	else
+		fd = open_path(path, seq, RESULT_FILE, 0);
+
+	if (oddsizes && write_bytes > 4096)
+		write_bytes = write_bytes % 4096;
 
 	while (write_bytes > 0) {
-		ret = write(fd, buf, BUF_SIZE);
+		int this_write = write_bytes;
+		if (this_write > BUF_SIZE)
+			this_write = BUF_SIZE;
+
+		ret = write(fd, buf, this_write);
 		if (ret == 0)
 			break;
 		if (ret < 0) {
@@ -786,8 +956,13 @@
 	close(fd);
 
 	/* make some dirty inodes */
-	for (i = 0; i < 8; i++)
-		dirty_an_inode(path);
+	if (!append_mode) {
+		for (i = 0; i < 8; i++)
+			dirty_an_inode(path);
+	} else if (rand() % 10 == 0) {
+		/* delete some files */
+		ftruncate(fd, 0);
+	}
 }
 
 /* make all the worker files under a main path */
@@ -797,7 +972,7 @@
 	int fd;
 
 	for (seq = 0; seq < num_files; seq++) {
-		fd = open_path(path, seq, DATA_FILE, 0);
+		fd = open_path(path, seq, DATA_FILE, O_APPEND);
 		fill_one_file(fd);
 		close(fd);
 
@@ -919,9 +1094,10 @@
 
 	read_tids = malloc(sizeof(*read_tids) * rw_threads);
 	write_tids = malloc(sizeof(*write_tids) * rw_threads);
-	mem = malloc(thinking_mem);
+	if (thinking_mem)
+		mem = malloc(thinking_mem);
 
-	if (!read_tids || !write_tids || !mem) {
+	if (!read_tids || !write_tids || (thinking_mem && !mem)) {
 		perror("allocation failed\n");
 		exit(1);
 	}
@@ -944,28 +1120,31 @@
 		}
 
 		/* if someone swapped out our thinking mem, bring it back */
-		memset(mem, 0, thinking_mem);
+		if (thinking_mem)
+			memset(mem, 0, thinking_mem);
 
 		gettimeofday(&start, NULL);
 
 		/* Start the threads to read files */
-		read_some_files(read_buf, read_tids);
+		if (read_size) {
+			read_some_files(read_buf, read_tids);
 
-		/* think in parallel */
-		usec_spin(cputime);
+			/* think in parallel */
+			usec_spin(cputime);
 
-		/* wait for our reads to finish */
-		for (i = 0; i < rw_threads; i++) {
-			pthread_join(read_tids[i], NULL);
+			/* wait for our reads to finish */
+			for (i = 0; i < rw_threads; i++) {
+				pthread_join(read_tids[i], NULL);
+			}
+			gettimeofday(&now, NULL);
+
+			/*
+			 * record how long the reading stage took.  This
+			 * includes all of the latencies for thread creation,
+			 * doing the reads and waiting for completion
+			 */
+			record_one_lat(&td->stats[READ_STATS], &start, &now);
 		}
-		gettimeofday(&now, NULL);
-
-		/*
-		 * record how long the reading stage took.  This
-		 * includes all of the latencies for thread creation,
-		 * doing the reads and waiting for completeion
-		 */
-		record_one_lat(&td->stats[READ_STATS], &start, &now);
 
 		/* write out the (pretend) results */
 		if (write_size) {
@@ -983,35 +1162,38 @@
 		 * a chunk of pages.  This is basicaly the user-visible
 		 * impact of allocation stalls
 		 */
-		gettimeofday(&start, NULL);
+		if (mmap_size) {
+			gettimeofday(&start, NULL);
 
-		mmap_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-				MAP_ANONYMOUS | MAP_PRIVATE,
-				-1, 0);
+			mmap_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+					MAP_ANONYMOUS | MAP_PRIVATE,
+					-1, 0);
 
-		if (mmap_ptr == MAP_FAILED) {
-			perror("mmap");
-			exit(1);
+			if (mmap_ptr == MAP_FAILED) {
+				perror("mmap");
+				exit(1);
+			}
+
+			/* fault in all those pages */
+			for (i = 0; i < mmap_size; i += 4096) {
+				mmap_ptr[i] = 'a';
+			}
+
+			/* measure how long all of this took */
+			gettimeofday(&now, NULL);
+			record_one_lat(&td->stats[ALLOC_STATS], &start, &now);
+
+			/* think again, pretending we've done something useful */
+			munmap(mmap_ptr, mmap_size);
 		}
-
-		/* fault in all those pages */
-		for (i = 0; i < mmap_size; i += 4096) {
-			mmap_ptr[i] = 'a';
-		}
-
-		/* measure how long all of this took */
-		gettimeofday(&now, NULL);
-		record_one_lat(&td->stats[ALLOC_STATS], &start, &now);
-
-		/* think again, pretending we've done something useful */
-		munmap(mmap_ptr, mmap_size);
 		usec_spin(cputime);
+
 		free(read_buf);
 		free(write_buf);
-
 		td->work_done++;
 
-		usleep(sleeptime);
+		if (sleeptime)
+			usleep(sleeptime);
 	}
 
 	free(mem);
@@ -1039,29 +1221,32 @@
  * read in /proc/vmstat so we can sum the allocation stall lines and
  * print them out
  */
-static int read_allocstalls(void)
+static void read_vmstat(struct vmstat_info *vmstat_info)
 {
-	int stalls = 0;
 	int val;
 	FILE * fp;
 	char * line = NULL;
 	size_t len = 0;
 	ssize_t read;
+	int i;
 
 	fp = fopen("/proc/vmstat", "r");
 	if (fp == NULL)
-		return 0;
+		return;
 
+	memset(vmstat_info->rate, 0, sizeof(double) * TOTAL_VMSTATS);
 	while ((read = getline(&line, &len, fp)) != -1) {
 		/*
 		 * newer kernels break out different types of allocstall,
 		 * just add them all together
 		 */
-		if (strstr(line, "allocstall")) {
-			char *p = strchr(line, ' ');
-			if (p && p[1] != '\0') {
-				val = atoi(p + 1);
-				stalls += val;
+		for (i = 0; i < TOTAL_VMSTATS; i++) {
+			if (strstr(line, vmstat_labels[i])) {
+				char *p = strchr(line, ' ');
+				if (p && p[1] != '\0') {
+					val = atoi(p + 1);
+					vmstat_info->rate[i] += val;
+				}
 			}
 		}
 	}
@@ -1069,7 +1254,6 @@
 	if (line)
 		free(line);
 	fclose(fp);
-	return stalls;
 }
 
 /*
@@ -1101,10 +1285,8 @@
 static void print_latencies(struct thread_data *worker_threads_mem,
 			    struct stats *stats,
 			    struct stats *work_done_stats,
-			    struct stats *allocstall_stats,
+			    struct vmstat_info *vmstat_info,
 			    double work_done, double instant_work_done,
-			    double allocstalls,
-			    double instant_allocstalls,
 			    unsigned long long delta,
 			    unsigned long long instant_delta)
 {
@@ -1112,6 +1294,7 @@
 	double instant_rate;
 	double seconds = (double)delta / 1000000;
 	unsigned int p50, p95, p99;
+	int i;
 
 	printf("___\n");
 	printf("Run time: %.0f seconds\n", seconds);
@@ -1128,13 +1311,24 @@
 	printf("work rate = %.2f/sec (avg %.2f/sec) (p50: %.2f) (p95: %.2f) (p99: %.2f)\n",
 	       instant_rate, rate, (double)p50/100.00, (double)p95/100.00, (double)p99/100.00);
 
-	/* do the same for the allocation stall rate */
-	rate = (allocstalls * 1000000) / delta;
-	instant_rate = (instant_allocstalls * 1000000) / instant_delta;
-	add_lat(allocstall_stats, rate * 100);
-	calc_p99(allocstall_stats, &p50, &p95, &p99);
-	printf("alloc stall rate = %.2f/sec (avg: %.2f) (p50: %.2f) (p95: %.2f) (p99: %.2f)\n",
-	       instant_rate, rate, (double)p50/100.00, (double)p95/100.00, (double)p99/100.00);
+	/* report every tracked vmstat counter as a per-second rate */
+	for (i = 0; i < TOTAL_VMSTATS; i++) {
+		/* growth since the saved baselines, clamped at zero */
+		rate = vmstat_info->rate[i] - vmstat_info->last_rate[i];
+		if (rate < 0)
+			rate = 0;
+		instant_rate = vmstat_info->rate[i] - vmstat_info->instant_rate[i];
+		if (instant_rate < 0)
+			instant_rate = 0;
+		/* both deltas are in usec; each rate uses its own time window */
+		rate = (rate * 1000000) / delta;
+		instant_rate = (instant_rate * 1000000) / instant_delta;
+		add_lat(&vmstat_info->stats[i], rate * 100);
+		calc_p99(&vmstat_info->stats[i], &p50, &p95, &p99);
+		printf("%s rate = %.2f/sec (avg: %.2f) (p50: %.2f) (p95: %.2f) (p99: %.2f)\n",
+		       vmstat_labels[i], instant_rate, rate,
+		       (double)p50/100.00, (double)p95/100.00, (double)p99/100.00);
+	}
 
 }
 
@@ -1153,22 +1347,20 @@
 	double work_done = 0;
 	double instant_work_done = 0;
 	double last_work_done = 0;
-	double allocstalls = 0;
-	double instant_allocstalls = 0;
-	double last_allocstalls = 0;
 	struct stats stats[TOTAL_STATS];
+	struct vmstat_info vmstat_info;
 	struct stats work_done_stats;
-	struct stats allocstall_stats;
 	int i;
 
 	gettimeofday(&start, NULL);
 	rate_start = start;
 
 	memset(&work_done_stats, 0, sizeof(work_done_stats));
-	memset(&allocstall_stats, 0, sizeof(allocstall_stats));
+	memset(&vmstat_info, 0, sizeof(vmstat_info));
 
-	last_allocstalls = read_allocstalls();
-	allocstalls = last_allocstalls;
+	read_vmstat(&vmstat_info);
+	save_vmstat_rates(&vmstat_info);
+	save_instant_vmstat_rates(&vmstat_info);
 	while(1) {
 		gettimeofday(&now, NULL);
 		instant_start = now;
@@ -1179,13 +1371,14 @@
 			__sync_synchronize();
 			warmup_done = 1;
 			memset(&work_done_stats, 0, sizeof(work_done_stats));
-			memset(&allocstall_stats, 0, sizeof(allocstall_stats));
-			last_allocstalls = read_allocstalls();
+			memset(&vmstat_info, 0, sizeof(vmstat_info));
+			read_vmstat(&vmstat_info);
+			save_vmstat_rates(&vmstat_info);
+			save_instant_vmstat_rates(&vmstat_info);
 			last_work_done = work_done;
 			rate_start = now;
 		}
 
-		instant_allocstalls = allocstalls;
 		instant_work_done = work_done;
 		if (delta < runtime_usec)
 			sleep(interval_seconds);
@@ -1200,19 +1393,15 @@
 		for (i = 0; i < worker_threads; i++)
 			work_done += worker_threads_mem[i].work_done;
 
-		allocstalls = read_allocstalls();
-		if (allocstalls < last_allocstalls)
-			allocstalls = last_allocstalls;
-
+		read_vmstat(&vmstat_info);
 		print_latencies(worker_threads_mem, stats,
 				&work_done_stats,
-				&allocstall_stats,
+				&vmstat_info,
 				work_done - last_work_done,
 				work_done - instant_work_done,
-				allocstalls - last_allocstalls,
-				allocstalls - instant_allocstalls,
 				rate_delta, instant_delta);
 
+		save_instant_vmstat_rates(&vmstat_info);
 	}
 	__sync_synchronize();
 	stopping = 1;
@@ -1228,13 +1417,12 @@
 	gettimeofday(&now, NULL);
 	rate_delta = tvdelta(&rate_start, &now);
 	instant_delta = tvdelta(&instant_start, &now);
+	read_vmstat(&vmstat_info);
 	print_latencies(worker_threads_mem, stats,
 			&work_done_stats,
-			&allocstall_stats,
+			&vmstat_info,
 			work_done - last_work_done,
 			work_done - instant_work_done,
-			allocstalls - last_allocstalls,
-			allocstalls - instant_allocstalls,
 			rate_delta, instant_delta);
 
 }
@@ -1252,7 +1440,9 @@
 
 	if (du_threads > total_paths)
 		du_threads = total_paths;
-	du_tids = calloc(du_threads, sizeof(pthread_t));
+
+	/* du threads might be zero */
+	du_tids = calloc(du_threads + 1, sizeof(pthread_t));
 
 	worker_threads_mem = calloc(worker_threads + cpu_threads,
 				    sizeof(struct thread_data));