test/cq-overflow.c - pub/scm/linux/kernel/git/axboe/liburing - Git at Google

 /* SPDX-License-Identifier: MIT */
 /*
  * Description: run various CQ ring overflow tests
  *
  */
 #include <errno.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
 #include <assert.h>

 #include "helpers.h"
 #include "liburing.h"

 #define FILE_SIZE	(256 * 1024)
 #define BS		4096
 #define BUFFERS		(FILE_SIZE / BS)

 static struct iovec *vecs;

 #define ENTRIES	8

 /*
  * io_uring has rare cases where CQEs are lost.
  * This happens when there is no space in the CQ ring, and also there is no
  * GFP_ATOMIC memory available. In reality this probably means that the process
  * is about to be killed as many other things might start failing, but we still
  * want to test that liburing and the kernel deal with this properly. The fault
  * injection framework allows us to test this scenario. Unfortunately this
  * requires some system wide changes and so we do not enable this by default.
  * The tests in this file should work in both cases (where overflows are queued
  * and where they are dropped) on recent kernels.
  *
  * In order to test dropped CQEs you should enable fault injection in the kernel
  * config:
  *
  * CONFIG_FAULT_INJECTION=y
  * CONFIG_FAILSLAB=y
  * CONFIG_FAULT_INJECTION_DEBUG_FS=y
  *
  * and then run the test as follows:
  * echo Y > /sys/kernel/debug/failslab/task-filter
  * echo 100 > /sys/kernel/debug/failslab/probability
  * echo 0 > /sys/kernel/debug/failslab/verbose
  * echo 100000 > /sys/kernel/debug/failslab/times
  * bash -c "echo 1 > /proc/self/make-it-fail && exec ./cq-overflow.t"
  */

 static int test_io(const char *file, unsigned long usecs, unsigned *drops,
 		   int fault)
 {
 	struct io_uring_sqe *sqe;
 	struct io_uring_cqe *cqe;
 	struct io_uring_params p;
 	unsigned reaped, total;
 	struct io_uring ring;
 	int nodrop, i, fd, ret;
 	bool cqe_dropped = false;

 	fd = open(file, O_RDONLY | O_DIRECT);
 	if (fd < 0) {
 		if (errno == EINVAL)
 			return T_EXIT_SKIP;
 		perror("file open");
 		return T_EXIT_FAIL;
 	}

 	memset(&p, 0, sizeof(p));
 	ret = io_uring_queue_init_params(ENTRIES, &ring, &p);
 	if (ret) {
 		close(fd);
 		fprintf(stderr, "ring create failed: %d\n", ret);
 		return T_EXIT_FAIL;
 	}
 	nodrop = 0;
 	if (p.features & IORING_FEAT_NODROP)
 		nodrop = 1;

 	total = 0;
 	for (i = 0; i < BUFFERS / 2; i++) {
 		off_t offset;

 		sqe = io_uring_get_sqe(&ring);
 		if (!sqe) {
 			fprintf(stderr, "sqe get failed\n");
 			goto err;
 		}
 		offset = BS * (rand() % BUFFERS);
 		if (fault && i == ENTRIES + 4) {
 			free(vecs[i].iov_base);
 			vecs[i].iov_base = NULL;
 		}
 		io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);

 		ret = io_uring_submit(&ring);
 		if (nodrop && ret == -EBUSY) {
 			*drops = 1;
 			total = i;
 			break;
 		} else if (ret != 1) {
 			fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
 			total = i;
 			break;
 		}
 		total++;
 	}

 	if (*drops)
 		goto reap_it;

 	usleep(usecs);

 	for (i = total; i < BUFFERS; i++) {
 		off_t offset;

 		sqe = io_uring_get_sqe(&ring);
 		if (!sqe) {
 			fprintf(stderr, "sqe get failed\n");
 			goto err;
 		}
 		offset = BS * (rand() % BUFFERS);
 		io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);

 		ret = io_uring_submit(&ring);
 		if (nodrop && ret == -EBUSY) {
 			*drops = 1;
 			break;
 		} else if (ret != 1) {
 			fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
 			break;
 		}
 		total++;
 	}

 reap_it:
 	reaped = 0;
 	do {
 		if (nodrop && !cqe_dropped) {
 			/* nodrop should never lose events unless cqe_dropped */
 			if (reaped == total)
 				break;
 		} else {
 			if (reaped + *ring.cq.koverflow == total)
 				break;
 		}
 		ret = io_uring_wait_cqe(&ring, &cqe);
 		if (nodrop && ret == -EBADR) {
 			cqe_dropped = true;
 			continue;
 		} else if (ret) {
 			fprintf(stderr, "wait_cqe=%d\n", ret);
 			goto err;
 		}
 		if (cqe->res != BS) {
 			if (!(fault && cqe->res == -EFAULT)) {
 				fprintf(stderr, "cqe res %d, wanted %d\n",
 						cqe->res, BS);
 				goto err;
 			}
 		}
 		io_uring_cqe_seen(&ring, cqe);
 		reaped++;
 	} while (1);

 	if (!io_uring_peek_cqe(&ring, &cqe)) {
 		fprintf(stderr, "found unexpected completion\n");
 		goto err;
 	}

 	if (!nodrop || cqe_dropped) {
 		*drops = *ring.cq.koverflow;
 	} else if (*ring.cq.koverflow) {
 		fprintf(stderr, "Found %u overflows\n", *ring.cq.koverflow);
 		goto err;
 	}

 	io_uring_queue_exit(&ring);
 	close(fd);
 	return T_EXIT_PASS;
 err:
 	if (fd != -1)
 		close(fd);
 	io_uring_queue_exit(&ring);
 	return T_EXIT_SKIP;
 }

 static int reap_events(struct io_uring *ring, unsigned nr_events, int do_wait)
 {
 	struct io_uring_cqe *cqe;
 	int i, ret = 0, seq = 0;
 	unsigned int start_overflow = *ring->cq.koverflow;
 	bool dropped = false;

 	for (i = 0; i < nr_events; i++) {
 		if (do_wait)
 			ret = io_uring_wait_cqe(ring, &cqe);
 		else
 			ret = io_uring_peek_cqe(ring, &cqe);
 		if (do_wait && ret == -EBADR) {
 			unsigned int this_drop = *ring->cq.koverflow -
 				start_overflow;

 			dropped = true;
 			start_overflow = *ring->cq.koverflow;
 			assert(this_drop > 0);
 			i += (this_drop - 1);
 			continue;
 		} else if (ret) {
 			if (ret != -EAGAIN)
 				fprintf(stderr, "cqe peek failed: %d\n", ret);
 			break;
 		}
 		if (!dropped && cqe->user_data != seq) {
 			fprintf(stderr, "cqe sequence out-of-order\n");
 			fprintf(stderr, "got %d, wanted %d\n", (int) cqe->user_data,
 					seq);
 			return -EINVAL;
 		}
 		seq++;
 		io_uring_cqe_seen(ring, cqe);
 	}

 	return i ? i : ret;
 }

 /*
  * Submit some NOPs and watch if the overflow is correct
  */
 static int test_overflow(void)
 {
 	struct io_uring ring;
 	struct io_uring_params p;
 	struct io_uring_sqe *sqe;
 	unsigned pending;
 	int ret, i, j;

 	memset(&p, 0, sizeof(p));
 	ret = io_uring_queue_init_params(4, &ring, &p);
 	if (ret) {
 		fprintf(stderr, "io_uring_queue_init failed %d\n", ret);
 		return 1;
 	}

 	/* submit 4x4 SQEs, should overflow the ring by 8 */
 	pending = 0;
 	for (i = 0; i < 4; i++) {
 		for (j = 0; j < 4; j++) {
 			sqe = io_uring_get_sqe(&ring);
 			if (!sqe) {
 				fprintf(stderr, "get sqe failed\n");
 				goto err;
 			}

 			io_uring_prep_nop(sqe);
 			sqe->user_data = (i * 4) + j;
 		}

 		ret = io_uring_submit(&ring);
 		if (ret == 4) {
 			pending += 4;
 			continue;
 		}
 		if (p.features & IORING_FEAT_NODROP) {
 			if (ret == -EBUSY)
 				break;
 		}
 		fprintf(stderr, "sqe submit failed: %d\n", ret);
 		goto err;
 	}

 	/* we should now have 8 completions ready */
 	ret = reap_events(&ring, pending, 0);
 	if (ret < 0)
 		goto err;

 	if (!(p.features & IORING_FEAT_NODROP)) {
 		if (*ring.cq.koverflow != 8) {
 			fprintf(stderr, "cq ring overflow %d, expected 8\n",
 					*ring.cq.koverflow);
 			goto err;
 		}
 	}
 	io_uring_queue_exit(&ring);
 	return 0;
 err:
 	io_uring_queue_exit(&ring);
 	return 1;
 }


 static void submit_one_nop(struct io_uring *ring, int ud)
 {
 	struct io_uring_sqe *sqe;
 	int ret;

 	sqe = io_uring_get_sqe(ring);
 	assert(sqe);
 	io_uring_prep_nop(sqe);
 	sqe->user_data = ud;
 	ret = io_uring_submit(ring);
 	assert(ret == 1);
 }

 /*
  * Create an overflow condition and ensure that SQEs are still processed
  */
 static int test_overflow_handling(bool batch, int cqe_multiple, bool poll,
 				  bool defer)
 {
 	struct io_uring ring;
 	struct io_uring_params p;
 	int ret, i, j, ud, cqe_count;
 	unsigned int count;
 	int const N = 8;
 	int const LOOPS = 128;
 	int const QUEUE_LENGTH = 1024;
 	int completions[N];
 	int queue[QUEUE_LENGTH];
 	int queued = 0;
 	int outstanding = 0;
 	bool cqe_dropped = false;

 	memset(&completions, 0, sizeof(int) * N);
 	memset(&p, 0, sizeof(p));
 	p.cq_entries = 2 * cqe_multiple;
 	p.flags |= IORING_SETUP_CQSIZE;

 	if (poll)
 		p.flags |= IORING_SETUP_IOPOLL;

 	if (defer)
 		p.flags |= IORING_SETUP_SINGLE_ISSUER |
 			   IORING_SETUP_DEFER_TASKRUN;

 	ret = io_uring_queue_init_params(2, &ring, &p);
 	if (ret) {
 		fprintf(stderr, "io_uring_queue_init failed %d\n", ret);
 		return 1;
 	}

 	assert(p.cq_entries < N);
 	/* submit N SQEs, some should overflow */
 	for (i = 0; i < N; i++) {
 		submit_one_nop(&ring, i);
 		outstanding++;
 	}

 	for (i = 0; i < LOOPS; i++) {
 		struct io_uring_cqe *cqes[N];

 		if (io_uring_cq_has_overflow(&ring)) {
 			/*
 			 * Flush any overflowed CQEs and process those. Actively
 			 * flush these to make sure CQEs arrive in vague order
 			 * of being sent.
 			 */
 			ret = io_uring_get_events(&ring);
 			if (ret != 0) {
 				fprintf(stderr,
 					"io_uring_get_events returned %d\n",
 					ret);
 				goto err;
 			}
 		} else if (!cqe_dropped) {
 			for (j = 0; j < queued; j++) {
 				submit_one_nop(&ring, queue[j]);
 				outstanding++;
 			}
 			queued = 0;
 		}

 		/* We have lost some random cqes, stop if no remaining. */
 		if (cqe_dropped && outstanding == *ring.cq.koverflow)
 			break;

 		ret = io_uring_wait_cqe(&ring, &cqes[0]);
 		if (ret == -EBADR) {
 			cqe_dropped = true;
 			fprintf(stderr, "CQE dropped\n");
 			continue;
 		} else if (ret != 0) {
 			fprintf(stderr, "io_uring_wait_cqes failed %d\n", ret);
 			goto err;
 		}
 		cqe_count = 1;
 		if (batch) {
 			ret = io_uring_peek_batch_cqe(&ring, &cqes[0], 2);
 			if (ret < 0) {
 				fprintf(stderr,
 					"io_uring_peek_batch_cqe failed %d\n",
 					ret);
 				goto err;
 			}
 			cqe_count = ret;
 		}
 		for (j = 0; j < cqe_count; j++) {
 			assert(cqes[j]->user_data < N);
 			ud = cqes[j]->user_data;
 			completions[ud]++;
 			assert(queued < QUEUE_LENGTH);
 			queue[queued++] = (int)ud;
 		}
 		io_uring_cq_advance(&ring, cqe_count);
 		outstanding -= cqe_count;
 	}

 	/* See if there were any drops by flushing the CQ ring *and* overflow */
 	do {
 		struct io_uring_cqe *cqe;

 		ret = io_uring_get_events(&ring);
 		if (ret < 0) {
 			if (ret == -EBADR) {
 				fprintf(stderr, "CQE dropped\n");
 				cqe_dropped = true;
 				break;
 			}
 			goto err;
 		}
 		if (outstanding && !io_uring_cq_ready(&ring))
 			ret = io_uring_wait_cqe_timeout(&ring, &cqe, NULL);

 		if (ret && ret != -ETIME) {
 			if (ret == -EBADR) {
 				fprintf(stderr, "CQE dropped\n");
 				cqe_dropped = true;
 				break;
 			}
 			fprintf(stderr, "wait_cqe_timeout = %d\n", ret);
 			goto err;
 		}
 		count = io_uring_cq_ready(&ring);
 		io_uring_cq_advance(&ring, count);
 		outstanding -= count;
 	} while (count);

 	io_uring_queue_exit(&ring);

 	/* Make sure that completions come back in the same order they were
 	 * sent. If they come back unfairly then this will concentrate on a
 	 * couple of indices.
 	 */
 	for (i = 1; !cqe_dropped && i < N; i++) {
 		if (abs(completions[i] - completions[i - 1]) > 1) {
 			fprintf(stderr, "bad completion size %d %d\n",
 				completions[i], completions[i - 1]);
 			goto err;
 		}
 	}
 	return 0;
 err:
 	io_uring_queue_exit(&ring);
 	return 1;
 }

 int main(int argc, char *argv[])
 {
 	const char *fname = ".cq-overflow";
 	unsigned iters, drops;
 	unsigned long usecs;
 	int ret;
 	int i;
 	bool can_defer;

 	if (argc > 1)
 		return T_EXIT_SKIP;

 	can_defer = t_probe_defer_taskrun();
 	for (i = 0; i < 16; i++) {
 		bool batch = i & 1;
 		int mult = (i & 2) ? 1 : 2;
 		bool poll = i & 4;
 		bool defer = i & 8;

 		if (defer && !can_defer)
 			continue;

 		ret = test_overflow_handling(batch, mult, poll, defer);
 		if (ret) {
 			fprintf(stderr, "test_overflow_handling("
 				"batch=%d, mult=%d, poll=%d, defer=%d) failed\n",
 				batch, mult, poll, defer);
 			goto err;
 		}
 	}

 	ret = test_overflow();
 	if (ret) {
 		fprintf(stderr, "test_overflow failed\n");
 		return ret;
 	}

 	t_create_file(fname, FILE_SIZE);

 	vecs = t_create_buffers(BUFFERS, BS);

 	iters = 0;
 	usecs = 1000;
 	do {
 		drops = 0;

 		ret = test_io(fname, usecs, &drops, 0);
 		if (ret == T_EXIT_SKIP)
 			break;
 		else if (ret != T_EXIT_PASS) {
 			fprintf(stderr, "test_io nofault failed\n");
 			goto err;
 		}
 		if (drops)
 			break;
 		usecs = (usecs * 12) / 10;
 		iters++;
 	} while (iters < 40);

 	if (test_io(fname, usecs, &drops, 0) == T_EXIT_FAIL) {
 		fprintf(stderr, "test_io nofault failed\n");
 		goto err;
 	}

 	if (test_io(fname, usecs, &drops, 1) == T_EXIT_FAIL) {
 		fprintf(stderr, "test_io fault failed\n");
 		goto err;
 	}

 	unlink(fname);
 	if(vecs != NULL) {
 		for (i = 0; i < BUFFERS; i++)
 			free(vecs[i].iov_base);
 	}
 	free(vecs);
 	return T_EXIT_PASS;
 err:
 	unlink(fname);
 	if(vecs != NULL) {
 		for (i = 0; i < BUFFERS; i++)
 			free(vecs[i].iov_base);
 	}
 	free(vecs);
 	return T_EXIT_FAIL;
 }
	/* SPDX-License-Identifier: MIT */
	/*
	* Description: run various CQ ring overflow tests
	*
	*/
	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <stdlib.h>
	#include <string.h>
	#include <fcntl.h>
	#include <assert.h>

	#include "helpers.h"
	#include "liburing.h"

	#define FILE_SIZE (256 * 1024)
	#define BS 4096
	#define BUFFERS (FILE_SIZE / BS)

	static struct iovec *vecs;

	#define ENTRIES 8

	/*
	* io_uring has rare cases where CQEs are lost.
	* This happens when there is no space in the CQ ring, and also there is no
	* GFP_ATOMIC memory available. In reality this probably means that the process
	* is about to be killed as many other things might start failing, but we still
	* want to test that liburing and the kernel deal with this properly. The fault
	* injection framework allows us to test this scenario. Unfortunately this
	* requires some system wide changes and so we do not enable this by default.
	* The tests in this file should work in both cases (where overflows are queued
	* and where they are dropped) on recent kernels.
	*
	* In order to test dropped CQEs you should enable fault injection in the kernel
	* config:
	*
	* CONFIG_FAULT_INJECTION=y
	* CONFIG_FAILSLAB=y
	* CONFIG_FAULT_INJECTION_DEBUG_FS=y
	*
	* and then run the test as follows:
	* echo Y > /sys/kernel/debug/failslab/task-filter
	* echo 100 > /sys/kernel/debug/failslab/probability
	* echo 0 > /sys/kernel/debug/failslab/verbose
	* echo 100000 > /sys/kernel/debug/failslab/times
	* bash -c "echo 1 > /proc/self/make-it-fail && exec ./cq-overflow.t"
	*/

	static int test_io(const char file, unsigned long usecs, unsigned drops,
	int fault)
	{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct io_uring_params p;
	unsigned reaped, total;
	struct io_uring ring;
	int nodrop, i, fd, ret;
	bool cqe_dropped = false;

	fd = open(file, O_RDONLY \| O_DIRECT);
	if (fd < 0) {
	if (errno == EINVAL)
	return T_EXIT_SKIP;
	perror("file open");
	return T_EXIT_FAIL;
	}

	memset(&p, 0, sizeof(p));
	ret = io_uring_queue_init_params(ENTRIES, &ring, &p);
	if (ret) {
	close(fd);
	fprintf(stderr, "ring create failed: %d\n", ret);
	return T_EXIT_FAIL;
	}
	nodrop = 0;
	if (p.features & IORING_FEAT_NODROP)
	nodrop = 1;

	total = 0;
	for (i = 0; i < BUFFERS / 2; i++) {
	off_t offset;

	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
	fprintf(stderr, "sqe get failed\n");
	goto err;
	}
	offset = BS * (rand() % BUFFERS);
	if (fault && i == ENTRIES + 4) {
	free(vecs[i].iov_base);
	vecs[i].iov_base = NULL;
	}
	io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);

	ret = io_uring_submit(&ring);
	if (nodrop && ret == -EBUSY) {
	*drops = 1;
	total = i;
	break;
	} else if (ret != 1) {
	fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
	total = i;
	break;
	}
	total++;
	}

	if (*drops)
	goto reap_it;

	usleep(usecs);

	for (i = total; i < BUFFERS; i++) {
	off_t offset;

	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
	fprintf(stderr, "sqe get failed\n");
	goto err;
	}
	offset = BS * (rand() % BUFFERS);
	io_uring_prep_readv(sqe, fd, &vecs[i], 1, offset);

	ret = io_uring_submit(&ring);
	if (nodrop && ret == -EBUSY) {
	*drops = 1;
	break;
	} else if (ret != 1) {
	fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
	break;
	}
	total++;
	}

	reap_it:
	reaped = 0;
	do {
	if (nodrop && !cqe_dropped) {
	/* nodrop should never lose events unless cqe_dropped */
	if (reaped == total)
	break;
	} else {
	if (reaped + *ring.cq.koverflow == total)
	break;
	}
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (nodrop && ret == -EBADR) {
	cqe_dropped = true;
	continue;
	} else if (ret) {
	fprintf(stderr, "wait_cqe=%d\n", ret);
	goto err;
	}
	if (cqe->res != BS) {
	if (!(fault && cqe->res == -EFAULT)) {
	fprintf(stderr, "cqe res %d, wanted %d\n",
	cqe->res, BS);
	goto err;
	}
	}
	io_uring_cqe_seen(&ring, cqe);
	reaped++;
	} while (1);

	if (!io_uring_peek_cqe(&ring, &cqe)) {
	fprintf(stderr, "found unexpected completion\n");
	goto err;
	}

	if (!nodrop \|\| cqe_dropped) {
	drops = ring.cq.koverflow;
	} else if (*ring.cq.koverflow) {
	fprintf(stderr, "Found %u overflows\n", *ring.cq.koverflow);
	goto err;
	}

	io_uring_queue_exit(&ring);
	close(fd);
	return T_EXIT_PASS;
	err:
	if (fd != -1)
	close(fd);
	io_uring_queue_exit(&ring);
	return T_EXIT_SKIP;
	}

	static int reap_events(struct io_uring *ring, unsigned nr_events, int do_wait)
	{
	struct io_uring_cqe *cqe;
	int i, ret = 0, seq = 0;
	unsigned int start_overflow = *ring->cq.koverflow;
	bool dropped = false;

	for (i = 0; i < nr_events; i++) {
	if (do_wait)
	ret = io_uring_wait_cqe(ring, &cqe);
	else
	ret = io_uring_peek_cqe(ring, &cqe);
	if (do_wait && ret == -EBADR) {
	unsigned int this_drop = *ring->cq.koverflow -
	start_overflow;

	dropped = true;
	start_overflow = *ring->cq.koverflow;
	assert(this_drop > 0);
	i += (this_drop - 1);
	continue;
	} else if (ret) {
	if (ret != -EAGAIN)
	fprintf(stderr, "cqe peek failed: %d\n", ret);
	break;
	}
	if (!dropped && cqe->user_data != seq) {
	fprintf(stderr, "cqe sequence out-of-order\n");
	fprintf(stderr, "got %d, wanted %d\n", (int) cqe->user_data,
	seq);
	return -EINVAL;
	}
	seq++;
	io_uring_cqe_seen(ring, cqe);
	}

	return i ? i : ret;
	}

	/*
	* Submit some NOPs and watch if the overflow is correct
	*/
	static int test_overflow(void)
	{
	struct io_uring ring;
	struct io_uring_params p;
	struct io_uring_sqe *sqe;
	unsigned pending;
	int ret, i, j;

	memset(&p, 0, sizeof(p));
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
	fprintf(stderr, "io_uring_queue_init failed %d\n", ret);
	return 1;
	}

	/* submit 4x4 SQEs, should overflow the ring by 8 */
	pending = 0;
	for (i = 0; i < 4; i++) {
	for (j = 0; j < 4; j++) {
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
	fprintf(stderr, "get sqe failed\n");
	goto err;
	}

	io_uring_prep_nop(sqe);
	sqe->user_data = (i * 4) + j;
	}

	ret = io_uring_submit(&ring);
	if (ret == 4) {
	pending += 4;
	continue;
	}
	if (p.features & IORING_FEAT_NODROP) {
	if (ret == -EBUSY)
	break;
	}
	fprintf(stderr, "sqe submit failed: %d\n", ret);
	goto err;
	}

	/* we should now have 8 completions ready */
	ret = reap_events(&ring, pending, 0);
	if (ret < 0)
	goto err;

	if (!(p.features & IORING_FEAT_NODROP)) {
	if (*ring.cq.koverflow != 8) {
	fprintf(stderr, "cq ring overflow %d, expected 8\n",
	*ring.cq.koverflow);
	goto err;
	}
	}
	io_uring_queue_exit(&ring);
	return 0;
	err:
	io_uring_queue_exit(&ring);
	return 1;
	}


	static void submit_one_nop(struct io_uring *ring, int ud)
	{
	struct io_uring_sqe *sqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	assert(sqe);
	io_uring_prep_nop(sqe);
	sqe->user_data = ud;
	ret = io_uring_submit(ring);
	assert(ret == 1);
	}

	/*
	* Create an overflow condition and ensure that SQEs are still processed
	*/
	static int test_overflow_handling(bool batch, int cqe_multiple, bool poll,
	bool defer)
	{
	struct io_uring ring;
	struct io_uring_params p;
	int ret, i, j, ud, cqe_count;
	unsigned int count;
	int const N = 8;
	int const LOOPS = 128;
	int const QUEUE_LENGTH = 1024;
	int completions[N];
	int queue[QUEUE_LENGTH];
	int queued = 0;
	int outstanding = 0;
	bool cqe_dropped = false;

	memset(&completions, 0, sizeof(int) * N);
	memset(&p, 0, sizeof(p));
	p.cq_entries = 2 * cqe_multiple;
	p.flags \|= IORING_SETUP_CQSIZE;

	if (poll)
	p.flags \|= IORING_SETUP_IOPOLL;

	if (defer)
	p.flags \|= IORING_SETUP_SINGLE_ISSUER \|
	IORING_SETUP_DEFER_TASKRUN;

	ret = io_uring_queue_init_params(2, &ring, &p);
	if (ret) {
	fprintf(stderr, "io_uring_queue_init failed %d\n", ret);
	return 1;
	}

	assert(p.cq_entries < N);
	/* submit N SQEs, some should overflow */
	for (i = 0; i < N; i++) {
	submit_one_nop(&ring, i);
	outstanding++;
	}

	for (i = 0; i < LOOPS; i++) {
	struct io_uring_cqe *cqes[N];

	if (io_uring_cq_has_overflow(&ring)) {
	/*
	* Flush any overflowed CQEs and process those. Actively
	* flush these to make sure CQEs arrive in vague order
	* of being sent.
	*/
	ret = io_uring_get_events(&ring);
	if (ret != 0) {
	fprintf(stderr,
	"io_uring_get_events returned %d\n",
	ret);
	goto err;
	}
	} else if (!cqe_dropped) {
	for (j = 0; j < queued; j++) {
	submit_one_nop(&ring, queue[j]);
	outstanding++;
	}
	queued = 0;
	}

	/* We have lost some random cqes, stop if no remaining. */
	if (cqe_dropped && outstanding == *ring.cq.koverflow)
	break;

	ret = io_uring_wait_cqe(&ring, &cqes[0]);
	if (ret == -EBADR) {
	cqe_dropped = true;
	fprintf(stderr, "CQE dropped\n");
	continue;
	} else if (ret != 0) {
	fprintf(stderr, "io_uring_wait_cqes failed %d\n", ret);
	goto err;
	}
	cqe_count = 1;
	if (batch) {
	ret = io_uring_peek_batch_cqe(&ring, &cqes[0], 2);
	if (ret < 0) {
	fprintf(stderr,
	"io_uring_peek_batch_cqe failed %d\n",
	ret);
	goto err;
	}
	cqe_count = ret;
	}
	for (j = 0; j < cqe_count; j++) {
	assert(cqes[j]->user_data < N);
	ud = cqes[j]->user_data;
	completions[ud]++;
	assert(queued < QUEUE_LENGTH);
	queue[queued++] = (int)ud;
	}
	io_uring_cq_advance(&ring, cqe_count);
	outstanding -= cqe_count;
	}

	/* See if there were any drops by flushing the CQ ring and overflow */
	do {
	struct io_uring_cqe *cqe;

	ret = io_uring_get_events(&ring);
	if (ret < 0) {
	if (ret == -EBADR) {
	fprintf(stderr, "CQE dropped\n");
	cqe_dropped = true;
	break;
	}
	goto err;
	}
	if (outstanding && !io_uring_cq_ready(&ring))
	ret = io_uring_wait_cqe_timeout(&ring, &cqe, NULL);

	if (ret && ret != -ETIME) {
	if (ret == -EBADR) {
	fprintf(stderr, "CQE dropped\n");
	cqe_dropped = true;
	break;
	}
	fprintf(stderr, "wait_cqe_timeout = %d\n", ret);
	goto err;
	}
	count = io_uring_cq_ready(&ring);
	io_uring_cq_advance(&ring, count);
	outstanding -= count;
	} while (count);

	io_uring_queue_exit(&ring);

	/* Make sure that completions come back in the same order they were
	* sent. If they come back unfairly then this will concentrate on a
	* couple of indices.
	*/
	for (i = 1; !cqe_dropped && i < N; i++) {
	if (abs(completions[i] - completions[i - 1]) > 1) {
	fprintf(stderr, "bad completion size %d %d\n",
	completions[i], completions[i - 1]);
	goto err;
	}
	}
	return 0;
	err:
	io_uring_queue_exit(&ring);
	return 1;
	}

	int main(int argc, char *argv[])
	{
	const char *fname = ".cq-overflow";
	unsigned iters, drops;
	unsigned long usecs;
	int ret;
	int i;
	bool can_defer;

	if (argc > 1)
	return T_EXIT_SKIP;

	can_defer = t_probe_defer_taskrun();
	for (i = 0; i < 16; i++) {
	bool batch = i & 1;
	int mult = (i & 2) ? 1 : 2;
	bool poll = i & 4;
	bool defer = i & 8;

	if (defer && !can_defer)
	continue;

	ret = test_overflow_handling(batch, mult, poll, defer);
	if (ret) {
	fprintf(stderr, "test_overflow_handling("
	"batch=%d, mult=%d, poll=%d, defer=%d) failed\n",
	batch, mult, poll, defer);
	goto err;
	}
	}

	ret = test_overflow();
	if (ret) {
	fprintf(stderr, "test_overflow failed\n");
	return ret;
	}

	t_create_file(fname, FILE_SIZE);

	vecs = t_create_buffers(BUFFERS, BS);

	iters = 0;
	usecs = 1000;
	do {
	drops = 0;

	ret = test_io(fname, usecs, &drops, 0);
	if (ret == T_EXIT_SKIP)
	break;
	else if (ret != T_EXIT_PASS) {
	fprintf(stderr, "test_io nofault failed\n");
	goto err;
	}
	if (drops)
	break;
	usecs = (usecs * 12) / 10;
	iters++;
	} while (iters < 40);

	if (test_io(fname, usecs, &drops, 0) == T_EXIT_FAIL) {
	fprintf(stderr, "test_io nofault failed\n");
	goto err;
	}

	if (test_io(fname, usecs, &drops, 1) == T_EXIT_FAIL) {
	fprintf(stderr, "test_io fault failed\n");
	goto err;
	}

	unlink(fname);
	if(vecs != NULL) {
	for (i = 0; i < BUFFERS; i++)
	free(vecs[i].iov_base);
	}
	free(vecs);
	return T_EXIT_PASS;
	err:
	unlink(fname);
	if(vecs != NULL) {
	for (i = 0; i < BUFFERS; i++)
	free(vecs[i].iov_base);
	}
	free(vecs);
	return T_EXIT_FAIL;
	}