liburing.h: avoid OOL round trip in io_uring_peek_cqe() on empty CQ

With the CQ empty, io_uring_peek_cqe() falls through to
io_uring_wait_cqe_nr() and on into __io_uring_get_cqe(), only to do a
second full peek and conclude -EAGAIN. That costs an out-of-line
function call, a redundant acquire load of the CQ tail, and the get_data
setup on every poll, which is wasted work for spin-poll style users.

Return -EAGAIN directly if the peek found nothing and there's nothing
the kernel could flush to the CQ: no IOPOLL completions to reap, no
overflown CQEs, and no pending task work. Those cases, and a peek that
returns an error (e.g. after consuming an internal timeout CQE), still
take the slow path as before.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 0138ee0..0188937 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -1922,8 +1922,20 @@
 				    struct io_uring_cqe **cqe_ptr)
 	LIBURING_NOEXCEPT
 {
-	if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL) && *cqe_ptr)
-		return 0;
+	if (!__io_uring_peek_cqe(ring, cqe_ptr, NULL)) {
+		if (*cqe_ptr)
+			return 0;
+		/*
+		 * If the CQ is empty and there's nothing the kernel could
+		 * flush to it (no IOPOLL completions to reap, no overflown
+		 * CQEs, no pending task work), avoid the round trip into
+		 * the full get_cqe machinery.
+		 */
+		if (!(ring->flags & IORING_SETUP_IOPOLL) &&
+		    !(IO_URING_READ_ONCE(*ring->sq.kflags) &
+		      (IORING_SQ_CQ_OVERFLOW | IORING_SQ_TASKRUN)))
+			return -EAGAIN;
+	}
 
 	return io_uring_wait_cqe_nr(ring, cqe_ptr, 0);
 }