engines/libcufile.c - pub/scm/linux/kernel/git/axboe/fio - Git at Google

 /*
  * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
  * mailto:info@systemfabricworks.com
  *
  * License: GPLv2, see COPYING.
  *
  * libcufile engine
  *
  * fio I/O engine using the NVIDIA cuFile API.
  *
  */

 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <cufile.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <pthread.h>

 #include "../fio.h"
 #include "../lib/pow2.h"
 #include "../optgroup.h"
 #include "../lib/memalign.h"

 #define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

 #define LOGGED_BUFLEN_NOT_ALIGNED     0x01
 #define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
 #define GPU_ID_SEP ":"

 enum {
 	IO_CUFILE    = 1,
 	IO_POSIX     = 2
 };

 struct libcufile_options {
 	struct thread_data *td;
 	char               *gpu_ids;       /* colon-separated list of GPU ids,
 					      one per job */
 	void               *cu_mem_ptr;    /* GPU memory */
 	void               *junk_buf;      /* buffer to simulate cudaMemcpy with
 					      posix I/O write */
 	int                 my_gpu_id;     /* GPU id to use for this job */
 	unsigned int        cuda_io;       /* Type of I/O to use with CUDA */
 	size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
 	int                 logged;        /* bitmask of log messages that have
 					      been output, prevent flood */
 };

 struct fio_libcufile_data {
 	CUfileDescr_t  cf_descr;
 	CUfileHandle_t cf_handle;
 };

 static struct fio_option options[] = {
 	{
 		.name	  = "gpu_dev_ids",
 		.lname	  = "libcufile engine gpu dev ids",
 		.type	  = FIO_OPT_STR_STORE,
 		.off1	  = offsetof(struct libcufile_options, gpu_ids),
 		.help	  = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
 		.category = FIO_OPT_C_ENGINE,
 		.group	  = FIO_OPT_G_LIBCUFILE,
 	},
 	{
 		.name	  = "cuda_io",
 		.lname	  = "libcufile cuda io",
 		.type	  = FIO_OPT_STR,
 		.off1	  = offsetof(struct libcufile_options, cuda_io),
 		.help	  = "Type of I/O to use with CUDA",
 		.def      = "cufile",
 		.posval   = {
 			    { .ival = "cufile",
 			      .oval = IO_CUFILE,
 			      .help = "libcufile nvidia-fs"
 			    },
 			    { .ival = "posix",
 			      .oval = IO_POSIX,
 			      .help = "POSIX I/O"
 			    }
 		},
 		.category = FIO_OPT_C_ENGINE,
 		.group	  = FIO_OPT_G_LIBCUFILE,
 	},
 	{
 		.name	 = NULL,
 	},
 };

 static int running = 0;
 static int cufile_initialized = 0;
 static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

 #define check_cudaruntimecall(fn, rc)                                               \
 	do {                                                                        \
 		cudaError_t res = fn;                                               \
 		if (res != cudaSuccess) {                                           \
 			const char *str = cudaGetErrorName(res);                    \
 			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
 				#fn, __LINE__, res, str);                           \
 			rc = -1;                                                    \
 		} else                                                              \
 			rc = 0;                                                     \
 	} while(0)

 static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
 {
 	if (IS_CUFILE_ERR(st.err))
 		return cufileop_status_error(st.err);
 	return "unknown";
 }

 /*
  * Assign GPU to subjob roundrobin, similar to how multiple
  * entries in 'directory' are handled by fio.
  */
 static int fio_libcufile_find_gpu_id(struct thread_data *td)
 {
 	struct libcufile_options *o = td->eo;
 	int gpu_id = 0;

 	if (o->gpu_ids != NULL) {
 		char *gpu_ids, *pos, *cur;
 		int i, id_count, gpu_idx;

 		for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
 			cur = strchr(cur, GPU_ID_SEP[0]);
 			if (cur != NULL)
 				cur++;
 		}

 		gpu_idx = td->subjob_number % id_count;

 		pos = gpu_ids = strdup(o->gpu_ids);
 		if (gpu_ids == NULL) {
 			log_err("strdup(gpu_ids): err=%d\n", errno);
 			return -1;
 		}

 		i = 0;
 		while (pos != NULL && i <= gpu_idx) {
 			i++;
 			cur = strsep(&pos, GPU_ID_SEP);
 		}

 		if (cur)
 			gpu_id = atoi(cur);

 		free(gpu_ids);
 	}

 	return gpu_id;
 }

 static int fio_libcufile_init(struct thread_data *td)
 {
 	struct libcufile_options *o = td->eo;
 	CUfileError_t status;
 	int initialized;
 	int rc;

 	pthread_mutex_lock(&running_lock);
 	if (running == 0) {
 		assert(cufile_initialized == 0);
 		if (o->cuda_io == IO_CUFILE) {
 			/* only open the driver if this is the first worker thread */
 			status = cuFileDriverOpen();
 			if (status.err != CU_FILE_SUCCESS)
 				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
 					fio_libcufile_get_cuda_error(status));
 			else
 				cufile_initialized = 1;
 		}
 	}
 	running++;
 	initialized = cufile_initialized;
 	pthread_mutex_unlock(&running_lock);

 	if (o->cuda_io == IO_CUFILE && !initialized)
 		return 1;

 	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
 	if (o->my_gpu_id < 0)
 		return 1;

 	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
 	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
 	if (rc != 0)
 		return 1;

 	return 0;
 }

 static inline int fio_libcufile_pre_write(struct thread_data *td,
 					  struct libcufile_options *o,
 					  struct io_u *io_u,
 					  size_t gpu_offset)
 {
 	int rc = 0;

 	if (o->cuda_io == IO_CUFILE) {
 		if (td->o.verify) {
 			/*
 			  Data is being verified, copy the io_u buffer to GPU memory.
 			  This isn't done in the non-verify case because the data would
 			  already be in GPU memory in a normal cuFile application.
 			*/
 			check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
 							 io_u->xfer_buf,
 							 io_u->xfer_buflen,
 							 cudaMemcpyHostToDevice), rc);
 			if (rc != 0) {
 				log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
 				io_u->error = EIO;
 			}
 		}
 	} else if (o->cuda_io == IO_POSIX) {

 		/*
 		  POSIX I/O is being used, the data has to be copied out of the
 		  GPU into a CPU buffer. GPU memory doesn't contain the actual
 		  data to write, copy the data to the junk buffer. The purpose
 		  of this is to add the overhead of cudaMemcpy() that would be
 		  present in a POSIX I/O CUDA application.
 		*/
 		check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
 						 ((char*) o->cu_mem_ptr) + gpu_offset,
 						 io_u->xfer_buflen,
 						 cudaMemcpyDeviceToHost), rc);
 		if (rc != 0) {
 			log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
 			io_u->error = EIO;
 		}
 	} else {
 		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
 		assert(0);
 		rc = EINVAL;
 	}

 	return rc;
 }

 static inline int fio_libcufile_post_read(struct thread_data *td,
 					  struct libcufile_options *o,
 					  struct io_u *io_u,
 					  size_t gpu_offset)
 {
 	int rc = 0;

 	if (o->cuda_io == IO_CUFILE) {
 		if (td->o.verify) {
 			/* Copy GPU memory to CPU buffer for verify */
 			check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
 							 ((char*) o->cu_mem_ptr) + gpu_offset,
 							 io_u->xfer_buflen,
 							 cudaMemcpyDeviceToHost), rc);
 			if (rc != 0) {
 				log_err("DDIR_READ cudaMemcpy D2H failed\n");
 				io_u->error = EIO;
 			}
 		}
 	} else if (o->cuda_io == IO_POSIX) {
 		/* POSIX I/O read, copy the CPU buffer to GPU memory */
 		check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
 						 io_u->xfer_buf,
 						 io_u->xfer_buflen,
 						 cudaMemcpyHostToDevice), rc);
 		if (rc != 0) {
 			log_err("DDIR_READ cudaMemcpy H2D failed\n");
 			io_u->error = EIO;
 		}
 	} else {
 		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
 		assert(0);
 		rc = EINVAL;
 	}

 	return rc;
 }

 static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
 					     struct io_u *io_u)
 {
 	struct libcufile_options *o = td->eo;
 	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
 	unsigned long long io_offset;
 	ssize_t sz;
 	ssize_t remaining;
 	size_t xfered;
 	size_t gpu_offset;
 	int rc;

 	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
 		io_u->error = EINVAL;
 		td_verror(td, EINVAL, "xfer");
 		return FIO_Q_COMPLETED;
 	}

 	fio_ro_check(td, io_u);

 	switch(io_u->ddir) {
 	case DDIR_SYNC:
 		rc = fsync(io_u->file->fd);
 		if (rc != 0) {
 			io_u->error = errno;
 			log_err("fsync: err=%d\n", errno);
 		}
 		break;

 	case DDIR_DATASYNC:
 		rc = fdatasync(io_u->file->fd);
 		if (rc != 0) {
 			io_u->error = errno;
 			log_err("fdatasync: err=%d\n", errno);
 		}
 		break;

 	case DDIR_READ:
 	case DDIR_WRITE:
 		/*
 		  There may be a better way to calculate gpu_offset. The intent is
 		  that gpu_offset equals the the difference between io_u->xfer_buf and
 		  the page-aligned base address for io_u buffers.
 		*/
 		gpu_offset = io_u->index * io_u->xfer_buflen;
 		io_offset = io_u->offset;
 		remaining = io_u->xfer_buflen;

 		xfered = 0;
 		sz = 0;

 		assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);

 		if (o->cuda_io == IO_CUFILE) {
 			if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
 			      (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
 				log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
 				o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
 			}

 			if (!(ALIGNED_4KB(gpu_offset) ||
 			      (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
 				log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
 				o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
 			}
 		}

 		if (io_u->ddir == DDIR_WRITE)
 			rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);

 		if (io_u->error != 0)
 			break;

 		while (remaining > 0) {
 			assert(gpu_offset + xfered <= o->total_mem);
 			if (io_u->ddir == DDIR_READ) {
 				if (o->cuda_io == IO_CUFILE) {
 					sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
 							io_offset + xfered, gpu_offset + xfered);
 					if (sz == -1) {
 						io_u->error = errno;
 						log_err("cuFileRead: err=%d\n", errno);
 					} else if (sz < 0) {
 						io_u->error = EIO;
 						log_err("cuFileRead: err=%ld:%s\n", sz,
 							cufileop_status_error(-sz));
 					}
 				} else if (o->cuda_io == IO_POSIX) {
 					sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
 						   remaining, io_offset + xfered);
 					if (sz < 0) {
 						io_u->error = errno;
 						log_err("pread: err=%d\n", errno);
 					}
 				} else {
 					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
 					io_u->error = -1;
 					assert(0);
 				}
 			} else if (io_u->ddir == DDIR_WRITE) {
 				if (o->cuda_io == IO_CUFILE) {
 					sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
 							 io_offset + xfered, gpu_offset + xfered);
 					if (sz == -1) {
 						io_u->error = errno;
 						log_err("cuFileWrite: err=%d\n", errno);
 					} else if (sz < 0) {
 						io_u->error = EIO;
 						log_err("cuFileWrite: err=%ld:%s\n", sz,
 							cufileop_status_error(-sz));
 					}
 				} else if (o->cuda_io == IO_POSIX) {
 					sz = pwrite(io_u->file->fd,
 						    ((char*) io_u->xfer_buf) + xfered,
 						    remaining, io_offset + xfered);
 					if (sz < 0) {
 						io_u->error = errno;
 						log_err("pwrite: err=%d\n", errno);
 					}
 				} else {
 					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
 					io_u->error = -1;
 					assert(0);
 				}
 			} else {
 				log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
 				io_u->error = -1;
 				assert(0);
 				break;
 			}

 			if (io_u->error != 0)
 				break;

 			remaining -= sz;
 			xfered += sz;

 			if (remaining != 0)
 				log_info("Incomplete %s: %ld bytes remaining\n",
 					 io_u->ddir == DDIR_READ? "read" : "write", remaining);
 		}

 		if (io_u->error != 0)
 			break;

 		if (io_u->ddir == DDIR_READ)
 			rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
 		break;

 	default:
 		io_u->error = EINVAL;
 		break;
 	}

 	if (io_u->error != 0) {
 		log_err("IO failed\n");
 		td_verror(td, io_u->error, "xfer");
 	}

 	return FIO_Q_COMPLETED;
 }

 static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
 {
 	struct libcufile_options *o = td->eo;
 	struct fio_libcufile_data *fcd = NULL;
 	int rc;
 	CUfileError_t status;

 	rc = generic_open_file(td, f);
 	if (rc)
 		return rc;

 	if (o->cuda_io == IO_CUFILE) {
 		fcd = calloc(1, sizeof(*fcd));
 		if (fcd == NULL) {
 			rc = ENOMEM;
 			goto exit_err;
 		}

 		fcd->cf_descr.handle.fd = f->fd;
 		fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
 		status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
 		if (status.err != CU_FILE_SUCCESS) {
 			log_err("cufile register: err=%d:%s\n", status.err,
 				fio_libcufile_get_cuda_error(status));
 			rc = EINVAL;
 			goto exit_err;
 		}
 	}

 	FILE_SET_ENG_DATA(f, fcd);
 	return 0;

 exit_err:
 	if (fcd) {
 		free(fcd);
 		fcd = NULL;
 	}
 	if (f) {
 		int rc2 = generic_close_file(td, f);
 		if (rc2)
 			log_err("generic_close_file: err=%d\n", rc2);
 	}
 	return rc;
 }

 static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
 {
 	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
 	int rc;

 	if (fcd != NULL) {
 		cuFileHandleDeregister(fcd->cf_handle);
 		FILE_SET_ENG_DATA(f, NULL);
 		free(fcd);
 	}

 	rc = generic_close_file(td, f);

 	return rc;
 }

 static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
 {
 	struct libcufile_options *o = td->eo;
 	int rc;
 	CUfileError_t status;

 	o->total_mem = total_mem;
 	o->logged = 0;
 	o->cu_mem_ptr = NULL;
 	o->junk_buf = NULL;
 	td->orig_buffer = calloc(1, total_mem);
 	if (!td->orig_buffer) {
 		log_err("orig_buffer calloc failed: err=%d\n", errno);
 		goto exit_error;
 	}

 	if (o->cuda_io == IO_POSIX) {
 		o->junk_buf = calloc(1, total_mem);
 		if (o->junk_buf == NULL) {
 			log_err("junk_buf calloc failed: err=%d\n", errno);
 			goto exit_error;
 		}
 	}

 	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
 	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
 	if (rc != 0)
 		goto exit_error;
 	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
 	if (rc != 0)
 		goto exit_error;

 	if (o->cuda_io == IO_CUFILE) {
 		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
 		if (status.err != CU_FILE_SUCCESS) {
 			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
 				fio_libcufile_get_cuda_error(status));
 			goto exit_error;
 		}
 	}

 	return 0;

 exit_error:
 	if (td->orig_buffer) {
 		free(td->orig_buffer);
 		td->orig_buffer = NULL;
 	}
 	if (o->junk_buf) {
 		free(o->junk_buf);
 		o->junk_buf = NULL;
 	}
 	if (o->cu_mem_ptr) {
 		cudaFree(o->cu_mem_ptr);
 		o->cu_mem_ptr = NULL;
 	}
 	return 1;
 }

 static void fio_libcufile_iomem_free(struct thread_data *td)
 {
 	struct libcufile_options *o = td->eo;

 	if (o->junk_buf) {
 		free(o->junk_buf);
 		o->junk_buf = NULL;
 	}
 	if (o->cu_mem_ptr) {
 		if (o->cuda_io == IO_CUFILE)
 			cuFileBufDeregister(o->cu_mem_ptr);
 		cudaFree(o->cu_mem_ptr);
 		o->cu_mem_ptr = NULL;
 	}
 	if (td->orig_buffer) {
 		free(td->orig_buffer);
 		td->orig_buffer = NULL;
 	}
 }

 static void fio_libcufile_cleanup(struct thread_data *td)
 {
 	struct libcufile_options *o = td->eo;

 	pthread_mutex_lock(&running_lock);
 	running--;
 	assert(running >= 0);
 	if (running == 0) {
 		/* only close the driver if initialized and
 		   this is the last worker thread */
 		if (o->cuda_io == IO_CUFILE && cufile_initialized)
 			cuFileDriverClose();
 		cufile_initialized = 0;
 	}
 	pthread_mutex_unlock(&running_lock);
 }

 FIO_STATIC struct ioengine_ops ioengine = {
 	.name                = "libcufile",
 	.version             = FIO_IOOPS_VERSION,
 	.init                = fio_libcufile_init,
 	.queue               = fio_libcufile_queue,
 	.get_file_size       = generic_get_file_size,
 	.open_file           = fio_libcufile_open_file,
 	.close_file          = fio_libcufile_close_file,
 	.iomem_alloc         = fio_libcufile_iomem_alloc,
 	.iomem_free          = fio_libcufile_iomem_free,
 	.cleanup             = fio_libcufile_cleanup,
 	.flags               = FIO_SYNCIO,
 	.options             = options,
 	.option_struct_size  = sizeof(struct libcufile_options)
 };

 void fio_init fio_libcufile_register(void)
 {
 	register_ioengine(&ioengine);
 }

 void fio_exit fio_libcufile_unregister(void)
 {
 	unregister_ioengine(&ioengine);
 }
	/*
	* Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
	* mailto:info@systemfabricworks.com
	*
	* License: GPLv2, see COPYING.
	*
	* libcufile engine
	*
	* fio I/O engine using the NVIDIA cuFile API.
	*
	*/

	#include <stdlib.h>
	#include <unistd.h>
	#include <errno.h>
	#include <string.h>
	#include <sys/time.h>
	#include <sys/resource.h>
	#include <cufile.h>
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <pthread.h>

	#include "../fio.h"
	#include "../lib/pow2.h"
	#include "../optgroup.h"
	#include "../lib/memalign.h"

	#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

	#define LOGGED_BUFLEN_NOT_ALIGNED 0x01
	#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
	#define GPU_ID_SEP ":"

	enum {
	IO_CUFILE = 1,
	IO_POSIX = 2
	};

	struct libcufile_options {
	struct thread_data *td;
	char gpu_ids; / colon-separated list of GPU ids,
	one per job */
	void cu_mem_ptr; / GPU memory */
	void junk_buf; / buffer to simulate cudaMemcpy with
	posix I/O write */
	int my_gpu_id; /* GPU id to use for this job */
	unsigned int cuda_io; /* Type of I/O to use with CUDA */
	size_t total_mem; /* size for cu_mem_ptr and junk_buf */
	int logged; /* bitmask of log messages that have
	been output, prevent flood */
	};

	struct fio_libcufile_data {
	CUfileDescr_t cf_descr;
	CUfileHandle_t cf_handle;
	};

	static struct fio_option options[] = {
	{
	.name = "gpu_dev_ids",
	.lname = "libcufile engine gpu dev ids",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct libcufile_options, gpu_ids),
	.help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_LIBCUFILE,
	},
	{
	.name = "cuda_io",
	.lname = "libcufile cuda io",
	.type = FIO_OPT_STR,
	.off1 = offsetof(struct libcufile_options, cuda_io),
	.help = "Type of I/O to use with CUDA",
	.def = "cufile",
	.posval = {
	{ .ival = "cufile",
	.oval = IO_CUFILE,
	.help = "libcufile nvidia-fs"
	},
	{ .ival = "posix",
	.oval = IO_POSIX,
	.help = "POSIX I/O"
	}
	},
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_LIBCUFILE,
	},
	{
	.name = NULL,
	},
	};

	static int running = 0;
	static int cufile_initialized = 0;
	static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

	#define check_cudaruntimecall(fn, rc) \
	do { \
	cudaError_t res = fn; \
	if (res != cudaSuccess) { \
	const char *str = cudaGetErrorName(res); \
	log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
	#fn, __LINE__, res, str); \
	rc = -1; \
	} else \
	rc = 0; \
	} while(0)

	static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
	{
	if (IS_CUFILE_ERR(st.err))
	return cufileop_status_error(st.err);
	return "unknown";
	}

	/*
	* Assign GPU to subjob roundrobin, similar to how multiple
	* entries in 'directory' are handled by fio.
	*/
	static int fio_libcufile_find_gpu_id(struct thread_data *td)
	{
	struct libcufile_options *o = td->eo;
	int gpu_id = 0;

	if (o->gpu_ids != NULL) {
	char gpu_ids, pos, *cur;
	int i, id_count, gpu_idx;

	for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
	cur = strchr(cur, GPU_ID_SEP[0]);
	if (cur != NULL)
	cur++;
	}

	gpu_idx = td->subjob_number % id_count;

	pos = gpu_ids = strdup(o->gpu_ids);
	if (gpu_ids == NULL) {
	log_err("strdup(gpu_ids): err=%d\n", errno);
	return -1;
	}

	i = 0;
	while (pos != NULL && i <= gpu_idx) {
	i++;
	cur = strsep(&pos, GPU_ID_SEP);
	}

	if (cur)
	gpu_id = atoi(cur);

	free(gpu_ids);
	}

	return gpu_id;
	}

	static int fio_libcufile_init(struct thread_data *td)
	{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
	assert(cufile_initialized == 0);
	if (o->cuda_io == IO_CUFILE) {
	/* only open the driver if this is the first worker thread */
	status = cuFileDriverOpen();
	if (status.err != CU_FILE_SUCCESS)
	log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
	fio_libcufile_get_cuda_error(status));
	else
	cufile_initialized = 1;
	}
	}
	running++;
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	if (o->cuda_io == IO_CUFILE && !initialized)
	return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
	return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
	return 1;

	return 0;
	}

	static inline int fio_libcufile_pre_write(struct thread_data *td,
	struct libcufile_options *o,
	struct io_u *io_u,
	size_t gpu_offset)
	{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
	if (td->o.verify) {
	/*
	Data is being verified, copy the io_u buffer to GPU memory.
	This isn't done in the non-verify case because the data would
	already be in GPU memory in a normal cuFile application.
	*/
	check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
	io_u->xfer_buf,
	io_u->xfer_buflen,
	cudaMemcpyHostToDevice), rc);
	if (rc != 0) {
	log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
	io_u->error = EIO;
	}
	}
	} else if (o->cuda_io == IO_POSIX) {

	/*
	POSIX I/O is being used, the data has to be copied out of the
	GPU into a CPU buffer. GPU memory doesn't contain the actual
	data to write, copy the data to the junk buffer. The purpose
	of this is to add the overhead of cudaMemcpy() that would be
	present in a POSIX I/O CUDA application.
	*/
	check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
	((char*) o->cu_mem_ptr) + gpu_offset,
	io_u->xfer_buflen,
	cudaMemcpyDeviceToHost), rc);
	if (rc != 0) {
	log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
	io_u->error = EIO;
	}
	} else {
	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	assert(0);
	rc = EINVAL;
	}

	return rc;
	}

	static inline int fio_libcufile_post_read(struct thread_data *td,
	struct libcufile_options *o,
	struct io_u *io_u,
	size_t gpu_offset)
	{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
	if (td->o.verify) {
	/* Copy GPU memory to CPU buffer for verify */
	check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
	((char*) o->cu_mem_ptr) + gpu_offset,
	io_u->xfer_buflen,
	cudaMemcpyDeviceToHost), rc);
	if (rc != 0) {
	log_err("DDIR_READ cudaMemcpy D2H failed\n");
	io_u->error = EIO;
	}
	}
	} else if (o->cuda_io == IO_POSIX) {
	/* POSIX I/O read, copy the CPU buffer to GPU memory */
	check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
	io_u->xfer_buf,
	io_u->xfer_buflen,
	cudaMemcpyHostToDevice), rc);
	if (rc != 0) {
	log_err("DDIR_READ cudaMemcpy H2D failed\n");
	io_u->error = EIO;
	}
	} else {
	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	assert(0);
	rc = EINVAL;
	}

	return rc;
	}

	static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
	struct io_u *io_u)
	{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
	unsigned long long io_offset;
	ssize_t sz;
	ssize_t remaining;
	size_t xfered;
	size_t gpu_offset;
	int rc;

	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
	io_u->error = EINVAL;
	td_verror(td, EINVAL, "xfer");
	return FIO_Q_COMPLETED;
	}

	fio_ro_check(td, io_u);

	switch(io_u->ddir) {
	case DDIR_SYNC:
	rc = fsync(io_u->file->fd);
	if (rc != 0) {
	io_u->error = errno;
	log_err("fsync: err=%d\n", errno);
	}
	break;

	case DDIR_DATASYNC:
	rc = fdatasync(io_u->file->fd);
	if (rc != 0) {
	io_u->error = errno;
	log_err("fdatasync: err=%d\n", errno);
	}
	break;

	case DDIR_READ:
	case DDIR_WRITE:
	/*
	There may be a better way to calculate gpu_offset. The intent is
	that gpu_offset equals the the difference between io_u->xfer_buf and
	the page-aligned base address for io_u buffers.
	*/
	gpu_offset = io_u->index * io_u->xfer_buflen;
	io_offset = io_u->offset;
	remaining = io_u->xfer_buflen;

	xfered = 0;
	sz = 0;

	assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);

	if (o->cuda_io == IO_CUFILE) {
	if (!(ALIGNED_4KB(io_u->xfer_buflen) \|\|
	(o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
	log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
	o->logged \|= LOGGED_BUFLEN_NOT_ALIGNED;
	}

	if (!(ALIGNED_4KB(gpu_offset) \|\|
	(o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
	log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
	o->logged \|= LOGGED_GPU_OFFSET_NOT_ALIGNED;
	}
	}

	if (io_u->ddir == DDIR_WRITE)
	rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);

	if (io_u->error != 0)
	break;

	while (remaining > 0) {
	assert(gpu_offset + xfered <= o->total_mem);
	if (io_u->ddir == DDIR_READ) {
	if (o->cuda_io == IO_CUFILE) {
	sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
	io_offset + xfered, gpu_offset + xfered);
	if (sz == -1) {
	io_u->error = errno;
	log_err("cuFileRead: err=%d\n", errno);
	} else if (sz < 0) {
	io_u->error = EIO;
	log_err("cuFileRead: err=%ld:%s\n", sz,
	cufileop_status_error(-sz));
	}
	} else if (o->cuda_io == IO_POSIX) {
	sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
	remaining, io_offset + xfered);
	if (sz < 0) {
	io_u->error = errno;
	log_err("pread: err=%d\n", errno);
	}
	} else {
	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	io_u->error = -1;
	assert(0);
	}
	} else if (io_u->ddir == DDIR_WRITE) {
	if (o->cuda_io == IO_CUFILE) {
	sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
	io_offset + xfered, gpu_offset + xfered);
	if (sz == -1) {
	io_u->error = errno;
	log_err("cuFileWrite: err=%d\n", errno);
	} else if (sz < 0) {
	io_u->error = EIO;
	log_err("cuFileWrite: err=%ld:%s\n", sz,
	cufileop_status_error(-sz));
	}
	} else if (o->cuda_io == IO_POSIX) {
	sz = pwrite(io_u->file->fd,
	((char*) io_u->xfer_buf) + xfered,
	remaining, io_offset + xfered);
	if (sz < 0) {
	io_u->error = errno;
	log_err("pwrite: err=%d\n", errno);
	}
	} else {
	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	io_u->error = -1;
	assert(0);
	}
	} else {
	log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
	io_u->error = -1;
	assert(0);
	break;
	}

	if (io_u->error != 0)
	break;

	remaining -= sz;
	xfered += sz;

	if (remaining != 0)
	log_info("Incomplete %s: %ld bytes remaining\n",
	io_u->ddir == DDIR_READ? "read" : "write", remaining);
	}

	if (io_u->error != 0)
	break;

	if (io_u->ddir == DDIR_READ)
	rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
	break;

	default:
	io_u->error = EINVAL;
	break;
	}

	if (io_u->error != 0) {
	log_err("IO failed\n");
	td_verror(td, io_u->error, "xfer");
	}

	return FIO_Q_COMPLETED;
	}

	static int fio_libcufile_open_file(struct thread_data td, struct fio_file f)
	{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = NULL;
	int rc;
	CUfileError_t status;

	rc = generic_open_file(td, f);
	if (rc)
	return rc;

	if (o->cuda_io == IO_CUFILE) {
	fcd = calloc(1, sizeof(*fcd));
	if (fcd == NULL) {
	rc = ENOMEM;
	goto exit_err;
	}

	fcd->cf_descr.handle.fd = f->fd;
	fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
	status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
	if (status.err != CU_FILE_SUCCESS) {
	log_err("cufile register: err=%d:%s\n", status.err,
	fio_libcufile_get_cuda_error(status));
	rc = EINVAL;
	goto exit_err;
	}
	}

	FILE_SET_ENG_DATA(f, fcd);
	return 0;

	exit_err:
	if (fcd) {
	free(fcd);
	fcd = NULL;
	}
	if (f) {
	int rc2 = generic_close_file(td, f);
	if (rc2)
	log_err("generic_close_file: err=%d\n", rc2);
	}
	return rc;
	}

	static int fio_libcufile_close_file(struct thread_data td, struct fio_file f)
	{
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
	int rc;

	if (fcd != NULL) {
	cuFileHandleDeregister(fcd->cf_handle);
	FILE_SET_ENG_DATA(f, NULL);
	free(fcd);
	}

	rc = generic_close_file(td, f);

	return rc;
	}

	static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
	{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
	log_err("orig_buffer calloc failed: err=%d\n", errno);
	goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
	o->junk_buf = calloc(1, total_mem);
	if (o->junk_buf == NULL) {
	log_err("junk_buf calloc failed: err=%d\n", errno);
	goto exit_error;
	}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
	goto exit_error;
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
	goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
	status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
	if (status.err != CU_FILE_SUCCESS) {
	log_err("cuFileBufRegister: err=%d:%s\n", status.err,
	fio_libcufile_get_cuda_error(status));
	goto exit_error;
	}
	}

	return 0;

	exit_error:
	if (td->orig_buffer) {
	free(td->orig_buffer);
	td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
	free(o->junk_buf);
	o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
	cudaFree(o->cu_mem_ptr);
	o->cu_mem_ptr = NULL;
	}
	return 1;
	}

	static void fio_libcufile_iomem_free(struct thread_data *td)
	{
	struct libcufile_options *o = td->eo;

	if (o->junk_buf) {
	free(o->junk_buf);
	o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
	if (o->cuda_io == IO_CUFILE)
	cuFileBufDeregister(o->cu_mem_ptr);
	cudaFree(o->cu_mem_ptr);
	o->cu_mem_ptr = NULL;
	}
	if (td->orig_buffer) {
	free(td->orig_buffer);
	td->orig_buffer = NULL;
	}
	}

	static void fio_libcufile_cleanup(struct thread_data *td)
	{
	struct libcufile_options *o = td->eo;

	pthread_mutex_lock(&running_lock);
	running--;
	assert(running >= 0);
	if (running == 0) {
	/* only close the driver if initialized and
	this is the last worker thread */
	if (o->cuda_io == IO_CUFILE && cufile_initialized)
	cuFileDriverClose();
	cufile_initialized = 0;
	}
	pthread_mutex_unlock(&running_lock);
	}

	FIO_STATIC struct ioengine_ops ioengine = {
	.name = "libcufile",
	.version = FIO_IOOPS_VERSION,
	.init = fio_libcufile_init,
	.queue = fio_libcufile_queue,
	.get_file_size = generic_get_file_size,
	.open_file = fio_libcufile_open_file,
	.close_file = fio_libcufile_close_file,
	.iomem_alloc = fio_libcufile_iomem_alloc,
	.iomem_free = fio_libcufile_iomem_free,
	.cleanup = fio_libcufile_cleanup,
	.flags = FIO_SYNCIO,
	.options = options,
	.option_struct_size = sizeof(struct libcufile_options)
	};

	void fio_init fio_libcufile_register(void)
	{
	register_ioengine(&ioengine);
	}

	void fio_exit fio_libcufile_unregister(void)
	{
	unregister_ioengine(&ioengine);
	}