Skip to content
26 changes: 19 additions & 7 deletions include/libvfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,9 @@ vfu_addr_to_sgl(vfu_ctx_t *vfu_ctx, vfu_dma_addr_t dma_addr, size_t len,
* vfu_sgl_put().
*
* This is only supported when a @dma_unregister callback is provided to
* vfu_setup_device_dma().
* vfu_setup_device_dma(). In addition, the client must have registered the
* relevant DMA regions to allow mmap()-ed access and supplied file
* descriptors; otherwise, this function will fail with EFAULT.
Comment thread
jlevon marked this conversation as resolved.
*
* @vfu_ctx: the libvfio-user context
* @sgl: array of scatter/gather entries returned by vfu_addr_to_sgl(). These
Expand Down Expand Up @@ -738,20 +740,27 @@ vfu_sgl_mark_dirty(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, size_t cnt);
void
vfu_sgl_put(vfu_ctx_t *vfu_ctx, dma_sg_t *sgl, struct iovec *iov, size_t cnt);

/* Flags to vfu_sgl_read() and vfu_sgl_write(). */

/* Perform direct access via file descriptor, if possible. */
#define VFU_SGL_DIRECT_ACCESS (1)

/**
* Read from the dma region exposed by the client. This can be used as an
* alternative to reading from a vfu_sgl_get() mapping, if the region is not
* directly mappable, or DMA notification callbacks have not been provided.
*
* The implementation involves a round-trip communication with the client.
* When the VFU_SGL_DIRECT_ACCESS flag is specified, the DMA operation is
* performed against a client-supplied file descriptor, if applicable.
* Otherwise, the operation incurs an IPC round-trip with the client.
*
* Note that currently, only one @sg entry is supported (@sg_cnt must be 1).
*
* @vfu_ctx: the libvfio-user context
* @sg: array of scatter/gather entries
* @sg_cnt: number of scatter/gather entries
* @data: data buffer to read into
* @flags: must be 0
* @flags: VFU_SGL_* flags
*
* @returns 0 on success, -1 on failure. Sets errno.
*/
Expand All @@ -764,18 +773,21 @@ vfu_sgl_read(vfu_ctx_t *vfu_ctx, dma_sg_t *sg, size_t sg_cnt,
* alternative to writing to a vfu_sgl_get() mapping, if the region is not
* directly mappable, or DMA notification callbacks have not been provided.
*
* The implementation involves a round-trip communication with the client.
* When the VFU_SGL_DIRECT_ACCESS flag is specified, the DMA operation is
* performed against a client-supplied file descriptor, if applicable.
* Otherwise, the operation incurs an IPC round-trip with the client.
*
* Note that currently, only one @sg entry is supported (@sg_cnt must be 1).
*
* During live migration, this call does not mark any of the written pages as
* dirty; the client is expected to track this.
* During live migration, this call marks the written pages as dirty only
* if the access is performed directly against a file descriptor. For access
* via IPC, the client is expected to track which pages have been written.
*
* @vfu_ctx: the libvfio-user context
* @sg: array of scatter/gather entries
* @sg_cnt: number of scatter/gather entries
* @data: data buffer to write
* @flags: must be 0
* @flags: VFU_SGL_* flags
*
* @returns 0 on success, -1 on failure. Sets errno.
*/
Expand Down
2 changes: 2 additions & 0 deletions include/vfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ struct vfio_user_dma_map {
uint32_t argsz;
#define VFIO_USER_F_DMA_REGION_READ (1 << 0)
#define VFIO_USER_F_DMA_REGION_WRITE (1 << 1)
#define VFIO_USER_F_DMA_REGION_MMAP (1 << 2)
Comment thread
jlevon marked this conversation as resolved.
#define VFIO_USER_F_DMA_REGION_FILE_IO (1 << 3)
uint32_t flags;
uint64_t offset;
uint64_t addr;
Expand Down
193 changes: 98 additions & 95 deletions lib/dma.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ dma_sg_size(void)

bool
dma_sg_is_mappable(const dma_sg_t *sg) {
return sg->region->info.vaddr != NULL;
return sg->region->access_mode == REGION_ACCESS_MODE_MMAP;
}

static inline ssize_t
Expand All @@ -69,7 +69,7 @@ fd_get_blocksize(int fd)
static int
dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize)
{
assert(region->fd != -1);
assert(region->access_mode != REGION_ACCESS_MODE_MSG);

ssize_t size = get_bitmap_size(region->info.iova.iov_len, pgsize);
if (size < 0) {
Expand Down Expand Up @@ -158,13 +158,42 @@ MOCK_DEFINE(dma_controller_unmap_region)(dma_controller_t *dma,
region->fd, region->info.mapping.iov_base,
iov_end(&region->info.mapping));
}
}

static void
dma_controller_destroy_region(dma_controller_t *dma, dma_memory_region_t *region,
vfu_dma_unregister_cb_t *dma_unregister,
void *data)
{
int err;

assert(region->fd != -1);
assert(dma != NULL);
assert(region != NULL);

dirty_page_logging_stop_on_region(region);
if (dma_unregister != NULL) {
dma->vfu_ctx->in_cb = CB_DMA_UNREGISTER;
dma_unregister(data, &region->info);
dma->vfu_ctx->in_cb = CB_NONE;
}

switch (region->access_mode) {
case REGION_ACCESS_MODE_MSG:
assert(region->fd == -1);
break;
case REGION_ACCESS_MODE_MMAP:
if (region->info.vaddr != NULL) {
Comment thread
jlevon marked this conversation as resolved.
dma_controller_unmap_region(dma, region);
}
/* fall through */
case REGION_ACCESS_MODE_FILE_IO:
err = fd_cache_put(&region->fd);
assert(err == 0);
break;
}

err = fd_cache_put(&region->fd);
assert(err == 0);
dma_controller_increment_regions_generation(dma);
dirty_page_logging_stop_on_region(region);
free(region);
}

/* FIXME not thread safe */
Expand All @@ -190,21 +219,8 @@ MOCK_DEFINE(dma_controller_remove_region)(dma_controller_t *dma,
return ERROR_INT(ENOENT);
}

if (dma_unregister != NULL) {
dma->vfu_ctx->in_cb = CB_DMA_UNREGISTER;
dma_unregister(data, &region->info);
dma->vfu_ctx->in_cb = CB_NONE;
}

if (region->info.vaddr != NULL) {
dma_controller_unmap_region(dma, region);
} else {
assert(region->fd == -1);
}

btree_iter_remove(&iter);
dma_controller_increment_regions_generation(dma);
free(region);
dma_controller_destroy_region(dma, region, dma_unregister, data);

return 0;
}
Expand All @@ -228,19 +244,7 @@ dma_controller_remove_all_regions(dma_controller_t *dma,
region->info.vaddr, region->info.mapping.iov_base,
iov_end(&region->info.mapping));

if (dma_unregister != NULL) {
dma->vfu_ctx->in_cb = CB_DMA_UNREGISTER;
dma_unregister(data, &region->info);
dma->vfu_ctx->in_cb = CB_NONE;
}

if (region->info.vaddr != NULL) {
dma_controller_unmap_region(dma, region);
} else {
assert(region->fd == -1);
}

free(region);
dma_controller_destroy_region(dma, region, dma_unregister, data);
}
}

Expand Down Expand Up @@ -289,20 +293,20 @@ dma_map_region(dma_controller_t *dma, dma_memory_region_t *region)
dma_memory_region_t *
MOCK_DEFINE(dma_controller_add_region)(dma_controller_t *dma,
vfu_dma_addr_t dma_addr, uint64_t size,
int fd, off_t offset, uint32_t prot)
int *fd, off_t offset, uint32_t prot,
enum region_access_mode access_mode)
{
dma_memory_region_t *existing = NULL;
dma_memory_region_t *region = NULL;
btree_iter_t iter;
int page_size = 0;
char rstr[1024];
int ret = 0;
int err;

assert(dma != NULL);

snprintf(rstr, sizeof(rstr), "[%p, %p) fd=%d offset=%#llx prot=%#x",
dma_addr, dma_addr + size, fd, (ull_t)offset, prot);
dma_addr, dma_addr + size, *fd, (ull_t)offset, prot);

if (size > dma->max_size) {
vfu_log(dma->vfu_ctx, LOG_ERR, "DMA region size %llu > max %zu",
Expand All @@ -322,7 +326,7 @@ MOCK_DEFINE(dma_controller_add_region)(dma_controller_t *dma,
(ull_t)existing->offset);
return ERROR_PTR(EINVAL);
}
if (fd_cache_is_same_file(existing->fd, fd) != 0) {
if (fd_cache_is_same_file(existing->fd, *fd) != 0) {
vfu_log(dma->vfu_ctx, LOG_ERR, "bad fd for new DMA region %s; "
"existing=%d", rstr, existing->fd);
return ERROR_PTR(EINVAL);
Expand All @@ -332,7 +336,7 @@ MOCK_DEFINE(dma_controller_add_region)(dma_controller_t *dma,
"%s; existing=%#x", rstr, existing->info.prot);
return ERROR_PTR(EINVAL);
}
close_safely(&fd);
close_safely(fd);
return existing;
}

Expand All @@ -353,82 +357,81 @@ MOCK_DEFINE(dma_controller_add_region)(dma_controller_t *dma,
return ERROR_PTR(EINVAL);
}

if (fd != -1) {
page_size = fd_get_blocksize(fd);
if (page_size < 0) {
vfu_log(dma->vfu_ctx, LOG_ERR, "bad page size %d", page_size);
return ERROR_PTR(EINVAL);
}

fd = fd_cache_get(fd);
if (fd == -1) {
vfu_log(dma->vfu_ctx, LOG_ERR,
"failed to de-duplicate fd for new DMA region %s: %m",
rstr);
return NULL;
}
}
page_size = MAX(page_size, getpagesize());

region = calloc(1, sizeof(*region));
if (region == NULL) {
errno = ENOMEM;
goto rollback;
vfu_log(dma->vfu_ctx, LOG_ERR, "failed to allocate region entry");
return ERROR_PTR(ENOMEM);
}

region->info.iova.iov_base = (void *)dma_addr;
region->info.iova.iov_len = size;
region->info.page_size = page_size;
region->info.prot = prot;
region->info.page_size = getpagesize();
region->access_mode = access_mode;
region->offset = offset;
region->fd = fd;
region->fd = -1;

if (fd != -1) {
/*
* TODO introduce a function that tells whether dirty page logging is
* enabled
*/
if (dma->dirty_pgsize != 0) {
if (dirty_page_logging_start_on_region(region, dma->dirty_pgsize) < 0) {
/*
* TODO We don't necessarily have to fail, we can continue
* and fail the get dirty page bitmap request later.
*/
goto rollback;
switch (access_mode) {
case REGION_ACCESS_MODE_MSG:
assert(*fd == -1);
break;
case REGION_ACCESS_MODE_MMAP:
case REGION_ACCESS_MODE_FILE_IO:
assert(*fd != -1);
region->fd = fd_cache_get(*fd);
if (region->fd == -1) {
vfu_log(dma->vfu_ctx, LOG_ERR,
"failed to de-duplicate fd for new DMA region %s: %m",
rstr);
goto destroy_region;
}
}
*fd = -1; /* The fd is now owned by the cache */

ret = dma_map_region(dma, region);
page_size = fd_get_blocksize(region->fd);
if (page_size < 0) {
vfu_log(dma->vfu_ctx, LOG_ERR, "bad page size %d", page_size);
errno = EINVAL;
goto destroy_region;
}
region->info.page_size = MAX(region->info.page_size, (size_t)page_size);

if (ret != 0) {
vfu_log(dma->vfu_ctx, LOG_ERR,
"failed to memory map DMA region %s: %m", rstr);
goto rollback;
} else {
/* Ownership of the fd is now with the region. */
fd = -1;
}
/*
* TODO introduce a function that tells whether dirty page logging is
* enabled
*/
if (dma->dirty_pgsize != 0) {
if (dirty_page_logging_start_on_region(region, dma->dirty_pgsize) < 0) {
/*
* TODO We don't necessarily have to fail, we can continue
* and fail the get dirty page bitmap request later.
*/
goto destroy_region;
}
}

if (access_mode == REGION_ACCESS_MODE_MMAP) {
ret = dma_map_region(dma, region);

if (ret != 0) {
vfu_log(dma->vfu_ctx, LOG_ERR,
"failed to memory map DMA region %s: %m", rstr);
goto destroy_region;
}
}
break;
}

if (btree_iter_insert(&iter, (uintptr_t)dma_addr + size - 1, region) != 0) {
goto rollback;
goto destroy_region;
}

dma_controller_increment_regions_generation(dma);

return region;

rollback:
destroy_region:
ret = errno;
if (region != NULL) {
if (region->info.vaddr != NULL) {
dma_controller_unmap_region(dma, region);
}
dirty_page_logging_stop_on_region(region);
free(region);
}
err = fd_cache_put(&fd);
assert(err == 0);
dma_controller_destroy_region(dma, region, NULL, NULL);

return ERROR_PTR(ret);
}
Expand Down Expand Up @@ -513,7 +516,7 @@ dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize)
for (btree_iter_init(&dma->regions, 0, &iter);
(region = btree_iter_get(&iter, NULL)) != NULL;
btree_iter_next(&iter)) {
if (region->fd == -1) {
if (region->access_mode == REGION_ACCESS_MODE_MSG) {
continue;
}

Expand Down Expand Up @@ -776,8 +779,8 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,

region = sg.region;

if (region->fd == -1) {
vfu_log(dma->vfu_ctx, LOG_ERR, "region [%p-%p] is not mapped",
if (region->access_mode == REGION_ACCESS_MODE_MSG) {
vfu_log(dma->vfu_ctx, LOG_ERR, "region [%p-%p] isn't accessed directly",
region->info.iova.iov_base, iov_end(&region->info.iova));
return ERROR_INT(EINVAL);
}
Expand Down
Loading
Loading