// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
* Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved.
*/
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <linux/sched/mm.h>
#include "ib_peer_mem.h"

static DEFINE_MUTEX(peer_memory_mutex);
static LIST_HEAD(peer_memory_list);
static struct kobject *peers_kobj;

#define PEER_NO_INVALIDATION_ID U32_MAX

static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context);
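
/*
* Each registered peer exposes read-only information through sysfs under
* /sys/kernel/mm/memory_peers/<peer name>/: the peer's version string and
* the counters kept in struct ib_peer_memory_client::stats.
*/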
struct peer_mem_attribute {
struct attribute attr;
ssize_t (*show)(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf);
ssize_t (*store)(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, const char *buf,
size_t count);
};
#define PEER_ATTR_RO(_name) \
struct peer_mem_attribute peer_attr_ ## _name = __ATTR_RO(_name)
static ssize_t version_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(buf, PAGE_SIZE, "%s\n",
ib_peer_client->peer_mem->version);
}
static PEER_ATTR_RO(version);
static ssize_t num_alloc_mrs_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_alloc_mrs));
}
static PEER_ATTR_RO(num_alloc_mrs);
static ssize_t
num_dealloc_mrs_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_dealloc_mrs));
}
static PEER_ATTR_RO(num_dealloc_mrs);
static ssize_t num_reg_pages_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_reg_pages));
}
static PEER_ATTR_RO(num_reg_pages);
static ssize_t
num_dereg_pages_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_dereg_pages));
}
static PEER_ATTR_RO(num_dereg_pages);
static ssize_t num_reg_bytes_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_reg_bytes));
}
static PEER_ATTR_RO(num_reg_bytes);
static ssize_t
num_dereg_bytes_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(
buf, PAGE_SIZE, "%llu\n",
(u64)atomic64_read(&ib_peer_client->stats.num_dereg_bytes));
}
static PEER_ATTR_RO(num_dereg_bytes);
static ssize_t
num_free_callbacks_show(struct ib_peer_memory_client *ib_peer_client,
struct peer_mem_attribute *attr, char *buf)
{
return scnprintf(buf, PAGE_SIZE, "%lu\n",
ib_peer_client->stats.num_free_callbacks);
}
static PEER_ATTR_RO(num_free_callbacks);
static struct attribute *peer_mem_attrs[] = {
&peer_attr_version.attr,
&peer_attr_num_alloc_mrs.attr,
&peer_attr_num_dealloc_mrs.attr,
&peer_attr_num_reg_pages.attr,
&peer_attr_num_dereg_pages.attr,
&peer_attr_num_reg_bytes.attr,
&peer_attr_num_dereg_bytes.attr,
&peer_attr_num_free_callbacks.attr,
NULL,
};
static const struct attribute_group peer_mem_attr_group = {
.attrs = peer_mem_attrs,
};
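
/*
* sysfs ->show() dispatcher: recover the ib_peer_memory_client from its
* embedded kobject and call the attribute's own show handler.
*/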
static ssize_t peer_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct peer_mem_attribute *peer_attr =
container_of(attr, struct peer_mem_attribute, attr);
if (!peer_attr->show)
return -EIO;
return peer_attr->show(container_of(kobj, struct ib_peer_memory_client,
kobj),
peer_attr, buf);
}
static const struct sysfs_ops peer_mem_sysfs_ops = {
.show = peer_attr_show,
};
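
/* kobject release callback; frees the client once its last reference is dropped. */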
static void ib_peer_memory_client_release(struct kobject *kobj)
{
struct ib_peer_memory_client *ib_peer_client =
container_of(kobj, struct ib_peer_memory_client, kobj);
kfree(ib_peer_client);
}
static struct kobj_type peer_mem_type = {
.sysfs_ops = &peer_mem_sysfs_ops,
.release = ib_peer_memory_client_release,
};
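
/*
* Reject registration if the peer left any of the mandatory callbacks
* (acquire, get_pages, put_pages, dma_map, dma_unmap) unset.
*/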
static int ib_memory_peer_check_mandatory(const struct peer_memory_client
*peer_client)
{
#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x}
int i;
static const struct {
size_t offset;
char *name;
} mandatory_table[] = {
PEER_MEM_MANDATORY_FUNC(acquire),
PEER_MEM_MANDATORY_FUNC(get_pages),
PEER_MEM_MANDATORY_FUNC(put_pages),
PEER_MEM_MANDATORY_FUNC(dma_map),
PEER_MEM_MANDATORY_FUNC(dma_unmap),
};
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
if (!*(void **)((void *)peer_client +
mandatory_table[i].offset)) {
pr_err("Peer memory %s is missing mandatory function %s\n",
peer_client->name, mandatory_table[i].name);
return -EINVAL;
}
}
return 0;
}
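
/*
* Register a peer memory client (for example a GPU driver) with the RDMA
* core. Returns an opaque registration handle or NULL on failure. If
* @invalidate_callback is provided, it is filled with the core's
* invalidation entry point and every umem built on this peer is then
* required to support invalidation.
*
* A minimal registration sketch; my_client and the my_* callbacks are
* illustrative names only, not part of this API:
*
*   static const struct peer_memory_client my_client = {
*       .name          = "my_peer",
*       .version       = "1.0",
*       .acquire       = my_acquire,
*       .get_pages     = my_get_pages,
*       .dma_map       = my_dma_map,
*       .dma_unmap     = my_dma_unmap,
*       .put_pages     = my_put_pages,
*       .get_page_size = my_get_page_size,
*       .release       = my_release,
*   };
*   static invalidate_peer_memory my_invalidate_cb;
*
*   void *reg_handle = ib_register_peer_memory_client(&my_client,
*                                                      &my_invalidate_cb);
*/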
void *
ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
invalidate_peer_memory *invalidate_callback)
{
struct ib_peer_memory_client *ib_peer_client;
int ret;
if (ib_memory_peer_check_mandatory(peer_client))
return NULL;
ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
if (!ib_peer_client)
return NULL;
kobject_init(&ib_peer_client->kobj, &peer_mem_type);
refcount_set(&ib_peer_client->usecnt, 1);
init_completion(&ib_peer_client->usecnt_zero);
ib_peer_client->peer_mem = peer_client;
xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC);
/*
* If the peer wants the invalidation_callback then all memory users
* linked to that peer must support invalidation.
*/
if (invalidate_callback) {
*invalidate_callback = ib_invalidate_peer_memory;
ib_peer_client->invalidation_required = true;
}
mutex_lock(&peer_memory_mutex);
if (!peers_kobj) {
/* Created under /sys/kernel/mm */
peers_kobj = kobject_create_and_add("memory_peers", mm_kobj);
if (!peers_kobj)
goto err_unlock;
}
ret = kobject_add(&ib_peer_client->kobj, peers_kobj, peer_client->name);
if (ret)
goto err_parent;
ret = sysfs_create_group(&ib_peer_client->kobj,
&peer_mem_attr_group);
if (ret)
goto err_parent;
list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
mutex_unlock(&peer_memory_mutex);
return ib_peer_client;
err_parent:
if (list_empty(&peer_memory_list)) {
kobject_put(peers_kobj);
peers_kobj = NULL;
}
err_unlock:
mutex_unlock(&peer_memory_mutex);
kobject_put(&ib_peer_client->kobj);
return NULL;
}
EXPORT_SYMBOL(ib_register_peer_memory_client);
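
/*
* Unregister a peer client. This blocks until every umem created through
* the client has been released, so none of the peer's callbacks can be
* invoked once it returns.
*/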
void ib_unregister_peer_memory_client(void *reg_handle)
{
struct ib_peer_memory_client *ib_peer_client = reg_handle;
mutex_lock(&peer_memory_mutex);
list_del(&ib_peer_client->core_peer_list);
if (list_empty(&peer_memory_list)) {
kobject_put(peers_kobj);
peers_kobj = NULL;
}
mutex_unlock(&peer_memory_mutex);
/*
* Wait for all umems to be destroyed before returning. Once
* ib_unregister_peer_memory_client() returns no umems will call any
* peer_mem ops.
*/
if (refcount_dec_and_test(&ib_peer_client->usecnt))
complete(&ib_peer_client->usecnt_zero);
wait_for_completion(&ib_peer_client->usecnt_zero);
kobject_put(&ib_peer_client->kobj);
}
EXPORT_SYMBOL(ib_unregister_peer_memory_client);
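
/*
* Find a peer willing to claim the range [addr, addr + size). Peers that
* require invalidation support are skipped unless the caller passed
* IB_PEER_MEM_INVAL_SUPP. On success the client's usecnt is raised and
* the context returned by its acquire() callback is stored in
* *peer_client_context.
*/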
static struct ib_peer_memory_client *
ib_get_peer_client(unsigned long addr, size_t size,
unsigned long peer_mem_flags, void **peer_client_context)
{
struct ib_peer_memory_client *ib_peer_client;
int ret = 0;
mutex_lock(&peer_memory_mutex);
list_for_each_entry(ib_peer_client, &peer_memory_list,
core_peer_list) {
if (ib_peer_client->invalidation_required &&
(!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP)))
continue;
ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL,
peer_client_context);
if (ret > 0) {
refcount_inc(&ib_peer_client->usecnt);
mutex_unlock(&peer_memory_mutex);
return ib_peer_client;
}
}
mutex_unlock(&peer_memory_mutex);
return NULL;
}
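
/*
* Release the peer's acquire() context and drop the usecnt reference
* taken in ib_get_peer_client().
*/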
static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
void *peer_client_context)
{
if (ib_peer_client->peer_mem->release)
ib_peer_client->peer_mem->release(peer_client_context);
if (refcount_dec_and_test(&ib_peer_client->usecnt))
complete(&ib_peer_client->usecnt_zero);
}
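
/*
* The peer umem is freed only after both the umem owner and any
* in-flight invalidation callback have dropped their kref.
*/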
static void ib_peer_umem_kref_release(struct kref *kref)
{
kfree(container_of(kref, struct ib_umem_peer, kref));
}
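
/*
* Tear down the peer mapping: restore any scatterlist entries that
* fix_peer_sgls() trimmed, hand the pages back to the peer via dma_unmap()
* and put_pages(), and account the deregistration. The caller must hold
* mapping_lock.
*/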
static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
{
struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
struct ib_umem *umem = &umem_p->umem;
lockdep_assert_held(&umem_p->mapping_lock);
if (umem_p->last_sg) {
umem_p->last_sg->length = umem_p->last_length;
sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
}
if (umem_p->first_sg) {
umem_p->first_sg->dma_address = umem_p->first_dma_address;
umem_p->first_sg->length = umem_p->first_length;
sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
}
peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
umem_p->umem.ibdev->dma_device);
peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
memset(&umem->sg_head, 0, sizeof(umem->sg_head));
atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
GFP_KERNEL);
umem_p->mapped = false;
}
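
/*
* Invalidation entry point handed to peers that requested it. core_context
* is the xarray ID the core passed to get_pages(); the matching umem, if
* any, has its owner notified through the invalidation callback and its
* mapping torn down.
*/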
static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
{
struct ib_peer_memory_client *ib_peer_client = reg_handle;
struct ib_umem_peer *umem_p;
/*
* The client is not required to fence against invalidation during
* put_pages() as that would deadlock when we call put_pages() here.
* Thus the core_context cannot be a umem pointer as we have no control
* over the lifetime. Since we won't change the kABI for this to add a
* proper kref, an xarray is used.
*/
xa_lock(&ib_peer_client->umem_xa);
ib_peer_client->stats.num_free_callbacks += 1;
umem_p = xa_load(&ib_peer_client->umem_xa, core_context);
if (!umem_p)
goto out_unlock;
kref_get(&umem_p->kref);
xa_unlock(&ib_peer_client->umem_xa);
mutex_lock(&umem_p->mapping_lock);
if (umem_p->mapped) {
/*
* At this point the invalidation_func must be !NULL as the get
* flow does not unlock mapping_lock until it is set, and umems
* that do not require invalidation are not in the xarray.
*/
umem_p->invalidation_func(&umem_p->umem,
umem_p->invalidation_private);
ib_unmap_peer_client(umem_p);
}
mutex_unlock(&umem_p->mapping_lock);
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
return 0;
out_unlock:
xa_unlock(&ib_peer_client->umem_xa);
return 0;
}
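
/*
* Arm the invalidation callback for a peer umem. Until this runs the
* mapping_lock taken in ib_peer_umem_get() keeps invalidation blocked,
* so the caller can finish consuming sg_head first.
*/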
void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
umem_invalidate_func_t func,
void *priv)
{
struct ib_umem_peer *umem_p =
container_of(umem, struct ib_umem_peer, umem);
if (WARN_ON(!umem->is_peer))
return;
if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
return;
umem_p->invalidation_func = func;
umem_p->invalidation_private = priv;
/* Pairs with the lock in ib_peer_umem_get() */
mutex_unlock(&umem_p->mapping_lock);
/* At this point func can be called asynchronously */
}
EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
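
/*
* The peer may map memory with a page size larger than PAGE_SIZE. Trim
* the first and last scatterlist entries so the mapping matches the
* PAGE_SIZE-aligned range the umem describes, saving the original values
* so ib_unmap_peer_client() can restore them before dma_unmap().
*/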
static void fix_peer_sgls(struct ib_umem_peer *umem_p, unsigned long peer_page_size)
{
struct ib_umem *umem = &umem_p->umem;
struct scatterlist *sg;
int i;
for_each_sg(umem_p->umem.sg_head.sgl, sg, umem_p->umem.nmap, i) {
if (i == 0) {
unsigned long offset;
umem_p->first_sg = sg;
umem_p->first_dma_address = sg->dma_address;
umem_p->first_dma_length = sg_dma_len(sg);
umem_p->first_length = sg->length;
offset = ALIGN_DOWN(umem->address, PAGE_SIZE) -
ALIGN_DOWN(umem->address, peer_page_size);
sg->dma_address += offset;
sg_dma_len(sg) -= offset;
sg->length -= offset;
}
if (i == umem_p->umem.nmap - 1) {
unsigned long trim;
umem_p->last_sg = sg;
umem_p->last_dma_length = sg_dma_len(sg);
umem_p->last_length = sg->length;
trim = ALIGN(umem->address + umem->length,
peer_page_size) -
ALIGN(umem->address + umem->length, PAGE_SIZE);
sg_dma_len(sg) -= trim;
sg->length -= trim;
}
}
}
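
/*
* Try to satisfy a umem with peer memory after normal page pinning failed
* with old_ret. The old umem is copied into a larger ib_umem_peer, the
* peer pins and DMA-maps the pages, and on success the old umem is freed
* and the embedded one returned. If no peer claims the range the original
* error is returned instead.
*/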
struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
unsigned long peer_mem_flags)
{
struct ib_peer_memory_client *ib_peer_client;
unsigned long peer_page_size;
void *peer_client_context;
struct ib_umem_peer *umem_p;
int ret;
ib_peer_client =
ib_get_peer_client(old_umem->address, old_umem->length,
peer_mem_flags, &peer_client_context);
if (!ib_peer_client)
return ERR_PTR(old_ret);
umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL);
if (!umem_p) {
ret = -ENOMEM;
goto err_client;
}
kref_init(&umem_p->kref);
umem_p->umem = *old_umem;
memset(&umem_p->umem.sg_head, 0, sizeof(umem_p->umem.sg_head));
umem_p->umem.is_peer = 1;
umem_p->ib_peer_client = ib_peer_client;
umem_p->peer_client_context = peer_client_context;
mutex_init(&umem_p->mapping_lock);
umem_p->xa_id = PEER_NO_INVALIDATION_ID;
mutex_lock(&umem_p->mapping_lock);
if (ib_peer_client->invalidation_required) {
ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id,
umem_p,
XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1),
&ib_peer_client->xa_cyclic_next,
GFP_KERNEL);
if (ret < 0)
goto err_umem;
}
/*
* We always request write permissions to the pages, to force breaking
* of any CoW during the registration of the MR. For read-only MRs we
* use the "force" flag to indicate that CoW breaking is required but
* the registration should not fail if referencing read-only areas.
*/
ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address,
umem_p->umem.length, 1,
!umem_p->umem.writable, NULL,
peer_client_context,
umem_p->xa_id);
if (ret)
goto err_xa;
ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sg_head,
peer_client_context,
umem_p->umem.ibdev->dma_device,
0, &umem_p->umem.nmap);
if (ret)
goto err_pages;
peer_page_size = ib_peer_client->peer_mem->get_page_size(peer_client_context);
if (peer_page_size != PAGE_SIZE)
fix_peer_sgls(umem_p, peer_page_size);
umem_p->mapped = true;
atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
/*
* If invalidation is allowed then the caller must call
* ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
* unlock this mutex. This call should be done after the last
* read to sg_head, once the caller is ready for the invalidation
* function to be called.
*/
if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
mutex_unlock(&umem_p->mapping_lock);
/*
* On success the old umem is replaced with the new, larger allocation.
*/
kfree(old_umem);
return &umem_p->umem;
err_pages:
ib_peer_client->peer_mem->put_pages(&umem_p->umem.sg_head,
umem_p->peer_client_context);
err_xa:
if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
err_umem:
mutex_unlock(&umem_p->mapping_lock);
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
err_client:
ib_put_peer_client(ib_peer_client, peer_client_context);
return ERR_PTR(ret);
}
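
/*
* Release a peer umem: unmap it if still mapped, remove it from the
* invalidation xarray, drop the peer client reference, and undo the
* pinned_vm accounting and mm reference that ib_umem_release() would
* otherwise handle.
*/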
void ib_peer_umem_release(struct ib_umem *umem)
{
struct ib_umem_peer *umem_p =
container_of(umem, struct ib_umem_peer, umem);
/* invalidation_func being set indicates activate was called */
if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
umem_p->invalidation_func)
mutex_lock(&umem_p->mapping_lock);
if (umem_p->mapped)
ib_unmap_peer_client(umem_p);
mutex_unlock(&umem_p->mapping_lock);
if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context);
umem_p->ib_peer_client = NULL;
/* Must match ib_umem_release() */
atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
mmdrop(umem->owning_mm);
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
}