| From 8754222838308c2c128854873d1074782e48a81d Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Thu, 2 Sep 2021 14:58:59 -0700 |
| Subject: userfaultfd: prevent concurrent API initialization |
| |
| From: Nadav Amit <namit@vmware.com> |
| |
| [ Upstream commit 22e5fe2a2a279d9a6fcbdfb4dffe73821bef1c90 ] |
| |
| userfaultfd assumes that the enabled features are set once and never |
| changed after UFFDIO_API ioctl succeeded. |
| |
| However, currently, UFFDIO_API can be called concurrently from two |
| different threads, succeed on both threads and leave userfaultfd's |
| features in a non-deterministic state. Theoretically, other uffd |
| operations (ioctls and page-faults) can be dispatched while being |
| adversely affected by such changes of features. |
| |
| Moreover, the writes to ctx->state and ctx->features are not ordered, |
| which can - theoretically, again - let userfaultfd_ioctl() think that |
| userfaultfd API completed, while the features are still not initialized. |
| |
| To avoid races, it is arguably best to get rid of ctx->state. Since there |
| are only 2 states, record the API initialization in ctx->features as the |
| uppermost bit and remove ctx->state. |
| |
| Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com |
| Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") |
| Signed-off-by: Nadav Amit <namit@vmware.com> |
| Cc: Alexander Viro <viro@zeniv.linux.org.uk> |
| Cc: Andrea Arcangeli <aarcange@redhat.com> |
| Cc: Axel Rasmussen <axelrasmussen@google.com> |
| Cc: Jens Axboe <axboe@kernel.dk> |
| Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| fs/userfaultfd.c | 91 +++++++++++++++++++++++------------------------- |
| 1 file changed, 44 insertions(+), 47 deletions(-) |
| |
| diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c |
| index 5c2d806e6ae5..c830cc4ea60f 100644 |
| --- a/fs/userfaultfd.c |
| +++ b/fs/userfaultfd.c |
| @@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly; |
| |
| static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; |
| |
| -enum userfaultfd_state { |
| - UFFD_STATE_WAIT_API, |
| - UFFD_STATE_RUNNING, |
| -}; |
| - |
| /* |
| * Start with fault_pending_wqh and fault_wqh so they're more likely |
| * to be in the same cacheline. |
| @@ -69,8 +64,6 @@ struct userfaultfd_ctx { |
| unsigned int flags; |
| /* features requested from the userspace */ |
| unsigned int features; |
| - /* state machine */ |
| - enum userfaultfd_state state; |
| /* released */ |
| bool released; |
| /* memory mappings are changing because of non-cooperative event */ |
| @@ -104,6 +97,14 @@ struct userfaultfd_wake_range { |
| unsigned long len; |
| }; |
| |
| +/* internal indication that UFFD_API ioctl was successfully executed */ |
| +#define UFFD_FEATURE_INITIALIZED (1u << 31) |
| + |
| +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) |
| +{ |
| + return ctx->features & UFFD_FEATURE_INITIALIZED; |
| +} |
| + |
| static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, |
| int wake_flags, void *key) |
| { |
| @@ -666,7 +667,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) |
| |
| refcount_set(&ctx->refcount, 1); |
| ctx->flags = octx->flags; |
| - ctx->state = UFFD_STATE_RUNNING; |
| ctx->features = octx->features; |
| ctx->released = false; |
| ctx->mmap_changing = false; |
| @@ -943,38 +943,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) |
| |
| poll_wait(file, &ctx->fd_wqh, wait); |
| |
| - switch (ctx->state) { |
| - case UFFD_STATE_WAIT_API: |
| + if (!userfaultfd_is_initialized(ctx)) |
| return EPOLLERR; |
| - case UFFD_STATE_RUNNING: |
| - /* |
| - * poll() never guarantees that read won't block. |
| - * userfaults can be waken before they're read(). |
| - */ |
| - if (unlikely(!(file->f_flags & O_NONBLOCK))) |
| - return EPOLLERR; |
| - /* |
| - * lockless access to see if there are pending faults |
| - * __pollwait last action is the add_wait_queue but |
| - * the spin_unlock would allow the waitqueue_active to |
| - * pass above the actual list_add inside |
| - * add_wait_queue critical section. So use a full |
| - * memory barrier to serialize the list_add write of |
| - * add_wait_queue() with the waitqueue_active read |
| - * below. |
| - */ |
| - ret = 0; |
| - smp_mb(); |
| - if (waitqueue_active(&ctx->fault_pending_wqh)) |
| - ret = EPOLLIN; |
| - else if (waitqueue_active(&ctx->event_wqh)) |
| - ret = EPOLLIN; |
| |
| - return ret; |
| - default: |
| - WARN_ON_ONCE(1); |
| + /* |
| + * poll() never guarantees that read won't block. |
| + * userfaults can be waken before they're read(). |
| + */ |
| + if (unlikely(!(file->f_flags & O_NONBLOCK))) |
| return EPOLLERR; |
| - } |
| + /* |
| + * lockless access to see if there are pending faults |
| + * __pollwait last action is the add_wait_queue but |
| + * the spin_unlock would allow the waitqueue_active to |
| + * pass above the actual list_add inside |
| + * add_wait_queue critical section. So use a full |
| + * memory barrier to serialize the list_add write of |
| + * add_wait_queue() with the waitqueue_active read |
| + * below. |
| + */ |
| + ret = 0; |
| + smp_mb(); |
| + if (waitqueue_active(&ctx->fault_pending_wqh)) |
| + ret = EPOLLIN; |
| + else if (waitqueue_active(&ctx->event_wqh)) |
| + ret = EPOLLIN; |
| + |
| + return ret; |
| } |
| |
| static const struct file_operations userfaultfd_fops; |
| @@ -1169,7 +1164,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, |
| int no_wait = file->f_flags & O_NONBLOCK; |
| struct inode *inode = file_inode(file); |
| |
| - if (ctx->state == UFFD_STATE_WAIT_API) |
| + if (!userfaultfd_is_initialized(ctx)) |
| return -EINVAL; |
| |
| for (;;) { |
| @@ -1908,9 +1903,10 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) |
| static inline unsigned int uffd_ctx_features(__u64 user_features) |
| { |
| /* |
| - * For the current set of features the bits just coincide |
| + * For the current set of features the bits just coincide. Set |
| + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. |
| */ |
| - return (unsigned int)user_features; |
| + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; |
| } |
| |
| /* |
| @@ -1923,12 +1919,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, |
| { |
| struct uffdio_api uffdio_api; |
| void __user *buf = (void __user *)arg; |
| + unsigned int ctx_features; |
| int ret; |
| __u64 features; |
| |
| - ret = -EINVAL; |
| - if (ctx->state != UFFD_STATE_WAIT_API) |
| - goto out; |
| ret = -EFAULT; |
| if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) |
| goto out; |
| @@ -1952,9 +1946,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, |
| ret = -EFAULT; |
| if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) |
| goto out; |
| - ctx->state = UFFD_STATE_RUNNING; |
| + |
| /* only enable the requested features for this uffd context */ |
| - ctx->features = uffd_ctx_features(features); |
| + ctx_features = uffd_ctx_features(features); |
| + ret = -EINVAL; |
| + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) |
| + goto err_out; |
| + |
| ret = 0; |
| out: |
| return ret; |
| @@ -1971,7 +1969,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, |
| int ret = -EINVAL; |
| struct userfaultfd_ctx *ctx = file->private_data; |
| |
| - if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) |
| + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) |
| return -EINVAL; |
| |
| switch(cmd) { |
| @@ -2085,7 +2083,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) |
| refcount_set(&ctx->refcount, 1); |
| ctx->flags = flags; |
| ctx->features = 0; |
| - ctx->state = UFFD_STATE_WAIT_API; |
| ctx->released = false; |
| ctx->mmap_changing = false; |
| ctx->mm = current->mm; |
| -- |
| 2.30.2 |
| |