Merge tag 'vfs-7.1-rc1.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner:
- eventpoll: fix ep_remove() UAF and follow-up cleanup
- fs: aio: set VMA_DONTCOPY_BIT in mmap to fix NULL-pointer-dereference
error
- writeback: Fix use after free in inode_switch_wbs_work_fn()
- fuse: reject oversized dirents in page cache
- fs: aio: reject partial mremap to avoid NULL-pointer-dereference
error
- nstree: fix func. parameter kernel-doc warnings
- fs: Handle multiply claimed blocks more gracefully with mmb
* tag 'vfs-7.1-rc1.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
eventpoll: drop vestigial epi->dying flag
eventpoll: drop dead bool return from ep_remove_epi()
eventpoll: refresh eventpoll_release() fast-path comment
eventpoll: move f_lock acquisition into ep_remove_file()
eventpoll: fix ep_remove struct eventpoll / struct file UAF
eventpoll: move epi_fget() up
eventpoll: rename ep_remove_safe() back to ep_remove()
eventpoll: drop vestigial __ prefix from ep_remove_{file,epi}()
eventpoll: kill __ep_remove()
eventpoll: split __ep_remove()
eventpoll: use hlist_is_singular_node() in __ep_remove()
fs: Handle multiply claimed blocks more gracefully with mmb
nstree: fix func. parameter kernel-doc warnings
fs: aio: reject partial mremap to avoid Null-pointer-dereference error
fuse: reject oversized dirents in page cache
writeback: Fix use after free in inode_switch_wbs_work_fn()
fs: aio: set VMA_DONTCOPY_BIT in mmap to fix NULL-pointer-dereference error
diff --git a/fs/aio.c b/fs/aio.c
index ba9b9fa..7224765 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -422,7 +422,8 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
ctx = rcu_dereference(table->table[i]);
if (ctx && ctx->aio_ring_file == file) {
- if (!atomic_read(&ctx->dead)) {
+ if (!atomic_read(&ctx->dead) &&
+ (ctx->mmap_size == (vma->vm_end - vma->vm_start))) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
res = 0;
}
@@ -447,7 +448,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
{
- vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT);
+ vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTCOPY_BIT);
desc->vm_ops = &aio_ring_vm_ops;
return 0;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index d6e062c..b0b3792 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -719,8 +719,15 @@ void mmb_mark_buffer_dirty(struct buffer_head *bh,
mark_buffer_dirty(bh);
if (!bh->b_mmb) {
spin_lock(&mmb->lock);
+ /*
+ * For a corrupted filesystem with multiply claimed blocks this
+ * can fail. Avoid corrupting the linked list in that case.
+ */
+ if (cmpxchg(&bh->b_mmb, NULL, mmb) != NULL) {
+ spin_unlock(&mmb->lock);
+ return;
+ }
list_move_tail(&bh->b_assoc_buffers, &mmb->list);
- bh->b_mmb = mmb;
spin_unlock(&mmb->lock);
}
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 23f3c6a..a3090b4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -148,13 +148,6 @@ struct epitem {
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
- /*
- * Protected by file->f_lock, true for to-be-released epitem already
- * removed from the "struct file" items list; together with
- * eventpoll->refcount orchestrates "struct eventpoll" disposal
- */
- bool dying;
-
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
@@ -220,10 +213,7 @@ struct eventpoll {
struct hlist_head refs;
u8 loop_check_depth;
- /*
- * usage count, used together with epitem->dying to
- * orchestrate the disposal of this struct
- */
+ /* usage count, orchestrates "struct eventpoll" disposal */
refcount_t refcount;
/* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */
@@ -827,36 +817,47 @@ static void ep_free(struct eventpoll *ep)
}
/*
- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
- * all the associated resources. Must be called with "mtx" held.
- * If the dying flag is set, do the removal only if force is true.
- * This prevents ep_clear_and_put() from dropping all the ep references
- * while running concurrently with eventpoll_release_file().
- * Returns true if the eventpoll can be disposed.
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the atomic_long_inc_not_zero, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
*/
-static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
+static struct file *epi_fget(const struct epitem *epi)
{
- struct file *file = epi->ffd.file;
- struct epitems_head *to_free;
+ struct file *file;
+
+ file = epi->ffd.file;
+ if (!file_ref_get(&file->f_ref))
+ file = NULL;
+ return file;
+}
+
+/*
+ * Takes &file->f_lock; returns with it released.
+ */
+static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
+ struct file *file)
+{
+ struct epitems_head *to_free = NULL;
struct hlist_head *head;
- lockdep_assert_irqs_enabled();
+ lockdep_assert_held(&ep->mtx);
- /*
- * Removes poll wait queue hooks.
- */
- ep_unregister_pollwait(ep, epi);
-
- /* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
- if (epi->dying && !force) {
- spin_unlock(&file->f_lock);
- return false;
- }
-
- to_free = NULL;
head = file->f_ep;
- if (head->first == &epi->fllink && !epi->fllink.next) {
+ if (hlist_is_singular_node(&epi->fllink, head)) {
/* See eventpoll_release() for details. */
WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
@@ -869,6 +870,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
hlist_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
free_ephead(to_free);
+}
+
+static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi)
+{
+ lockdep_assert_held(&ep->mtx);
rb_erase_cached(&epi->rbn, &ep->rbr);
@@ -888,16 +894,32 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return true;
}
/*
* ep_remove variant for callers owing an additional reference to the ep
*/
-static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+static void ep_remove(struct eventpoll *ep, struct epitem *epi)
{
- if (__ep_remove(ep, epi, false))
- WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
+ struct file *file __free(fput) = NULL;
+
+ lockdep_assert_irqs_enabled();
+ lockdep_assert_held(&ep->mtx);
+
+ ep_unregister_pollwait(ep, epi);
+
+ /*
+ * If we manage to grab a reference it means we're not in
+ * eventpoll_release_file() and aren't going to be: once @file's
+ * refcount has reached zero, file_ref_get() cannot bring it back.
+ */
+ file = epi_fget(epi);
+ if (!file)
+ return;
+
+ ep_remove_file(ep, epi, file);
+ ep_remove_epi(ep, epi);
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
@@ -923,7 +945,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
/*
* Walks through the whole tree and try to free each "struct epitem".
- * Note that ep_remove_safe() will not remove the epitem in case of a
+ * Note that ep_remove() will not remove the epitem in case of a
* racing eventpoll_release_file(); the latter will do the removal.
* At this point we are sure no poll callbacks will be lingering around.
* Since we still own a reference to the eventpoll struct, the loop can't
@@ -932,7 +954,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
next = rb_next(rbp);
epi = rb_entry(rbp, struct epitem, rbn);
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
cond_resched();
}
@@ -1013,34 +1035,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
}
/*
- * The ffd.file pointer may be in the process of being torn down due to
- * being closed, but we may not have finished eventpoll_release() yet.
- *
- * Normally, even with the atomic_long_inc_not_zero, the file may have
- * been free'd and then gotten re-allocated to something else (since
- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
- *
- * But for epoll, users hold the ep->mtx mutex, and as such any file in
- * the process of being free'd will block in eventpoll_release_file()
- * and thus the underlying file allocation will not be free'd, and the
- * file re-use cannot happen.
- *
- * For the same reason we can avoid a rcu_read_lock() around the
- * operation - 'ffd.file' cannot go away even if the refcount has
- * reached zero (but we must still not call out to ->poll() functions
- * etc).
- */
-static struct file *epi_fget(const struct epitem *epi)
-{
- struct file *file;
-
- file = epi->ffd.file;
- if (!file_ref_get(&file->f_ref))
- file = NULL;
- return file;
-}
-
-/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
@@ -1117,18 +1111,17 @@ void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
struct epitem *epi;
- bool dispose;
/*
- * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
- * touching the epitems list before eventpoll_release_file() can access
- * the ep->mtx.
+ * A concurrent ep_remove() cannot outrace us: it pins @file via
+ * epi_fget(), which fails once __fput() has dropped the refcount
+ * to zero -- the path we're on. So any racing ep_remove() bails
+ * and leaves the epi for us to clean up here.
*/
again:
spin_lock(&file->f_lock);
if (file->f_ep && file->f_ep->first) {
epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
- epi->dying = true;
spin_unlock(&file->f_lock);
/*
@@ -1137,10 +1130,15 @@ void eventpoll_release_file(struct file *file)
*/
ep = epi->ep;
mutex_lock(&ep->mtx);
- dispose = __ep_remove(ep, epi, true);
+
+ ep_unregister_pollwait(ep, epi);
+
+ ep_remove_file(ep, epi, file);
+ ep_remove_epi(ep, epi);
+
mutex_unlock(&ep->mtx);
- if (dispose && ep_refcount_dec_and_test(ep))
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1619,21 +1617,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
mutex_unlock(&tep->mtx);
/*
- * ep_remove_safe() calls in the later error paths can't lead to
+ * ep_remove() calls in the later error paths can't lead to
* ep_free() as the ep file itself still holds an ep reference.
*/
ep_get(ep);
/* now check if we've created too many backpaths */
if (unlikely(full_check && reverse_path_check())) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return -EINVAL;
}
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return error;
}
}
@@ -1657,7 +1655,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
* high memory pressure.
*/
if (unlikely(!epq.epi)) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return -ENOMEM;
}
@@ -2352,7 +2350,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* The eventpoll itself is still alive: the refcount
* can't go to zero here.
*/
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
error = 0;
} else {
error = -ENOENT;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e1fbdf9..a65694c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -568,28 +568,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
struct inode_switch_wbs_context *isw, *next_isw;
struct llist_node *list;
+ list = llist_del_all(&new_wb->switch_wbs_ctxs);
/*
- * Grab out reference to wb so that it cannot get freed under us
+ * Nothing to do? That would be a problem as references held by isw
+ * items protect wb from freeing...
+ */
+ if (WARN_ON_ONCE(!list))
+ return;
+
+ /*
+ * Grab our reference to wb so that it cannot get freed under us
* after we process all the isw items.
*/
wb_get(new_wb);
- while (1) {
- list = llist_del_all(&new_wb->switch_wbs_ctxs);
- /* Nothing to do? */
- if (!list)
- break;
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH
- * tells the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be
- * visible.
- */
- synchronize_rcu();
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH
+ * tells the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be
+ * visible.
+ */
+ synchronize_rcu();
- llist_for_each_entry_safe(isw, next_isw, list, list)
- process_inode_switch_wbs(new_wb, isw);
- }
+ llist_for_each_entry_safe(isw, next_isw, list, list)
+ process_inode_switch_wbs(new_wb, isw);
wb_put(new_wb);
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index c88194e..db5ae8e 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -41,6 +41,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
unsigned int offset;
void *addr;
+ /* Dirent doesn't fit in readdir cache page? Skip caching. */
+ if (reclen > PAGE_SIZE)
+ return;
+
spin_lock(&fi->rdc.lock);
/*
* Is cache already completed? Or this entry does not go at the end of
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index ea9ca0e..728fb5d 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -39,12 +39,16 @@ static inline void eventpoll_release(struct file *file)
{
/*
- * Fast check to avoid the get/release of the semaphore. Since
- * we're doing this outside the semaphore lock, it might return
- * false negatives, but we don't care. It'll help in 99.99% of cases
- * to avoid the semaphore lock. False positives simply cannot happen
- * because the file in on the way to be removed and nobody ( but
- * eventpoll ) has still a reference to this file.
+ * Fast check to skip the slow path in the common case where the
+ * file was never attached to an epoll. Safe without file->f_lock
+ * because every f_ep writer excludes a concurrent __fput() on
+ * @file:
+ * - ep_insert() requires the file alive (refcount > 0);
+ * - ep_remove() holds @file pinned via epi_fget() across the
+ * write;
+ * - eventpoll_release_file() runs from __fput() itself.
+ * We are in __fput() here, so none of those can race us: a NULL
+ * observation truly means no epoll path has work left on @file.
*/
if (likely(!READ_ONCE(file->f_ep)))
return;
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 175e462..5b64d45 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -61,7 +61,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t
/**
* ns_tree_add_raw - Add a namespace to a namespace
- * @ns: Namespace to add
+ * @__ns: Namespace to add
*
* This function adds a namespace to the appropriate namespace tree
* without assigning a id.
@@ -70,7 +70,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t
/**
* ns_tree_add - Add a namespace to a namespace tree
- * @ns: Namespace to add
+ * @__ns: Namespace to add
*
* This function assigns a new id to the namespace and adds it to the
* appropriate namespace tree and list.
@@ -81,7 +81,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t
/**
* ns_tree_remove - Remove a namespace from a namespace tree
- * @ns: Namespace to remove
+ * @__ns: Namespace to remove
*
* This function removes a namespace from the appropriate namespace
* tree and list.