| From 8019ad13ef7f64be44d4f892af9c840179009254 Mon Sep 17 00:00:00 2001 |
| From: Peter Zijlstra <peterz@infradead.org> |
| Date: Wed, 4 Mar 2020 11:28:31 +0100 |
| Subject: futex: Fix inode life-time issue |
| |
| From: Peter Zijlstra <peterz@infradead.org> |
| |
| commit 8019ad13ef7f64be44d4f892af9c840179009254 upstream. |
| |
| As reported by Jann, ihold() does not in fact guarantee inode |
| persistence. And instead of making it so, replace the usage of inode |
| pointers with a per boot, machine wide, unique inode identifier. |
| |
| This sequence number is global, but shared (file backed) futexes are |
| rare enough that this should not become a performance issue. |
| |
| Reported-by: Jann Horn <jannh@google.com> |
| Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/inode.c | 1 |
| include/linux/fs.h | 1 |
| include/linux/futex.h | 17 +++++---- |
| kernel/futex.c | 89 +++++++++++++++++++++++++++++--------------------- |
| 4 files changed, 65 insertions(+), 43 deletions(-) |
| |
| --- a/fs/inode.c |
| +++ b/fs/inode.c |
| @@ -135,6 +135,7 @@ int inode_init_always(struct super_block |
| inode->i_sb = sb; |
| inode->i_blkbits = sb->s_blocksize_bits; |
| inode->i_flags = 0; |
| + atomic64_set(&inode->i_sequence, 0); |
| atomic_set(&inode->i_count, 1); |
| inode->i_op = &empty_iops; |
| inode->i_fop = &no_open_fops; |
| --- a/include/linux/fs.h |
| +++ b/include/linux/fs.h |
| @@ -658,6 +658,7 @@ struct inode { |
| struct rcu_head i_rcu; |
| }; |
| u64 i_version; |
| + atomic64_t i_sequence; /* see futex */ |
| atomic_t i_count; |
| atomic_t i_dio_count; |
| atomic_t i_writecount; |
| --- a/include/linux/futex.h |
| +++ b/include/linux/futex.h |
| @@ -34,23 +34,26 @@ handle_futex_death(u32 __user *uaddr, st |
| |
| union futex_key { |
| struct { |
| + u64 i_seq; |
| unsigned long pgoff; |
| - struct inode *inode; |
| - int offset; |
| + unsigned int offset; |
| } shared; |
| struct { |
| + union { |
| + struct mm_struct *mm; |
| + u64 __tmp; |
| + }; |
| unsigned long address; |
| - struct mm_struct *mm; |
| - int offset; |
| + unsigned int offset; |
| } private; |
| struct { |
| + u64 ptr; |
| unsigned long word; |
| - void *ptr; |
| - int offset; |
| + unsigned int offset; |
| } both; |
| }; |
| |
| -#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } |
| +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } |
| |
| #ifdef CONFIG_FUTEX |
| extern void exit_robust_list(struct task_struct *curr); |
| --- a/kernel/futex.c |
| +++ b/kernel/futex.c |
| @@ -407,7 +407,7 @@ static void get_futex_key_refs(union fut |
| |
| switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
| case FUT_OFF_INODE: |
| - ihold(key->shared.inode); /* implies MB (B) */ |
| + smp_mb(); /* explicit smp_mb(); (B) */ |
| break; |
| case FUT_OFF_MMSHARED: |
| futex_get_mm(key); /* implies MB (B) */ |
| @@ -438,7 +438,6 @@ static void drop_futex_key_refs(union fu |
| |
| switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
| case FUT_OFF_INODE: |
| - iput(key->shared.inode); |
| break; |
| case FUT_OFF_MMSHARED: |
| mmdrop(key->private.mm); |
| @@ -446,6 +445,46 @@ static void drop_futex_key_refs(union fu |
| } |
| } |
| |
| +/* |
| + * Generate a machine wide unique identifier for this inode. |
| + * |
| + * This relies on u64 not wrapping in the life-time of the machine; which with |
| + * 1ns resolution means almost 585 years. |
| + * |
| + * This further relies on the fact that a well formed program will not unmap |
| + * the file while it has a (shared) futex waiting on it. This mapping will have |
| + * a file reference which pins the mount and inode. |
| + * |
| + * If for some reason an inode gets evicted and read back in again, it will get |
| + * a new sequence number and will _NOT_ match, even though it is the exact same |
| + * file. |
| + * |
| + * It is important that match_futex() will never have a false-positive, esp. |
| + * for PI futexes that can mess up the state. The above argues that false-negatives |
| + * are only possible for malformed programs. |
| + */ |
| +static u64 get_inode_sequence_number(struct inode *inode) |
| +{ |
| + static atomic64_t i_seq; |
| + u64 old; |
| + |
| + /* Does the inode already have a sequence number? */ |
| + old = atomic64_read(&inode->i_sequence); |
| + if (likely(old)) |
| + return old; |
| + |
| + for (;;) { |
| + u64 new = atomic64_add_return(1, &i_seq); |
| + if (WARN_ON_ONCE(!new)) |
| + continue; |
| + |
| + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); |
| + if (old) |
| + return old; |
| + return new; |
| + } |
| +} |
| + |
| /** |
| * get_futex_key() - Get parameters which are the keys for a futex |
| * @uaddr: virtual address of the futex |
| @@ -458,9 +497,15 @@ static void drop_futex_key_refs(union fu |
| * |
| * The key words are stored in *key on success. |
| * |
| - * For shared mappings, it's (page->index, file_inode(vma->vm_file), |
| - * offset_within_page). For private mappings, it's (uaddr, current->mm). |
| - * We can usually work out the index without swapping in the page. |
| + * For shared mappings (when @fshared), the key is: |
| + * ( inode->i_sequence, page->index, offset_within_page ) |
| + * [ also see get_inode_sequence_number() ] |
| + * |
| + * For private mappings (or when !@fshared), the key is: |
| + * ( current->mm, address, 0 ) |
| + * |
| + * This allows (cross process, where applicable) identification of the futex |
| + * without keeping the page pinned for the duration of the FUTEX_WAIT. |
| * |
| * lock_page() might sleep, the caller should not hold a spinlock. |
| */ |
| @@ -628,8 +673,6 @@ again: |
| key->private.mm = mm; |
| key->private.address = address; |
| |
| - get_futex_key_refs(key); /* implies smp_mb(); (B) */ |
| - |
| } else { |
| struct inode *inode; |
| |
| @@ -661,40 +704,14 @@ again: |
| goto again; |
| } |
| |
| - /* |
| - * Take a reference unless it is about to be freed. Previously |
| - * this reference was taken by ihold under the page lock |
| - * pinning the inode in place so i_lock was unnecessary. The |
| - * only way for this check to fail is if the inode was |
| - * truncated in parallel which is almost certainly an |
| - * application bug. In such a case, just retry. |
| - * |
| - * We are not calling into get_futex_key_refs() in file-backed |
| - * cases, therefore a successful atomic_inc return below will |
| - * guarantee that get_futex_key() will still imply smp_mb(); (B). |
| - */ |
| - if (!atomic_inc_not_zero(&inode->i_count)) { |
| - rcu_read_unlock(); |
| - put_page(page_head); |
| - |
| - goto again; |
| - } |
| - |
| - /* Should be impossible but lets be paranoid for now */ |
| - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { |
| - err = -EFAULT; |
| - rcu_read_unlock(); |
| - iput(inode); |
| - |
| - goto out; |
| - } |
| - |
| key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
| - key->shared.inode = inode; |
| + key->shared.i_seq = get_inode_sequence_number(inode); |
| key->shared.pgoff = basepage_index(page); |
| rcu_read_unlock(); |
| } |
| |
| + get_futex_key_refs(key); /* implies smp_mb(); (B) */ |
| + |
| out: |
| put_page(page_head); |
| return err; |