| From: Lorenzo Stoakes <lstoakes@gmail.com> |
| Subject: mm: vmalloc: convert vread() to vread_iter() |
| Date: Wed, 22 Mar 2023 18:57:04 +0000 |
| |
| Having previously laid the foundation for converting vread() to an |
| iterator function, pull the trigger and do so. |
| |
| This patch keeps the refactoring to a minimum and reflects the existing |
| logic as closely as possible; for example, we continue to zero portions of |
| memory that are not read, as before. |
| |
| Overall, there should be no functional difference other than a performance |
| improvement in /proc/kcore access to vmalloc regions. |
| |
| Now that we have eliminated the need for a bounce buffer in |
| read_kcore_iter(), we dispense with it and write to user memory |
| optimistically, with faults disabled, via copy_page_to_iter_nofault(). We |
| already have preemption disabled by holding a spin lock. We keep faulting |
| the destination in and retrying until the operation is complete. |
| |
| Additionally, we must account for the fact that a copy may fail at any |
| point (most likely because a fault could not be taken); in that case we |
| exit, indicating that fewer bytes were retrieved than expected. |
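| |
| To illustrate, a minimal, hypothetical sketch of this retry pattern from a |
| caller's point of view (the helper name is invented for illustration and |
| assumes linux/uio.h and linux/vmalloc.h; the real loop is in the |
| read_kcore_iter() hunk below): |
| |
| 	/* Hypothetical helper, not part of this patch. */ |
| 	static int copy_vmalloc_range(struct iov_iter *iter, |
| 				      const char *addr, size_t count) |
| 	{ |
| 		size_t done = 0; |
| |
| 		while (done < count) { |
| 			/* Copy what we can without taking faults. */ |
| 			done += vread_iter(iter, addr + done, count - done); |
| 			if (done == count) |
| 				break; |
| |
| 			/* Fault the remainder in and retry, or give up. */ |
| 			if (fault_in_iov_iter_writeable(iter, count - done)) |
| 				return -EFAULT; |
| 		} |
| 		return 0; |
| 	} |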
| |
| [sfr@canb.auug.org.au: fix sparc64 warning] |
| Link: https://lkml.kernel.org/r/20230320144721.663280c3@canb.auug.org.au |
| [lstoakes@gmail.com: redo Stephen's sparc build fix] |
| Link: https://lkml.kernel.org/r/8506cbc667c39205e65a323f750ff9c11a463798.1679566220.git.lstoakes@gmail.com |
| [akpm@linux-foundation.org: unbreak uio.h includes] |
| Link: https://lkml.kernel.org/r/941f88bc5ab928e6656e1e2593b91bf0f8c81e1b.1679511146.git.lstoakes@gmail.com |
| Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com> |
| Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> |
| Reviewed-by: Baoquan He <bhe@redhat.com> |
| Cc: Alexander Viro <viro@zeniv.linux.org.uk> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Jens Axboe <axboe@kernel.dk> |
| Cc: Jiri Olsa <jolsa@kernel.org> |
| Cc: Liu Shixin <liushixin2@huawei.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Uladzislau Rezki (Sony) <urezki@gmail.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| fs/proc/kcore.c | 44 +++---- |
| include/linux/vmalloc.h | 3 |
| mm/nommu.c | 10 - |
| mm/vmalloc.c | 237 +++++++++++++++++++++++--------------- |
| 4 files changed, 178 insertions(+), 116 deletions(-) |
| |
| --- a/fs/proc/kcore.c~mm-vmalloc-convert-vread-to-vread_iter |
| +++ a/fs/proc/kcore.c |
| @@ -307,13 +307,9 @@ static void append_kcore_note(char *note |
| *i = ALIGN(*i + descsz, 4); |
| } |
| |
| -static ssize_t |
| -read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) |
| +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) |
| { |
| - struct file *file = iocb->ki_filp; |
| - char *buf = file->private_data; |
| loff_t *fpos = &iocb->ki_pos; |
| - |
| size_t phdrs_offset, notes_offset, data_offset; |
| size_t page_offline_frozen = 1; |
| size_t phdrs_len, notes_len; |
| @@ -507,13 +503,30 @@ read_kcore_iter(struct kiocb *iocb, stru |
| |
| switch (m->type) { |
| case KCORE_VMALLOC: |
| - vread(buf, (char *)start, tsz); |
| - /* we have to zero-fill user buffer even if no read */ |
| - if (copy_to_iter(buf, tsz, iter) != tsz) { |
| - ret = -EFAULT; |
| - goto out; |
| + { |
| + const char *src = (char *)start; |
| + size_t read = 0, left = tsz; |
| + |
| + /* |
| + * vmalloc uses spinlocks, so we optimistically try to |
| + * read memory. If this fails, fault pages in and try |
| + * again until we are done. |
| + */ |
| + while (true) { |
| + read += vread_iter(iter, src, left); |
| + if (read == tsz) |
| + break; |
| + |
| + src += read; |
| + left -= read; |
| + |
| + if (fault_in_iov_iter_writeable(iter, left)) { |
| + ret = -EFAULT; |
| + goto out; |
| + } |
| } |
| break; |
| + } |
| case KCORE_USER: |
| /* User page is handled prior to normal kernel page: */ |
| if (copy_to_iter((char *)start, tsz, iter) != tsz) { |
| @@ -582,10 +595,6 @@ static int open_kcore(struct inode *inod |
| if (ret) |
| return ret; |
| |
| - filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); |
| - if (!filp->private_data) |
| - return -ENOMEM; |
| - |
| if (kcore_need_update) |
| kcore_update_ram(); |
| if (i_size_read(inode) != proc_root_kcore->size) { |
| @@ -596,16 +605,9 @@ static int open_kcore(struct inode *inod |
| return 0; |
| } |
| |
| -static int release_kcore(struct inode *inode, struct file *file) |
| -{ |
| - kfree(file->private_data); |
| - return 0; |
| -} |
| - |
| static const struct proc_ops kcore_proc_ops = { |
| .proc_read_iter = read_kcore_iter, |
| .proc_open = open_kcore, |
| - .proc_release = release_kcore, |
| .proc_lseek = default_llseek, |
| }; |
| |
| --- a/include/linux/vmalloc.h~mm-vmalloc-convert-vread-to-vread_iter |
| +++ a/include/linux/vmalloc.h |
| @@ -14,6 +14,7 @@ |
| |
| struct vm_area_struct; /* vma defining user mapping in mm_types.h */ |
| struct notifier_block; /* in notifier.h */ |
| +struct iov_iter; /* in uio.h */ |
| |
| /* bits in flags of vmalloc's vm_struct below */ |
| #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ |
| @@ -247,7 +248,7 @@ static inline void set_vm_flush_reset_pe |
| #endif |
| |
| /* for /proc/kcore */ |
| -extern long vread(char *buf, char *addr, unsigned long count); |
| +extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); |
| |
| /* |
| * Internals. Don't use.. |
| --- a/mm/nommu.c~mm-vmalloc-convert-vread-to-vread_iter |
| +++ a/mm/nommu.c |
| @@ -36,6 +36,7 @@ |
| #include <linux/printk.h> |
| |
| #include <linux/uaccess.h> |
| +#include <linux/uio.h> |
| #include <asm/tlb.h> |
| #include <asm/tlbflush.h> |
| #include <asm/mmu_context.h> |
| @@ -198,14 +199,13 @@ unsigned long vmalloc_to_pfn(const void |
| } |
| EXPORT_SYMBOL(vmalloc_to_pfn); |
| |
| -long vread(char *buf, char *addr, unsigned long count) |
| +long vread_iter(struct iov_iter *iter, const char *addr, size_t count) |
| { |
| /* Don't allow overflow */ |
| - if ((unsigned long) buf + count < count) |
| - count = -(unsigned long) buf; |
| + if ((unsigned long) addr + count < count) |
| + count = -(unsigned long) addr; |
| |
| - memcpy(buf, addr, count); |
| - return count; |
| + return copy_to_iter(addr, count, iter); |
| } |
| |
| /* |
| --- a/mm/vmalloc.c~mm-vmalloc-convert-vread-to-vread_iter |
| +++ a/mm/vmalloc.c |
| @@ -33,11 +33,11 @@ |
| #include <linux/compiler.h> |
| #include <linux/memcontrol.h> |
| #include <linux/llist.h> |
| +#include <linux/uio.h> |
| #include <linux/bitops.h> |
| #include <linux/rbtree_augmented.h> |
| #include <linux/overflow.h> |
| #include <linux/pgtable.h> |
| -#include <linux/uaccess.h> |
| #include <linux/hugetlb.h> |
| #include <linux/sched/mm.h> |
| #include <asm/tlbflush.h> |
| @@ -3442,62 +3442,96 @@ void *vmalloc_32_user(unsigned long size |
| EXPORT_SYMBOL(vmalloc_32_user); |
| |
| /* |
| - * small helper routine , copy contents to buf from addr. |
| - * If the page is not present, fill zero. |
| + * Atomically zero bytes in the iterator. |
| + * |
| + * Returns the number of zeroed bytes. |
| */ |
| +static size_t zero_iter(struct iov_iter *iter, size_t count) |
| +{ |
| + size_t remains = count; |
| + |
| + while (remains > 0) { |
| + size_t num, copied; |
| + |
| + num = remains < PAGE_SIZE ? remains : PAGE_SIZE; |
| + copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); |
| + remains -= copied; |
| + |
| + if (copied < num) |
| + break; |
| + } |
| |
| -static int aligned_vread(char *buf, char *addr, unsigned long count) |
| + return count - remains; |
| +} |
| + |
| +/* |
| + * small helper routine, copy contents to iter from addr. |
| + * If the page is not present, fill zero. |
| + * |
| + * Returns the number of copied bytes. |
| + */ |
| +static size_t aligned_vread_iter(struct iov_iter *iter, |
| + const char *addr, size_t count) |
| { |
| - struct page *p; |
| - int copied = 0; |
| + size_t remains = count; |
| + struct page *page; |
| |
| - while (count) { |
| + while (remains > 0) { |
| unsigned long offset, length; |
| + size_t copied = 0; |
| |
| offset = offset_in_page(addr); |
| length = PAGE_SIZE - offset; |
| - if (length > count) |
| - length = count; |
| - p = vmalloc_to_page(addr); |
| + if (length > remains) |
| + length = remains; |
| + page = vmalloc_to_page(addr); |
| /* |
| - * To do safe access to this _mapped_ area, we need |
| - * lock. But adding lock here means that we need to add |
| - * overhead of vmalloc()/vfree() calls for this _debug_ |
| - * interface, rarely used. Instead of that, we'll use |
| - * kmap() and get small overhead in this access function. |
| + * To do safe access to this _mapped_ area, we need lock. But |
| + * adding lock here means that we need to add overhead of |
| + * vmalloc()/vfree() calls for this _debug_ interface, rarely |
| + * used. Instead of that, we'll use a local mapping via |
| + * copy_page_to_iter_nofault() and accept a small overhead in |
| + * this access function. |
| */ |
| - if (p) { |
| - /* We can expect USER0 is not used -- see vread() */ |
| - void *map = kmap_atomic(p); |
| - memcpy(buf, map + offset, length); |
| - kunmap_atomic(map); |
| - } else |
| - memset(buf, 0, length); |
| + if (page) |
| + copied = copy_page_to_iter_nofault(page, offset, |
| + length, iter); |
| + else |
| + copied = zero_iter(iter, length); |
| |
| - addr += length; |
| - buf += length; |
| - copied += length; |
| - count -= length; |
| + addr += copied; |
| + remains -= copied; |
| + |
| + if (copied != length) |
| + break; |
| } |
| - return copied; |
| + |
| + return count - remains; |
| } |
| |
| -static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) |
| +/* |
| + * Read from a vm_map_ram region of memory. |
| + * |
| + * Returns the number of copied bytes. |
| + */ |
| +static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, |
| + size_t count, unsigned long flags) |
| { |
| char *start; |
| struct vmap_block *vb; |
| unsigned long offset; |
| - unsigned int rs, re, n; |
| + unsigned int rs, re; |
| + size_t remains, n; |
| |
| /* |
| * If it's area created by vm_map_ram() interface directly, but |
| * not further subdividing and delegating management to vmap_block, |
| * handle it here. |
| */ |
| - if (!(flags & VMAP_BLOCK)) { |
| - aligned_vread(buf, addr, count); |
| - return; |
| - } |
| + if (!(flags & VMAP_BLOCK)) |
| + return aligned_vread_iter(iter, addr, count); |
| + |
| + remains = count; |
| |
| /* |
| * Area is split into regions and tracked with vmap_block, read out |
| @@ -3505,50 +3539,64 @@ static void vmap_ram_vread(char *buf, ch |
| */ |
| vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); |
| if (!vb) |
| - goto finished; |
| + goto finished_zero; |
| |
| spin_lock(&vb->lock); |
| if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { |
| spin_unlock(&vb->lock); |
| - goto finished; |
| + goto finished_zero; |
| } |
| + |
| for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { |
| - if (!count) |
| - break; |
| + size_t copied; |
| + |
| + if (remains == 0) |
| + goto finished; |
| + |
| start = vmap_block_vaddr(vb->va->va_start, rs); |
| - while (addr < start) { |
| - if (count == 0) |
| - goto unlock; |
| - *buf = '\0'; |
| - buf++; |
| - addr++; |
| - count--; |
| + |
| + if (addr < start) { |
| + size_t to_zero = min_t(size_t, start - addr, remains); |
| + size_t zeroed = zero_iter(iter, to_zero); |
| + |
| + addr += zeroed; |
| + remains -= zeroed; |
| + |
| + if (remains == 0 || zeroed != to_zero) |
| + goto finished; |
| } |
| + |
| /*it could start reading from the middle of used region*/ |
| offset = offset_in_page(addr); |
| n = ((re - rs + 1) << PAGE_SHIFT) - offset; |
| - if (n > count) |
| - n = count; |
| - aligned_vread(buf, start+offset, n); |
| - |
| - buf += n; |
| - addr += n; |
| - count -= n; |
| + if (n > remains) |
| + n = remains; |
| + |
| + copied = aligned_vread_iter(iter, start + offset, n); |
| + |
| + addr += copied; |
| + remains -= copied; |
| + |
| + if (copied != n) |
| + goto finished; |
| } |
| -unlock: |
| + |
| spin_unlock(&vb->lock); |
| |
| -finished: |
| +finished_zero: |
| /* zero-fill the left dirty or free regions */ |
| - if (count) |
| - memset(buf, 0, count); |
| + return count - remains + zero_iter(iter, remains); |
| +finished: |
| + /* We couldn't copy/zero everything */ |
| + spin_unlock(&vb->lock); |
| + return count - remains; |
| } |
| |
| /** |
| - * vread() - read vmalloc area in a safe way. |
| - * @buf: buffer for reading data |
| - * @addr: vm address. |
| - * @count: number of bytes to be read. |
| + * vread_iter() - read vmalloc area in a safe way to an iterator. |
| + * @iter: the iterator to which data should be written. |
| + * @addr: vm address. |
| + * @count: number of bytes to be read. |
| * |
| * This function checks that addr is a valid vmalloc'ed area, and |
| * copy data from that area to a given buffer. If the given memory range |
| @@ -3568,13 +3616,12 @@ finished: |
| * (same number as @count) or %0 if [addr...addr+count) doesn't |
| * include any intersection with valid vmalloc area |
| */ |
| -long vread(char *buf, char *addr, unsigned long count) |
| +long vread_iter(struct iov_iter *iter, const char *addr, size_t count) |
| { |
| struct vmap_area *va; |
| struct vm_struct *vm; |
| - char *vaddr, *buf_start = buf; |
| - unsigned long buflen = count; |
| - unsigned long n, size, flags; |
| + char *vaddr; |
| + size_t n, size, flags, remains; |
| |
| addr = kasan_reset_tag(addr); |
| |
| @@ -3582,18 +3629,22 @@ long vread(char *buf, char *addr, unsign |
| if ((unsigned long) addr + count < count) |
| count = -(unsigned long) addr; |
| |
| + remains = count; |
| + |
| spin_lock(&vmap_area_lock); |
| va = find_vmap_area_exceed_addr((unsigned long)addr); |
| if (!va) |
| - goto finished; |
| + goto finished_zero; |
| |
| /* no intersects with alive vmap_area */ |
| - if ((unsigned long)addr + count <= va->va_start) |
| - goto finished; |
| + if ((unsigned long)addr + remains <= va->va_start) |
| + goto finished_zero; |
| |
| list_for_each_entry_from(va, &vmap_area_list, list) { |
| - if (!count) |
| - break; |
| + size_t copied; |
| + |
| + if (remains == 0) |
| + goto finished; |
| |
| vm = va->vm; |
| flags = va->flags & VMAP_FLAGS_MASK; |
| @@ -3608,6 +3659,7 @@ long vread(char *buf, char *addr, unsign |
| |
| if (vm && (vm->flags & VM_UNINITIALIZED)) |
| continue; |
| + |
| /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ |
| smp_rmb(); |
| |
| @@ -3616,38 +3668,45 @@ long vread(char *buf, char *addr, unsign |
| |
| if (addr >= vaddr + size) |
| continue; |
| - while (addr < vaddr) { |
| - if (count == 0) |
| + |
| + if (addr < vaddr) { |
| + size_t to_zero = min_t(size_t, vaddr - addr, remains); |
| + size_t zeroed = zero_iter(iter, to_zero); |
| + |
| + addr += zeroed; |
| + remains -= zeroed; |
| + |
| + if (remains == 0 || zeroed != to_zero) |
| goto finished; |
| - *buf = '\0'; |
| - buf++; |
| - addr++; |
| - count--; |
| } |
| + |
| n = vaddr + size - addr; |
| - if (n > count) |
| - n = count; |
| + if (n > remains) |
| + n = remains; |
| |
| if (flags & VMAP_RAM) |
| - vmap_ram_vread(buf, addr, n, flags); |
| + copied = vmap_ram_vread_iter(iter, addr, n, flags); |
| else if (!(vm->flags & VM_IOREMAP)) |
| - aligned_vread(buf, addr, n); |
| + copied = aligned_vread_iter(iter, addr, n); |
| else /* IOREMAP area is treated as memory hole */ |
| - memset(buf, 0, n); |
| - buf += n; |
| - addr += n; |
| - count -= n; |
| + copied = zero_iter(iter, n); |
| + |
| + addr += copied; |
| + remains -= copied; |
| + |
| + if (copied != n) |
| + goto finished; |
| } |
| -finished: |
| - spin_unlock(&vmap_area_lock); |
| |
| - if (buf == buf_start) |
| - return 0; |
| +finished_zero: |
| + spin_unlock(&vmap_area_lock); |
| /* zero-fill memory holes */ |
| - if (buf != buf_start + buflen) |
| - memset(buf, 0, buflen - (buf - buf_start)); |
| + return count - remains + zero_iter(iter, remains); |
| +finished: |
| + /* Nothing remains, or we couldn't copy/zero everything. */ |
| + spin_unlock(&vmap_area_lock); |
| |
| - return buflen; |
| + return count - remains; |
| } |
| |
| /** |
| _ |