| From 839d42687dfce0ed0ea2c6bd8d707cc0e276fbe7 Mon Sep 17 00:00:00 2001 |
| From: "Eric W. Biederman" <ebiederm@xmission.com> |
| Date: Fri, 20 Jan 2017 18:28:35 +1300 |
| Subject: [PATCH 013/251] mnt: Tuck mounts under others instead of creating |
| shadow/side mounts. |
| |
| commit 1064f874abc0d05eeed8993815f584d847b72486 upstream. |
| |
| Ever since mount propagation was introduced, in cases where a mount |
| is propagated to a parent mount and mountpoint pair that is already |
| in use, the code has placed the new mount behind the old mount in the |
| mount hash table. |
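| |
| For context, this is the shadow-chain walk being removed (quoted, |
| with comments added, from the __lookup_mnt_last() deletion in the |
| diff below). Several mounts could share one (parent, dentry) hash |
| slot, and the visible mount was the last matching entry on the chain: |
| |
|     struct mount *p, *res = NULL; |
|     p = __lookup_mnt(mnt, dentry);	/* first entry in the chain */ |
|     if (!p) |
|             goto out; |
|     if (!(p->mnt.mnt_flags & MNT_UMOUNT)) |
|             res = p; |
|     /* later entries at the same (mnt, dentry) shadow earlier ones */ |
|     hlist_for_each_entry_continue(p, mnt_hash) { |
|             if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) |
|                     break; |
|             if (!(p->mnt.mnt_flags & MNT_UMOUNT)) |
|                     res = p; |
|     } |
| out: |
|     return res; |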
| |
| This implementation detail is problematic as it allows the creation |
| of arbitrary-length mount hash chains. |
| |
| Furthermore it invalidates the constraint, maintained elsewhere in |
| the mount code, that a parent mount and mountpoint pair will have |
| exactly one mount upon them, making it hard to deal with and to talk |
| about this special case in the mount code. |
| |
| Modify mount propagation to notice when there is already a mount at |
| the parent mount and mountpoint where a new mount is propagating to |
| and place that preexisting mount on top of the new mount. |
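| |
| The mount-side core of the change, condensed (comments added) from |
| attach_recursive_mnt() in the diff below; smp is a mountpoint |
| preallocated at the source mount's root for exactly this case: |
| |
|     hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { |
|             struct mount *q; |
|             hlist_del_init(&child->mnt_hash); |
|             /* is a mount already attached at this parent/mountpoint? */ |
|             q = __lookup_mnt(&child->mnt_parent->mnt, |
|                              child->mnt_mountpoint); |
|             if (q) |
|                     /* tuck the preexisting mount on top of the new one */ |
|                     mnt_change_mountpoint(child, smp, q); |
|             commit_tree(child); |
|     } |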
| |
| Modify unmount propagation to notice when a mount that is being |
| unmounted has another mount on top of it (and no other children), and |
| to replace the unmounted mount with the mount on top of it. |
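| |
| The unmount-side counterpart, condensed (comments added) from |
| __propagate_umount() in the diff below; find_topper() returns the |
| single mount covering all of child, or NULL: |
| |
|     topper = find_topper(child); |
|     if (topper) |
|             /* re-parent the covering mount onto child's own parent |
|              * and mountpoint, so it takes child's place when child |
|              * is unmounted */ |
|             mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, |
|                                   topper); |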
| |
| Move the MNT_UMOUNT test from __lookup_mnt_last into |
| __propagate_umount as that is the only caller of __lookup_mnt_last |
| where MNT_UMOUNT may be set on any mount visible in the mount hash |
| table. |
| |
| These modifications allow: |
| - __lookup_mnt_last to be removed. |
| - attach_shadows to be renamed __attach_mnt and its shadow |
| handling to be removed. |
| - commit_tree to be simplified |
| - copy_tree to be simplified |
| |
| The result is an easier-to-understand tree of mounts that does not |
| allow the creation of arbitrary-length hash chains in the mount hash |
| table. |
| |
| The result is also a very slight userspace-visible difference in |
| semantics. The following two cases now behave identically, where |
| before the order mattered: |
| |
| case 1: (explicit user action) |
| B is a slave of A |
| mount something on A/a , it will propagate to B/a |
| and then mount something on B/a |
| |
| case 2: (tucked mount) |
| B is a slave of A |
| mount something on B/a |
| and then mount something on A/a |
| |
| Historically umount A/a would fail in case 1 and succeed in case 2. |
| Now umount A/a succeeds in both configurations. |
| |
| This very small change in semantics appears, if anything, to be a |
| bug fix, and my survey of userspace leads me to believe that no |
| programs will notice or care about this subtle semantic change. |
| |
| v2: Updated mnt_change_mountpoint to not call dput or mntput and |
|     instead decrement the counts directly. It is guaranteed that |
|     there will be other references when mnt_change_mountpoint is |
|     called, so this is safe. |
| |
| v3: Moved put_mountpoint under mount_lock in attach_recursive_mnt, |
|     as the locking in fs/namespace.c changed between v2 and v3. |
| |
| v4: Reworked the logic in propagate_mount_busy and __propagate_umount |
| that detects when a mount completely covers another mount. |
| |
| v5: Removed unnecessary tests whose result is always true in |
|     find_topper and attach_recursive_mnt. |
| |
| v6: Document the user space visible semantic difference. |
| |
| Fixes: b90fa9ae8f51 ("[PATCH] shared mount handling: bind and rbind") |
| Tested-by: Andrei Vagin <avagin@virtuozzo.com> |
| Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| fs/mount.h | 1 - |
| fs/namespace.c | 109 +++++++++++++++++++++++++++++++-------------------------- |
| fs/pnode.c | 61 +++++++++++++++++++++++++------- |
| fs/pnode.h | 2 ++ |
| 4 files changed, 110 insertions(+), 63 deletions(-) |
| |
| diff --git a/fs/mount.h b/fs/mount.h |
| index 14db05d424f7..3dc7dea5a357 100644 |
| --- a/fs/mount.h |
| +++ b/fs/mount.h |
| @@ -86,7 +86,6 @@ static inline int is_mounted(struct vfsmount *mnt) |
| } |
| |
| extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); |
| -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); |
| |
| extern int __legitimize_mnt(struct vfsmount *, unsigned); |
| extern bool legitimize_mnt(struct vfsmount *, unsigned); |
| diff --git a/fs/namespace.c b/fs/namespace.c |
| index da98a1bbd8b5..7df3d406d3e0 100644 |
| --- a/fs/namespace.c |
| +++ b/fs/namespace.c |
| @@ -638,28 +638,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) |
| } |
| |
| /* |
| - * find the last mount at @dentry on vfsmount @mnt. |
| - * mount_lock must be held. |
| - */ |
| -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) |
| -{ |
| - struct mount *p, *res = NULL; |
| - p = __lookup_mnt(mnt, dentry); |
| - if (!p) |
| - goto out; |
| - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) |
| - res = p; |
| - hlist_for_each_entry_continue(p, mnt_hash) { |
| - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) |
| - break; |
| - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) |
| - res = p; |
| - } |
| -out: |
| - return res; |
| -} |
| - |
| -/* |
| * lookup_mnt - Return the first child mount mounted at path |
| * |
| * "First" means first mounted chronologically. If you create the |
| @@ -879,6 +857,13 @@ void mnt_set_mountpoint(struct mount *mnt, |
| hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); |
| } |
| |
| +static void __attach_mnt(struct mount *mnt, struct mount *parent) |
| +{ |
| + hlist_add_head_rcu(&mnt->mnt_hash, |
| + m_hash(&parent->mnt, mnt->mnt_mountpoint)); |
| + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); |
| +} |
| + |
| /* |
| * vfsmount lock must be held for write |
| */ |
| @@ -887,28 +872,45 @@ static void attach_mnt(struct mount *mnt, |
| struct mountpoint *mp) |
| { |
| mnt_set_mountpoint(parent, mp, mnt); |
| - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); |
| - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); |
| + __attach_mnt(mnt, parent); |
| } |
| |
| -static void attach_shadowed(struct mount *mnt, |
| - struct mount *parent, |
| - struct mount *shadows) |
| +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) |
| { |
| - if (shadows) { |
| - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); |
| - list_add(&mnt->mnt_child, &shadows->mnt_child); |
| - } else { |
| - hlist_add_head_rcu(&mnt->mnt_hash, |
| - m_hash(&parent->mnt, mnt->mnt_mountpoint)); |
| - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); |
| - } |
| + struct mountpoint *old_mp = mnt->mnt_mp; |
| + struct dentry *old_mountpoint = mnt->mnt_mountpoint; |
| + struct mount *old_parent = mnt->mnt_parent; |
| + |
| + list_del_init(&mnt->mnt_child); |
| + hlist_del_init(&mnt->mnt_mp_list); |
| + hlist_del_init_rcu(&mnt->mnt_hash); |
| + |
| + attach_mnt(mnt, parent, mp); |
| + |
| + put_mountpoint(old_mp); |
| + |
| + /* |
| + * Safely avoid even the suggestion this code might sleep or |
| + * lock the mount hash by taking advantage of the knowledge that |
| + * mnt_change_mountpoint will not release the final reference |
| + * to a mountpoint. |
| + * |
| + * During mounting, the mount passed in as the parent mount will |
| + * continue to use the old mountpoint and during unmounting, the |
| + * old mountpoint will continue to exist until namespace_unlock, |
| + * which happens well after mnt_change_mountpoint. |
| + */ |
| + spin_lock(&old_mountpoint->d_lock); |
| + old_mountpoint->d_lockref.count--; |
| + spin_unlock(&old_mountpoint->d_lock); |
| + |
| + mnt_add_count(old_parent, -1); |
| } |
| |
| /* |
| * vfsmount lock must be held for write |
| */ |
| -static void commit_tree(struct mount *mnt, struct mount *shadows) |
| +static void commit_tree(struct mount *mnt) |
| { |
| struct mount *parent = mnt->mnt_parent; |
| struct mount *m; |
| @@ -923,7 +925,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) |
| |
| list_splice(&head, n->list.prev); |
| |
| - attach_shadowed(mnt, parent, shadows); |
| + __attach_mnt(mnt, parent); |
| touch_mnt_namespace(n); |
| } |
| |
| @@ -1718,7 +1720,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, |
| continue; |
| |
| for (s = r; s; s = next_mnt(s, r)) { |
| - struct mount *t = NULL; |
| if (!(flag & CL_COPY_UNBINDABLE) && |
| IS_MNT_UNBINDABLE(s)) { |
| s = skip_mnt_tree(s); |
| @@ -1740,14 +1741,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, |
| goto out; |
| lock_mount_hash(); |
| list_add_tail(&q->mnt_list, &res->mnt_list); |
| - mnt_set_mountpoint(parent, p->mnt_mp, q); |
| - if (!list_empty(&parent->mnt_mounts)) { |
| - t = list_last_entry(&parent->mnt_mounts, |
| - struct mount, mnt_child); |
| - if (t->mnt_mp != p->mnt_mp) |
| - t = NULL; |
| - } |
| - attach_shadowed(q, parent, t); |
| + attach_mnt(q, parent, p->mnt_mp); |
| unlock_mount_hash(); |
| } |
| } |
| @@ -1925,10 +1919,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, |
| struct path *parent_path) |
| { |
| HLIST_HEAD(tree_list); |
| + struct mountpoint *smp; |
| struct mount *child, *p; |
| struct hlist_node *n; |
| int err; |
| |
| + /* Preallocate a mountpoint in case the new mounts need |
| + * to be tucked under other mounts. |
| + */ |
| + smp = get_mountpoint(source_mnt->mnt.mnt_root); |
| + if (IS_ERR(smp)) |
| + return PTR_ERR(smp); |
| + |
| if (IS_MNT_SHARED(dest_mnt)) { |
| err = invent_group_ids(source_mnt, true); |
| if (err) |
| @@ -1948,16 +1950,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, |
| touch_mnt_namespace(source_mnt->mnt_ns); |
| } else { |
| mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); |
| - commit_tree(source_mnt, NULL); |
| + commit_tree(source_mnt); |
| } |
| |
| hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { |
| struct mount *q; |
| hlist_del_init(&child->mnt_hash); |
| - q = __lookup_mnt_last(&child->mnt_parent->mnt, |
| - child->mnt_mountpoint); |
| - commit_tree(child, q); |
| + q = __lookup_mnt(&child->mnt_parent->mnt, |
| + child->mnt_mountpoint); |
| + if (q) |
| + mnt_change_mountpoint(child, smp, q); |
| + commit_tree(child); |
| } |
| + put_mountpoint(smp); |
| unlock_mount_hash(); |
| |
| return 0; |
| @@ -1970,6 +1975,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, |
| unlock_mount_hash(); |
| cleanup_group_ids(source_mnt, NULL); |
| out: |
| + read_seqlock_excl(&mount_lock); |
| + put_mountpoint(smp); |
| + read_sequnlock_excl(&mount_lock); |
| + |
| return err; |
| } |
| |
| diff --git a/fs/pnode.c b/fs/pnode.c |
| index 99899705b105..b9f2af59b9a6 100644 |
| --- a/fs/pnode.c |
| +++ b/fs/pnode.c |
| @@ -324,6 +324,21 @@ out: |
| return ret; |
| } |
| |
| +static struct mount *find_topper(struct mount *mnt) |
| +{ |
| + /* If there is exactly one mount covering mnt completely return it. */ |
| + struct mount *child; |
| + |
| + if (!list_is_singular(&mnt->mnt_mounts)) |
| + return NULL; |
| + |
| + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); |
| + if (child->mnt_mountpoint != mnt->mnt.mnt_root) |
| + return NULL; |
| + |
| + return child; |
| +} |
| + |
| /* |
| * return true if the refcount is greater than count |
| */ |
| @@ -344,9 +359,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) |
| */ |
| int propagate_mount_busy(struct mount *mnt, int refcnt) |
| { |
| - struct mount *m, *child; |
| + struct mount *m, *child, *topper; |
| struct mount *parent = mnt->mnt_parent; |
| - int ret = 0; |
| |
| if (mnt == parent) |
| return do_refcount_check(mnt, refcnt); |
| @@ -361,12 +375,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) |
| |
| for (m = propagation_next(parent, parent); m; |
| m = propagation_next(m, parent)) { |
| - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); |
| - if (child && list_empty(&child->mnt_mounts) && |
| - (ret = do_refcount_check(child, 1))) |
| - break; |
| + int count = 1; |
| + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); |
| + if (!child) |
| + continue; |
| + |
| + /* Is there exactly one mount on the child that covers |
| + * it completely whose reference should be ignored? |
| + */ |
| + topper = find_topper(child); |
| + if (topper) |
| + count += 1; |
| + else if (!list_empty(&child->mnt_mounts)) |
| + continue; |
| + |
| + if (do_refcount_check(child, count)) |
| + return 1; |
| } |
| - return ret; |
| + return 0; |
| } |
| |
| /* |
| @@ -383,7 +409,7 @@ void propagate_mount_unlock(struct mount *mnt) |
| |
| for (m = propagation_next(parent, parent); m; |
| m = propagation_next(m, parent)) { |
| - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); |
| + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); |
| if (child) |
| child->mnt.mnt_flags &= ~MNT_LOCKED; |
| } |
| @@ -401,9 +427,11 @@ static void mark_umount_candidates(struct mount *mnt) |
| |
| for (m = propagation_next(parent, parent); m; |
| m = propagation_next(m, parent)) { |
| - struct mount *child = __lookup_mnt_last(&m->mnt, |
| + struct mount *child = __lookup_mnt(&m->mnt, |
| mnt->mnt_mountpoint); |
| - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { |
| + if (!child || (child->mnt.mnt_flags & MNT_UMOUNT)) |
| + continue; |
| + if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) { |
| SET_MNT_MARK(child); |
| } |
| } |
| @@ -422,8 +450,8 @@ static void __propagate_umount(struct mount *mnt) |
| |
| for (m = propagation_next(parent, parent); m; |
| m = propagation_next(m, parent)) { |
| - |
| - struct mount *child = __lookup_mnt_last(&m->mnt, |
| + struct mount *topper; |
| + struct mount *child = __lookup_mnt(&m->mnt, |
| mnt->mnt_mountpoint); |
| /* |
| * umount the child only if the child has no children |
| @@ -432,6 +460,15 @@ static void __propagate_umount(struct mount *mnt) |
| if (!child || !IS_MNT_MARKED(child)) |
| continue; |
| CLEAR_MNT_MARK(child); |
| + |
| + /* If there is exactly one mount covering all of child |
| + * replace child with that mount. |
| + */ |
| + topper = find_topper(child); |
| + if (topper) |
| + mnt_change_mountpoint(child->mnt_parent, child->mnt_mp, |
| + topper); |
| + |
| if (list_empty(&child->mnt_mounts)) { |
| list_del_init(&child->mnt_child); |
| child->mnt.mnt_flags |= MNT_UMOUNT; |
| diff --git a/fs/pnode.h b/fs/pnode.h |
| index 0fcdbe7ca648..623f01772bec 100644 |
| --- a/fs/pnode.h |
| +++ b/fs/pnode.h |
| @@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root); |
| unsigned int mnt_get_count(struct mount *mnt); |
| void mnt_set_mountpoint(struct mount *, struct mountpoint *, |
| struct mount *); |
| +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, |
| + struct mount *mnt); |
| struct mount *copy_tree(struct mount *, struct dentry *, int); |
| bool is_path_reachable(struct mount *, struct dentry *, |
| const struct path *root); |
| -- |
| 2.12.2 |
| |
| From ba46d8fab00a8e1538df241681d9161c8ec85778 Mon Sep 17 00:00:00 2001 |
| From: Ilya Dryomov <idryomov@gmail.com> |
| Date: Tue, 21 Mar 2017 13:44:28 +0100 |
| Subject: [PATCH 225/251] libceph: force GFP_NOIO for socket allocations |
| |
| commit 633ee407b9d15a75ac9740ba9d3338815e1fcb95 upstream. |
| |
| sock_alloc_inode() allocates socket+inode and socket_wq with |
| GFP_KERNEL, which is not allowed on the writeback path: |
| |
| Workqueue: ceph-msgr con_work [libceph] |
| ffff8810871cb018 0000000000000046 0000000000000000 ffff881085d40000 |
| 0000000000012b00 ffff881025cad428 ffff8810871cbfd8 0000000000012b00 |
| ffff880102fc1000 ffff881085d40000 ffff8810871cb038 ffff8810871cb148 |
| Call Trace: |
| [<ffffffff816dd629>] schedule+0x29/0x70 |
| [<ffffffff816e066d>] schedule_timeout+0x1bd/0x200 |
| [<ffffffff81093ffc>] ? ttwu_do_wakeup+0x2c/0x120 |
| [<ffffffff81094266>] ? ttwu_do_activate.constprop.135+0x66/0x70 |
| [<ffffffff816deb5f>] wait_for_completion+0xbf/0x180 |
| [<ffffffff81097cd0>] ? try_to_wake_up+0x390/0x390 |
| [<ffffffff81086335>] flush_work+0x165/0x250 |
| [<ffffffff81082940>] ? worker_detach_from_pool+0xd0/0xd0 |
| [<ffffffffa03b65b1>] xlog_cil_force_lsn+0x81/0x200 [xfs] |
| [<ffffffff816d6b42>] ? __slab_free+0xee/0x234 |
| [<ffffffffa03b4b1d>] _xfs_log_force_lsn+0x4d/0x2c0 [xfs] |
| [<ffffffff811adc1e>] ? lookup_page_cgroup_used+0xe/0x30 |
| [<ffffffffa039a723>] ? xfs_reclaim_inode+0xa3/0x330 [xfs] |
| [<ffffffffa03b4dcf>] xfs_log_force_lsn+0x3f/0xf0 [xfs] |
| [<ffffffffa039a723>] ? xfs_reclaim_inode+0xa3/0x330 [xfs] |
| [<ffffffffa03a62c6>] xfs_iunpin_wait+0xc6/0x1a0 [xfs] |
| [<ffffffff810aa250>] ? wake_atomic_t_function+0x40/0x40 |
| [<ffffffffa039a723>] xfs_reclaim_inode+0xa3/0x330 [xfs] |
| [<ffffffffa039ac07>] xfs_reclaim_inodes_ag+0x257/0x3d0 [xfs] |
| [<ffffffffa039bb13>] xfs_reclaim_inodes_nr+0x33/0x40 [xfs] |
| [<ffffffffa03ab745>] xfs_fs_free_cached_objects+0x15/0x20 [xfs] |
| [<ffffffff811c0c18>] super_cache_scan+0x178/0x180 |
| [<ffffffff8115912e>] shrink_slab_node+0x14e/0x340 |
| [<ffffffff811afc3b>] ? mem_cgroup_iter+0x16b/0x450 |
| [<ffffffff8115af70>] shrink_slab+0x100/0x140 |
| [<ffffffff8115e425>] do_try_to_free_pages+0x335/0x490 |
| [<ffffffff8115e7f9>] try_to_free_pages+0xb9/0x1f0 |
| [<ffffffff816d56e4>] ? __alloc_pages_direct_compact+0x69/0x1be |
| [<ffffffff81150cba>] __alloc_pages_nodemask+0x69a/0xb40 |
| [<ffffffff8119743e>] alloc_pages_current+0x9e/0x110 |
| [<ffffffff811a0ac5>] new_slab+0x2c5/0x390 |
| [<ffffffff816d71c4>] __slab_alloc+0x33b/0x459 |
| [<ffffffff815b906d>] ? sock_alloc_inode+0x2d/0xd0 |
| [<ffffffff8164bda1>] ? inet_sendmsg+0x71/0xc0 |
| [<ffffffff815b906d>] ? sock_alloc_inode+0x2d/0xd0 |
| [<ffffffff811a21f2>] kmem_cache_alloc+0x1a2/0x1b0 |
| [<ffffffff815b906d>] sock_alloc_inode+0x2d/0xd0 |
| [<ffffffff811d8566>] alloc_inode+0x26/0xa0 |
| [<ffffffff811da04a>] new_inode_pseudo+0x1a/0x70 |
| [<ffffffff815b933e>] sock_alloc+0x1e/0x80 |
| [<ffffffff815ba855>] __sock_create+0x95/0x220 |
| [<ffffffff815baa04>] sock_create_kern+0x24/0x30 |
| [<ffffffffa04794d9>] con_work+0xef9/0x2050 [libceph] |
| [<ffffffffa04aa9ec>] ? rbd_img_request_submit+0x4c/0x60 [rbd] |
| [<ffffffff81084c19>] process_one_work+0x159/0x4f0 |
| [<ffffffff8108561b>] worker_thread+0x11b/0x530 |
| [<ffffffff81085500>] ? create_worker+0x1d0/0x1d0 |
| [<ffffffff8108b6f9>] kthread+0xc9/0xe0 |
| [<ffffffff8108b630>] ? flush_kthread_worker+0x90/0x90 |
| [<ffffffff816e1b98>] ret_from_fork+0x58/0x90 |
| [<ffffffff8108b630>] ? flush_kthread_worker+0x90/0x90 |
| |
| Use memalloc_noio_{save,restore}() to temporarily force GFP_NOIO here. |
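| |
| As a minimal sketch of the pattern (condensed from the hunk below): |
| memalloc_noio_save() marks the calling task so that every allocation |
| in the scope is implicitly degraded to GFP_NOIO, which also covers |
| allocations made by callees such as sock_alloc_inode() that take no |
| gfp flags: |
| |
|     unsigned int noio_flag; |
| |
|     /* sock_create_kern() and its callees allocate with GFP_KERNEL; |
|      * the noio scope keeps those allocations from recursing into |
|      * filesystem reclaim and deadlocking this workqueue worker. */ |
|     noio_flag = memalloc_noio_save(); |
|     ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, |
|                            SOCK_STREAM, IPPROTO_TCP, &sock); |
|     memalloc_noio_restore(noio_flag); |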
| |
| Link: http://tracker.ceph.com/issues/19309 |
| Reported-by: Sergey Jerusalimov <wintchester@gmail.com> |
| Signed-off-by: Ilya Dryomov <idryomov@gmail.com> |
| Reviewed-by: Jeff Layton <jlayton@redhat.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| net/ceph/messenger.c | 6 ++++++ |
| 1 file changed, 6 insertions(+) |
| |
| diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c |
| index b8d927c56494..a6b2f2138c9d 100644 |
| --- a/net/ceph/messenger.c |
| +++ b/net/ceph/messenger.c |
| @@ -7,6 +7,7 @@ |
| #include <linux/kthread.h> |
| #include <linux/net.h> |
| #include <linux/nsproxy.h> |
| +#include <linux/sched.h> |
| #include <linux/slab.h> |
| #include <linux/socket.h> |
| #include <linux/string.h> |
| @@ -478,11 +479,16 @@ static int ceph_tcp_connect(struct ceph_connection *con) |
| { |
| struct sockaddr_storage *paddr = &con->peer_addr.in_addr; |
| struct socket *sock; |
| + unsigned int noio_flag; |
| int ret; |
| |
| BUG_ON(con->sock); |
| + |
| + /* sock_create_kern() allocates with GFP_KERNEL */ |
| + noio_flag = memalloc_noio_save(); |
| ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, |
| SOCK_STREAM, IPPROTO_TCP, &sock); |
| + memalloc_noio_restore(noio_flag); |
| if (ret) |
| return ret; |
| sock->sk->sk_allocation = GFP_NOFS; |
| -- |
| 2.12.2 |
| |