From foo@baz Thu Feb 27 20:11:26 PST 2014
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 13 Feb 2014 11:42:05 +0200
Subject: vhost: fix ref cnt checking deadlock

From: "Michael S. Tsirkin" <mst@redhat.com>

[ Upstream commit 0ad8b480d6ee916aa84324f69acf690142aecd0e ]
vhost checked the counter within the refcnt before decrementing it: it
really wanted to know whether it holds the last reference, as a way to
batch freeing resources a bit more efficiently.

Note: we only let the refcount go to 0 on device release.
This works well, but we now access the ref counter twice, so there's a
race: all users might see a high count and decide to defer freeing
resources.
In the end no one initiates freeing resources until the last reference
is gone (which is on VM shutdown, so it might happen after a looooong
time).
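For illustration, the racy shape (condensed from the hunks below, not
verbatim driver code) was:

	int cnt = atomic_read(&ubufs->kref.refcount);	/* read... */
	vhost_net_ubuf_put(ubufs);			/* ...then decrement */
	/* Every in-flight caller can observe a stale cnt > 2 here and
	 * skip the wakeup, so no one queues the polling thread until
	 * the final reference is dropped on release.
	 */
	if (cnt <= 2 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);
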
Let's do what we probably should have done straight away:
switch from kref to a plain atomic, document the semantics,
return the refcount value atomically after the decrement,
then use that value to avoid the deadlock.
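With the put operation returning the post-decrement value in a single
atomic step (again condensed from the hunks below), each caller sees an
exact count, and exactly one caller observes each intermediate value:

	int cnt = vhost_net_ubuf_put(ubufs);	/* atomic_sub_return() inside */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);
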
Reported-by: Qin Chuanyu <qinchuanyu@huawei.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/vhost/net.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,7 +70,12 @@ enum {
 };

 struct vhost_net_ubuf_ref {
-	struct kref kref;
+	/* refcount follows semantics similar to kref:
+	 *  0: object is released
+	 *  1: no outstanding ubufs
+	 * >1: outstanding ubufs
+	 */
+	atomic_t refcount;
 	wait_queue_head_t wait;
 	struct vhost_virtqueue *vq;
 };
@@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int v
 	vhost_net_zcopy_mask |= 0x1 << vq;
 }

-static void vhost_net_zerocopy_done_signal(struct kref *kref)
-{
-	struct vhost_net_ubuf_ref *ubufs;
-
-	ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
-	wake_up(&ubufs->wait);
-}
-
 static struct vhost_net_ubuf_ref *
 vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 {
@@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqu
 	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
 	if (!ubufs)
 		return ERR_PTR(-ENOMEM);
-	kref_init(&ubufs->kref);
+	atomic_set(&ubufs->refcount, 1);
 	init_waitqueue_head(&ubufs->wait);
 	ubufs->vq = vq;
 	return ubufs;
 }

-static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
+static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
+	int r = atomic_sub_return(1, &ubufs->refcount);
+	if (unlikely(!r))
+		wake_up(&ubufs->wait);
+	return r;
 }

 static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
-	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	vhost_net_ubuf_put(ubufs);
+	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
 }

 static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
@@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(stru
 {
 	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
 	struct vhost_virtqueue *vq = ubufs->vq;
-	int cnt = atomic_read(&ubufs->kref.refcount);
+	int cnt;

 	/* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
-	vhost_net_ubuf_put(ubufs);
+	cnt = vhost_net_ubuf_put(ubufs);

 	/*
 	 * Trigger polling thread if guest stopped submitting new buffers:
-	 * in this case, the refcount after decrement will eventually reach 1
-	 * so here it is 2.
+	 * in this case, the refcount after decrement will eventually reach 1.
 	 * We also trigger polling periodically after each 16 packets
 	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
 	 * less than 10% of times).
 	 */
-	if (cnt <= 2 || !(cnt % 16))
+	if (cnt <= 1 || !(cnt % 16))
 		vhost_poll_queue(&vq->poll);
 }

@@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *
 			msg.msg_control = ubuf;
 			msg.msg_controllen = sizeof(ubuf);
 			ubufs = nvq->ubufs;
-			kref_get(&ubufs->kref);
+			atomic_inc(&ubufs->refcount);
 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		} else {
 			msg.msg_control = NULL;
@@ -785,7 +784,7 @@ static void vhost_net_flush(struct vhost
 		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
 		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		n->tx_flush = false;
-		kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
 		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 	}
 }