From c23f3445e68e1db0e74099f264bc5ff5d55ebdeb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Jun 2010 20:40:00 +0200
Subject: vhost: replace vhost_workqueue with per-vhost kthread

Replace vhost_workqueue with per-vhost kthread.  Other than callback
argument change from struct work_struct * to struct vhost_work *,
there's no visible change to vhost_poll_*() interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

* Updated to use sub structure vhost_work instead of directly using
  vhost_poll at Michael's suggestion.

* Added flusher wake_up() optimization at Michael's suggestion.

Changes by MST:
* Converted atomics/barrier use to a spinlock.
* Create thread on SET_OWNER
* Fix flushing

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
---
 drivers/vhost/net.c | 56 +++++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

(limited to 'drivers/vhost/net.c')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f11e6bb5b036..d395b59289ae 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -302,54 +302,58 @@ static void handle_rx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }
 
-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }
 
-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_TX].work);
 	handle_tx(net);
 }
 
-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_RX].work);
 	handle_rx(net);
 }
 
 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
 
 	f->private_data = n;
@@ -656,25 +660,13 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
 
 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);
 
-- 
cgit 


From 8dd014adfea6f173c1ef6378f7e5e7924866c923 Mon Sep 17 00:00:00 2001
From: David Stevens <dlstevens@us.ibm.com>
Date: Tue, 27 Jul 2010 18:52:21 +0300
Subject: vhost-net: mergeable buffers support

This adds support for mergeable buffers in vhost-net: this is needed
for older guests without indirect buffer support, as well
as for zero copy with some devices.

Includes changes by Michael S. Tsirkin to make the
patch as low risk as possible (i.e., close to no changes
when feature is disabled).

Signed-off-by: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 228 insertions(+), 9 deletions(-)

(limited to 'drivers/vhost/net.c')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d395b59289ae..f13e56babe4b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 	}
 	return seg;
 }
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+			   size_t len, int iovcount)
+{
+	int seg = 0;
+	size_t size;
+	while (len && seg < iovcount) {
+		size = min(from->iov_len, len);
+		to->iov_base = from->iov_base;
+		to->iov_len = size;
+		len -= size;
+		++from;
+		++to;
+		++seg;
+	}
+}
 
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
+static int peek_head_len(struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len;
+	release_sock(sk);
+	return len;
+}
+
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ *	returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+		       struct vring_used_elem *heads,
+		       int datalen,
+		       unsigned *iovcount,
+		       struct vhost_log *log,
+		       unsigned *log_num)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+
+	while (datalen > 0) {
+		if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = d;
+		heads[headcount].len = iov_length(vq->iov + seg, in);
+		datalen -= heads[headcount].len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len += datalen;
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx(struct vhost_net *net)
+static void handle_rx_big(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 	unsigned out, in, log, s;
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_debug("Discarded truncated rx packet: "
 				 " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			continue;
 		}
 		len = err;
@@ -302,6 +391,123 @@ static void handle_rx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_rx_mergeable(struct vhost_net *net)
+{
+	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	unsigned uninitialized_var(in), log;
+	struct vhost_log *vq_log;
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
+		.msg_controllen = 0,
+		.msg_iov = vq->iov,
+		.msg_flags = MSG_DONTWAIT,
+	};
+
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	size_t total_len = 0;
+	int err, headcount;
+	size_t vhost_hlen, sock_hlen;
+	size_t vhost_len, sock_len;
+	struct socket *sock = rcu_dereference(vq->private_data);
+	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+		return;
+
+	use_mm(net->dev.mm);
+	mutex_lock(&vq->mutex);
+	vhost_disable_notify(vq);
+	vhost_hlen = vq->vhost_hlen;
+	sock_hlen = vq->sock_hlen;
+
+	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+		vq->log : NULL;
+
+	while ((sock_len = peek_head_len(sock->sk))) {
+		sock_len += sock_hlen;
+		vhost_len = sock_len + vhost_hlen;
+		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+					&in, vq_log, &log);
+		/* On error, stop handling until the next kick. */
+		if (unlikely(headcount < 0))
+			break;
+		/* OK, now we need to know about added descriptors. */
+		if (!headcount) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				/* They have slipped one in as we were
+				 * doing that: check again. */
+				vhost_disable_notify(vq);
+				continue;
+			}
+			/* Nothing new?  Wait for eventfd to tell us
+			 * they refilled. */
+			break;
+		}
+		/* We don't need to be notified again. */
+		if (unlikely((vhost_hlen)))
+			/* Skip header. TODO: support TSO. */
+			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+		else
+			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
+			 * needed because sendmsg can modify msg_iov. */
+			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+		msg.msg_iovlen = in;
+		err = sock->ops->recvmsg(NULL, sock, &msg,
+					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
+		/* Userspace might have consumed the packet meanwhile:
+		 * it's not supposed to do this usually, but might be hard
+		 * to prevent. Discard data we got (if any) and keep going. */
+		if (unlikely(err != sock_len)) {
+			pr_debug("Discarded rx packet: "
+				 " len %d, expected %zd\n", err, sock_len);
+			vhost_discard_vq_desc(vq, headcount);
+			continue;
+		}
+		if (unlikely(vhost_hlen) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+				      vhost_hlen)) {
+			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+			       vq->iov->iov_base);
+			break;
+		}
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof hdr.num_buffers)) {
+			vq_err(vq, "Failed num_buffers write");
+			vhost_discard_vq_desc(vq, headcount);
+			break;
+		}
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
+		if (unlikely(vq_log))
+			vhost_log_write(vq, vq_log, log, vhost_len);
+		total_len += vhost_len;
+		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+
+	mutex_unlock(&vq->mutex);
+	unuse_mm(net->dev.mm);
+}
+
+static void handle_rx(struct vhost_net *net)
+{
+	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
+		handle_rx_mergeable(net);
+	else
+		handle_rx_big(net);
+}
+
 static void handle_tx_kick(struct vhost_work *work)
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -577,9 +783,21 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
+	size_t vhost_hlen, sock_hlen, hdr_len;
 	int i;
+
+	hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+		/* vhost provides vnet_hdr */
+		vhost_hlen = hdr_len;
+		sock_hlen = 0;
+	} else {
+		/* socket provides vnet_hdr */
+		vhost_hlen = 0;
+		sock_hlen = hdr_len;
+	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -590,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		n->vqs[i].vhost_hlen = vhost_hlen;
+		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
-- 
cgit