summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/netlink/specs/netdev.yaml12
-rw-r--r--Documentation/networking/devmem.rst150
-rw-r--r--Documentation/networking/net_cachelines/net_device.rst1
-rw-r--r--Documentation/networking/netdev-features.rst5
-rw-r--r--Documentation/networking/netmem.rst23
-rw-r--r--drivers/net/ethernet/google/gve/gve_main.c3
-rw-r--r--drivers/net/ethernet/google/gve/gve_tx_dqo.c8
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/linux/skbuff.h17
-rw-r--r--include/linux/skbuff_ref.h4
-rw-r--r--include/net/netmem.h34
-rw-r--r--include/net/sock.h1
-rw-r--r--include/uapi/linux/netdev.h1
-rw-r--r--io_uring/zcrx.c3
-rw-r--r--net/core/datagram.c48
-rw-r--r--net/core/dev.c34
-rw-r--r--net/core/devmem.c131
-rw-r--r--net/core/devmem.h83
-rw-r--r--net/core/netdev-genl-gen.c13
-rw-r--r--net/core/netdev-genl-gen.h1
-rw-r--r--net/core/netdev-genl.c80
-rw-r--r--net/core/skbuff.c48
-rw-r--r--net/core/sock.c5
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/tcp.c48
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/vmw_vsock/virtio_transport_common.c5
-rw-r--r--tools/include/uapi/linux/netdev.h1
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/devmem.py26
-rw-r--r--tools/testing/selftests/drivers/net/hw/ncdevmem.c300
30 files changed, 1007 insertions, 86 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index f5e0750ab71d..c0ef6d0d7786 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -743,6 +743,18 @@ operations:
- defer-hard-irqs
- gro-flush-timeout
- irq-suspend-timeout
+ -
+ name: bind-tx
+ doc: Bind dmabuf to netdev for TX
+ attribute-set: dmabuf
+ do:
+ request:
+ attributes:
+ - ifindex
+ - fd
+ reply:
+ attributes:
+ - id
kernel-family:
headers: [ "net/netdev_netlink.h"]
diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst
index eb678ca45496..a6cd7236bfbd 100644
--- a/Documentation/networking/devmem.rst
+++ b/Documentation/networking/devmem.rst
@@ -62,15 +62,15 @@ More Info
https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/
-Interface
-=========
+RX Interface
+============
Example
-------
-tools/testing/selftests/net/ncdevmem.c:do_server shows an example of setting up
-the RX path of this API.
+./tools/testing/selftests/drivers/net/hw/ncdevmem:do_server shows an example of
+setting up the RX path of this API.
NIC Setup
@@ -235,6 +235,148 @@ can be less than the tokens provided by the user in case of:
(a) an internal kernel leak bug.
(b) the user passed more than 1024 frags.
+TX Interface
+============
+
+
+Example
+-------
+
+./tools/testing/selftests/drivers/net/hw/ncdevmem:do_client shows an example of
+setting up the TX path of this API.
+
+
+NIC Setup
+---------
+
+The user must bind a TX dmabuf to a given NIC using the netlink API::
+
+ struct netdev_bind_tx_req *req = NULL;
+ struct netdev_bind_tx_rsp *rsp = NULL;
+ struct ynl_error yerr;
+
+ *ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+
+ req = netdev_bind_tx_req_alloc();
+ netdev_bind_tx_req_set_ifindex(req, ifindex);
+ netdev_bind_tx_req_set_fd(req, dmabuf_fd);
+
+ rsp = netdev_bind_tx(*ys, req);
+
+ tx_dmabuf_id = rsp->id;
+
+
+The netlink API returns a dmabuf_id: a unique ID that refers to this dmabuf
+that has been bound.
+
+The user can unbind the dmabuf from the netdevice by closing the netlink socket
+that established the binding. We do this so that the binding is automatically
+unbound even if the userspace process crashes.
+
+Note that any reasonably well-behaved dmabuf from any exporter should work with
+devmem TCP, even if the dmabuf is not actually backed by devmem. An example of
+this is udmabuf, which wraps user memory (non-devmem) in a dmabuf.
+
+Socket Setup
+------------
+
+The user application must use MSG_ZEROCOPY flag when sending devmem TCP. Devmem
+cannot be copied by the kernel, so the semantics of the devmem TX are similar
+to the semantics of MSG_ZEROCOPY::
+
+ setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt));
+
+It is also recommended that the user binds the TX socket to the same interface
+the dma-buf has been bound to via SO_BINDTODEVICE::
+
+ setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname) + 1);
+
+
+Sending data
+------------
+
+Devmem data is sent using the SCM_DEVMEM_DMABUF cmsg.
+
+The user should create a msghdr where,
+
+* iov_base is set to the offset into the dmabuf to start sending from
+* iov_len is set to the number of bytes to be sent from the dmabuf
+
+The user passes the dma-buf id to send from via the dmabuf_tx_cmsg.dmabuf_id.
+
+The example below sends 1024 bytes from offset 100 into the dmabuf, and 2048
+from offset 2000 into the dmabuf. The dmabuf to send from is tx_dmabuf_id::
+
+ char ctrl_data[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))];
+ struct dmabuf_tx_cmsg ddmabuf;
+ struct msghdr msg = {};
+ struct cmsghdr *cmsg;
+ struct iovec iov[2];
+
+ iov[0].iov_base = (void*)100;
+ iov[0].iov_len = 1024;
+ iov[1].iov_base = (void*)2000;
+ iov[1].iov_len = 2048;
+
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 2;
+
+ msg.msg_control = ctrl_data;
+ msg.msg_controllen = sizeof(ctrl_data);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_DEVMEM_DMABUF;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct dmabuf_tx_cmsg));
+
+ ddmabuf.dmabuf_id = tx_dmabuf_id;
+
+ *((struct dmabuf_tx_cmsg *)CMSG_DATA(cmsg)) = ddmabuf;
+
+ sendmsg(socket_fd, &msg, MSG_ZEROCOPY);
+
+
+Reusing TX dmabufs
+------------------
+
+Similar to MSG_ZEROCOPY with regular memory, the user should not modify the
+contents of the dma-buf while a send operation is in progress. This is because
+the kernel does not keep a copy of the dmabuf contents. Instead, the kernel
+will pin and send data from the buffer available to the userspace.
+
+Just as in MSG_ZEROCOPY, the kernel notifies the userspace of send completions
+using MSG_ERRQUEUE::
+
+ int64_t tstop = gettimeofday_ms() + waittime_ms;
+ char control[CMSG_SPACE(100)] = {};
+ struct sock_extended_err *serr;
+ struct msghdr msg = {};
+ struct cmsghdr *cm;
+ int retries = 10;
+ __u32 hi, lo;
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ while (gettimeofday_ms() < tstop) {
+ if (!do_poll(fd)) continue;
+
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+
+ for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+ serr = (void *)CMSG_DATA(cm);
+
+ hi = serr->ee_data;
+ lo = serr->ee_info;
+
+ fprintf(stdout, "tx complete [%d,%d]\n", lo, hi);
+ }
+ }
+
+After the associated sendmsg has been completed, the dmabuf can be reused by
+the userspace.
+
+
Implementation & Caveats
========================
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index ca8605eb82ff..c69cc89c958e 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -10,6 +10,7 @@ Type Name fastpath_tx_acce
=================================== =========================== =================== =================== ===================================================================================
unsigned_long:32 priv_flags read_mostly __dev_queue_xmit(tx)
unsigned_long:1 lltx read_mostly HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx)
+unsigned long:1 netmem_tx:1; read_mostly
char name[16]
struct netdev_name_node* name_node
struct dev_ifalias* ifalias
diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst
index 5014f7cc1398..02bd7536fc0c 100644
--- a/Documentation/networking/netdev-features.rst
+++ b/Documentation/networking/netdev-features.rst
@@ -188,3 +188,8 @@ Redundancy) frames from one port to another in hardware.
This should be set for devices which duplicate outgoing HSR (High-availability
Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically
frames in hardware.
+
+* netmem-tx
+
+This should be set for devices which support netmem TX. See
+Documentation/networking/netmem.rst
diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst
index 7de21ddb5412..b63aded46337 100644
--- a/Documentation/networking/netmem.rst
+++ b/Documentation/networking/netmem.rst
@@ -19,8 +19,8 @@ Benefits of Netmem :
* Simplified Development: Drivers interact with a consistent API,
regardless of the underlying memory implementation.
-Driver Requirements
-===================
+Driver RX Requirements
+======================
1. The driver must support page_pool.
@@ -77,3 +77,22 @@ Driver Requirements
that purpose, but be mindful that some netmem types might have longer
circulation times, such as when userspace holds a reference in zerocopy
scenarios.
+
+Driver TX Requirements
+======================
+
+1. The Driver must not pass the netmem dma_addr to any of the dma-mapping APIs
+ directly. This is because netmem dma_addrs may come from a source like
+ dma-buf that is not compatible with the dma-mapping APIs.
+
+ Helpers like netmem_dma_unmap_page_attrs() & netmem_dma_unmap_addr_set()
+ should be used in lieu of dma_unmap_page[_attrs](), dma_unmap_addr_set().
+ The netmem variants will handle netmem dma_addrs correctly regardless of the
+ source, delegating to the dma-mapping APIs when appropriate.
+
+ Not all dma-mapping APIs have netmem equivalents at the moment. If your
+ driver relies on a missing netmem API, feel free to add and propose to
+ netdev@, or reach out to the maintainers and/or almasrymina@google.com for
+ help adding the netmem API.
+
+2. Driver should declare support by setting `netdev->netmem_tx = true`
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 446e4b6fd3f1..e1ffbd561fac 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -2659,6 +2659,9 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto abort_with_wq;
+ if (!gve_is_gqi(priv) && !gve_is_qpl(priv))
+ dev->netmem_tx = true;
+
err = register_netdev(dev);
if (err)
goto abort_with_gve_init;
diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
index 2eba868d8037..a27f1574a733 100644
--- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
@@ -660,7 +660,8 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
goto err;
dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
- dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
+ netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt,
+ dma[pkt->num_bufs], addr);
++pkt->num_bufs;
gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr,
@@ -1038,8 +1039,9 @@ static void gve_unmap_packet(struct device *dev,
dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
for (i = 1; i < pkt->num_bufs; i++) {
- dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
- dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
+ netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]),
+ dma_unmap_len(pkt, len[i]),
+ DMA_TO_DEVICE, 0);
}
pkt->num_bufs = 0;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 773167508c82..32a1e41636a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1772,6 +1772,7 @@ enum netdev_reg_state {
* @lltx: device supports lockless Tx. Deprecated for real HW
* drivers. Mainly used by logical interfaces, such as
* bonding and tunnels
+ * @netmem_tx: device support netmem_tx.
*
* @name: This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
@@ -2087,6 +2088,7 @@ struct net_device {
struct_group(priv_flags_fast,
unsigned long priv_flags:32;
unsigned long lltx:1;
+ unsigned long netmem_tx:1;
);
const struct net_device_ops *netdev_ops;
const struct header_ops *header_ops;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f3e72be6f634..c7397b17bb08 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg);
+ struct ubuf_info *uarg, bool devmem);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
+struct net_devmem_dmabuf_binding;
+
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
- size_t length);
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding);
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
struct msghdr *msg, int len)
{
- return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+ return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+ NULL);
}
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg);
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding);
/* Internal */
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
size_t offset, size_t size,
enum dma_data_direction dir)
{
+ if (skb_frag_is_net_iov(frag)) {
+ return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+ frag->offset;
+ }
return dma_map_page(dev, skb_frag_page(frag),
skb_frag_off(frag) + offset, size, dir);
}
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 0f3c58007488..9e49372ef1a0 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -17,7 +17,7 @@
*/
static inline void __skb_frag_ref(skb_frag_t *frag)
{
- get_page(skb_frag_page(frag));
+ get_netmem(skb_frag_netmem(frag));
}
/**
@@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle)
if (recycle && napi_pp_put_page(netmem))
return;
#endif
- put_page(netmem_to_page(netmem));
+ put_netmem(netmem);
}
/**
diff --git a/include/net/netmem.h b/include/net/netmem.h
index c61d5b21e7b4..386164fb9c18 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -8,6 +8,7 @@
#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H
+#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <net/net_debug.h>
@@ -20,8 +21,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
*/
#define NET_IOV 0x01UL
+enum net_iov_type {
+ NET_IOV_DMABUF,
+ NET_IOV_IOURING,
+
+ /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass.
+ */
+ NET_IOV_MAX = ULONG_MAX
+};
+
struct net_iov {
- unsigned long __unused_padding;
+ enum net_iov_type type;
unsigned long pp_magic;
struct page_pool *pp;
struct net_iov_area *owner;
@@ -264,4 +274,26 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
return __netmem_clear_lsb(netmem)->dma_addr;
}
+void get_netmem(netmem_ref netmem);
+void put_netmem(netmem_ref netmem);
+
+#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \
+ do { \
+ if (!netmem_is_net_iov(NETMEM)) \
+ dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \
+ else \
+ dma_unmap_addr_set(PTR, ADDR_NAME, 0); \
+ } while (0)
+
+static inline void netmem_dma_unmap_page_attrs(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ if (!addr)
+ return;
+
+ dma_unmap_page_attrs(dev, addr, size, dir, attrs);
+}
+
#endif /* _NET_NETMEM_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index f0fabb9fd28a..3e15d7105ad2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
u32 tsflags;
u32 ts_opt_id;
u32 priority;
+ u32 dmabuf_id;
};
static inline void sockcm_init(struct sockcm_cookie *sockc,
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7600bf62dbdf..7eb9571786b8 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -219,6 +219,7 @@ enum {
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
+ NETDEV_CMD_BIND_TX,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index fe86606b9f30..0c64129f5d50 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -259,6 +259,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
niov->owner = &area->nia;
area->freelist[i] = i;
atomic_set(&area->user_refs[i], 0);
+ niov->type = NET_IOV_IOURING;
}
area->free_count = nr_iovs;
@@ -809,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
return io_zcrx_copy_frag(req, ifq, frag, off, len);
niov = netmem_to_net_iov(frag->netmem);
- if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+ if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
io_pp_to_ifq(niov->pp) != ifq)
return -EFAULT;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f0634f0cb834..9ef5442536f5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -63,6 +63,8 @@
#include <net/busy_poll.h>
#include <crypto/hash.h>
+#include "devmem.h"
+
/*
* Is a socket 'connection oriented' ?
*/
@@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
return 0;
}
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
+ }
+
+ return 0;
+}
+
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
- size_t length)
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
{
unsigned long orig_size = skb->truesize;
unsigned long truesize;
@@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
if (msg && msg->msg_ubuf && msg->sg_from_iter)
ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
else
ret = zerocopy_fill_skb_from_iter(skb, from, length);
@@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
diff --git a/net/core/dev.c b/net/core/dev.c
index c9013632296f..33d5e95209cb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3896,12 +3896,42 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
- if (!skb_frags_readable(skb))
- goto out_kfree_skb;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 6e27a47d0493..88065e5ef651 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -16,6 +16,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
#include <trace/events/page_pool.h>
#include "devmem.h"
@@ -30,7 +31,7 @@ static const struct memory_provider_ops dmabuf_devmem_ops;
bool net_is_devmem_iov(struct net_iov *niov)
{
- return niov->pp->mp_ops == &dmabuf_devmem_ops;
+ return niov->type == NET_IOV_DMABUF;
}
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
@@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
size_t size, avail;
gen_pool_for_each_chunk(binding->chunk_pool,
@@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
kfree(binding);
}
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
unsigned long xa_idx;
unsigned int rxq_idx;
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
if (binding->list.next)
list_del(&binding->list);
@@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
net_devmem_dmabuf_binding_put(binding);
}
@@ -166,8 +176,9 @@ err_close_rxq:
}
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
- struct netlink_ext_ack *extack)
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
{
struct net_devmem_dmabuf_binding *binding;
static u32 id_alloc_next;
@@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
}
binding->dev = dev;
-
- err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
- binding, xa_limit_32b, &id_alloc_next,
- GFP_KERNEL);
- if (err < 0)
- goto err_free_binding;
-
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
refcount_set(&binding->ref, 1);
@@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_id;
+ goto err_free_binding;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
- DMA_FROM_DEVICE);
+ direction);
if (IS_ERR(binding->sgt)) {
err = PTR_ERR(binding->sgt);
NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
goto err_detach;
}
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
/* For simplicity we expect to make PAGE_SIZE allocations, but the
* binding can be much more flexible than that. We may be able to
* allocate MTU sized chunks here. Leave that for future work...
*/
- binding->chunk_pool =
- gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
if (!binding->chunk_pool) {
err = -ENOMEM;
- goto err_unmap;
+ goto err_tx_vec;
}
virtual = 0;
@@ -266,27 +280,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
for (i = 0; i < owner->area.num_niovs; i++) {
niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
niov->owner = &owner->area;
page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
}
virtual += len;
}
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
return binding;
err_free_chunks:
gen_pool_for_each_chunk(binding->chunk_pool,
net_devmem_dmabuf_free_chunk_owner, NULL);
gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
err_unmap:
dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
DMA_FROM_DEVICE);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
-err_free_id:
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
kfree(binding);
err_put_dmabuf:
@@ -294,6 +317,74 @@ err_put_dmabuf:
return ERR_PTR(err);
}
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
+
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ /* The dma-addrs in this binding are only reachable to the corresponding
+ * net_device.
+ */
+ if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+ err = -ENODEV;
+ goto out_err;
+ }
+
+ return binding;
+
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
/*** "Dmabuf devmem memory provider" ***/
int mp_dmabuf_devmem_init(struct page_pool *pool)
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 7fc158d52729..919e6ed28fdc 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -23,12 +23,20 @@ struct net_devmem_dmabuf_binding {
/* The user holds a ref (via the netlink API) for as long as they want
* the binding to remain alive. Each page pool using this binding holds
- * a ref to keep the binding alive. Each allocated net_iov holds a
- * ref.
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
*
* The binding undos itself and unmaps the underlying dmabuf once all
* those refs are dropped and the binding is no longer desired or in
* use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
*/
refcount_t ref;
@@ -44,6 +52,14 @@ struct net_devmem_dmabuf_binding {
* active.
*/
u32 id;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ struct work_struct unbind_w;
};
#if defined(CONFIG_NET_DEVMEM)
@@ -60,14 +76,17 @@ struct dmabuf_genpool_chunk_owner {
dma_addr_t base_dma_addr;
};
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
- struct netlink_ext_ack *extack);
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *binding,
struct netlink_ext_ack *extack);
+void net_devmem_bind_tx_release(struct sock *sk);
static inline struct dmabuf_genpool_chunk_owner *
net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
@@ -96,10 +115,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
}
-static inline void
+static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- refcount_inc(&binding->ref);
+ return refcount_inc_not_zero(&binding->ref);
}
static inline void
@@ -108,30 +127,57 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
if (!refcount_dec_and_test(&binding->ref))
return;
- __net_devmem_dmabuf_binding_free(binding);
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
}
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
#else
struct net_devmem_dmabuf_binding;
static inline void
-__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
}
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+ enum dma_data_direction direction,
struct netlink_ext_ack *extack)
{
return ERR_PTR(-EOPNOTSUPP);
}
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
static inline void
net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
@@ -170,6 +216,25 @@ static inline bool net_is_devmem_iov(struct net_iov *niov)
{
return false;
}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
#endif
#endif /* _NET_DEVMEM_H */
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 739f7b6506a6..4fc44587f493 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE
[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
};
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 17d39fd64c94..cf3fad74511f 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 17aa9e42e8d5..cfcf1654d4fb 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -907,7 +907,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+ binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+ info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock;
@@ -968,6 +969,83 @@ err_genlmsg_free:
return err;
}
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd,
+ info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ list_add(&binding->list, &priv->bindings);
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
INIT_LIST_HEAD(&priv->bindings);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d73ad79fe739..4159107f1666 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -89,6 +89,7 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"
@@ -1654,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1669,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
@@ -1692,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1722,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1737,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1841,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
int err, orig_len = skb->len;
@@ -1860,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return -EEXIST;
}
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -7313,3 +7318,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
diff --git a/net/core/sock.c b/net/core/sock.c
index b64df2463300..347ce75482f5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3018,6 +3018,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
return -EPERM;
sockc->priority = *(u32 *)CMSG_DATA(cmsg);
break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6e18d7ec5062..a2705d454fd6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk,
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 86c427f16636..0ae265d39184 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
+ struct net_devmem_dmabuf_binding *binding = NULL;
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
@@ -1066,11 +1067,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
+ bool sockc_valid = true;
int zc = 0;
long timeo;
flags = msg->msg_flags;
+ sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err))
+ /* Don't return error until MSG_FASTOPEN has been
+ * processed; that may succeed even if the cmsg is
+ * invalid.
+ */
+ sockc_valid = false;
+ }
+
if ((flags & MSG_ZEROCOPY) && size) {
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
@@ -1078,7 +1091,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
skb = tcp_write_queue_tail(sk);
- uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+ sockc_valid && !!sockc.dmabuf_id);
if (!uarg) {
err = -ENOBUFS;
goto out_err;
@@ -1087,12 +1101,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
else
uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ if (sockc_valid && sockc.dmabuf_id) {
+ binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ binding = NULL;
+ goto out_err;
+ }
+ }
}
} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
if (sk->sk_route_caps & NETIF_F_SG)
zc = MSG_SPLICE_PAGES;
}
+ if (sockc_valid && sockc.dmabuf_id &&
+ (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
if (unlikely(flags & MSG_FASTOPEN ||
inet_test_bit(DEFER_CONNECT, sk)) &&
!tp->repair) {
@@ -1131,13 +1160,10 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)};
- if (msg->msg_controllen) {
- err = sock_cmsg_send(sk, msg, &sockc);
- if (unlikely(err)) {
+ if (!sockc_valid) {
+ if (!err)
err = -EINVAL;
- goto out_err;
- }
+ goto out_err;
}
/* This should be in poll */
@@ -1258,7 +1284,8 @@ new_segment:
goto wait_for_space;
}
- err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+ binding);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
goto new_segment;
@@ -1339,6 +1366,8 @@ out_nopush:
/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
if (uarg && !msg->msg_ubuf)
net_zcopy_put(uarg);
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
return copied + copied_syn;
do_error:
@@ -1356,6 +1385,9 @@ out_err:
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ef052ccd9380..7bd29a9ff0db 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1524,7 +1524,8 @@ emsgsize:
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d88096..6e7b727c781c 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
uarg = msg_zerocopy_realloc(sk_vsock(vsk),
iter->count,
- NULL);
+ NULL, false);
if (!uarg)
return -1;
@@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
{
if (zcopy)
return __zerocopy_sg_from_iter(info->msg, NULL, skb,
- &info->msg->msg_iter,
- len);
+ &info->msg->msg_iter, len, NULL);
return memcpy_from_msg(skb_put(skb, len), info->msg, len);
}
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7600bf62dbdf..7eb9571786b8 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -219,6 +219,7 @@ enum {
NETDEV_CMD_QSTATS_GET,
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
+ NETDEV_CMD_BIND_TX,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py
index 3947e9157115..7fc686cf47a2 100755
--- a/tools/testing/selftests/drivers/net/hw/devmem.py
+++ b/tools/testing/selftests/drivers/net/hw/devmem.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
+from os import path
from lib.py import ksft_run, ksft_exit
from lib.py import ksft_eq, KsftSkipEx
from lib.py import NetDrvEpEnv
@@ -10,8 +11,7 @@ from lib.py import ksft_disruptive
def require_devmem(cfg):
if not hasattr(cfg, "_devmem_probed"):
- port = rand_port()
- probe_command = f"./ncdevmem -f {cfg.ifname}"
+ probe_command = f"{cfg.bin_local} -f {cfg.ifname}"
cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0
cfg._devmem_probed = True
@@ -25,7 +25,7 @@ def check_rx(cfg) -> None:
require_devmem(cfg)
port = rand_port()
- listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}"
+ listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}"
with bkg(listen_cmd) as socat:
wait_port_listen(port)
@@ -34,9 +34,27 @@ def check_rx(cfg) -> None:
ksft_eq(socat.stdout.strip(), "hello\nworld")
+@ksft_disruptive
+def check_tx(cfg) -> None:
+ cfg.require_ipver("6")
+ require_devmem(cfg)
+
+ port = rand_port()
+ listen_cmd = f"socat -U - TCP6-LISTEN:{port}"
+
+ with bkg(listen_cmd, exit_wait=True) as socat:
+ wait_port_listen(port)
+ cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_remote} -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}", host=cfg.remote, shell=True)
+
+ ksft_eq(socat.stdout.strip(), "hello\nworld")
+
+
def main() -> None:
with NetDrvEpEnv(__file__) as cfg:
- ksft_run([check_rx],
+ cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem")
+ cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+
+ ksft_run([check_rx, check_tx],
args=(cfg, ))
ksft_exit()
diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
index 2bf14ac2b8c6..f801a1b3545f 100644
--- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -9,22 +9,31 @@
* ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201
*
* On client:
- * echo -n "hello\nworld" | nc -s <server IP> 5201 -p 5201
+ * echo -n "hello\nworld" | \
+ * ncdevmem -s <server IP> [-c <client IP>] -p 5201 -f eth1
*
- * Test data validation:
+ * Note this is compatible with regular netcat. i.e. the sender or receiver can
+ * be replaced with regular netcat to test the RX or TX path in isolation.
+ *
+ * Test data validation (devmem TCP on RX only):
*
* On server:
* ncdevmem -s <server IP> [-c <client IP>] -f eth1 -l -p 5201 -v 7
*
* On client:
* yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
- * tr \\n \\0 | \
- * head -c 5G | \
+ * head -c 1G | \
* nc <server IP> 5201 -p 5201
*
+ * Test data validation (devmem TCP on RX and TX, validation happens on RX):
*
- * Note this is compatible with regular netcat. i.e. the sender or receiver can
- * be replaced with regular netcat to test the RX or TX path in isolation.
+ * On server:
+ * ncdevmem -s <server IP> [-c <client IP>] -l -p 5201 -v 8 -f eth1
+ *
+ * On client:
+ * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06\\x07) | \
+ * head -c 1M | \
+ * ncdevmem -s <server IP> [-c <client IP>] -p 5201 -f eth1
*/
#define _GNU_SOURCE
#define __EXPORTED_HEADERS__
@@ -40,15 +49,18 @@
#include <fcntl.h>
#include <malloc.h>
#include <error.h>
+#include <poll.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
+#include <sys/time.h>
#include <linux/memfd.h>
#include <linux/dma-buf.h>
+#include <linux/errqueue.h>
#include <linux/udmabuf.h>
#include <linux/types.h>
#include <linux/netlink.h>
@@ -79,6 +91,8 @@ static int num_queues = -1;
static char *ifname;
static unsigned int ifindex;
static unsigned int dmabuf_id;
+static uint32_t tx_dmabuf_id;
+static int waittime_ms = 500;
struct memory_buffer {
int fd;
@@ -92,6 +106,8 @@ struct memory_buffer {
struct memory_provider {
struct memory_buffer *(*alloc)(size_t size);
void (*free)(struct memory_buffer *ctx);
+ void (*memcpy_to_device)(struct memory_buffer *dst, size_t off,
+ void *src, int n);
void (*memcpy_from_device)(void *dst, struct memory_buffer *src,
size_t off, int n);
};
@@ -152,6 +168,20 @@ static void udmabuf_free(struct memory_buffer *ctx)
free(ctx);
}
+static void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off,
+ void *src, int n)
+{
+ struct dma_buf_sync sync = {};
+
+ sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
+ ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
+
+ memcpy(dst->buf_mem + off, src, n);
+
+ sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
+ ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync);
+}
+
static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src,
size_t off, int n)
{
@@ -169,6 +199,7 @@ static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src,
static struct memory_provider udmabuf_memory_provider = {
.alloc = udmabuf_alloc,
.free = udmabuf_free,
+ .memcpy_to_device = udmabuf_memcpy_to_device,
.memcpy_from_device = udmabuf_memcpy_from_device,
};
@@ -187,14 +218,16 @@ void validate_buffer(void *line, size_t size)
{
static unsigned char seed = 1;
unsigned char *ptr = line;
- int errors = 0;
+ unsigned char expected;
+ static int errors;
size_t i;
for (i = 0; i < size; i++) {
- if (ptr[i] != seed) {
+ expected = seed ? seed : '\n';
+ if (ptr[i] != expected) {
fprintf(stderr,
"Failed validation: expected=%u, actual=%u, index=%lu\n",
- seed, ptr[i], i);
+ expected, ptr[i], i);
errors++;
if (errors > 20)
error(1, 0, "validation failed.");
@@ -393,6 +426,49 @@ err_close:
return -1;
}
+static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
+ struct ynl_sock **ys)
+{
+ struct netdev_bind_tx_req *req = NULL;
+ struct netdev_bind_tx_rsp *rsp = NULL;
+ struct ynl_error yerr;
+
+ *ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+ if (!*ys) {
+ fprintf(stderr, "YNL: %s\n", yerr.msg);
+ return -1;
+ }
+
+ req = netdev_bind_tx_req_alloc();
+ netdev_bind_tx_req_set_ifindex(req, ifindex);
+ netdev_bind_tx_req_set_fd(req, dmabuf_fd);
+
+ rsp = netdev_bind_tx(*ys, req);
+ if (!rsp) {
+ perror("netdev_bind_tx");
+ goto err_close;
+ }
+
+ if (!rsp->_present.id) {
+ perror("id not present");
+ goto err_close;
+ }
+
+ fprintf(stderr, "got tx dmabuf id=%d\n", rsp->id);
+ tx_dmabuf_id = rsp->id;
+
+ netdev_bind_tx_req_free(req);
+ netdev_bind_tx_rsp_free(rsp);
+
+ return 0;
+
+err_close:
+ fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
+ netdev_bind_tx_req_free(req);
+ ynl_sock_destroy(*ys);
+ return -1;
+}
+
static void enable_reuseaddr(int fd)
{
int opt = 1;
@@ -431,7 +507,7 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
return 0;
}
-int do_server(struct memory_buffer *mem)
+static int do_server(struct memory_buffer *mem)
{
char ctrl_data[sizeof(int) * 20000];
struct netdev_queue_id *queues;
@@ -685,6 +761,206 @@ void run_devmem_tests(void)
provider->free(mem);
}
+static uint64_t gettimeofday_ms(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000ULL);
+}
+
+static int do_poll(int fd)
+{
+ struct pollfd pfd;
+ int ret;
+
+ pfd.revents = 0;
+ pfd.fd = fd;
+
+ ret = poll(&pfd, 1, waittime_ms);
+ if (ret == -1)
+ error(1, errno, "poll");
+
+ return ret && (pfd.revents & POLLERR);
+}
+
+static void wait_compl(int fd)
+{
+ int64_t tstop = gettimeofday_ms() + waittime_ms;
+ char control[CMSG_SPACE(100)] = {};
+ struct sock_extended_err *serr;
+ struct msghdr msg = {};
+ struct cmsghdr *cm;
+ __u32 hi, lo;
+ int ret;
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ while (gettimeofday_ms() < tstop) {
+ if (!do_poll(fd))
+ continue;
+
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (ret < 0) {
+ if (errno == EAGAIN)
+ continue;
+ error(1, errno, "recvmsg(MSG_ERRQUEUE)");
+ return;
+ }
+ if (msg.msg_flags & MSG_CTRUNC)
+ error(1, 0, "MSG_CTRUNC\n");
+
+ for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+ if (cm->cmsg_level != SOL_IP &&
+ cm->cmsg_level != SOL_IPV6)
+ continue;
+ if (cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type != IP_RECVERR)
+ continue;
+ if (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type != IPV6_RECVERR)
+ continue;
+
+ serr = (void *)CMSG_DATA(cm);
+ if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
+ error(1, 0, "wrong origin %u", serr->ee_origin);
+ if (serr->ee_errno != 0)
+ error(1, 0, "wrong errno %d", serr->ee_errno);
+
+ hi = serr->ee_data;
+ lo = serr->ee_info;
+
+ fprintf(stderr, "tx complete [%d,%d]\n", lo, hi);
+ return;
+ }
+ }
+
+ error(1, 0, "did not receive tx completion");
+}
+
+static int do_client(struct memory_buffer *mem)
+{
+ char ctrl_data[CMSG_SPACE(sizeof(__u32))];
+ struct sockaddr_in6 server_sin;
+ struct sockaddr_in6 client_sin;
+ struct ynl_sock *ys = NULL;
+ struct msghdr msg = {};
+ ssize_t line_size = 0;
+ struct cmsghdr *cmsg;
+ struct iovec iov[2];
+ char *line = NULL;
+ unsigned long mid;
+ size_t len = 0;
+ int socket_fd;
+ __u32 ddmabuf;
+ int opt = 1;
+ int ret;
+
+ ret = parse_address(server_ip, atoi(port), &server_sin);
+ if (ret < 0)
+ error(1, 0, "parse server address");
+
+ socket_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (socket_fd < 0)
+ error(1, socket_fd, "create socket");
+
+ enable_reuseaddr(socket_fd);
+
+ ret = setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname,
+ strlen(ifname) + 1);
+ if (ret)
+ error(1, errno, "bindtodevice");
+
+ if (bind_tx_queue(ifindex, mem->fd, &ys))
+ error(1, 0, "Failed to bind\n");
+
+ if (client_ip) {
+ ret = parse_address(client_ip, atoi(port), &client_sin);
+ if (ret < 0)
+ error(1, 0, "parse client address");
+
+ ret = bind(socket_fd, &client_sin, sizeof(client_sin));
+ if (ret)
+ error(1, errno, "bind");
+ }
+
+ ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt));
+ if (ret)
+ error(1, errno, "set sock opt");
+
+ fprintf(stderr, "Connect to %s %d (via %s)\n", server_ip,
+ ntohs(server_sin.sin6_port), ifname);
+
+ ret = connect(socket_fd, &server_sin, sizeof(server_sin));
+ if (ret)
+ error(1, errno, "connect");
+
+ while (1) {
+ free(line);
+ line = NULL;
+ line_size = getline(&line, &len, stdin);
+
+ if (line_size < 0)
+ break;
+
+ mid = (line_size / 2) + 1;
+
+ iov[0].iov_base = (void *)1;
+ iov[0].iov_len = mid;
+ iov[1].iov_base = (void *)(mid + 2);
+ iov[1].iov_len = line_size - mid;
+
+ provider->memcpy_to_device(mem, (size_t)iov[0].iov_base, line,
+ iov[0].iov_len);
+ provider->memcpy_to_device(mem, (size_t)iov[1].iov_base,
+ line + iov[0].iov_len,
+ iov[1].iov_len);
+
+ fprintf(stderr,
+ "read line_size=%ld iov[0].iov_base=%lu, iov[0].iov_len=%lu, iov[1].iov_base=%lu, iov[1].iov_len=%lu\n",
+ line_size, (unsigned long)iov[0].iov_base,
+ iov[0].iov_len, (unsigned long)iov[1].iov_base,
+ iov[1].iov_len);
+
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 2;
+
+ msg.msg_control = ctrl_data;
+ msg.msg_controllen = sizeof(ctrl_data);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_DEVMEM_DMABUF;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
+
+ ddmabuf = tx_dmabuf_id;
+
+ *((__u32 *)CMSG_DATA(cmsg)) = ddmabuf;
+
+ ret = sendmsg(socket_fd, &msg, MSG_ZEROCOPY);
+ if (ret < 0)
+ error(1, errno, "Failed sendmsg");
+
+ fprintf(stderr, "sendmsg_ret=%d\n", ret);
+
+ if (ret != line_size)
+ error(1, errno, "Did not send all bytes");
+
+ wait_compl(socket_fd);
+ }
+
+ fprintf(stderr, "%s: tx ok\n", TEST_PREFIX);
+
+ free(line);
+ close(socket_fd);
+
+ if (ys)
+ ynl_sock_destroy(ys);
+
+ return 0;
+}
+
int main(int argc, char *argv[])
{
struct memory_buffer *mem;
@@ -728,6 +1004,8 @@ int main(int argc, char *argv[])
ifindex = if_nametoindex(ifname);
+ fprintf(stderr, "using ifindex=%u\n", ifindex);
+
if (!server_ip && !client_ip) {
if (start_queue < 0 && num_queues < 0) {
num_queues = rxq_num(ifindex);
@@ -778,7 +1056,7 @@ int main(int argc, char *argv[])
error(1, 0, "Missing -p argument\n");
mem = provider->alloc(getpagesize() * NUM_PAGES);
- ret = is_server ? do_server(mem) : 1;
+ ret = is_server ? do_server(mem) : do_client(mem);
provider->free(mem);
return ret;