summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/netlink/specs/netdev.yaml6
-rw-r--r--Documentation/networking/af_xdp.rst211
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c6
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.c4
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.h2
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.c101
-rw-r--r--drivers/net/ethernet/intel/ice/ice_base.c9
-rw-r--r--drivers/net/ethernet/intel/ice/ice_main.c1
-rw-r--r--drivers/net/ethernet/intel/ice/ice_xsk.c221
-rw-r--r--include/linux/netdevice.h1
-rw-r--r--include/net/xdp_sock.h7
-rw-r--r--include/net/xdp_sock_drv.h54
-rw-r--r--include/net/xsk_buff_pool.h7
-rw-r--r--include/uapi/linux/if_xdp.h13
-rw-r--r--include/uapi/linux/netdev.h1
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/filter.c7
-rw-r--r--net/core/netdev-genl.c8
-rw-r--r--net/xdp/xsk.c365
-rw-r--r--net/xdp/xsk_buff_pool.c7
-rw-r--r--net/xdp/xsk_queue.h95
-rw-r--r--tools/include/uapi/linux/if_xdp.h9
-rw-r--r--tools/include/uapi/linux/netdev.h1
-rw-r--r--tools/lib/bpf/libbpf.h3
-rw-r--r--tools/lib/bpf/netlink.c5
-rw-r--r--tools/testing/selftests/bpf/progs/xsk_xdp_progs.c6
-rwxr-xr-xtools/testing/selftests/bpf/test_xsk.sh5
-rw-r--r--tools/testing/selftests/bpf/xsk.c136
-rw-r--r--tools/testing/selftests/bpf/xsk.h2
-rwxr-xr-xtools/testing/selftests/bpf/xsk_prereqs.sh7
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.c458
-rw-r--r--tools/testing/selftests/bpf/xskxceiver.h21
32 files changed, 1505 insertions, 275 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index b99e7ffef7a1..e41015310a6e 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -62,6 +62,12 @@ attribute-sets:
type: u64
enum: xdp-act
enum-as-flags: true
+ -
+ name: xdp_zc_max_segs
+ doc: max fragment count supported by ZC driver
+ type: u32
+ checks:
+ min: 1
operations:
list:
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 1cc35de336a4..dceeb0d763aa 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -462,8 +462,92 @@ XDP_OPTIONS getsockopt
Gets options from an XDP socket. The only one supported so far is
XDP_OPTIONS_ZEROCOPY which tells you if zero-copy is on or not.
+Multi-Buffer Support
+====================
+
+With multi-buffer support, programs using AF_XDP sockets can receive
+and transmit packets consisting of multiple buffers both in copy and
+zero-copy mode. For example, a packet can consist of two
+frames/buffers, one with the header and the other one with the data,
+or a 9K Ethernet jumbo frame can be constructed by chaining together
+three 4K frames.
+
+Some definitions:
+
+* A packet consists of one or more frames
+
+* A descriptor in one of the AF_XDP rings always refers to a single
+ frame. In the case the packet consists of a single frame, the
+ descriptor refers to the whole packet.
+
+To enable multi-buffer support for an AF_XDP socket, use the new bind
+flag XDP_USE_SG. If this is not provided, all multi-buffer packets
+will be dropped just as before. Note that the XDP program loaded also
+needs to be in multi-buffer mode. This can be accomplished by using
+"xdp.frags" as the section name of the XDP program used.
+
+To represent a packet consisting of multiple frames, a new flag called
+XDP_PKT_CONTD is introduced in the options field of the Rx and Tx
+descriptors. If it is true (1) the packet continues with the next
+descriptor and if it is false (0) it means this is the last descriptor
+of the packet. Why the reverse logic of end-of-packet (eop) flag found
+in many NICs? Just to preserve compatibility with non-multi-buffer
+applications that have this bit set to false for all packets on Rx,
+and the apps set the options field to zero for Tx, as anything else
+will be treated as an invalid descriptor.
+
+These are the semantics for producing packets onto AF_XDP Tx ring
+consisting of multiple frames:
+
+* When an invalid descriptor is found, all the other
+ descriptors/frames of this packet are marked as invalid and not
+ completed. The next descriptor is treated as the start of a new
+ packet, even if this was not the intent (because we cannot guess
+ the intent). As before, if your program is producing invalid
+ descriptors you have a bug that must be fixed.
+
+* Zero length descriptors are treated as invalid descriptors.
+
+* For copy mode, the maximum supported number of frames in a packet is
+ equal to CONFIG_MAX_SKB_FRAGS + 1. If it is exceeded, all
+ descriptors accumulated so far are dropped and treated as
+ invalid. To produce an application that will work on any system
+ regardless of this config setting, limit the number of frags to 18,
+ as the minimum value of the config is 17.
+
+* For zero-copy mode, the limit is up to what the NIC HW
+ supports. Usually at least five on the NICs we have checked. We
+ consciously chose to not enforce a rigid limit (such as
+ CONFIG_MAX_SKB_FRAGS + 1) for zero-copy mode, as it would have
+ resulted in copy actions under the hood to fit into what limit the
+ NIC supports. Kind of defeats the purpose of zero-copy mode. How to
+ probe for this limit is explained in the "probe for multi-buffer
+ support" section.
+
+On the Rx path in copy-mode, the xsk core copies the XDP data into
+multiple descriptors, if needed, and sets the XDP_PKT_CONTD flag as
+detailed before. Zero-copy mode works the same, though the data is not
+copied. When the application gets a descriptor with the XDP_PKT_CONTD
+flag set to one, it means that the packet consists of multiple buffers
+and it continues with the next buffer in the following
+descriptor. When a descriptor with XDP_PKT_CONTD == 0 is received, it
+means that this is the last buffer of the packet. AF_XDP guarantees
+that only a complete packet (all frames in the packet) is sent to the
+application. If there is not enough space in the AF_XDP Rx ring, all
+frames of the packet will be dropped.
+
+If application reads a batch of descriptors, using for example the libxdp
+interfaces, it is not guaranteed that the batch will end with a full
+packet. It might end in the middle of a packet and the rest of the
+buffers of that packet will arrive at the beginning of the next batch,
+since the libxdp interface does not read the whole ring (unless you
+have an enormous batch size or a very small ring size).
+
+An example program each for Rx and Tx multi-buffer support can be found
+later in this document.
+
Usage
-=====
+-----
In order to use AF_XDP sockets two parts are needed. The
user-space application and the XDP program. For a complete setup and
@@ -541,6 +625,131 @@ like this:
But please use the libbpf functions as they are optimized and ready to
use. Will make your life easier.
+Usage Multi-Buffer Rx
+---------------------
+
+Here is a simple Rx path pseudo-code example (using libxdp interfaces
+for simplicity). Error paths have been excluded to keep it short:
+
+.. code-block:: c
+
+ void rx_packets(struct xsk_socket_info *xsk)
+ {
+ static bool new_packet = true;
+ u32 idx_rx = 0, idx_fq = 0;
+ static char *pkt;
+
+ int rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
+
+ xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+
+ for (int i = 0; i < rcvd; i++) {
+ struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
+ char *frag = xsk_umem__get_data(xsk->umem->buffer, desc->addr);
+ bool eop = !(desc->options & XDP_PKT_CONTD);
+
+ if (new_packet)
+ pkt = frag;
+ else
+ add_frag_to_pkt(pkt, frag);
+
+ if (eop)
+ process_pkt(pkt);
+
+ new_packet = eop;
+
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = desc->addr;
+ }
+
+ xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+ xsk_ring_cons__release(&xsk->rx, rcvd);
+ }
+
+Usage Multi-Buffer Tx
+---------------------
+
+Here is an example Tx path pseudo-code (using libxdp interfaces for
+simplicity) ignoring that the umem is finite in size, and that we
+eventually will run out of packets to send. Also assumes pkts.addr
+points to a valid location in the umem.
+
+.. code-block:: c
+
+ void tx_packets(struct xsk_socket_info *xsk, struct pkt *pkts,
+ int batch_size)
+ {
+ u32 idx, i, pkt_nb = 0;
+
+ xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx);
+
+ for (i = 0; i < batch_size;) {
+ u64 addr = pkts[pkt_nb].addr;
+ u32 len = pkts[pkt_nb].size;
+
+ do {
+ struct xdp_desc *tx_desc;
+
+ tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i++);
+ tx_desc->addr = addr;
+
+ if (len > xsk_frame_size) {
+ tx_desc->len = xsk_frame_size;
+ tx_desc->options = XDP_PKT_CONTD;
+ } else {
+ tx_desc->len = len;
+ tx_desc->options = 0;
+ pkt_nb++;
+ }
+ len -= tx_desc->len;
+ addr += xsk_frame_size;
+
+ if (i == batch_size) {
+ /* Remember len, addr, pkt_nb for next iteration.
+ * Skipped for simplicity.
+ */
+ break;
+ }
+ } while (len);
+ }
+
+ xsk_ring_prod__submit(&xsk->tx, i);
+ }
+
+Probing for Multi-Buffer Support
+--------------------------------
+
+To discover if a driver supports multi-buffer AF_XDP in SKB or DRV
+mode, use the XDP_FEATURES feature of netlink in linux/netdev.h to
+query for NETDEV_XDP_ACT_RX_SG support. This is the same flag as for
+querying for XDP multi-buffer support. If XDP supports multi-buffer in
+a driver, then AF_XDP will also support that in SKB and DRV mode.
+
+To discover if a driver supports multi-buffer AF_XDP in zero-copy
+mode, use XDP_FEATURES and first check the NETDEV_XDP_ACT_XSK_ZEROCOPY
+flag. If it is set, it means that at least zero-copy is supported and
+you should go and check the netlink attribute
+NETDEV_A_DEV_XDP_ZC_MAX_SEGS in linux/netdev.h. An unsigned integer
+value will be returned stating the max number of frags that are
+supported by this device in zero-copy mode. These are the possible
+return values:
+
+1: Multi-buffer for zero-copy is not supported by this device, as max
+ one fragment supported means that multi-buffer is not possible.
+
+>=2: Multi-buffer is supported in zero-copy mode for this device. The
+ returned number signifies the max number of frags supported.
+
+For an example on how these are used through libbpf, please take a
+look at tools/testing/selftests/bpf/xskxceiver.c.
+
+Multi-Buffer Support for Zero-Copy Drivers
+------------------------------------------
+
+Zero-copy drivers usually use the batched APIs for Rx and Tx
+processing. Note that the Tx batch API guarantees that it will provide
+a batch of Tx descriptors that ends with full packet at the end. This
+to facilitate extending a zero-copy driver with multi-buffer support.
+
Sample application
==================
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 29ad1797adce..982ae70c51e8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3585,11 +3585,6 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
if (ring->xsk_pool) {
ring->rx_buf_len =
xsk_pool_get_rx_frame_size(ring->xsk_pool);
- /* For AF_XDP ZC, we disallow packets to span on
- * multiple buffers, thus letting us skip that
- * handling in the fast-path.
- */
- chain_len = 1;
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_XSK_BUFF_POOL,
NULL);
@@ -13822,6 +13817,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
NETDEV_XDP_ACT_REDIRECT |
NETDEV_XDP_ACT_XSK_ZEROCOPY |
NETDEV_XDP_ACT_RX_SG;
+ netdev->xdp_zc_max_segs = I40E_MAX_BUFFER_TXD;
} else {
/* Relate the VSI_VMDQ name to the VSI_MAIN name. Note that we
* are still limited by IFNAMSIZ, but we're adding 'v%d\0' to
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 8b8bf4880faa..0b3a27f118fb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2284,8 +2284,8 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
* If the buffer is an EOP buffer, this function exits returning false,
* otherwise return true indicating that this is in fact a non-EOP buffer.
*/
-static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc)
+bool i40e_is_non_eop(struct i40e_ring *rx_ring,
+ union i40e_rx_desc *rx_desc)
{
/* if we are the last buffer then there is nothing else to do */
#define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 8c3d24012c54..900b0d9ede9f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -473,6 +473,8 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
+bool i40e_is_non_eop(struct i40e_ring *rx_ring,
+ union i40e_rx_desc *rx_desc);
/**
* i40e_get_head - Retrieve head from head writeback
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 05ec1181471e..37f41c8a682f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -294,8 +294,14 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
{
unsigned int totalsize = xdp->data_end - xdp->data_meta;
unsigned int metasize = xdp->data - xdp->data_meta;
+ struct skb_shared_info *sinfo = NULL;
struct sk_buff *skb;
+ u32 nr_frags = 0;
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
net_prefetch(xdp->data_meta);
/* allocate a skb to store the frags */
@@ -312,6 +318,28 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
__skb_pull(skb, metasize);
}
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ for (int i = 0; i < nr_frags; i++) {
+ struct skb_shared_info *skinfo = skb_shinfo(skb);
+ skb_frag_t *frag = &sinfo->frags[i];
+ struct page *page;
+ void *addr;
+
+ page = dev_alloc_page();
+ if (!page) {
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+ addr = page_to_virt(page);
+
+ memcpy(addr, skb_frag_page(frag), skb_frag_size(frag));
+
+ __skb_fill_page_desc_noacc(skinfo, skinfo->nr_frags++,
+ addr, 0, skb_frag_size(frag));
+ }
+
out:
xsk_buff_free(xdp);
return skb;
@@ -322,14 +350,13 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
unsigned int *rx_packets,
unsigned int *rx_bytes,
- unsigned int size,
unsigned int xdp_res,
bool *failure)
{
struct sk_buff *skb;
*rx_packets = 1;
- *rx_bytes = size;
+ *rx_bytes = xdp_get_buff_len(xdp_buff);
if (likely(xdp_res == I40E_XDP_REDIR) || xdp_res == I40E_XDP_TX)
return;
@@ -363,7 +390,6 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring,
return;
}
- *rx_bytes = skb->len;
i40e_process_skb_fields(rx_ring, rx_desc, skb);
napi_gro_receive(&rx_ring->q_vector->napi, skb);
return;
@@ -374,6 +400,31 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring,
WARN_ON_ONCE(1);
}
+static int
+i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first,
+ struct xdp_buff *xdp, const unsigned int size)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first);
+
+ if (!xdp_buff_has_frags(first)) {
+ sinfo->nr_frags = 0;
+ sinfo->xdp_frags_size = 0;
+ xdp_buff_set_frags_flag(first);
+ }
+
+ if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
+ xsk_buff_free(first);
+ return -ENOMEM;
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+ virt_to_page(xdp->data_hard_start), 0, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+ return 0;
+}
+
/**
* i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
* @rx_ring: Rx ring
@@ -384,13 +435,18 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring,
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+ u16 next_to_process = rx_ring->next_to_process;
u16 next_to_clean = rx_ring->next_to_clean;
u16 count_mask = rx_ring->count - 1;
unsigned int xdp_res, xdp_xmit = 0;
+ struct xdp_buff *first = NULL;
struct bpf_prog *xdp_prog;
bool failure = false;
u16 cleaned_count;
+ if (next_to_process != next_to_clean)
+ first = *i40e_rx_bi(rx_ring, next_to_clean);
+
/* NB! xdp_prog will always be !NULL, due to the fact that
* this path is enabled by setting an XDP program.
*/
@@ -404,7 +460,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
unsigned int size;
u64 qword;
- rx_desc = I40E_RX_DESC(rx_ring, next_to_clean);
+ rx_desc = I40E_RX_DESC(rx_ring, next_to_process);
qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
/* This memory barrier is needed to keep us from reading
@@ -417,9 +473,9 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
i40e_clean_programming_status(rx_ring,
rx_desc->raw.qword[0],
qword);
- bi = *i40e_rx_bi(rx_ring, next_to_clean);
+ bi = *i40e_rx_bi(rx_ring, next_to_process);
xsk_buff_free(bi);
- next_to_clean = (next_to_clean + 1) & count_mask;
+ next_to_process = (next_to_process + 1) & count_mask;
continue;
}
@@ -428,22 +484,35 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
if (!size)
break;
- bi = *i40e_rx_bi(rx_ring, next_to_clean);
+ bi = *i40e_rx_bi(rx_ring, next_to_process);
xsk_buff_set_size(bi, size);
xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool);
- xdp_res = i40e_run_xdp_zc(rx_ring, bi, xdp_prog);
- i40e_handle_xdp_result_zc(rx_ring, bi, rx_desc, &rx_packets,
- &rx_bytes, size, xdp_res, &failure);
+ if (!first)
+ first = bi;
+ else if (i40e_add_xsk_frag(rx_ring, first, bi, size))
+ break;
+
+ next_to_process = (next_to_process + 1) & count_mask;
+
+ if (i40e_is_non_eop(rx_ring, rx_desc))
+ continue;
+
+ xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog);
+ i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets,
+ &rx_bytes, xdp_res, &failure);
+ first->flags = 0;
+ next_to_clean = next_to_process;
if (failure)
break;
total_rx_packets += rx_packets;
total_rx_bytes += rx_bytes;
xdp_xmit |= xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR);
- next_to_clean = (next_to_clean + 1) & count_mask;
+ first = NULL;
}
rx_ring->next_to_clean = next_to_clean;
+ rx_ring->next_to_process = next_to_process;
cleaned_count = (next_to_clean - rx_ring->next_to_use - 1) & count_mask;
if (cleaned_count >= I40E_RX_BUFFER_WRITE)
@@ -466,6 +535,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
unsigned int *total_bytes)
{
+ u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(desc);
struct i40e_tx_desc *tx_desc;
dma_addr_t dma;
@@ -474,8 +544,7 @@ static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
tx_desc->buffer_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
- 0, desc->len, 0);
+ tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc->len, 0);
*total_bytes += desc->len;
}
@@ -489,14 +558,14 @@ static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *des
u32 i;
loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) {
+ u32 cmd = I40E_TX_DESC_CMD_ICRC | xsk_is_eop_desc(&desc[i]);
+
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
tx_desc->buffer_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
- I40E_TX_DESC_CMD_EOP,
- 0, desc[i].len, 0);
+ tx_desc->cmd_type_offset_bsz = build_ctob(cmd, 0, desc[i].len, 0);
*total_bytes += desc[i].len;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
index 4a12316f7b46..3367b8ba9851 100644
--- a/drivers/net/ethernet/intel/ice/ice_base.c
+++ b/drivers/net/ethernet/intel/ice/ice_base.c
@@ -408,7 +408,6 @@ static unsigned int ice_rx_offset(struct ice_rx_ring *rx_ring)
*/
static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
{
- int chain_len = ICE_MAX_CHAINED_RX_BUFS;
struct ice_vsi *vsi = ring->vsi;
u32 rxdid = ICE_RXDID_FLEX_NIC;
struct ice_rlan_ctx rlan_ctx;
@@ -472,17 +471,11 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring)
*/
rlan_ctx.showiv = 0;
- /* For AF_XDP ZC, we disallow packets to span on
- * multiple buffers, thus letting us skip that
- * handling in the fast-path.
- */
- if (ring->xsk_pool)
- chain_len = 1;
/* Max packet size for this queue - must not be set to a larger value
* than 5 x DBUF
*/
rlan_ctx.rxmax = min_t(u32, vsi->max_frame,
- chain_len * ring->rx_buf_len);
+ ICE_MAX_CHAINED_RX_BUFS * ring->rx_buf_len);
/* Rx queue threshold in units of 64 */
rlan_ctx.lrxqthresh = 1;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 19a5e7f3a075..ca83379b2de1 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -3392,6 +3392,7 @@ static void ice_set_ops(struct ice_vsi *vsi)
netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
NETDEV_XDP_ACT_XSK_ZEROCOPY |
NETDEV_XDP_ACT_RX_SG;
+ netdev->xdp_zc_max_segs = ICE_MAX_BUF_TXD;
}
/**
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index a7fe2b4ce655..2a3f0834e139 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -546,19 +546,6 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count)
}
/**
- * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring
- * @rx_ring: Rx ring
- */
-static void ice_bump_ntc(struct ice_rx_ring *rx_ring)
-{
- int ntc = rx_ring->next_to_clean + 1;
-
- ntc = (ntc < rx_ring->count) ? ntc : 0;
- rx_ring->next_to_clean = ntc;
- prefetch(ICE_RX_DESC(rx_ring, ntc));
-}
-
-/**
* ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
* @rx_ring: Rx ring
* @xdp: Pointer to XDP buffer
@@ -572,8 +559,14 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
{
unsigned int totalsize = xdp->data_end - xdp->data_meta;
unsigned int metasize = xdp->data - xdp->data_meta;
+ struct skb_shared_info *sinfo = NULL;
struct sk_buff *skb;
+ u32 nr_frags = 0;
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
net_prefetch(xdp->data_meta);
skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
@@ -589,6 +582,29 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
__skb_pull(skb, metasize);
}
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ for (int i = 0; i < nr_frags; i++) {
+ struct skb_shared_info *skinfo = skb_shinfo(skb);
+ skb_frag_t *frag = &sinfo->frags[i];
+ struct page *page;
+ void *addr;
+
+ page = dev_alloc_page();
+ if (!page) {
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+ addr = page_to_virt(page);
+
+ memcpy(addr, skb_frag_page(frag), skb_frag_size(frag));
+
+ __skb_fill_page_desc_noacc(skinfo, skinfo->nr_frags++,
+ addr, 0, skb_frag_size(frag));
+ }
+
+out:
xsk_buff_free(xdp);
return skb;
}
@@ -597,7 +613,7 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
* ice_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ
* @xdp_ring: XDP Tx ring
*/
-static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
+static u32 ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
{
u16 ntc = xdp_ring->next_to_clean;
struct ice_tx_desc *tx_desc;
@@ -619,7 +635,7 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
}
if (!completed_frames)
- return;
+ return 0;
if (likely(!xdp_ring->xdp_tx_active)) {
xsk_frames = completed_frames;
@@ -649,6 +665,8 @@ skip:
xdp_ring->next_to_clean -= cnt;
if (xsk_frames)
xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames);
+
+ return completed_frames;
}
/**
@@ -666,37 +684,72 @@ skip:
static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
struct ice_tx_ring *xdp_ring)
{
+ struct skb_shared_info *sinfo = NULL;
u32 size = xdp->data_end - xdp->data;
u32 ntu = xdp_ring->next_to_use;
struct ice_tx_desc *tx_desc;
struct ice_tx_buf *tx_buf;
- dma_addr_t dma;
-
- if (ICE_DESC_UNUSED(xdp_ring) < ICE_RING_QUARTER(xdp_ring)) {
- ice_clean_xdp_irq_zc(xdp_ring);
- if (!ICE_DESC_UNUSED(xdp_ring)) {
- xdp_ring->ring_stats->tx_stats.tx_busy++;
- return ICE_XDP_CONSUMED;
- }
+ struct xdp_buff *head;
+ u32 nr_frags = 0;
+ u32 free_space;
+ u32 frag = 0;
+
+ free_space = ICE_DESC_UNUSED(xdp_ring);
+ if (free_space < ICE_RING_QUARTER(xdp_ring))
+ free_space += ice_clean_xdp_irq_zc(xdp_ring);
+
+ if (unlikely(!free_space))
+ goto busy;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ if (free_space < nr_frags + 1)
+ goto busy;
}
- dma = xsk_buff_xdp_get_dma(xdp);
- xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size);
-
- tx_buf = &xdp_ring->tx_buf[ntu];
- tx_buf->xdp = xdp;
- tx_buf->type = ICE_TX_BUF_XSK_TX;
tx_desc = ICE_TX_DESC(xdp_ring, ntu);
- tx_desc->buf_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
- 0, size, 0);
- xdp_ring->xdp_tx_active++;
+ tx_buf = &xdp_ring->tx_buf[ntu];
+ head = xdp;
+
+ for (;;) {
+ dma_addr_t dma;
+
+ dma = xsk_buff_xdp_get_dma(xdp);
+ xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size);
+
+ tx_buf->xdp = xdp;
+ tx_buf->type = ICE_TX_BUF_XSK_TX;
+ tx_desc->buf_addr = cpu_to_le64(dma);
+ tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0);
+ /* account for each xdp_buff from xsk_buff_pool */
+ xdp_ring->xdp_tx_active++;
+
+ if (++ntu == xdp_ring->count)
+ ntu = 0;
+
+ if (frag == nr_frags)
+ break;
+
+ tx_desc = ICE_TX_DESC(xdp_ring, ntu);
+ tx_buf = &xdp_ring->tx_buf[ntu];
+
+ xdp = xsk_buff_get_frag(head);
+ size = skb_frag_size(&sinfo->frags[frag]);
+ frag++;
+ }
- if (++ntu == xdp_ring->count)
- ntu = 0;
xdp_ring->next_to_use = ntu;
+ /* update last descriptor from a frame with EOP */
+ tx_desc->cmd_type_offset_bsz |=
+ cpu_to_le64(ICE_TX_DESC_CMD_EOP << ICE_TXD_QW1_CMD_S);
return ICE_XDP_TX;
+
+busy:
+ xdp_ring->ring_stats->tx_stats.tx_busy++;
+
+ return ICE_XDP_CONSUMED;
}
/**
@@ -752,6 +805,34 @@ out_failure:
return result;
}
+static int
+ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first,
+ struct xdp_buff *xdp, const unsigned int size)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first);
+
+ if (!size)
+ return 0;
+
+ if (!xdp_buff_has_frags(first)) {
+ sinfo->nr_frags = 0;
+ sinfo->xdp_frags_size = 0;
+ xdp_buff_set_frags_flag(first);
+ }
+
+ if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
+ xsk_buff_free(first);
+ return -ENOMEM;
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+ virt_to_page(xdp->data_hard_start), 0, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+ return 0;
+}
+
/**
* ice_clean_rx_irq_zc - consumes packets from the hardware ring
* @rx_ring: AF_XDP Rx ring
@@ -762,9 +843,14 @@ out_failure:
int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
+ struct xsk_buff_pool *xsk_pool = rx_ring->xsk_pool;
+ u32 ntc = rx_ring->next_to_clean;
+ u32 ntu = rx_ring->next_to_use;
+ struct xdp_buff *first = NULL;
struct ice_tx_ring *xdp_ring;
unsigned int xdp_xmit = 0;
struct bpf_prog *xdp_prog;
+ u32 cnt = rx_ring->count;
bool failure = false;
int entries_to_alloc;
@@ -774,6 +860,9 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
xdp_ring = rx_ring->xdp_ring;
+ if (ntc != rx_ring->first_desc)
+ first = *ice_xdp_buf(rx_ring, rx_ring->first_desc);
+
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
unsigned int size, xdp_res = 0;
@@ -783,7 +872,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
u16 vlan_tag = 0;
u16 rx_ptype;
- rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
+ rx_desc = ICE_RX_DESC(rx_ring, ntc);
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
@@ -795,51 +884,61 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
*/
dma_rmb();
- if (unlikely(rx_ring->next_to_clean == rx_ring->next_to_use))
+ if (unlikely(ntc == ntu))
break;
- xdp = *ice_xdp_buf(rx_ring, rx_ring->next_to_clean);
+ xdp = *ice_xdp_buf(rx_ring, ntc);
size = le16_to_cpu(rx_desc->wb.pkt_len) &
ICE_RX_FLX_DESC_PKT_LEN_M;
- if (!size) {
- xdp->data = NULL;
- xdp->data_end = NULL;
- xdp->data_hard_start = NULL;
- xdp->data_meta = NULL;
- goto construct_skb;
- }
xsk_buff_set_size(xdp, size);
- xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool);
+ xsk_buff_dma_sync_for_cpu(xdp, xsk_pool);
+
+ if (!first) {
+ first = xdp;
+ xdp_buff_clear_frags_flag(first);
+ } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) {
+ break;
+ }
+
+ if (++ntc == cnt)
+ ntc = 0;
- xdp_res = ice_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring);
+ if (ice_is_non_eop(rx_ring, rx_desc))
+ continue;
+
+ xdp_res = ice_run_xdp_zc(rx_ring, first, xdp_prog, xdp_ring);
if (likely(xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))) {
xdp_xmit |= xdp_res;
} else if (xdp_res == ICE_XDP_EXIT) {
failure = true;
+ first = NULL;
+ rx_ring->first_desc = ntc;
break;
} else if (xdp_res == ICE_XDP_CONSUMED) {
- xsk_buff_free(xdp);
+ xsk_buff_free(first);
} else if (xdp_res == ICE_XDP_PASS) {
goto construct_skb;
}
- total_rx_bytes += size;
+ total_rx_bytes += xdp_get_buff_len(first);
total_rx_packets++;
- ice_bump_ntc(rx_ring);
+ first = NULL;
+ rx_ring->first_desc = ntc;
continue;
construct_skb:
/* XDP_PASS path */
- skb = ice_construct_skb_zc(rx_ring, xdp);
+ skb = ice_construct_skb_zc(rx_ring, first);
if (!skb) {
rx_ring->ring_stats->rx_stats.alloc_buf_failed++;
break;
}
- ice_bump_ntc(rx_ring);
+ first = NULL;
+ rx_ring->first_desc = ntc;
if (eth_skb_pad(skb)) {
skb = NULL;
@@ -858,18 +957,22 @@ construct_skb:
ice_receive_skb(rx_ring, skb, vlan_tag);
}
- entries_to_alloc = ICE_DESC_UNUSED(rx_ring);
+ rx_ring->next_to_clean = ntc;
+ entries_to_alloc = ICE_RX_DESC_UNUSED(rx_ring);
if (entries_to_alloc > ICE_RING_QUARTER(rx_ring))
failure |= !ice_alloc_rx_bufs_zc(rx_ring, entries_to_alloc);
ice_finalize_xdp_rx(xdp_ring, xdp_xmit, 0);
ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes);
- if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
- if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
- xsk_set_rx_need_wakeup(rx_ring->xsk_pool);
+ if (xsk_uses_need_wakeup(xsk_pool)) {
+ /* ntu could have changed when allocating entries above, so
+ * use rx_ring value instead of stack based one
+ */
+ if (failure || ntc == rx_ring->next_to_use)
+ xsk_set_rx_need_wakeup(xsk_pool);
else
- xsk_clear_rx_need_wakeup(rx_ring->xsk_pool);
+ xsk_clear_rx_need_wakeup(xsk_pool);
return (int)total_rx_packets;
}
@@ -894,7 +997,7 @@ static void ice_xmit_pkt(struct ice_tx_ring *xdp_ring, struct xdp_desc *desc,
tx_desc = ICE_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
tx_desc->buf_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
+ tx_desc->cmd_type_offset_bsz = ice_build_ctob(xsk_is_eop_desc(desc),
0, desc->len, 0);
*total_bytes += desc->len;
@@ -921,7 +1024,7 @@ static void ice_xmit_pkt_batch(struct ice_tx_ring *xdp_ring, struct xdp_desc *de
tx_desc = ICE_TX_DESC(xdp_ring, ntu++);
tx_desc->buf_addr = cpu_to_le64(dma);
- tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
+ tx_desc->cmd_type_offset_bsz = ice_build_ctob(xsk_is_eop_desc(&descs[i]),
0, descs[i].len, 0);
*total_bytes += descs[i].len;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b828c7a75be2..b12477ea4032 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2250,6 +2250,7 @@ struct net_device {
#define GRO_MAX_SIZE (8 * 65535u)
unsigned int gro_max_size;
unsigned int gro_ipv4_max_size;
+ unsigned int xdp_zc_max_segs;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e96a1151ec75..1617af380162 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -52,6 +52,7 @@ struct xdp_sock {
struct xsk_buff_pool *pool;
u16 queue_id;
bool zc;
+ bool sg;
enum {
XSK_READY = 0,
XSK_BOUND,
@@ -67,6 +68,12 @@ struct xdp_sock {
u64 rx_dropped;
u64 rx_queue_full;
+ /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current
+ * packet, the partially built skb is saved here so that packet building can resume in next
+ * call of __xsk_generic_xmit().
+ */
+ struct sk_buff *skb;
+
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index c243f906ebed..1f6fc8c7a84c 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -89,6 +89,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
return xp_alloc(pool);
}
+static inline bool xsk_is_eop_desc(struct xdp_desc *desc)
+{
+ return !xp_mb_desc(desc);
+}
+
/* Returns as many entries as possible up to max. 0 <= N <= max. */
static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
{
@@ -103,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
static inline void xsk_buff_free(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ struct list_head *xskb_list = &xskb->pool->xskb_list;
+ struct xdp_buff_xsk *pos, *tmp;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+ list_del(&pos->xskb_list_node);
+ xp_free(pos);
+ }
+
+ xdp_get_shared_info_from_buff(xdp)->nr_frags = 0;
+out:
xp_free(xskb);
}
+static inline void xsk_buff_add_frag(struct xdp_buff *xdp)
+{
+ struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list);
+}
+
+static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+{
+ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
+ struct xdp_buff *ret = NULL;
+ struct xdp_buff_xsk *frag;
+
+ frag = list_first_entry_or_null(&xskb->pool->xskb_list,
+ struct xdp_buff_xsk, xskb_list_node);
+ if (frag) {
+ list_del(&frag->xskb_list_node);
+ ret = &frag->xdp;
+ }
+
+ return ret;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
@@ -241,6 +281,11 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
return NULL;
}
+static inline bool xsk_is_eop_desc(struct xdp_desc *desc)
+{
+ return false;
+}
+
static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
{
return 0;
@@ -255,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
{
}
+static inline void xsk_buff_add_frag(struct xdp_buff *xdp)
+{
+}
+
+static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+{
+ return NULL;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
}
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index a8d7b8a3688a..b0bdff26fc88 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -29,6 +29,7 @@ struct xdp_buff_xsk {
struct xsk_buff_pool *pool;
u64 orig_addr;
struct list_head free_list_node;
+ struct list_head xskb_list_node;
};
#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
@@ -54,6 +55,7 @@ struct xsk_buff_pool {
struct xdp_umem *umem;
struct work_struct work;
struct list_head free_list;
+ struct list_head xskb_list;
u32 heads_cnt;
u16 queue_id;
@@ -184,6 +186,11 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
!(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK);
}
+static inline bool xp_mb_desc(struct xdp_desc *desc)
+{
+ return desc->options & XDP_PKT_CONTD;
+}
+
static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
{
return addr & pool->chunk_mask;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a8096f4ce..8d48863472b9 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -25,6 +25,12 @@
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
+/* By setting this option, userspace application indicates that it can
+ * handle multiple descriptors per packet thus enabling AF_XDP to split
+ * multi-buffer XDP frames into multiple Rx descriptors. Without this set
+ * such frames will be dropped.
+ */
+#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
@@ -108,4 +114,11 @@ struct xdp_desc {
/* UMEM descriptor is __u64 */
+/* Flag indicating that the packet continues with the buffer pointed out by the
+ * next frame in the ring. The end of the packet is signalled by setting this
+ * bit to zero. For single buffer packets, every descriptor has 'options' set
+ * to 0 and this maintains backward compatibility.
+ */
+#define XDP_PKT_CONTD (1 << 0)
+
#endif /* _LINUX_IF_XDP_H */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 639524b59930..bf71698a1e82 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -41,6 +41,7 @@ enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
+ NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index d6e1b786c5c5..dd4f114a7cbf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10613,6 +10613,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->xdp_zc_max_segs = 1;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
diff --git a/net/core/filter.c b/net/core/filter.c
index 06ba0e56e369..b4410dc841a0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4345,13 +4345,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
enum bpf_map_type map_type = ri->map_type;
- if (map_type == BPF_MAP_TYPE_XSKMAP) {
- /* XDP_REDIRECT is not supported AF_XDP yet. */
- if (unlikely(xdp_buff_has_frags(xdp)))
- return -EOPNOTSUPP;
-
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
- }
return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
xdp_prog);
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index a4270fafdf11..65ef4867fc49 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
return -EINVAL;
}
+ if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
+ if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+ netdev->xdp_zc_max_segs)) {
+ genlmsg_cancel(rsp, hdr);
+ return -EINVAL;
+ }
+ }
+
genlmsg_end(rsp, hdr);
return 0;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 31dca4ecb2c5..4f1e0599146e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -135,14 +135,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
return 0;
}
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
+ u32 flags)
{
- struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
u64 addr;
int err;
addr = xp_get_handle(xskb);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
if (err) {
xs->rx_queue_full++;
return err;
@@ -152,48 +152,138 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return 0;
}
-static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
+static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- void *from_buf, *to_buf;
- u32 metalen;
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ u32 frags = xdp_buff_has_frags(xdp);
+ struct xdp_buff_xsk *pos, *tmp;
+ struct list_head *xskb_list;
+ u32 contd = 0;
+ int err;
- if (unlikely(xdp_data_meta_unsupported(from))) {
- from_buf = from->data;
- to_buf = to->data;
- metalen = 0;
- } else {
- from_buf = from->data_meta;
- metalen = from->data - from->data_meta;
- to_buf = to->data - metalen;
+ if (frags)
+ contd = XDP_PKT_CONTD;
+
+ err = __xsk_rcv_zc(xs, xskb, len, contd);
+ if (err || likely(!frags))
+ goto out;
+
+ xskb_list = &xskb->pool->xskb_list;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+ if (list_is_singular(xskb_list))
+ contd = 0;
+ len = pos->xdp.data_end - pos->xdp.data;
+ err = __xsk_rcv_zc(xs, pos, len, contd);
+ if (err)
+ return err;
+ list_del(&pos->xskb_list_node);
}
- memcpy(to_buf, from_buf, len + metalen);
+out:
+ return err;
}
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static void *xsk_copy_xdp_start(struct xdp_buff *from)
{
+ if (unlikely(xdp_data_meta_unsupported(from)))
+ return from->data;
+ else
+ return from->data_meta;
+}
+
+static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
+ u32 *from_len, skb_frag_t **frag, u32 rem)
+{
+ u32 copied = 0;
+
+ while (1) {
+ u32 copy_len = min_t(u32, *from_len, to_len);
+
+ memcpy(to, *from, copy_len);
+ copied += copy_len;
+ if (rem == copied)
+ return copied;
+
+ if (*from_len == copy_len) {
+ *from = skb_frag_address(*frag);
+ *from_len = skb_frag_size((*frag)++);
+ } else {
+ *from += copy_len;
+ *from_len -= copy_len;
+ }
+ if (to_len == copy_len)
+ return copied;
+
+ to_len -= copy_len;
+ to += copy_len;
+ }
+}
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
+ void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
+ u32 from_len, meta_len, rem, num_desc;
+ struct xdp_buff_xsk *xskb;
struct xdp_buff *xsk_xdp;
- int err;
- u32 len;
+ skb_frag_t *frag;
- len = xdp->data_end - xdp->data;
- if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
- xs->rx_dropped++;
- return -ENOSPC;
+ from_len = xdp->data_end - copy_from;
+ meta_len = xdp->data - copy_from;
+ rem = len + meta_len;
+
+ if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
+ int err;
+
+ xsk_xdp = xsk_buff_alloc(xs->pool);
+ if (!xsk_xdp) {
+ xs->rx_dropped++;
+ return -ENOMEM;
+ }
+ memcpy(xsk_xdp->data - meta_len, copy_from, rem);
+ xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+ err = __xsk_rcv_zc(xs, xskb, len, 0);
+ if (err) {
+ xsk_buff_free(xsk_xdp);
+ return err;
+ }
+
+ return 0;
}
- xsk_xdp = xsk_buff_alloc(xs->pool);
- if (!xsk_xdp) {
+ num_desc = (len - 1) / frame_size + 1;
+
+ if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
xs->rx_dropped++;
return -ENOMEM;
}
+ if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
+ xs->rx_queue_full++;
+ return -ENOBUFS;
+ }
- xsk_copy_xdp(xsk_xdp, xdp, len);
- err = __xsk_rcv_zc(xs, xsk_xdp, len);
- if (err) {
- xsk_buff_free(xsk_xdp);
- return err;
+ if (xdp_buff_has_frags(xdp)) {
+ struct skb_shared_info *sinfo;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ frag = &sinfo->frags[0];
}
+
+ do {
+ u32 to_len = frame_size + meta_len;
+ u32 copied;
+
+ xsk_xdp = xsk_buff_alloc(xs->pool);
+ copy_to = xsk_xdp->data - meta_len;
+
+ copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
+ rem -= copied;
+
+ xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
+ __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
+ meta_len = 0;
+ } while (rem);
+
return 0;
}
@@ -215,7 +305,7 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
-static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
if (!xsk_is_bound(xs))
return -ENXIO;
@@ -223,6 +313,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
+ if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
sk_mark_napi_id_once_xdp(&xs->sk, xdp);
return 0;
}
@@ -236,12 +331,13 @@ static void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
+ u32 len = xdp_get_buff_len(xdp);
int err;
spin_lock_bh(&xs->rx_lock);
- err = xsk_rcv_check(xs, xdp);
+ err = xsk_rcv_check(xs, xdp, len);
if (!err) {
- err = __xsk_rcv(xs, xdp);
+ err = __xsk_rcv(xs, xdp, len);
xsk_flush(xs);
}
spin_unlock_bh(&xs->rx_lock);
@@ -250,19 +346,19 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
+ u32 len = xdp_get_buff_len(xdp);
int err;
- u32 len;
- err = xsk_rcv_check(xs, xdp);
+ err = xsk_rcv_check(xs, xdp, len);
if (err)
return err;
if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
len = xdp->data_end - xdp->data;
- return __xsk_rcv_zc(xs, xdp, len);
+ return xsk_rcv_zc(xs, xdp, len);
}
- err = __xsk_rcv(xs, xdp);
+ err = __xsk_rcv(xs, xdp, len);
if (!err)
xdp_return_buff(xdp);
return err;
@@ -321,7 +417,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
rcu_read_lock();
list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
- xs->tx->queue_empty_descs++;
+ if (xskq_has_descs(xs->tx))
+ xskq_cons_release(xs->tx);
continue;
}
@@ -408,37 +505,91 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static void xsk_destruct_skb(struct sk_buff *skb)
+static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+
+ return ret;
+}
+
+static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
{
- u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
- struct xdp_sock *xs = xdp_sk(skb->sk);
unsigned long flags;
spin_lock_irqsave(&xs->pool->cq_lock, flags);
- xskq_prod_submit_addr(xs->pool->cq, addr);
+ xskq_prod_submit_n(xs->pool->cq, n);
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_cancel_n(xs->pool->cq, n);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+ return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+}
+
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+ xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
sock_wfree(skb);
}
+static void xsk_set_destructor_arg(struct sk_buff *skb)
+{
+ long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+
+ skb_shinfo(skb)->destructor_arg = (void *)num;
+}
+
+static void xsk_consume_skb(struct sk_buff *skb)
+{
+ struct xdp_sock *xs = xdp_sk(skb->sk);
+
+ skb->destructor = sock_wfree;
+ xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
+ /* Free skb without triggering the perf drop trace */
+ consume_skb(skb);
+ xs->skb = NULL;
+}
+
+static void xsk_drop_skb(struct sk_buff *skb)
+{
+ xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
+ xsk_consume_skb(skb);
+}
+
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
struct xdp_desc *desc)
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
- struct sk_buff *skb;
+ struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
int err, i;
u64 addr;
- hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+ if (!skb) {
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
- skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
- if (unlikely(!skb))
- return ERR_PTR(err);
+ skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
- skb_reserve(skb, hr);
+ skb_reserve(skb, hr);
+ }
addr = desc->addr;
len = desc->len;
@@ -448,7 +599,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
offset = offset_in_page(buffer);
addr = buffer - pool->addrs;
- for (copied = 0, i = 0; copied < len; i++) {
+ for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
+ if (unlikely(i >= MAX_SKB_FRAGS))
+ return ERR_PTR(-EFAULT);
+
page = pool->umem->pgs[addr >> PAGE_SHIFT];
get_page(page);
@@ -473,43 +627,77 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
struct net_device *dev = xs->dev;
- struct sk_buff *skb;
+ struct sk_buff *skb = xs->skb;
+ int err;
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
skb = xsk_build_skb_zerocopy(xs, desc);
- if (IS_ERR(skb))
- return skb;
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto free_err;
+ }
} else {
u32 hr, tr, len;
void *buffer;
- int err;
- hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
- tr = dev->needed_tailroom;
+ buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
len = desc->len;
- skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
- if (unlikely(!skb))
- return ERR_PTR(err);
+ if (!skb) {
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ tr = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+ if (unlikely(!skb))
+ goto free_err;
- skb_reserve(skb, hr);
- skb_put(skb, len);
+ skb_reserve(skb, hr);
+ skb_put(skb, len);
- buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
- err = skb_store_bits(skb, 0, buffer, len);
- if (unlikely(err)) {
- kfree_skb(skb);
- return ERR_PTR(err);
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err))
+ goto free_err;
+ } else {
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page;
+ u8 *vaddr;
+
+ if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
+ err = -EFAULT;
+ goto free_err;
+ }
+
+ page = alloc_page(xs->sk.sk_allocation);
+ if (unlikely(!page)) {
+ err = -EAGAIN;
+ goto free_err;
+ }
+
+ vaddr = kmap_local_page(page);
+ memcpy(vaddr, buffer, len);
+ kunmap_local(vaddr);
+
+ skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
}
}
skb->dev = dev;
skb->priority = xs->sk.sk_priority;
skb->mark = xs->sk.sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
skb->destructor = xsk_destruct_skb;
+ xsk_set_destructor_arg(skb);
return skb;
+
+free_err:
+ if (err == -EAGAIN) {
+ xsk_cq_cancel_locked(xs, 1);
+ } else {
+ xsk_set_destructor_arg(skb);
+ xsk_drop_skb(skb);
+ xskq_cons_release(xs->tx);
+ }
+
+ return ERR_PTR(err);
}
static int __xsk_generic_xmit(struct sock *sk)
@@ -519,7 +707,6 @@ static int __xsk_generic_xmit(struct sock *sk)
bool sent_frame = false;
struct xdp_desc desc;
struct sk_buff *skb;
- unsigned long flags;
int err = 0;
mutex_lock(&xs->mutex);
@@ -544,47 +731,51 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- spin_lock_irqsave(&xs->pool->cq_lock, flags);
- if (xskq_prod_reserve(xs->pool->cq)) {
- spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+ if (xsk_cq_reserve_addr_locked(xs, desc.addr))
goto out;
- }
- spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
skb = xsk_build_skb(xs, &desc);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
- spin_lock_irqsave(&xs->pool->cq_lock, flags);
- xskq_prod_cancel(xs->pool->cq);
- spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- goto out;
+ if (err == -EAGAIN)
+ goto out;
+ err = 0;
+ continue;
+ }
+
+ xskq_cons_release(xs->tx);
+
+ if (xp_mb_desc(&desc)) {
+ xs->skb = skb;
+ continue;
}
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
- skb->destructor = sock_wfree;
- spin_lock_irqsave(&xs->pool->cq_lock, flags);
- xskq_prod_cancel(xs->pool->cq);
- spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- /* Free skb without triggering the perf drop trace */
- consume_skb(skb);
+ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
+ xsk_consume_skb(skb);
err = -EAGAIN;
goto out;
}
- xskq_cons_release(xs->tx);
/* Ignore NET_XMIT_CN as packet might have been sent */
if (err == NET_XMIT_DROP) {
/* SKB completed but not sent */
err = -EBUSY;
+ xs->skb = NULL;
goto out;
}
sent_frame = true;
+ xs->skb = NULL;
}
- xs->tx->queue_empty_descs++;
+ if (xskq_has_descs(xs->tx)) {
+ if (xs->skb)
+ xsk_drop_skb(xs->skb);
+ xskq_cons_release(xs->tx);
+ }
out:
if (sent_frame)
@@ -834,6 +1025,9 @@ static int xsk_release(struct socket *sock)
net = sock_net(sk);
+ if (xs->skb)
+ xsk_drop_skb(xs->skb);
+
mutex_lock(&net->xdp.lock);
sk_del_node_init_rcu(sk);
mutex_unlock(&net->xdp.lock);
@@ -897,7 +1091,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
flags = sxdp->sxdp_flags;
if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
- XDP_USE_NEED_WAKEUP))
+ XDP_USE_NEED_WAKEUP | XDP_USE_SG))
return -EINVAL;
bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
@@ -929,7 +1123,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
struct socket *sock;
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
- (flags & XDP_USE_NEED_WAKEUP)) {
+ (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
@@ -1028,6 +1222,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->dev = dev;
xs->zc = xs->umem->zc;
+ xs->sg = !!(flags & XDP_USE_SG);
xs->queue_id = qid;
xp_add_xsk(xs->pool, xs);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 26f6d304451e..b3f7b310811e 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
pool->umem = umem;
pool->addrs = umem->addrs;
INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->xskb_list);
INIT_LIST_HEAD(&pool->xsk_tx_list);
spin_lock_init(&pool->xsk_tx_list_lock);
spin_lock_init(&pool->cq_lock);
@@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
xskb->pool = pool;
xskb->xdp.frame_sz = umem->chunk_size - umem->headroom;
INIT_LIST_HEAD(&xskb->free_list_node);
+ INIT_LIST_HEAD(&xskb->xskb_list_node);
if (pool->unaligned)
pool->free_heads[i] = xskb;
else
@@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
goto err_unreg_pool;
}
+ if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+
bpf.command = XDP_SETUP_XSK_POOL;
bpf.xsk.pool = pool;
bpf.xsk.queue_id = queue_id;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 6d40a77fccbe..13354a1e4280 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -48,6 +48,11 @@ struct xsk_queue {
size_t ring_vmalloc_size;
};
+struct parsed_desc {
+ u32 mb;
+ u32 valid;
+};
+
/* The structure of the shared state of the rings are a simple
* circular buffer, as outlined in
* Documentation/core-api/circular-buffers.rst. For the Rx and
@@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
return false;
}
+static inline bool xp_unused_options_set(u32 options)
+{
+ return options & ~XDP_PKT_CONTD;
+}
+
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
u64 offset = desc->addr & (pool->chunk_size - 1);
+ if (!desc->len)
+ return false;
+
if (offset + desc->len > pool->chunk_size)
return false;
if (desc->addr >= pool->addrs_cnt)
return false;
- if (desc->options)
+ if (xp_unused_options_set(desc->options))
return false;
return true;
}
@@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
{
u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+ if (!desc->len)
+ return false;
+
if (desc->len > pool->chunk_size)
return false;
@@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
return false;
- if (desc->options)
+ if (xp_unused_options_set(desc->options))
return false;
return true;
}
@@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
xp_aligned_validate_desc(pool, desc);
}
+static inline bool xskq_has_descs(struct xsk_queue *q)
+{
+ return q->cached_cons != q->cached_prod;
+}
+
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
struct xsk_buff_pool *pool)
@@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
struct xdp_desc *desc,
struct xsk_buff_pool *pool)
{
- while (q->cached_cons != q->cached_prod) {
+ if (q->cached_cons != q->cached_prod) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx = q->cached_cons & q->ring_mask;
*desc = ring->desc[idx];
- if (xskq_cons_is_valid_desc(q, desc, pool))
- return true;
-
- q->cached_cons++;
+ return xskq_cons_is_valid_desc(q, desc, pool);
}
+ q->queue_empty_descs++;
return false;
}
@@ -204,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
q->cached_cons += cnt;
}
-static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
- u32 max)
+static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ struct xdp_desc *desc, struct parsed_desc *parsed)
+{
+ parsed->valid = xskq_cons_is_valid_desc(q, desc, pool);
+ parsed->mb = xp_mb_desc(desc);
+}
+
+static inline
+u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+ u32 max)
{
u32 cached_cons = q->cached_cons, nb_entries = 0;
struct xdp_desc *descs = pool->tx_descs;
+ u32 total_descs = 0, nr_frags = 0;
+ /* track first entry, if stumble upon *any* invalid descriptor, rewind
+ * current packet that consists of frags and stop the processing
+ */
while (cached_cons != q->cached_prod && nb_entries < max) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx = cached_cons & q->ring_mask;
+ struct parsed_desc parsed;
descs[nb_entries] = ring->desc[idx];
- if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) {
- /* Skip the entry */
- cached_cons++;
- continue;
+ cached_cons++;
+ parse_desc(q, pool, &descs[nb_entries], &parsed);
+ if (unlikely(!parsed.valid))
+ break;
+
+ if (likely(!parsed.mb)) {
+ total_descs += (nr_frags + 1);
+ nr_frags = 0;
+ } else {
+ nr_frags++;
+ if (nr_frags == pool->netdev->xdp_zc_max_segs) {
+ nr_frags = 0;
+ break;
+ }
}
-
nb_entries++;
- cached_cons++;
}
+ cached_cons -= nr_frags;
/* Release valid plus any invalid entries */
xskq_cons_release_n(q, cached_cons - q->cached_cons);
- return nb_entries;
+ return total_descs;
}
/* Functions for consumers */
@@ -292,6 +333,11 @@ static inline void xskq_cons_release(struct xsk_queue *q)
q->cached_cons++;
}
+static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt)
+{
+ q->cached_cons -= cnt;
+}
+
static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
{
/* No barriers needed since data is not accessed */
@@ -319,9 +365,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q)
return xskq_prod_nb_free(q, 1) ? false : true;
}
-static inline void xskq_prod_cancel(struct xsk_queue *q)
+static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
{
- q->cached_prod--;
+ q->cached_prod -= cnt;
}
static inline int xskq_prod_reserve(struct xsk_queue *q)
@@ -360,7 +406,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de
}
static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
- u64 addr, u32 len)
+ u64 addr, u32 len, u32 flags)
{
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx;
@@ -372,6 +418,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
idx = q->cached_prod++ & q->ring_mask;
ring->desc[idx].addr = addr;
ring->desc[idx].len = len;
+ ring->desc[idx].options = flags;
return 0;
}
@@ -386,16 +433,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q)
__xskq_prod_submit(q, q->cached_prod);
}
-static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr)
-{
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
- u32 idx = q->ring->producer;
-
- ring->desc[idx++ & q->ring_mask] = addr;
-
- __xskq_prod_submit(q, idx);
-}
-
static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
{
__xskq_prod_submit(q, q->ring->producer + nb_entries);
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index a78a8096f4ce..73a47da885dc 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -25,6 +25,12 @@
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
+/* By setting this option, userspace application indicates that it can
+ * handle multiple descriptors per packet thus enabling xsk core to split
+ * multi-buffer XDP frames into multiple Rx descriptors. Without this set
+ * such frames will be dropped by xsk.
+ */
+#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
@@ -106,6 +112,9 @@ struct xdp_desc {
__u32 options;
};
+/* Flag indicating packet constitutes of multiple buffers*/
+#define XDP_PKT_CONTD (1 << 0)
+
/* UMEM descriptor is __u64 */
#endif /* _LINUX_IF_XDP_H */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 639524b59930..bf71698a1e82 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -41,6 +41,7 @@ enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
+ NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 10642ad69d76..674e5788eb10 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1105,9 +1105,10 @@ struct bpf_xdp_query_opts {
__u32 skb_prog_id; /* output */
__u8 attach_mode; /* output */
__u64 feature_flags; /* output */
+ __u32 xdp_zc_max_segs; /* output */
size_t :0;
};
-#define bpf_xdp_query_opts__last_field feature_flags
+#define bpf_xdp_query_opts__last_field xdp_zc_max_segs
LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags,
const struct bpf_xdp_attach_opts *opts);
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 84dd5fa14905..090bcf6e3b3d 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -45,6 +45,7 @@ struct xdp_id_md {
struct xdp_features_md {
int ifindex;
+ __u32 xdp_zc_max_segs;
__u64 flags;
};
@@ -421,6 +422,9 @@ static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
return NL_CONT;
md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]);
+ if (tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS])
+ md->xdp_zc_max_segs =
+ libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]);
return NL_DONE;
}
@@ -493,6 +497,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts)
return libbpf_err(err);
opts->feature_flags = md.flags;
+ opts->xdp_zc_max_segs = md.xdp_zc_max_segs;
skip_feature_flags:
return 0;
diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
index a630c95c7471..24369f242853 100644
--- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
+++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c
@@ -15,12 +15,12 @@ struct {
static unsigned int idx;
int count = 0;
-SEC("xdp") int xsk_def_prog(struct xdp_md *xdp)
+SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp)
{
return bpf_redirect_map(&xsk, 0, XDP_DROP);
}
-SEC("xdp") int xsk_xdp_drop(struct xdp_md *xdp)
+SEC("xdp.frags") int xsk_xdp_drop(struct xdp_md *xdp)
{
/* Drop every other packet */
if (idx++ % 2)
@@ -29,7 +29,7 @@ SEC("xdp") int xsk_xdp_drop(struct xdp_md *xdp)
return bpf_redirect_map(&xsk, 0, XDP_DROP);
}
-SEC("xdp") int xsk_xdp_populate_metadata(struct xdp_md *xdp)
+SEC("xdp.frags") int xsk_xdp_populate_metadata(struct xdp_md *xdp)
{
void *data, *data_meta;
struct xdp_info *meta;
diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index c2ad50f26b63..2aa5a3445056 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -171,7 +171,10 @@ exec_xskxceiver
if [ -z $ETH ]; then
cleanup_exit ${VETH0} ${VETH1}
+else
+ cleanup_iface ${ETH} ${MTU}
fi
+
TEST_NAME="XSK_SELFTESTS_${VETH0}_BUSY_POLL"
busy_poll=1
@@ -184,6 +187,8 @@ exec_xskxceiver
if [ -z $ETH ]; then
cleanup_exit ${VETH0} ${VETH1}
+else
+ cleanup_iface ${ETH} ${MTU}
fi
failures=0
diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index 687d83e707f8..d9fb2b730a2c 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -18,17 +18,19 @@
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
+#include <linux/if_link.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
-#include <linux/if_link.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
@@ -81,6 +83,12 @@ struct xsk_socket {
int fd;
};
+struct nl_mtu_req {
+ struct nlmsghdr nh;
+ struct ifinfomsg msg;
+ char buf[512];
+};
+
int xsk_umem__fd(const struct xsk_umem *umem)
{
return umem ? umem->fd : -EINVAL;
@@ -286,6 +294,132 @@ bool xsk_is_in_mode(u32 ifindex, int mode)
return false;
}
+/* Lifted from netlink.c in tools/lib/bpf */
+static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags)
+{
+ int len;
+
+ do {
+ len = recvmsg(sock, mhdr, flags);
+ } while (len < 0 && (errno == EINTR || errno == EAGAIN));
+
+ if (len < 0)
+ return -errno;
+ return len;
+}
+
+/* Lifted from netlink.c in tools/lib/bpf */
+static int alloc_iov(struct iovec *iov, int len)
+{
+ void *nbuf;
+
+ nbuf = realloc(iov->iov_base, len);
+ if (!nbuf)
+ return -ENOMEM;
+
+ iov->iov_base = nbuf;
+ iov->iov_len = len;
+ return 0;
+}
+
+/* Original version lifted from netlink.c in tools/lib/bpf */
+static int netlink_recv(int sock)
+{
+ struct iovec iov = {};
+ struct msghdr mhdr = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ };
+ bool multipart = true;
+ struct nlmsgerr *err;
+ struct nlmsghdr *nh;
+ int len, ret;
+
+ ret = alloc_iov(&iov, 4096);
+ if (ret)
+ goto done;
+
+ while (multipart) {
+ multipart = false;
+ len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC);
+ if (len < 0) {
+ ret = len;
+ goto done;
+ }
+
+ if (len > iov.iov_len) {
+ ret = alloc_iov(&iov, len);
+ if (ret)
+ goto done;
+ }
+
+ len = netlink_recvmsg(sock, &mhdr, 0);
+ if (len < 0) {
+ ret = len;
+ goto done;
+ }
+
+ if (len == 0)
+ break;
+
+ for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len);
+ nh = NLMSG_NEXT(nh, len)) {
+ if (nh->nlmsg_flags & NLM_F_MULTI)
+ multipart = true;
+ switch (nh->nlmsg_type) {
+ case NLMSG_ERROR:
+ err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ if (!err->error)
+ continue;
+ ret = err->error;
+ goto done;
+ case NLMSG_DONE:
+ ret = 0;
+ goto done;
+ default:
+ break;
+ }
+ }
+ }
+ ret = 0;
+done:
+ free(iov.iov_base);
+ return ret;
+}
+
+int xsk_set_mtu(int ifindex, int mtu)
+{
+ struct nl_mtu_req req;
+ struct rtattr *rta;
+ int fd, ret;
+
+ fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
+ if (fd < 0)
+ return fd;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_type = RTM_NEWLINK;
+ req.msg.ifi_family = AF_UNSPEC;
+ req.msg.ifi_index = ifindex;
+ rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len));
+ rta->rta_type = IFLA_MTU;
+ rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
+ req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(mtu));
+ memcpy(RTA_DATA(rta), &mtu, sizeof(mtu));
+
+ ret = send(fd, &req, req.nh.nlmsg_len, 0);
+ if (ret < 0) {
+ close(fd);
+ return errno;
+ }
+
+ ret = netlink_recv(fd);
+ close(fd);
+ return ret;
+}
+
int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags)
{
int prog_fd;
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 8da8d557768b..d93200fdaa8d 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -239,6 +239,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
int xsk_umem__delete(struct xsk_umem *umem);
void xsk_socket__delete(struct xsk_socket *xsk);
+int xsk_set_mtu(int ifindex, int mtu);
+
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh
index ae697a10a056..29175682c44d 100755
--- a/tools/testing/selftests/bpf/xsk_prereqs.sh
+++ b/tools/testing/selftests/bpf/xsk_prereqs.sh
@@ -53,6 +53,13 @@ test_exit()
exit 1
}
+cleanup_iface()
+{
+ ip link set $1 mtu $2
+ ip link set $1 xdp off
+ ip link set $1 xdpgeneric off
+}
+
clear_configs()
{
[ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] &&
diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 218d7f694e5c..3ff436706640 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -49,8 +49,11 @@
* h. tests for invalid and corner case Tx descriptors so that the correct ones
* are discarded and let through, respectively.
* i. 2K frame size tests
- *
- * Total tests: 12
+ * j. If multi-buffer is supported, send 9k packets divided into 3 frames
+ * k. If multi-buffer and huge pages are supported, send 9k packets in a single frame
+ * using unaligned mode
+ * l. If multi-buffer is supported, try various nasty combinations of descriptors to
+ * check if they pass the validation or not
*
* Flow:
* -----
@@ -73,10 +76,10 @@
#include <fcntl.h>
#include <errno.h>
#include <getopt.h>
-#include <asm/barrier.h>
#include <linux/if_link.h>
#include <linux/if_ether.h>
#include <linux/mman.h>
+#include <linux/netdev.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <locale.h>
@@ -91,7 +94,6 @@
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
-#include <time.h>
#include <unistd.h>
#include "xsk_xdp_progs.skel.h"
@@ -253,6 +255,8 @@ static int __xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_i
cfg.bind_flags = ifobject->bind_flags;
if (shared)
cfg.bind_flags |= XDP_SHARED_UMEM;
+ if (ifobject->pkt_stream && ifobject->mtu > MAX_ETH_PKT_SIZE)
+ cfg.bind_flags |= XDP_USE_SG;
txr = ifobject->tx_on ? &xsk->tx : NULL;
rxr = ifobject->rx_on ? &xsk->rx : NULL;
@@ -415,6 +419,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx,
test->total_steps = 1;
test->nb_sockets = 1;
test->fail = false;
+ test->mtu = MAX_ETH_PKT_SIZE;
test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog;
test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk;
test->xdp_prog_tx = ifobj_tx->xdp_progs->progs.xsk_def_prog;
@@ -468,6 +473,26 @@ static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *x
test->xskmap_tx = xskmap_tx;
}
+static int test_spec_set_mtu(struct test_spec *test, int mtu)
+{
+ int err;
+
+ if (test->ifobj_rx->mtu != mtu) {
+ err = xsk_set_mtu(test->ifobj_rx->ifindex, mtu);
+ if (err)
+ return err;
+ test->ifobj_rx->mtu = mtu;
+ }
+ if (test->ifobj_tx->mtu != mtu) {
+ err = xsk_set_mtu(test->ifobj_tx->ifindex, mtu);
+ if (err)
+ return err;
+ test->ifobj_tx->mtu = mtu;
+ }
+
+ return 0;
+}
+
static void pkt_stream_reset(struct pkt_stream *pkt_stream)
{
if (pkt_stream)
@@ -533,23 +558,49 @@ static struct pkt_stream *__pkt_stream_alloc(u32 nb_pkts)
return pkt_stream;
}
+static bool pkt_continues(u32 options)
+{
+ return options & XDP_PKT_CONTD;
+}
+
static u32 ceil_u32(u32 a, u32 b)
{
return (a + b - 1) / b;
}
-static u32 pkt_nb_frags(u32 frame_size, struct pkt *pkt)
+static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pkt *pkt)
{
- if (!pkt || !pkt->valid)
+ u32 nb_frags = 1, next_frag;
+
+ if (!pkt)
return 1;
- return ceil_u32(pkt->len, frame_size);
+
+ if (!pkt_stream->verbatim) {
+ if (!pkt->valid || !pkt->len)
+ return 1;
+ return ceil_u32(pkt->len, frame_size);
+ }
+
+ /* Search for the end of the packet in verbatim mode */
+ if (!pkt_continues(pkt->options))
+ return nb_frags;
+
+ next_frag = pkt_stream->current_pkt_nb;
+ pkt++;
+ while (next_frag++ < pkt_stream->nb_pkts) {
+ nb_frags++;
+ if (!pkt_continues(pkt->options) || !pkt->valid)
+ break;
+ pkt++;
+ }
+ return nb_frags;
}
static void pkt_set(struct xsk_umem_info *umem, struct pkt *pkt, int offset, u32 len)
{
pkt->offset = offset;
pkt->len = len;
- if (len > umem->frame_size - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 2 - umem->frame_headroom)
+ if (len > MAX_ETH_JUMBO_SIZE)
pkt->valid = false;
else
pkt->valid = true;
@@ -637,6 +688,11 @@ static u64 pkt_get_addr(struct pkt *pkt, struct xsk_umem_info *umem)
return pkt->offset + umem_alloc_buffer(umem);
}
+static void pkt_stream_cancel(struct pkt_stream *pkt_stream)
+{
+ pkt_stream->current_pkt_nb--;
+}
+
static void pkt_generate(struct ifobject *ifobject, u64 addr, u32 len, u32 pkt_nb,
u32 bytes_written)
{
@@ -657,34 +713,59 @@ static void pkt_generate(struct ifobject *ifobject, u64 addr, u32 len, u32 pkt_n
write_payload(data, pkt_nb, bytes_written, len);
}
-static void __pkt_stream_generate_custom(struct ifobject *ifobj,
- struct pkt *pkts, u32 nb_pkts)
+static struct pkt_stream *__pkt_stream_generate_custom(struct ifobject *ifobj, struct pkt *frames,
+ u32 nb_frames, bool verbatim)
{
+ u32 i, len = 0, pkt_nb = 0, payload = 0;
struct pkt_stream *pkt_stream;
- u32 i;
- pkt_stream = __pkt_stream_alloc(nb_pkts);
+ pkt_stream = __pkt_stream_alloc(nb_frames);
if (!pkt_stream)
exit_with_error(ENOMEM);
- for (i = 0; i < nb_pkts; i++) {
- struct pkt *pkt = &pkt_stream->pkts[i];
+ for (i = 0; i < nb_frames; i++) {
+ struct pkt *pkt = &pkt_stream->pkts[pkt_nb];
+ struct pkt *frame = &frames[i];
- pkt->offset = pkts[i].offset;
- pkt->len = pkts[i].len;
- pkt->pkt_nb = i;
- pkt->valid = pkts[i].valid;
- if (pkt->len > pkt_stream->max_pkt_len)
+ pkt->offset = frame->offset;
+ if (verbatim) {
+ *pkt = *frame;
+ pkt->pkt_nb = payload;
+ if (!frame->valid || !pkt_continues(frame->options))
+ payload++;
+ } else {
+ if (frame->valid)
+ len += frame->len;
+ if (frame->valid && pkt_continues(frame->options))
+ continue;
+
+ pkt->pkt_nb = pkt_nb;
+ pkt->len = len;
+ pkt->valid = frame->valid;
+ pkt->options = 0;
+
+ len = 0;
+ }
+
+ if (pkt->valid && pkt->len > pkt_stream->max_pkt_len)
pkt_stream->max_pkt_len = pkt->len;
+ pkt_nb++;
}
- ifobj->pkt_stream = pkt_stream;
+ pkt_stream->nb_pkts = pkt_nb;
+ pkt_stream->verbatim = verbatim;
+ return pkt_stream;
}
static void pkt_stream_generate_custom(struct test_spec *test, struct pkt *pkts, u32 nb_pkts)
{
- __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts);
- __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts);
+ struct pkt_stream *pkt_stream;
+
+ pkt_stream = __pkt_stream_generate_custom(test->ifobj_tx, pkts, nb_pkts, true);
+ test->ifobj_tx->pkt_stream = pkt_stream;
+
+ pkt_stream = __pkt_stream_generate_custom(test->ifobj_rx, pkts, nb_pkts, false);
+ test->ifobj_rx->pkt_stream = pkt_stream;
}
static void pkt_print_data(u32 *data, u32 cnt)
@@ -765,43 +846,76 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr)
return true;
}
-static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len)
+static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb,
+ u32 bytes_processed)
{
- void *data = xsk_umem__get_data(buffer, addr);
- u32 seqnum, pkt_data;
+ u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum;
+ void *data = xsk_umem__get_data(umem->buffer, addr);
- if (!pkt) {
- ksft_print_msg("[%s] too many packets received\n", __func__);
- goto error;
+ addr -= umem->base_addr;
+
+ if (addr >= umem->num_frames * umem->frame_size ||
+ addr + len > umem->num_frames * umem->frame_size) {
+ ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len);
+ return false;
+ }
+ if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) {
+ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len);
+ return false;
}
- if (len < MIN_PKT_SIZE || pkt->len < MIN_PKT_SIZE) {
- /* Do not try to verify packets that are smaller than minimum size. */
- return true;
+ pkt_data = data;
+ if (!bytes_processed) {
+ pkt_data += PKT_HDR_SIZE / sizeof(*pkt_data);
+ len -= PKT_HDR_SIZE;
+ } else {
+ bytes_processed -= PKT_HDR_SIZE;
}
- if (pkt->len != len) {
- ksft_print_msg("[%s] expected length [%d], got length [%d]\n",
- __func__, pkt->len, len);
+ expected_seqnum = bytes_processed / sizeof(*pkt_data);
+ seqnum = ntohl(*pkt_data) & 0xffff;
+ pkt_nb = ntohl(*pkt_data) >> 16;
+
+ if (expected_pkt_nb != pkt_nb) {
+ ksft_print_msg("[%s] expected pkt_nb [%u], got pkt_nb [%u]\n",
+ __func__, expected_pkt_nb, pkt_nb);
+ goto error;
+ }
+ if (expected_seqnum != seqnum) {
+ ksft_print_msg("[%s] expected seqnum at start [%u], got seqnum [%u]\n",
+ __func__, expected_seqnum, seqnum);
goto error;
}
- pkt_data = ntohl(*((u32 *)(data + PKT_HDR_SIZE)));
- seqnum = pkt_data >> 16;
-
- if (pkt->pkt_nb != seqnum) {
- ksft_print_msg("[%s] expected seqnum [%d], got seqnum [%d]\n",
- __func__, pkt->pkt_nb, seqnum);
+ words_to_end = len / sizeof(*pkt_data) - 1;
+ pkt_data += words_to_end;
+ seqnum = ntohl(*pkt_data) & 0xffff;
+ expected_seqnum += words_to_end;
+ if (expected_seqnum != seqnum) {
+ ksft_print_msg("[%s] expected seqnum at end [%u], got seqnum [%u]\n",
+ __func__, expected_seqnum, seqnum);
goto error;
}
return true;
error:
- pkt_dump(data, len, true);
+ pkt_dump(data, len, !bytes_processed);
return false;
}
+static bool is_pkt_valid(struct pkt *pkt, void *buffer, u64 addr, u32 len)
+{
+ if (pkt->len != len) {
+ ksft_print_msg("[%s] expected packet length [%d], got length [%d]\n",
+ __func__, pkt->len, len);
+ pkt_dump(xsk_umem__get_data(buffer, addr), len, true);
+ return false;
+ }
+
+ return true;
+}
+
static void kick_tx(struct xsk_socket_info *xsk)
{
int ret;
@@ -854,8 +968,8 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
{
struct timeval tv_end, tv_now, tv_timeout = {THREAD_TMOUT, 0};
struct pkt_stream *pkt_stream = test->ifobj_rx->pkt_stream;
- u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkts_sent = 0;
struct xsk_socket_info *xsk = test->ifobj_rx->xsk;
+ u32 idx_rx = 0, idx_fq = 0, rcvd, pkts_sent = 0;
struct ifobject *ifobj = test->ifobj_rx;
struct xsk_umem_info *umem = xsk->umem;
struct pkt *pkt;
@@ -868,6 +982,9 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent);
while (pkt) {
+ u32 frags_processed = 0, nb_frags = 0, pkt_len = 0;
+ u64 first_addr;
+
ret = gettimeofday(&tv_now, NULL);
if (ret)
exit_with_error(errno);
@@ -888,7 +1005,6 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__);
return TEST_FAILURE;
-
}
if (!(fds->revents & POLLIN))
@@ -913,27 +1029,59 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
}
}
- for (i = 0; i < rcvd; i++) {
+ while (frags_processed < rcvd) {
const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++);
u64 addr = desc->addr, orig;
orig = xsk_umem__extract_addr(addr);
addr = xsk_umem__add_offset_to_addr(addr);
- if (!is_pkt_valid(pkt, umem->buffer, addr, desc->len) ||
+ if (!pkt) {
+ ksft_print_msg("[%s] received too many packets addr: %lx len %u\n",
+ __func__, addr, desc->len);
+ return TEST_FAILURE;
+ }
+
+ if (!is_frag_valid(umem, addr, desc->len, pkt->pkt_nb, pkt_len) ||
!is_offset_correct(umem, pkt, addr) ||
(ifobj->use_metadata && !is_metadata_correct(pkt, umem->buffer, addr)))
return TEST_FAILURE;
+ if (!nb_frags++)
+ first_addr = addr;
+ frags_processed++;
+ pkt_len += desc->len;
if (ifobj->use_fill_ring)
*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = orig;
+
+ if (pkt_continues(desc->options))
+ continue;
+
+ /* The complete packet has been received */
+ if (!is_pkt_valid(pkt, umem->buffer, first_addr, pkt_len) ||
+ !is_offset_correct(umem, pkt, addr))
+ return TEST_FAILURE;
+
pkt = pkt_stream_get_next_rx_pkt(pkt_stream, &pkts_sent);
+ nb_frags = 0;
+ pkt_len = 0;
+ }
+
+ if (nb_frags) {
+ /* In the middle of a packet. Start over from beginning of packet. */
+ idx_rx -= nb_frags;
+ xsk_ring_cons__cancel(&xsk->rx, nb_frags);
+ if (ifobj->use_fill_ring) {
+ idx_fq -= nb_frags;
+ xsk_ring_prod__cancel(&umem->fq, nb_frags);
+ }
+ frags_processed -= nb_frags;
}
if (ifobj->use_fill_ring)
- xsk_ring_prod__submit(&umem->fq, rcvd);
+ xsk_ring_prod__submit(&umem->fq, frags_processed);
if (ifobj->release_rx)
- xsk_ring_cons__release(&xsk->rx, rcvd);
+ xsk_ring_cons__release(&xsk->rx, frags_processed);
pthread_mutex_lock(&pacing_mutex);
pkts_in_flight -= pkts_sent;
@@ -946,13 +1094,14 @@ static int receive_pkts(struct test_spec *test, struct pollfd *fds)
static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeout)
{
+ u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len;
+ struct pkt_stream *pkt_stream = ifobject->pkt_stream;
struct xsk_socket_info *xsk = ifobject->xsk;
struct xsk_umem_info *umem = ifobject->umem;
- u32 i, idx = 0, valid_pkts = 0, buffer_len;
bool use_poll = ifobject->use_poll;
int ret;
- buffer_len = pkt_get_buffer_len(umem, ifobject->pkt_stream->max_pkt_len);
+ buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len);
/* pkts_in_flight might be negative if many invalid packets are sent */
if (pkts_in_flight >= (int)((umem_size(umem) - BATCH_SIZE * buffer_len) / buffer_len)) {
kick_tx(xsk);
@@ -983,17 +1132,49 @@ static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeo
}
for (i = 0; i < BATCH_SIZE; i++) {
- struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);
- struct pkt *pkt = pkt_stream_get_next_tx_pkt(ifobject->pkt_stream);
+ struct pkt *pkt = pkt_stream_get_next_tx_pkt(pkt_stream);
+ u32 nb_frags_left, nb_frags, bytes_written = 0;
if (!pkt)
break;
- tx_desc->addr = pkt_get_addr(pkt, umem);
- tx_desc->len = pkt->len;
- if (pkt->valid) {
+ nb_frags = pkt_nb_frags(umem->frame_size, pkt_stream, pkt);
+ if (nb_frags > BATCH_SIZE - i) {
+ pkt_stream_cancel(pkt_stream);
+ xsk_ring_prod__cancel(&xsk->tx, BATCH_SIZE - i);
+ break;
+ }
+ nb_frags_left = nb_frags;
+
+ while (nb_frags_left--) {
+ struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i);
+
+ tx_desc->addr = pkt_get_addr(pkt, ifobject->umem);
+ if (pkt_stream->verbatim) {
+ tx_desc->len = pkt->len;
+ tx_desc->options = pkt->options;
+ } else if (nb_frags_left) {
+ tx_desc->len = umem->frame_size;
+ tx_desc->options = XDP_PKT_CONTD;
+ } else {
+ tx_desc->len = pkt->len - bytes_written;
+ tx_desc->options = 0;
+ }
+ if (pkt->valid)
+ pkt_generate(ifobject, tx_desc->addr, tx_desc->len, pkt->pkt_nb,
+ bytes_written);
+ bytes_written += tx_desc->len;
+
+ if (nb_frags_left) {
+ i++;
+ if (pkt_stream->verbatim)
+ pkt = pkt_stream_get_next_tx_pkt(pkt_stream);
+ }
+ }
+
+ if (pkt && pkt->valid) {
valid_pkts++;
- pkt_generate(ifobject, tx_desc->addr, tx_desc->len, pkt->pkt_nb, 0);
+ valid_frags += nb_frags;
}
}
@@ -1002,7 +1183,7 @@ static int __send_pkts(struct ifobject *ifobject, struct pollfd *fds, bool timeo
pthread_mutex_unlock(&pacing_mutex);
xsk_ring_prod__submit(&xsk->tx, i);
- xsk->outstanding_tx += valid_pkts;
+ xsk->outstanding_tx += valid_frags;
if (use_poll) {
ret = poll(fds, 1, POLL_TMOUT);
@@ -1222,7 +1403,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream
u64 addr;
u32 i;
- for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt); i++) {
+ for (i = 0; i < pkt_nb_frags(rx_frame_size, pkt_stream, pkt); i++) {
if (!pkt) {
if (!fill_up)
break;
@@ -1415,6 +1596,25 @@ static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *i
struct ifobject *ifobj2)
{
pthread_t t0, t1;
+ int err;
+
+ if (test->mtu > MAX_ETH_PKT_SIZE) {
+ if (test->mode == TEST_MODE_ZC && (!ifobj1->multi_buff_zc_supp ||
+ (ifobj2 && !ifobj2->multi_buff_zc_supp))) {
+ ksft_test_result_skip("Multi buffer for zero-copy not supported.\n");
+ return TEST_SKIP;
+ }
+ if (test->mode != TEST_MODE_ZC && (!ifobj1->multi_buff_supp ||
+ (ifobj2 && !ifobj2->multi_buff_supp))) {
+ ksft_test_result_skip("Multi buffer not supported.\n");
+ return TEST_SKIP;
+ }
+ }
+ err = test_spec_set_mtu(test, test->mtu);
+ if (err) {
+ ksft_print_msg("Error, could not set mtu.\n");
+ exit_with_error(err);
+ }
if (ifobj2) {
if (pthread_barrier_init(&barr, NULL, 2))
@@ -1616,6 +1816,16 @@ static int testapp_unaligned(struct test_spec *test)
return testapp_validate_traffic(test);
}
+static int testapp_unaligned_mb(struct test_spec *test)
+{
+ test_spec_set_name(test, "UNALIGNED_MODE_9K");
+ test->mtu = MAX_ETH_JUMBO_SIZE;
+ test->ifobj_tx->umem->unaligned_mode = true;
+ test->ifobj_rx->umem->unaligned_mode = true;
+ pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
+ return testapp_validate_traffic(test);
+}
+
static int testapp_single_pkt(struct test_spec *test)
{
struct pkt pkts[] = {{0, MIN_PKT_SIZE, 0, true}};
@@ -1624,6 +1834,55 @@ static int testapp_single_pkt(struct test_spec *test)
return testapp_validate_traffic(test);
}
+static int testapp_multi_buffer(struct test_spec *test)
+{
+ test_spec_set_name(test, "RUN_TO_COMPLETION_9K_PACKETS");
+ test->mtu = MAX_ETH_JUMBO_SIZE;
+ pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE);
+
+ return testapp_validate_traffic(test);
+}
+
+static int testapp_invalid_desc_mb(struct test_spec *test)
+{
+ struct xsk_umem_info *umem = test->ifobj_tx->umem;
+ u64 umem_size = umem->num_frames * umem->frame_size;
+ struct pkt pkts[] = {
+ /* Valid packet for synch to start with */
+ {0, MIN_PKT_SIZE, 0, true, 0},
+ /* Zero frame len is not legal */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, 0, 0, false, 0},
+ /* Invalid address in the second frame */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ /* Invalid len in the middle */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ /* Invalid options in the middle */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XSK_DESC__INVALID_OPTION},
+ /* Transmit 2 frags, receive 3 */
+ {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, XDP_PKT_CONTD},
+ {0, XSK_UMEM__MAX_FRAME_SIZE, 0, true, 0},
+ /* Middle frame crosses chunk boundary with small length */
+ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD},
+ {-MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false, 0},
+ /* Valid packet for synch so that something is received */
+ {0, MIN_PKT_SIZE, 0, true, 0}};
+
+ if (umem->unaligned_mode) {
+ /* Crossing a chunk boundary allowed */
+ pkts[12].valid = true;
+ pkts[13].valid = true;
+ }
+
+ test->mtu = MAX_ETH_JUMBO_SIZE;
+ pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts));
+ return testapp_validate_traffic(test);
+}
+
static int testapp_invalid_desc(struct test_spec *test)
{
struct xsk_umem_info *umem = test->ifobj_tx->umem;
@@ -1690,7 +1949,6 @@ static int testapp_xdp_metadata_count(struct test_spec *test)
int count = 0;
int key = 0;
- test_spec_set_name(test, "XDP_METADATA_COUNT");
test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_populate_metadata,
skel_tx->progs.xsk_xdp_populate_metadata,
skel_rx->maps.xsk, skel_tx->maps.xsk);
@@ -1724,6 +1982,48 @@ static int testapp_poll_rxq_tmout(struct test_spec *test)
return testapp_validate_traffic_single_thread(test, test->ifobj_rx);
}
+static int testapp_too_many_frags(struct test_spec *test)
+{
+ struct pkt pkts[2 * XSK_DESC__MAX_SKB_FRAGS + 2] = {};
+ u32 max_frags, i;
+
+ test_spec_set_name(test, "TOO_MANY_FRAGS");
+ if (test->mode == TEST_MODE_ZC)
+ max_frags = test->ifobj_tx->xdp_zc_max_segs;
+ else
+ max_frags = XSK_DESC__MAX_SKB_FRAGS;
+
+ test->mtu = MAX_ETH_JUMBO_SIZE;
+
+ /* Valid packet for synch */
+ pkts[0].len = MIN_PKT_SIZE;
+ pkts[0].valid = true;
+
+ /* One valid packet with the max amount of frags */
+ for (i = 1; i < max_frags + 1; i++) {
+ pkts[i].len = MIN_PKT_SIZE;
+ pkts[i].options = XDP_PKT_CONTD;
+ pkts[i].valid = true;
+ }
+ pkts[max_frags].options = 0;
+
+ /* An invalid packet with the max amount of frags but signals packet
+ * continues on the last frag
+ */
+ for (i = max_frags + 1; i < 2 * max_frags + 1; i++) {
+ pkts[i].len = MIN_PKT_SIZE;
+ pkts[i].options = XDP_PKT_CONTD;
+ pkts[i].valid = false;
+ }
+
+ /* Valid packet for synch */
+ pkts[2 * max_frags + 1].len = MIN_PKT_SIZE;
+ pkts[2 * max_frags + 1].valid = true;
+
+ pkt_stream_generate_custom(test, pkts, 2 * max_frags + 2);
+ return testapp_validate_traffic(test);
+}
+
static int xsk_load_xdp_programs(struct ifobject *ifobj)
{
ifobj->xdp_progs = xsk_xdp_progs__open_and_load();
@@ -1757,6 +2057,7 @@ static bool hugepages_present(void)
static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac,
thread_func_t func_ptr)
{
+ LIBBPF_OPTS(bpf_xdp_query_opts, query_opts);
int err;
memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN);
@@ -1772,6 +2073,22 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *
if (hugepages_present())
ifobj->unaligned_supp = true;
+
+ err = bpf_xdp_query(ifobj->ifindex, XDP_FLAGS_DRV_MODE, &query_opts);
+ if (err) {
+ ksft_print_msg("Error querrying XDP capabilities\n");
+ exit_with_error(-err);
+ }
+ if (query_opts.feature_flags & NETDEV_XDP_ACT_RX_SG)
+ ifobj->multi_buff_supp = true;
+ if (query_opts.feature_flags & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
+ if (query_opts.xdp_zc_max_segs > 1) {
+ ifobj->multi_buff_zc_supp = true;
+ ifobj->xdp_zc_max_segs = query_opts.xdp_zc_max_segs;
+ } else {
+ ifobj->xdp_zc_max_segs = 0;
+ }
+ }
}
static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_type type)
@@ -1804,6 +2121,9 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
test_spec_set_name(test, "RUN_TO_COMPLETION");
ret = testapp_validate_traffic(test);
break;
+ case TEST_TYPE_RUN_TO_COMPLETION_MB:
+ ret = testapp_multi_buffer(test);
+ break;
case TEST_TYPE_RUN_TO_COMPLETION_SINGLE_PKT:
test_spec_set_name(test, "RUN_TO_COMPLETION_SINGLE_PKT");
ret = testapp_single_pkt(test);
@@ -1866,9 +2186,22 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
ret = testapp_invalid_desc(test);
break;
}
+ case TEST_TYPE_ALIGNED_INV_DESC_MB:
+ test_spec_set_name(test, "ALIGNED_INV_DESC_MULTI_BUFF");
+ ret = testapp_invalid_desc_mb(test);
+ break;
+ case TEST_TYPE_UNALIGNED_INV_DESC_MB:
+ test_spec_set_name(test, "UNALIGNED_INV_DESC_MULTI_BUFF");
+ test->ifobj_tx->umem->unaligned_mode = true;
+ test->ifobj_rx->umem->unaligned_mode = true;
+ ret = testapp_invalid_desc_mb(test);
+ break;
case TEST_TYPE_UNALIGNED:
ret = testapp_unaligned(test);
break;
+ case TEST_TYPE_UNALIGNED_MB:
+ ret = testapp_unaligned_mb(test);
+ break;
case TEST_TYPE_HEADROOM:
ret = testapp_headroom(test);
break;
@@ -1876,8 +2209,17 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_
ret = testapp_xdp_drop(test);
break;
case TEST_TYPE_XDP_METADATA_COUNT:
+ test_spec_set_name(test, "XDP_METADATA_COUNT");
+ ret = testapp_xdp_metadata_count(test);
+ break;
+ case TEST_TYPE_XDP_METADATA_COUNT_MB:
+ test_spec_set_name(test, "XDP_METADATA_COUNT_MULTI_BUFF");
+ test->mtu = MAX_ETH_JUMBO_SIZE;
ret = testapp_xdp_metadata_count(test);
break;
+ case TEST_TYPE_TOO_MANY_FRAGS:
+ ret = testapp_too_many_frags(test);
+ break;
default:
break;
}
diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h
index aaf27e067640..233b66cef64a 100644
--- a/tools/testing/selftests/bpf/xskxceiver.h
+++ b/tools/testing/selftests/bpf/xskxceiver.h
@@ -38,6 +38,8 @@
#define MAX_TEARDOWN_ITER 10
#define PKT_HDR_SIZE (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */
#define MIN_PKT_SIZE 64
+#define MAX_ETH_PKT_SIZE 1518
+#define MAX_ETH_JUMBO_SIZE 9000
#define USLEEP_MAX 10000
#define SOCK_RECONF_CTR 10
#define BATCH_SIZE 64
@@ -47,7 +49,11 @@
#define DEFAULT_UMEM_BUFFERS (DEFAULT_PKT_CNT / 4)
#define RX_FULL_RXQSIZE 32
#define UMEM_HEADROOM_TEST_SIZE 128
-#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1)
+#define XSK_UMEM__INVALID_FRAME_SIZE (MAX_ETH_JUMBO_SIZE + 1)
+#define XSK_UMEM__LARGE_FRAME_SIZE (3 * 1024)
+#define XSK_UMEM__MAX_FRAME_SIZE (4 * 1024)
+#define XSK_DESC__INVALID_OPTION (0xffff)
+#define XSK_DESC__MAX_SKB_FRAGS 18
#define HUGEPAGE_SIZE (2 * 1024 * 1024)
#define PKT_DUMP_NB_TO_PRINT 16
@@ -83,6 +89,12 @@ enum test_type {
TEST_TYPE_BPF_RES,
TEST_TYPE_XDP_DROP_HALF,
TEST_TYPE_XDP_METADATA_COUNT,
+ TEST_TYPE_XDP_METADATA_COUNT_MB,
+ TEST_TYPE_RUN_TO_COMPLETION_MB,
+ TEST_TYPE_UNALIGNED_MB,
+ TEST_TYPE_ALIGNED_INV_DESC_MB,
+ TEST_TYPE_UNALIGNED_INV_DESC_MB,
+ TEST_TYPE_TOO_MANY_FRAGS,
TEST_TYPE_MAX
};
@@ -115,6 +127,7 @@ struct pkt {
u32 len;
u32 pkt_nb;
bool valid;
+ u16 options;
};
struct pkt_stream {
@@ -122,6 +135,7 @@ struct pkt_stream {
u32 current_pkt_nb;
struct pkt *pkts;
u32 max_pkt_len;
+ bool verbatim;
};
struct ifobject;
@@ -141,7 +155,9 @@ struct ifobject {
struct bpf_program *xdp_prog;
enum test_mode mode;
int ifindex;
+ int mtu;
u32 bind_flags;
+ u32 xdp_zc_max_segs;
bool tx_on;
bool rx_on;
bool use_poll;
@@ -151,6 +167,8 @@ struct ifobject {
bool shared_umem;
bool use_metadata;
bool unaligned_supp;
+ bool multi_buff_supp;
+ bool multi_buff_zc_supp;
u8 dst_mac[ETH_ALEN];
u8 src_mac[ETH_ALEN];
};
@@ -164,6 +182,7 @@ struct test_spec {
struct bpf_program *xdp_prog_tx;
struct bpf_map *xskmap_rx;
struct bpf_map *xskmap_tx;
+ int mtu;
u16 total_steps;
u16 current_step;
u16 nb_sockets;