Diffstat (limited to 'drivers/infiniband/hw')
59 files changed, 4369 insertions, 365 deletions
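For reference, a minimal .config fragment that would build the new RoCE driver as a module, assuming only the symbols named in the Kconfig entry added below (the driver is tristate, depends on 64BIT, INET, DCB and the BNGE Ethernet driver, and produces a module called bng_re):

    CONFIG_INFINIBAND_BNG_RE=m
    # prerequisites listed in drivers/infiniband/hw/bng_re/Kconfig
    CONFIG_INET=y
    CONFIG_DCB=y
    CONFIG_BNGE=m

Per the auxiliary_device_id table in bng_dev.c, the loaded module registers an auxiliary driver named "rdma" and binds to the "bng_en.rdma" auxiliary device exposed by the bng_en Ethernet driver.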
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index b706dc0d0263..c42b22ac3303 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,5 +13,6 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ obj-$(CONFIG_INFINIBAND_IONIC) += ionic/ diff --git a/drivers/infiniband/hw/bng_re/Kconfig b/drivers/infiniband/hw/bng_re/Kconfig new file mode 100644 index 000000000000..85845f72c64d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_BNG_RE + tristate "Broadcom Next generation RoCE HCA support" + depends on 64BIT + depends on INET && DCB && BNGE + help + This driver supports Broadcom Next generation + 50/100/200/400/800 gigabit RoCE HCAs. The module + will be called bng_re. To compile this driver + as a module, choose M here. diff --git a/drivers/infiniband/hw/bng_re/Makefile b/drivers/infiniband/hw/bng_re/Makefile new file mode 100644 index 000000000000..c6aaaf853c77 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnge -I $(srctree)/drivers/infiniband/hw/bnxt_re + +obj-$(CONFIG_INFINIBAND_BNG_RE) += bng_re.o + +bng_re-y := bng_dev.o bng_fw.o \ + bng_res.o bng_sp.o \ + bng_debugfs.o diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.c b/drivers/infiniband/hw/bng_re/bng_debugfs.c new file mode 100644 index 000000000000..9ec5a8785250 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/debugfs.h> +#include <linux/pci.h> + +#include <rdma/ib_verbs.h> + +#include "bng_res.h" +#include "bng_fw.h" +#include "bnge.h" +#include "bnge_auxr.h" +#include "bng_re.h" +#include "bng_debugfs.h" + +static struct dentry *bng_re_debugfs_root; + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->aux_dev->pdev; + + rdev->dbg_root = + debugfs_create_dir(dev_name(&pdev->dev), bng_re_debugfs_root); +} + +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->dbg_root); + rdev->dbg_root = NULL; +} + +void bng_re_register_debugfs(void) +{ + bng_re_debugfs_root = debugfs_create_dir("bng_re", NULL); +} + +void bng_re_unregister_debugfs(void) +{ + debugfs_remove(bng_re_debugfs_root); +} diff --git a/drivers/infiniband/hw/bng_re/bng_debugfs.h b/drivers/infiniband/hw/bng_re/bng_debugfs.h new file mode 100644 index 000000000000..baef71df4242 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_debugfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_RE_DEBUGFS__ +#define __BNG_RE_DEBUGFS__ + +void bng_re_debugfs_add_pdev(struct bng_re_dev *rdev); +void bng_re_debugfs_rem_pdev(struct bng_re_dev *rdev); + +void bng_re_register_debugfs(void); +void bng_re_unregister_debugfs(void); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_dev.c b/drivers/infiniband/hw/bng_re/bng_dev.c new file mode 100644 index 000000000000..d8f8d7f7075f --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_dev.c @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
+ +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/auxiliary_bus.h> + +#include <rdma/ib_verbs.h> + +#include "bng_res.h" +#include "bng_sp.h" +#include "bng_fw.h" +#include "bnge.h" +#include "bnge_auxr.h" +#include "bng_re.h" +#include "bnge_hwrm.h" +#include "bng_debugfs.h" + +MODULE_AUTHOR("Siva Reddy Kallam <siva.kallam@broadcom.com>"); +MODULE_DESCRIPTION(BNG_RE_DESC); +MODULE_LICENSE("Dual BSD/GPL"); + +static struct bng_re_dev *bng_re_dev_add(struct auxiliary_device *adev, + struct bnge_auxr_dev *aux_dev) +{ + struct bng_re_dev *rdev; + + /* Allocate bng_re_dev instance */ + rdev = ib_alloc_device(bng_re_dev, ibdev); + if (!rdev) { + pr_err("%s: bng_re_dev allocation failure!", KBUILD_MODNAME); + return NULL; + } + + /* Assign auxiliary device specific data */ + rdev->netdev = aux_dev->net; + rdev->aux_dev = aux_dev; + rdev->adev = adev; + rdev->fn_id = rdev->aux_dev->pdev->devfn; + + return rdev; +} + + +static int bng_re_register_netdev(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev; + + aux_dev = rdev->aux_dev; + return bnge_register_dev(aux_dev, rdev->adev); +} + +static void bng_re_destroy_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + + if (!rdev->chip_ctx) + return; + + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + + chip_ctx = rdev->chip_ctx; + rdev->chip_ctx = NULL; + rdev->rcfw.res = NULL; + rdev->bng_res.cctx = NULL; + rdev->bng_res.pdev = NULL; + kfree(chip_ctx); +} + +static int bng_re_setup_chip_ctx(struct bng_re_dev *rdev) +{ + struct bng_re_chip_ctx *chip_ctx; + struct bnge_auxr_dev *aux_dev; + int rc = -ENOMEM; + + aux_dev = rdev->aux_dev; + rdev->bng_res.pdev = aux_dev->pdev; + rdev->rcfw.res = &rdev->bng_res; + chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); + if (!chip_ctx) + return -ENOMEM; + chip_ctx->chip_num = aux_dev->chip_num; + chip_ctx->hw_stats_size = aux_dev->hw_ring_stats_size; + + rdev->chip_ctx = chip_ctx; + rdev->bng_res.cctx = rdev->chip_ctx; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->bng_res.dattr = rdev->dev_attr; + + return 0; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; +} + +static void bng_re_init_hwrm_hdr(struct input *hdr, u16 opcd) +{ + hdr->req_type = cpu_to_le16(opcd); + hdr->cmpl_ring = cpu_to_le16(-1); + hdr->target_id = cpu_to_le16(-1); +} + +static void bng_re_fill_fw_msg(struct bnge_fw_msg *fw_msg, void *msg, + int msg_len, void *resp, int resp_max_len, + int timeout) +{ + fw_msg->msg = msg; + fw_msg->msg_len = msg_len; + fw_msg->resp = resp; + fw_msg->resp_max_len = resp_max_len; + fw_msg->timeout = timeout; +} + +static int bng_re_net_ring_free(struct bng_re_dev *rdev, + u16 fw_ring_id, int type) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ring_free_input req = {}; + struct hwrm_ring_free_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!rdev) + return rc; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); + req.ring_type = type; + req.ring_id = cpu_to_le16(fw_ring_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", + req.ring_id, rc); + return rc; +} + +static int bng_re_net_ring_alloc(struct bng_re_dev *rdev, + struct bng_re_ring_attr *ring_attr, + u16 *fw_ring_id) +{ + struct bnge_auxr_dev 
*aux_dev = rdev->aux_dev; + struct hwrm_ring_alloc_input req = {}; + struct hwrm_ring_alloc_output resp; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); + req.enables = 0; + req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]); + if (ring_attr->pages > 1) { + /* Page size is in log2 units */ + req.page_size = BNGE_PAGE_SHIFT; + req.page_tbl_depth = 1; + } + req.fbo = 0; + /* Association of ring index with doorbell index and MSIX number */ + req.logical_id = cpu_to_le16(ring_attr->lrid); + req.length = cpu_to_le32(ring_attr->depth + 1); + req.ring_type = ring_attr->type; + req.int_mode = ring_attr->mode; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + *fw_ring_id = le16_to_cpu(resp.ring_id); + + return rc; +} + +static int bng_re_stats_ctx_free(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_stat_ctx_free_input req = {}; + struct hwrm_stat_ctx_free_output resp = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); + req.stat_ctx_id = cpu_to_le32(rdev->stats_ctx.fw_id); + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) + ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", + rc); + + return rc; +} + +static int bng_re_stats_ctx_alloc(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct bng_re_stats *stats = &rdev->stats_ctx; + struct hwrm_stat_ctx_alloc_output resp = {}; + struct hwrm_stat_ctx_alloc_input req = {}; + struct bnge_fw_msg fw_msg = {}; + int rc = -EINVAL; + + stats->fw_id = BNGE_INVALID_STATS_CTX_ID; + + if (!aux_dev) + return rc; + + bng_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); + req.update_period_ms = cpu_to_le32(1000); + req.stats_dma_addr = cpu_to_le64(stats->dma_map); + req.stats_dma_length = cpu_to_le16(rdev->chip_ctx->hw_stats_size); + req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; + bng_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (!rc) + stats->fw_id = le32_to_cpu(resp.stat_ctx_id); + return rc; +} + +static void bng_re_query_hwrm_version(struct bng_re_dev *rdev) +{ + struct bnge_auxr_dev *aux_dev = rdev->aux_dev; + struct hwrm_ver_get_output ver_get_resp = {}; + struct hwrm_ver_get_input ver_get_req = {}; + struct bng_re_chip_ctx *cctx; + struct bnge_fw_msg fw_msg = {}; + int rc; + + bng_re_init_hwrm_hdr((void *)&ver_get_req, HWRM_VER_GET); + ver_get_req.hwrm_intf_maj = HWRM_VERSION_MAJOR; + ver_get_req.hwrm_intf_min = HWRM_VERSION_MINOR; + ver_get_req.hwrm_intf_upd = HWRM_VERSION_UPDATE; + bng_re_fill_fw_msg(&fw_msg, (void *)&ver_get_req, sizeof(ver_get_req), + (void *)&ver_get_resp, sizeof(ver_get_resp), + BNGE_DFLT_HWRM_CMD_TIMEOUT); + rc = bnge_send_msg(aux_dev, &fw_msg); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", + rc); + return; + } + + cctx = rdev->chip_ctx; + cctx->hwrm_intf_ver = + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_major) << 48 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_minor) << 32 | + (u64)le16_to_cpu(ver_get_resp.hwrm_intf_build) << 16 | + 
le16_to_cpu(ver_get_resp.hwrm_intf_patch); + + cctx->hwrm_cmd_max_timeout = le16_to_cpu(ver_get_resp.max_req_timeout); + + if (!cctx->hwrm_cmd_max_timeout) + cctx->hwrm_cmd_max_timeout = BNG_ROCE_FW_MAX_TIMEOUT; +} + +static void bng_re_dev_uninit(struct bng_re_dev *rdev) +{ + int rc; + bng_re_debugfs_rem_pdev(rdev); + + if (test_and_clear_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { + rc = bng_re_deinit_rcfw(&rdev->rcfw); + if (rc) + ibdev_warn(&rdev->ibdev, + "Failed to deinitialize RCFW: %#x", rc); + bng_re_stats_ctx_free(rdev); + bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx); + bng_re_disable_rcfw_channel(&rdev->rcfw); + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, + RING_ALLOC_REQ_RING_TYPE_NQ); + bng_re_free_rcfw_channel(&rdev->rcfw); + } + + kfree(rdev->nqr); + rdev->nqr = NULL; + bng_re_destroy_chip_ctx(rdev); + if (test_and_clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnge_unregister_dev(rdev->aux_dev); +} + +static int bng_re_dev_init(struct bng_re_dev *rdev) +{ + struct bng_re_ring_attr rattr = {}; + struct bng_re_creq_ctx *creq; + u32 db_offt; + int vid; + u8 type; + int rc; + + /* Registered a new RoCE device instance to netdev */ + rc = bng_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } + + set_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + + if (rdev->aux_dev->auxr_info->msix_requested < BNG_RE_MIN_MSIX) { + ibdev_err(&rdev->ibdev, + "RoCE requires minimum 2 MSI-X vectors, but only %d reserved\n", + rdev->aux_dev->auxr_info->msix_requested); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -EINVAL; + } + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->aux_dev->auxr_info->msix_requested); + + rc = bng_re_setup_chip_ctx(rdev); + if (rc) { + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + ibdev_err(&rdev->ibdev, "Failed to get chip context\n"); + return -EINVAL; + } + + bng_re_query_hwrm_version(rdev); + + rc = bng_re_alloc_fw_channel(&rdev->bng_res, &rdev->rcfw); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate RCFW Channel: %#x\n", rc); + goto fail; + } + + /* Allocate nq record memory */ + rdev->nqr = kzalloc(sizeof(*rdev->nqr), GFP_KERNEL); + if (!rdev->nqr) { + bng_re_destroy_chip_ctx(rdev); + bnge_unregister_dev(rdev->aux_dev); + clear_bit(BNG_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + return -ENOMEM; + } + + rdev->nqr->num_msix = rdev->aux_dev->auxr_info->msix_requested; + memcpy(rdev->nqr->msix_entries, rdev->aux_dev->msix_info, + sizeof(struct bnge_msix_info) * rdev->nqr->num_msix); + + type = RING_ALLOC_REQ_RING_TYPE_NQ; + creq = &rdev->rcfw.creq; + rattr.dma_arr = creq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr; + rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count; + rattr.type = type; + rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; + rattr.depth = BNG_FW_CREQE_MAX_CNT - 1; + rattr.lrid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].ring_idx; + rc = bng_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); + goto free_rcfw; + } + db_offt = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].db_offset; + vid = rdev->nqr->msix_entries[BNG_RE_CREQ_NQ_IDX].vector; + + rc = bng_re_enable_fw_channel(&rdev->rcfw, + vid, db_offt); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n", + rc); + goto free_ring; + } + + rc = 
bng_re_get_dev_attr(&rdev->rcfw); + if (rc) + goto disable_rcfw; + + bng_re_debugfs_add_pdev(rdev); + rc = bng_re_alloc_stats_ctx_mem(rdev->bng_res.pdev, rdev->chip_ctx, + &rdev->stats_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate stats context: %#x\n", rc); + goto disable_rcfw; + } + + rc = bng_re_stats_ctx_alloc(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate QPLIB context: %#x\n", rc); + goto free_stats_ctx; + } + + rc = bng_re_init_rcfw(&rdev->rcfw, &rdev->stats_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to initialize RCFW: %#x\n", rc); + goto free_sctx; + } + set_bit(BNG_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags); + + return 0; +free_sctx: + bng_re_stats_ctx_free(rdev); +free_stats_ctx: + bng_re_free_stats_ctx_mem(rdev->bng_res.pdev, &rdev->stats_ctx); +disable_rcfw: + bng_re_disable_rcfw_channel(&rdev->rcfw); +free_ring: + bng_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); +free_rcfw: + bng_re_free_rcfw_channel(&rdev->rcfw); +fail: + bng_re_dev_uninit(rdev); + return rc; +} + +static int bng_re_add_device(struct auxiliary_device *adev) +{ + struct bnge_auxr_priv *auxr_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *dev_info; + struct bng_re_dev *rdev; + int rc; + + dev_info = auxiliary_get_drvdata(adev); + + rdev = bng_re_dev_add(adev, auxr_priv->auxr_dev); + if (!rdev) { + rc = -ENOMEM; + goto exit; + } + + dev_info->rdev = rdev; + + rc = bng_re_dev_init(rdev); + if (rc) + goto re_dev_dealloc; + + return 0; + +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); +exit: + return rc; +} + + +static void bng_re_remove_device(struct bng_re_dev *rdev, + struct auxiliary_device *aux_dev) +{ + bng_re_dev_uninit(rdev); + ib_dealloc_device(&rdev->ibdev); +} + + +static int bng_re_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct bnge_auxr_priv *aux_priv = + container_of(adev, struct bnge_auxr_priv, aux_dev); + struct bng_re_en_dev_info *en_info; + int rc; + + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) + return -ENOMEM; + + en_info->auxr_dev = aux_priv->auxr_dev; + + auxiliary_set_drvdata(adev, en_info); + + rc = bng_re_add_device(adev); + if (rc) + kfree(en_info); + + return rc; +} + +static void bng_re_remove(struct auxiliary_device *adev) +{ + struct bng_re_en_dev_info *dev_info = auxiliary_get_drvdata(adev); + struct bng_re_dev *rdev; + + rdev = dev_info->rdev; + + if (rdev) + bng_re_remove_device(rdev, adev); + kfree(dev_info); +} + +static const struct auxiliary_device_id bng_re_id_table[] = { + { .name = BNG_RE_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, bng_re_id_table); + +static struct auxiliary_driver bng_re_driver = { + .name = "rdma", + .probe = bng_re_probe, + .remove = bng_re_remove, + .id_table = bng_re_id_table, +}; + +static int __init bng_re_mod_init(void) +{ + int rc; + + + bng_re_register_debugfs(); + + rc = auxiliary_driver_register(&bng_re_driver); + if (rc) { + pr_err("%s: Failed to register auxiliary driver\n", + KBUILD_MODNAME); + goto unreg_debugfs; + } + return 0; +unreg_debugfs: + bng_re_unregister_debugfs(); + return rc; +} + +static void __exit bng_re_mod_exit(void) +{ + auxiliary_driver_unregister(&bng_re_driver); + bng_re_unregister_debugfs(); +} + +module_init(bng_re_mod_init); +module_exit(bng_re_mod_exit); diff --git a/drivers/infiniband/hw/bng_re/bng_fw.c b/drivers/infiniband/hw/bng_re/bng_fw.c new file mode 100644 index 000000000000..7d9539113cf5 --- /dev/null +++ 
b/drivers/infiniband/hw/bng_re/bng_fw.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/pci.h> + +#include "roce_hsi.h" +#include "bng_res.h" +#include "bng_fw.h" +#include "bng_sp.h" + +/** + * bng_re_map_rc - map return type based on opcode + * @opcode: roce slow path opcode + * + * case #1 + * Firmware initiated error recovery is a safe state machine and + * driver can consider all the underlying rdma resources are free. + * In this state, it is safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * case #2 + * If driver detect potential firmware stall, it is not safe state machine + * and the driver can not consider all the underlying rdma resources are + * freed. + * In this state, it is not safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * Scope of this helper function is only for case #1. + * + * Returns: + * 0 to communicate success to caller. + * Non zero error code to communicate failure to caller. + */ +static int bng_re_map_rc(u8 opcode) +{ + switch (opcode) { + case CMDQ_BASE_OPCODE_DESTROY_QP: + case CMDQ_BASE_OPCODE_DESTROY_SRQ: + case CMDQ_BASE_OPCODE_DESTROY_CQ: + case CMDQ_BASE_OPCODE_DEALLOCATE_KEY: + case CMDQ_BASE_OPCODE_DEREGISTER_MR: + case CMDQ_BASE_OPCODE_DELETE_GID: + case CMDQ_BASE_OPCODE_DESTROY_QP1: + case CMDQ_BASE_OPCODE_DESTROY_AH: + case CMDQ_BASE_OPCODE_DEINITIALIZE_FW: + case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC: + case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE: + return 0; + default: + return -ETIMEDOUT; + } +} + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + kfree(rcfw->crsqe_tbl); + bng_re_free_hwq(rcfw->res, &rcfw->cmdq.hwq); + bng_re_free_hwq(rcfw->res, &rcfw->creq.hwq); + rcfw->pdev = NULL; +} + +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw) +{ + struct bng_re_hwq_attr hwq_attr = {}; + struct bng_re_sg_info sginfo = {}; + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + + rcfw->pdev = res->pdev; + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + rcfw->res = res; + + sginfo.pgsize = PAGE_SIZE; + sginfo.pgshft = PAGE_SHIFT; + + hwq_attr.sginfo = &sginfo; + hwq_attr.res = rcfw->res; + hwq_attr.depth = BNG_FW_CREQE_MAX_CNT; + hwq_attr.stride = BNG_FW_CREQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_QUEUE; + + if (bng_re_alloc_init_hwq(&creq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CREQ allocation failed\n"); + goto fail; + } + + rcfw->cmdq_depth = BNG_FW_CMDQE_MAX_CNT; + + sginfo.pgsize = bng_fw_cmdqe_page_size(rcfw->cmdq_depth); + hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; + hwq_attr.stride = BNG_FW_CMDQE_UNITS; + hwq_attr.type = BNG_HWQ_TYPE_CTX; + if (bng_re_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) { + dev_err(&rcfw->pdev->dev, + "HW channel CMDQ allocation failed\n"); + goto fail; + } + + rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements, + sizeof(*rcfw->crsqe_tbl), GFP_KERNEL); + if (!rcfw->crsqe_tbl) + goto fail; + + spin_lock_init(&rcfw->tbl_lock); + + rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; + return 0; + +fail: + bng_re_free_rcfw_channel(rcfw); + return -ENOMEM; +} + +static int bng_re_process_qp_event(struct bng_re_rcfw *rcfw, + struct creq_qp_event *qp_event, + u32 *num_wait) +{ + struct bng_re_hwq *hwq = &rcfw->cmdq.hwq; + struct bng_re_crsqe *crsqe; + u32 req_size; + u16 cookie; + bool is_waiter_alive; + struct pci_dev *pdev; + u32 wait_cmds = 0; + int rc = 0; + + pdev = 
rcfw->pdev; + switch (qp_event->event) { + case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: + dev_err(&pdev->dev, "Received QP error notification\n"); + break; + default: + /* + * Command Response + * cmdq->lock needs to be acquired to synchronie + * the command send and completion reaping. This function + * is always called with creq->lock held. Using + * the nested variant of spin_lock. + * + */ + + spin_lock_nested(&hwq->lock, SINGLE_DEPTH_NESTING); + cookie = le16_to_cpu(qp_event->cookie); + cookie &= BNG_FW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "Unreponsive rcfw channel detected.!!")) { + dev_info(&pdev->dev, + "rcfw timedout: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock(&hwq->lock); + return rc; + } + + if (crsqe->is_waiter_alive) { + if (crsqe->resp) { + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + /* Insert write memory barrier to ensure that + * response data is copied before clearing the + * flags + */ + smp_wmb(); + } + } + + wait_cmds++; + + req_size = crsqe->req_size; + is_waiter_alive = crsqe->is_waiter_alive; + + crsqe->req_size = 0; + if (!is_waiter_alive) + crsqe->resp = NULL; + + crsqe->is_in_used = false; + + hwq->cons += req_size; + + spin_unlock(&hwq->lock); + } + *num_wait += wait_cmds; + return rc; +} + +/* function events */ +static int bng_re_process_func_event(struct bng_re_rcfw *rcfw, + struct creq_func_event *func_event) +{ + switch (func_event->event) { + case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_WQE_ERROR: + case CREQ_FUNC_EVENT_EVENT_RX_DATA_ERROR: + case CREQ_FUNC_EVENT_EVENT_CQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_TQM_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCQ_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCS_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCC_ERROR: + case CREQ_FUNC_EVENT_EVENT_CFCM_ERROR: + case CREQ_FUNC_EVENT_EVENT_TIM_ERROR: + case CREQ_FUNC_EVENT_EVENT_VF_COMM_REQUEST: + case CREQ_FUNC_EVENT_EVENT_RESOURCE_EXHAUSTED: + break; + default: + return -EINVAL; + } + + return 0; +} + +/* CREQ Completion handlers */ +static void bng_re_service_creq(struct tasklet_struct *t) +{ + struct bng_re_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet); + struct bng_re_creq_ctx *creq = &rcfw->creq; + u32 type, budget = BNG_FW_CREQ_ENTRY_POLL_BUDGET; + struct bng_re_hwq *hwq = &creq->hwq; + struct creq_base *creqe; + u32 num_wakeup = 0; + u32 hw_polled = 0; + + /* Service the CREQ until budget is over */ + spin_lock_bh(&hwq->lock); + while (budget > 0) { + creqe = bng_re_get_qe(hwq, hwq->cons, NULL); + if (!BNG_FW_CREQ_CMP_VALID(creqe, creq->creq_db.dbinfo.flags)) + break; + /* The valid test of the entry must be done first before + * reading any further. 
+ */ + dma_rmb(); + + type = creqe->type & CREQ_BASE_TYPE_MASK; + switch (type) { + case CREQ_BASE_TYPE_QP_EVENT: + bng_re_process_qp_event + (rcfw, (struct creq_qp_event *)creqe, + &num_wakeup); + creq->stats.creq_qp_event_processed++; + break; + case CREQ_BASE_TYPE_FUNC_EVENT: + if (!bng_re_process_func_event + (rcfw, (struct creq_func_event *)creqe)) + creq->stats.creq_func_event_processed++; + else + dev_warn(&rcfw->pdev->dev, + "aeqe:%#x Not handled\n", type); + break; + default: + if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT) + dev_warn(&rcfw->pdev->dev, + "creqe with event 0x%x not handled\n", + type); + break; + } + budget--; + hw_polled++; + bng_re_hwq_incr_cons(hwq->max_elements, &hwq->cons, + 1, &creq->creq_db.dbinfo.flags); + } + + if (hw_polled) + bng_re_ring_nq_db(&creq->creq_db.dbinfo, + rcfw->res->cctx, true); + spin_unlock_bh(&hwq->lock); + if (num_wakeup) + wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); +} + +static int __send_message_basic_sanity(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg, + u8 opcode) +{ + struct bng_re_cmdq_ctx *cmdq; + + cmdq = &rcfw->cmdq; + + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "RCFW already initialized!"); + return -EINVAL; + } + + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { + dev_err(&rcfw->pdev->dev, + "RCFW not initialized, reject opcode 0x%x", + opcode); + return -EOPNOTSUPP; + } + + return 0; +} + +static int __send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg, u8 opcode) +{ + u32 bsize, free_slots, required_slots; + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_crsqe *crsqe; + struct bng_fw_cmdqe *cmdqe; + struct bng_re_hwq *hwq; + u32 sw_prod, cmdq_prod; + struct pci_dev *pdev; + u16 cookie; + u8 *preq; + + cmdq = &rcfw->cmdq; + hwq = &cmdq->hwq; + pdev = rcfw->pdev; + + /* Cmdq are in 16-byte units, each request can consume 1 or more + * cmdqe + */ + spin_lock_bh(&hwq->lock); + required_slots = bng_re_get_cmd_slots(msg->req); + free_slots = HWQ_FREE_SLOTS(hwq); + cookie = cmdq->seq_num & BNG_FW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (required_slots >= free_slots) { + dev_info_ratelimited(&pdev->dev, + "CMDQ is full req/free %d/%d!", + required_slots, free_slots); + spin_unlock_bh(&hwq->lock); + return -EAGAIN; + } + __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); + + bsize = bng_re_set_cmd_slots(msg->req); + crsqe->free_slots = free_slots; + crsqe->resp = (struct creq_qp_event *)msg->resp; + crsqe->is_waiter_alive = true; + crsqe->is_in_used = true; + crsqe->opcode = opcode; + + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); + if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { + struct bng_re_rcfw_sbuf *sbuf = msg->sb; + + __set_cmdq_base_resp_addr(msg->req, msg->req_sz, + cpu_to_le64(sbuf->dma_addr)); + __set_cmdq_base_resp_size(msg->req, msg->req_sz, + ALIGN(sbuf->size, + BNG_FW_CMDQE_UNITS) / + BNG_FW_CMDQE_UNITS); + } + + preq = (u8 *)msg->req; + do { + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(hwq->prod, hwq); + cmdqe = bng_re_get_qe(hwq, sw_prod, NULL); + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); + 
preq += min_t(u32, bsize, sizeof(*cmdqe)); + bsize -= min_t(u32, bsize, sizeof(*cmdqe)); + hwq->prod++; + } while (bsize > 0); + cmdq->seq_num++; + + cmdq_prod = hwq->prod & 0xFFFF; + if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) { + /* The very first doorbell write + * is required to set this flag + * which prompts the FW to reset + * its internal pointers + */ + cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG); + clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags); + } + /* ring CMDQ DB */ + wmb(); + writel(cmdq_prod, cmdq->cmdq_mbox.prod); + writel(BNG_FW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); + spin_unlock_bh(&hwq->lock); + /* Return the CREQ response pointer */ + return 0; +} + +/** + * __wait_for_resp - Don't hold the cpu context and wait for response + * @rcfw: rcfw channel instance of rdev + * @cookie: cookie to track the command + * + * Wait for command completion in sleepable context. + * + * Returns: + * 0 if command is completed by firmware. + * Non zero error code for rest of the case. + */ +static int __wait_for_resp(struct bng_re_rcfw *rcfw, u16 cookie) +{ + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_crsqe *crsqe; + + cmdq = &rcfw->cmdq; + crsqe = &rcfw->crsqe_tbl[cookie]; + + do { + wait_event_timeout(cmdq->waitq, + !crsqe->is_in_used, + secs_to_jiffies(rcfw->max_timeout)); + + if (!crsqe->is_in_used) + return 0; + + bng_re_service_creq(&rcfw->creq.creq_tasklet); + + if (!crsqe->is_in_used) + return 0; + } while (true); +}; + +/** + * bng_re_rcfw_send_message - interface to send + * and complete rcfw command. + * @rcfw: rcfw channel instance of rdev + * @msg: message to send + * + * This function does not account shadow queue depth. It will send + * all the command unconditionally as long as send queue is not full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. + */ +int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg) +{ + struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; + struct bng_re_crsqe *crsqe; + u16 cookie; + int rc; + u8 opcode; + + opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); + + rc = __send_message_basic_sanity(rcfw, msg, opcode); + if (rc) + return rc == -ENXIO ? 
bng_re_map_rc(opcode) : rc; + + rc = __send_message(rcfw, msg, opcode); + if (rc) + return rc; + + cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) + & BNG_FW_MAX_COOKIE_VALUE; + + rc = __wait_for_resp(rcfw, cookie); + + if (rc) { + spin_lock_bh(&rcfw->cmdq.hwq.lock); + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); + spin_unlock_bh(&rcfw->cmdq.hwq.lock); + return -ETIMEDOUT; + } + + if (evnt->status) { + /* failed with status */ + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", + cookie, opcode, evnt->status); + rc = -EIO; + } + + return rc; +} + +static int bng_re_map_cmdq_mbox(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_mbox *mbox; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + mbox = &rcfw->cmdq.cmdq_mbox; + + mbox->reg.bar_id = BNG_FW_COMM_PCI_BAR_REGION; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id); + if (!mbox->reg.bar_base) { + dev_err(&pdev->dev, + "CMDQ BAR region %d resc start is 0!\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + bar_reg = mbox->reg.bar_base + BNG_FW_COMM_BASE_OFFSET; + mbox->reg.len = BNG_FW_COMM_SIZE; + mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len); + if (!mbox->reg.bar_reg) { + dev_err(&pdev->dev, + "CMDQ BAR region %d mapping failed\n", + mbox->reg.bar_id); + return -ENOMEM; + } + + mbox->prod = (void __iomem *)(mbox->reg.bar_reg + + BNG_FW_PF_VF_COMM_PROD_OFFSET); + mbox->db = (void __iomem *)(mbox->reg.bar_reg + BNG_FW_COMM_TRIG_OFFSET); + return 0; +} + +static irqreturn_t bng_re_creq_irq(int irq, void *dev_instance) +{ + struct bng_re_rcfw *rcfw = dev_instance; + struct bng_re_creq_ctx *creq; + struct bng_re_hwq *hwq; + u32 sw_cons; + + creq = &rcfw->creq; + hwq = &creq->hwq; + /* Prefetch the CREQ element */ + sw_cons = HWQ_CMP(hwq->cons, hwq); + bng_re_get_qe(hwq, sw_cons, NULL); + + tasklet_schedule(&creq->creq_tasklet); + return IRQ_HANDLED; +} + +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_res *res; + int rc; + + creq = &rcfw->creq; + res = rcfw->res; + + if (creq->irq_handler_avail) + return -EFAULT; + + creq->msix_vec = msix_vector; + if (need_init) + tasklet_setup(&creq->creq_tasklet, bng_re_service_creq); + else + tasklet_enable(&creq->creq_tasklet); + + creq->irq_name = kasprintf(GFP_KERNEL, "bng_re-creq@pci:%s", + pci_name(res->pdev)); + if (!creq->irq_name) + return -ENOMEM; + rc = request_irq(creq->msix_vec, bng_re_creq_irq, 0, + creq->irq_name, rcfw); + if (rc) { + kfree(creq->irq_name); + creq->irq_name = NULL; + tasklet_disable(&creq->creq_tasklet); + return rc; + } + creq->irq_handler_avail = true; + + bng_re_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); + atomic_inc(&rcfw->rcfw_intr_enabled); + + return 0; +} + +static int bng_re_map_creq_db(struct bng_re_rcfw *rcfw, u32 reg_offt) +{ + struct bng_re_creq_db *creq_db; + resource_size_t bar_reg; + struct pci_dev *pdev; + + pdev = rcfw->pdev; + creq_db = &rcfw->creq.creq_db; + + creq_db->dbinfo.flags = 0; + creq_db->reg.bar_id = BNG_FW_COMM_CONS_PCI_BAR_REGION; + creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id); + if (!creq_db->reg.bar_id) + dev_err(&pdev->dev, + "CREQ BAR region %d resc start is 0!", + creq_db->reg.bar_id); + + bar_reg = creq_db->reg.bar_base + reg_offt; + + creq_db->reg.len = BNG_FW_CREQ_DB_LEN; + creq_db->reg.bar_reg = 
ioremap(bar_reg, creq_db->reg.len); + if (!creq_db->reg.bar_reg) { + dev_err(&pdev->dev, + "CREQ BAR region %d mapping failed", + creq_db->reg.bar_id); + return -ENOMEM; + } + creq_db->dbinfo.db = creq_db->reg.bar_reg; + creq_db->dbinfo.hwq = &rcfw->creq.hwq; + creq_db->dbinfo.xid = rcfw->creq.ring_id; + return 0; +} + +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill) +{ + struct bng_re_creq_ctx *creq; + + creq = &rcfw->creq; + + if (!creq->irq_handler_avail) + return; + + creq->irq_handler_avail = false; + /* Mask h/w interrupts */ + bng_re_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false); + /* Sync with last running IRQ-handler */ + synchronize_irq(creq->msix_vec); + free_irq(creq->msix_vec, rcfw); + kfree(creq->irq_name); + creq->irq_name = NULL; + atomic_set(&rcfw->rcfw_intr_enabled, 0); + if (kill) + tasklet_kill(&creq->creq_tasklet); + tasklet_disable(&creq->creq_tasklet); +} + +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw) +{ + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_ctx *cmdq; + + creq = &rcfw->creq; + cmdq = &rcfw->cmdq; + /* Make sure the HW channel is stopped! */ + bng_re_rcfw_stop_irq(rcfw, true); + + iounmap(cmdq->cmdq_mbox.reg.bar_reg); + iounmap(creq->creq_db.reg.bar_reg); + + cmdq->cmdq_mbox.reg.bar_reg = NULL; + creq->creq_db.reg.bar_reg = NULL; + creq->msix_vec = 0; +} + +static void bng_re_start_rcfw(struct bng_re_rcfw *rcfw) +{ + struct bng_re_cmdq_ctx *cmdq; + struct bng_re_creq_ctx *creq; + struct bng_re_cmdq_mbox *mbox; + struct cmdq_init init = {0}; + + cmdq = &rcfw->cmdq; + creq = &rcfw->creq; + mbox = &cmdq->cmdq_mbox; + + init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[BNG_PBL_LVL_0].pg_map_arr[0]); + init.cmdq_size_cmdq_lvl = + cpu_to_le16(((rcfw->cmdq_depth << + CMDQ_INIT_CMDQ_SIZE_SFT) & + CMDQ_INIT_CMDQ_SIZE_MASK) | + ((cmdq->hwq.level << + CMDQ_INIT_CMDQ_LVL_SFT) & + CMDQ_INIT_CMDQ_LVL_MASK)); + init.creq_ring_id = cpu_to_le16(creq->ring_id); + /* Write to the mailbox register */ + __iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4); +} + +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off) +{ + struct bng_re_cmdq_ctx *cmdq; + int rc; + + cmdq = &rcfw->cmdq; + + /* Assign defaults */ + cmdq->seq_num = 0; + set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags); + init_waitqueue_head(&cmdq->waitq); + + rc = bng_re_map_cmdq_mbox(rcfw); + if (rc) + return rc; + + rc = bng_re_map_creq_db(rcfw, cp_bar_reg_off); + if (rc) + return rc; + + rc = bng_re_rcfw_start_irq(rcfw, msix_vector, true); + if (rc) { + dev_err(&rcfw->pdev->dev, + "Failed to request IRQ for CREQ rc = 0x%x\n", rc); + bng_re_disable_rcfw_channel(rcfw); + return rc; + } + + bng_re_start_rcfw(rcfw); + return 0; +} + +int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw) +{ + struct creq_deinitialize_fw_resp resp = {}; + struct cmdq_deinitialize_fw req = {}; + struct bng_re_cmdqmsg msg = {}; + int rc; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_DEINITIALIZE_FW, + sizeof(req)); + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, + sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return rc; + + clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags); + return 0; +} +static inline bool _is_hw_retx_supported(u16 dev_cap_flags) +{ + return dev_cap_flags & + (CREQ_QUERY_FUNC_RESP_SB_HW_REQUESTER_RETX_ENABLED | + CREQ_QUERY_FUNC_RESP_SB_HW_RESPONDER_RETX_ENABLED); +} + +#define BNG_RE_HW_RETX(a) _is_hw_retx_supported((a)) +static inline bool 
_is_optimize_modify_qp_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & + CREQ_QUERY_FUNC_RESP_SB_OPTIMIZE_MODIFY_QP_SUPPORTED; +} + +int bng_re_init_rcfw(struct bng_re_rcfw *rcfw, + struct bng_re_stats *stats_ctx) +{ + struct creq_initialize_fw_resp resp = {}; + struct cmdq_initialize_fw req = {}; + struct bng_re_cmdqmsg msg = {}; + int rc; + u16 flags = 0; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_INITIALIZE_FW, + sizeof(req)); + /* Supply (log-base-2-of-host-page-size - base-page-shift) + * to bono to adjust the doorbell page sizes. + */ + req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT - + BNG_FW_DBR_BASE_PAGE_SHIFT); + if (BNG_RE_HW_RETX(rcfw->res->dattr->dev_cap_flags)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED; + if (_is_optimize_modify_qp_supported(rcfw->res->dattr->dev_cap_flags2)) + flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; + req.flags |= cpu_to_le16(flags); + req.stat_ctx_id = cpu_to_le32(stats_ctx->fw_id); + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return rc; + set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags); + return 0; +} diff --git a/drivers/infiniband/hw/bng_re/bng_fw.h b/drivers/infiniband/hw/bng_re/bng_fw.h new file mode 100644 index 000000000000..c89c926ec2fc --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_fw.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_FW_H__ +#define __BNG_FW_H__ + +#include "bng_tlv.h" + +/* FW DB related */ +#define BNG_FW_CMDQ_TRIG_VAL 1 +#define BNG_FW_COMM_PCI_BAR_REGION 0 +#define BNG_FW_COMM_CONS_PCI_BAR_REGION 2 +#define BNG_FW_DBR_BASE_PAGE_SHIFT 12 +#define BNG_FW_COMM_SIZE 0x104 +#define BNG_FW_COMM_BASE_OFFSET 0x600 +#define BNG_FW_COMM_TRIG_OFFSET 0x100 +#define BNG_FW_PF_VF_COMM_PROD_OFFSET 0xc +#define BNG_FW_CREQ_DB_LEN 8 + +/* CREQ */ +#define BNG_FW_CREQE_MAX_CNT (64 * 1024) +#define BNG_FW_CREQE_UNITS 16 +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 +#define BNG_FW_CREQ_CMP_VALID(hdr, pass) \ + (!!((hdr)->v & CREQ_BASE_V) == \ + !((pass) & BNG_RE_FLAG_EPOCH_CONS_MASK)) +#define BNG_FW_CREQ_ENTRY_POLL_BUDGET 0x100 + +/* CMDQ */ +struct bng_fw_cmdqe { + u8 data[16]; +}; + +#define BNG_FW_CMDQE_MAX_CNT 8192 +#define BNG_FW_CMDQE_UNITS sizeof(struct bng_fw_cmdqe) +#define BNG_FW_CMDQE_BYTES(depth) ((depth) * BNG_FW_CMDQE_UNITS) + +#define BNG_FW_MAX_COOKIE_VALUE (BNG_FW_CMDQE_MAX_CNT - 1) +#define BNG_FW_CMD_IS_BLOCKING 0x8000 + +/* Crsq buf is 1024-Byte */ +struct bng_re_crsbe { + u8 data[1024]; +}; + + +static inline u32 bng_fw_cmdqe_npages(u32 depth) +{ + u32 npages; + + npages = BNG_FW_CMDQE_BYTES(depth) / PAGE_SIZE; + if (BNG_FW_CMDQE_BYTES(depth) % PAGE_SIZE) + npages++; + return npages; +} + +static inline u32 bng_fw_cmdqe_page_size(u32 depth) +{ + return (bng_fw_cmdqe_npages(depth) * PAGE_SIZE); +} +struct bng_re_cmdq_mbox { + struct bng_re_reg_desc reg; + void __iomem *prod; + void __iomem *db; +}; + +/* HWQ */ +struct bng_re_cmdq_ctx { + struct bng_re_hwq hwq; + struct bng_re_cmdq_mbox cmdq_mbox; + unsigned long flags; +#define FIRMWARE_INITIALIZED_FLAG (0) +#define FIRMWARE_STALL_DETECTED (3) +#define FIRMWARE_FIRST_FLAG (31) + wait_queue_head_t waitq; + u32 seq_num; +}; + +struct bng_re_creq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_creq_stat { + u64 creq_qp_event_processed; + u64 creq_func_event_processed; +}; + +struct 
bng_re_creq_ctx { + struct bng_re_hwq hwq; + struct bng_re_creq_db creq_db; + struct bng_re_creq_stat stats; + struct tasklet_struct creq_tasklet; + u16 ring_id; + int msix_vec; + bool irq_handler_avail; + char *irq_name; +}; + +struct bng_re_crsqe { + struct creq_qp_event *resp; + u32 req_size; + /* Free slots at the time of submission */ + u32 free_slots; + u8 opcode; + bool is_waiter_alive; + bool is_in_used; +}; + +struct bng_re_rcfw_sbuf { + void *sb; + dma_addr_t dma_addr; + u32 size; +}; + +/* RoCE FW Communication Channels */ +struct bng_re_rcfw { + struct pci_dev *pdev; + struct bng_re_res *res; + struct bng_re_cmdq_ctx cmdq; + struct bng_re_creq_ctx creq; + struct bng_re_crsqe *crsqe_tbl; + /* To synchronize the qp-handle hash table */ + spinlock_t tbl_lock; + u32 cmdq_depth; + /* cached from chip cctx for quick reference in slow path */ + u16 max_timeout; + atomic_t rcfw_intr_enabled; +}; + +struct bng_re_cmdqmsg { + struct cmdq_base *req; + struct creq_base *resp; + void *sb; + u32 req_sz; + u32 res_sz; + u8 block; +}; + +static inline void bng_re_rcfw_cmd_prep(struct cmdq_base *req, + u8 opcode, u8 cmd_size) +{ + req->opcode = opcode; + req->cmd_size = cmd_size; +} + +static inline void bng_re_fill_cmdqmsg(struct bng_re_cmdqmsg *msg, + void *req, void *resp, void *sb, + u32 req_sz, u32 res_sz, u8 block) +{ + msg->req = req; + msg->resp = resp; + msg->sb = sb; + msg->req_sz = req_sz; + msg->res_sz = res_sz; + msg->block = block; +} + +/* Get the number of command units required for the req. The + * function returns correct value only if called before + * setting using bng_re_set_cmd_slots + */ +static inline u32 bng_re_get_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_units = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_units = tlv_req->total_size; + } else { + cmd_units = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_units; +} + +static inline u32 bng_re_set_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_byte = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_byte = tlv_req->total_size * BNG_FW_CMDQE_UNITS; + } else { + cmd_byte = req->cmd_size; + req->cmd_size = (req->cmd_size + BNG_FW_CMDQE_UNITS - 1) / + BNG_FW_CMDQE_UNITS; + } + + return cmd_byte; +} + +void bng_re_free_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_alloc_fw_channel(struct bng_re_res *res, + struct bng_re_rcfw *rcfw); +int bng_re_enable_fw_channel(struct bng_re_rcfw *rcfw, + int msix_vector, + int cp_bar_reg_off); +void bng_re_disable_rcfw_channel(struct bng_re_rcfw *rcfw); +int bng_re_rcfw_start_irq(struct bng_re_rcfw *rcfw, int msix_vector, + bool need_init); +void bng_re_rcfw_stop_irq(struct bng_re_rcfw *rcfw, bool kill); +int bng_re_rcfw_send_message(struct bng_re_rcfw *rcfw, + struct bng_re_cmdqmsg *msg); +int bng_re_init_rcfw(struct bng_re_rcfw *rcfw, + struct bng_re_stats *stats_ctx); +int bng_re_deinit_rcfw(struct bng_re_rcfw *rcfw); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_re.h b/drivers/infiniband/hw/bng_re/bng_re.h new file mode 100644 index 000000000000..dae4862621a7 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_re.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. + +#ifndef __BNG_RE_H__ +#define __BNG_RE_H__ + +#include "bng_res.h" + +#define BNG_RE_ADEV_NAME "bng_en" + +#define BNG_RE_DESC "Broadcom 800G RoCE Driver" + +#define rdev_to_dev(rdev) ((rdev) ? 
(&(rdev)->ibdev.dev) : NULL) + +#define BNG_RE_MIN_MSIX 2 +#define BNG_RE_MAX_MSIX BNGE_MAX_ROCE_MSIX + +#define BNG_RE_CREQ_NQ_IDX 0 + +#define BNGE_INVALID_STATS_CTX_ID -1 +/* NQ specific structures */ +struct bng_re_nq_db { + struct bng_re_reg_desc reg; + struct bng_re_db_info dbinfo; +}; + +struct bng_re_nq { + struct pci_dev *pdev; + struct bng_re_res *res; + char *name; + struct bng_re_hwq hwq; + struct bng_re_nq_db nq_db; + u16 ring_id; + int msix_vec; + cpumask_t mask; + struct tasklet_struct nq_tasklet; + bool requested; + int budget; + u32 load; + + struct workqueue_struct *cqn_wq; +}; + +struct bng_re_nq_record { + struct bnge_msix_info msix_entries[BNG_RE_MAX_MSIX]; + struct bng_re_nq nq[BNG_RE_MAX_MSIX]; + int num_msix; + /* serialize NQ access */ + struct mutex load_lock; +}; + +struct bng_re_en_dev_info { + struct bng_re_dev *rdev; + struct bnge_auxr_dev *auxr_dev; +}; + +struct bng_re_ring_attr { + dma_addr_t *dma_arr; + int pages; + int type; + u32 depth; + u32 lrid; /* Logical ring id */ + u8 mode; +}; + +struct bng_re_dev { + struct ib_device ibdev; + unsigned long flags; +#define BNG_RE_FLAG_NETDEV_REGISTERED 0 +#define BNG_RE_FLAG_RCFW_CHANNEL_EN 1 + struct net_device *netdev; + struct auxiliary_device *adev; + struct bnge_auxr_dev *aux_dev; + struct bng_re_chip_ctx *chip_ctx; + int fn_id; + struct bng_re_res bng_res; + struct bng_re_rcfw rcfw; + struct bng_re_nq_record *nqr; + /* Device Resources */ + struct bng_re_dev_attr *dev_attr; + struct dentry *dbg_root; + struct bng_re_stats stats_ctx; +}; + +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_res.c b/drivers/infiniband/hw/bng_re/bng_res.c new file mode 100644 index 000000000000..c50823758b53 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. 
+ +#include <linux/pci.h> +#include <linux/vmalloc.h> +#include <rdma/ib_umem.h> + +#include <linux/bnxt/hsi.h> +#include "bng_res.h" +#include "roce_hsi.h" + +/* Stats */ +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats) +{ + if (stats->dma) { + dma_free_coherent(&pdev->dev, stats->size, + stats->dma, stats->dma_map); + } + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; +} + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->fw_id = -1; + stats->size = cctx->hw_stats_size; + stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, + &stats->dma_map, GFP_KERNEL); + if (!stats->dma) + return -ENOMEM; + + return 0; +} + +static void bng_free_pbl(struct bng_re_res *res, struct bng_re_pbl *pbl) +{ + struct pci_dev *pdev = res->pdev; + int i; + + for (i = 0; i < pbl->pg_count; i++) { + if (pbl->pg_arr[i]) + dma_free_coherent(&pdev->dev, pbl->pg_size, + (void *)((unsigned long) + pbl->pg_arr[i] & + PAGE_MASK), + pbl->pg_map_arr[i]); + else + dev_warn(&pdev->dev, + "PBL free pg_arr[%d] empty?!\n", i); + pbl->pg_arr[i] = NULL; + } + + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + vfree(pbl->pg_map_arr); + pbl->pg_map_arr = NULL; + pbl->pg_count = 0; + pbl->pg_size = 0; +} + +static int bng_alloc_pbl(struct bng_re_res *res, + struct bng_re_pbl *pbl, + struct bng_re_sg_info *sginfo) +{ + struct pci_dev *pdev = res->pdev; + u32 pages; + int i; + + if (sginfo->nopte) + return 0; + pages = sginfo->npages; + + /* page ptr arrays */ + pbl->pg_arr = vmalloc_array(pages, sizeof(void *)); + if (!pbl->pg_arr) + return -ENOMEM; + + pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t)); + if (!pbl->pg_map_arr) { + vfree(pbl->pg_arr); + pbl->pg_arr = NULL; + return -ENOMEM; + } + pbl->pg_count = 0; + pbl->pg_size = sginfo->pgsize; + + for (i = 0; i < pages; i++) { + pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); + if (!pbl->pg_arr[i]) + goto fail; + pbl->pg_count++; + } + + return 0; +fail: + bng_free_pbl(res, pbl); + return -ENOMEM; +} + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq) +{ + int i; + + if (!hwq->max_elements) + return; + if (hwq->level >= BNG_PBL_LVL_MAX) + return; + + for (i = 0; i < hwq->level + 1; i++) + bng_free_pbl(res, &hwq->pbl[i]); + + hwq->level = BNG_PBL_LVL_MAX; + hwq->max_elements = 0; + hwq->element_size = 0; + hwq->prod = 0; + hwq->cons = 0; +} + +/* All HWQs are power of 2 in size */ +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr) +{ + u32 npages, pg_size; + struct bng_re_sg_info sginfo = {}; + u32 depth, stride, npbl, npde; + dma_addr_t *src_phys_ptr, **dst_virt_ptr; + struct bng_re_res *res; + struct pci_dev *pdev; + int i, rc, lvl; + + res = hwq_attr->res; + pdev = res->pdev; + pg_size = hwq_attr->sginfo->pgsize; + hwq->level = BNG_PBL_LVL_MAX; + + depth = roundup_pow_of_two(hwq_attr->depth); + stride = roundup_pow_of_two(hwq_attr->stride); + + npages = (depth * stride) / pg_size; + if ((depth * stride) % pg_size) + npages++; + if (!npages) + return -EINVAL; + hwq_attr->sginfo->npages = npages; + + if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { + /* This request is Level 0, map PTE */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_0; + goto done; + } + + if (npages >= MAX_PBL_LVL_0_PGS) { + if (npages > 
MAX_PBL_LVL_1_PGS) { + u32 flag = PTU_PTE_VALID; + /* 2 levels of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + npde = npbl >> MAX_PDL_LVL_SHIFT; + if (npbl % BIT(MAX_PDL_LVL_SHIFT)) + npde++; + /* Alloc PDE pages */ + sginfo.pgsize = npde * pg_size; + sginfo.npages = 1; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + + /* Alloc PBL pages */ + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], &sginfo); + if (rc) + goto fail; + /* Fill PDL with PBL page pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[0][i] = src_phys_ptr[i] | flag; + + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_2], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_2; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBLs with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_1].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_2].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_2].pg_count; i++) { + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | PTU_PTE_VALID; + } + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_2].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } else { /* pages < 512 npbl = 1, npde = 0 */ + u32 flag = PTU_PTE_VALID; + + /* 1 level of indirection */ + npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT; + if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT)) + npbl++; + sginfo.npages = npbl; + sginfo.pgsize = PAGE_SIZE; + /* Alloc PBL page */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_0], &sginfo); + if (rc) + goto fail; + /* Alloc or init PTEs */ + rc = bng_alloc_pbl(res, &hwq->pbl[BNG_PBL_LVL_1], + hwq_attr->sginfo); + if (rc) + goto fail; + hwq->level = BNG_PBL_LVL_1; + if (hwq_attr->sginfo->nopte) + goto done; + /* Fill PBL with PTE pointers */ + dst_virt_ptr = + (dma_addr_t **)hwq->pbl[BNG_PBL_LVL_0].pg_arr; + src_phys_ptr = hwq->pbl[BNG_PBL_LVL_1].pg_map_arr; + for (i = 0; i < hwq->pbl[BNG_PBL_LVL_1].pg_count; i++) + dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] = + src_phys_ptr[i] | flag; + if (hwq_attr->type == BNG_HWQ_TYPE_QUEUE) { + /* Find the last pg of the size */ + i = hwq->pbl[BNG_PBL_LVL_1].pg_count; + dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |= + PTU_PTE_LAST; + if (i > 1) + dst_virt_ptr[PTR_PG(i - 2)] + [PTR_IDX(i - 2)] |= + PTU_PTE_NEXT_TO_LAST; + } + } + } +done: + hwq->prod = 0; + hwq->cons = 0; + hwq->pdev = pdev; + hwq->depth = hwq_attr->depth; + hwq->max_elements = hwq->depth; + hwq->element_size = stride; + hwq->qe_ppg = pg_size / stride; + /* For direct access to the elements */ + lvl = hwq->level; + if (hwq_attr->sginfo->nopte && hwq->level) + lvl = hwq->level - 1; + hwq->pbl_ptr = hwq->pbl[lvl].pg_arr; + hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr; + spin_lock_init(&hwq->lock); + + return 0; +fail: + bng_re_free_hwq(res, hwq); + return -ENOMEM; +} diff --git a/drivers/infiniband/hw/bng_re/bng_res.h b/drivers/infiniband/hw/bng_re/bng_res.h new file mode 100644 index 000000000000..9997f86d6a0e --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_res.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 
Broadcom. + +#ifndef __BNG_RES_H__ +#define __BNG_RES_H__ + +#include "roce_hsi.h" + +#define BNG_ROCE_FW_MAX_TIMEOUT 60 + +#define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) +#define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) +#define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) +#define PTR_IDX(x) ((x) & PTR_MAX_IDX_PER_PG) + +#define HWQ_CMP(idx, hwq) ((idx) & ((hwq)->max_elements - 1)) +#define HWQ_FREE_SLOTS(hwq) (hwq->max_elements - \ + ((HWQ_CMP(hwq->prod, hwq)\ + - HWQ_CMP(hwq->cons, hwq))\ + & (hwq->max_elements - 1))) + +#define MAX_PBL_LVL_0_PGS 1 +#define MAX_PBL_LVL_1_PGS 512 +#define MAX_PBL_LVL_1_PGS_SHIFT 9 +#define MAX_PBL_LVL_1_PGS_FOR_LVL_2 256 +#define MAX_PBL_LVL_2_PGS (256 * 512) +#define MAX_PDL_LVL_SHIFT 9 + +#define BNG_RE_DBR_VALID (0x1UL << 26) +#define BNG_RE_DBR_EPOCH_SHIFT 24 +#define BNG_RE_DBR_TOGGLE_SHIFT 25 + +#define BNG_MAX_TQM_ALLOC_REQ 48 + +struct bng_re_reg_desc { + u8 bar_id; + resource_size_t bar_base; + unsigned long offset; + void __iomem *bar_reg; + size_t len; +}; + +struct bng_re_db_info { + void __iomem *db; + void __iomem *priv_db; + struct bng_re_hwq *hwq; + u32 xid; + u32 max_slot; + u32 flags; + u8 toggle; +}; + +enum bng_re_db_info_flags_mask { + BNG_RE_FLAG_EPOCH_CONS_SHIFT = 0x0UL, + BNG_RE_FLAG_EPOCH_PROD_SHIFT = 0x1UL, + BNG_RE_FLAG_EPOCH_CONS_MASK = 0x1UL, + BNG_RE_FLAG_EPOCH_PROD_MASK = 0x2UL, +}; + +enum bng_re_db_epoch_flag_shift { + BNG_RE_DB_EPOCH_CONS_SHIFT = BNG_RE_DBR_EPOCH_SHIFT, + BNG_RE_DB_EPOCH_PROD_SHIFT = (BNG_RE_DBR_EPOCH_SHIFT - 1), +}; + +struct bng_re_chip_ctx { + u16 chip_num; + u16 hw_stats_size; + u64 hwrm_intf_ver; + u16 hwrm_cmd_max_timeout; +}; + +struct bng_re_pbl { + u32 pg_count; + u32 pg_size; + void **pg_arr; + dma_addr_t *pg_map_arr; +}; + +enum bng_re_pbl_lvl { + BNG_PBL_LVL_0, + BNG_PBL_LVL_1, + BNG_PBL_LVL_2, + BNG_PBL_LVL_MAX +}; + +enum bng_re_hwq_type { + BNG_HWQ_TYPE_CTX, + BNG_HWQ_TYPE_QUEUE +}; + +struct bng_re_sg_info { + u32 npages; + u32 pgshft; + u32 pgsize; + bool nopte; +}; + +struct bng_re_hwq_attr { + struct bng_re_res *res; + struct bng_re_sg_info *sginfo; + enum bng_re_hwq_type type; + u32 depth; + u32 stride; + u32 aux_stride; + u32 aux_depth; +}; + +struct bng_re_hwq { + struct pci_dev *pdev; + /* lock to protect hwq */ + spinlock_t lock; + struct bng_re_pbl pbl[BNG_PBL_LVL_MAX + 1]; + /* Valid values: 0, 1, 2 */ + enum bng_re_pbl_lvl level; + /* PBL entries */ + void **pbl_ptr; + /* PBL dma_addr */ + dma_addr_t *pbl_dma_ptr; + u32 max_elements; + u32 depth; + u16 element_size; + u32 prod; + u32 cons; + /* queue entry per page */ + u16 qe_ppg; +}; + +struct bng_re_stats { + dma_addr_t dma_map; + void *dma; + u32 size; + u32 fw_id; +}; + +struct bng_re_res { + struct pci_dev *pdev; + struct bng_re_chip_ctx *cctx; + struct bng_re_dev_attr *dattr; +}; + +static inline void *bng_re_get_qe(struct bng_re_hwq *hwq, + u32 indx, u64 *pg) +{ + u32 pg_num, pg_idx; + + pg_num = (indx / hwq->qe_ppg); + pg_idx = (indx % hwq->qe_ppg); + if (pg) + *pg = (u64)&hwq->pbl_ptr[pg_num]; + return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx); +} + +#define BNG_RE_INIT_DBHDR(xid, type, indx, toggle) \ + (((u64)(((xid) & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | \ + (type) | BNG_RE_DBR_VALID) << 32) | (indx) | \ + (((u32)(toggle)) << (BNG_RE_DBR_TOGGLE_SHIFT))) + +static inline void bng_re_ring_db(struct bng_re_db_info *info, + u32 type) +{ + u64 key = 0; + u32 indx; + u8 toggle = 0; + + if (type == DBC_DBC_TYPE_CQ_ARMALL || + type == DBC_DBC_TYPE_CQ_ARMSE) + toggle = info->toggle; + + 
indx = (info->hwq->cons & DBC_DBC_INDEX_MASK) | + ((info->flags & BNG_RE_FLAG_EPOCH_CONS_MASK) << + BNG_RE_DB_EPOCH_CONS_SHIFT); + + key = BNG_RE_INIT_DBHDR(info->xid, type, indx, toggle); + writeq(key, info->db); +} + +static inline void bng_re_ring_nq_db(struct bng_re_db_info *info, + struct bng_re_chip_ctx *cctx, + bool arm) +{ + u32 type; + + type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; + bng_re_ring_db(info, type); +} + +static inline void bng_re_hwq_incr_cons(u32 max_elements, u32 *cons, u32 cnt, + u32 *dbinfo_flags) +{ + /* move cons and update toggle/epoch if wrap around */ + *cons += cnt; + if (*cons >= max_elements) { + *cons %= max_elements; + *dbinfo_flags ^= 1UL << BNG_RE_FLAG_EPOCH_CONS_SHIFT; + } +} + +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + +void bng_re_free_hwq(struct bng_re_res *res, + struct bng_re_hwq *hwq); + +int bng_re_alloc_init_hwq(struct bng_re_hwq *hwq, + struct bng_re_hwq_attr *hwq_attr); + +void bng_re_free_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_stats *stats); + +int bng_re_alloc_stats_ctx_mem(struct pci_dev *pdev, + struct bng_re_chip_ctx *cctx, + struct bng_re_stats *stats); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_sp.c b/drivers/infiniband/hw/bng_re/bng_sp.c new file mode 100644 index 000000000000..83099e05328d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_sp.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2025 Broadcom. +#include <linux/interrupt.h> +#include <linux/pci.h> + +#include "bng_res.h" +#include "bng_fw.h" +#include "bng_sp.h" +#include "bng_tlv.h" + +static bool bng_re_is_atomic_cap(struct bng_re_rcfw *rcfw) +{ + u16 pcie_ctl2 = 0; + + pcie_capability_read_word(rcfw->pdev, PCI_EXP_DEVCTL2, &pcie_ctl2); + return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); +} + +static void bng_re_query_version(struct bng_re_rcfw *rcfw, + char *fw_ver) +{ + struct creq_query_version_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct cmdq_query_version req = {}; + int rc; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_VERSION, + sizeof(req)); + + bng_re_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + return; + fw_ver[0] = resp.fw_maj; + fw_ver[1] = resp.fw_minor; + fw_ver[2] = resp.fw_bld; + fw_ver[3] = resp.fw_rsvd; +} + +int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw) +{ + struct bng_re_dev_attr *attr = rcfw->res->dattr; + struct creq_query_func_resp resp = {}; + struct bng_re_cmdqmsg msg = {}; + struct creq_query_func_resp_sb *sb; + struct bng_re_rcfw_sbuf sbuf; + struct cmdq_query_func req = {}; + u8 *tqm_alloc; + int i, rc; + u32 temp; + + bng_re_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_QUERY_FUNC, + sizeof(req)); + + sbuf.size = ALIGN(sizeof(*sb), BNG_FW_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + sb = sbuf.sb; + req.resp_size = sbuf.size / BNG_FW_CMDQE_UNITS; + bng_re_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bng_re_rcfw_send_message(rcfw, &msg); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + attr->max_qp = le32_to_cpu(sb->max_qp); + /* max_qp value reported by FW doesn't include the QP1 */ + attr->max_qp += 1; + attr->max_qp_rd_atom = + sb->max_qp_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? 
+ BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom; + attr->max_qp_init_rd_atom = + sb->max_qp_init_rd_atom > BNG_RE_MAX_OUT_RD_ATOM ? + BNG_RE_MAX_OUT_RD_ATOM : sb->max_qp_init_rd_atom; + attr->max_qp_wqes = le16_to_cpu(sb->max_qp_wr) - 1; + + /* Adjust for max_qp_wqes for variable wqe */ + attr->max_qp_wqes = min_t(u32, attr->max_qp_wqes, BNG_VAR_MAX_WQE - 1); + + attr->max_qp_sges = min_t(u32, sb->max_sge_var_wqe, BNG_VAR_MAX_SGE); + attr->max_cq = le32_to_cpu(sb->max_cq); + attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); + attr->max_cq_sges = attr->max_qp_sges; + attr->max_mr = le32_to_cpu(sb->max_mr); + attr->max_mw = le32_to_cpu(sb->max_mw); + + attr->max_mr_size = le64_to_cpu(sb->max_mr_size); + attr->max_pd = 64 * 1024; + attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp); + attr->max_ah = le32_to_cpu(sb->max_ah); + + attr->max_srq = le16_to_cpu(sb->max_srq); + attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; + attr->max_srq_sges = sb->max_srq_sge; + attr->max_pkey = 1; + attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + /* + * Read the max gid supported by HW. + * For each entry in HW GID in HW table, we consume 2 + * GID entries in the kernel GID table. So max_gid reported + * to stack can be up to twice the value reported by the HW, up to 256 gids. + */ + attr->max_sgid = le32_to_cpu(sb->max_gid); + attr->max_sgid = min_t(u32, BNG_RE_NUM_GIDS_SUPPORTED, 2 * attr->max_sgid); + attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); + attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); + + bng_re_query_version(rcfw, attr->fw_ver); + for (i = 0; i < BNG_MAX_TQM_ALLOC_REQ / 4; i++) { + temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); + tqm_alloc = (u8 *)&temp; + attr->tqm_alloc_reqs[i * 4] = *tqm_alloc; + attr->tqm_alloc_reqs[i * 4 + 1] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 2] = *(++tqm_alloc); + attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); + } + + attr->max_dpi = le32_to_cpu(sb->max_dpi); + attr->is_atomic = bng_re_is_atomic_cap(rcfw); +bail: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, + sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bng_re/bng_sp.h b/drivers/infiniband/hw/bng_re/bng_sp.h new file mode 100644 index 000000000000..e15190515ed1 --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_sp.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2025 Broadcom. 
+ +#ifndef __BNG_SP_H__ +#define __BNG_SP_H__ + +#include "bng_fw.h" + +#define BNG_VAR_MAX_WQE 4352 +#define BNG_VAR_MAX_SGE 13 + +struct bng_re_dev_attr { +#define FW_VER_ARR_LEN 4 + u8 fw_ver[FW_VER_ARR_LEN]; +#define BNG_RE_NUM_GIDS_SUPPORTED 256 + u16 max_sgid; + u16 max_mrw; + u32 max_qp; +#define BNG_RE_MAX_OUT_RD_ATOM 126 + u32 max_qp_rd_atom; + u32 max_qp_init_rd_atom; + u32 max_qp_wqes; + u32 max_qp_sges; + u32 max_cq; + u32 max_cq_wqes; + u32 max_cq_sges; + u32 max_mr; + u64 max_mr_size; + u32 max_pd; + u32 max_mw; + u32 max_raw_ethy_qp; + u32 max_ah; + u32 max_srq; + u32 max_srq_wqes; + u32 max_srq_sges; + u32 max_pkey; + u32 max_inline_data; + u32 l2_db_size; + u8 tqm_alloc_reqs[BNG_MAX_TQM_ALLOC_REQ]; + bool is_atomic; + u16 dev_cap_flags; + u16 dev_cap_flags2; + u32 max_dpi; +}; + +int bng_re_get_dev_attr(struct bng_re_rcfw *rcfw); +#endif diff --git a/drivers/infiniband/hw/bng_re/bng_tlv.h b/drivers/infiniband/hw/bng_re/bng_tlv.h new file mode 100644 index 000000000000..278f4922962d --- /dev/null +++ b/drivers/infiniband/hw/bng_re/bng_tlv.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +#ifndef __BNG_TLV_H__ +#define __BNG_TLV_H__ + +#include "roce_hsi.h" + +struct roce_tlv { + struct tlv tlv; + u8 total_size; // in units of 16 byte chunks + u8 unused[7]; // for 16 byte alignment +}; + +/* + * TLV size in units of 16 byte chunks + */ +#define TLV_SIZE ((sizeof(struct roce_tlv) + 15) / 16) +/* + * TLV length in bytes + */ +#define TLV_BYTES (TLV_SIZE * 16) + +#define HAS_TLV_HEADER(msg) (le16_to_cpu(((struct tlv *)(msg))->cmd_discr) == CMD_DISCR_TLV_ENCAP) +#define GET_TLV_DATA(tlv) ((void *)&((uint8_t *)(tlv))[TLV_BYTES]) + +static inline u8 __get_cmdq_base_opcode(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->opcode; + else + return req->opcode; +} + +static inline void __set_cmdq_base_opcode(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->opcode = val; + else + req->opcode = val; +} + +static inline __le16 __get_cmdq_base_cookie(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->cookie; + else + return req->cookie; +} + +static inline void __set_cmdq_base_cookie(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cookie = val; + else + req->cookie = val; +} + +static inline __le64 __get_cmdq_base_resp_addr(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr; + else + return req->resp_addr; +} + +static inline void __set_cmdq_base_resp_addr(struct cmdq_base *req, + u32 size, __le64 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_addr = val; + else + req->resp_addr = val; +} + +static inline u8 __get_cmdq_base_resp_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size; + else + return req->resp_size; +} + +static inline void __set_cmdq_base_resp_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->resp_size = val; + else + req->resp_size = val; +} + +static inline u8 
__get_cmdq_base_cmd_size(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct roce_tlv *)(req))->total_size; + else + return req->cmd_size; +} + +static inline void __set_cmdq_base_cmd_size(struct cmdq_base *req, + u32 size, u8 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->cmd_size = val; + else + req->cmd_size = val; +} + +static inline __le16 __get_cmdq_base_flags(struct cmdq_base *req, u32 size) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + return ((struct cmdq_base *)GET_TLV_DATA(req))->flags; + else + return req->flags; +} + +static inline void __set_cmdq_base_flags(struct cmdq_base *req, + u32 size, __le16 val) +{ + if (HAS_TLV_HEADER(req) && size > TLV_BYTES) + ((struct cmdq_base *)GET_TLV_DATA(req))->flags = val; + else + req->flags = val; +} + +#endif /* __BNG_TLV_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3485e495ac6a..3a7ce4729fcf 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -224,6 +224,8 @@ struct bnxt_re_dev { struct workqueue_struct *dcb_wq; struct dentry *cc_config; struct bnxt_re_dbg_cc_config_params *cc_config_params; + struct dentry *cq_coal_cfg; + struct bnxt_re_dbg_cq_coal_params *cq_coal_cfg_params; #define BNXT_VPD_FLD_LEN 32 char board_partno[BNXT_VPD_FLD_LEN]; /* RoCE mirror */ diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index be5e9b5ca2f0..88817c86ae24 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -23,6 +23,14 @@ static struct dentry *bnxt_re_debugfs_root; +static const char * const bnxt_re_cq_coal_str[] = { + "buf_maxtime", + "normal_maxbuf", + "during_maxbuf", + "en_ring_idle_mode", + "enable", +}; + static const char * const bnxt_re_cc_gen0_name[] = { "enable_cc", "run_avg_weight_g", @@ -349,6 +357,123 @@ static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev) debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops); } +static ssize_t cq_coal_cfg_write(struct file *file, + const char __user *buf, + size_t count, loff_t *pos) +{ + struct seq_file *s = file->private_data; + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + char lbuf[16] = { }; + u32 val; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &val)) + return -EINVAL; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_BUF_MAXTIME) + return -EINVAL; + rdev->cq_coalescing.buf_maxtime = val; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_NORMAL_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.normal_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + if (val < 1 || val > BNXT_QPLIB_CQ_COAL_MAX_DURING_MAXBUF) + return -EINVAL; + rdev->cq_coalescing.during_maxbuf = val; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + if (val > BNXT_QPLIB_CQ_COAL_MAX_EN_RING_IDLE_MODE) + return -EINVAL; + rdev->cq_coalescing.en_ring_idle_mode = val; + break; + case BNXT_RE_COAL_CQ_ENABLE: + if (val > 1) + return -EINVAL; + rdev->cq_coalescing.enable = val; + break; + default: + return -EINVAL; + } + return count; +} + +static int cq_coal_cfg_show(struct seq_file *s, void 
*unused) +{ + struct bnxt_re_cq_coal_param *param = s->private; + struct bnxt_re_dev *rdev = param->rdev; + int offset = param->offset; + u32 val = 0; + + switch (offset) { + case BNXT_RE_COAL_CQ_BUF_MAXTIME: + val = rdev->cq_coalescing.buf_maxtime; + break; + case BNXT_RE_COAL_CQ_NORMAL_MAXBUF: + val = rdev->cq_coalescing.normal_maxbuf; + break; + case BNXT_RE_COAL_CQ_DURING_MAXBUF: + val = rdev->cq_coalescing.during_maxbuf; + break; + case BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE: + val = rdev->cq_coalescing.en_ring_idle_mode; + break; + case BNXT_RE_COAL_CQ_ENABLE: + val = rdev->cq_coalescing.enable; + break; + default: + return -EINVAL; + } + + seq_printf(s, "%u\n", val); + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(cq_coal_cfg); + +static void bnxt_re_cleanup_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + debugfs_remove_recursive(rdev->cq_coal_cfg); + kfree(rdev->cq_coal_cfg_params); +} + +static void bnxt_re_init_cq_coal_debugfs(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_dbg_cq_coal_params *dbg_cq_coal_params; + int i; + + if (!_is_cq_coalescing_supported(rdev->dev_attr->dev_cap_flags2)) + return; + + dbg_cq_coal_params = kzalloc(sizeof(*dbg_cq_coal_params), GFP_KERNEL); + if (!dbg_cq_coal_params) + return; + + rdev->cq_coal_cfg = debugfs_create_dir("cq_coal_cfg", rdev->dbg_root); + rdev->cq_coal_cfg_params = dbg_cq_coal_params; + + for (i = 0; i < BNXT_RE_COAL_CQ_MAX; i++) { + dbg_cq_coal_params->params[i].offset = i; + dbg_cq_coal_params->params[i].rdev = rdev; + debugfs_create_file(bnxt_re_cq_coal_str[i], + 0600, rdev->cq_coal_cfg, + &dbg_cq_coal_params->params[i], + &cq_coal_cfg_fops); + } +} + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; @@ -374,10 +499,13 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) rdev->cc_config, tmp_params, &bnxt_re_cc_config_ops); } + + bnxt_re_init_cq_coal_debugfs(rdev); } void bnxt_re_debugfs_rem_pdev(struct bnxt_re_dev *rdev) { + bnxt_re_cleanup_cq_coal_debugfs(rdev); debugfs_remove_recursive(rdev->qp_debugfs); debugfs_remove_recursive(rdev->cc_config); kfree(rdev->cc_config_params); diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.h b/drivers/infiniband/hw/bnxt_re/debugfs.h index 8f101df4e838..98f4620ef245 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.h +++ b/drivers/infiniband/hw/bnxt_re/debugfs.h @@ -33,4 +33,23 @@ struct bnxt_re_cc_param { struct bnxt_re_dbg_cc_config_params { struct bnxt_re_cc_param gen0_parms[BNXT_RE_CC_PARAM_GEN0]; }; + +struct bnxt_re_cq_coal_param { + struct bnxt_re_dev *rdev; + u32 offset; +}; + +enum bnxt_re_cq_coal_types { + BNXT_RE_COAL_CQ_BUF_MAXTIME, + BNXT_RE_COAL_CQ_NORMAL_MAXBUF, + BNXT_RE_COAL_CQ_DURING_MAXBUF, + BNXT_RE_COAL_CQ_EN_RING_IDLE_MODE, + BNXT_RE_COAL_CQ_ENABLE, + BNXT_RE_COAL_CQ_MAX + +}; + +struct bnxt_re_dbg_cq_coal_params { + struct bnxt_re_cq_coal_param params[BNXT_RE_COAL_CQ_MAX]; +}; #endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 84ce3fce2826..f19b55c13d58 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -601,7 +601,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, - BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE); + BNXT_RE_FENCE_PBL_SIZE, PAGE_SIZE, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register 
fence-MR\n"); goto fail; @@ -4027,7 +4028,7 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.hwq.level = PBL_LVL_MAX; mr->qplib_mr.total_size = -1; /* Infinte length */ rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, 0, - PAGE_SIZE); + PAGE_SIZE, false); if (rc) goto fail_mr; @@ -4257,7 +4258,8 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, - umem_pgs, page_size); + umem_pgs, page_size, + _is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to register user MR - rc = %d\n", rc); rc = -EIO; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b13810572c2e..73003ad25ee8 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1453,6 +1453,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, atomic_set(&rdev->stats.res.pd_count, 0); rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; + rdev->cq_coalescing.enable = 1; rdev->cq_coalescing.buf_maxtime = BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME; if (bnxt_re_chip_gen_p7(en_dev->chip_num)) { rdev->cq_coalescing.normal_maxbuf = BNXT_QPLIB_CQ_COAL_DEF_NORMAL_MAXBUF_P7; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ce90d3d834d4..c88f049136fc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2226,7 +2226,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->max_wqe); - if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2)) { + if (_is_cq_coalescing_supported(res->dattr->dev_cap_flags2) && + cq->coalescing->enable) { req.flags |= cpu_to_le16(CMDQ_CREATE_CQ_FLAGS_COALESCING_VALID); coalescing |= ((cq->coalescing->buf_maxtime << CMDQ_CREATE_CQ_BUF_MAXTIME_SFT) & diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index b990d0c0ce1a..1b414a73b46d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -395,6 +395,7 @@ struct bnxt_qplib_cq_coal_param { u8 normal_maxbuf; u8 during_maxbuf; u8 en_ring_idle_mode; + u8 enable; }; #define BNXT_QPLIB_CQ_COAL_DEF_BUF_MAXTIME 0x1 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9ef581ed785c..408a34df2667 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -162,7 +162,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; attr->max_srq_sges = sb->max_srq_sge; attr->max_pkey = 1; - attr->max_inline_data = le32_to_cpu(sb->max_inline_data); + attr->max_inline_data = attr->max_qp_sges * sizeof(struct sq_sge); if (!bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx)) attr->l2_db_size = (sb->l2_db_space_size + 1) * (0x01 << RCFW_DBR_BASE_PAGE_SHIFT); @@ -578,7 +578,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, } int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size) + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct 
bnxt_qplib_hwq_attr hwq_attr = {}; @@ -640,7 +640,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.access = (mr->access_flags & BNXT_QPLIB_MR_ACCESS_MASK); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + if (unified_mr) req.key = cpu_to_le32(mr->pd->id); req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); @@ -651,7 +651,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; - if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + if (unified_mr) { mr->lkey = le32_to_cpu(resp.xid); mr->rkey = mr->lkey; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 147b5d9c0313..5a45c55c6464 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -341,7 +341,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, bool block); int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - struct ib_umem *umem, int num_pbls, u32 buf_pg_size); + struct ib_umem *umem, int num_pbls, u32 buf_pg_size, bool unified_mr); int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, int max); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index dcdfe250bdbe..adeed7447e7b 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -348,7 +348,7 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl, { int err; - pr_debug("*pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + pr_debug("*pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", pbl_addr, rdev->lldi.vr->pbl.start, pbl_size); diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index e0acc185e719..ed21ba0037a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -993,10 +993,10 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, int ret; sock_set_reuseaddr(s->sk); - ret = s->ops->bind(s, laddr, laddrlen); + ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, laddrlen); if (ret) return ret; - ret = s->ops->connect(s, raddr, raddrlen, flags); + ret = s->ops->connect(s, (struct sockaddr_unsized *)raddr, raddrlen, flags); return ret < 0 ? 
ret : 0; } @@ -1315,7 +1315,7 @@ int erdma_create_listen(struct iw_cm_id *id, int backlog) if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) s->sk->sk_bound_dev_if = dev->netdev->ifindex; - ret = s->ops->bind(s, (struct sockaddr *)laddr, + ret = s->ops->bind(s, (struct sockaddr_unsized *)laddr, sizeof(struct sockaddr_in)); if (ret) goto error; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index b35f92e7d865..e4aef102dac0 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -745,8 +745,8 @@ static int create_workqueues(struct hfi1_devdata *dd) ppd->hfi1_wq = alloc_workqueue( "hfi%d_%d", - WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | - WQ_MEM_RECLAIM, + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_PERCPU, HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES, dd->unit, pidx); if (!ppd->hfi1_wq) diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c index 370a5a8eaa71..6e0e3458d202 100644 --- a/drivers/infiniband/hw/hfi1/opfn.c +++ b/drivers/infiniband/hw/hfi1/opfn.c @@ -305,8 +305,8 @@ void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) int opfn_init(void) { opfn_wq = alloc_workqueue("hfi_opfn", - WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | - WQ_MEM_RECLAIM, + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | + WQ_PERCPU, HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); if (!opfn_wq) return -ENOMEM; diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index baf592e6f21b..d07ef02c5231 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,11 +4,13 @@ # ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common ccflags-y += -I $(src) hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o hns_roce_hw_v2.o + hns_roce_debugfs.o hns_roce_hw_v2.o hns_roce_bond.o obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 307c35888b30..0c1c32d23c88 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "hns_roce_device.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c new file mode 100644 index 000000000000..cc85f3ce1f3e --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2025 Hisilicon Limited. 
+ */ + +#include <net/lag.h> +#include <net/bonding.h> +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" + +static DEFINE_XARRAY(roce_bond_xa); + +static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev) +{ + struct ib_device *ibdev = + ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS); + + if (!ibdev) + return NULL; + + return container_of(ibdev, struct hns_roce_dev, ib_dev); +} + +static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev) +{ + struct net_device *upper_dev; + + rcu_read_lock(); + upper_dev = netdev_master_upper_dev_get_rcu(net_dev); + dev_hold(upper_dev); + rcu_read_unlock(); + + return upper_dev; +} + +static int get_netdev_bond_slave_id(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) +{ + int i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) + if (net_dev == bond_grp->bond_func_info[i].net_dev) + return i; + + return -ENOENT; +} + +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + struct hns_roce_bond_group *bond_grp; + struct net_device *upper_dev = NULL; + int i; + + if (!die_info) + return NULL; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return bond_grp; + if (bond_grp->upper_dev) { + upper_dev = get_upper_dev_from_ndev(net_dev); + if (bond_grp->upper_dev == upper_dev) { + dev_put(upper_dev); + return bond_grp; + } + dev_put(upper_dev); + } + } + + return NULL; +} + +static int hns_roce_set_bond_netdev(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev) +{ + struct net_device *active_dev; + struct net_device *old_dev; + int i, ret = 0; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + rcu_read_lock(); + active_dev = + bond_option_active_slave_get_rcu(netdev_priv(bond_grp->upper_dev)); + rcu_read_unlock(); + } else { + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + active_dev = bond_grp->bond_func_info[i].net_dev; + if (active_dev && + ib_get_curr_port_state(active_dev) == IB_PORT_ACTIVE) + break; + } + } + + if (!active_dev || i == ROCE_BOND_FUNC_MAX) + active_dev = get_hr_netdev(hr_dev, 0); + + old_dev = ib_device_get_netdev(&hr_dev->ib_dev, 1); + if (old_dev == active_dev) + goto out; + + ret = ib_device_set_netdev(&hr_dev->ib_dev, active_dev, 1); + if (ret) { + dev_err(hr_dev->dev, "failed to set netdev for bond.\n"); + goto out; + } + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + if (old_dev) + roce_del_all_netdev_gids(&hr_dev->ib_dev, 1, old_dev); + rdma_roce_rescan_port(&hr_dev->ib_dev, 1); + } +out: + dev_put(old_dev); + return ret; +} + +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED && + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return true; + + return false; +} + +static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u32 active_slave_map = 0; + u8 active_slave_num = 0; + bool active; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (!net_dev || !(bond_grp->slave_map & (1U 
<< i))) + continue; + + active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ? + net_lag_port_dev_txable(net_dev) : + (ib_get_curr_port_state(net_dev) == IB_PORT_ACTIVE); + if (active) { + active_slave_num++; + active_slave_map |= (1U << i); + } + } + + bond_grp->active_slave_num = active_slave_num; + bond_grp->active_slave_map = active_slave_map; +} + +static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev) +{ + bond_grp->main_hr_dev = hr_dev; + hns_roce_bond_get_active_slave(bond_grp); + + return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); +} + +static void hns_roce_slave_uninit(struct hns_roce_bond_group *bond_grp, + u8 func_idx) +{ + struct hnae3_handle *handle; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle->priv) + hns_roce_bond_uninit_client(bond_grp, func_idx); +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch); + +static int switch_main_dev(struct hns_roce_bond_group *bond_grp, + u8 main_func_idx) +{ + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + u8 i; + + bond_grp->main_hr_dev = NULL; + hns_roce_bond_uninit_client(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if ((bond_grp->slave_map & (1U << i)) && net_dev) { + /* In case this slave is still being registered as + * a non-bonded PF, uninit it first and then re-init + * it as the main device. + */ + hns_roce_slave_uninit(bond_grp, i); + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) + return -ENODEV; + + return 0; +} + +static struct hns_roce_dev + *hns_roce_slave_init(struct hns_roce_bond_group *bond_grp, + u8 func_idx, bool need_switch) +{ + struct hns_roce_dev *hr_dev = NULL; + struct hnae3_handle *handle; + u8 main_func_idx; + int ret; + + if (need_switch) { + main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + if (func_idx == main_func_idx) { + ret = switch_main_dev(bond_grp, main_func_idx); + if (ret == -ENODEV) + return NULL; + } + } + + handle = bond_grp->bond_func_info[func_idx].handle; + if (handle) { + if (handle->priv) + return handle->priv; + /* Prevent this device from being initialized as a bond device */ + if (need_switch) + bond_grp->bond_func_info[func_idx].net_dev = NULL; + hr_dev = hns_roce_bond_init_client(bond_grp, func_idx); + if (!hr_dev) + BOND_ERR_LOG("failed to init slave %u.\n", func_idx); + } + + return hr_dev; +} + +static struct hns_roce_die_info *alloc_die_info(int bus_num) +{ + struct hns_roce_die_info *die_info; + int ret; + + die_info = kzalloc(sizeof(*die_info), GFP_KERNEL); + if (!die_info) + return NULL; + + ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL)); + if (ret) { + kfree(die_info); + return NULL; + } + + mutex_init(&die_info->die_mutex); + + return die_info; +} + +static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num) +{ + mutex_destroy(&die_info->die_mutex); + xa_erase(&roce_bond_xa, bus_num); + kfree(die_info); +} + +static int alloc_bond_id(struct hns_roce_bond_group *bond_grp) +{ + u8 bus_num = bond_grp->bus_num; + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + int i; + + if (!die_info) { + die_info = alloc_die_info(bus_num); + if (!die_info) + return -ENOMEM; + } + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + if (die_info->bond_id_mask & 
BOND_ID(i)) + continue; + + die_info->bond_id_mask |= BOND_ID(i); + die_info->bgrps[i] = bond_grp; + bond_grp->bond_id = i; + + return 0; + } + + return -ENOSPC; +} + +static int remove_bond_id(int bus_num, u8 bond_id) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + + if (bond_id >= ROCE_BOND_NUM_MAX) + return -EINVAL; + + if (!die_info) + return -ENODEV; + + die_info->bond_id_mask &= ~BOND_ID(bond_id); + die_info->bgrps[bond_id] = NULL; + if (!die_info->bond_id_mask) + dealloc_die_info(die_info, bus_num); + + return 0; +} + +static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) +{ + struct hns_roce_dev *hr_dev; + int ret; + int i; + + for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) { + if (bond_grp->slave_map & (1 << i)) + hns_roce_slave_uninit(bond_grp, i); + } + + mutex_lock(&bond_grp->bond_mutex); + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + bond_grp->main_hr_dev = NULL; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1 << i)) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE set bond finished!\n"); + } +} + +static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) +{ + u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + struct hns_roce_dev *hr_dev; + u8 i; + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + goto out; + + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->main_hr_dev = NULL; + + hns_roce_slave_uninit(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + hr_dev = hns_roce_slave_init(bond_grp, i, false); + if (hr_dev) + bond_grp->main_hr_dev = hr_dev; + } + +out: + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + + mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGESTATE) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to change RoCE bond slave state, ret = %d.\n", + ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave changestate finished!\n"); +} + +static void hns_roce_slave_change_num(struct hns_roce_bond_group *bond_grp) +{ + int ret; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + if (bond_grp->slave_map & (1U << i)) { + if (i == PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn)) + continue; + hns_roce_slave_uninit(bond_grp, i); + } else { + hns_roce_slave_init(bond_grp, i, true); + if (!bond_grp->main_hr_dev) { + ret = -ENODEV; + goto out; + } + bond_grp->bond_func_info[i].net_dev = NULL; + bond_grp->bond_func_info[i].handle = NULL; + } + } + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + +out: + if (ret) { + BOND_ERR_LOG("failed to change RoCE bond slave num, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); + } else { + 
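+		/*
+		 * FW accepted the slave change: move the group back to the
+		 * bonded state under bond_mutex and report completion.
+		 */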
mutex_lock(&bond_grp->bond_mutex); + if (bond_grp->bond_state == HNS_ROCE_BOND_SLAVE_CHANGE_NUM) + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + mutex_unlock(&bond_grp->bond_mutex); + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave change num finished!\n"); + } +} + +static void hns_roce_bond_info_update_nolock(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + struct hns_roce_v2_priv *priv; + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + int func_idx; + + bond_grp->slave_map = 0; + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + func_idx = get_netdev_bond_slave_id(net_dev, bond_grp); + if (func_idx < 0) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + continue; + func_idx = PCI_FUNC(hr_dev->pci_dev->devfn); + if (!bond_grp->bond_func_info[func_idx].net_dev) { + priv = hr_dev->priv; + bond_grp->bond_func_info[func_idx].net_dev = + net_dev; + bond_grp->bond_func_info[func_idx].handle = + priv->handle; + } + ib_device_put(&hr_dev->ib_dev); + } + + bond_grp->slave_map |= (1 << func_idx); + } + rcu_read_unlock(); +} + +static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + bool ret = true; + + if (!hr_dev) { + if (bond_grp && + get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return true; + else + return false; + } + + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) { + ret = false; + goto out; + } + + if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) { + ret = false; + goto out; + } + + if (bond_grp->bus_num != get_hr_bus_num(hr_dev)) + ret = false; + +out: + ib_device_put(&hr_dev->ib_dev); + return ret; +} + +static bool check_slave_support(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + if (is_dev_bond_supported(bond_grp, net_dev)) { + slave_num++; + continue; + } + rcu_read_unlock(); + return false; + } + rcu_read_unlock(); + + return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX); +} + +static void hns_roce_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct hns_roce_bond_group *bond_grp = + container_of(delayed_work, struct hns_roce_bond_group, + bond_work); + enum hns_roce_bond_state bond_state; + bool bond_ready; + + mutex_lock(&bond_grp->bond_mutex); + bond_ready = check_slave_support(bond_grp, bond_grp->upper_dev); + hns_roce_bond_info_update_nolock(bond_grp, bond_grp->upper_dev); + bond_state = bond_grp->bond_state; + bond_grp->bond_ready = bond_ready; + mutex_unlock(&bond_grp->bond_mutex); + + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "bond work: bond_ready - %d, bond_state - %d.\n", + bond_ready, bond_state); + + if (!bond_ready) { + hns_roce_clear_bond(bond_grp); + return; + } + + switch (bond_state) { + case HNS_ROCE_BOND_NOT_BONDED: + hns_roce_set_bond(bond_grp); + /* In set_bond flow, we don't need to set bond netdev here as + * it has been done when bond_grp->main_hr_dev is registered. 
+ */ + return; + case HNS_ROCE_BOND_SLAVE_CHANGESTATE: + hns_roce_slave_changestate(bond_grp); + break; + case HNS_ROCE_BOND_SLAVE_CHANGE_NUM: + hns_roce_slave_change_num(bond_grp); + break; + default: + return; + } + hns_roce_set_bond_netdev(bond_grp, bond_grp->main_hr_dev); +} + +static void hns_roce_attach_bond_grp(struct hns_roce_bond_group *bond_grp, + struct hns_roce_dev *hr_dev, + struct net_device *upper_dev) +{ + bond_grp->upper_dev = upper_dev; + bond_grp->main_hr_dev = hr_dev; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->bond_ready = false; +} + +static void hns_roce_detach_bond_grp(struct hns_roce_bond_group *bond_grp) +{ + mutex_lock(&bond_grp->bond_mutex); + + cancel_delayed_work(&bond_grp->bond_work); + bond_grp->upper_dev = NULL; + bond_grp->main_hr_dev = NULL; + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; + bond_grp->slave_map = 0; + memset(bond_grp->bond_func_info, 0, sizeof(bond_grp->bond_func_info)); + + mutex_unlock(&bond_grp->bond_mutex); +} + +void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + ret = bond_grp->main_hr_dev ? + hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO; + if (ret) + BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE clear bond finished!\n"); + + hns_roce_detach_bond_grp(bond_grp); +} + +static bool lowerstate_event_filter(struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct hns_roce_bond_group *bond_grp_tmp; + + bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bond_grp->bus_num); + return bond_grp_tmp == bond_grp; +} + +static void lowerstate_event_setting(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + mutex_lock(&bond_grp->bond_mutex); + + if (bond_grp->bond_ready && + bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE; + + mutex_unlock(&bond_grp->bond_mutex); +} + +static bool hns_roce_bond_lowerstate_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + + if (!netif_is_lag_port(net_dev)) + return false; + + if (!lowerstate_event_filter(bond_grp, net_dev)) + return false; + + lowerstate_event_setting(bond_grp, info); + + return true; +} + +static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info) +{ + if (!bond_info) + return false; + + if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) + return false; + + if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH && + bond_info->hash_type > NETDEV_LAG_HASH_L23) + return false; + + return true; +} + +static void upper_event_setting(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct netdev_lag_upper_info *bond_upper_info = NULL; + bool slave_inc = info->linking; + + if (slave_inc) + bond_upper_info = info->upper_info; + + if (bond_upper_info) { + bond_grp->tx_type = bond_upper_info->tx_type; + bond_grp->hash_type = bond_upper_info->hash_type; + } +} + +static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) { + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + slave_num++; + } + 
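+	/*
+	 * After unlinking, the group remains a valid bond only if more than
+	 * one known slave is still enslaved to the upper device.
+	 */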
rcu_read_unlock(); + + return (slave_num > 1); +} + +static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info, + struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev) +{ + if (!is_bond_setting_supported(bond_info)) + return false; + + return check_slave_support(bond_grp, upper_dev); +} + +static enum bond_support_type + check_bond_support(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev, + struct netdev_notifier_changeupper_info *info) +{ + bool bond_grp_exist = false; + bool support; + + if (upper_dev == bond_grp->upper_dev) + bond_grp_exist = true; + + if (!info->linking && !bond_grp_exist) + return BOND_NOT_SUPPORT; + + if (info->linking) + support = check_linking_bond_support(info->upper_info, bond_grp, + upper_dev); + else + support = check_unlinking_bond_support(bond_grp); + + if (support) + return BOND_SUPPORT; + + return bond_grp_exist ? BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT; +} + +static bool upper_event_filter(struct netdev_notifier_changeupper_info *info, + struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev) +{ + struct net_device *upper_dev = info->upper_dev; + struct hns_roce_bond_group *bond_grp_tmp; + struct hns_roce_dev *hr_dev; + bool ret = true; + u8 bus_num; + + if (!info->linking || + bond_grp->bond_state != HNS_ROCE_BOND_NOT_ATTACHED) + return bond_grp->upper_dev == upper_dev; + + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) + return false; + + bus_num = get_hr_bus_num(hr_dev); + if (bond_grp->bus_num != bus_num) { + ret = false; + goto out; + } + + bond_grp_tmp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp_tmp && bond_grp_tmp != bond_grp) + ret = false; +out: + ib_device_put(&hr_dev->ib_dev); + return ret; +} + +static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + struct net_device *upper_dev = info->upper_dev; + enum bond_support_type support = BOND_SUPPORT; + struct hns_roce_dev *hr_dev; + int slave_id; + + if (!upper_dev || !netif_is_lag_master(upper_dev)) + return false; + + if (!upper_event_filter(info, bond_grp, net_dev)) + return false; + + mutex_lock(&bond_grp->bond_mutex); + support = check_bond_support(bond_grp, upper_dev, info); + if (support == BOND_NOT_SUPPORT) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_ATTACHED) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (!hr_dev) { + mutex_unlock(&bond_grp->bond_mutex); + return false; + } + hns_roce_attach_bond_grp(bond_grp, hr_dev, upper_dev); + ib_device_put(&hr_dev->ib_dev); + } + + /* In the case of netdev being unregistered, the roce + * instance shouldn't be inited. 
+ */ + if (net_dev->reg_state >= NETREG_UNREGISTERING) { + slave_id = get_netdev_bond_slave_id(net_dev, bond_grp); + if (slave_id >= 0) { + bond_grp->bond_func_info[slave_id].net_dev = NULL; + bond_grp->bond_func_info[slave_id].handle = NULL; + } + } + + if (support == BOND_SUPPORT) { + bond_grp->bond_ready = true; + if (bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGE_NUM; + } + mutex_unlock(&bond_grp->bond_mutex); + if (support == BOND_SUPPORT) + upper_event_setting(bond_grp, info); + + return true; +} + +static int hns_roce_bond_event(struct notifier_block *self, + unsigned long event, void *ptr) +{ + struct hns_roce_bond_group *bond_grp = + container_of(self, struct hns_roce_bond_group, bond_nb); + bool changed = false; + + if (event == NETDEV_CHANGEUPPER) + changed = hns_roce_bond_upper_event(bond_grp, ptr); + if (event == NETDEV_CHANGELOWERSTATE) + changed = hns_roce_bond_lowerstate_event(bond_grp, ptr); + + if (changed) + schedule_delayed_work(&bond_grp->bond_work, HZ); + + return NOTIFY_DONE; +} + +int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + int ret; + int i; + + if (xa_load(&roce_bond_xa, bus_num)) + return 0; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = kvzalloc(sizeof(*bond_grp), GFP_KERNEL); + if (!bond_grp) { + ret = -ENOMEM; + goto mem_err; + } + + mutex_init(&bond_grp->bond_mutex); + INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_bond_work); + + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_ATTACHED; + bond_grp->bus_num = bus_num; + + ret = alloc_bond_id(bond_grp); + if (ret) { + dev_err(hr_dev->dev, + "failed to alloc bond ID, ret = %d.\n", ret); + goto alloc_id_err; + } + + bond_grp->bond_nb.notifier_call = hns_roce_bond_event; + ret = register_netdevice_notifier(&bond_grp->bond_nb); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to register bond nb, ret = %d.\n", ret); + goto register_nb_err; + } + bgrps[i] = bond_grp; + } + + return 0; + +register_nb_err: + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); +alloc_id_err: + mutex_destroy(&bond_grp->bond_mutex); + kvfree(bond_grp); +mem_err: + for (i--; i >= 0; i--) { + unregister_netdevice_notifier(&bgrps[i]->bond_nb); + cancel_delayed_work_sync(&bgrps[i]->bond_work); + remove_bond_id(bgrps[i]->bus_num, bgrps[i]->bond_id); + mutex_destroy(&bgrps[i]->bond_mutex); + kvfree(bgrps[i]); + } + return ret; +} + +void hns_roce_dealloc_bond_grp(void) +{ + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + unsigned long id; + int i; + + xa_for_each(&roce_bond_xa, id, die_info) { + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + unregister_netdevice_notifier(&bond_grp->bond_nb); + cancel_delayed_work_sync(&bond_grp->bond_work); + remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); + mutex_destroy(&bond_grp->bond_mutex); + kvfree(bond_grp); + } + } +} + +int hns_roce_bond_init(struct hns_roce_dev *hr_dev) +{ + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + int ret; + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + + if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT) { + ret = hns_roce_recover_bond(bond_grp, hr_dev); + if (ret) { + 
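+			/*
+			 * Re-pushing the bond configuration to FW after reset
+			 * failed; propagate the error to the caller.
+			 */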
dev_err(hr_dev->dev, + "failed to recover RoCE bond, ret = %d.\n", ret); + return ret; + } + } + + return hns_roce_set_bond_netdev(bond_grp, hr_dev); +} + +void hns_roce_bond_suspend(struct hnae3_handle *handle) +{ + u8 bus_num = handle->pdev->bus->number; + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + int i; + + die_info = xa_load(&roce_bond_xa, bus_num); + if (!die_info) + return; + + mutex_lock(&die_info->die_mutex); + + /* + * Avoid duplicated processing when calling this function + * multiple times. + */ + if (die_info->suspend_cnt) + goto out; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + unregister_netdevice_notifier(&bond_grp->bond_nb); + cancel_delayed_work_sync(&bond_grp->bond_work); + } + +out: + die_info->suspend_cnt++; + mutex_unlock(&die_info->die_mutex); +} + +void hns_roce_bond_resume(struct hnae3_handle *handle) +{ + u8 bus_num = handle->pdev->bus->number; + struct hns_roce_bond_group *bond_grp; + struct hns_roce_die_info *die_info; + int i, ret; + + die_info = xa_load(&roce_bond_xa, bus_num); + if (!die_info) + return; + + mutex_lock(&die_info->die_mutex); + + die_info->suspend_cnt--; + if (die_info->suspend_cnt) + goto out; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + ret = register_netdevice_notifier(&bond_grp->bond_nb); + if (ret) + dev_err(&handle->pdev->dev, + "failed to resume bond notifier(bus_num = %u, id = %u), ret = %d.\n", + bus_num, bond_grp->bond_id, ret); + } + +out: + mutex_unlock(&die_info->die_mutex); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h new file mode 100644 index 000000000000..98c295d78ca1 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2025 Hisilicon Limited. + */ + +#ifndef _HNS_ROCE_BOND_H +#define _HNS_ROCE_BOND_H + +#include <linux/netdevice.h> +#include <net/bonding.h> + +#define ROCE_BOND_FUNC_MAX 4 +#define ROCE_BOND_NUM_MAX 2 + +#define BOND_ID(id) BIT(id) + +#define BOND_ERR_LOG(fmt, ...) 
\ + pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__) + +enum { + BOND_MODE_1, + BOND_MODE_2_4, +}; + +enum hns_roce_bond_hashtype { + BOND_HASH_L2, + BOND_HASH_L34, + BOND_HASH_L23, +}; + +enum bond_support_type { + BOND_NOT_SUPPORT, + /* + * bond_grp already exists, but in the current + * conditions it's no longer supported + */ + BOND_EXISTING_NOT_SUPPORT, + BOND_SUPPORT, +}; + +enum hns_roce_bond_state { + HNS_ROCE_BOND_NOT_ATTACHED, + HNS_ROCE_BOND_NOT_BONDED, + HNS_ROCE_BOND_IS_BONDED, + HNS_ROCE_BOND_SLAVE_CHANGE_NUM, + HNS_ROCE_BOND_SLAVE_CHANGESTATE, +}; + +enum hns_roce_bond_cmd_type { + HNS_ROCE_SET_BOND, + HNS_ROCE_CHANGE_BOND, + HNS_ROCE_CLEAR_BOND, +}; + +struct hns_roce_func_info { + struct net_device *net_dev; + struct hnae3_handle *handle; +}; + +struct hns_roce_bond_group { + struct net_device *upper_dev; + struct hns_roce_dev *main_hr_dev; + u8 active_slave_num; + u32 slave_map; + u32 active_slave_map; + u8 bond_id; + u8 bus_num; + struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; + bool bond_ready; + enum hns_roce_bond_state bond_state; + enum netdev_lag_tx_type tx_type; + enum netdev_lag_hash hash_type; + struct mutex bond_mutex; + struct notifier_block bond_nb; + struct delayed_work bond_work; +}; + +struct hns_roce_die_info { + u8 bond_id_mask; + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; + struct mutex die_mutex; + u8 suspend_cnt; +}; + +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num); +int hns_roce_alloc_bond_grp(struct hns_roce_dev *hr_dev); +void hns_roce_dealloc_bond_grp(void); +void hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp); +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); +int hns_roce_bond_init(struct hns_roce_dev *hr_dev); +void hns_roce_bond_suspend(struct hnae3_handle *handle); +void hns_roce_bond_resume(struct hnae3_handle *handle); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 06832c0ac055..318f18cf37aa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -33,6 +33,7 @@ #ifndef _HNS_ROCE_DEVICE_H #define _HNS_ROCE_DEVICE_H +#include <linux/pci.h> #include <rdma/ib_verbs.h> #include <rdma/hns-abi.h> #include "hns_roce_debugfs.h" @@ -153,6 +154,7 @@ enum { HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), + HNS_ROCE_CAP_FLAG_BOND = BIT(21), HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), }; @@ -177,6 +179,7 @@ enum hns_roce_instance_state { HNS_ROCE_STATE_INIT, HNS_ROCE_STATE_INITED, HNS_ROCE_STATE_UNINIT, + HNS_ROCE_STATE_BOND_UNINIT, }; enum { @@ -1167,6 +1170,17 @@ static inline u8 get_tclass(const struct ib_global_route *grh) grh->traffic_class >> DSCP_SHIFT : grh->traffic_class; } +static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev, + u8 port) +{ + return hr_dev->iboe.netdevs[port]; +} + +static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev) +{ + return hr_dev->pci_dev->bus->number; +} + void hns_roce_init_uar_table(struct hns_roce_dev *dev); int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar); @@ -1293,7 +1307,7 @@ void hns_roce_flush_cqe(struct hns_roce_dev *hr_dev, u32 qpn); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); -void hns_roce_exit(struct hns_roce_dev *hr_dev); +void 
hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup); int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 63052c0e7613..2d6ae89e525b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,11 +43,13 @@ #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> +#include "hclge_main.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" #define CREATE_TRACE_POINTS #include "hns_roce_trace.h" @@ -1434,6 +1436,79 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, return ret; } +static enum hns_roce_opcode_type + get_bond_opcode(enum hns_roce_bond_cmd_type bond_type) +{ + switch (bond_type) { + case HNS_ROCE_SET_BOND: + return HNS_ROCE_OPC_SET_BOND_INFO; + case HNS_ROCE_CHANGE_BOND: + return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT; + case HNS_ROCE_CLEAR_BOND: + return HNS_ROCE_OPC_CLEAR_BOND_INFO; + default: + WARN(true, "Invalid bond type %d!\n", bond_type); + return HNS_ROCE_OPC_SET_BOND_INFO; + } +} + +static enum hns_roce_bond_hashtype + get_bond_hashtype(enum netdev_lag_hash netdev_hashtype) +{ + switch (netdev_hashtype) { + case NETDEV_LAG_HASH_L2: + return BOND_HASH_L2; + case NETDEV_LAG_HASH_L34: + return BOND_HASH_L34; + case NETDEV_LAG_HASH_L23: + return BOND_HASH_L23; + default: + WARN(true, "Invalid hash type %d!\n", netdev_hashtype); + return BOND_HASH_L2; + } +} + +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type) +{ + enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type); + struct hns_roce_bond_info *slave_info; + struct hns_roce_cmq_desc desc = {}; + int ret; + + slave_info = (struct hns_roce_bond_info *)desc.data; + hns_roce_cmq_setup_basic_desc(&desc, opcode, false); + + slave_info->bond_id = cpu_to_le32(bond_grp->bond_id); + if (bond_type == HNS_ROCE_CLEAR_BOND) + goto out; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_1); + if (bond_grp->active_slave_num != 1) + ibdev_warn(&bond_grp->main_hr_dev->ib_dev, + "active slave cnt(%u) in Mode 1 is invalid.\n", + bond_grp->active_slave_num); + } else { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4); + slave_info->hash_policy = + cpu_to_le32(get_bond_hashtype(bond_grp->hash_type)); + } + + slave_info->active_slave_cnt = cpu_to_le32(bond_grp->active_slave_num); + slave_info->active_slave_mask = cpu_to_le32(bond_grp->active_slave_map); + slave_info->slave_mask = cpu_to_le32(bond_grp->slave_map); + +out: + ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1); + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "cmq bond type(%d) failed, ret = %d.\n", + bond_type, ret); + + return ret; +} + static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, dma_addr_t base_addr, u8 cmd, unsigned long tag) { @@ -2275,6 +2350,9 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev) caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) << HNS_ROCE_CAP_FLAGS_EX_SHIFT; + if (hr_dev->is_vf) + caps->flags &= ~HNS_ROCE_CAP_FLAG_BOND; + caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS); caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID); 
caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH); @@ -7067,7 +7145,7 @@ error_failed_kzalloc: } static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, - bool reset) + bool reset, bool bond_cleanup) { struct hns_roce_dev *hr_dev = handle->priv; @@ -7079,7 +7157,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - hns_roce_exit(hr_dev); + hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } @@ -7130,12 +7208,51 @@ reset_chk_err: static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) - return; + goto out; handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; - __hns_roce_hw_v2_uninit_instance(handle, reset); + __hns_roce_hw_v2_uninit_instance(handle, reset, true); + + handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; + +out: + hns_roce_bond_resume(handle); +} + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle; + int ret; + + handle = bond_grp->bond_func_info[func_idx].handle; + if (!handle || !handle->client) + return NULL; + + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) + return NULL; + + return handle->priv; +} + +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle; + + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) + return; + + handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT; + + __hns_roce_hw_v2_uninit_instance(handle, false, false); handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } @@ -7144,6 +7261,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; + /* Suspend bond to avoid concurrency */ + hns_roce_bond_suspend(handle); + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) { set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state); return 0; @@ -7174,6 +7294,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) if (test_and_clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state)) { handle->rinfo.reset_state = HNS_ROCE_STATE_RST_INITED; + hns_roce_bond_resume(handle); return 0; } @@ -7193,6 +7314,7 @@ static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) dev_info(dev, "reset done, RoCE client reinit finished.\n"); } + hns_roce_bond_resume(handle); return ret; } @@ -7204,7 +7326,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT; dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n"); msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY); - __hns_roce_hw_v2_uninit_instance(handle, false); + __hns_roce_hw_v2_uninit_instance(handle, false, false); return 0; } @@ -7240,6 +7362,14 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, if (linkup || !hr_dev) return; + /* For bond device, the link status depends on the upper netdev, + * and the upper device's link status depends on all the slaves' + * netdev but not only one. So bond device cannot get a correct + * link status from this path. 
+ */ + if (hns_roce_get_bond_grp(netdev, get_hr_bus_num(hr_dev))) + return; + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); } @@ -7264,6 +7394,7 @@ static int __init hns_roce_hw_v2_init(void) static void __exit hns_roce_hw_v2_exit(void) { + hns_roce_dealloc_bond_grp(); hnae3_unregister_client(&hns_roce_hw_v2_client); hns_roce_cleanup_debugfs(); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index e64a04d6f85b..285fe0875fac 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -35,6 +35,7 @@ #include <linux/bitops.h> #include "hnae3.h" +#include "hns_roce_bond.h" #define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32 #define HNS_ROCE_V2_MTT_ENTRY_SZ 64 @@ -228,6 +229,9 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, + HNS_ROCE_OPC_SET_BOND_INFO = 0x8601, + HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602, + HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603, }; #define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 @@ -1465,7 +1469,23 @@ struct hns_roce_sccc_clr_done { __le32 rsv[5]; }; +struct hns_roce_bond_info { + __le32 bond_id; + __le32 bond_mode; + __le32 active_slave_cnt; + __le32 active_slave_mask; + __le32 slave_mask; + __le32 hash_policy; +}; + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx); +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx); int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type); static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index f3607fe107a7..2f4864ab7d4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -32,7 +32,6 @@ */ #include <linux/acpi.h> #include <linux/module.h> -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -41,6 +40,7 @@ #include "hns_roce_device.h" #include "hns_roce_hem.h" #include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -89,30 +89,75 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) return ret; } -static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port, - unsigned long event) +static int hns_roce_get_port_state(struct hns_roce_dev *hr_dev, u32 port_num, + enum ib_port_state *state) { + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + struct net_device *net_dev; + + net_dev = ib_device_get_netdev(&hr_dev->ib_dev, port_num); + if (!net_dev) + return -ENODEV; + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) { + *state = ib_get_curr_port_state(bond_grp->upper_dev); + goto out; + } + } + + *state = ib_get_curr_port_state(net_dev); +out: + dev_put(net_dev); + return 0; +} + +static int handle_en_event(struct net_device *netdev, + struct hns_roce_dev *hr_dev, + u32 port, unsigned long event) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; struct device *dev = hr_dev->dev; - struct net_device *netdev; + enum ib_port_state curr_state; + struct ib_event ibevent; int ret = 0; - netdev = 
hr_dev->iboe.netdevs[port]; if (!netdev) { dev_err(dev, "can't find netdev on port(%u)!\n", port); return -ENODEV; } switch (event) { - case NETDEV_UP: - case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_CHANGEADDR: ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); break; + case NETDEV_UP: + case NETDEV_CHANGE: + ret = hns_roce_set_mac(hr_dev, port, netdev->dev_addr); + if (ret) + return ret; + fallthrough; case NETDEV_DOWN: - /* - * In v1 engine, only support all ports closed together. - */ + if (!netif_is_lag_master(netdev)) + break; + curr_state = ib_get_curr_port_state(netdev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return 0; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port + 1; + ib_dispatch_event(&ibevent); break; default: dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event)); @@ -126,17 +171,25 @@ static int hns_roce_netdev_event(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_bond_group *bond_grp; struct hns_roce_ib_iboe *iboe = NULL; struct hns_roce_dev *hr_dev = NULL; + struct net_device *upper = NULL; int ret; u32 port; hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); iboe = &hr_dev->iboe; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), + get_hr_bus_num(hr_dev)); + upper = bond_grp ? bond_grp->upper_dev : NULL; + } for (port = 0; port < hr_dev->caps.num_ports; port++) { - if (dev == iboe->netdevs[port]) { - ret = handle_en_event(hr_dev, port, event); + if ((!upper && dev == iboe->netdevs[port]) || + (upper && dev == upper)) { + ret = handle_en_event(dev, hr_dev, port, event); if (ret) return NOTIFY_DONE; break; @@ -148,12 +201,13 @@ static int hns_roce_netdev_event(struct notifier_block *self, static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) { + struct net_device *net_dev; int ret; u8 i; for (i = 0; i < hr_dev->caps.num_ports; i++) { - ret = hns_roce_set_mac(hr_dev, i, - hr_dev->iboe.netdevs[i]->dev_addr); + net_dev = get_hr_netdev(hr_dev, i); + ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr); if (ret) return ret; } @@ -221,9 +275,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, struct ib_port_attr *props) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); - struct device *dev = hr_dev->dev; struct net_device *net_dev; - unsigned long flags; enum ib_mtu mtu; u32 port; int ret; @@ -244,26 +296,26 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, if (ret) ibdev_warn(ib_dev, "failed to get speed, ret = %d.\n", ret); - spin_lock_irqsave(&hr_dev->iboe.lock, flags); - - net_dev = hr_dev->iboe.netdevs[port]; + net_dev = ib_device_get_netdev(ib_dev, port_num); if (!net_dev) { - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - dev_err(dev, "find netdev %u failed!\n", port); + ibdev_err(ib_dev, "find netdev %u failed!\n", port); return -EINVAL; } mtu = iboe_get_mtu(net_dev->mtu); props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256; - props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ? 
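The rewritten handle_en_event() above only raises an IB port event when the cached last_port_state actually changes, so repeated NETDEV_UP/NETDEV_CHANGE notifications for the LAG master do not produce duplicate IB_EVENT_PORT_ACTIVE/PORT_ERR events. A small standalone sketch of that dedup-then-dispatch pattern; the names are illustrative, and the cache_lock taken in the real code is omitted:

    #include <stdbool.h>
    #include <stdio.h>

    enum port_state { PORT_DOWN, PORT_ACTIVE };

    struct port_cache {
            enum port_state last_state;   /* mirrors cache.last_port_state */
    };

    /* Returns true if an event was dispatched, false if it was suppressed. */
    static bool report_port_state(struct port_cache *pc, enum port_state curr)
    {
            if (pc->last_state == curr)
                    return false;         /* no change: suppress duplicate */
            pc->last_state = curr;
            printf("dispatch %s\n",
                   curr == PORT_ACTIVE ? "PORT_ACTIVE" : "PORT_ERR");
            return true;
    }

    int main(void)
    {
            struct port_cache pc = { .last_state = PORT_DOWN };

            report_port_state(&pc, PORT_ACTIVE);  /* dispatched */
            report_port_state(&pc, PORT_ACTIVE);  /* suppressed */
            report_port_state(&pc, PORT_DOWN);    /* dispatched */
            return 0;
    }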
- IB_PORT_ACTIVE : - IB_PORT_DOWN; + + dev_put(net_dev); + + ret = hns_roce_get_port_state(hr_dev, port_num, &props->state); + if (ret) { + ibdev_err(ib_dev, "failed to get port state.\n"); + return ret; + } + props->phys_state = props->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; - - spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); - return 0; } @@ -617,9 +669,40 @@ static int hns_roce_get_hw_stats(struct ib_device *device, return num_counters; } -static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) +static void + hns_roce_unregister_bond_cleanup(struct hns_roce_dev *hr_dev, + struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + int i; + + /* To avoid the loss of other slave devices when main_hr_dev + * is unregistered, re-initialize the remaining slaves before + * the bond resources cleanup. + */ + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && net_dev != get_hr_netdev(hr_dev, 0)) + hns_roce_bond_init_client(bond_grp, i); + } + + hns_roce_cleanup_bond(bond_grp); +} + +static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, + bool bond_cleanup) { + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + if (bond_cleanup && hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (bond_grp) + hns_roce_unregister_bond_cleanup(hr_dev, bond_grp); + } hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); @@ -708,11 +791,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = { static int hns_roce_register_device(struct hns_roce_dev *hr_dev) { - int ret; struct hns_roce_ib_iboe *iboe = NULL; - struct ib_device *ib_dev = NULL; struct device *dev = hr_dev->dev; + struct ib_device *ib_dev = NULL; + struct net_device *net_dev; unsigned int i; + int ret; iboe = &hr_dev->iboe; spin_lock_init(&iboe->lock); @@ -747,17 +831,38 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); - for (i = 0; i < hr_dev->caps.num_ports; i++) { - if (!hr_dev->iboe.netdevs[i]) - continue; - ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i], - i + 1); - if (ret) + dma_set_max_seg_size(dev, SZ_2G); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + ret = hns_roce_alloc_bond_grp(hr_dev); + if (ret) { + dev_err(dev, "failed to alloc bond_grp for bus %u, ret = %d\n", + get_hr_bus_num(hr_dev), ret); return ret; + } + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND && + hns_roce_bond_is_active(hr_dev)) { + ret = hns_roce_bond_init(hr_dev); + if (ret) { + dev_err(dev, "failed to init bond!\n"); + return ret; + } + ret = ib_register_device(ib_dev, "hns_bond_%d", dev); + } else { + for (i = 0; i < hr_dev->caps.num_ports; i++) { + net_dev = get_hr_netdev(hr_dev, i); + if (!net_dev) + continue; + + ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); + if (ret) + return ret; + } + ret = ib_register_device(ib_dev, "hns_%d", dev); } - dma_set_max_seg_size(dev, SZ_2G); - ret = ib_register_device(ib_dev, "hns_%d", dev); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; @@ -1157,10 +1262,10 @@ error_failed_alloc_dfx_cnt: return ret; } 
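hns_roce_query_port() now resolves the reported state through hns_roce_get_port_state(), which prefers the bond's upper netdev when the port is enslaved and falls back to the slave's own netdev otherwise. A condensed standalone sketch of that selection (toy types only; the real code also takes and drops netdev references and checks the bond capability flag):

    #include <stdio.h>

    enum port_state { PORT_DOWN, PORT_ACTIVE };

    struct netdev { enum port_state state; };
    struct bond_group { struct netdev *upper; };

    /* Report the upper (bond) device's state when the slave is bonded,
     * otherwise the slave's own state. */
    static enum port_state resolve_port_state(struct netdev *slave,
                                              struct bond_group *grp)
    {
            struct netdev *src = (grp && grp->upper) ? grp->upper : slave;

            return src->state;
    }

    int main(void)
    {
            struct netdev slave = { PORT_DOWN }, upper = { PORT_ACTIVE };
            struct bond_group grp = { &upper };

            /* A down slave still reports ACTIVE while the bond as a whole is up. */
            printf("%d\n", resolve_port_state(&slave, &grp));
            printf("%d\n", resolve_port_state(&slave, NULL));
            return 0;
    }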
-void hns_roce_exit(struct hns_roce_dev *hr_dev) +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup) { hns_roce_unregister_debugfs(hr_dev); - hns_roce_unregister_device(hr_dev); + hns_roce_unregister_device(hr_dev, bond_cleanup); if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index d35cf59d0f43..225c3e328e0e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include "hns_roce_device.h" void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index bdd879ac12dd..d1640c5fbaab 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,7 +31,6 @@ * SOFTWARE. */ -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> @@ -1348,11 +1347,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_attr *attr, int attr_mask) { + struct net_device *net_dev; enum ib_mtu active_mtu; int p; p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; - active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu); + net_dev = get_hr_netdev(hr_dev, p); + active_mtu = iboe_get_mtu(net_dev->mtu); if ((hr_dev->caps.max_mtu >= IB_MTU_2048 && attr->path_mtu > hr_dev->caps.max_mtu) || diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 1090051f493b..8a6efb6b9c9e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -3,7 +3,6 @@ * Copyright (c) 2018 Hisilicon Limited. 
*/ -#include <linux/pci.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> #include "hns_roce_device.h" diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index c6a0a661d6e7..f4f4f92ba63a 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3710,7 +3710,7 @@ int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) iwpd = iwqp->iwpd; tagged_offset = (uintptr_t)iwqp->ietf_mem.va; ibmr = irdma_reg_phys_mr(&iwpd->ibpd, iwqp->ietf_mem.pa, buf_len, - IB_ACCESS_LOCAL_WRITE, &tagged_offset); + IB_ACCESS_LOCAL_WRITE, &tagged_offset, false); if (IS_ERR(ibmr)) { ret = -ENOMEM; goto error; diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 4ef1c29032f7..ce5cf89c463c 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -2943,8 +2943,6 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, __le64 *wqe; struct irdma_sc_cqp *cqp; u64 hdr; - struct irdma_sc_ceq *ceq; - int ret_code = 0; cqp = cq->dev->cqp; if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt) @@ -2953,19 +2951,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - ret_code = irdma_sc_add_cq_ctx(ceq, cq); - - if (ret_code) - return ret_code; - wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); - if (!wqe) { - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); + if (!wqe) return -ENOMEM; - } set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); @@ -3018,17 +3006,12 @@ int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) struct irdma_sc_cqp *cqp; __le64 *wqe; u64 hdr; - struct irdma_sc_ceq *ceq; cqp = cq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) return -ENOMEM; - ceq = cq->dev->ceq[cq->ceq_id]; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, cq); - set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); set_64bit_val(wqe, 40, cq->shadow_area_pa); @@ -3602,71 +3585,6 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, } /** - * irdma_sc_find_reg_cq - find cq ctx index - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq, - struct irdma_sc_cq *cq) -{ - u32 i; - - for (i = 0; i < ceq->reg_cq_size; i++) { - if (cq == ceq->reg_cq[i]) - return i; - } - - return IRDMA_INVALID_CQ_IDX; -} - -/** - * irdma_sc_add_cq_ctx - add cq ctx tracking for ceq - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - - if (ceq->reg_cq_size == ceq->elem_cnt) { - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - return -ENOMEM; - } - - ceq->reg_cq[ceq->reg_cq_size++] = cq; - - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - - return 0; -} - -/** - * irdma_sc_remove_cq_ctx - remove cq ctx tracking for ceq - * @ceq: ceq sc structure - * @cq: cq sc structure - */ -void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) -{ - unsigned long flags; - u32 cq_ctx_idx; - - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_ctx_idx = irdma_sc_find_reg_cq(ceq, cq); - if (cq_ctx_idx == IRDMA_INVALID_CQ_IDX) - goto exit; - - 
ceq->reg_cq_size--; - if (cq_ctx_idx != ceq->reg_cq_size) - ceq->reg_cq[cq_ctx_idx] = ceq->reg_cq[ceq->reg_cq_size]; - ceq->reg_cq[ceq->reg_cq_size] = NULL; - -exit: - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); -} - -/** * irdma_sc_cqp_init - Initialize buffers for a control Queue Pair * @cqp: IWARP control queue pair pointer * @info: IWARP control queue pair init info pointer @@ -3950,11 +3868,13 @@ int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp) */ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) { + unsigned long flags; u64 temp_val; u16 sw_cq_sel; u8 arm_next_se; u8 arm_seq_num; + spin_lock_irqsave(&ccq->dev->cqp_lock, flags); get_64bit_val(ccq->cq_uk.shadow_area, 32, &temp_val); sw_cq_sel = (u16)FIELD_GET(IRDMA_CQ_DBSA_SW_CQ_SELECT, temp_val); arm_next_se = (u8)FIELD_GET(IRDMA_CQ_DBSA_ARM_NEXT_SE, temp_val); @@ -3965,6 +3885,7 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT_SE, arm_next_se) | FIELD_PREP(IRDMA_CQ_DBSA_ARM_NEXT, 1); set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); + spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); dma_wmb(); /* make sure shadow area is updated before arming */ @@ -4387,9 +4308,6 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, ceq->ceq_elem_pa = info->ceqe_pa; ceq->virtual_map = info->virtual_map; ceq->itr_no_expire = info->itr_no_expire; - ceq->reg_cq = info->reg_cq; - ceq->reg_cq_size = 0; - spin_lock_init(&ceq->req_cq_lock); ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0); ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0); ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL); @@ -4472,9 +4390,6 @@ int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) { struct irdma_sc_cqp *cqp; - if (ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, ceq->dev->ccq); - cqp = ceq->dev->cqp; cqp->process_cqp_sds = irdma_update_sds_noccq; @@ -4493,11 +4408,6 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) struct irdma_sc_dev *dev = ceq->dev; dev->ccq->vsi_idx = ceq->vsi_idx; - if (ceq->reg_cq) { - ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq); - if (ret_code) - return ret_code; - } ret_code = irdma_sc_ceq_create(ceq, scratch, true); if (!ret_code) @@ -4562,7 +4472,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) struct irdma_sc_cq *temp_cq; u8 polarity; u32 cq_idx; - unsigned long flags; do { cq_idx = 0; @@ -4583,11 +4492,6 @@ void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) } cq = temp_cq; - if (ceq->reg_cq) { - spin_lock_irqsave(&ceq->req_cq_lock, flags); - cq_idx = irdma_sc_find_reg_cq(ceq, cq); - spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - } IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) @@ -4731,7 +4635,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, u64 hdr; dev = aeq->dev; - if (dev->privileged) + + if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); cqp = dev->cqp; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 7bad0e38786a..d1fc5726b979 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -2365,7 +2365,6 @@ static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; - memset(info, 0, sizeof(*info)); info->add = add_port; info->port = accel_local_port; cqp_info->cqp_cmd = IRDMA_OP_MANAGE_APBVT_ENTRY; @@ -2474,7 
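irdma_sc_ccq_arm() above becomes a locked read-modify-write: the arm sequence number, SW CQ select and arm-next bits share one 64-bit shadow word, and taking cqp_lock keeps concurrent updates of that word from interleaving between the get and the set. A minimal userspace sketch of the same pattern, with plain shifts standing in for FIELD_GET/FIELD_PREP and a pthread mutex standing in for cqp_lock; the field layout and widths here are illustrative only:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative layout of a packed doorbell shadow word. */
    #define ARM_SEQ_SHIFT   0
    #define ARM_SEQ_MASK    0x3ULL
    #define CQ_SEL_MASK     (0xffffULL << 2)
    #define ARM_NEXT_BIT    (0x1ULL << 18)

    static pthread_mutex_t shadow_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t shadow_word;

    static void arm_ccq(void)
    {
            pthread_mutex_lock(&shadow_lock);

            uint64_t val = shadow_word;
            uint64_t seq = (val & ARM_SEQ_MASK) >> ARM_SEQ_SHIFT;
            uint64_t sel = val & CQ_SEL_MASK;          /* preserved as-is */

            seq = (seq + 1) & (ARM_SEQ_MASK >> ARM_SEQ_SHIFT);
            shadow_word = sel | (seq << ARM_SEQ_SHIFT) | ARM_NEXT_BIT;

            pthread_mutex_unlock(&shadow_lock);
            /* the real driver follows with dma_wmb() and a doorbell write */
    }

    int main(void)
    {
            arm_ccq();
            arm_ccq();
            printf("shadow=0x%llx\n", (unsigned long long)shadow_word);
            return 0;
    }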
+2473,6 @@ void irdma_manage_arp_cache(struct irdma_pci_f *rf, if (action == IRDMA_ARP_ADD) { cqp_info->cqp_cmd = IRDMA_OP_ADD_ARP_CACHE_ENTRY; info = &cqp_info->in.u.add_arp_cache_entry.info; - memset(info, 0, sizeof(*info)); info->arp_index = (u16)arp_index; info->permanent = true; ether_addr_copy(info->mac_addr, mac_addr); @@ -2533,7 +2531,6 @@ int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_qhash_table_entry.info; - memset(info, 0, sizeof(*info)); info->vsi = &iwdev->vsi; info->manage = mtype; info->entry_type = etype; diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c index 27b191f61caf..b49fd9cf2476 100644 --- a/drivers/infiniband/hw/irdma/icrdma_if.c +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -302,7 +302,8 @@ err_rt_init: err_ctrl_init: icrdma_deinit_interrupts(rf, cdev_info); err_init_interrupts: - kfree(iwdev->rf); + mutex_destroy(&rf->ah_tbl_lock); + kfree(rf); ib_dealloc_device(&iwdev->ibdev); return err; @@ -319,6 +320,9 @@ static void icrdma_remove(struct auxiliary_device *aux_dev) ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); icrdma_deinit_interrupts(iwdev->rf, cdev_info); + mutex_destroy(&iwdev->rf->ah_tbl_lock); + + kfree(iwdev->rf); pr_debug("INIT: Gen[%d] func[%d] device remove success\n", rdma_ver, PCI_FUNC(cdev_info->pdev->devfn)); diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c index 1bb42eb298ba..e1d6670d9396 100644 --- a/drivers/infiniband/hw/irdma/ig3rdma_if.c +++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c @@ -55,6 +55,7 @@ static int ig3rdma_vchnl_init(struct irdma_pci_f *rf, ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info); if (ret) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return ret; } @@ -124,7 +125,9 @@ static void ig3rdma_decfg_rf(struct irdma_pci_f *rf) { struct irdma_hw *hw = &rf->hw; + mutex_destroy(&rf->ah_tbl_lock); destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); kfree(hw->io_regs); iounmap(hw->rdma_reg.addr); } @@ -149,6 +152,7 @@ static int ig3rdma_cfg_rf(struct irdma_pci_f *rf, err = ig3rdma_cfg_regions(&rf->hw, cdev_info); if (err) { destroy_workqueue(rf->vchnl_wq); + mutex_destroy(&rf->sc_dev.vchnl_mutex); return err; } diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 886b30da188a..baab61e424a2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -556,7 +556,7 @@ void irdma_copy_ip_htonl(__be32 *dst, u32 *src); u16 irdma_get_vlan_ipv4(u32 *addr); void irdma_get_vlan_mac_ipv6(u32 *addr, u16 *vlan_id, u8 *mac); struct ib_mr *irdma_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, - int acc, u64 *iova_start); + int acc, u64 *iova_start, bool dma_mr); int irdma_upload_qp_context(struct irdma_qp *iwqp, bool freeze, bool raw); void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, @@ -564,7 +564,6 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, void (*callback_fcn)(struct irdma_cqp_request *cqp_request), void *cb_param); void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request); -bool irdma_cq_empty(struct irdma_cq *iwcq); int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event, void *ptr); int 
irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index fa6325adaede..28dfad7f940c 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -506,12 +506,14 @@ exit: void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_pble_alloc *palloc) { - pble_rsrc->freedpbles += palloc->total_cnt; - if (palloc->level == PBLE_LEVEL_2) free_lvl2(pble_rsrc, palloc); else irdma_prm_return_pbles(&pble_rsrc->pinfo, &palloc->level1.chunkinfo); + + mutex_lock(&pble_rsrc->pble_mutex_lock); + pble_rsrc->freedpbles += palloc->total_cnt; pble_rsrc->stats_alloc_freed++; + mutex_unlock(&pble_rsrc->pble_mutex_lock); } diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 694e5a9ed15d..cee47ddbd1b5 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -685,7 +685,6 @@ static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) ukqp->rq_size = rsrc->rq_size; IRDMA_RING_INIT(ukqp->sq_ring, ukqp->sq_size); - IRDMA_RING_INIT(ukqp->initial_ring, ukqp->sq_size); IRDMA_RING_INIT(ukqp->rq_ring, ukqp->rq_size); ukqp->wqe_alloc_db = qp->pd->dev->wqe_alloc_db; @@ -726,7 +725,6 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) struct irdma_sc_cqp *cqp; u64 hdr; struct irdma_ccq_cqe_info compl_info; - int status = 0; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0); @@ -756,16 +754,8 @@ static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) print_hex_dump_debug("PUDA: PUDA CREATE CQ", DUMP_PREFIX_OFFSET, 16, 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); irdma_sc_cqp_post_sq(dev->cqp); - status = irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, - &compl_info); - if (!status) { - struct irdma_sc_ceq *ceq = dev->ceq[0]; - - if (ceq && ceq->reg_cq) - status = irdma_sc_add_cq_ctx(ceq, cq); - } - - return status; + return irdma_sc_poll_for_cqp_op_done(dev->cqp, IRDMA_CQP_OP_CREATE_CQ, + &compl_info); } /** @@ -897,23 +887,17 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, struct irdma_puda_buf *buf = NULL; struct irdma_puda_buf *nextbuf = NULL; struct irdma_virt_mem *vmem; - struct irdma_sc_ceq *ceq; - ceq = vsi->dev->ceq[0]; switch (type) { case IRDMA_PUDA_RSRC_TYPE_ILQ: rsrc = vsi->ilq; vmem = &vsi->ilq_mem; vsi->ilq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; case IRDMA_PUDA_RSRC_TYPE_IEQ: rsrc = vsi->ieq; vmem = &vsi->ieq_mem; vsi->ieq = NULL; - if (ceq && ceq->reg_cq) - irdma_sc_remove_cq_ctx(ceq, &rsrc->cq); break; default: ibdev_dbg(to_ibdev(dev), "PUDA: error resource type = 0x%x\n", diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index c1b8f81ea283..cab4896640a1 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -492,9 +492,6 @@ struct irdma_sc_ceq { u32 first_pm_pbl_idx; u8 polarity; u16 vsi_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_size; - spinlock_t req_cq_lock; /* protect access to reg_cq array */ bool virtual_map:1; bool tph_en:1; bool itr_no_expire:1; @@ -894,8 +891,6 @@ struct irdma_ceq_init_info { u8 tph_val; u16 vsi_idx; u32 first_pm_pbl_idx; - struct irdma_sc_cq **reg_cq; - u32 reg_cq_idx; }; struct irdma_aeq_init_info { diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index ce1ae10c30fc..f0846b800913 100644 --- 
a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -114,33 +114,8 @@ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) */ void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) { - u64 temp; - u32 hw_sq_tail; - u32 sw_sq_head; - - /* valid bit is written and loads completed before reading shadow */ - mb(); - - /* read the doorbell shadow area */ - get_64bit_val(qp->shadow_area, 0, &temp); - - hw_sq_tail = (u32)FIELD_GET(IRDMA_QP_DBSA_HW_SQ_TAIL, temp); - sw_sq_head = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); - if (sw_sq_head != qp->initial_ring.head) { - if (sw_sq_head != hw_sq_tail) { - if (sw_sq_head > qp->initial_ring.head) { - if (hw_sq_tail >= qp->initial_ring.head && - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } else { - if (hw_sq_tail >= qp->initial_ring.head || - hw_sq_tail < sw_sq_head) - writel(qp->qp_id, qp->wqe_alloc_db); - } - } - } - - qp->initial_ring.head = qp->sq_ring.head; + dma_wmb(); + writel(qp->qp_id, qp->wqe_alloc_db); } /** @@ -194,6 +169,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, qp->sq_wrtrk_array[*wqe_idx].wrid = info->wr_id; qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; qp->sq_wrtrk_array[*wqe_idx].quanta = quanta; + qp->sq_wrtrk_array[*wqe_idx].signaled = info->signaled; return wqe; } @@ -1137,6 +1113,27 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, } /** + * irdma_uk_cq_empty - Check if CQ is empty + * @cq: hw cq + */ +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq) +{ + __le64 *cqe; + u8 polarity; + u64 qword3; + + if (cq->avoid_mem_cflct) + cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(cq); + else + cqe = IRDMA_GET_CURRENT_CQ_ELEM(cq); + + get_64bit_val(cqe, 24, &qword3); + polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); + + return polarity != cq->polarity; +} + +/** * irdma_uk_cq_poll_cmpl - get cq completion info * @cq: hw cq * @info: cq poll information returned @@ -1287,6 +1284,8 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) { + unsigned long flags; + srq = qp->srq_uk; get_64bit_val(cqe, 8, &info->wr_id); @@ -1299,8 +1298,11 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } else { info->stag_invalid_set = false; } + spin_lock_irqsave(srq->lock, flags); IRDMA_RING_MOVE_TAIL(srq->srq_ring); + spin_unlock_irqrestore(srq->lock, flags); pring = &srq->srq_ring; + } else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) { u32 array_idx; @@ -1355,6 +1357,10 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid; if (!info->comp_status) info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len; + if (!qp->sq_wrtrk_array[wqe_idx].signaled) { + ret_code = -EFAULT; + goto exit; + } info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); IRDMA_RING_SET_TAIL(qp->sq_ring, wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta); @@ -1420,8 +1426,9 @@ exit: IRDMA_RING_MOVE_TAIL(cq->cq_ring); if (!cq->avoid_mem_cflct && ext_valid) IRDMA_RING_MOVE_TAIL(cq->cq_ring); - set_64bit_val(cq->shadow_area, 0, - IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); + if (IRDMA_RING_CURRENT_HEAD(cq->cq_ring) & 0x3F || irdma_uk_cq_empty(cq)) + set_64bit_val(cq->shadow_area, 0, + IRDMA_RING_CURRENT_HEAD(cq->cq_ring)); } else { qword3 &= ~IRDMA_CQ_WQEIDX; qword3 |= FIELD_PREP(IRDMA_CQ_WQEIDX, pring->tail); @@ -1574,7 +1581,6 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, qp->conn_wqes = move_cnt; 
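The new irdma_uk_cq_empty() above uses the usual valid-bit/polarity convention: the producer writes each CQE with a valid bit equal to the current pass's polarity, and the consumer flips its expected polarity every time the ring wraps, so "next CQE's valid bit != expected polarity" means the queue is empty. A self-contained toy model of that check, unrelated to the real CQE layout:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RING_SIZE 4

    struct toy_cq {
            uint8_t valid[RING_SIZE];  /* valid bit of each CQE slot */
            unsigned int head;         /* consumer index */
            uint8_t polarity;          /* expected valid value this pass */
    };

    static bool cq_empty(const struct toy_cq *cq)
    {
            return cq->valid[cq->head] != cq->polarity;
    }

    static void cq_pop(struct toy_cq *cq)
    {
            if (++cq->head == RING_SIZE) {
                    cq->head = 0;
                    cq->polarity ^= 1;   /* polarity flips on wrap */
            }
    }

    int main(void)
    {
            struct toy_cq cq = { .polarity = 1 };

            /* Producer posted two CQEs on the first pass (valid == 1). */
            cq.valid[0] = cq.valid[1] = 1;

            while (!cq_empty(&cq))
                    cq_pop(&cq);
            printf("drained at head=%u\n", cq.head);   /* prints 2 */
            return 0;
    }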
IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->sq_ring, move_cnt); IRDMA_RING_MOVE_TAIL_BY_COUNT(qp->sq_ring, move_cnt); - IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(qp->initial_ring, move_cnt); } /** @@ -1719,7 +1725,6 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->max_sq_frag_cnt = info->max_sq_frag_cnt; sq_ring_size = qp->sq_size << info->sq_shift; IRDMA_RING_INIT(qp->sq_ring, sq_ring_size); - IRDMA_RING_INIT(qp->initial_ring, sq_ring_size); if (info->first_sq_wq) { irdma_setup_connection_wqes(qp, info); qp->swqe_polarity = 1; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index ab57f689827a..9eb7fd0b1cbf 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -429,6 +429,7 @@ struct irdma_wqe_uk_ops { struct irdma_bind_window *op_info); }; +bool irdma_uk_cq_empty(struct irdma_cq_uk *cq); int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info); void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, @@ -456,7 +457,6 @@ struct irdma_srq_uk { struct irdma_uk_attrs *uk_attrs; __le64 *shadow_area; struct irdma_ring srq_ring; - struct irdma_ring initial_ring; u32 srq_id; u32 srq_size; u32 max_srq_frag_cnt; @@ -465,6 +465,7 @@ struct irdma_srq_uk { u8 wqe_size; u8 wqe_size_multiplier; u8 deferred_flag; + spinlock_t *lock; }; struct irdma_srq_uk_init_info { @@ -482,7 +483,8 @@ struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; u16 quanta; - u8 reserved[2]; + u8 signaled; + u8 reserved[1]; }; struct irdma_qp_quanta { diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 8b94d87b0192..cc2a12f735d3 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -452,6 +452,7 @@ struct irdma_cqp_request *irdma_alloc_and_get_cqp_request(struct irdma_cqp *cqp, cqp_request->waiting = wait; refcount_set(&cqp_request->refcnt, 1); memset(&cqp_request->compl_info, 0, sizeof(cqp_request->compl_info)); + memset(&cqp_request->info, 0, sizeof(cqp_request->info)); return cqp_request; } @@ -1068,7 +1069,6 @@ int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_RTS; cqp_info->cqp_cmd = IRDMA_OP_QP_CREATE; @@ -1343,7 +1343,6 @@ int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_QP_DESTROY; cqp_info->post_sq = 1; cqp_info->in.u.qp_destroy.qp = qp; @@ -1749,7 +1748,6 @@ int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = IRDMA_OP_STATS_GATHER; cqp_info->post_sq = 1; cqp_info->in.u.stats_gather.info = pestat->gather_info; @@ -1789,7 +1787,6 @@ int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.stats_manage.info = *stats_info; @@ -1890,7 +1887,6 @@ int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, return -ENOMEM; cqp_info = &cqp_request->info; - memset(cqp_info, 0, sizeof(*cqp_info)); cqp_info->cqp_cmd = cmd; cqp_info->post_sq = 1; cqp_info->in.u.ws_node.info = 
*node_info; @@ -2357,24 +2353,6 @@ void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event) iwqp->ibqp.event_handler(&ibevent, iwqp->ibqp.qp_context); } -bool irdma_cq_empty(struct irdma_cq *iwcq) -{ - struct irdma_cq_uk *ukcq; - u64 qword3; - __le64 *cqe; - u8 polarity; - - ukcq = &iwcq->sc_cq.cq_uk; - if (ukcq->avoid_mem_cflct) - cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq); - else - cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); - get_64bit_val(cqe, 24, &qword3); - polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); - - return polarity != ukcq->polarity; -} - void irdma_remove_cmpls_list(struct irdma_cq *iwcq) { struct irdma_cmpl_gen *cmpl_node; @@ -2436,6 +2414,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk; struct irdma_ring *sq_ring = &qp->sq_ring; struct irdma_ring *rq_ring = &qp->rq_ring; + struct irdma_cq *iwscq = iwqp->iwscq; + struct irdma_cq *iwrcq = iwqp->iwrcq; struct irdma_cmpl_gen *cmpl; __le64 *sw_wqe; u64 wqe_qword; @@ -2443,8 +2423,8 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) bool compl_generated = false; unsigned long flags1; - spin_lock_irqsave(&iwqp->iwscq->lock, flags1); - if (irdma_cq_empty(iwqp->iwscq)) { + spin_lock_irqsave(&iwscq->lock, flags1); + if (irdma_uk_cq_empty(&iwscq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2452,7 +2432,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); return; } @@ -2471,24 +2451,24 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) kfree(cmpl); continue; } - ibdev_dbg(iwqp->iwscq->ibcq.device, + ibdev_dbg(iwscq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id); - list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated); + list_add_tail(&cmpl->list, &iwscq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwscq); + irdma_comp_handler(iwscq); } else { - spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + spin_unlock_irqrestore(&iwscq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } - spin_lock_irqsave(&iwqp->iwrcq->lock, flags1); - if (irdma_cq_empty(iwqp->iwrcq)) { + spin_lock_irqsave(&iwrcq->lock, flags1); + if (irdma_uk_cq_empty(&iwrcq->sc_cq.cq_uk)) { unsigned long flags2; spin_lock_irqsave(&iwqp->lock, flags2); @@ -2496,7 +2476,7 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); if (!cmpl) { spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); return; } @@ -2508,20 +2488,20 @@ void irdma_generate_flush_completions(struct irdma_qp *iwqp) cmpl->cpi.q_type = IRDMA_CQE_QTYPE_RQ; /* remove the RQ WR by moving RQ tail */ IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); - ibdev_dbg(iwqp->iwrcq->ibcq.device, + ibdev_dbg(iwrcq->ibcq.device, "DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n", __func__, cmpl->cpi.wr_id, qp->qp_id, wqe_idx); - list_add_tail(&cmpl->list, 
&iwqp->iwrcq->cmpl_generated); + list_add_tail(&cmpl->list, &iwrcq->cmpl_generated); compl_generated = true; } spin_unlock_irqrestore(&iwqp->lock, flags2); - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); if (compl_generated) - irdma_comp_handler(iwqp->iwrcq); + irdma_comp_handler(iwrcq); } else { - spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + spin_unlock_irqrestore(&iwrcq->lock, flags1); mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c883c9ea5a83..6d9af41a2884 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -27,7 +27,8 @@ static int irdma_query_device(struct ib_device *ibdev, irdma_fw_minor_ver(&rf->sc_dev); props->device_cap_flags = IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; - props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + if (hw_attrs->uk_attrs.hw_rev < IRDMA_GEN_3) + props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; props->vendor_id = pcidev->vendor; props->vendor_part_id = pcidev->device; @@ -771,7 +772,6 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp) cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; - memset(qp_info, 0, sizeof(*qp_info)); qp_info->mac_valid = true; qp_info->cq_num_valid = true; qp_info->next_iwarp_state = IRDMA_QP_STATE_IDLE; @@ -2029,6 +2029,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, struct irdma_pci_f *rf; struct irdma_cq_buf *cq_buf = NULL; unsigned long flags; + u8 cqe_size; int ret; iwdev = to_iwdev(ibcq->device); @@ -2045,7 +2046,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, return -EINVAL; if (!iwcq->user_mode) { - entries++; + entries += 2; if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) @@ -2053,6 +2054,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, if (entries & 1) entries += 1; /* cq size must be an even number */ + + cqe_size = iwcq->sc_cq.cq_uk.avoid_mem_cflct ? 64 : 32; + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; } info.cq_size = max(entries, 4); @@ -2306,8 +2311,8 @@ static int irdma_setup_kmode_srq(struct irdma_device *iwdev, ukinfo->srq_size = depth >> shift; ukinfo->shadow_area = mem->va + ring_size; - info->shadow_area_pa = info->srq_pa + ring_size; info->srq_pa = mem->pa; + info->shadow_area_pa = info->srq_pa + ring_size; return 0; } @@ -2384,6 +2389,7 @@ static int irdma_create_srq(struct ib_srq *ibsrq, info.vsi = &iwdev->vsi; info.pd = &iwpd->sc_pd; + iwsrq->sc_srq.srq_uk.lock = &iwsrq->lock; err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info); if (err_code) goto free_dmem; @@ -2483,6 +2489,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, int err_code; int entries = attr->cqe; bool cqe_64byte_ena; + u8 cqe_size; err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev); if (err_code) @@ -2509,6 +2516,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, ukinfo->cq_id = cq_num; cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false; + cqe_size = cqe_64byte_ena ? 
64 : 32; ukinfo->avoid_mem_cflct = cqe_64byte_ena; iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; if (attr->comp_vector < rf->ceqs_count) @@ -2581,13 +2589,16 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_free_rsrc; } - entries++; + entries += 2; if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; if (entries & 1) entries += 1; /* cq size must be an even number */ + if (entries * cqe_size == IRDMA_HW_PAGE_SIZE) + entries += 2; + ukinfo->cq_size = entries; if (cqe_64byte_ena) @@ -3103,12 +3114,10 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, cqp_info = &cqp_request->info; info = &cqp_info->in.u.alloc_stag.info; - memset(info, 0, sizeof(*info)); info->page_size = PAGE_SIZE; info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; info->pd_id = iwpd->sc_pd.pd_id; info->total_len = iwmr->len; - info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; info->remote_access = true; cqp_info->cqp_cmd = IRDMA_OP_ALLOC_STAG; cqp_info->post_sq = 1; @@ -3119,7 +3128,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, if (status) return status; - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return 0; } @@ -3253,7 +3262,6 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, cqp_info = &cqp_request->info; stag_info = &cqp_info->in.u.mr_reg_non_shared.info; - memset(stag_info, 0, sizeof(*stag_info)); stag_info->va = iwpbl->user_base; stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; stag_info->stag_key = (u8)iwmr->stag; @@ -3263,7 +3271,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 
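The create/resize paths above now grow the requested CQ size by two slots, double it on GEN_2 hardware when 32-byte CQEs are in use, keep it even, and bump it again if the ring would land on exactly one hardware page. A standalone restatement of that arithmetic, assuming the 4 KiB IRDMA_HW_PAGE_SIZE used elsewhere in the driver; the function and constant names are otherwise illustrative:

    #include <stdio.h>

    #define HW_PAGE_SIZE 4096u   /* assumption: matches IRDMA_HW_PAGE_SIZE */

    static unsigned int adjust_cq_entries(unsigned int entries, int gen2,
                                          int cqe_64byte)
    {
            unsigned int cqe_size = cqe_64byte ? 64 : 32;

            entries += 2;                    /* extra slots, as in the hunk above */
            if (!cqe_64byte && gen2)
                    entries *= 2;            /* doubled for 32B CQEs on GEN_2 */
            if (entries & 1)
                    entries += 1;            /* cq size must be an even number */
            if (entries * cqe_size == HW_PAGE_SIZE)
                    entries += 2;            /* avoid an exactly page-sized ring */
            return entries;
    }

    int main(void)
    {
            /* 62 requested entries, 64-byte CQEs: 64 * 64 == 4096, so bump to 66. */
            printf("%u\n", adjust_cq_entries(62, 1, 1));
            /* 30 requested entries, GEN_2, 32-byte CQEs: (30 + 2) * 2 = 64. */
            printf("%u\n", adjust_cq_entries(30, 1, 0));
            return 0;
    }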
1 : 0; stag_info->pd_id = iwpd->sc_pd.pd_id; - stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; + stag_info->all_memory = iwmr->dma_mr; if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED) stag_info->addr_type = IRDMA_ADDR_TYPE_ZERO_BASED; else @@ -3290,7 +3298,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); if (!ret) - iwmr->is_hwreg = 1; + iwmr->is_hwreg = true; return ret; } @@ -3647,7 +3655,6 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) cqp_info = &cqp_request->info; info = &cqp_info->in.u.dealloc_stag.info; - memset(info, 0, sizeof(*info)); info->pd_id = iwpd->sc_pd.pd_id; info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S; info->mr = true; @@ -3663,7 +3670,7 @@ static int irdma_hwdereg_mr(struct ib_mr *ib_mr) if (status) return status; - iwmr->is_hwreg = 0; + iwmr->is_hwreg = false; return 0; } @@ -3786,9 +3793,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags, * @size: size of memory to register * @access: Access rights * @iova_start: start of virtual address for physical buffers + * @dma_mr: Flag indicating whether this region is a PD DMA MR */ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access, - u64 *iova_start) + u64 *iova_start, bool dma_mr) { struct irdma_device *iwdev = to_iwdev(pd->device); struct irdma_pbl *iwpbl; @@ -3805,6 +3813,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access iwpbl = &iwmr->iwpbl; iwpbl->iwmr = iwmr; iwmr->type = IRDMA_MEMREG_TYPE_MEM; + iwmr->dma_mr = dma_mr; iwpbl->user_base = *iova_start; stag = irdma_create_stag(iwdev); if (!stag) { @@ -3843,7 +3852,7 @@ static struct ib_mr *irdma_get_dma_mr(struct ib_pd *pd, int acc) { u64 kva = 0; - return irdma_reg_phys_mr(pd, 0, 0, acc, &kva); + return irdma_reg_phys_mr(pd, 0, 0, acc, &kva, true); } /** @@ -4078,7 +4087,7 @@ static int irdma_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; - info.local_fence = info.read_fence; + info.local_fence = true; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; @@ -4505,7 +4514,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, } if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated))) + (!irdma_uk_cq_empty(ukcq) || !list_empty(&iwcq->cmpl_generated))) ret = 1; spin_unlock_irqrestore(&iwcq->lock, flags); @@ -5204,7 +5213,7 @@ static int irdma_create_user_ah(struct ib_ah *ibah, struct irdma_ah *parent_ah; int err; - if (udata && udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) + if (udata->outlen < IRDMA_CREATE_AH_MIN_RESP_LEN) return -EINVAL; err = irdma_setup_ah(ibah, attr); @@ -5500,7 +5509,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) irdma_rt_deinit_hw(iwdev); if (!iwdev->is_vport) { irdma_ctrl_deinit_hw(iwdev->rf); - if (iwdev->rf->vchnl_wq) + if (iwdev->rf->vchnl_wq) { destroy_workqueue(iwdev->rf->vchnl_wq); + mutex_destroy(&iwdev->rf->sc_dev.vchnl_mutex); + } } } diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index ac8b38701835..aabbb3442098 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -111,7 +111,8 @@ struct irdma_mr { }; struct ib_umem *region; int access; - u8 is_hwreg; + bool is_hwreg:1; + bool dma_mr:1; u16 type; u32 page_cnt; u64 page_size; diff --git 
a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index 12b481d138cf..03aacd526860 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -591,7 +591,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) int mlx4_ib_cm_init(void) { - cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0); + cm_wq = alloc_workqueue("mlx4_ib_cm", WQ_PERCPU, 0); if (!cm_wq) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8b506417ad2f..d31d7f3005c6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1225,6 +1225,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_table_in, in, other_vport)); MLX5_SET(destroy_flow_table_in, din, vport_number, MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, other_eswitch, + MLX5_GET(create_flow_table_in, in, other_eswitch)); + MLX5_SET(destroy_flow_table_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_table_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_table_in, din, table_type, MLX5_GET(create_flow_table_in, in, table_type)); MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); @@ -1237,6 +1242,11 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(create_flow_group_in, in, other_vport)); MLX5_SET(destroy_flow_group_in, din, vport_number, MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, other_eswitch, + MLX5_GET(create_flow_group_in, in, other_eswitch)); + MLX5_SET(destroy_flow_group_in, din, eswitch_owner_vhca_id, + MLX5_GET(create_flow_group_in, in, + eswitch_owner_vhca_id)); MLX5_SET(destroy_flow_group_in, din, table_type, MLX5_GET(create_flow_group_in, in, table_type)); MLX5_SET(destroy_flow_group_in, din, table_id, @@ -1251,6 +1261,10 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, MLX5_GET(set_fte_in, in, other_vport)); MLX5_SET(delete_fte_in, din, vport_number, MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, other_eswitch, + MLX5_GET(set_fte_in, in, other_eswitch)); + MLX5_SET(delete_fte_in, din, eswitch_owner_vhca_id, + MLX5_GET(set_fte_in, in, eswitch_owner_vhca_id)); MLX5_SET(delete_fte_in, din, table_type, MLX5_GET(set_fte_in, in, table_type)); MLX5_SET(delete_fte_in, din, table_id, diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index b0f7663c24c1..d17823ce7f38 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -691,22 +691,13 @@ static bool __maybe_unused mlx5_ib_shared_ft_allowed(struct ib_device *device) return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); } -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, - struct mlx5_flow_namespace *ns, +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, - int priority, - int num_entries, int num_groups, - u32 flags, u16 vport) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - ft_attr.prio = priority; - ft_attr.max_fte = num_entries; - ft_attr.flags = flags; - ft_attr.vport = vport; - ft_attr.autogroup.max_num_groups = num_groups; - ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + ft = mlx5_create_auto_grouped_flow_table(ns, ft_attr); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -720,6 +711,7 @@ static struct 
mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, enum flow_table_type ft_type) { bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; enum mlx5_flow_namespace_type fn_type; struct mlx5_ib_flow_prio *prio; @@ -797,11 +789,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, num_entries, max_table_size); ft = prio->flow_table; - if (!ft) - return _get_prio(dev, ns, prio, priority, max_table_size, - num_groups, flags, 0); + if (ft) + return prio; - return prio; + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.autogroup.max_num_groups = num_groups; + return _get_prio(ns, prio, &ft_attr); } enum { @@ -950,6 +945,7 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, enum mlx5_ib_optional_counter_type type) { enum mlx5_ib_optional_counter_type per_qp_type; + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; struct mlx5_flow_namespace *ns; struct mlx5_ib_flow_prio *prio; @@ -1003,7 +999,10 @@ static int get_per_qp_prio(struct mlx5_ib_dev *dev, if (prio->flow_table) return 0; - prio = _get_prio(dev, ns, prio, priority, MLX5_FS_MAX_POOL_SIZE, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = MLX5_FS_MAX_POOL_SIZE; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) return PTR_ERR(prio); @@ -1223,6 +1222,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_op_fc *opfc, enum mlx5_ib_optional_counter_type type) { + struct mlx5_flow_table_attr ft_attr = {}; enum mlx5_flow_namespace_type fn_type; int priority, i, err, spec_num; struct mlx5_flow_act flow_act = {}; @@ -1304,8 +1304,10 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, if (err) goto free; - prio = _get_prio(dev, ns, prio, priority, - dev->num_ports * MAX_OPFC_RULES, 1, 0, 0); + ft_attr.prio = priority; + ft_attr.max_fte = dev->num_ports * MAX_OPFC_RULES; + ft_attr.autogroup.max_num_groups = 1; + prio = _get_prio(ns, prio, &ft_attr); if (IS_ERR(prio)) { err = PTR_ERR(prio); goto put_prio; @@ -1872,7 +1874,7 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, u32 *flags, u16 *vport_idx, u16 *vport, struct mlx5_core_dev **ft_mdev, - u32 ib_port) + u32 ib_port, u16 *esw_owner_vhca_id) { struct mlx5_core_dev *esw_mdev; @@ -1886,8 +1888,13 @@ static int mlx5_ib_fill_transport_ns_info(struct mlx5_ib_dev *dev, return -EINVAL; esw_mdev = mlx5_eswitch_get_core_dev(dev->port[ib_port - 1].rep->esw); - if (esw_mdev != dev->mdev) - return -EOPNOTSUPP; + if (esw_mdev != dev->mdev) { + if (!MLX5_CAP_ADV_RDMA(dev->mdev, + rdma_transport_manager_other_eswitch)) + return -EOPNOTSUPP; + *flags |= MLX5_FLOW_TABLE_OTHER_ESWITCH; + *esw_owner_vhca_id = MLX5_CAP_GEN(esw_mdev, vhca_id); + } *flags |= MLX5_FLOW_TABLE_OTHER_VPORT; *ft_mdev = esw_mdev; @@ -1903,8 +1910,10 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, bool mcast, u32 ib_port) { struct mlx5_core_dev *ft_mdev = dev->mdev; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio = NULL; + u16 esw_owner_vhca_id = 0; int max_table_size = 0; u16 vport_idx = 0; bool esw_encap; @@ -1966,7 +1975,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, return ERR_PTR(-EINVAL); ret = mlx5_ib_fill_transport_ns_info(dev, ns_type, &flags, &vport_idx, &vport, - &ft_mdev, ib_port); + 
&ft_mdev, ib_port, + &esw_owner_vhca_id); if (ret) return ERR_PTR(ret); @@ -2026,8 +2036,13 @@ _get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, if (prio->flow_table) return prio; - return _get_prio(dev, ns, prio, priority, max_table_size, - MLX5_FS_MAX_TYPES, flags, vport); + ft_attr.prio = priority; + ft_attr.max_fte = max_table_size; + ft_attr.flags = flags; + ft_attr.vport = vport; + ft_attr.esw_owner_vhca_id = esw_owner_vhca_id; + ft_attr.autogroup.max_num_groups = MLX5_FS_MAX_TYPES; + return _get_prio(ns, prio, &ft_attr); } static struct mlx5_ib_flow_handler * diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index cc8859d3c2f5..bbecca405171 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -44,6 +44,63 @@ static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports) } } +static int mlx5_ib_set_owner_transport(struct mlx5_core_dev *cur_owner, + struct mlx5_core_dev *new_owner) +{ + int ret; + + if (!MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(cur_owner, ft_support) || + !MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(cur_owner, ft_support)) + return 0; + + if (!MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager) || + !MLX5_CAP_ADV_RDMA(new_owner, rdma_transport_manager_other_eswitch)) + return 0; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_TX); + if (ret) + return ret; + + ret = mlx5_fs_set_root_dev(cur_owner, new_owner, + FS_FT_RDMA_TRANSPORT_RX); + if (ret) { + mlx5_fs_set_root_dev(cur_owner, cur_owner, + FS_FT_RDMA_TRANSPORT_TX); + return ret; + } + + return 0; +} + +static void mlx5_ib_release_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int i, ret; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, peer_dev); + WARN_ON_ONCE(ret); + } +} + +static int mlx5_ib_take_transport(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev; + int ret; + int i; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + ret = mlx5_ib_set_owner_transport(peer_dev, dev); + if (ret) { + mlx5_ib_release_transport(dev); + return ret; + } + } + + return 0; +} + static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { @@ -88,10 +145,18 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); + if (mlx5_lag_is_shared_fdb(dev)) { + ret = mlx5_ib_take_transport(lag_master); + if (ret) + return ret; + } + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, mlx5_core_net(lag_master)); - if (!ibdev) - return -ENOMEM; + if (!ibdev) { + ret = -ENOMEM; + goto release_transport; + } ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port), GFP_KERNEL); @@ -127,6 +192,10 @@ fail_add: kfree(ibdev->port); fail_port: ib_dealloc_device(&ibdev->ib_dev); +release_transport: + if (mlx5_lag_is_shared_fdb(lag_master)) + mlx5_ib_release_transport(lag_master); + return ret; } @@ -182,6 +251,7 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) esw = peer_mdev->priv.eswitch; mlx5_eswitch_unregister_vport_reps(esw, REP_IB); } + mlx5_ib_release_transport(mdev); } __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fc1e86f6c409..40284bbb45d6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -511,6 +511,10 @@ static int translate_eth_ext_proto_oper(u32 
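mlx5_ib_set_owner_transport() above moves the RDMA transport TX root first and, if moving the RX root then fails, points the TX root back at its original owner before returning, so the two roots never end up split between owners. A generic standalone sketch of that unwind-on-partial-failure shape; the functions here are placeholders, not the mlx5 API:

    #include <stdio.h>

    /* Two dependent ownership transfers; the second failing must undo the first. */
    static int move_tx(int from, int to) { (void)from; (void)to; return 0; }
    static int move_rx(int from, int to) { (void)from; (void)to; return -1; }

    static int take_ownership(int cur_owner, int new_owner)
    {
            int ret;

            ret = move_tx(cur_owner, new_owner);
            if (ret)
                    return ret;

            ret = move_rx(cur_owner, new_owner);
            if (ret) {
                    /* roll the TX side back so both roots stay with the old owner */
                    move_tx(cur_owner, cur_owner);
                    return ret;
            }
            return 0;
    }

    int main(void)
    {
            /* -1: RX transfer failed, TX transfer rolled back. */
            printf("ret=%d\n", take_ownership(1, 2));
            return 0;
    }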
eth_proto_oper, u16 *active_speed, *active_width = IB_WIDTH_4X; *active_speed = IB_SPEED_XDR; break; + case MLX5E_PROT_MASK(MLX5E_1600TAUI_8_1600TBASE_CR8_KR8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_XDR; + break; default: return -EINVAL; } @@ -842,7 +846,7 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, break; case MLX5_VPORT_ACCESS_METHOD_NIC: - err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, false, &tmp); break; default: diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 0e8ae85af5a6..e71ee3d52eb0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -97,33 +97,28 @@ struct mlx5_pagefault { * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 -#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) -#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) -#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) -#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) -#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) - -#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT - static u64 mlx5_imr_ksm_entries; +static u64 mlx5_imr_mtt_entries; +static u64 mlx5_imr_mtt_size; +static u8 mlx5_imr_mtt_shift; +static u8 mlx5_imr_ksm_page_shift; -static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, +static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; - struct mlx5_klm *end = pklm + nentries; - int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + struct mlx5_ksm *end = pksm + nentries; + u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0; __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? cpu_to_be32(imr->null_mmkey.key) : mr_to_mdev(imr)->mkeys.null_mkey; u64 va = - MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; + MLX5_CAP_ODP(dev, mem_page_fault) ? 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0e8ae85af5a6..e71ee3d52eb0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -97,33 +97,28 @@ struct mlx5_pagefault {
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
-#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
-#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
-#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
-#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
-#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
-
-#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
-
 static u64 mlx5_imr_ksm_entries;
+static u64 mlx5_imr_mtt_entries;
+static u64 mlx5_imr_mtt_size;
+static u8 mlx5_imr_mtt_shift;
+static u8 mlx5_imr_ksm_page_shift;
 
-static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+static void populate_ksm(struct mlx5_ksm *pksm, size_t idx, size_t nentries,
 			 struct mlx5_ib_mr *imr, int flags)
 {
 	struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
-	struct mlx5_klm *end = pklm + nentries;
-	int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+	struct mlx5_ksm *end = pksm + nentries;
+	u64 step = MLX5_CAP_ODP(dev, mem_page_fault) ? mlx5_imr_mtt_size : 0;
 	__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
 			     cpu_to_be32(imr->null_mmkey.key) :
 			     mr_to_mdev(imr)->mkeys.null_mkey;
 	u64 va =
-		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
+		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * mlx5_imr_mtt_size : 0;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
-		for (; pklm != end; pklm++, idx++, va += step) {
-			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
-			pklm->key = key;
-			pklm->va = cpu_to_be64(va);
+		for (; pksm != end; pksm++, idx++, va += step) {
+			pksm->key = key;
+			pksm->va = cpu_to_be64(va);
 		}
 		return;
 	}
@@ -147,16 +142,15 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
 
-	for (; pklm != end; pklm++, idx++, va += step) {
+	for (; pksm != end; pksm++, idx++, va += step) {
 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
 
-		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
 		if (mtt) {
-			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
-			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
+			pksm->key = cpu_to_be32(mtt->ibmr.lkey);
+			pksm->va = cpu_to_be64(idx * mlx5_imr_mtt_size);
 		} else {
-			pklm->key = key;
-			pklm->va = cpu_to_be64(va);
+			pksm->key = key;
+			pksm->va = cpu_to_be64(va);
 		}
 	}
 }
@@ -201,7 +195,7 @@ int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			  struct mlx5_ib_mr *mr, int flags)
 {
 	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
-		populate_klm(xlt, idx, nentries, mr, flags);
+		populate_ksm(xlt, idx, nentries, mr, flags);
 		return 0;
 	} else {
 		return populate_mtt(xlt, idx, nentries, mr, flags);
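Background for the conversion above: the implicit ODP parent mkey used to be built from KLM entries, which carry a per-entry byte count, so every slot had to store a bcount even though all child MRs are the same size. KSM entries assume one fixed entry size programmed once in the mkey, which is why the pksm loops only fill a key and an address. Roughly (simplified, not the hardware-exact layout):

#include <stdint.h>

/* Simplified view: a KLM slot carries its own length, a KSM slot does not. */
struct klm_slot { uint32_t bcount; uint32_t key; uint64_t va; };
struct ksm_slot { uint32_t key; uint64_t va; };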
@@ -226,7 +220,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
 	mutex_lock(&odp_imr->umem_mutex);
 	mlx5r_umr_update_xlt(mr->parent,
-			     ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
+			     ib_umem_start(odp) >> mlx5_imr_mtt_shift, 1, 0,
 			     MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
 	mutex_unlock(&odp_imr->umem_mutex);
 
 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
@@ -237,7 +231,7 @@ static void free_implicit_child_mr_work(struct work_struct *work)
 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
-	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long idx = ib_umem_start(odp) >> mlx5_imr_mtt_shift;
 	struct mlx5_ib_mr *imr = mr->parent;
 
 	/*
@@ -265,7 +259,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 
 	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
 	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
-	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+	queue_work(system_dfl_wq, &mr->odp_destroy.work);
 }
 
 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -425,7 +419,10 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
 	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
-	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
+	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled) &&
+	    mlx5_imr_ksm_entries != 0 &&
+	    !(mlx5_imr_ksm_page_shift >
+	      get_max_log_entity_size_cap(dev, MLX5_MKC_ACCESS_MODE_KSM)))
 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
 }
 
@@ -476,14 +473,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	int err;
 
 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
-				      idx * MLX5_IMR_MTT_SIZE,
-				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+				      idx * mlx5_imr_mtt_size,
+				      mlx5_imr_mtt_size, &mlx5_mn_ops);
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
 	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
 				 MLX5_MKC_ACCESS_MODE_MTT,
-				 MLX5_IMR_MTT_ENTRIES);
+				 mlx5_imr_mtt_entries);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		return mr;
@@ -495,7 +492,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	mr->umem = &odp->umem;
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
-	mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
+	mr->ibmr.iova = idx * mlx5_imr_mtt_size;
 	mr->parent = imr;
 	odp->private = mr;
 
@@ -506,7 +503,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	refcount_set(&mr->mmkey.usecount, 2);
 
 	err = mlx5r_umr_update_xlt(mr, 0,
-				   MLX5_IMR_MTT_ENTRIES,
+				   mlx5_imr_mtt_entries,
 				   PAGE_SHIFT,
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ENABLE);
@@ -611,7 +608,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	struct mlx5_ib_mr *imr;
 	int err;
 
-	if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
+	if (!mlx5r_umr_can_load_pas(dev, mlx5_imr_mtt_entries * PAGE_SIZE))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
@@ -647,7 +644,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 
 	err = mlx5r_umr_update_xlt(imr, 0,
 				   mlx5_imr_ksm_entries,
-				   MLX5_KSM_PAGE_SHIFT,
+				   mlx5_imr_ksm_page_shift,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ENABLE);
@@ -750,20 +747,20 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 				 struct ib_umem_odp *odp_imr, u64 user_va,
 				 size_t bcnt, u32 *bytes_mapped, u32 flags)
 {
-	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long end_idx = (user_va + bcnt - 1) >> mlx5_imr_mtt_shift;
 	unsigned long upd_start_idx = end_idx + 1;
 	unsigned long upd_len = 0;
 	unsigned long npages = 0;
 	int err;
 	int ret;
 
-	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
-		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+	if (unlikely(user_va >= mlx5_imr_ksm_entries * mlx5_imr_mtt_size ||
+		     mlx5_imr_ksm_entries * mlx5_imr_mtt_size - user_va < bcnt))
 		return -EFAULT;
 
 	/* Fault each child mr that intersects with our interval. */
 	while (bcnt) {
-		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+		unsigned long idx = user_va >> mlx5_imr_mtt_shift;
 		struct ib_umem_odp *umem_odp;
 		struct mlx5_ib_mr *mtt;
 		u64 len;
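With the hunks above, the child-MR geometry is no longer the compile-time MLX5_IMR_MTT_* constants but the runtime mlx5_imr_mtt_shift/size/entries values, yet a faulting VA is still resolved the same way: child index = va >> shift, offset inside the child = va masked by (size - 1). A small sketch of that arithmetic (helper names are illustrative):

#include <stdint.h>

static uint64_t imr_child_index(uint64_t va, unsigned int mtt_shift)
{
	return va >> mtt_shift;
}

static uint64_t imr_child_offset(uint64_t va, unsigned int mtt_shift)
{
	return va & ((UINT64_C(1) << mtt_shift) - 1);
}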
@@ -1924,9 +1921,25 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
 
 int mlx5_ib_odp_init(void)
 {
+	u32 log_va_pages = ilog2(TASK_SIZE) - PAGE_SHIFT;
+	u8 mlx5_imr_mtt_bits;
+
+	/* 48 is default ARM64 VA space and covers X86 4-level paging which is 47 */
+	if (log_va_pages <= 48 - PAGE_SHIFT)
+		mlx5_imr_mtt_shift = 30;
+	/* 56 is x86-64, 5-level paging */
+	else if (log_va_pages <= 56 - PAGE_SHIFT)
+		mlx5_imr_mtt_shift = 34;
+	else
+		return 0;
+
+	mlx5_imr_mtt_size = BIT_ULL(mlx5_imr_mtt_shift);
+	mlx5_imr_mtt_bits = mlx5_imr_mtt_shift - PAGE_SHIFT;
+	mlx5_imr_mtt_entries = BIT_ULL(mlx5_imr_mtt_bits);
 	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
-				       MLX5_IMR_MTT_BITS);
+				       mlx5_imr_mtt_bits);
+	mlx5_imr_ksm_page_shift = mlx5_imr_mtt_shift;
 
 	return 0;
 }
@@ -2093,6 +2106,6 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 		destroy_prefetch_work(work);
 		return rc;
 	}
-	queue_work(system_unbound_wq, &work->work);
+	queue_work(system_dfl_wq, &work->work);
 	return 0;
 }
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 88724d15705d..69af20790481 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3451,10 +3451,11 @@ int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
 {
 	u32 stat_rate_support;
 
-	if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
+	if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS ||
+	    rate == IB_RATE_1600_GBPS)
 		return 0;
 
-	if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
+	if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_1600_GBPS)
 		return -EINVAL;
 
 	stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support);
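Looking back at the mlx5_ib_odp_init() hunk above: the child-MR size is now derived from the host VA width instead of being fixed at 1 GiB, so the number of KSM slots in the parent (implicit) mkey stays manageable even on 5-level-paging machines. A user-space sketch of that sizing, with PAGE_SHIFT and the VA width passed in explicitly and the thresholds mirroring the hunk (for example, a 48-bit VA with 4 KiB pages gives 1 GiB children and 2^18 = 262144 KSM slots, while a 56-bit VA moves to 16 GiB children):

#include <stdint.h>

struct imr_geometry {
	unsigned int mtt_shift;	/* one child MR spans 1 << mtt_shift bytes */
	uint64_t mtt_entries;	/* pages per child MR */
	uint64_t ksm_entries;	/* child slots in the parent (implicit) mkey */
};

static int imr_geometry(unsigned int va_bits, unsigned int page_shift,
			struct imr_geometry *g)
{
	unsigned int log_va_pages = va_bits - page_shift;

	if (log_va_pages <= 48 - page_shift)
		g->mtt_shift = 30;	/* 1 GiB children */
	else if (log_va_pages <= 56 - page_shift)
		g->mtt_shift = 34;	/* 16 GiB children */
	else
		return -1;		/* implicit ODP not supported */

	g->mtt_entries = UINT64_C(1) << (g->mtt_shift - page_shift);
	g->ksm_entries = UINT64_C(1) << (log_va_pages - (g->mtt_shift - page_shift));
	return 0;
}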
