author     Linus Torvalds <torvalds@linux-foundation.org>   2025-05-30 10:18:56 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2025-05-30 10:18:56 -0700
commit     dd91b5e1d6448794c07378d1be12e3261c8769e7 (patch)
tree       920e661b98a658c5c3abe1bc61b78fbfdba43cd2
parent     883e3c9f40814377a239ca0becbcc77deab5ffe5 (diff)
parent     92a251c3df8ea1991cd9fe00f1ab0cfce18d7711 (diff)

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma (HEAD, master)
Pull rdma updates from Jason Gunthorpe:
 "Usual collection of driver fixes:

   - Small bug fixes and cleanups in hfi, hns, rxe, mlx5, mana, siw

   - Further ODP functionality in rxe

   - Remote access MRs in mana, along with more page sizes

   - Improve CM scalability with a rwlock around the agent

   - More trace points for hns

   - ODP hmm conversion to the new two step dma API

   - Support the ethernet HW device in mana as well as the RNIC

   - Cleanups:
      - Use secs_to_jiffies() when appropriate
      - Use ERR_CAST() instead of naked casts
      - Don't use %pK in printk
      - Unused functions removed
      - Allocation type matching"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (57 commits)
  RDMA/cma: Fix hang when cma_netevent_callback fails to queue_work
  RDMA/bnxt_re: Support extended stats for Thor2 VF
  RDMA/hns: Fix endian issue in trace events
  RDMA/mlx5: Avoid flexible array warning
  IB/cm: Remove dead code and adjust naming
  RDMA/core: Avoid hmm_dma_map_alloc() for virtual DMA devices
  RDMA/rxe: Break endless pagefault loop for RO pages
  RDMA/bnxt_re: Fix return code of bnxt_re_configure_cc
  RDMA/bnxt_re: Fix missing error handling for tx_queue
  RDMA/bnxt_re: Fix incorrect display of inactivity_cp in debugfs output
  RDMA/mlx5: Add support for 200Gbps per lane speeds
  RDMA/mlx5: Remove the redundant MLX5_IB_STAGE_UAR stage
  RDMA/iwcm: Fix use-after-free of work objects after cm_id destruction
  net: mana: Add support for auxiliary device servicing events
  RDMA/mana_ib: unify mana_ib functions to support any gdma device
  RDMA/mana_ib: Add support of mana_ib for RNIC and ETH nic
  net: mana: Probe rdma device in mana driver
  RDMA/siw: replace redundant ternary operator with just rv
  RDMA/umem: Separate implicit ODP initialization from explicit ODP
  RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage
  ...
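
Several of the "Cleanups" items above are mechanical pattern changes repeated across drivers. As a minimal sketch of the secs_to_jiffies() item (the work item and the 10 second delay are hypothetical, not a hunk from this pull):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

/* Sketch only: 'my_work' and the delay are made up for illustration. */
static void example_reschedule(struct delayed_work *my_work)
{
	/* Before: schedule_delayed_work(my_work, msecs_to_jiffies(10 * 1000)); */
	schedule_delayed_work(my_work, secs_to_jiffies(10));	/* the unit is explicit */
}
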
-rw-r--r--  drivers/infiniband/core/cm.c | 78
-rw-r--r--  drivers/infiniband/core/cm_trace.h | 2
-rw-r--r--  drivers/infiniband/core/cma.c | 25
-rw-r--r--  drivers/infiniband/core/cma_trace.h | 2
-rw-r--r--  drivers/infiniband/core/iwcm.c | 29
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c | 2
-rw-r--r--  drivers/infiniband/core/umem_odp.c | 271
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c | 2
-rw-r--r--  drivers/infiniband/core/verbs.c | 2
-rw-r--r--  drivers/infiniband/hw/bnxt_re/debugfs.c | 20
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_fp.c | 2
-rw-r--r--  drivers/infiniband/hw/bnxt_re/qplib_sp.c | 7
-rw-r--r--  drivers/infiniband/hw/hfi1/mad.h | 1
-rw-r--r--  drivers/infiniband/hw/hfi1/pio.c | 10
-rw-r--r--  drivers/infiniband/hw/hfi1/pio.h | 1
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.c | 18
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.h | 1
-rw-r--r--  drivers/infiniband/hw/hfi1/user_exp_rcv.c | 2
-rw-r--r--  drivers/infiniband/hw/hns/Makefile | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_ah.c | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_device.h | 20
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_main.c | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_mr.c | 3
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_restrack.c | 1
-rw-r--r--  drivers/infiniband/hw/hns/hns_roce_trace.h | 216
-rw-r--r--  drivers/infiniband/hw/irdma/ctrl.c | 2
-rw-r--r--  drivers/infiniband/hw/irdma/pble.c | 2
-rw-r--r--  drivers/infiniband/hw/mana/cq.c | 4
-rw-r--r--  drivers/infiniband/hw/mana/device.c | 174
-rw-r--r--  drivers/infiniband/hw/mana/main.c | 92
-rw-r--r--  drivers/infiniband/hw/mana/mana_ib.h | 7
-rw-r--r--  drivers/infiniband/hw/mana/mr.c | 29
-rw-r--r--  drivers/infiniband/hw/mana/qp.c | 5
-rw-r--r--  drivers/infiniband/hw/mlx4/mcg.c | 8
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c | 58
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c | 29
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h | 13
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c | 6
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c | 65
-rw-r--r--  drivers/infiniband/hw/mlx5/qpc.c | 30
-rw-r--r--  drivers/infiniband/hw/mlx5/umr.c | 12
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mr.c | 2
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.c | 2
-rw-r--r--  drivers/infiniband/sw/rxe/Kconfig | 2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe.c | 2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_loc.h | 29
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_mr.c | 66
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_odp.c | 144
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_param.h | 5
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_qp.c | 7
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_resp.c | 15
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_task.c | 40
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_task.h | 2
-rw-r--r--  drivers/infiniband/sw/siw/siw.h | 2
-rw-r--r--  drivers/infiniband/sw/siw/siw_cq.c | 2
-rw-r--r--  drivers/infiniband/sw/siw/siw_mem.c | 28
-rw-r--r--  drivers/infiniband/sw/siw/siw_mem.h | 1
-rw-r--r--  drivers/infiniband/sw/siw/siw_qp_rx.c | 8
-rw-r--r--  drivers/infiniband/sw/siw/siw_verbs.c | 8
-rw-r--r--  drivers/net/ethernet/microsoft/mana/gdma_main.c | 27
-rw-r--r--  drivers/net/ethernet/microsoft/mana/hw_channel.c | 19
-rw-r--r--  drivers/net/ethernet/microsoft/mana/mana_en.c | 108
-rw-r--r--  include/linux/hmm-dma.h | 33
-rw-r--r--  include/linux/hmm.h | 24
-rw-r--r--  include/linux/mlx5/driver.h | 1
-rw-r--r--  include/net/mana/gdma.h | 47
-rw-r--r--  include/net/mana/hw_channel.h | 9
-rw-r--r--  include/net/mana/mana.h | 3
-rw-r--r--  include/rdma/ib_cm.h | 17
-rw-r--r--  include/rdma/ib_umem_odp.h | 25
-rw-r--r--  include/rdma/ib_verbs.h | 18
-rw-r--r--  include/rdma/rdma_cm.h | 1
-rw-r--r--  include/uapi/rdma/ib_user_verbs.h | 16
-rw-r--r--  mm/hmm.c | 262
76 files changed, 1484 insertions, 772 deletions
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 142170473e75..8670e58675c6 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -36,6 +36,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
+#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
@@ -167,7 +168,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@@ -241,7 +242,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
- u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@@ -285,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
mad_agent = cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
@@ -311,7 +311,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
m->ah = ah;
out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
@@ -1297,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@@ -1872,7 +1872,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
- enum cm_msg_response msg_mraed, u8 service_timeout,
+ enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@@ -1881,7 +1881,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
- IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@@ -1960,7 +1960,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@@ -2454,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@@ -3094,26 +3094,13 @@ out:
return -EINVAL;
}
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len)
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
- struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
- enum cm_msg_response msg_response;
- void *data;
unsigned long flags;
- int ret;
-
- if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
- return -EINVAL;
-
- data = cm_copy_private_data(private_data, private_data_len);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3122,58 +3109,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
- msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
- trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
- if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
- msg = cm_alloc_msg(cm_id_priv);
- if (IS_ERR(msg)) {
- ret = PTR_ERR(msg);
- goto error_unlock;
- }
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- msg_response, service_timeout,
- private_data, private_data_len);
- trace_icm_send_mra(cm_id);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error_free_msg;
- }
-
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
- cm_id_priv->service_timeout = service_timeout;
- cm_set_private_data(cm_id_priv, data, private_data_len);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return 0;
+ cm_set_private_data(cm_id_priv, NULL, 0);
-error_free_msg:
- cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- kfree(data);
return ret;
}
-EXPORT_SYMBOL(ib_send_cm_mra);
+EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@@ -3377,7 +3339,6 @@ static int cm_lap_handler(struct cm_work *work)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
- cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
@@ -3786,7 +3747,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv,
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
- cm_free_priv_msg(msg);
+ cm_free_msg(msg);
+ cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@@ -4378,7 +4340,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@@ -4494,9 +4456,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
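
The cm.c hunks above implement the "Improve CM scalability with a rwlock around the agent" item: mad_agent_lock becomes an rwlock so the hot send paths (cm_alloc_msg(), cm_form_tid()) take it as readers and only device removal takes it exclusively. A minimal sketch of that read-mostly pattern, with hypothetical names (not code from this series):

#include <linux/errno.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(agent_lock);
static void *cur_agent;			/* read on every send, cleared on removal */

static int agent_send(void *msg)
{
	int ret = -EINVAL;

	read_lock(&agent_lock);		/* many senders may hold the lock concurrently */
	if (cur_agent)
		ret = 0;		/* post 'msg' via cur_agent here, still under the lock */
	read_unlock(&agent_lock);
	return ret;
}

static void agent_remove(void)
{
	write_lock(&agent_lock);	/* exclusive: waits for all readers to drain */
	cur_agent = NULL;
	write_unlock(&agent_lock);
}
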
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
index 944d9071245d..4a4987da69d4 100644
--- a/drivers/infiniband/core/cm_trace.h
+++ b/drivers/infiniband/core/cm_trace.h
@@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
-DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index ab31eefa916b..9b471548e7ae 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
-#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
@@ -146,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
-/**
- * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
- * @res: rdma resource tracking entry pointer
- */
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
-{
- struct rdma_id_private *id_priv =
- container_of(res, struct rdma_id_private, res);
-
- return &id_priv->id;
-}
-EXPORT_SYMBOL(rdma_res_to_id);
-
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@@ -2214,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
- trace_cm_send_mra(id_priv);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(id_priv);
+ ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@@ -2476,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
- trace_cm_send_mra(cm_id->context);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(cm_id->context);
+ ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@@ -5245,7 +5231,8 @@ static int cma_netevent_callback(struct notifier_block *self,
neigh->ha, ETH_ALEN))
continue;
cma_id_get(current_id);
- queue_work(cma_wq, &current_id->id.net_work);
+ if (!queue_work(cma_wq, &current_id->id.net_work))
+ cma_id_put(current_id);
}
out:
spin_unlock_irqrestore(&id_table_lock, flags);
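
The final cma.c hunk is the "RDMA/cma: Fix hang when cma_netevent_callback fails to queue_work" change from the shortlog: queue_work() returns false when the work item is already pending, and in that case the reference taken on behalf of the work must be dropped again or the ID can never be released. The general pattern, sketched with hypothetical names:

#include <linux/kref.h>
#include <linux/workqueue.h>

struct my_obj {					/* hypothetical refcounted object */
	struct kref ref;
	struct work_struct work;
};

static struct workqueue_struct *my_wq;		/* hypothetical workqueue */

static void my_obj_release(struct kref *ref)
{
	/* kfree(container_of(ref, struct my_obj, ref)); */
}

static void my_obj_schedule(struct my_obj *obj)
{
	kref_get(&obj->ref);			/* reference owned by the pending work */
	if (!queue_work(my_wq, &obj->work))	/* false: the work was already queued */
		kref_put(&obj->ref, my_obj_release);	/* nobody will consume this ref */
}
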
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index dc622f3778be..3456d5f3aa47 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
-DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index f4486cbd8f45..62410578dec3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -368,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
- *
- * Returns true if and only if the last cm_id_priv reference has been dropped.
+ * Clean up all resources associated with the connection.
*/
-static bool destroy_cm_id(struct iw_cm_id *cm_id)
+static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
struct ib_qp *qp;
@@ -442,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
-
- return iwcm_deref_id(cm_id_priv);
}
/*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
- if (!destroy_cm_id(cm_id))
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ destroy_cm_id(cm_id);
+ if (refcount_read(&cm_id_priv->refcount) > 1)
flush_workqueue(iwcm_wq);
+ iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -1035,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work)
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
- if (ret)
- WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
+ if (ret) {
+ destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 8af0619a39cd..b4b10e8a6495 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
- return (void *) ah;
+ return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
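
The one-line mad_rmpp.c change (and the matching hunks in uverbs_cmd.c and verbs.c further down) is the "Use ERR_CAST() instead of naked casts" cleanup: ERR_CAST() from include/linux/err.h forwards an ERR_PTR-encoded error across pointer types without a bare (void *) cast, keeping the intent visible to readers and static checkers. Minimal sketch with hypothetical types:

#include <linux/err.h>

struct foo;			/* hypothetical source type */
struct bar;			/* hypothetical destination type */

static struct bar *bar_from_foo(struct foo *f)
{
	if (IS_ERR(f))
		return ERR_CAST(f);	/* previously spelled: return (void *)f; */

	return NULL;			/* real code would build and return a bar here */
}
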
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index c48ef6083020..c752ae9fad6c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,67 +41,72 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
- const struct mmu_interval_notifier_ops *ops)
+static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
- int ret;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+}
+
+static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t page_size = 1UL << umem_odp->page_shift;
+ struct hmm_dma_map *map;
+ unsigned long start;
+ unsigned long end;
+ size_t nr_entries;
+ int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list),
- GFP_KERNEL | __GFP_NOWARN);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list),
- GFP_KERNEL | __GFP_NOWARN);
- if (!umem_odp->dma_list) {
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+
+ nr_entries = (end - start) >> PAGE_SHIFT;
+ if (!(nr_entries * PAGE_SIZE / page_size))
+ return -EINVAL;
+
+ map = &umem_odp->map;
+ if (ib_uses_virt_dma(dev)) {
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
ret = -ENOMEM;
- goto out_pfn_list;
- }
+ } else
+ ret = hmm_dma_map_alloc(dev->dma_device, map,
+ (end - start) >> PAGE_SHIFT,
+ 1 << umem_odp->page_shift);
+ if (ret)
+ return ret;
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_dma_list;
- }
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_map;
return 0;
-out_dma_list:
- kvfree(umem_odp->dma_list);
-out_pfn_list:
- kvfree(umem_odp->pfn_list);
+out_free_map:
+ if (ib_uses_virt_dma(dev))
+ kfree(map->pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, map);
return ret;
}
@@ -120,7 +125,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
- int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@@ -132,16 +136,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
- umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- ret = ib_init_umem_odp(umem_odp, NULL);
- if (ret) {
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
- return ERR_PTR(ret);
- }
+ ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@@ -262,74 +260,41 @@ err_put_pid:
}
EXPORT_SYMBOL(ib_umem_odp_get);
-void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- if (!umem_odp->is_implicit_odp) {
- mutex_lock(&umem_odp->umem_mutex);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- mutex_unlock(&umem_odp->umem_mutex);
- mmu_interval_notifier_remove(&umem_odp->notifier);
- kvfree(umem_odp->dma_list);
- kvfree(umem_odp->pfn_list);
- }
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ if (ib_uses_virt_dma(dev))
+ kfree(umem_odp->map.pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
-EXPORT_SYMBOL(ib_umem_odp_release);
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
+ if (!umem_odp->is_implicit_odp)
+ ib_umem_odp_free(umem_odp);
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- *dma_addr |= access_mask;
- return 0;
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -357,9 +322,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -385,11 +347,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
- range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@@ -417,22 +379,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+ continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -445,15 +402,6 @@ retry:
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -473,45 +421,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
- dma_addr_t dma;
- int idx;
- u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
- idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];
-
- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ u64 offset = addr - ib_umem_start(umem_odp);
+ size_t idx = offset >> umem_odp->page_shift;
+ unsigned long pfn = umem_odp->map.pfn_list[idx];
+
+ if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
+ goto clear;
+
+ if (pfn & HMM_PFN_WRITE) {
+ struct page *page = hmm_pfn_to_page(pfn);
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->npages--;
+clear:
+ umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
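
The umem_odp.c rework above is the core of the "ODP hmm conversion to the new two step dma API" item: the private pfn_list/dma_list pair is replaced by struct hmm_dma_map and the helpers declared in the new <linux/hmm-dma.h> (hmm_dma_map_alloc(), hmm_dma_unmap_pfn(), hmm_dma_map_free()), while devices doing virtual DMA keep only a bare pfn_list. A rough lifecycle sketch, restricted to the calls visible in these hunks; the fault path (hmm_range_fault() filling map->pfn_list and entries being DMA mapped on demand) is elided:

#include <linux/device.h>
#include <linux/hmm-dma.h>
#include <linux/printk.h>

static int odp_bookkeeping_alloc(struct device *dma_dev, struct hmm_dma_map *map,
				 size_t nr_pfns, size_t dma_entry_size)
{
	/* One pfn_list slot per PAGE_SIZE page; DMA entries of dma_entry_size. */
	return hmm_dma_map_alloc(dma_dev, map, nr_pfns, dma_entry_size);
}

static void odp_bookkeeping_release(struct device *dma_dev,
				    struct hmm_dma_map *map, size_t nr_entries)
{
	size_t idx, unmapped = 0;

	/* hmm_dma_unmap_pfn() returns true only if the entry was actually mapped. */
	for (idx = 0; idx < nr_entries; idx++)
		if (hmm_dma_unmap_pfn(dma_dev, map, idx))
			unmapped++;

	pr_debug("released %zu DMA-mapped entries\n", unmapped);
	hmm_dma_map_free(dma_dev, map);
}
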
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3c3bb670c805..bc9fe3ceca4d 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -193,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
- return (void *)uobj;
+ return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c5e78bbefbd0..75fde0fe9989 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
- return (void *)slave;
+ return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);
diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c
index af91d16c3c77..e632f1661b92 100644
--- a/drivers/infiniband/hw/bnxt_re/debugfs.c
+++ b/drivers/infiniband/hw/bnxt_re/debugfs.c
@@ -170,6 +170,9 @@ static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP:
*val = ccparam->tcp_cp;
break;
+ case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP:
+ *val = ccparam->inact_th;
+ break;
default:
return -EINVAL;
}
@@ -203,7 +206,7 @@ static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer,
return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc);
}
-static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val)
+static int bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val)
{
u32 modify_mask;
@@ -247,7 +250,9 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs
ccparam->tcp_cp = val;
break;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE:
+ return -EOPNOTSUPP;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP:
+ ccparam->inact_th = val;
break;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE:
ccparam->time_pph = val;
@@ -258,17 +263,20 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs
}
ccparam->mask = modify_mask;
+ return 0;
}
static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val)
{
struct bnxt_qplib_cc_param ccparam = { };
+ int rc;
- /* Supporting only Gen 0 now */
- if (gen_ext == CC_CONFIG_GEN0_EXT0)
- bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
- else
- return -EINVAL;
+ if (gen_ext != CC_CONFIG_GEN0_EXT0)
+ return -EOPNOTSUPP;
+
+ rc = bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
+ if (rc)
+ return rc;
bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam);
return 0;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 457eecb99f96..be34c605d516 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -1113,7 +1113,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION;
if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE)
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED;
- if (_is_ext_stats_supported(res->dattr->dev_cap_flags) && !res->is_vf)
+ if (bnxt_ext_stats_supported(res->cctx, res->dattr->dev_cap_flags, res->is_vf))
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED;
req.qp_flags = cpu_to_le32(qp_flags);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index f231e886ad9d..9efd32a3dc55 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -846,7 +846,12 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid,
req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS;
req.resp_addr = cpu_to_le64(sbuf.dma_addr);
- req.function_id = cpu_to_le32(fid);
+ if (bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx) && rcfw->res->is_vf)
+ req.function_id =
+ cpu_to_le32(CMDQ_QUERY_ROCE_STATS_EXT_VF_VALID |
+ (fid << CMDQ_QUERY_ROCE_STATS_EXT_VF_NUM_SFT));
+ else
+ req.function_id = cpu_to_le32(fid);
req.flags = cpu_to_le16(CMDQ_QUERY_ROCE_STATS_EXT_FLAGS_FUNCTION_ID);
bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req),
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
index b6e3141253c4..d6dde762921a 100644
--- a/drivers/infiniband/hw/hfi1/mad.h
+++ b/drivers/infiniband/hw/hfi1/mad.h
@@ -124,7 +124,6 @@ struct opa_mad_notice_attr {
} __packed ntc_2048;
};
- u8 class_data[];
};
#define IB_VLARB_LOWPRI_0_31 1
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 5a91cbda4aee..764286da2ce8 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1361,16 +1361,6 @@ void sc_flush(struct send_context *sc)
sc_wait_for_packet_egress(sc, 1);
}
-/* drop all packets on the context, no waiting until they are sent */
-void sc_drop(struct send_context *sc)
-{
- if (!sc)
- return;
-
- dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
- __func__, sc->sw_index, sc->hw_context);
-}
-
/*
* Start the software reaction to a context halt or SPC freeze:
* - mark the context as halted or frozen
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index d07cc6ea7c63..ab0f9a3a8d12 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -246,7 +246,6 @@ void sc_disable(struct send_context *sc);
int sc_restart(struct send_context *sc);
void sc_return_credits(struct send_context *sc);
void sc_flush(struct send_context *sc);
-void sc_drop(struct send_context *sc);
void sc_stop(struct send_context *sc, int bit);
struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
pio_release_cb cb, void *arg);
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 0d2b39b7c8b5..16a749d16ee9 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1521,24 +1521,6 @@ void sdma_all_running(struct hfi1_devdata *dd)
}
/**
- * sdma_all_idle() - called when the link goes down
- * @dd: hfi1_devdata
- *
- * This routine moves all engines to the idle state.
- */
-void sdma_all_idle(struct hfi1_devdata *dd)
-{
- struct sdma_engine *sde;
- unsigned int i;
-
- /* idle all engines */
- for (i = 0; i < dd->num_sdma; ++i) {
- sde = &dd->per_sdma[i];
- sdma_process_event(sde, sdma_event_e70_go_idle);
- }
-}
-
-/**
* sdma_start() - called to kick off state processing for all engines
* @dd: hfi1_devdata
*
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index d77246b48434..91dfd5d0c419 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -373,7 +373,6 @@ void sdma_start(struct hfi1_devdata *dd);
void sdma_exit(struct hfi1_devdata *dd);
void sdma_clean(struct hfi1_devdata *dd, size_t num_engines);
void sdma_all_running(struct hfi1_devdata *dd);
-void sdma_all_idle(struct hfi1_devdata *dd);
void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
void sdma_freeze(struct hfi1_devdata *dd);
void sdma_unfreeze(struct hfi1_devdata *dd);
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index cf2d29098406..62b4f16dab27 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -53,7 +53,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
int ret = 0;
fd->entry_to_rb = kcalloc(uctxt->expected_count,
- sizeof(struct rb_node *),
+ sizeof(*fd->entry_to_rb),
GFP_KERNEL);
if (!fd->entry_to_rb)
return -ENOMEM;
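
The user_exp_rcv.c hunk is one of the "Allocation type matching" cleanups from the pull description: sizing the allocation with sizeof(*ptr) ties the element size to the array's declared type instead of a hard-coded type name, so the call stays correct if the declaration ever changes. Sketch with a hypothetical array:

#include <linux/errno.h>
#include <linux/slab.h>

struct tid_node;				/* hypothetical element type */
static struct tid_node **nodes;			/* stands in for fd->entry_to_rb */

static int alloc_nodes(unsigned int count)
{
	/* Preferred: element size follows the declaration of 'nodes'.
	 * The fragile spelling, kcalloc(count, sizeof(struct rb_node *), ...),
	 * happens to allocate the same bytes but no longer matches the type. */
	nodes = kcalloc(count, sizeof(*nodes), GFP_KERNEL);
	return nodes ? 0 : -ENOMEM;
}
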
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile
index 7917af8e6380..baf592e6f21b 100644
--- a/drivers/infiniband/hw/hns/Makefile
+++ b/drivers/infiniband/hw/hns/Makefile
@@ -4,6 +4,7 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
+ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 4fc5b9d5fea8..307c35888b30 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -33,7 +33,6 @@
#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
-#include "hnae3.h"
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 560a1d9de408..1dcc9cbb4678 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1027,6 +1027,26 @@ struct hns_roce_dev {
atomic64_t *dfx_cnt;
};
+enum hns_roce_trace_type {
+ TRACE_SQ,
+ TRACE_RQ,
+ TRACE_SRQ,
+};
+
+static inline const char *trace_type_to_str(enum hns_roce_trace_type type)
+{
+ switch (type) {
+ case TRACE_SQ:
+ return "SQ";
+ case TRACE_RQ:
+ return "RQ";
+ case TRACE_SRQ:
+ return "SRQ";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
{
return container_of(ib_dev, struct hns_roce_dev, ib_dev);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 160e8927d364..fa8747656f25 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -43,13 +43,15 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
-#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
+#define CREATE_TRACE_POINTS
+#include "hns_roce_trace.h"
+
enum {
CMD_RST_PRC_OTHERS,
CMD_RST_PRC_SUCCESS,
@@ -738,6 +740,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
else
ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit);
+ trace_hns_sq_wqe(qp->qpn, wqe_idx, wqe, 1 << qp->sq.wqe_shift,
+ wr->wr_id, TRACE_SQ);
if (unlikely(ret)) {
*bad_wr = wr;
goto out;
@@ -807,6 +811,9 @@ static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr,
wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge);
+
+ trace_hns_rq_wqe(hr_qp->qpn, wqe_idx, wqe, 1 << hr_qp->rq.wqe_shift,
+ wr->wr_id, TRACE_RQ);
}
static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
@@ -943,7 +950,7 @@ static void fill_wqe_idx(struct hns_roce_srq *srq, unsigned int wqe_idx)
static void update_srq_db(struct hns_roce_srq *srq)
{
struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
- struct hns_roce_v2_db db;
+ struct hns_roce_v2_db db = {};
hr_reg_write(&db, DB_TAG, srq->srqn);
hr_reg_write(&db, DB_CMD, HNS_ROCE_V2_SRQ_DB);
@@ -984,6 +991,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
fill_recv_sge_to_wqe(wr, wqe, max_sge, srq->rsv_sge);
fill_wqe_idx(srq, wqe_idx);
srq->wrid[wqe_idx] = wr->wr_id;
+
+ trace_hns_srq_wqe(srq->srqn, wqe_idx, wqe, 1 << srq->wqe_shift,
+ wr->wr_id, TRACE_SRQ);
}
if (likely(nreq)) {
@@ -1311,6 +1321,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev,
tail = csq->head;
for (i = 0; i < num; i++) {
+ trace_hns_cmdq_req(hr_dev, &desc[i]);
+
csq->desc[csq->head++] = desc[i];
if (csq->head == csq->desc_num)
csq->head = 0;
@@ -1325,6 +1337,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev,
if (hns_roce_cmq_csq_done(hr_dev)) {
ret = 0;
for (i = 0; i < num; i++) {
+ trace_hns_cmdq_resp(hr_dev, &csq->desc[tail]);
+
/* check the result of hardware write back */
desc_ret = le16_to_cpu(csq->desc[tail++].retval);
if (tail == csq->desc_num)
@@ -4302,8 +4316,7 @@ static inline int get_pdn(struct ib_pd *ib_pd)
}
static void modify_qp_reset_to_init(struct ib_qp *ibqp,
- struct hns_roce_v2_qp_context *context,
- struct hns_roce_v2_qp_context *qpc_mask)
+ struct hns_roce_v2_qp_context *context)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
@@ -5122,7 +5135,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
- modify_qp_reset_to_init(ibqp, context, qpc_mask);
+ modify_qp_reset_to_init(ibqp, context);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
modify_qp_init_to_init(ibqp, context, qpc_mask);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
@@ -5313,6 +5326,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp,
return;
spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
+ trace_hns_sq_flush_cqe(hr_qp->qpn, hr_qp->sq.head, TRACE_SQ);
hr_reg_write(context, QPC_SQ_PRODUCER_IDX, hr_qp->sq.head);
hr_reg_clear(qpc_mask, QPC_SQ_PRODUCER_IDX);
hr_qp->state = IB_QPS_ERR;
@@ -5322,6 +5336,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp,
return;
spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
+ trace_hns_rq_flush_cqe(hr_qp->qpn, hr_qp->rq.head, TRACE_RQ);
hr_reg_write(context, QPC_RQ_PRODUCER_IDX, hr_qp->rq.head);
hr_reg_clear(qpc_mask, QPC_RQ_PRODUCER_IDX);
spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
@@ -6248,6 +6263,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
eq->sub_type = sub_type;
++eq->cons_index;
aeqe_found = IRQ_HANDLED;
+ trace_hns_ae_info(event_type, aeqe, eq->eqe_size);
atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_AEQE_CNT]);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 91a5665465ff..bc7466830eaf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -34,6 +34,7 @@
#define _HNS_ROCE_HW_V2_H
#include <linux/bitops.h>
+#include "hnae3.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 8d0b63d4b50a..e7a497cc125c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -37,7 +37,6 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_cache.h>
-#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_hem.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 09da3496843b..93a48b41955b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -38,6 +38,7 @@
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
+#include "hns_roce_trace.h"
static u32 hw_index_to_key(int ind)
{
@@ -159,6 +160,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev,
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
+ trace_hns_mr(mr);
if (mr->type != MR_TYPE_FRMR)
ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr);
else
@@ -1146,6 +1148,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
struct ib_device *ibdev = &hr_dev->ib_dev;
int ret;
+ trace_hns_buf_attr(buf_attr);
/* The caller has its own buffer list and invokes the hns_roce_mtr_map()
* to finish the MTT configuration.
*/
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
index 356d98816949..f637b73b946e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_restrack.c
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -4,7 +4,6 @@
#include <rdma/rdma_cm.h>
#include <rdma/restrack.h>
#include <uapi/rdma/rdma_netlink.h>
-#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"
diff --git a/drivers/infiniband/hw/hns/hns_roce_trace.h b/drivers/infiniband/hw/hns/hns_roce_trace.h
new file mode 100644
index 000000000000..59ceb591b3a1
--- /dev/null
+++ b/drivers/infiniband/hw/hns/hns_roce_trace.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2025 Hisilicon Limited.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hns_roce
+
+#if !defined(__HNS_ROCE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HNS_ROCE_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/string_choices.h>
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+
+DECLARE_EVENT_CLASS(flush_head_template,
+ TP_PROTO(unsigned long qpn, u32 pi,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, pi, type),
+
+ TP_STRUCT__entry(__field(unsigned long, qpn)
+ __field(u32, pi)
+ __field(enum hns_roce_trace_type, type)
+ ),
+
+ TP_fast_assign(__entry->qpn = qpn;
+ __entry->pi = pi;
+ __entry->type = type;
+ ),
+
+ TP_printk("%s 0x%lx flush head 0x%x.",
+ trace_type_to_str(__entry->type),
+ __entry->qpn, __entry->pi)
+);
+
+DEFINE_EVENT(flush_head_template, hns_sq_flush_cqe,
+ TP_PROTO(unsigned long qpn, u32 pi,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, pi, type));
+DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe,
+ TP_PROTO(unsigned long qpn, u32 pi,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, pi, type));
+
+#define MAX_SGE_PER_WQE 64
+#define MAX_WQE_SIZE (MAX_SGE_PER_WQE * HNS_ROCE_SGE_SIZE)
+DECLARE_EVENT_CLASS(wqe_template,
+ TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len,
+ u64 id, enum hns_roce_trace_type type),
+ TP_ARGS(qpn, idx, wqe, len, id, type),
+
+ TP_STRUCT__entry(__field(unsigned long, qpn)
+ __field(u32, idx)
+ __array(u32, wqe,
+ MAX_WQE_SIZE / sizeof(__le32))
+ __field(u32, len)
+ __field(u64, id)
+ __field(enum hns_roce_trace_type, type)
+ ),
+
+ TP_fast_assign(__entry->qpn = qpn;
+ __entry->idx = idx;
+ __entry->id = id;
+ __entry->len = len / sizeof(__le32);
+ __entry->type = type;
+ for (int i = 0; i < __entry->len; i++)
+ __entry->wqe[i] = le32_to_cpu(((__le32 *)wqe)[i]);
+ ),
+
+ TP_printk("%s 0x%lx wqe(0x%x/0x%llx): %s",
+ trace_type_to_str(__entry->type),
+ __entry->qpn, __entry->idx, __entry->id,
+ __print_array(__entry->wqe, __entry->len,
+ sizeof(__le32)))
+);
+
+DEFINE_EVENT(wqe_template, hns_sq_wqe,
+ TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, idx, wqe, len, id, type));
+DEFINE_EVENT(wqe_template, hns_rq_wqe,
+ TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, idx, wqe, len, id, type));
+DEFINE_EVENT(wqe_template, hns_srq_wqe,
+ TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
+ enum hns_roce_trace_type type),
+ TP_ARGS(qpn, idx, wqe, len, id, type));
+
+TRACE_EVENT(hns_ae_info,
+ TP_PROTO(int event_type, void *aeqe, unsigned int len),
+ TP_ARGS(event_type, aeqe, len),
+
+ TP_STRUCT__entry(__field(int, event_type)
+ __array(u32, aeqe,
+ HNS_ROCE_V3_EQE_SIZE / sizeof(__le32))
+ __field(u32, len)
+ ),
+
+ TP_fast_assign(__entry->event_type = event_type;
+ __entry->len = len / sizeof(__le32);
+ for (int i = 0; i < __entry->len; i++)
+ __entry->aeqe[i] = le32_to_cpu(((__le32 *)aeqe)[i]);
+ ),
+
+ TP_printk("event %2d aeqe: %s", __entry->event_type,
+ __print_array(__entry->aeqe, __entry->len, sizeof(__le32)))
+);
+
+TRACE_EVENT(hns_mr,
+ TP_PROTO(struct hns_roce_mr *mr),
+ TP_ARGS(mr),
+
+ TP_STRUCT__entry(__field(u64, iova)
+ __field(u64, size)
+ __field(u32, key)
+ __field(u32, pd)
+ __field(u32, pbl_hop_num)
+ __field(u32, npages)
+ __field(int, type)
+ __field(int, enabled)
+ ),
+
+ TP_fast_assign(__entry->iova = mr->iova;
+ __entry->size = mr->size;
+ __entry->key = mr->key;
+ __entry->pd = mr->pd;
+ __entry->pbl_hop_num = mr->pbl_hop_num;
+ __entry->npages = mr->npages;
+ __entry->type = mr->type;
+ __entry->enabled = mr->enabled;
+ ),
+
+ TP_printk("iova:0x%llx, size:%llu, key:%u, pd:%u, pbl_hop:%u, npages:%u, type:%d, status:%d",
+ __entry->iova, __entry->size, __entry->key,
+ __entry->pd, __entry->pbl_hop_num, __entry->npages,
+ __entry->type, __entry->enabled)
+);
+
+TRACE_EVENT(hns_buf_attr,
+ TP_PROTO(struct hns_roce_buf_attr *attr),
+ TP_ARGS(attr),
+
+ TP_STRUCT__entry(__field(unsigned int, region_count)
+ __field(unsigned int, region0_size)
+ __field(int, region0_hopnum)
+ __field(unsigned int, region1_size)
+ __field(int, region1_hopnum)
+ __field(unsigned int, region2_size)
+ __field(int, region2_hopnum)
+ __field(unsigned int, page_shift)
+ __field(bool, mtt_only)
+ ),
+
+ TP_fast_assign(__entry->region_count = attr->region_count;
+ __entry->region0_size = attr->region[0].size;
+ __entry->region0_hopnum = attr->region[0].hopnum;
+ __entry->region1_size = attr->region[1].size;
+ __entry->region1_hopnum = attr->region[1].hopnum;
+ __entry->region2_size = attr->region[2].size;
+ __entry->region2_hopnum = attr->region[2].hopnum;
+ __entry->page_shift = attr->page_shift;
+ __entry->mtt_only = attr->mtt_only;
+ ),
+
+ TP_printk("rg cnt:%u, pg_sft:0x%x, mtt_only:%s, rg 0 (sz:%u, hop:%u), rg 1 (sz:%u, hop:%u), rg 2 (sz:%u, hop:%u)\n",
+ __entry->region_count, __entry->page_shift,
+ str_yes_no(__entry->mtt_only),
+ __entry->region0_size, __entry->region0_hopnum,
+ __entry->region1_size, __entry->region1_hopnum,
+ __entry->region2_size, __entry->region2_hopnum)
+);
+
+DECLARE_EVENT_CLASS(cmdq,
+ TP_PROTO(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmq_desc *desc),
+ TP_ARGS(hr_dev, desc),
+
+ TP_STRUCT__entry(__string(dev_name, dev_name(hr_dev->dev))
+ __field(u16, opcode)
+ __field(u16, flag)
+ __field(u16, retval)
+ __array(u32, data, 6)
+ ),
+
+ TP_fast_assign(__assign_str(dev_name);
+ __entry->opcode = le16_to_cpu(desc->opcode);
+ __entry->flag = le16_to_cpu(desc->flag);
+ __entry->retval = le16_to_cpu(desc->retval);
+ for (int i = 0; i < 6; i++)
+ __entry->data[i] = le32_to_cpu(desc->data[i]);
+ ),
+
+ TP_printk("%s cmdq opcode:0x%x, flag:0x%x, retval:0x%x, data:%s\n",
+ __get_str(dev_name), __entry->opcode,
+ __entry->flag, __entry->retval,
+ __print_array(__entry->data, 6, sizeof(__le32)))
+);
+
+DEFINE_EVENT(cmdq, hns_cmdq_req,
+ TP_PROTO(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmq_desc *desc),
+ TP_ARGS(hr_dev, desc));
+DEFINE_EVENT(cmdq, hns_cmdq_resp,
+ TP_PROTO(struct hns_roce_dev *hr_dev,
+ struct hns_roce_cmq_desc *desc),
+ TP_ARGS(hr_dev, desc));
+
+#endif /* __HNS_ROCE_TRACE_H */
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE hns_roce_trace
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
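
hns_roce_trace.h above follows the kernel's standard self-defining tracepoint header layout: because TRACE_INCLUDE_PATH is ".", trace/define_trace.h re-includes the header by file name through the compiler's include path, which is what the earlier hns Makefile hunk ("ccflags-y += -I $(src)") provides. Exactly one translation unit in the module instantiates the event bodies; every other file just includes the header and calls the generated trace_*() helpers. Condensed usage sketch (the example call values are made up):

/* In exactly one .c file of the module -- here hns_roce_hw_v2.c -- the event
 * bodies are generated by defining CREATE_TRACE_POINTS before the include: */
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"

/* Other files include "hns_roce_trace.h" without the define and simply call: */
static void example_flush(unsigned long qpn, u32 head)
{
	trace_hns_sq_flush_cqe(qpn, head, TRACE_SQ);	/* qpn, producer index, type */
}
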
diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c
index 6aed6169c07d..99a7f1a6c0b5 100644
--- a/drivers/infiniband/hw/irdma/ctrl.c
+++ b/drivers/infiniband/hw/irdma/ctrl.c
@@ -3131,7 +3131,7 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp,
writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]);
ibdev_dbg(to_ibdev(cqp->dev),
- "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%pK] cqp[%p] polarity[x%04x]\n",
+ "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n",
cqp->sq_size, cqp->hw_sq_size, cqp->sq_base,
(u64 *)(uintptr_t)cqp->sq_pa, cqp, cqp->polarity);
return 0;
diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c
index e7ce6840755f..37ce35cb10e7 100644
--- a/drivers/infiniband/hw/irdma/pble.c
+++ b/drivers/infiniband/hw/irdma/pble.c
@@ -108,7 +108,7 @@ static int add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc,
chunk->vaddr = sd_entry->u.bp.addr.va + offset;
chunk->fpm_addr = pble_rsrc->next_fpm_addr;
ibdev_dbg(to_ibdev(dev),
- "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%pK fpm_addr = %llx\n",
+ "PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%p fpm_addr = %llx\n",
chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr);
return 0;
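
The two irdma hunks are the "Don't use %pK in printk" cleanup: plain %p output from printk has been hashed by default for years, while %pK is intended for interfaces gated by kptr_restrict, so debug messages can simply use %p. Sketch:

#include <linux/printk.h>

static void show_ptr(const void *sq_base)
{
	/* Before: pr_debug("sq_base[%pK]\n", sq_base); */
	pr_debug("sq_base[%p]\n", sq_base);	/* %p is already hashed in the log */
}
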
diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 0fc4e2679218..28e154bbb50f 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -15,14 +15,12 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_device *ibdev = ibcq->device;
struct mana_ib_create_cq ucmd = {};
struct mana_ib_dev *mdev;
- struct gdma_context *gc;
bool is_rnic_cq;
u32 doorbell;
u32 buf_size;
int err;
mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
- gc = mdev_to_gc(mdev);
cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors;
cq->cq_handle = INVALID_MANA_HANDLE;
@@ -65,7 +63,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err);
return err;
}
- doorbell = gc->mana_ib.doorbell;
+ doorbell = mdev->gdma_dev->doorbell;
}
if (is_rnic_cq) {
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
index b31089320aa5..165c0a1e67d1 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -101,103 +101,95 @@ static int mana_ib_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
+ struct gdma_context *gc = madev->mdev->gdma_context;
+ struct mana_context *mc = gc->mana.driver_data;
struct gdma_dev *mdev = madev->mdev;
struct net_device *ndev;
- struct mana_context *mc;
struct mana_ib_dev *dev;
u8 mac_addr[ETH_ALEN];
int ret;
- mc = mdev->driver_data;
-
dev = ib_alloc_device(mana_ib_dev, ib_dev);
if (!dev)
return -ENOMEM;
ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops);
-
- dev->ib_dev.phys_port_cnt = mc->num_ports;
-
- ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
- mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
-
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
-
- /*
- * num_comp_vectors needs to set to the max MSIX index
- * when interrupts and event queues are implemented
- */
- dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues;
- dev->ib_dev.dev.parent = mdev->gdma_context->dev;
-
- ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker);
- if (!ndev) {
- ret = -ENODEV;
- ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1");
- goto free_ib_device;
- }
- ether_addr_copy(mac_addr, ndev->dev_addr);
- addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
- ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
- /* mana_get_primary_netdev() returns ndev with refcount held */
- netdev_put(ndev, &dev->dev_tracker);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
- goto free_ib_device;
- }
-
- ret = mana_gd_register_device(&mdev->gdma_context->mana_ib);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to register device, ret %d",
- ret);
- goto free_ib_device;
- }
- dev->gdma_dev = &mdev->gdma_context->mana_ib;
-
- dev->nb.notifier_call = mana_ib_netdev_event;
- ret = register_netdevice_notifier(&dev->nb);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d",
- ret);
- goto deregister_device;
- }
-
- ret = mana_ib_gd_query_adapter_caps(dev);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d",
- ret);
- goto deregister_net_notifier;
- }
-
- ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops);
-
- ret = mana_ib_create_eqs(dev);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret);
- goto deregister_net_notifier;
- }
-
- ret = mana_ib_gd_create_rnic_adapter(dev);
- if (ret)
- goto destroy_eqs;
-
+ dev->ib_dev.num_comp_vectors = gc->max_num_queues;
+ dev->ib_dev.dev.parent = gc->dev;
+ dev->gdma_dev = mdev;
xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
- ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr);
- if (ret) {
- ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d",
- ret);
- goto destroy_rnic;
+
+ if (mana_ib_is_rnic(dev)) {
+ dev->ib_dev.phys_port_cnt = 1;
+ ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker);
+ if (!ndev) {
+ ret = -ENODEV;
+ ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1");
+ goto free_ib_device;
+ }
+ ether_addr_copy(mac_addr, ndev->dev_addr);
+ addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
+ ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
+ /* mana_get_primary_netdev() returns ndev with refcount held */
+ netdev_put(ndev, &dev->dev_tracker);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
+ goto free_ib_device;
+ }
+
+ dev->nb.notifier_call = mana_ib_netdev_event;
+ ret = register_netdevice_notifier(&dev->nb);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d",
+ ret);
+ goto free_ib_device;
+ }
+
+ ret = mana_ib_gd_query_adapter_caps(dev);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret);
+ goto deregister_net_notifier;
+ }
+
+ ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops);
+
+ ret = mana_ib_create_eqs(dev);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret);
+ goto deregister_net_notifier;
+ }
+
+ ret = mana_ib_gd_create_rnic_adapter(dev);
+ if (ret)
+ goto destroy_eqs;
+
+ ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", ret);
+ goto destroy_rnic;
+ }
+ } else {
+ dev->ib_dev.phys_port_cnt = mc->num_ports;
+ ret = mana_eth_query_adapter_caps(dev);
+ if (ret) {
+ ibdev_err(&dev->ib_dev, "Failed to query ETH device caps, ret %d", ret);
+ goto free_ib_device;
+ }
}
- dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev,
- MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0);
+ dev->av_pool = dma_pool_create("mana_ib_av", gc->dev, MANA_AV_BUFFER_SIZE,
+ MANA_AV_BUFFER_SIZE, 0);
if (!dev->av_pool) {
ret = -ENOMEM;
goto destroy_rnic;
}
- ret = ib_register_device(&dev->ib_dev, "mana_%d",
- mdev->gdma_context->dev);
+ ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
+ mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
+
+ ret = ib_register_device(&dev->ib_dev, mana_ib_is_rnic(dev) ? "mana_%d" : "manae_%d",
+ gc->dev);
if (ret)
goto deallocate_pool;
@@ -208,15 +200,16 @@ static int mana_ib_probe(struct auxiliary_device *adev,
deallocate_pool:
dma_pool_destroy(dev->av_pool);
destroy_rnic:
- xa_destroy(&dev->qp_table_wq);
- mana_ib_gd_destroy_rnic_adapter(dev);
+ if (mana_ib_is_rnic(dev))
+ mana_ib_gd_destroy_rnic_adapter(dev);
destroy_eqs:
- mana_ib_destroy_eqs(dev);
+ if (mana_ib_is_rnic(dev))
+ mana_ib_destroy_eqs(dev);
deregister_net_notifier:
- unregister_netdevice_notifier(&dev->nb);
-deregister_device:
- mana_gd_deregister_device(dev->gdma_dev);
+ if (mana_ib_is_rnic(dev))
+ unregister_netdevice_notifier(&dev->nb);
free_ib_device:
+ xa_destroy(&dev->qp_table_wq);
ib_dealloc_device(&dev->ib_dev);
return ret;
}
@@ -227,25 +220,24 @@ static void mana_ib_remove(struct auxiliary_device *adev)
ib_unregister_device(&dev->ib_dev);
dma_pool_destroy(dev->av_pool);
+ if (mana_ib_is_rnic(dev)) {
+ mana_ib_gd_destroy_rnic_adapter(dev);
+ mana_ib_destroy_eqs(dev);
+ unregister_netdevice_notifier(&dev->nb);
+ }
xa_destroy(&dev->qp_table_wq);
- mana_ib_gd_destroy_rnic_adapter(dev);
- mana_ib_destroy_eqs(dev);
- unregister_netdevice_notifier(&dev->nb);
- mana_gd_deregister_device(dev->gdma_dev);
ib_dealloc_device(&dev->ib_dev);
}
static const struct auxiliary_device_id mana_id_table[] = {
- {
- .name = "mana.rdma",
- },
+ { .name = "mana.rdma", },
+ { .name = "mana.eth", },
{},
};
MODULE_DEVICE_TABLE(auxiliary, mana_id_table);
static struct auxiliary_driver mana_driver = {
- .name = "rdma",
.probe = mana_ib_probe,
.remove = mana_ib_remove,
.id_table = mana_id_table,
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index eda9c5b971de..41a24a186f9d 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -4,6 +4,7 @@
*/
#include "mana_ib.h"
+#include <linux/pci.h>
void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
u32 port)
@@ -243,7 +244,6 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type,
struct mana_ib_queue *queue)
{
- struct gdma_context *gc = mdev_to_gc(mdev);
struct gdma_queue_spec spec = {};
int err;
@@ -252,7 +252,7 @@ int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_qu
spec.type = type;
spec.monitor_avl_buf = false;
spec.queue_size = size;
- err = mana_gd_create_mana_wq_cq(&gc->mana_ib, &spec, &queue->kmem);
+ err = mana_gd_create_mana_wq_cq(mdev->gdma_dev, &spec, &queue->kmem);
if (err)
return err;
/* take ownership into mana_ib from mana */
@@ -479,7 +479,7 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
{
unsigned long page_sz;
- page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt);
+ page_sz = ib_umem_find_best_pgsz(umem, dev->adapter_caps.page_size_cap, virt);
if (!page_sz) {
ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n");
return -EINVAL;
@@ -494,7 +494,7 @@ int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_ume
unsigned long page_sz;
/* Hardware requires dma region to align to chosen page size */
- page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0);
+ page_sz = ib_umem_find_best_pgoff(umem, dev->adapter_caps.page_size_cap, 0);
if (!page_sz) {
ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n");
return -EINVAL;
@@ -551,6 +551,7 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable)
{
+ struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct ib_port_attr attr;
int err;
@@ -560,10 +561,12 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
- immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
- if (port_num == 1) {
- immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+ if (mana_ib_is_rnic(dev)) {
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+ } else {
+ immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
}
return 0;
@@ -572,12 +575,14 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
struct ib_udata *uhw)
{
- struct mana_ib_dev *dev = container_of(ibdev,
- struct mana_ib_dev, ib_dev);
+ struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+ struct pci_dev *pdev = to_pci_dev(mdev_to_gc(dev)->dev);
memset(props, 0, sizeof(*props));
+ props->vendor_id = pdev->vendor;
+ props->vendor_part_id = dev->gdma_dev->dev_id.type;
props->max_mr_size = MANA_IB_MAX_MR_SIZE;
- props->page_size_cap = PAGE_SZ_BM;
+ props->page_size_cap = dev->adapter_caps.page_size_cap;
props->max_qp = dev->adapter_caps.max_qp_count;
props->max_qp_wr = dev->adapter_caps.max_qp_wr;
props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN;
@@ -596,6 +601,8 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
props->max_ah = INT_MAX;
props->max_pkeys = 1;
props->local_ca_ack_delay = MANA_CA_ACK_DELAY;
+ if (!mana_ib_is_rnic(dev))
+ props->raw_packet_caps = IB_RAW_PACKET_CAP_IP_CSUM;
return 0;
}
@@ -603,6 +610,7 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
int mana_ib_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
+ struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct net_device *ndev = mana_ib_get_netdev(ibdev, port);
if (!ndev)
@@ -623,7 +631,7 @@ int mana_ib_query_port(struct ib_device *ibdev, u32 port,
props->active_width = IB_WIDTH_4X;
props->active_speed = IB_SPEED_EDR;
props->pkey_tbl_len = 1;
- if (port == 1) {
+ if (mana_ib_is_rnic(dev)) {
props->gid_tbl_len = 16;
props->port_cap_flags = IB_PORT_CM_SUP;
props->ip_gids = true;
@@ -696,6 +704,41 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev)
caps->max_recv_sge_count = resp.max_recv_sge_count;
caps->feature_flags = resp.feature_flags;
+ caps->page_size_cap = PAGE_SZ_BM;
+ if (mdev_to_gc(dev)->pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB)
+ caps->page_size_cap |= (SZ_4M | SZ_1G | SZ_2G);
+
+ return 0;
+}
+
+int mana_eth_query_adapter_caps(struct mana_ib_dev *dev)
+{
+ struct mana_ib_adapter_caps *caps = &dev->adapter_caps;
+ struct gdma_query_max_resources_resp resp = {};
+ struct gdma_general_req req = {};
+ int err;
+
+ mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
+ sizeof(req), sizeof(resp));
+
+ err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), &req, sizeof(resp), &resp);
+ if (err) {
+ ibdev_err(&dev->ib_dev,
+ "Failed to query adapter caps err %d", err);
+ return err;
+ }
+
+ caps->max_qp_count = min_t(u32, resp.max_sq, resp.max_rq);
+ caps->max_cq_count = resp.max_cq;
+ caps->max_mr_count = resp.max_mst;
+ caps->max_pd_count = 0x6000;
+ caps->max_qp_wr = min_t(u32,
+ 0x100000 / GDMA_MAX_SQE_SIZE,
+ 0x100000 / GDMA_MAX_RQE_SIZE);
+ caps->max_send_sge_count = 30;
+ caps->max_recv_sge_count = 15;
+ caps->page_size_cap = PAGE_SZ_BM;
+
return 0;
}
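
The ETH path above keeps PAGE_SZ_BM, while the RNIC path can extend page_size_cap with SZ_4M, SZ_1G and SZ_2G when the PF reports GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB; that bitmap is what ib_umem_find_best_pgsz()/ib_umem_find_best_pgoff() consume in the hunks above. A minimal userspace sketch of the selection idea follows; best_pgsz(), the bitmap values and the alignment-only check are illustrative assumptions, since the in-kernel helper also accounts for the umem's SGL layout and the IOVA.

    /*
     * Simplified, userspace-only model of picking the largest page size out
     * of a supported-size bitmap such as adapter_caps.page_size_cap. The
     * real ib_umem_find_best_pgsz() also accounts for the umem's SGL layout
     * and the IOVA; this sketch only checks start/length alignment.
     */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t best_pgsz(uint64_t pgsz_bitmap, uint64_t start,
                              uint64_t length)
    {
        uint64_t best = 0;

        for (unsigned int shift = 12; shift < 64; shift++) {
            uint64_t sz = 1ULL << shift;

            if (!(pgsz_bitmap & sz))
                continue;
            if ((start | length) & (sz - 1))
                break;  /* larger power-of-two sizes cannot align either */
            best = sz;
        }
        return best;
    }

    int main(void)
    {
        /* hypothetical bitmap: 4K, 64K, 2M, 4M, 1G and 2G pages */
        uint64_t cap = (1ULL << 12) | (1ULL << 16) | (1ULL << 21) |
                       (1ULL << 22) | (1ULL << 30) | (1ULL << 31);

        printf("best page size: 0x%llx\n",
               (unsigned long long)best_pgsz(cap, 0, 1ULL << 30));
        return 0;
    }
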
@@ -740,7 +783,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
spec.eq.msix_index = 0;
- err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq);
+ err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq);
if (err)
return err;
@@ -791,7 +834,7 @@ int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev)
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp));
req.hdr.req.msg_version = GDMA_MESSAGE_V2;
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.notify_eq_id = mdev->fatal_err_eq->id;
if (mdev->adapter_caps.feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT)
@@ -816,7 +859,7 @@ int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev)
gc = mdev_to_gc(mdev);
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
@@ -843,7 +886,7 @@ int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context)
}
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = ADDR_OP_ADD;
req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4;
@@ -873,7 +916,7 @@ int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context)
}
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = ADDR_OP_REMOVE;
req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4;
@@ -896,7 +939,7 @@ int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = op;
copy_in_reverse(req.mac_addr, mac, ETH_ALEN);
@@ -917,8 +960,11 @@ int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 do
struct mana_rnic_create_cq_req req = {};
int err;
+ if (!mdev->eqs)
+ return -EINVAL;
+
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.gdma_region = cq->queue.gdma_region;
req.eq_id = mdev->eqs[cq->comp_vector]->id;
@@ -950,7 +996,7 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq)
return 0;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.cq_handle = cq->cq_handle;
@@ -976,7 +1022,7 @@ int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp,
int err, i;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.pd_handle = pd->pd_handle;
req.send_cq_handle = send_cq->cq_handle;
@@ -1012,7 +1058,7 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp)
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.rc_qp_handle = qp->qp_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
@@ -1035,7 +1081,7 @@ int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp,
int err, i;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.pd_handle = pd->pd_handle;
req.send_cq_handle = send_cq->cq_handle;
@@ -1070,7 +1116,7 @@ int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp)
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.qp_handle = qp->qp_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 6903946677e5..42bebd6cd4f7 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -60,6 +60,7 @@ struct mana_ib_adapter_caps {
u32 max_recv_sge_count;
u32 max_inline_data_size;
u64 feature_flags;
+ u64 page_size_cap;
};
struct mana_ib_queue {
@@ -543,6 +544,11 @@ static inline void mana_put_qp_ref(struct mana_ib_qp *qp)
complete(&qp->free);
}
+static inline bool mana_ib_is_rnic(struct mana_ib_dev *mdev)
+{
+ return mdev->gdma_dev->dev_id.type == GDMA_DEVICE_MANA_IB;
+}
+
static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port)
{
struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
@@ -642,6 +648,7 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext);
int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev);
+int mana_eth_query_adapter_caps(struct mana_ib_dev *mdev);
int mana_ib_create_eqs(struct mana_ib_dev *mdev);
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index f99557ec7767..6d974d0a8400 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -5,8 +5,8 @@
#include "mana_ib.h"
-#define VALID_MR_FLAGS \
- (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ)
+#define VALID_MR_FLAGS (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ |\
+ IB_ACCESS_REMOTE_ATOMIC | IB_ZERO_BASED)
#define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE)
@@ -24,6 +24,9 @@ mana_ib_verbs_to_gdma_access_flags(int access_flags)
if (access_flags & IB_ACCESS_REMOTE_READ)
flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ flags |= GDMA_ACCESS_FLAG_REMOTE_ATOMIC;
+
return flags;
}
@@ -48,7 +51,10 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
req.gva.virtual_address = mr_params->gva.virtual_address;
req.gva.access_flags = mr_params->gva.access_flags;
break;
-
+ case GDMA_MR_TYPE_ZBVA:
+ req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle;
+ req.zbva.access_flags = mr_params->zbva.access_flags;
+ break;
default:
ibdev_dbg(&dev->ib_dev,
"invalid param (GDMA_MR_TYPE) passed, type %d\n",
@@ -144,11 +150,18 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
dma_region_handle);
mr_params.pd_handle = pd->pd_handle;
- mr_params.mr_type = GDMA_MR_TYPE_GVA;
- mr_params.gva.dma_region_handle = dma_region_handle;
- mr_params.gva.virtual_address = iova;
- mr_params.gva.access_flags =
- mana_ib_verbs_to_gdma_access_flags(access_flags);
+ if (access_flags & IB_ZERO_BASED) {
+ mr_params.mr_type = GDMA_MR_TYPE_ZBVA;
+ mr_params.zbva.dma_region_handle = dma_region_handle;
+ mr_params.zbva.access_flags =
+ mana_ib_verbs_to_gdma_access_flags(access_flags);
+ } else {
+ mr_params.mr_type = GDMA_MR_TYPE_GVA;
+ mr_params.gva.dma_region_handle = dma_region_handle;
+ mr_params.gva.virtual_address = iova;
+ mr_params.gva.access_flags =
+ mana_ib_verbs_to_gdma_access_flags(access_flags);
+ }
err = mana_ib_gd_create_mr(dev, mr, &mr_params);
if (err)
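
mana_ib_reg_user_mr() now accepts IB_ZERO_BASED and maps it to GDMA_MR_TYPE_ZBVA, alongside the new IB_ACCESS_REMOTE_ATOMIC translation. For reference, a hedged userspace sketch of how an application might request such an MR through libibverbs is below; reg_zero_based() and its buffer handling are assumptions, and it presumes a libibverbs build that exposes IBV_ACCESS_ZERO_BASED (with a zero-based MR the peer addresses the region by offset from 0 rather than by the local virtual address).

    /*
     * Hedged libibverbs sketch: registering a zero-based MR. Device/PD
     * setup and error reporting are omitted.
     */
    #include <stdlib.h>
    #include <unistd.h>
    #include <infiniband/verbs.h>

    struct ibv_mr *reg_zero_based(struct ibv_pd *pd, size_t len)
    {
        void *buf;
        struct ibv_mr *mr;

        if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), len))
            return NULL;

        mr = ibv_reg_mr(pd, buf, len,
                        IBV_ACCESS_LOCAL_WRITE |
                        IBV_ACCESS_REMOTE_WRITE |
                        IBV_ACCESS_REMOTE_READ |
                        IBV_ACCESS_ZERO_BASED);
        if (!mr)
            free(buf);  /* registration failed, release the buffer */
        return mr;
    }
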
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index c928af58f38b..14fd7d6c54a2 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -635,7 +635,6 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd,
{
struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev);
struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
- struct gdma_context *gc = mdev_to_gc(mdev);
u32 doorbell, queue_size;
int i, err;
@@ -654,7 +653,7 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd,
goto destroy_queues;
}
}
- doorbell = gc->mana_ib.doorbell;
+ doorbell = mdev->gdma_dev->doorbell;
err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr,
sizeof(struct ud_rq_shadow_wqe));
@@ -736,7 +735,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp));
- req.hdr.dev_id = gc->mana_ib.dev_id;
+ req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.qp_handle = qp->qp_handle;
req.qp_state = attr->qp_state;
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 33f525b744f2..e279e69b9a51 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -43,7 +43,7 @@
#define MAX_VFS 80
#define MAX_PEND_REQS_PER_FUNC 4
-#define MAD_TIMEOUT_MS 2000
+#define MAD_TIMEOUT_SEC 2
#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
@@ -270,7 +270,7 @@ static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad
if (!ret) {
/* calls mlx4_ib_mcg_timeout_handler */
queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
- msecs_to_jiffies(MAD_TIMEOUT_MS));
+ secs_to_jiffies(MAD_TIMEOUT_SEC));
}
return ret;
@@ -309,7 +309,7 @@ static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
if (!ret) {
/* calls mlx4_ib_mcg_timeout_handler */
queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
- msecs_to_jiffies(MAD_TIMEOUT_MS));
+ secs_to_jiffies(MAD_TIMEOUT_SEC));
}
return ret;
@@ -1091,7 +1091,7 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy
for (i = 0; i < MAX_VFS; ++i)
clean_vf_mcast(ctx, i);
- end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+ end = jiffies + secs_to_jiffies(MAD_TIMEOUT_SEC + 3);
do {
count = 0;
mutex_lock(&ctx->mcg_table_lock);
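
The mcg changes are a pure readability cleanup: a 2000 ms timeout becomes MAD_TIMEOUT_SEC = 2 passed to secs_to_jiffies(). A tiny illustrative model of why the two forms are equivalent is below; HZ and the simplified macros are stand-ins, and the kernel's msecs_to_jiffies() additionally rounds up.

    /*
     * Illustrative-only model of the cleanup: with the kernel helpers,
     * secs_to_jiffies(2) and msecs_to_jiffies(2000) produce the same tick
     * count.
     */
    #include <stdio.h>

    #define HZ 250                          /* hypothetical tick rate */
    #define secs_to_jiffies(s)  ((s) * HZ)
    #define msecs_to_jiffies(m) (((m) * HZ) / 1000)

    int main(void)
    {
        printf("%d %d\n", secs_to_jiffies(2), msecs_to_jiffies(2000));
        return 0;
    }
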
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 0ff9f18a71e8..680627f1de33 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -1645,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
}
-enum {
- LEFTOVERS_MC,
- LEFTOVERS_UC,
-};
-
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_prio *ft_prio,
struct ib_flow_attr *flow_attr,
@@ -1659,43 +1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
struct mlx5_ib_flow_handler *handler = NULL;
static struct {
- struct ib_flow_attr flow_attr;
struct ib_flow_spec_eth eth_flow;
- } leftovers_specs[] = {
- [LEFTOVERS_MC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {0x1} }
- }
- },
- [LEFTOVERS_UC] = {
- .flow_attr = {
- .num_of_specs = 1,
- .size = sizeof(leftovers_specs[0])
- },
- .eth_flow = {
- .type = IB_FLOW_SPEC_ETH,
- .size = sizeof(struct ib_flow_spec_eth),
- .mask = {.dst_mac = {0x1} },
- .val = {.dst_mac = {} }
- }
- }
- };
+ struct ib_flow_attr flow_attr;
+ } leftovers_wc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_wc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = { 0x1 } } } };
- handler = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_MC].flow_attr,
- dst);
+ static struct {
+ struct ib_flow_spec_eth eth_flow;
+ struct ib_flow_attr flow_attr;
+ } leftovers_uc = { .flow_attr = { .num_of_specs = 1,
+ .size = sizeof(leftovers_uc) },
+ .eth_flow = {
+ .type = IB_FLOW_SPEC_ETH,
+ .size = sizeof(struct ib_flow_spec_eth),
+ .mask = { .dst_mac = { 0x1 } },
+ .val = { .dst_mac = {} } } };
+
+ handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst);
if (!IS_ERR(handler) &&
flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
handler_ucast = create_flow_rule(dev, ft_prio,
- &leftovers_specs[LEFTOVERS_UC].flow_attr,
- dst);
+ &leftovers_uc.flow_attr, dst);
if (IS_ERR(handler_ucast)) {
mlx5_del_flow_rules(handler->rule);
ft_prio->refcount--;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d07cacaa0abd..ce7610740412 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -485,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1):
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_HDR;
@@ -493,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2):
+ *active_width = IB_WIDTH_2X;
+ *active_speed = IB_SPEED_XDR;
+ break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_NDR;
break;
+ case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4):
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_XDR;
+ break;
default:
return -EINVAL;
}
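
The new cases report IB_SPEED_XDR, which corresponds to roughly 200 Gb/s per lane, so the aggregate rates line up with the protocol names (200GAUI-1, 400GAUI-2, 800GAUI-4). A small sketch of that arithmetic follows; the per-lane figure is a nominal rate chosen for illustration, not the exact signalling rate.

    /*
     * Nominal arithmetic behind the new cases: treating XDR as roughly
     * 200 Gb/s per lane, 200GAUI-1 maps to 1X, 400GAUI-2 to 2X and
     * 800GAUI-4 to 4X.
     */
    #include <stdio.h>

    int main(void)
    {
        const int xdr_per_lane_gbps = 200;
        const int widths[] = { 1, 2, 4 };

        for (size_t i = 0; i < sizeof(widths) / sizeof(widths[0]); i++)
            printf("%dX x XDR ~= %d Gb/s\n",
                   widths[i], widths[i] * xdr_per_lane_gbps);
        return 0;
    }
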
@@ -4422,17 +4434,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
mlx5_core_native_port_num(dev->mdev) - 1);
}
-static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
-{
- dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
- return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
-}
-
-static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
-{
- mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
-}
-
static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
{
int err;
@@ -4662,9 +4663,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
@@ -4722,9 +4720,6 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_UAR,
- mlx5_ib_stage_uar_init,
- mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ace2df3e1d9f..fde859d207ae 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
@@ -1005,7 +1006,6 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
- MLX5_IB_STAGE_UAR,
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
@@ -1473,8 +1473,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags);
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags);
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
@@ -1495,8 +1495,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
{
return 0;
}
-static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags) {}
+static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
+{
+ return -EOPNOTSUPP;
+}
static inline int
mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5fbebafc8774..6dd813bac5b2 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -525,7 +525,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
ent->fill_to_high_water = false;
if (ent->pending)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
else
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
}
@@ -576,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
"add keys command failed, err %d\n",
err);
queue_delayed_work(cache->wq, &ent->dwork,
- msecs_to_jiffies(1000));
+ secs_to_jiffies(1));
}
}
} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
@@ -2051,7 +2051,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
ent->in_use--;
if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
- msecs_to_jiffies(30 * 1000));
+ secs_to_jiffies(30));
ent->tmp_cleanup_scheduled = true;
}
spin_unlock_irq(&ent->mkeys_queue.lock);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 86d8fa63bf69..eaa2f9f5f3a9 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,9 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
+#include <linux/pci-p2pdma.h>
#include "mlx5_ib.h"
#include "cmd.h"
@@ -158,41 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
}
}
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
- u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
- if (umem_dma & ODP_READ_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_READ;
- if (umem_dma & ODP_WRITE_ALLOWED_BIT)
- mtt_entry |= MLX5_IB_MTT_WRITE;
-
- return mtt_entry;
-}
-
-static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags)
+static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
- dma_addr_t pa;
+ bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ struct ib_device *dev = odp->umem.ibdev;
size_t i;
if (flags & MLX5_IB_UPD_XLT_ZAP)
- return;
+ return 0;
for (i = 0; i < nentries; i++) {
- pa = odp->dma_list[idx + i];
- pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+		unsigned long pfn = odp->map.pfn_list[start + i];
+		dma_addr_t dma_addr;
+
+ if (!(pfn & HMM_PFN_VALID))
+ /* ODP initialization */
+ continue;
+
+ dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
+ start + i, &p2pdma_state);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -EFAULT;
+
+ dma_addr |= MLX5_IB_MTT_READ;
+ if ((pfn & HMM_PFN_WRITE) && !downgrade)
+ dma_addr |= MLX5_IB_MTT_WRITE;
+
+ pas[i] = cpu_to_be64(dma_addr);
+ odp->npages++;
}
+ return 0;
}
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
- struct mlx5_ib_mr *mr, int flags)
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
populate_klm(xlt, idx, nentries, mr, flags);
+ return 0;
} else {
- populate_mtt(xlt, idx, nentries, mr, flags);
+ return populate_mtt(xlt, idx, nentries, mr, flags);
}
}
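
populate_mtt() now derives each MTT entry from the HMM pfn list: invalid pfns are skipped, hmm_dma_map_pfn() supplies the DMA address, and read/write permission bits are folded into the low bits of that address, with write withheld on a downgrade. A simplified userspace model of the packing step is below; pack_mtt() and the MTT_READ/MTT_WRITE values are illustrative stand-ins for MLX5_IB_MTT_READ/WRITE.

    /*
     * Simplified model of the MTT packing step: permission flags live in the
     * low bits of the page-aligned DMA address; in the driver the address
     * comes from hmm_dma_map_pfn().
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MTT_READ  0x1ULL
    #define MTT_WRITE 0x2ULL

    static uint64_t pack_mtt(uint64_t dma_addr, bool writable, bool downgrade)
    {
        uint64_t entry = dma_addr | MTT_READ;   /* read is always allowed */

        if (writable && !downgrade)
            entry |= MTT_WRITE;
        return entry;
    }

    int main(void)
    {
        printf("0x%llx\n",
               (unsigned long long)pack_mtt(0x1000, true, false));
        return 0;
    }
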
@@ -303,8 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
- if (umem_odp->dma_list[idx] &
- (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+ if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
@@ -687,7 +698,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
- u64 access_mask;
+ u64 access_mask = 0;
u64 start_idx;
bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -695,12 +706,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+ if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+ xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
page_shift = odp->page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
- access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
+ access_mask |= HMM_PFN_WRITE;
np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
if (np < 0)
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index d3dcc272200a..146d03ae40bd 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
spin_lock_irqsave(&table->lock, flags);
common = radix_tree_lookup(&table->tree, rsn);
- if (common)
+ if (common && !common->invalid)
refcount_inc(&common->refcount);
+ else
+ common = NULL;
spin_unlock_irqrestore(&table->lock, flags);
@@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev,
return 0;
}
+static void modify_resource_common_state(struct mlx5_ib_dev *dev,
+ struct mlx5_core_qp *qp,
+ bool invalid)
+{
+ struct mlx5_qp_table *table = &dev->qp_table;
+ unsigned long flags;
+
+ spin_lock_irqsave(&table->lock, flags);
+ qp->common.invalid = invalid;
+ spin_unlock_irqrestore(&table->lock, flags);
+}
+
static void destroy_resource_common(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *qp)
{
@@ -609,8 +623,20 @@ err_destroy_rq:
int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *rq)
{
+ int ret;
+
+	/* The rq destruction can be called again if it fails, so mark the
+	 * common resource as invalid and only destroy the resources once
+	 * FW destruction has completed successfully.
+ */
+ modify_resource_common_state(dev, rq, true);
+ ret = destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ if (ret) {
+ modify_resource_common_state(dev, rq, false);
+ return ret;
+ }
destroy_resource_common(dev, rq);
- return destroy_rq_tracked(dev, rq->qpn, rq->uid);
+ return 0;
}
static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)
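
The qpc change makes RQ destruction retryable: the tracked resource is first marked invalid so mlx5_get_rsc() stops handing out references, the FW command is issued, and the flag is cleared again if the command fails. A generic userspace sketch of that pattern is below; tracked_res, fw_destroy() and the mutex are assumptions, whereas the driver uses the QP table spinlock and refcounts.

    /*
     * Generic sketch of the "mark invalid, try to destroy, revert on
     * failure" pattern used for the tracked RQ.
     */
    #include <pthread.h>
    #include <stdbool.h>

    struct tracked_res {
        pthread_mutex_t lock;
        bool invalid;
    };

    static int fw_destroy(struct tracked_res *res)
    {
        (void)res;
        return 0;                       /* stand-in for the FW command */
    }

    static void set_invalid(struct tracked_res *res, bool invalid)
    {
        pthread_mutex_lock(&res->lock);
        res->invalid = invalid;         /* lookups skip invalid entries */
        pthread_mutex_unlock(&res->lock);
    }

    static int destroy_tracked(struct tracked_res *res)
    {
        int err;

        set_invalid(res, true);
        err = fw_destroy(res);
        if (err) {
            set_invalid(res, false);    /* destruction may be retried later */
            return err;
        }
        /* only now tear down the local bookkeeping */
        return 0;
    }

    int main(void)
    {
        struct tracked_res res = { PTHREAD_MUTEX_INITIALIZER, false };

        return destroy_tracked(&res);
    }
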
diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 793f3c5c4d01..5be4426a2884 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
- mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+ /*
+ * npages is the maximum number of pages to map, but we
+ * can't guarantee that all pages are actually mapped.
+ *
+		 * For example, if a page is a P2P page of a type that is not
+		 * supported for mapping, the number of pages mapped will be
+		 * less than requested.
+ */
+ err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+ if (err)
+ return err;
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 192f83fd7c8a..dacb8ceeebe0 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -144,7 +144,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
buddy->max_order = max_order;
spin_lock_init(&buddy->lock);
- buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *),
+ buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits),
GFP_KERNEL);
buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
GFP_KERNEL);
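
The mthca change is the usual allocation-type-matching cleanup: sizing the kcalloc() element from the destination pointer instead of a hard-coded type. A minimal userspace sketch of the idiom follows; buddy_like and init_bits() are made-up names, with calloc() standing in for kcalloc().

    /*
     * The sizeof(*ptr) idiom: the element size is taken from the pointer
     * being assigned, so the allocation stays correct if the pointed-to
     * type ever changes.
     */
    #include <stdlib.h>

    struct buddy_like {
        unsigned long **bits;
        unsigned int max_order;
    };

    static int init_bits(struct buddy_like *b)
    {
        /* one slot per order; the element type comes from b->bits itself */
        b->bits = calloc(b->max_order + 1, sizeof(*b->bits));
        return b->bits ? 0 : -1;
    }

    int main(void)
    {
        struct buddy_like b = { .max_order = 4 };
        int err = init_bits(&b);

        free(b.bits);
        return err;
    }
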
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index f948b76f984d..3fbf99757b11 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -56,7 +56,7 @@ static int usnic_uiom_dma_fault(struct iommu_domain *domain,
unsigned long iova, int flags,
void *token)
{
- usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
+ usnic_err("Device %s iommu fault domain 0x%p va 0x%lx flags 0x%x\n",
dev_name(dev),
domain, iova, flags);
return -ENOSYS;
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
index c180e7ebcfc5..1ed5b63f8afc 100644
--- a/drivers/infiniband/sw/rxe/Kconfig
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config RDMA_RXE
tristate "Software RDMA over Ethernet (RoCE) driver"
- depends on INET && PCI && INFINIBAND
+ depends on INET && PCI && INFINIBAND && 64BIT
depends on INFINIBAND_VIRT_DMA
select NET_UDP_TUNNEL
select CRC32
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index b248c68bf9b1..3a77d6db1720 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -101,6 +101,8 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
}
}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 0bc3fbb6554f..876702058c84 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -70,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
void *addr, int length, enum rxe_mr_copy_dir dir);
int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
int sg_nents, unsigned int *sg_offset);
-int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val);
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val);
+enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
enum rxe_mr_lookup_type type);
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
@@ -193,13 +193,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
/* rxe_odp.c */
extern const struct mmu_interval_notifier_ops rxe_mn_ops;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING
int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
u64 iova, int access_flags, struct rxe_mr *mr);
int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
enum rxe_mr_copy_dir dir);
-int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val);
+enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val);
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length);
+enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int
rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -212,9 +215,19 @@ static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
{
return -EOPNOTSUPP;
}
-static inline int
+static inline enum resp_states
rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val)
+ u64 compare, u64 swap_add, u64 *orig_val)
+{
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+}
+static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length)
+{
+ return -EOPNOTSUPP;
+}
+static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
+ u64 iova, u64 value)
{
return RESPST_ERR_UNSUPPORTED_OPCODE;
}
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 432d864c3ce9..bcb97b3ea58a 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -424,7 +424,7 @@ err1:
return err;
}
-int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
+static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
unsigned int page_offset;
unsigned long index;
@@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
int err;
u8 *va;
- /* mr must be valid even if length is zero */
- if (WARN_ON(!mr))
- return -EINVAL;
-
- if (length == 0)
- return 0;
-
- if (mr->ibmr.type == IB_MR_TYPE_DMA)
- return -EFAULT;
-
err = mr_check_range(mr, iova, length);
if (err)
return err;
@@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
if (!page)
return -EFAULT;
bytes = min_t(unsigned int, length,
- mr_page_size(mr) - page_offset);
+ mr_page_size(mr) - page_offset);
va = kmap_local_page(page);
arch_wb_cache_pmem(va + page_offset, bytes);
@@ -468,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
return 0;
}
+int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length)
+{
+ int err;
+
+ /* mr must be valid even if length is zero */
+ if (WARN_ON(!mr))
+ return -EINVAL;
+
+ if (length == 0)
+ return 0;
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA)
+ return -EFAULT;
+
+ if (is_odp_mr(mr))
+ err = rxe_odp_flush_pmem_iova(mr, start, length);
+ else
+ err = rxe_mr_flush_pmem_iova(mr, start, length);
+
+ return err;
+}
+
/* Guarantee atomicity of atomic operations at the machine level. */
DEFINE_SPINLOCK(atomic_ops_lock);
-int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val)
+enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
{
unsigned int page_offset;
struct page *page;
@@ -524,27 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
kunmap_local(va);
- return 0;
+ return RESPST_NONE;
}
-#if defined CONFIG_64BIT
-/* only implemented or called for 64 bit architectures */
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
unsigned int page_offset;
struct page *page;
u64 *va;
- /* ODP is not supported right now. WIP. */
- if (is_odp_mr(mr))
- return RESPST_ERR_UNSUPPORTED_OPCODE;
-
- /* See IBA oA19-28 */
- if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
- rxe_dbg_mr(mr, "mr not in valid state\n");
- return RESPST_ERR_RKEY_VIOLATION;
- }
-
if (mr->ibmr.type == IB_MR_TYPE_DMA) {
page_offset = iova & (PAGE_SIZE - 1);
page = ib_virt_dma_to_page(iova);
@@ -572,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
}
va = kmap_local_page(page);
-
/* Do atomic write after all prior operations have completed */
smp_store_release(&va[page_offset >> 3], value);
-
kunmap_local(va);
- return 0;
-}
-#else
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
-{
- return RESPST_ERR_UNSUPPORTED_OPCODE;
+ return RESPST_NONE;
}
-#endif
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index 9f6e2bb2a269..dbc5a5600eb7 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -4,6 +4,7 @@
*/
#include <linux/hmm.h>
+#include <linux/libnvdimm.h>
#include <rdma/ib_umem_odp.h>
@@ -26,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
- /* update umem_odp->dma_list */
+ /* update umem_odp->map.pfn_list */
ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
mutex_unlock(&umem_odp->umem_mutex);
@@ -44,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
- u64 access_mask;
+ u64 access_mask = 0;
int np;
- access_mask = ODP_READ_ALLOWED_BIT;
if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
- access_mask |= ODP_WRITE_ALLOWED_BIT;
+ access_mask |= HMM_PFN_WRITE;
/*
* ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@@ -124,8 +124,8 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
return err;
}
-static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
- u64 iova, int length, u32 perm)
+static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova,
+ int length)
{
bool need_fault = false;
u64 addr;
@@ -137,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
while (addr < iova + length) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- if (!(umem_odp->dma_list[idx] & perm)) {
+ if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) {
need_fault = true;
break;
}
@@ -147,23 +147,28 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
return need_fault;
}
+static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova)
+{
+ return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+}
+
+static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova)
+{
+ return iova & (BIT(umem_odp->page_shift) - 1);
+}
+
static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool need_fault;
- u64 perm;
int err;
if (unlikely(length < 1))
return -EINVAL;
- perm = ODP_READ_ALLOWED_BIT;
- if (!(flags & RXE_PAGEFAULT_RDONLY))
- perm |= ODP_WRITE_ALLOWED_BIT;
-
mutex_lock(&umem_odp->umem_mutex);
- need_fault = rxe_check_pagefault(umem_odp, iova, length, perm);
+ need_fault = rxe_check_pagefault(umem_odp, iova, length);
if (need_fault) {
mutex_unlock(&umem_odp->umem_mutex);
@@ -173,7 +178,7 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u
if (err < 0)
return err;
- need_fault = rxe_check_pagefault(umem_odp, iova, length, perm);
+ need_fault = rxe_check_pagefault(umem_odp, iova, length);
if (need_fault)
return -EFAULT;
}
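
The two new helpers, rxe_odp_iova_to_index() and rxe_odp_iova_to_page_offset(), are plain shift/mask arithmetic on the MR page size, factored out of the copy and atomic paths. A standalone sketch with example numbers is below; the umem start address and page shift are illustrative values that the driver takes from the ib_umem_odp.

    /*
     * The two helpers reduce to shift/mask arithmetic on the MR page size.
     */
    #include <stdint.h>
    #include <stdio.h>

    static unsigned long iova_to_index(uint64_t iova, uint64_t umem_start,
                                       unsigned int page_shift)
    {
        return (iova - umem_start) >> page_shift;
    }

    static unsigned long iova_to_page_offset(uint64_t iova,
                                             unsigned int page_shift)
    {
        return iova & ((1UL << page_shift) - 1);
    }

    int main(void)
    {
        uint64_t start = 0x10000000, iova = 0x10003010;
        unsigned int shift = 12;        /* 4 KiB pages */

        printf("index=%lu offset=%lu\n",
               iova_to_index(iova, start, shift),
               iova_to_page_offset(iova, shift));
        return 0;
    }
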
@@ -190,13 +195,13 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
size_t offset;
u8 *user_va;
- idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- offset = iova & (BIT(umem_odp->page_shift) - 1);
+ idx = rxe_odp_iova_to_index(umem_odp, iova);
+ offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
while (length > 0) {
u8 *src, *dest;
- page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
user_va = kmap_local_page(page);
if (!user_va)
return -EFAULT;
@@ -255,8 +260,9 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
return err;
}
-static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val)
+static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
+ int opcode, u64 compare,
+ u64 swap_add, u64 *orig_val)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
unsigned int page_offset;
@@ -277,9 +283,9 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
return RESPST_ERR_RKEY_VIOLATION;
}
- idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- page_offset = iova & (BIT(umem_odp->page_shift) - 1);
- page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+ idx = rxe_odp_iova_to_index(umem_odp, iova);
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
if (!page)
return RESPST_ERR_RKEY_VIOLATION;
@@ -304,11 +310,11 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
kunmap_local(va);
- return 0;
+ return RESPST_NONE;
}
-int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val)
+enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
int err;
@@ -324,3 +330,91 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
return err;
}
+
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ unsigned int page_offset;
+ unsigned long index;
+ struct page *page;
+ unsigned int bytes;
+ int err;
+ u8 *va;
+
+ err = rxe_odp_map_range_and_lock(mr, iova, length,
+ RXE_PAGEFAULT_DEFAULT);
+ if (err)
+ return err;
+
+ while (length > 0) {
+ index = rxe_odp_iova_to_index(umem_odp, iova);
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
+ if (!page) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ return -EFAULT;
+ }
+
+ bytes = min_t(unsigned int, length,
+ mr_page_size(mr) - page_offset);
+
+ va = kmap_local_page(page);
+ arch_wb_cache_pmem(va + page_offset, bytes);
+ kunmap_local(va);
+
+ length -= bytes;
+ iova += bytes;
+ page_offset = 0;
+ }
+
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return 0;
+}
+
+enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ unsigned int page_offset;
+ unsigned long index;
+ struct page *page;
+ int err;
+ u64 *va;
+
+ /* See IBA oA19-28 */
+ err = mr_check_range(mr, iova, sizeof(value));
+ if (unlikely(err)) {
+ rxe_dbg_mr(mr, "iova out of range\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value),
+ RXE_PAGEFAULT_DEFAULT);
+ if (err)
+ return RESPST_ERR_RKEY_VIOLATION;
+
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+ index = rxe_odp_iova_to_index(umem_odp, iova);
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
+ if (!page) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+ /* See IBA A19.4.2 */
+ if (unlikely(page_offset & 0x7)) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ rxe_dbg_mr(mr, "misaligned address\n");
+ return RESPST_ERR_MISALIGNED_ATOMIC;
+ }
+
+ va = kmap_local_page(page);
+ /* Do atomic write after all prior operations have completed */
+ smp_store_release(&va[page_offset >> 3], value);
+ kunmap_local(va);
+
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return RESPST_NONE;
+}
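
rxe_odp_do_atomic_write() mirrors the non-ODP path: it faults the page in under umem_mutex, rejects targets that are not 8-byte aligned (IBA A19.4.2), and publishes the value with smp_store_release(). A hedged userspace analogue is below; atomic_write_8() is an assumption, C11's release-ordered store stands in for smp_store_release(), and the MR and page-mapping checks are omitted.

    /*
     * Userspace analogue of the alignment check and release-ordered store
     * in rxe_odp_do_atomic_write().
     */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <errno.h>

    static int atomic_write_8(void *page, unsigned int page_offset,
                              uint64_t value)
    {
        _Atomic uint64_t *dst;

        /* See IBA A19.4.2: the target must be 8-byte aligned */
        if (page_offset & 0x7)
            return -EINVAL;

        dst = (_Atomic uint64_t *)((char *)page + page_offset);
        /* publish only after all prior stores, like smp_store_release() */
        atomic_store_explicit(dst, value, memory_order_release);
        return 0;
    }

    int main(void)
    {
        _Alignas(8) char page[64] = { 0 };

        return atomic_write_8(page, 8, 0x1122334455667788ULL);
    }
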
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index 003f681e5dc0..767870568372 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -53,12 +53,9 @@ enum rxe_device_param {
| IB_DEVICE_MEM_WINDOW
| IB_DEVICE_FLUSH_GLOBAL
| IB_DEVICE_FLUSH_PERSISTENT
-#ifdef CONFIG_64BIT
| IB_DEVICE_MEM_WINDOW_TYPE_2B
| IB_DEVICE_ATOMIC_WRITE,
-#else
- | IB_DEVICE_MEM_WINDOW_TYPE_2B,
-#endif /* CONFIG_64BIT */
+
RXE_MAX_SGE = 32,
RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) +
sizeof(struct ib_sge) * RXE_MAX_SGE,
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 7975fb0e2782..f2af3e0aef35 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -811,7 +811,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
spin_unlock_irqrestore(&qp->state_lock, flags);
qp->qp_timeout_jiffies = 0;
- if (qp_type(qp) == IB_QPT_RC) {
+	/* timer_setup() initializes .function. If .function is NULL,
+	 * timer_setup() was never called and the timer is not initialized,
+	 * so it must not be deleted; otherwise the timer is initialized.
+	 */
+ if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function &&
+ qp->rnr_nak_timer.function) {
timer_delete_sync(&qp->retrans_timer);
timer_delete_sync(&qp->rnr_nak_timer);
}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 5d9174e408db..711f73e0bbb1 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp,
struct rxe_mr *mr = qp->resp.mr;
struct resp_res *res = qp->resp.res;
- /* ODP is not supported right now. WIP. */
- if (is_odp_mr(mr))
- return RESPST_ERR_UNSUPPORTED_OPCODE;
-
/* oA19-14, oA19-15 */
if (res && res->replay)
return RESPST_ACKNOWLEDGE;
@@ -753,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp,
value = *(u64 *)payload_addr(pkt);
iova = qp->resp.va + qp->resp.offset;
- err = rxe_mr_do_atomic_write(mr, iova, value);
+ /* See IBA oA19-28 */
+ if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
+ rxe_dbg_mr(mr, "mr not in valid state\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (is_odp_mr(mr))
+ err = rxe_odp_do_atomic_write(mr, iova, value);
+ else
+ err = rxe_mr_do_atomic_write(mr, iova, value);
if (err)
return err;
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
index 80332638d9e3..6f8f353e9583 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.c
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task)
/* do_task is a wrapper for the three tasks (requester,
* completer, responder) and calls them in a loop until
- * they return a non-zero value. It is called either
- * directly by rxe_run_task or indirectly if rxe_sched_task
- * schedules the task. They must call __reserve_if_idle to
- * move the task to busy before calling or scheduling.
- * The task can also be moved to drained or invalid
- * by calls to rxe_cleanup_task or rxe_disable_task.
- * In that case tasks which get here are not executed but
- * just flushed. The tasks are designed to look to see if
- * there is work to do and then do part of it before returning
- * here with a return value of zero until all the work
- * has been consumed then it returns a non-zero value.
+ * they return a non-zero value. It is called indirectly
+ * when rxe_sched_task schedules the task. They must
+ * call __reserve_if_idle to move the task to busy before
+ * calling or scheduling. The task can also be moved to
+ * drained or invalid by calls to rxe_cleanup_task or
+ * rxe_disable_task. In that case tasks which get here
+ * are not executed but just flushed. Each task is
+ * designed to check whether there is work to do, do part
+ * of it, and return here with a value of zero until all
+ * the work has been consumed, at which point it returns
+ * a non-zero value.
* The number of times the task can be run is limited by
* max iterations so one task cannot hold the cpu forever.
* If the limit is hit and work remains the task is rescheduled.
@@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task)
spin_unlock_irqrestore(&task->lock, flags);
}
-/* run the task inline if it is currently idle
- * cannot call do_task holding the lock
- */
-void rxe_run_task(struct rxe_task *task)
-{
- unsigned long flags;
- bool run;
-
- WARN_ON(rxe_read(task->qp) <= 0);
-
- spin_lock_irqsave(&task->lock, flags);
- run = __reserve_if_idle(task);
- spin_unlock_irqrestore(&task->lock, flags);
-
- if (run)
- do_task(task);
-}
-
/* schedule the task to run later as a work queue entry.
* the queue_work call can be called holding
* the lock.
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
index a63e258b3d66..a8c9a77b6027 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.h
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp,
/* cleanup task */
void rxe_cleanup_task(struct rxe_task *task);
-void rxe_run_task(struct rxe_task *task);
-
void rxe_sched_task(struct rxe_task *task);
/* keep a task from scheduling */
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index d9e5a2e4c471..f5fd71717b80 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -718,7 +718,7 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
"MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__)
#define siw_dbg_cep(cep, fmt, ...) \
- ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \
+ ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \
cep, __func__, ##__VA_ARGS__)
void siw_cq_flush(struct siw_cq *cq);
diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
index f3c2226aff94..25b3c741b66b 100644
--- a/drivers/infiniband/sw/siw/siw_cq.c
+++ b/drivers/infiniband/sw/siw/siw_cq.c
@@ -72,7 +72,7 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
wc->opcode = map_wc_opcode[cqe->opcode];
wc->status = map_cqe_status[cqe->status].ib;
siw_dbg_cq(cq,
- "idx %u, type %d, flags %2x, id 0x%pK\n",
+ "idx %u, type %d, flags %2x, id 0x%p\n",
cq->cq_get % cq->num_cqe, cqe->opcode,
cqe->flags, (void *)(uintptr_t)cqe->id);
} else {
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index dcb963607c8b..d5ddeb17bd22 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -18,30 +18,6 @@
#define SIW_STAG_MAX_INDEX 0x00ffffff
/*
- * The code avoids special Stag of zero and tries to randomize
- * STag values between 1 and SIW_STAG_MAX_INDEX.
- */
-int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
-{
- struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX);
- u32 id, next;
-
- get_random_bytes(&next, 4);
- next &= SIW_STAG_MAX_INDEX;
-
- if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
- GFP_KERNEL) < 0)
- return -ENOMEM;
-
- /* Set the STag index part */
- m->stag = id << 8;
-
- siw_dbg_mem(m, "new MEM object\n");
-
- return 0;
-}
-
-/*
* siw_mem_id2obj()
*
* resolves memory from stag given by id. might be called from:
@@ -181,10 +157,10 @@ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
*/
if (addr < mem->va || addr + len > mem->va + mem->len) {
siw_dbg_pd(pd, "MEM interval len %d\n", len);
- siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
+ siw_dbg_pd(pd, "[0x%p, 0x%p] out of bounds\n",
(void *)(uintptr_t)addr,
(void *)(uintptr_t)(addr + len));
- siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
+ siw_dbg_pd(pd, "[0x%p, 0x%p] STag=0x%08x\n",
(void *)(uintptr_t)mem->va,
(void *)(uintptr_t)(mem->va + mem->len),
mem->stag);
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
index e74cfcd6dbc1..8e769d30e2ac 100644
--- a/drivers/infiniband/sw/siw/siw_mem.h
+++ b/drivers/infiniband/sw/siw/siw_mem.h
@@ -12,7 +12,6 @@ void siw_umem_release(struct siw_umem *umem);
struct siw_pbl *siw_pbl_alloc(u32 num_buf);
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
-int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
enum ib_access_flags perms, int len);
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
index 32554eba1eac..a10820e33887 100644
--- a/drivers/infiniband/sw/siw/siw_qp_rx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -38,7 +38,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
p = siw_get_upage(umem, dest_addr);
if (unlikely(!p)) {
- pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
+ pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
__func__, qp_id(rx_qp(srx)),
(void *)(uintptr_t)dest_addr,
(void *)(uintptr_t)umem->fp_addr);
@@ -51,7 +51,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
pg_off = dest_addr & ~PAGE_MASK;
bytes = min(len, (int)PAGE_SIZE - pg_off);
- siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
+ siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
dest = kmap_atomic(p);
rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
@@ -105,11 +105,11 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
int rv;
- siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
+ siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
if (unlikely(rv)) {
- pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
+ pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
qp_id(rx_qp(srx)), __func__, len, kva, rv);
return rv;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index fd7b266a221b..2b2a7b8e93b0 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -936,7 +936,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
rv = -EINVAL;
break;
}
- siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
+ siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
sqe->opcode, sqe->flags,
(void *)(uintptr_t)sqe->id);
@@ -1102,7 +1102,7 @@ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
siw_dbg_qp(qp, "error %d\n", rv);
*bad_wr = wr;
}
- return rv > 0 ? 0 : rv;
+ return rv;
}
int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
@@ -1332,7 +1332,7 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
struct siw_device *sdev = to_siw_dev(pd->device);
int rv;
- siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
+ siw_dbg_pd(pd, "start: 0x%p, va: 0x%p, len: %llu\n",
(void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
(unsigned long long)len);
@@ -1525,7 +1525,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
mem->len = base_mr->length;
mem->va = base_mr->iova;
siw_dbg_mem(mem,
- "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
+ "%llu bytes, start 0x%p, %u SLE to %u entries\n",
mem->len, (void *)(uintptr_t)mem->va, num_sle,
pbl->num_buf);
}
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 4ffaf7588885..3504507477c6 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -391,6 +391,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
case GDMA_EQE_HWC_INIT_EQ_ID_DB:
case GDMA_EQE_HWC_INIT_DATA:
case GDMA_EQE_HWC_INIT_DONE:
+ case GDMA_EQE_HWC_SOC_SERVICE:
case GDMA_EQE_RNIC_QP_FATAL:
if (!eq->eq.callback)
break;
@@ -964,6 +965,7 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
err, resp.hdr.status);
return err ? err : -EPROTO;
}
+ gc->pf_cap_flags1 = resp.pf_cap_flags1;
if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
if (err) {
@@ -1004,7 +1006,6 @@ int mana_gd_register_device(struct gdma_dev *gd)
return 0;
}
-EXPORT_SYMBOL_NS(mana_gd_register_device, "NET_MANA");
int mana_gd_deregister_device(struct gdma_dev *gd)
{
@@ -1035,7 +1036,6 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
return err;
}
-EXPORT_SYMBOL_NS(mana_gd_deregister_device, "NET_MANA");
u32 mana_gd_wq_avail_space(struct gdma_queue *wq)
{
@@ -1469,10 +1469,14 @@ static int mana_gd_setup(struct pci_dev *pdev)
mana_gd_init_registers(pdev);
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
+ gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
+ if (!gc->service_wq)
+ return -ENOMEM;
+
err = mana_gd_setup_irqs(pdev);
if (err) {
dev_err(gc->dev, "Failed to setup IRQs: %d\n", err);
- return err;
+ goto free_workqueue;
}
err = mana_hwc_create_channel(gc);
@@ -1498,6 +1502,8 @@ destroy_hwc:
mana_hwc_destroy_channel(gc);
remove_irq:
mana_gd_remove_irqs(pdev);
+free_workqueue:
+ destroy_workqueue(gc->service_wq);
dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
return err;
}
@@ -1509,6 +1515,8 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
mana_hwc_destroy_channel(gc);
mana_gd_remove_irqs(pdev);
+
+ destroy_workqueue(gc->service_wq);
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
@@ -1578,8 +1586,14 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto cleanup_gd;
+ err = mana_rdma_probe(&gc->mana_ib);
+ if (err)
+ goto cleanup_mana;
+
return 0;
+cleanup_mana:
+ mana_remove(&gc->mana, false);
cleanup_gd:
mana_gd_cleanup(pdev);
unmap_bar:
@@ -1607,6 +1621,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
+ mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, false);
mana_gd_cleanup(pdev);
@@ -1630,6 +1645,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
+ mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
mana_gd_cleanup(pdev);
@@ -1654,6 +1670,10 @@ static int mana_gd_resume(struct pci_dev *pdev)
if (err)
return err;
+ err = mana_rdma_probe(&gc->mana_ib);
+ if (err)
+ return err;
+
return 0;
}
@@ -1664,6 +1684,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
dev_info(&pdev->dev, "Shutdown was called\n");
+ mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
mana_gd_cleanup(pdev);
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 1ba49602089b..a8c4d8db75a5 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -112,11 +112,13 @@ out:
static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
struct gdma_event *event)
{
+ union hwc_init_soc_service_type service_data;
struct hw_channel_context *hwc = ctx;
struct gdma_dev *gd = hwc->gdma_dev;
union hwc_init_type_data type_data;
union hwc_init_eq_id_db eq_db;
u32 type, val;
+ int ret;
switch (event->type) {
case GDMA_EQE_HWC_INIT_EQ_ID_DB:
@@ -199,7 +201,24 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
}
break;
+ case GDMA_EQE_HWC_SOC_SERVICE:
+ service_data.as_uint32 = event->details[0];
+ type = service_data.type;
+ switch (type) {
+ case GDMA_SERVICE_TYPE_RDMA_SUSPEND:
+ case GDMA_SERVICE_TYPE_RDMA_RESUME:
+ ret = mana_rdma_service_event(gd->gdma_context, type);
+ if (ret)
+ dev_err(hwc->dev, "Failed to schedule adev service event: %d\n",
+ ret);
+ break;
+ default:
+ dev_warn(hwc->dev, "Received unknown SOC service type %u\n", type);
+ break;
+ }
+
+ break;
default:
dev_warn(hwc->dev, "Received unknown gdma event %u\n", event->type);
/* Ignore unknown events, which should never happen. */
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9c58d9e0bbb5..ccd2885c939e 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2950,7 +2950,7 @@ static void remove_adev(struct gdma_dev *gd)
gd->adev = NULL;
}
-static int add_adev(struct gdma_dev *gd)
+static int add_adev(struct gdma_dev *gd, const char *name)
{
struct auxiliary_device *adev;
struct mana_adev *madev;
@@ -2966,7 +2966,7 @@ static int add_adev(struct gdma_dev *gd)
goto idx_fail;
adev->id = ret;
- adev->name = "rdma";
+ adev->name = name;
adev->dev.parent = gd->gdma_context->dev;
adev->dev.release = adev_release;
madev->mdev = gd;
@@ -2998,6 +2998,70 @@ idx_fail:
return ret;
}
+static void mana_rdma_service_handle(struct work_struct *work)
+{
+ struct mana_service_work *serv_work =
+ container_of(work, struct mana_service_work, work);
+ struct gdma_dev *gd = serv_work->gdma_dev;
+ struct device *dev = gd->gdma_context->dev;
+ int ret;
+
+ if (READ_ONCE(gd->rdma_teardown))
+ goto out;
+
+ switch (serv_work->event) {
+ case GDMA_SERVICE_TYPE_RDMA_SUSPEND:
+ if (!gd->adev || gd->is_suspended)
+ break;
+
+ remove_adev(gd);
+ gd->is_suspended = true;
+ break;
+
+ case GDMA_SERVICE_TYPE_RDMA_RESUME:
+ if (!gd->is_suspended)
+ break;
+
+ ret = add_adev(gd, "rdma");
+ if (ret)
+ dev_err(dev, "Failed to add adev on resume: %d\n", ret);
+ else
+ gd->is_suspended = false;
+ break;
+
+ default:
+ dev_warn(dev, "unknown adev service event %u\n",
+ serv_work->event);
+ break;
+ }
+
+out:
+ kfree(serv_work);
+}
+
+int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event)
+{
+ struct gdma_dev *gd = &gc->mana_ib;
+ struct mana_service_work *serv_work;
+
+ if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
+ /* RDMA device is not detected on pci */
+ return 0;
+ }
+
+ serv_work = kzalloc(sizeof(*serv_work), GFP_ATOMIC);
+ if (!serv_work)
+ return -ENOMEM;
+
+ serv_work->event = event;
+ serv_work->gdma_dev = gd;
+
+ INIT_WORK(&serv_work->work, mana_rdma_service_handle);
+ queue_work(gc->service_wq, &serv_work->work);
+
+ return 0;
+}
+
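[Editor's note] mana_rdma_service_event() only queues work; the real suspend/resume runs later in mana_rdma_service_handle() on gc->service_wq, which mana_gd_setup() allocates as an ordered workqueue. A hedged illustration of why the ordered queue matters; example_suspend_then_resume is illustrative and not part of the patch.

/* Illustrative only: because gc->service_wq is an ordered workqueue,
 * a RESUME queued after a SUSPEND cannot overtake it, so the auxiliary
 * rdma device is re-added only after it has been removed.
 */
static void example_suspend_then_resume(struct gdma_context *gc)
{
	/* both calls just queue work and return */
	mana_rdma_service_event(gc, GDMA_SERVICE_TYPE_RDMA_SUSPEND);
	mana_rdma_service_event(gc, GDMA_SERVICE_TYPE_RDMA_RESUME);
}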
int mana_probe(struct gdma_dev *gd, bool resuming)
{
struct gdma_context *gc = gd->gdma_context;
@@ -3085,7 +3149,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
}
}
- err = add_adev(gd);
+ err = add_adev(gd, "eth");
out:
if (err) {
mana_remove(gd, false);
@@ -3159,6 +3223,44 @@ out:
dev_dbg(dev, "%s succeeded\n", __func__);
}
+int mana_rdma_probe(struct gdma_dev *gd)
+{
+ int err = 0;
+
+ if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
+ /* RDMA device is not detected on pci */
+ return err;
+ }
+
+ err = mana_gd_register_device(gd);
+ if (err)
+ return err;
+
+ err = add_adev(gd, "rdma");
+ if (err)
+ mana_gd_deregister_device(gd);
+
+ return err;
+}
+
+void mana_rdma_remove(struct gdma_dev *gd)
+{
+ struct gdma_context *gc = gd->gdma_context;
+
+ if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
+ /* RDMA device is not detected on pci */
+ return;
+ }
+
+ WRITE_ONCE(gd->rdma_teardown, true);
+ flush_workqueue(gc->service_wq);
+
+ if (gd->adev)
+ remove_adev(gd);
+
+ mana_gd_deregister_device(gd);
+}
+
struct net_device *mana_get_primary_netdev(struct mana_context *ac,
u32 port_index,
netdevice_tracker *tracker)
diff --git a/include/linux/hmm-dma.h b/include/linux/hmm-dma.h
new file mode 100644
index 000000000000..f58b9fc71999
--- /dev/null
+++ b/include/linux/hmm-dma.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */
+#ifndef LINUX_HMM_DMA_H
+#define LINUX_HMM_DMA_H
+
+#include <linux/dma-mapping.h>
+
+struct dma_iova_state;
+struct pci_p2pdma_map_state;
+
+/*
+ * struct hmm_dma_map - array of PFNs and DMA addresses
+ *
+ * @state: DMA IOVA state
+ * @pfn_list: array of PFNs
+ * @dma_list: array of DMA addresses
+ * @dma_entry_size: size of each DMA entry in the array
+ */
+struct hmm_dma_map {
+ struct dma_iova_state state;
+ unsigned long *pfn_list;
+ dma_addr_t *dma_list;
+ size_t dma_entry_size;
+};
+
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size);
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map);
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state);
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx);
+#endif /* LINUX_HMM_DMA_H */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 126a36571667..db75ffc949a7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -23,6 +23,10 @@ struct mmu_interval_notifier;
* HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
* HMM_PFN_ERROR - accessing the pfn is impossible and the device should
* fail. ie poisoned memory, special pages, no vma, etc
+ * HMM_PFN_P2PDMA - P2P page
+ * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer
+ * HMM_PFN_DMA_MAPPED - Flag preserved on the input-to-output transformation
+ * to mark that the page is already DMA mapped
*
* On input:
* 0 - Return the current state of the page, do not fault it.
@@ -36,13 +40,21 @@ enum hmm_pfn_flags {
HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
- HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8),
+ /*
+ * Sticky flags, carried from input to output,
+ * don't forget to update HMM_PFN_INOUT_FLAGS
+ */
+ HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4),
+ HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
+ HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
+
+ HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
/* Input flags */
HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
- HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT,
+ HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
};
/*
@@ -58,6 +70,14 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
}
/*
+ * hmm_pfn_to_phys() - return physical address pointed to by a device entry
+ */
+static inline phys_addr_t hmm_pfn_to_phys(unsigned long hmm_pfn)
+{
+ return __pfn_to_phys(hmm_pfn & ~HMM_PFN_FLAGS);
+}
+
+/*
* hmm_pfn_to_map_order() - return the CPU mapping size order
*
* This is optionally useful to optimize processing of the pfn result
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d1dfbad9a447..e6ba8f4f4bd1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -398,6 +398,7 @@ struct mlx5_core_rsc_common {
enum mlx5_res_type res;
refcount_t refcount;
struct completion free;
+ bool invalid;
};
struct mlx5_uars_page {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 228603bf03f2..3ce56a816425 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -60,6 +60,7 @@ enum gdma_eqe_type {
GDMA_EQE_HWC_INIT_DONE = 131,
GDMA_EQE_HWC_SOC_RECONFIG = 132,
GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
+ GDMA_EQE_HWC_SOC_SERVICE = 134,
GDMA_EQE_RNIC_QP_FATAL = 176,
};
@@ -70,6 +71,18 @@ enum {
GDMA_DEVICE_MANA_IB = 3,
};
+enum gdma_service_type {
+ GDMA_SERVICE_TYPE_NONE = 0,
+ GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1,
+ GDMA_SERVICE_TYPE_RDMA_RESUME = 2,
+};
+
+struct mana_service_work {
+ struct work_struct work;
+ struct gdma_dev *gdma_dev;
+ enum gdma_service_type event;
+};
+
struct gdma_resource {
/* Protect the bitmap */
spinlock_t lock;
@@ -224,6 +237,8 @@ struct gdma_dev {
void *driver_data;
struct auxiliary_device *adev;
+ bool is_suspended;
+ bool rdma_teardown;
};
/* MANA_PAGE_SIZE is the DMA unit */
@@ -407,6 +422,10 @@ struct gdma_context {
/* Azure RDMA adapter */
struct gdma_dev mana_ib;
+
+ u64 pf_cap_flags1;
+
+ struct workqueue_struct *service_wq;
};
static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -553,6 +572,7 @@ enum {
*/
#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2)
#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
+#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
/* Driver can handle holes (zeros) in the device list */
@@ -707,20 +727,6 @@ struct gdma_query_hwc_timeout_resp {
u32 reserved;
};
-enum atb_page_size {
- ATB_PAGE_SIZE_4K,
- ATB_PAGE_SIZE_8K,
- ATB_PAGE_SIZE_16K,
- ATB_PAGE_SIZE_32K,
- ATB_PAGE_SIZE_64K,
- ATB_PAGE_SIZE_128K,
- ATB_PAGE_SIZE_256K,
- ATB_PAGE_SIZE_512K,
- ATB_PAGE_SIZE_1M,
- ATB_PAGE_SIZE_2M,
- ATB_PAGE_SIZE_MAX,
-};
-
enum gdma_mr_access_flags {
GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0),
GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1),
@@ -815,6 +821,8 @@ enum gdma_mr_type {
* address that is set up in the MST
*/
GDMA_MR_TYPE_GVA = 2,
+ /* Guest zero-based address MRs */
+ GDMA_MR_TYPE_ZBVA = 4,
};
struct gdma_create_mr_params {
@@ -826,6 +834,10 @@ struct gdma_create_mr_params {
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
+ struct {
+ u64 dma_region_handle;
+ enum gdma_mr_access_flags access_flags;
+ } zbva;
};
};
@@ -841,7 +853,10 @@ struct gdma_create_mr_request {
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
-
+ struct {
+ u64 dma_region_handle;
+ enum gdma_mr_access_flags access_flags;
+ } zbva;
};
u32 reserved_2;
};/* HW DATA */
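[Editor's note] A hedged sketch of how a caller might fill the new zero-based (ZBVA) leg of the union; example_fill_zbva is illustrative, and selecting GDMA_MR_TYPE_ZBVA elsewhere in the request is assumed rather than shown in this hunk.

static void example_fill_zbva(struct gdma_create_mr_params *params,
			      u64 dma_region_handle)
{
	/* zero-based MRs carry no virtual address, only the DMA region */
	params->zbva.dma_region_handle = dma_region_handle;
	params->zbva.access_flags = GDMA_ACCESS_FLAG_LOCAL_READ |
				    GDMA_ACCESS_FLAG_LOCAL_WRITE;
}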
@@ -893,4 +908,6 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle);
void mana_register_debugfs(void);
void mana_unregister_debugfs(void);
+int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
+
#endif /* _GDMA_H */
diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h
index 158b125692c2..83cf93338eb3 100644
--- a/include/net/mana/hw_channel.h
+++ b/include/net/mana/hw_channel.h
@@ -49,6 +49,15 @@ union hwc_init_type_data {
};
}; /* HW DATA */
+union hwc_init_soc_service_type {
+ u32 as_uint32;
+
+ struct {
+ u32 value : 28;
+ u32 type : 4;
+ };
+}; /* HW DATA */
+
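[Editor's note] The hw channel event handler earlier in this patch decodes event->details[0] through this union. A minimal illustration; example_decode_service is not part of the patch.

static enum gdma_service_type example_decode_service(u32 eqe_detail)
{
	union hwc_init_soc_service_type data;

	data.as_uint32 = eqe_detail;
	return (enum gdma_service_type)data.type;
}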
struct hwc_rx_oob {
u32 type : 6;
u32 eom : 1;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 38238c1d00bf..9abb66461211 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -489,6 +489,9 @@ int mana_detach(struct net_device *ndev, bool from_close);
int mana_probe(struct gdma_dev *gd, bool resuming);
void mana_remove(struct gdma_dev *gd, bool suspending);
+int mana_rdma_probe(struct gdma_dev *gd);
+void mana_rdma_remove(struct gdma_dev *gd);
+
void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev);
int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames,
u32 flags);
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index a2ac62b4a6cf..1fa3786f82f4 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -480,23 +480,12 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
const void *private_data,
u8 private_data_len);
-#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */
-
/**
- * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
- * message.
+ * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a
+ * connection message in case duplicates are received.
* @cm_id: Connection identifier associated with the connection message.
- * @service_timeout: The lower 5-bits specify the maximum time required for
- * the sender to reply to the connection message. The upper 3-bits
- * specify additional control flags.
- * @private_data: Optional user-defined private data sent with the
- * message receipt acknowledgement.
- * @private_data_len: Size of the private data buffer, in bytes.
*/
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len);
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id);
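[Editor's note] With the service_timeout and private-data arguments removed, a call site shrinks to a single statement. A hedged sketch of a ULP caller; example_handle_req is illustrative only.

static int example_handle_req(struct ib_cm_id *cm_id)
{
	/* connection processing will take a while; arm an MRA so that
	 * duplicate connection messages are acknowledged
	 */
	return ib_prepare_cm_mra(cm_id);
}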
/**
* ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 0844c1d05ac6..2a24bf791c10 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -8,23 +8,17 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
+#include <linux/hmm-dma.h>
struct ib_umem_odp {
struct ib_umem umem;
struct mmu_interval_notifier notifier;
struct pid *tgid;
- /* An array of the pfns included in the on-demand paging umem. */
- unsigned long *pfn_list;
+ struct hmm_dma_map map;
/*
- * An array with DMA addresses mapped for pfns in pfn_list.
- * The lower two bits designate access permissions.
- * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
- */
- dma_addr_t *dma_list;
- /*
- * The umem_mutex protects the page_list and dma_list fields of an ODP
+ * The umem_mutex protects the page_list field of an ODP
* umem, allowing only a single thread to map/unmap pages. The mutex
* also protects access to the mmu notifier counters.
*/
@@ -67,19 +61,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
umem_odp->page_shift;
}
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_umem_odp *
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 901353796fbb..af43a8d2a74a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -314,17 +314,19 @@ enum ib_atomic_cap {
};
enum ib_odp_general_cap_bits {
- IB_ODP_SUPPORT = 1 << 0,
- IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
+ IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT,
+ IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT,
};
enum ib_odp_transport_cap_bits {
- IB_ODP_SUPPORT_SEND = 1 << 0,
- IB_ODP_SUPPORT_RECV = 1 << 1,
- IB_ODP_SUPPORT_WRITE = 1 << 2,
- IB_ODP_SUPPORT_READ = 1 << 3,
- IB_ODP_SUPPORT_ATOMIC = 1 << 4,
- IB_ODP_SUPPORT_SRQ_RECV = 1 << 5,
+ IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND,
+ IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV,
+ IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE,
+ IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ,
+ IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC,
+ IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV,
+ IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH,
+ IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE,
};
struct ib_odp_caps {
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 8a8ab2f793ab..d1593ad47e28 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -388,6 +388,5 @@ void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
union ib_gid *dgid);
struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id);
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res);
#endif /* RDMA_CM_H */
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index e16650f0c85d..3b7bd99813e9 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -233,6 +233,22 @@ struct ib_uverbs_ex_query_device {
__u32 reserved;
};
+enum ib_uverbs_odp_general_cap_bits {
+ IB_UVERBS_ODP_SUPPORT = 1 << 0,
+ IB_UVERBS_ODP_SUPPORT_IMPLICIT = 1 << 1,
+};
+
+enum ib_uverbs_odp_transport_cap_bits {
+ IB_UVERBS_ODP_SUPPORT_SEND = 1 << 0,
+ IB_UVERBS_ODP_SUPPORT_RECV = 1 << 1,
+ IB_UVERBS_ODP_SUPPORT_WRITE = 1 << 2,
+ IB_UVERBS_ODP_SUPPORT_READ = 1 << 3,
+ IB_UVERBS_ODP_SUPPORT_ATOMIC = 1 << 4,
+ IB_UVERBS_ODP_SUPPORT_SRQ_RECV = 1 << 5,
+ IB_UVERBS_ODP_SUPPORT_FLUSH = 1 << 6,
+ IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7,
+};
+
struct ib_uverbs_odp_caps {
__aligned_u64 general_caps;
struct {
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..feac86196a65 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -EOPNOTSUPP if the device is unsupported, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * Maps the HMM page recorded at index @idx in @map->pfn_list and
+ * returns its DMA address. The IOVA space, when used, was allocated
+ * up front by hmm_dma_map_alloc(); the offset into it is derived from
+ * @idx and @map->dma_entry_size, so the first entry maps at offset 0
+ * and each subsequent index advances the offset by one DMA entry
+ * size. Callers therefore only pass the index and do not track
+ * offsets themselves. Returns the DMA address for the entry or
+ * DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
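[Editor's note] A hedged sketch of a typical caller; example_map_range is illustrative and not taken from this patch. After hmm_range_fault() has populated map->pfn_list, each valid entry is linked by index.

static int example_map_range(struct device *dev, struct hmm_dma_map *map,
			     size_t nr_entries)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	size_t i;

	for (i = 0; i < nr_entries; i++) {
		dma_addr_t addr;

		if (!(map->pfn_list[i] & HMM_PFN_VALID))
			continue;

		addr = hmm_dma_map_pfn(dev, map, i, &p2pdma_state);
		if (addr == DMA_MAPPING_ERROR)
			return -EFAULT;
		/* program addr into the device page table / MTT here */
	}
	return 0;
}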
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
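[Editor's note] The matching teardown, sketched under the same assumptions as example_map_range above, simply walks the index space and then frees the map; example_unmap_range is illustrative only.

static void example_unmap_range(struct device *dev, struct hmm_dma_map *map,
				size_t nr_entries)
{
	size_t i;

	for (i = 0; i < nr_entries; i++)
		hmm_dma_unmap_pfn(dev, map, i);

	hmm_dma_map_free(dev, map);
}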