Commit 838b6fd2 authored by Kaike Wan's avatar Kaike Wan Committed by Doug Ledford

IB/hfi1: TID RDMA RcvArray programming and TID allocation

TID entries are used by hfi1 hardware to receive data payload from
incoming packets directly into a user buffer and thus avoid data copying
by software. This patch implements the functions for TID allocation,
freeing, and programming TID RcvArray entries in hardware for kernel
clients. TID entries are managed via lists of TID groups similar to PSM.
Furthermore, to track TID resource allocation for each request, software
flows are also allocated and freed as needed. Since software flows
consume large amount of memory for tracking TID allocation and freeing,
it is generally desirable to allocate them dynamically in the send queue
and only for TID RDMA requests, but pre-allocate them for receive queue
because the send queue could have thousands of entries while the receive
queue has only a limited number of entries.
Signed-off-by: default avatarMitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: default avatarAshutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: default avatarMike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: default avatarKaike Wan <kaike.wan@intel.com>
Signed-off-by: default avatarDennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parent 37356e78
......@@ -303,6 +303,8 @@ struct hfi1_ctxtdata {
spinlock_t exp_lock;
/* Queue for QP's waiting for HW TID flows */
struct tid_queue flow_queue;
/* Queue for QP's waiting for HW receive array entries */
struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
......
......@@ -372,6 +372,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
mutex_init(&rcd->exp_mutex);
spin_lock_init(&rcd->exp_lock);
INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
......@@ -1596,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
hfi1_clear_tids(rcd);
hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
......
This diff is collapsed.
......@@ -6,7 +6,16 @@
#ifndef HFI1_TID_RDMA_H
#define HFI1_TID_RDMA_H
#include <linux/circ_buf.h>
#include "common.h"
/* Add a convenience helper */
#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT)
struct tid_rdma_params {
struct rcu_head rcu_head;
......@@ -36,6 +45,81 @@ struct tid_flow_state {
u8 flags;
};
struct tid_rdma_request {
struct rvt_qp *qp;
struct hfi1_ctxtdata *rcd;
union {
struct rvt_swqe *swqe;
struct rvt_ack_entry *ack;
} e;
struct tid_rdma_flow *flows; /* array of tid flows */
u16 n_flows; /* size of the flow buffer window */
u16 setup_head; /* flow index we are setting up */
u16 clear_tail; /* flow index we are clearing */
u16 flow_idx; /* flow index most recently set up */
u32 seg_len;
u32 isge; /* index of "current" sge */
};
/*
* When header suppression is used, PSNs associated with a "flow" are
* relevant (and not the PSNs maintained by verbs). Track per-flow
* PSNs here for a TID RDMA segment.
*
*/
struct flow_state {
u32 flags;
u32 resp_ib_psn; /* The IB PSN of the response for this flow */
u32 generation; /* generation of flow */
u32 spsn; /* starting PSN in TID space */
u32 lpsn; /* last PSN in TID space */
u32 r_next_psn; /* next PSN to be received (in TID space) */
};
struct tid_rdma_pageset {
dma_addr_t addr : 48; /* Only needed for the first page */
u8 idx: 8;
u8 count : 7;
u8 mapped: 1;
};
/**
* kern_tid_node - used for managing TID's in TID groups
*
* @grp_idx: rcd relative index to tid_group
* @map: grp->map captured prior to programming this TID group in HW
* @cnt: Only @cnt of available group entries are actually programmed
*/
struct kern_tid_node {
struct tid_group *grp;
u8 map;
u8 cnt;
};
/* Overall info for a TID RDMA segment */
struct tid_rdma_flow {
/*
* While a TID RDMA segment is being transferred, it uses a QP number
* from the "KDETH section of QP numbers" (which is different from the
* QP number that originated the request). Bits 11-15 of these QP
* numbers identify the "TID flow" for the segment.
*/
struct flow_state flow_state;
struct tid_rdma_request *req;
u32 length;
u8 tnode_cnt;
u8 tidcnt;
u8 idx;
u8 npagesets;
u8 npkts;
struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
u32 tid_entry[TID_RDMA_MAX_PAGES];
};
bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
......@@ -43,6 +127,23 @@ void tid_rdma_conn_error(struct rvt_qp *qp);
void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
struct rvt_sge_state *ss, bool *last);
int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
/**
* trdma_clean_swqe - clean flows for swqe if large send queue
* @qp: the qp
* @wqe: the send wqe
*/
static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
{
if (!wqe->priv)
return;
__trdma_clean_swqe(qp, wqe);
}
int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr);
......
......@@ -48,7 +48,6 @@
*/
#include "hfi.h"
#include "exp_rcv.h"
struct tid_pageset {
......
......@@ -504,11 +504,28 @@ static void verbs_sdma_complete(
hfi1_put_txreq(tx);
}
void hfi1_wait_kmem(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct ib_qp *ibqp = &qp->ibqp;
struct ib_device *ibdev = ibqp->device;
struct hfi1_ibdev *dev = to_idev(ibdev);
if (list_empty(&priv->s_iowait.list)) {
if (list_empty(&dev->memwait))
mod_timer(&dev->mem_timer, jiffies + 1);
qp->s_flags |= RVT_S_WAIT_KMEM;
list_add_tail(&priv->s_iowait.list, &dev->memwait);
priv->s_iowait.lock = &dev->iowait_lock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
rvt_get_qp(qp);
}
}
static int wait_kmem(struct hfi1_ibdev *dev,
struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
int ret = 0;
......@@ -517,15 +534,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
&ps->wait->tx_head);
if (list_empty(&priv->s_iowait.list)) {
if (list_empty(&dev->memwait))
mod_timer(&dev->mem_timer, jiffies + 1);
qp->s_flags |= RVT_S_WAIT_KMEM;
list_add_tail(&priv->s_iowait.list, &dev->memwait);
priv->s_iowait.lock = &dev->iowait_lock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
rvt_get_qp(qp);
}
hfi1_wait_kmem(qp);
write_sequnlock(&dev->iowait_lock);
hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
......
......@@ -159,6 +159,7 @@ struct hfi1_qp_priv {
struct sdma_engine *s_sde; /* current sde */
struct send_context *s_sendcontext; /* current sendcontext */
struct hfi1_ctxtdata *rcd; /* QP's receive context */
struct page **pages; /* for TID page scan */
u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
......@@ -173,6 +174,14 @@ struct hfi1_qp_priv {
u8 timeout_shift; /* account for number of packets per segment */
};
struct hfi1_swqe_priv {
struct tid_rdma_request tid_req;
};
struct hfi1_ack_priv {
struct tid_rdma_request tid_req;
};
/*
* This structure is used to hold commonly lookedup and computed values during
* the send engine progress.
......@@ -321,6 +330,21 @@ static inline u32 delta_psn(u32 a, u32 b)
return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}
/*
* Look through all the active flows for a TID RDMA request and find
* the one (if it exists) that contains the specified PSN.
*/
static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
{
return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
(psn & HFI1_KDETH_BTH_SEQ_MASK));
}
static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
{
return __full_flow_psn(&flow->flow_state, psn);
}
struct verbs_txreq;
void hfi1_put_txreq(struct verbs_txreq *tx);
......@@ -403,6 +427,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
}
void hfi1_wait_kmem(struct rvt_qp *qp);
static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
struct rvt_swqe *wqe,
enum ib_wc_status status)
{
trdma_clean_swqe(qp, wqe);
rvt_send_complete(qp, wqe, status);
}
extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
extern const u8 hdr_len_by_opcode[];
......
......@@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
kref_put(&qp->ip->ref, rvt_release_mmap_info);
else
vfree(qp->r_rq.wq);
vfree(qp->s_wq);
rdi->driver_f.qp_priv_free(rdi, qp);
kfree(qp->s_ack_queue);
rdma_destroy_ah_attr(&qp->remote_ah_attr);
rdma_destroy_ah_attr(&qp->alt_ah_attr);
vfree(qp->s_wq);
kfree(qp);
return 0;
}
......
......@@ -174,6 +174,7 @@ struct rvt_swqe {
u32 lpsn; /* last packet sequence number */
u32 ssn; /* send sequence number */
u32 length; /* total length of data in sg_list */
void *priv; /* driver dependent field */
struct rvt_sge sg_list[0];
};
......@@ -235,6 +236,7 @@ struct rvt_ack_entry {
u32 lpsn;
u8 opcode;
u8 sent;
void *priv;
};
#define RC_QP_SCALING_INTERVAL 5
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment