Commit e0f30bac authored by Ramkrishna Vepa, committed by Roland Dreier

IB/qib: Add optional NUMA affinity

This patch adds context-relative NUMA affinity, conditioned on the
module parameter numa_aware. The qib_ctxtdata struct gains a node_id
member, and qib_create_ctxtdata() gains a matching node_id parameter.

The allocations within the header queue and eager buffer setup routines
now take this member into account and adjust placement as necessary.
PSM will pass either the current NUMA node or the node closest to the
HCA, depending on numa_aware. Verbs will always use the node closest
to the HCA.
Reviewed-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Ramkrishna Vepa <ramkrishna.vepa@intel.com>
Signed-off-by: Vinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
parent ab4a13d6
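
Before the diff, a minimal sketch of the node-selection policy the message describes, using names from the patch (qib_pick_node() itself is hypothetical and exists only to summarize the two cases):

```c
/*
 * Hypothetical summary helper -- not part of the patch. qib_numa_aware,
 * dd->assigned_node_id and numa_node_id() are the real names used below.
 */
static int qib_pick_node(struct qib_devdata *dd, bool user_ctxt)
{
        if (user_ctxt && qib_numa_aware)
                return numa_node_id();          /* node of the opening process */
        return dd->assigned_node_id;            /* node closest to the HCA */
}
```

In the patch itself, setup_ctxt() computes this choice inline for user (PSM) contexts, while qib_create_ctxts() passes dd->assigned_node_id unconditionally for kernel (verbs) contexts.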
drivers/infiniband/hw/qib/qib.h

@@ -154,6 +154,8 @@ struct qib_ctxtdata {
          */
         /* instead of calculating it */
         unsigned ctxt;
+        /* local node of context */
+        int node_id;
         /* non-zero if ctxt is being shared. */
         u16 subctxt_cnt;
         /* non-zero if ctxt is being shared. */

@@ -1088,6 +1090,8 @@ struct qib_devdata {
         u16 psxmitwait_check_rate;
         /* high volume overflow errors defered to tasklet */
         struct tasklet_struct error_tasklet;
+
+        int assigned_node_id; /* NUMA node closest to HCA */
 };
 
 /* hol_state values */

@@ -1167,7 +1171,7 @@ int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *);
 int qib_setup_eagerbufs(struct qib_ctxtdata *);
 void qib_set_ctxtcnt(struct qib_devdata *);
 int qib_create_ctxts(struct qib_devdata *dd);
-struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32);
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int);
 void qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);
 void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);

@@ -1458,6 +1462,7 @@ extern unsigned qib_n_krcv_queues;
 extern unsigned qib_sdma_fetch_arb;
 extern unsigned qib_compat_ddr_negotiate;
 extern int qib_special_trigger;
+extern unsigned qib_numa_aware;
 extern struct mutex qib_mutex;
drivers/infiniband/hw/qib/qib_file_ops.c

@@ -1263,8 +1263,12 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
         struct qib_ctxtdata *rcd;
         void *ptmp = NULL;
         int ret;
+        int numa_id;
+
+        numa_id = qib_numa_aware ? numa_node_id() :
+                dd->assigned_node_id;
 
-        rcd = qib_create_ctxtdata(ppd, ctxt);
+        rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
 
         /*
          * Allocate memory for use in qib_tid_update() at open to
drivers/infiniband/hw/qib/qib_init.c

@@ -67,6 +67,11 @@ ushort qib_cfgctxts;
 module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
 MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
 
+unsigned qib_numa_aware;
+module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
+MODULE_PARM_DESC(numa_aware,
+        "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
+
 /*
  * If set, do not write to any regs if avoidable, hack to allow
  * check for deranged default register values.
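
A usage note on the parameter added above: the S_IRUGO permission mask makes numa_aware read-only at runtime, so the policy must be chosen at load time (e.g. `modprobe ib_qib numa_aware=1`) and can afterwards be inspected under /sys/module/ib_qib/parameters/numa_aware.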
@@ -124,6 +129,11 @@ int qib_create_ctxts(struct qib_devdata *dd)
 {
         unsigned i;
         int ret;
+        int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+        if (local_node_id < 0)
+                local_node_id = numa_node_id();
+        dd->assigned_node_id = local_node_id;
 
         /*
          * Allocate full ctxtcnt array, rather than just cfgctxts, because
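
The fallback above is needed because pcibus_to_node() returns a negative value (NUMA_NO_NODE) when the platform does not report the PCI bus's locality, for example on non-NUMA systems; in that case the driver simply adopts numa_node_id(), the node of the CPU executing qib_create_ctxts().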
@@ -146,7 +156,8 @@ int qib_create_ctxts(struct qib_devdata *dd)
                         continue;
 
                 ppd = dd->pport + (i % dd->num_pports);
-                rcd = qib_create_ctxtdata(ppd, i);
+
+                rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
                 if (!rcd) {
                         qib_dev_err(dd,
                                 "Unable to allocate ctxtdata for Kernel ctxt, failing\n");
@@ -164,14 +175,16 @@ int qib_create_ctxts(struct qib_devdata *dd)
 /*
  * Common code for user and kernel context setup.
  */
-struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt)
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
+        int node_id)
 {
         struct qib_devdata *dd = ppd->dd;
         struct qib_ctxtdata *rcd;
 
-        rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+        rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
         if (rcd) {
                 INIT_LIST_HEAD(&rcd->qp_wait_list);
+                rcd->node_id = node_id;
                 rcd->ppd = ppd;
                 rcd->dd = dd;
                 rcd->cnt = 1;
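
kzalloc_node() is the node-aware variant of kzalloc(): same size and GFP flags plus a preferred NUMA node, so the qib_ctxtdata structure itself lands on the node where it will be used. A short sketch of the defensive form a caller could use (the NUMA_NO_NODE guard is an illustration, not something this patch needs, since node_id is always valid here):

```c
/* Illustrative guard only: fall back to the default NUMA policy
 * when no valid node is known. The patch always passes a real node. */
int nid = (node_id >= 0) ? node_id : NUMA_NO_NODE;
rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, nid);
```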
@@ -1524,6 +1537,7 @@ static void qib_remove_one(struct pci_dev *pdev)
 int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
 {
         unsigned amt;
+        int old_node_id;
 
         if (!rcd->rcvhdrq) {
                 dma_addr_t phys_hdrqtail;

@@ -1533,9 +1547,13 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
                         sizeof(u32), PAGE_SIZE);
                 gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
                         GFP_USER : GFP_KERNEL;
+
+                old_node_id = dev_to_node(&dd->pcidev->dev);
+                set_dev_node(&dd->pcidev->dev, rcd->node_id);
                 rcd->rcvhdrq = dma_alloc_coherent(
                         &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
                         gfp_flags | __GFP_COMP);
+                set_dev_node(&dd->pcidev->dev, old_node_id);
 
                 if (!rcd->rcvhdrq) {
                         qib_dev_err(dd,

@@ -1551,9 +1569,11 @@ int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
                 }
 
                 if (!(dd->flags & QIB_NODMA_RTAIL)) {
+                        set_dev_node(&dd->pcidev->dev, rcd->node_id);
                         rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
                                 &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
                                 gfp_flags);
+                        set_dev_node(&dd->pcidev->dev, old_node_id);
                         if (!rcd->rcvhdrtail_kvaddr)
                                 goto bail_free;
                         rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
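
The save/override/restore dance around dma_alloc_coherent() is needed because the coherent DMA API takes no node argument; it allocates on the device's own node, dev_to_node(dev). Temporarily repointing the device at the context's node steers the allocation, and restoring old_node_id afterwards leaves other users of the device unaffected. A hedged sketch of the same pattern as a standalone helper (qib_dma_alloc_on_node() is hypothetical, not part of the patch, and assumes allocations are serialized on the init path so the override cannot race):

```c
#include <linux/device.h>
#include <linux/dma-mapping.h>

/* Hypothetical wrapper for the patch's pattern: allocate coherent DMA
 * memory on a chosen NUMA node by briefly overriding the device node.
 * Assumes no concurrent allocator observes the temporary node. */
static void *qib_dma_alloc_on_node(struct device *dev, size_t size,
                                   dma_addr_t *handle, gfp_t gfp, int node)
{
        int old_node = dev_to_node(dev);
        void *buf;

        set_dev_node(dev, node);
        buf = dma_alloc_coherent(dev, size, handle, gfp);
        set_dev_node(dev, old_node);

        return buf;
}
```

The eager buffer hunk below applies the same pattern to each per-chunk DMA allocation.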
@@ -1597,6 +1617,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
         unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
         size_t size;
         gfp_t gfp_flags;
+        int old_node_id;
 
         /*
          * GFP_USER, but without GFP_FS, so buffer cache can be

@@ -1615,25 +1636,29 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
         size = rcd->rcvegrbuf_size;
         if (!rcd->rcvegrbuf) {
                 rcd->rcvegrbuf =
-                        kzalloc(chunk * sizeof(rcd->rcvegrbuf[0]),
-                                GFP_KERNEL);
+                        kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]),
+                                GFP_KERNEL, rcd->node_id);
                 if (!rcd->rcvegrbuf)
                         goto bail;
         }
         if (!rcd->rcvegrbuf_phys) {
                 rcd->rcvegrbuf_phys =
-                        kmalloc(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
-                                GFP_KERNEL);
+                        kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
+                                GFP_KERNEL, rcd->node_id);
                 if (!rcd->rcvegrbuf_phys)
                         goto bail_rcvegrbuf;
         }
         for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
                 if (rcd->rcvegrbuf[e])
                         continue;
+
+                old_node_id = dev_to_node(&dd->pcidev->dev);
+                set_dev_node(&dd->pcidev->dev, rcd->node_id);
                 rcd->rcvegrbuf[e] =
                         dma_alloc_coherent(&dd->pcidev->dev, size,
                                            &rcd->rcvegrbuf_phys[e],
                                            gfp_flags);
+                set_dev_node(&dd->pcidev->dev, old_node_id);
                 if (!rcd->rcvegrbuf[e])
                         goto bail_rcvegrbuf_phys;
         }