Commit 317aeb83 authored by Dick Kennedy's avatar Dick Kennedy Committed by Martin K. Petersen

scsi: lpfc: Add blk_io_poll support for latency improvement

Although the existing implementation is very good at high I/O load, on
tests involving light load, especially on only a few hardware queues,
latency was a little higher than it needs to be because completions are
handled through workqueue scheduling, where other tasks in the system can
delay handling.

Change the lower level to use irq_poll by default which uses a softirq for
I/O completion. This gives better latency as variance in when the cq is
processed is reduced over the workqueue interface. However, as high load is
better served by not being in softirq when the CPU is loaded, work queues
are still used under high I/O load.

Link: https://lore.kernel.org/r/20200630215001.70793-13-jsmart2021@gmail.com
Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent f0020e42
......@@ -709,6 +709,9 @@ struct lpfc_hba {
struct workqueue_struct *wq;
struct delayed_work eq_delay_work;
#define LPFC_IDLE_STAT_DELAY 1000
struct delayed_work idle_stat_delay_work;
struct lpfc_sli sli;
uint8_t pci_dev_grp; /* lpfc PCI dev group: 0x0, 0x1, 0x2,... */
uint32_t sli_rev; /* SLI2, SLI3, or SLI4 */
......
......@@ -1224,6 +1224,75 @@ lpfc_hb_mbox_cmpl(struct lpfc_hba * phba, LPFC_MBOXQ_t * pmboxq)
return;
}
/**
 * lpfc_idle_stat_delay_work - idle_stat tracking
 * @work: work_struct embedded in the idle_stat_delay_work member of lpfc_hba
 *
 * This routine tracks per-cq idle_stat and determines polling decisions.
 * For every present CPU that is the primary CPU (cq->chann) of its hardware
 * queue's io_cq, the idle percentage since the previous sample is computed
 * and cq->poll_mode is set to LPFC_QUEUE_WORK when the CPU was less than
 * 15% idle, or to LPFC_IRQ_POLL otherwise. Unless FC_UNLOADING is set, the
 * work requeues itself LPFC_IDLE_STAT_DELAY milliseconds later.
 *
 * Return codes:
 * None
 **/
static void
lpfc_idle_stat_delay_work(struct work_struct *work)
{
	struct lpfc_hba *phba = container_of(to_delayed_work(work),
					     struct lpfc_hba,
					     idle_stat_delay_work);
	struct lpfc_queue *cq;
	struct lpfc_sli4_hdw_queue *hdwq;
	struct lpfc_idle_stat *idle_stat;
	u32 i, idle_percent, busy_percent;
	u64 wall, wall_idle, diff_wall, diff_idle, busy_time;

	/* FC_UNLOADING set: return without requeueing, ending the cycle */
	if (phba->pport->load_flag & FC_UNLOADING)
		return;

	/* Skip sampling in the error/offline states, but keep the work alive */
	if (phba->link_state == LPFC_HBA_ERROR ||
	    (phba->pport->fc_flag & FC_OFFLINE_MODE))
		goto requeue;

	for_each_present_cpu(i) {
		hdwq = &phba->sli4_hba.hdwq[phba->sli4_hba.cpu_map[i].hdwq];
		cq = hdwq->io_cq;

		/* Skip if we've already handled this cq's primary CPU */
		if (cq->chann != i)
			continue;

		idle_stat = &phba->sli4_hba.idle_stat[i];

		/* get_cpu_idle_time returns values as running counters. Thus,
		 * to know the amount for this period, the prior counter values
		 * need to be subtracted from the current counter values.
		 * From there, the idle time stat can be calculated as a
		 * percentage of 100 - the sum of the other consumption times.
		 */
		wall_idle = get_cpu_idle_time(i, &wall, 1);
		diff_idle = wall_idle - idle_stat->prev_idle;
		diff_wall = wall - idle_stat->prev_wall;

		/* No wall time has elapsed since the previous sample, so there
		 * is nothing to measure and diff_wall cannot be used as a
		 * divisor. Leave poll_mode and the saved counters untouched so
		 * the next pass covers the whole interval.
		 */
		if (!diff_wall)
			continue;

		if (diff_wall <= diff_idle)
			busy_time = 0;
		else
			busy_time = diff_wall - diff_idle;

		busy_percent = div64_u64(100 * busy_time, diff_wall);
		idle_percent = 100 - busy_percent;

		/* Mostly-busy CPU: use the workqueue path; otherwise irq_poll */
		if (idle_percent < 15)
			cq->poll_mode = LPFC_QUEUE_WORK;
		else
			cq->poll_mode = LPFC_IRQ_POLL;

		idle_stat->prev_idle = wall_idle;
		idle_stat->prev_wall = wall;
	}

requeue:
	schedule_delayed_work(&phba->idle_stat_delay_work,
			      msecs_to_jiffies(LPFC_IDLE_STAT_DELAY));
}
static void
lpfc_hb_eq_delay_work(struct work_struct *work)
{
......@@ -2924,6 +2993,7 @@ lpfc_stop_hba_timers(struct lpfc_hba *phba)
if (phba->pport)
lpfc_stop_vport_timers(phba->pport);
cancel_delayed_work_sync(&phba->eq_delay_work);
cancel_delayed_work_sync(&phba->idle_stat_delay_work);
del_timer_sync(&phba->sli.mbox_tmo);
del_timer_sync(&phba->fabric_block_timer);
del_timer_sync(&phba->eratt_poll);
......@@ -6255,6 +6325,9 @@ lpfc_setup_driver_resource_phase1(struct lpfc_hba *phba)
INIT_DELAYED_WORK(&phba->eq_delay_work, lpfc_hb_eq_delay_work);
INIT_DELAYED_WORK(&phba->idle_stat_delay_work,
lpfc_idle_stat_delay_work);
return 0;
}
......@@ -6934,13 +7007,23 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
goto out_free_hba_cpu_map;
}
phba->sli4_hba.idle_stat = kcalloc(phba->sli4_hba.num_possible_cpu,
sizeof(*phba->sli4_hba.idle_stat),
GFP_KERNEL);
if (!phba->sli4_hba.idle_stat) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
"3390 Failed allocation for idle_stat\n");
rc = -ENOMEM;
goto out_free_hba_eq_info;
}
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
phba->sli4_hba.c_stat = alloc_percpu(struct lpfc_hdwq_stat);
if (!phba->sli4_hba.c_stat) {
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
"3332 Failed allocating per cpu hdwq stats\n");
rc = -ENOMEM;
goto out_free_hba_eq_info;
goto out_free_hba_idle_stat;
}
#endif
......@@ -6964,9 +7047,11 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
return 0;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
out_free_hba_idle_stat:
kfree(phba->sli4_hba.idle_stat);
#endif
out_free_hba_eq_info:
free_percpu(phba->sli4_hba.eq_info);
#endif
out_free_hba_cpu_map:
kfree(phba->sli4_hba.cpu_map);
out_free_hba_eq_hdl:
......@@ -7008,6 +7093,7 @@ lpfc_sli4_driver_resource_unset(struct lpfc_hba *phba)
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
free_percpu(phba->sli4_hba.c_stat);
#endif
kfree(phba->sli4_hba.idle_stat);
/* Free memory allocated for msi-x interrupt vector to CPU mapping */
kfree(phba->sli4_hba.cpu_map);
......
......@@ -7300,6 +7300,47 @@ lpfc_post_rq_buffer(struct lpfc_hba *phba, struct lpfc_queue *hrq,
return 1;
}
/**
* lpfc_init_idle_stat_hb - Initialize idle_stat tracking
*
* This routine initializes the per-cq idle_stat to dynamically dictate
* polling decisions.
*
* Return codes:
* None
**/
static void lpfc_init_idle_stat_hb(struct lpfc_hba *phba)
{
int i;
struct lpfc_sli4_hdw_queue *hdwq;
struct lpfc_queue *cq;
struct lpfc_idle_stat *idle_stat;
u64 wall;
for_each_present_cpu(i) {
hdwq = &phba->sli4_hba.hdwq[phba->sli4_hba.cpu_map[i].hdwq];
cq = hdwq->io_cq;
/* Skip if we've already handled this cq's primary CPU */
if (cq->chann != i)
continue;
idle_stat = &phba->sli4_hba.idle_stat[i];
idle_stat->prev_idle = get_cpu_idle_time(i, &wall, 1);
idle_stat->prev_wall = wall;
if (phba->nvmet_support)
cq->poll_mode = LPFC_QUEUE_WORK;
else
cq->poll_mode = LPFC_IRQ_POLL;
}
if (!phba->nvmet_support)
schedule_delayed_work(&phba->idle_stat_delay_work,
msecs_to_jiffies(LPFC_IDLE_STAT_DELAY));
}
static void lpfc_sli4_dip(struct lpfc_hba *phba)
{
uint32_t if_type;
......@@ -7877,6 +7918,9 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba)
queue_delayed_work(phba->wq, &phba->eq_delay_work,
msecs_to_jiffies(LPFC_EQ_DELAY_MSECS));
/* start per phba idle_stat_delay heartbeat */
lpfc_init_idle_stat_hb(phba);
/* Start error attention (ERATT) polling timer */
mod_timer(&phba->eratt_poll,
jiffies + msecs_to_jiffies(1000 * phba->eratt_poll_interval));
......@@ -13754,7 +13798,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
"0390 Cannot schedule soft IRQ "
"0390 Cannot schedule queue work "
"for CQ eqcqid=%d, cqid=%d on CPU %d\n",
cqid, cq->queue_id, raw_smp_processor_id());
}
......@@ -13765,6 +13809,7 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
* @cq: Pointer to CQ to be processed
* @handler: Routine to process each cqe
* @delay: Pointer to usdelay to set in case of rescheduling of the handler
* @poll_mode: Polling mode we were called from
*
* This routine processes completion queue entries in a CQ. While a valid
* queue element is found, the handler is called. During processing checks
......@@ -13782,7 +13827,8 @@ lpfc_sli4_sp_handle_eqe(struct lpfc_hba *phba, struct lpfc_eqe *eqe,
static bool
__lpfc_sli4_process_cq(struct lpfc_hba *phba, struct lpfc_queue *cq,
bool (*handler)(struct lpfc_hba *, struct lpfc_queue *,
struct lpfc_cqe *), unsigned long *delay)
struct lpfc_cqe *), unsigned long *delay,
enum lpfc_poll_mode poll_mode)
{
struct lpfc_cqe *cqe;
bool workposted = false;
......@@ -13823,6 +13869,10 @@ __lpfc_sli4_process_cq(struct lpfc_hba *phba, struct lpfc_queue *cq,
arm = false;
}
/* Note: complete the irq_poll softirq before rearming CQ */
if (poll_mode == LPFC_IRQ_POLL)
irq_poll_complete(&cq->iop);
/* Track the max number of CQEs processed in 1 EQ */
if (count > cq->CQ_max_cqe)
cq->CQ_max_cqe = count;
......@@ -13872,17 +13922,17 @@ __lpfc_sli4_sp_process_cq(struct lpfc_queue *cq)
case LPFC_MCQ:
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_sp_handle_mcqe,
&delay);
&delay, LPFC_QUEUE_WORK);
break;
case LPFC_WCQ:
if (cq->subtype == LPFC_IO)
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_fp_handle_cqe,
&delay);
&delay, LPFC_QUEUE_WORK);
else
workposted |= __lpfc_sli4_process_cq(phba, cq,
lpfc_sli4_sp_handle_cqe,
&delay);
&delay, LPFC_QUEUE_WORK);
break;
default:
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
......@@ -13900,7 +13950,7 @@ __lpfc_sli4_sp_process_cq(struct lpfc_queue *cq)
&cq->sched_spwork, delay);
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
"0394 Cannot schedule soft IRQ "
"0394 Cannot schedule queue work "
"for cqid=%d on CPU %d\n",
cq->queue_id, cq->chann);
}
......@@ -14231,6 +14281,44 @@ lpfc_sli4_fp_handle_cqe(struct lpfc_hba *phba, struct lpfc_queue *cq,
return workposted;
}
/**
 * lpfc_sli4_sched_cq_work - Schedules cq work
 * @phba: Pointer to HBA context object.
 * @cq: Pointer to CQ
 * @cqid: CQ ID
 *
 * Dispatches completion processing for @cq according to cq->poll_mode.
 * In LPFC_IRQ_POLL mode the cq's irq_poll instance is scheduled. In any
 * other mode (LPFC_QUEUE_WORK) cq->irqwork is queued on phba->wq: on
 * cq->chann normally, or without a CPU binding in a kdump kernel. A failed
 * queueing attempt is logged.
 *
 **/
static void lpfc_sli4_sched_cq_work(struct lpfc_hba *phba,
				    struct lpfc_queue *cq, uint16_t cqid)
{
	int queued;

	/* softirq path */
	if (cq->poll_mode == LPFC_IRQ_POLL) {
		irq_poll_sched(&cq->iop);
		return;
	}

	/* workqueue path: LPFC_QUEUE_WORK and any unrecognised mode */
	queued = is_kdump_kernel() ?
		 queue_work(phba->wq, &cq->irqwork) :
		 queue_work_on(cq->chann, phba->wq, &cq->irqwork);
	if (queued)
		return;

	lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
			"0383 Cannot schedule queue work "
			"for CQ eqcqid=%d, cqid=%d on CPU %d\n",
			cqid, cq->queue_id,
			raw_smp_processor_id());
}
/**
* lpfc_sli4_hba_handle_eqe - Process a fast-path event queue entry
* @phba: Pointer to HBA context object.
......@@ -14250,7 +14338,6 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
struct lpfc_queue *cq = NULL;
uint32_t qidx = eq->hdwq;
uint16_t cqid, id;
int ret = 0;
if (unlikely(bf_get_le32(lpfc_eqe_major_code, eqe) != 0)) {
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
......@@ -14310,20 +14397,13 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
else
cq->isr_timestamp = 0;
#endif
if (is_kdump_kernel())
ret = queue_work(phba->wq, &cq->irqwork);
else
ret = queue_work_on(cq->chann, phba->wq, &cq->irqwork);
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
"0363 Cannot schedule soft IRQ "
"for CQ eqcqid=%d, cqid=%d on CPU %d\n",
cqid, cq->queue_id, raw_smp_processor_id());
lpfc_sli4_sched_cq_work(phba, cq, cqid);
}
/**
* __lpfc_sli4_hba_process_cq - Process a fast-path event queue entry
* @cq: Pointer to CQ to be processed
* @poll_mode: Enum lpfc_poll_state to determine poll mode
*
* This routine calls the cq processing routine with the handler for
* fast path CQEs.
......@@ -14337,7 +14417,8 @@ lpfc_sli4_hba_handle_eqe(struct lpfc_hba *phba, struct lpfc_queue *eq,
* the delay indicates when to reschedule it.
**/
static void
__lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
__lpfc_sli4_hba_process_cq(struct lpfc_queue *cq,
enum lpfc_poll_mode poll_mode)
{
struct lpfc_hba *phba = cq->phba;
unsigned long delay;
......@@ -14346,7 +14427,7 @@ __lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
/* process and rearm the CQ */
workposted |= __lpfc_sli4_process_cq(phba, cq, lpfc_sli4_fp_handle_cqe,
&delay);
&delay, poll_mode);
if (delay) {
if (is_kdump_kernel())
......@@ -14357,7 +14438,7 @@ __lpfc_sli4_hba_process_cq(struct lpfc_queue *cq)
&cq->sched_irqwork, delay);
if (!ret)
lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
"0367 Cannot schedule soft IRQ "
"0367 Cannot schedule queue work "
"for cqid=%d on CPU %d\n",
cq->queue_id, cq->chann);
}
......@@ -14379,7 +14460,7 @@ lpfc_sli4_hba_process_cq(struct work_struct *work)
{
struct lpfc_queue *cq = container_of(work, struct lpfc_queue, irqwork);
__lpfc_sli4_hba_process_cq(cq);
__lpfc_sli4_hba_process_cq(cq, LPFC_QUEUE_WORK);
}
/**
......@@ -14394,7 +14475,7 @@ lpfc_sli4_dly_hba_process_cq(struct work_struct *work)
struct lpfc_queue *cq = container_of(to_delayed_work(work),
struct lpfc_queue, sched_irqwork);
__lpfc_sli4_hba_process_cq(cq);
__lpfc_sli4_hba_process_cq(cq, LPFC_QUEUE_WORK);
}
/**
......@@ -15069,6 +15150,15 @@ lpfc_eq_create(struct lpfc_hba *phba, struct lpfc_queue *eq, uint32_t imax)
return status;
}
/**
 * lpfc_cq_poll_hdler - irq_poll handler for a completion queue
 * @iop: irq_poll instance embedded in the lpfc_queue to be processed
 * @budget: not consulted by this handler
 *
 * Resolves the owning lpfc_queue from @iop and runs the fast-path CQ
 * processing routine in LPFC_IRQ_POLL mode.
 *
 * Return: always 1
 **/
static int lpfc_cq_poll_hdler(struct irq_poll *iop, int budget)
{
	struct lpfc_queue *poll_cq;

	poll_cq = container_of(iop, struct lpfc_queue, iop);
	__lpfc_sli4_hba_process_cq(poll_cq, LPFC_IRQ_POLL);

	return 1;
}
/**
* lpfc_cq_create - Create a Completion Queue on the HBA
* @phba: HBA structure that indicates port to create a queue on.
......@@ -15208,6 +15298,8 @@ lpfc_cq_create(struct lpfc_hba *phba, struct lpfc_queue *cq,
if (cq->queue_id > phba->sli4_hba.cq_max)
phba->sli4_hba.cq_max = cq->queue_id;
irq_poll_init(&cq->iop, LPFC_IRQ_POLL_WEIGHT, lpfc_cq_poll_hdler);
out:
mempool_free(mbox, phba->mbox_mem_pool);
return status;
......
......@@ -20,6 +20,9 @@
* included with this package. *
*******************************************************************/
#include <linux/irq_poll.h>
#include <linux/cpufreq.h>
#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_SCSI_LPFC_DEBUG_FS)
#define CONFIG_SCSI_LPFC_DEBUG_FS
#endif
......@@ -135,6 +138,16 @@ struct lpfc_rqb {
struct rqb_dmabuf *);
};
/* How completion-queue processing is dispatched for a given CQ */
enum lpfc_poll_mode {
	/* CQ work is handed to the driver workqueue (queue_work*) */
	LPFC_QUEUE_WORK = 0,
	/* CQ work is scheduled through the cq's irq_poll instance */
	LPFC_IRQ_POLL = 1
};
/*
 * Previous sample of the running counters obtained from
 * get_cpu_idle_time(); the difference between two samples gives the
 * idle/wall time for one sampling period.
 */
struct lpfc_idle_stat {
/* last value returned by get_cpu_idle_time() */
u64 prev_idle;
/* last wall value written by get_cpu_idle_time() */
u64 prev_wall;
};
struct lpfc_queue {
struct list_head list;
struct list_head wq_list;
......@@ -265,6 +278,10 @@ struct lpfc_queue {
struct lpfc_queue *assoc_qp;
struct list_head _poll_list;
void **q_pgs; /* array to index entries per page */
#define LPFC_IRQ_POLL_WEIGHT 256
struct irq_poll iop;
enum lpfc_poll_mode poll_mode;
};
struct lpfc_sli4_link {
......@@ -926,6 +943,7 @@ struct lpfc_sli4_hba {
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
struct lpfc_hdwq_stat __percpu *c_stat;
#endif
struct lpfc_idle_stat *idle_stat;
uint32_t conf_trunk;
#define lpfc_conf_trunk_port0_WORD conf_trunk
#define lpfc_conf_trunk_port0_SHIFT 0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment