Commit bf8440e6 authored by Johannes Berg, committed by Wey-Yi Guy

iwlwifi: improve TX cache footprint

Having cmd[], meta[] and skbs[] as separate arrays
in the TX queue structure is cache inefficient as
we need the data for a given entry together.

To improve this, create an array of structs that
holds these three members (allocating meta as part
of that struct), so the data we need for a given
entry is located together, improving the cache
footprint.

The downside is that we need to allocate a lot of
memory in one chunk, about 10KiB (on 64-bit),
which isn't very efficient.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Wey-Yi Guy <wey-yi.w.guy@intel.com>
parent 682e5f64
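In essence, the patch turns three parallel per-slot arrays into a single array of per-slot structs. A condensed sketch of the layout change, using only declarations that appear in the diff below:

    /* Before: per-slot state split across three separate allocations;
     * touching slot i means touching three unrelated cache lines. */
    struct iwl_device_cmd **cmd;    /* cmd[i]:  command buffer pointer */
    struct iwl_cmd_meta *meta;      /* meta[i]: command/TX buffer metadata */
    struct sk_buff **skbs;          /* skbs[i]: frame attached to TFD i */

    /* After: one allocation; everything for slot i sits adjacent, so one
     * access tends to pull the rest of the entry into cache with it. */
    struct iwl_pcie_tx_queue_entry {
            struct iwl_device_cmd *cmd;
            struct sk_buff *skb;
            struct iwl_cmd_meta meta;   /* embedded, not a separate array */
    };
    struct iwl_pcie_tx_queue_entry *entries;  /* entries[i] replaces all three */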
@@ -179,30 +179,33 @@ struct iwl_queue {
  * space less than this */
 };
 
+#define TFD_TX_CMD_SLOTS 256
+#define TFD_CMD_SLOTS 32
+
+struct iwl_pcie_tx_queue_entry {
+	struct iwl_device_cmd *cmd;
+	struct sk_buff *skb;
+	struct iwl_cmd_meta meta;
+};
+
 /**
  * struct iwl_tx_queue - Tx Queue for DMA
  * @q: generic Rx/Tx queue descriptor
- * @bd: base of circular buffer of TFDs
- * @cmd: array of command/TX buffer pointers
- * @meta: array of meta data for each command/tx buffer
- * @dma_addr_cmd: physical address of cmd/tx buffer array
- * @txb: array of per-TFD driver data
- * lock: queue lock
- * @time_stamp: time (in jiffies) of last read_ptr change
+ * @tfds: transmit frame descriptors (DMA memory)
+ * @entries: transmit entries (driver state)
+ * @lock: queue lock
+ * @stuck_timer: timer that fires if queue gets stuck
+ * @trans_pcie: pointer back to transport (for timer)
  * @need_update: indicates need to update read/write index
- * @active: stores if queue is active
  *
  * A Tx queue consists of circular buffer of BDs (a.k.a. TFDs, transmit frame
  * descriptors) and required locking structures.
  */
-#define TFD_TX_CMD_SLOTS 256
-#define TFD_CMD_SLOTS 32
-
 struct iwl_tx_queue {
 	struct iwl_queue q;
 	struct iwl_tfd *tfds;
-	struct iwl_device_cmd **cmd;
-	struct iwl_cmd_meta *meta;
-	struct sk_buff **skbs;
+	struct iwl_pcie_tx_queue_entry *entries;
 	spinlock_t lock;
 	struct timer_list stuck_timer;
 	struct iwl_trans_pcie *trans_pcie;
...
@@ -425,7 +425,7 @@ static void iwl_rx_handle_rxbuf(struct iwl_trans *trans,
 	cmd_index = get_cmd_index(&txq->q, index);
 
 	if (reclaim)
-		cmd = txq->cmd[cmd_index];
+		cmd = txq->entries[cmd_index].cmd;
 	else
 		cmd = NULL;
...
@@ -58,7 +58,7 @@ void iwl_trans_txq_update_byte_cnt_tbl(struct iwl_trans *trans,
 	u16 len = byte_cnt + IWL_TX_CRC_SIZE + IWL_TX_DELIMITER_SIZE;
 	__le16 bc_ent;
 	struct iwl_tx_cmd *tx_cmd =
-		(struct iwl_tx_cmd *) txq->cmd[txq->q.write_ptr]->payload;
+		(void *) txq->entries[txq->q.write_ptr].cmd->payload;
 
 	scd_bc_tbl = trans_pcie->scd_bc_tbls.addr;
@@ -221,13 +221,14 @@ void iwlagn_txq_free_tfd(struct iwl_trans *trans, struct iwl_tx_queue *txq,
 	lockdep_assert_held(&txq->lock);
 
-	iwlagn_unmap_tfd(trans, &txq->meta[index], &tfd_tmp[index], dma_dir);
+	iwlagn_unmap_tfd(trans, &txq->entries[index].meta,
+			 &tfd_tmp[index], dma_dir);
 
 	/* free SKB */
-	if (txq->skbs) {
+	if (txq->entries) {
 		struct sk_buff *skb;
 
-		skb = txq->skbs[index];
+		skb = txq->entries[index].skb;
 
 		/* Can be called from irqs-disabled context
 		 * If skb is not NULL, it means that the whole queue is being
@@ -235,7 +236,7 @@ void iwlagn_txq_free_tfd(struct iwl_trans *trans, struct iwl_tx_queue *txq,
 		 */
 		if (skb) {
 			iwl_op_mode_free_skb(trans->op_mode, skb);
-			txq->skbs[index] = NULL;
+			txq->entries[index].skb = NULL;
 		}
 	}
 }
@@ -358,7 +359,7 @@ static void iwlagn_txq_inval_byte_cnt_tbl(struct iwl_trans *trans,
 	u8 sta_id = 0;
 	__le16 bc_ent;
 	struct iwl_tx_cmd *tx_cmd =
-		(struct iwl_tx_cmd *) txq->cmd[txq->q.read_ptr]->payload;
+		(void *)txq->entries[txq->q.read_ptr].cmd->payload;
 
 	WARN_ON(read_ptr >= TFD_QUEUE_SIZE_MAX);
@@ -578,8 +579,8 @@ static int iwl_enqueue_hcmd(struct iwl_trans *trans, struct iwl_host_cmd *cmd)
 	}
 
 	idx = get_cmd_index(q, q->write_ptr);
-	out_cmd = txq->cmd[idx];
-	out_meta = &txq->meta[idx];
+	out_cmd = txq->entries[idx].cmd;
+	out_meta = &txq->entries[idx].meta;
 
 	memset(out_meta, 0, sizeof(*out_meta));	/* re-initialize to NULL */
 	if (cmd->flags & CMD_WANT_SKB)
@@ -772,8 +773,8 @@ void iwl_tx_cmd_complete(struct iwl_trans *trans, struct iwl_rx_cmd_buffer *rxb,
 	spin_lock(&txq->lock);
 
 	cmd_index = get_cmd_index(&txq->q, index);
-	cmd = txq->cmd[cmd_index];
-	meta = &txq->meta[cmd_index];
+	cmd = txq->entries[cmd_index].cmd;
+	meta = &txq->entries[cmd_index].meta;
 
 	iwlagn_unmap_tfd(trans, meta, &txq->tfds[index],
 			 DMA_BIDIRECTIONAL);
@@ -905,8 +906,8 @@ static int iwl_send_cmd_sync(struct iwl_trans *trans, struct iwl_host_cmd *cmd)
 		 * in later, it will possibly set an invalid
 		 * address (cmd->meta.source).
 		 */
-		trans_pcie->txq[trans_pcie->cmd_queue].meta[cmd_idx].flags &=
-							~CMD_WANT_SKB;
+		trans_pcie->txq[trans_pcie->cmd_queue].
+			entries[cmd_idx].meta.flags &= ~CMD_WANT_SKB;
 	}
 
 	if (cmd->resp_pkt) {
@@ -961,12 +962,12 @@ int iwl_tx_queue_reclaim(struct iwl_trans *trans, int txq_id, int index,
 	     q->read_ptr != index;
 	     q->read_ptr = iwl_queue_inc_wrap(q->read_ptr, q->n_bd)) {
 
-		if (WARN_ON_ONCE(txq->skbs[txq->q.read_ptr] == NULL))
+		if (WARN_ON_ONCE(txq->entries[txq->q.read_ptr].skb == NULL))
 			continue;
 
-		__skb_queue_tail(skbs, txq->skbs[txq->q.read_ptr]);
+		__skb_queue_tail(skbs, txq->entries[txq->q.read_ptr].skb);
 
-		txq->skbs[txq->q.read_ptr] = NULL;
+		txq->entries[txq->q.read_ptr].skb = NULL;
 
 		iwlagn_txq_inval_byte_cnt_tbl(trans, txq);
...
@@ -333,7 +333,7 @@ static int iwl_trans_txq_alloc(struct iwl_trans *trans,
 	int i;
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
 
-	if (WARN_ON(txq->meta || txq->cmd || txq->skbs || txq->tfds))
+	if (WARN_ON(txq->entries || txq->tfds))
 		return -EINVAL;
 
 	setup_timer(&txq->stuck_timer, iwl_trans_pcie_queue_stuck_timer,
@@ -342,35 +342,22 @@ static int iwl_trans_txq_alloc(struct iwl_trans *trans,
 	txq->q.n_window = slots_num;
 
-	txq->meta = kcalloc(slots_num, sizeof(txq->meta[0]), GFP_KERNEL);
-	txq->cmd = kcalloc(slots_num, sizeof(txq->cmd[0]), GFP_KERNEL);
+	txq->entries = kcalloc(slots_num,
+			       sizeof(struct iwl_pcie_tx_queue_entry),
+			       GFP_KERNEL);
 
-	if (!txq->meta || !txq->cmd)
+	if (!txq->entries)
 		goto error;
 
 	if (txq_id == trans_pcie->cmd_queue)
 		for (i = 0; i < slots_num; i++) {
-			txq->cmd[i] = kmalloc(sizeof(struct iwl_device_cmd),
-					      GFP_KERNEL);
-			if (!txq->cmd[i])
+			txq->entries[i].cmd =
+				kmalloc(sizeof(struct iwl_device_cmd),
+					GFP_KERNEL);
+			if (!txq->entries[i].cmd)
 				goto error;
 		}
 
-	/* Alloc driver data array and TFD circular buffer */
-	/* Driver private data, only for Tx (not command) queues,
-	 * not shared with device. */
-	if (txq_id != trans_pcie->cmd_queue) {
-		txq->skbs = kcalloc(TFD_QUEUE_SIZE_MAX, sizeof(txq->skbs[0]),
-				    GFP_KERNEL);
-		if (!txq->skbs) {
-			IWL_ERR(trans, "kmalloc for auxiliary BD "
-				"structures failed\n");
-			goto error;
-		}
-	} else {
-		txq->skbs = NULL;
-	}
-
 	/* Circular buffer of transmit frame descriptors (TFDs),
 	 * shared with device */
 	txq->tfds = dma_alloc_coherent(trans->dev, tfd_sz,
@@ -383,17 +370,11 @@ static int iwl_trans_txq_alloc(struct iwl_trans *trans,
 	return 0;
 error:
-	kfree(txq->skbs);
-	txq->skbs = NULL;
-
-	/* since txq->cmd has been zeroed,
-	 * all non allocated cmd[i] will be NULL */
-	if (txq->cmd && txq_id == trans_pcie->cmd_queue)
+	if (txq->entries && txq_id == trans_pcie->cmd_queue)
 		for (i = 0; i < slots_num; i++)
-			kfree(txq->cmd[i]);
-	kfree(txq->meta);
-	kfree(txq->cmd);
-	txq->meta = NULL;
-	txq->cmd = NULL;
+			kfree(txq->entries[i].cmd);
+	kfree(txq->entries);
+	txq->entries = NULL;
 
 	return -ENOMEM;
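As a rough check on the "about 10KiB" figure from the commit message: for a data queue, slots_num in the kcalloc above is TFD_TX_CMD_SLOTS (256), and on 64-bit each entry holds two pointers plus the embedded meta. Assuming sizeof(struct iwl_cmd_meta) is roughly 24 bytes (the exact value depends on kernel config), the single allocation works out to:

    /* back-of-the-envelope only; the 24-byte meta size is an assumption */
    sizeof(struct iwl_pcie_tx_queue_entry) ~= 8 + 8 + 24 = 40 bytes
    256 slots * 40 bytes = 10240 bytes ~= 10KiB in one allocation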
@@ -405,7 +386,6 @@ static int iwl_trans_txq_init(struct iwl_trans *trans, struct iwl_tx_queue *txq,
 	int ret;
 
 	txq->need_update = 0;
-	memset(txq->meta, 0, sizeof(txq->meta[0]) * slots_num);
 
 	/* TFD_QUEUE_SIZE_MAX must be power-of-two size, otherwise
 	 * iwl_queue_inc_wrap and iwl_queue_dec_wrap are broken. */
@@ -483,7 +463,7 @@ static void iwl_tx_queue_free(struct iwl_trans *trans, int txq_id)
 	if (txq_id == trans_pcie->cmd_queue)
 		for (i = 0; i < txq->q.n_window; i++)
-			kfree(txq->cmd[i]);
+			kfree(txq->entries[i].cmd);
 
 	/* De-alloc circular buffer of TFDs */
 	if (txq->q.n_bd) {
@@ -492,15 +472,8 @@ static void iwl_tx_queue_free(struct iwl_trans *trans, int txq_id)
 		memset(&txq->q.dma_addr, 0, sizeof(txq->q.dma_addr));
 	}
 
-	/* De-alloc array of per-TFD driver data */
-	kfree(txq->skbs);
-	txq->skbs = NULL;
-
-	/* deallocate arrays */
-	kfree(txq->cmd);
-	kfree(txq->meta);
-	txq->cmd = NULL;
-	txq->meta = NULL;
+	kfree(txq->entries);
+	txq->entries = NULL;
 
 	del_timer_sync(&txq->stuck_timer);
@@ -1295,15 +1268,15 @@ static int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb,
 	spin_lock(&txq->lock);
 
 	/* Set up driver data for this TFD */
-	txq->skbs[q->write_ptr] = skb;
-	txq->cmd[q->write_ptr] = dev_cmd;
+	txq->entries[q->write_ptr].skb = skb;
+	txq->entries[q->write_ptr].cmd = dev_cmd;
 
 	dev_cmd->hdr.cmd = REPLY_TX;
 	dev_cmd->hdr.sequence = cpu_to_le16((u16)(QUEUE_TO_SEQ(txq_id) |
 				INDEX_TO_SEQ(q->write_ptr)));
 
 	/* Set up first empty entry in queue's array of Tx/cmd buffers */
-	out_meta = &txq->meta[q->write_ptr];
+	out_meta = &txq->entries[q->write_ptr].meta;
 
 	/*
 	 * Use the first empty entry in this queue's command buffer array
...