Commit 16102736 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2019-05-03' of...

Merge tag 'misc-habanalabs-next-2019-05-03' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains further changes for kernel 5.2.

The changes are either bug fixes or simple refactoring of existing code.
The notable changes are:

- Add missing fields in the bmon structure that is passed in the debug
  IOCTL when the user wants to configure the bus monitor.

- Use the dedicated device-CPU accessible memory pool for all host memory
  allocations that are accessible directly by the embedded CPU. This is
  needed to enforce certain restrictions we have due to the embedded CPU's
  architecture.

- Manipulate DMA addresses only inside ASIC-specific files. This is needed
  to better support code for future ASICs (a simplified sketch of this
  pattern follows this list).
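
As a rough illustration of that pattern (a minimal sketch condensed from the
Goya changes in the diff below, not the exact upstream code), the
HOST_PHYS_BASE offset is applied only inside the ASIC's own DMA wrappers, so
common code can use the returned handle as a device-view address directly:

  static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
  					dma_addr_t *dma_handle, gfp_t flags)
  {
  	void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size,
  						dma_handle, flags);

  	/* Shift to the device's view of host memory (Goya-specific) */
  	if (kernel_addr)
  		*dma_handle += HOST_PHYS_BASE;

  	return kernel_addr;
  }

  /* Common code no longer adds any host physical base itself: */
  p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, &bus_address,
  					GFP_KERNEL | __GFP_ZERO);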

Other minor changes include:

- Move pr_fmt() to C files to avoid a dependency on include order.

- Remove the call to the CS parsing function for workloads that originate
  from the driver, and remove the dead code that results from this change.

- Update names of structure members and labels to better reflect their
  usage.

- When moving the DRAM PCI bar aperture, return the old aperture address
  range instead of an error code. This allows us to restore the old address
  range in a simpler fashion (see the sketch after this list).
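
To make the new calling convention concrete, here is a minimal sketch of the
caller side (condensed from goya_debugfs_read32() in the diff; names and error
handling are simplified, so treat it as illustrative rather than the exact
code):

  u64 old_base;

  old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
  if (old_base == U64_MAX)
  	return -EIO;

  *val = readl(hdev->pcie_bar[DDR_BAR_ID] + (addr - bar_base_addr));

  /* Put the bar back where it was, without recomputing the old range */
  if (hdev->asic_funcs->set_dram_bar_base(hdev, old_base) == U64_MAX)
  	return -EIO;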

* tag 'misc-habanalabs-next-2019-05-03' of git://people.freedesktop.org/~gabbayo/linux:
  habanalabs: Update CPU DMA memory label name
  habanalabs: Update CPU DMA pool label name
  habanalabs: increase timeout if working with simulator
  habanalabs: remove condition that is always true
  habanalabs: remove redundant member from parser struct
  habanalabs: Manipulate DMA addresses in ASIC functions
  habanalabs: rename functions to improve code readability
  habanalabs: remove call to cs_parser()
  habanalabs: Use single pool for CPU accessible host memory
  habanalabs: return old dram bar address upon change
  habanalabs: rename restore to ctx_switch when appropriate
  habanalabs: use ASIC functions interface for rreg/wreg
  uapi/habanalabs: add missing fields in bmon params
  habanalabs: re-factor goya_parse_cb_no_ext_queue()
  habanalabs: Cancel pr_fmt() definition dependency on includes order
parents 78e6427b 9f832fda
......@@ -13,7 +13,7 @@
static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
hdev->asic_funcs->dma_free_coherent(hdev, cb->size,
hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
(void *) (uintptr_t) cb->kernel_address,
cb->bus_address);
kfree(cb);
......@@ -66,10 +66,10 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
return NULL;
if (ctx_id == HL_KERNEL_ASID_ID)
p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size,
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
&cb->bus_address, GFP_ATOMIC);
else
p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size,
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
&cb->bus_address,
GFP_USER | __GFP_ZERO);
if (!p) {
......
......@@ -93,7 +93,6 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
parser.user_cb_size = job->user_cb_size;
parser.ext_queue = job->ext_queue;
job->patched_cb = NULL;
parser.use_virt_addr = hdev->mmu_enable;
rc = hdev->asic_funcs->cs_parser(hdev, &parser);
if (job->ext_queue) {
......@@ -601,7 +600,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
void __user *chunks;
u32 num_chunks;
u64 cs_seq = ULONG_MAX;
int rc, do_restore;
int rc, do_ctx_switch;
bool need_soft_reset = false;
if (hl_device_disabled_or_in_reset(hdev)) {
......@@ -612,9 +611,9 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
goto out;
}
do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0);
do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
if (do_restore || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
long ret;
chunks = (void __user *)(uintptr_t)args->in.chunks_restore;
......@@ -622,7 +621,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
mutex_lock(&hpriv->restore_phase_mutex);
if (do_restore) {
if (do_ctx_switch) {
rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
if (rc) {
dev_err_ratelimited(hdev->dev,
......@@ -678,18 +677,18 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
}
}
ctx->thread_restore_wait_token = 1;
} else if (!ctx->thread_restore_wait_token) {
ctx->thread_ctx_switch_wait_token = 1;
} else if (!ctx->thread_ctx_switch_wait_token) {
u32 tmp;
rc = hl_poll_timeout_memory(hdev,
(u64) (uintptr_t) &ctx->thread_restore_wait_token,
(u64) (uintptr_t) &ctx->thread_ctx_switch_wait_token,
jiffies_to_usecs(hdev->timeout_jiffies),
&tmp);
if (rc || !tmp) {
dev_err(hdev->dev,
"restore phase hasn't finished in time\n");
"context switch phase didn't finish in time\n");
rc = -ETIMEDOUT;
goto out;
}
......
......@@ -106,8 +106,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
ctx->cs_sequence = 1;
spin_lock_init(&ctx->cs_lock);
atomic_set(&ctx->thread_restore_token, 1);
ctx->thread_restore_wait_token = 0;
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
......
......@@ -5,6 +5,8 @@
* All Rights Reserved.
*/
#define pr_fmt(fmt) "habanalabs: " fmt
#include "habanalabs.h"
#include <linux/pci.h>
......@@ -708,10 +710,10 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_reset(hdev, &hdev->completion_queue[i]);
/* Make sure the setup phase for the user context will run again */
/* Make sure the context switch phase will run again */
if (hdev->user_ctx) {
atomic_set(&hdev->user_ctx->thread_restore_token, 1);
hdev->user_ctx->thread_restore_wait_token = 0;
atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1);
hdev->user_ctx->thread_ctx_switch_wait_token = 0;
}
/* Finished tear-down, starting to re-initialize */
......@@ -1145,7 +1147,13 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
* either by the direct access of the device or by another core
*/
u32 *paddr = (u32 *) (uintptr_t) addr;
ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
ktime_t timeout;
/* timeout should be longer when working with simulator */
if (!hdev->pdev)
timeout_us *= 10;
timeout = ktime_add_us(ktime_get(), timeout_us);
might_sleep();
......
......@@ -249,8 +249,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(armcp_info_dma_addr +
prop->host_phys_base_address);
pkt.addr = cpu_to_le64(armcp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
......@@ -281,7 +280,6 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct armcp_packet pkt = {};
void *eeprom_info_cpu_addr;
dma_addr_t eeprom_info_dma_addr;
......@@ -301,8 +299,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.addr = cpu_to_le64(eeprom_info_dma_addr +
prop->host_phys_base_address);
pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
pkt.data_max_size = cpu_to_le32(max_size);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
......
......@@ -297,7 +297,7 @@ static u32 goya_all_events[] = {
GOYA_ASYNC_EVENT_ID_DMA_BM_CH4
};
static void goya_get_fixed_properties(struct hl_device *hdev)
void goya_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int i;
......@@ -345,7 +345,6 @@ static void goya_get_fixed_properties(struct hl_device *hdev)
prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->host_phys_base_address = HOST_PHYS_BASE;
prop->va_space_host_start_address = VA_HOST_SPACE_START;
prop->va_space_host_end_address = VA_HOST_SPACE_END;
prop->va_space_dram_start_address = VA_DDR_SPACE_START;
......@@ -389,33 +388,26 @@ static int goya_pci_bars_map(struct hl_device *hdev)
return 0;
}
/*
* goya_set_ddr_bar_base - set DDR bar to map specific device address
*
* @hdev: pointer to hl_device structure
* @addr: address in DDR. Must be aligned to DDR bar size
*
* This function configures the iATU so that the DDR bar will start at the
* specified addr.
*
*/
static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
{
struct goya_device *goya = hdev->asic_specific;
u64 old_addr = addr;
int rc;
if ((goya) && (goya->ddr_bar_cur_addr == addr))
return 0;
return old_addr;
/* Inbound Region 1 - Bar 4 - Point to DDR */
rc = hl_pci_set_dram_bar_base(hdev, 1, 4, addr);
if (rc)
return rc;
return U64_MAX;
if (goya)
if (goya) {
old_addr = goya->ddr_bar_cur_addr;
goya->ddr_bar_cur_addr = addr;
}
return 0;
return old_addr;
}
/*
......@@ -429,7 +421,7 @@ static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
static int goya_init_iatu(struct hl_device *hdev)
{
return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE,
HOST_PHYS_SIZE);
HOST_PHYS_BASE, HOST_PHYS_SIZE);
}
/*
......@@ -542,14 +534,7 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev)
prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
}
/*
* goya_late_init - GOYA late initialization code
*
* @hdev: pointer to hl_device structure
*
* Get ArmCP info and send message to CPU to enable PCI access
*/
static int goya_late_init(struct hl_device *hdev)
int goya_late_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int rc;
......@@ -648,9 +633,6 @@ static int goya_sw_init(struct hl_device *hdev)
goya->tpc_clk = GOYA_PLL_FREQ_LOW;
goya->ic_clk = GOYA_PLL_FREQ_LOW;
goya->mmu_prepare_reg = goya_mmu_prepare_reg;
goya->qman0_set_security = goya_qman0_set_security;
hdev->asic_specific = goya;
/* Create DMA pool for small allocations */
......@@ -663,7 +645,7 @@ static int goya_sw_init(struct hl_device *hdev)
}
hdev->cpu_accessible_dma_mem =
hdev->asic_funcs->dma_alloc_coherent(hdev,
hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
HL_CPU_ACCESSIBLE_MEM_SIZE,
&hdev->cpu_accessible_dma_address,
GFP_KERNEL | __GFP_ZERO);
......@@ -678,7 +660,7 @@ static int goya_sw_init(struct hl_device *hdev)
dev_err(hdev->dev,
"Failed to create CPU accessible DMA pool\n");
rc = -ENOMEM;
goto free_cpu_pq_dma_mem;
goto free_cpu_dma_mem;
}
rc = gen_pool_add(hdev->cpu_accessible_dma_pool,
......@@ -688,17 +670,18 @@ static int goya_sw_init(struct hl_device *hdev)
dev_err(hdev->dev,
"Failed to add memory to CPU accessible DMA pool\n");
rc = -EFAULT;
goto free_cpu_pq_pool;
goto free_cpu_accessible_dma_pool;
}
spin_lock_init(&goya->hw_queues_lock);
return 0;
free_cpu_pq_pool:
free_cpu_accessible_dma_pool:
gen_pool_destroy(hdev->cpu_accessible_dma_pool);
free_cpu_pq_dma_mem:
hdev->asic_funcs->dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE,
free_cpu_dma_mem:
hdev->asic_funcs->asic_dma_free_coherent(hdev,
HL_CPU_ACCESSIBLE_MEM_SIZE,
hdev->cpu_accessible_dma_mem,
hdev->cpu_accessible_dma_address);
free_dma_pool:
......@@ -721,7 +704,8 @@ static int goya_sw_fini(struct hl_device *hdev)
gen_pool_destroy(hdev->cpu_accessible_dma_pool);
hdev->asic_funcs->dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE,
hdev->asic_funcs->asic_dma_free_coherent(hdev,
HL_CPU_ACCESSIBLE_MEM_SIZE,
hdev->cpu_accessible_dma_mem,
hdev->cpu_accessible_dma_address);
......@@ -815,11 +799,10 @@ static void goya_init_dma_ch(struct hl_device *hdev, int dma_id)
* Initialize the H/W registers of the QMAN DMA channels
*
*/
static void goya_init_dma_qmans(struct hl_device *hdev)
void goya_init_dma_qmans(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct hl_hw_queue *q;
dma_addr_t bus_address;
int i;
if (goya->hw_cap_initialized & HW_CAP_DMA)
......@@ -828,10 +811,7 @@ static void goya_init_dma_qmans(struct hl_device *hdev)
q = &hdev->kernel_queues[0];
for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) {
bus_address = q->bus_address +
hdev->asic_prop.host_phys_base_address;
goya_init_dma_qman(hdev, i, bus_address);
goya_init_dma_qman(hdev, i, q->bus_address);
goya_init_dma_ch(hdev, i);
}
......@@ -968,11 +948,10 @@ static int goya_stop_external_queues(struct hl_device *hdev)
* Returns 0 on success
*
*/
static int goya_init_cpu_queues(struct hl_device *hdev)
int goya_init_cpu_queues(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct hl_eq *eq;
dma_addr_t bus_address;
u32 status;
struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
int err;
......@@ -985,19 +964,18 @@ static int goya_init_cpu_queues(struct hl_device *hdev)
eq = &hdev->event_queue;
bus_address = cpu_pq->bus_address +
hdev->asic_prop.host_phys_base_address;
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0,
lower_32_bits(cpu_pq->bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1,
upper_32_bits(cpu_pq->bus_address));
bus_address = eq->bus_address + hdev->asic_prop.host_phys_base_address;
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(eq->bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address));
bus_address = hdev->cpu_accessible_dma_address +
hdev->asic_prop.host_phys_base_address;
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8,
lower_32_bits(hdev->cpu_accessible_dma_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9,
upper_32_bits(hdev->cpu_accessible_dma_address));
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES);
......@@ -1549,7 +1527,7 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE);
}
static void goya_init_mme_qmans(struct hl_device *hdev)
void goya_init_mme_qmans(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
u32 so_base_lo, so_base_hi;
......@@ -1656,7 +1634,7 @@ static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id)
WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE);
}
static void goya_init_tpc_qmans(struct hl_device *hdev)
void goya_init_tpc_qmans(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
u32 so_base_lo, so_base_hi;
......@@ -2225,11 +2203,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
* Before pushing u-boot/linux to device, need to set the ddr bar to
* base address of dram
*/
rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE);
if (rc) {
if (goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) {
dev_err(hdev->dev,
"failed to map DDR bar to DRAM base address\n");
return rc;
return -EIO;
}
if (hdev->pldm) {
......@@ -2373,7 +2350,7 @@ static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid,
return 0;
}
static int goya_mmu_init(struct hl_device *hdev)
int goya_mmu_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct goya_device *goya = hdev->asic_specific;
......@@ -2464,12 +2441,12 @@ static int goya_hw_init(struct hl_device *hdev)
* After CPU initialization is finished, change DDR bar mapping inside
* iATU to point to the start address of the MMU page tables
*/
rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE +
(MMU_PAGE_TABLES_ADDR & ~(prop->dram_pci_bar_size - 0x1ull)));
if (rc) {
if (goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE +
(MMU_PAGE_TABLES_ADDR &
~(prop->dram_pci_bar_size - 0x1ull))) == U64_MAX) {
dev_err(hdev->dev,
"failed to map DDR bar to MMU page tables\n");
return rc;
return -EIO;
}
rc = goya_mmu_init(hdev);
......@@ -2649,7 +2626,7 @@ static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma,
return rc;
}
static void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
{
u32 db_reg_offset, db_value;
bool invalid_queue = false;
......@@ -2747,13 +2724,23 @@ void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags)
{
return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags);
void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size,
dma_handle, flags);
/* Shift to the device's base physical address of host memory */
if (kernel_addr)
*dma_handle += HOST_PHYS_BASE;
return kernel_addr;
}
static void goya_dma_free_coherent(struct hl_device *hdev, size_t size,
void *cpu_addr, dma_addr_t dma_handle)
{
dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
/* Cancel the device's base physical address of host memory */
dma_addr_t fixed_dma_handle = dma_handle - HOST_PHYS_BASE;
dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
}
void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
......@@ -2816,7 +2803,6 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
{
struct goya_device *goya = hdev->asic_specific;
struct packet_msg_prot *fence_pkt;
u32 *fence_ptr;
dma_addr_t fence_dma_addr;
......@@ -2837,7 +2823,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
return -EBUSY;
}
fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
fence_ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL,
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
......@@ -2847,7 +2833,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
*fence_ptr = 0;
goya->qman0_set_security(hdev, true);
goya_qman0_set_security(hdev, true);
/*
* goya cs parser saves space for 2xpacket_msg_prot at end of CB. For
......@@ -2865,8 +2851,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
(1 << GOYA_PKT_CTL_MB_SHIFT);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(GOYA_QMAN0_FENCE_VAL);
fence_pkt->addr = cpu_to_le64(fence_dma_addr +
hdev->asic_prop.host_phys_base_address);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
job->job_cb_size, cb->bus_address);
......@@ -2886,10 +2871,10 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
}
free_fence_ptr:
hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr,
fence_dma_addr);
goya->qman0_set_security(hdev, false);
goya_qman0_set_security(hdev, false);
return rc;
}
......@@ -2920,7 +2905,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
fence_val = GOYA_QMAN0_FENCE_VAL;
fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
fence_ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL,
&fence_dma_addr);
if (!fence_ptr) {
dev_err(hdev->dev,
......@@ -2930,7 +2915,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
*fence_ptr = 0;
fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev,
fence_pkt = hdev->asic_funcs->asic_dma_pool_zalloc(hdev,
sizeof(struct packet_msg_prot),
GFP_KERNEL, &pkt_dma_addr);
if (!fence_pkt) {
......@@ -2945,8 +2930,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
(1 << GOYA_PKT_CTL_MB_SHIFT);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(fence_val);
fence_pkt->addr = cpu_to_le64(fence_dma_addr +
hdev->asic_prop.host_phys_base_address);
fence_pkt->addr = cpu_to_le64(fence_dma_addr);
rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id,
sizeof(struct packet_msg_prot),
......@@ -2974,10 +2958,10 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
}
free_pkt:
hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt,
hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_pkt,
pkt_dma_addr);
free_fence_ptr:
hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr,
hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr,
fence_dma_addr);
return rc;
}
......@@ -3018,16 +3002,27 @@ int goya_test_queues(struct hl_device *hdev)
static void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size,
gfp_t mem_flags, dma_addr_t *dma_handle)
{
void *kernel_addr;
if (size > GOYA_DMA_POOL_BLK_SIZE)
return NULL;
return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
kernel_addr = dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
/* Shift to the device's base physical address of host memory */
if (kernel_addr)
*dma_handle += HOST_PHYS_BASE;
return kernel_addr;
}
static void goya_dma_pool_free(struct hl_device *hdev, void *vaddr,
dma_addr_t dma_addr)
{
dma_pool_free(hdev->dma_pool, vaddr, dma_addr);
/* Cancel the device's base physical address of host memory */
dma_addr_t fixed_dma_addr = dma_addr - HOST_PHYS_BASE;
dma_pool_free(hdev->dma_pool, vaddr, fixed_dma_addr);
}
void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
......@@ -3042,19 +3037,33 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
hl_fw_cpu_accessible_dma_pool_free(hdev, size, vaddr);
}
static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg,
static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir)
{
if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
struct scatterlist *sg;
int i;
if (!dma_map_sg(&hdev->pdev->dev, sgl, nents, dir))
return -ENOMEM;
/* Shift to the device's base physical address of host memory */
for_each_sg(sgl, sg, nents, i)
sg->dma_address += HOST_PHYS_BASE;
return 0;
}
static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir)
{
dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
struct scatterlist *sg;
int i;
/* Cancel the device's base physical address of host memory */
for_each_sg(sgl, sg, nents, i)
sg->dma_address -= HOST_PHYS_BASE;
dma_unmap_sg(&hdev->pdev->dev, sgl, nents, dir);
}
u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt)
......@@ -3204,31 +3213,29 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev,
return -EFAULT;
}
if (parser->ctx_id != HL_KERNEL_ASID_ID) {
if (sram_addr) {
if (!hl_mem_area_inside_range(device_memory_addr,
le32_to_cpu(user_dma_pkt->tsize),
hdev->asic_prop.sram_user_base_address,
hdev->asic_prop.sram_end_address)) {
if (sram_addr) {
if (!hl_mem_area_inside_range(device_memory_addr,
le32_to_cpu(user_dma_pkt->tsize),
hdev->asic_prop.sram_user_base_address,
hdev->asic_prop.sram_end_address)) {
dev_err(hdev->dev,
"SRAM address 0x%llx + 0x%x is invalid\n",
device_memory_addr,
user_dma_pkt->tsize);
return -EFAULT;
}
} else {
if (!hl_mem_area_inside_range(device_memory_addr,
le32_to_cpu(user_dma_pkt->tsize),
hdev->asic_prop.dram_user_base_address,
hdev->asic_prop.dram_end_address)) {
dev_err(hdev->dev,
"DRAM address 0x%llx + 0x%x is invalid\n",
device_memory_addr,
user_dma_pkt->tsize);
return -EFAULT;
}
dev_err(hdev->dev,
"SRAM address 0x%llx + 0x%x is invalid\n",
device_memory_addr,
user_dma_pkt->tsize);
return -EFAULT;
}
} else {
if (!hl_mem_area_inside_range(device_memory_addr,
le32_to_cpu(user_dma_pkt->tsize),
hdev->asic_prop.dram_user_base_address,
hdev->asic_prop.dram_end_address)) {
dev_err(hdev->dev,
"DRAM address 0x%llx + 0x%x is invalid\n",
device_memory_addr,
user_dma_pkt->tsize);
return -EFAULT;
}
}
......@@ -3606,8 +3613,6 @@ static int goya_patch_dma_packet(struct hl_device *hdev,
new_dma_pkt->ctl = cpu_to_le32(ctl);
new_dma_pkt->tsize = cpu_to_le32((u32) len);
dma_addr += hdev->asic_prop.host_phys_base_address;
if (dir == DMA_TO_DEVICE) {
new_dma_pkt->src_addr = cpu_to_le64(dma_addr);
new_dma_pkt->dst_addr = cpu_to_le64(device_memory_addr);
......@@ -3858,36 +3863,35 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev,
return rc;
}
static int goya_parse_cb_no_ext_quque(struct hl_device *hdev,
static int goya_parse_cb_no_ext_queue(struct hl_device *hdev,
struct hl_cs_parser *parser)
{
struct asic_fixed_properties *asic_prop = &hdev->asic_prop;
struct goya_device *goya = hdev->asic_specific;
if (!(goya->hw_cap_initialized & HW_CAP_MMU)) {
/* For internal queue jobs, just check if cb address is valid */
if (hl_mem_area_inside_range(
(u64) (uintptr_t) parser->user_cb,
parser->user_cb_size,
asic_prop->sram_user_base_address,
asic_prop->sram_end_address))
return 0;
if (goya->hw_cap_initialized & HW_CAP_MMU)
return 0;
if (hl_mem_area_inside_range(
(u64) (uintptr_t) parser->user_cb,
parser->user_cb_size,
asic_prop->dram_user_base_address,
asic_prop->dram_end_address))
return 0;
/* For internal queue jobs, just check if CB address is valid */
if (hl_mem_area_inside_range(
(u64) (uintptr_t) parser->user_cb,
parser->user_cb_size,
asic_prop->sram_user_base_address,
asic_prop->sram_end_address))
return 0;
dev_err(hdev->dev,
"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
parser->user_cb, parser->user_cb_size);
if (hl_mem_area_inside_range(
(u64) (uintptr_t) parser->user_cb,
parser->user_cb_size,
asic_prop->dram_user_base_address,
asic_prop->dram_end_address))
return 0;
return -EFAULT;
}
dev_err(hdev->dev,
"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
parser->user_cb, parser->user_cb_size);
return 0;
return -EFAULT;
}
int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
......@@ -3895,9 +3899,9 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
struct goya_device *goya = hdev->asic_specific;
if (!parser->ext_queue)
return goya_parse_cb_no_ext_quque(hdev, parser);
return goya_parse_cb_no_ext_queue(hdev, parser);
if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr)
if (goya->hw_cap_initialized & HW_CAP_MMU)
return goya_parse_cb_mmu(hdev, parser);
else
return goya_parse_cb_no_mmu(hdev, parser);
......@@ -3928,12 +3932,12 @@ void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
cq_pkt->addr = cpu_to_le64(CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF);
}
static void goya_update_eq_ci(struct hl_device *hdev, u32 val)
void goya_update_eq_ci(struct hl_device *hdev, u32 val)
{
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val);
}
static void goya_restore_phase_topology(struct hl_device *hdev)
void goya_restore_phase_topology(struct hl_device *hdev)
{
int i, num_of_sob_in_longs, num_of_mon_in_longs;
......@@ -3970,6 +3974,7 @@ static void goya_restore_phase_topology(struct hl_device *hdev)
static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 ddr_bar_addr;
int rc = 0;
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
......@@ -3987,15 +3992,16 @@ static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
rc = goya_set_ddr_bar_base(hdev, bar_base_addr);
if (!rc) {
ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
if (ddr_bar_addr != U64_MAX) {
*val = readl(hdev->pcie_bar[DDR_BAR_ID] +
(addr - bar_base_addr));
rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE +
(MMU_PAGE_TABLES_ADDR &
~(prop->dram_pci_bar_size - 0x1ull)));
ddr_bar_addr = goya_set_ddr_bar_base(hdev,
ddr_bar_addr);
}
if (ddr_bar_addr == U64_MAX)
rc = -EIO;
} else {
rc = -EFAULT;
}
......@@ -4020,6 +4026,7 @@ static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 ddr_bar_addr;
int rc = 0;
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
......@@ -4037,15 +4044,16 @@ static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
rc = goya_set_ddr_bar_base(hdev, bar_base_addr);
if (!rc) {
ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr);
if (ddr_bar_addr != U64_MAX) {
writel(val, hdev->pcie_bar[DDR_BAR_ID] +
(addr - bar_base_addr));
rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE +
(MMU_PAGE_TABLES_ADDR &
~(prop->dram_pci_bar_size - 0x1ull)));
ddr_bar_addr = goya_set_ddr_bar_base(hdev,
ddr_bar_addr);
}
if (ddr_bar_addr == U64_MAX)
rc = -EIO;
} else {
rc = -EFAULT;
}
......@@ -4414,7 +4422,6 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u32 size,
u64 val, bool is_dram)
{
struct packet_lin_dma *lin_dma_pkt;
struct hl_cs_parser parser;
struct hl_cs_job *job;
u32 cb_size, ctl;
struct hl_cb *cb;
......@@ -4454,36 +4461,16 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u32 size,
job->user_cb->cs_cnt++;
job->user_cb_size = cb_size;
job->hw_queue_id = GOYA_QUEUE_ID_DMA_0;
job->patched_cb = job->user_cb;
job->job_cb_size = job->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
hl_debugfs_add_job(hdev, job);
parser.ctx_id = HL_KERNEL_ASID_ID;
parser.cs_sequence = 0;
parser.job_id = job->id;
parser.hw_queue_id = job->hw_queue_id;
parser.job_userptr_list = &job->userptr_list;
parser.user_cb = job->user_cb;
parser.user_cb_size = job->user_cb_size;
parser.ext_queue = job->ext_queue;
parser.use_virt_addr = hdev->mmu_enable;
rc = hdev->asic_funcs->cs_parser(hdev, &parser);
if (rc) {
dev_err(hdev->dev, "Failed to parse kernel CB\n");
goto free_job;
}
job->patched_cb = parser.patched_cb;
job->job_cb_size = parser.patched_cb_size;
job->patched_cb->cs_cnt++;
rc = goya_send_job_on_qman0(hdev, job);
job->patched_cb->cs_cnt--;
hl_cb_put(job->patched_cb);
free_job:
hl_userptr_delete_list(hdev, &job->userptr_list);
hl_debugfs_remove_job(hdev, job);
kfree(job);
cb->cs_cnt--;
......@@ -4495,7 +4482,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u32 size,
return rc;
}
static int goya_context_switch(struct hl_device *hdev, u32 asid)
int goya_context_switch(struct hl_device *hdev, u32 asid)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 addr = prop->sram_base_address;
......@@ -4557,7 +4544,7 @@ void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
/* zero the MMBP and ASID bits and then set the ASID */
for (i = 0 ; i < GOYA_MMU_REGS_NUM ; i++)
goya->mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
}
static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
......@@ -4792,12 +4779,12 @@ static const struct hl_asic_funcs goya_funcs = {
.cb_mmap = goya_cb_mmap,
.ring_doorbell = goya_ring_doorbell,
.flush_pq_write = goya_flush_pq_write,
.dma_alloc_coherent = goya_dma_alloc_coherent,
.dma_free_coherent = goya_dma_free_coherent,
.asic_dma_alloc_coherent = goya_dma_alloc_coherent,
.asic_dma_free_coherent = goya_dma_free_coherent,
.get_int_queue_base = goya_get_int_queue_base,
.test_queues = goya_test_queues,
.dma_pool_zalloc = goya_dma_pool_zalloc,
.dma_pool_free = goya_dma_pool_free,
.asic_dma_pool_zalloc = goya_dma_pool_zalloc,
.asic_dma_pool_free = goya_dma_pool_free,
.cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc,
.cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free,
.hl_dma_unmap_sg = goya_dma_unmap_sg,
......@@ -4830,7 +4817,9 @@ static const struct hl_asic_funcs goya_funcs = {
.get_hw_state = goya_get_hw_state,
.pci_bars_map = goya_pci_bars_map,
.set_dram_bar_base = goya_set_ddr_bar_base,
.init_iatu = goya_init_iatu
.init_iatu = goya_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg
};
/*
......
......@@ -147,9 +147,6 @@ enum goya_fw_component {
};
struct goya_device {
void (*mmu_prepare_reg)(struct hl_device *hdev, u64 reg, u32 asid);
void (*qman0_set_security)(struct hl_device *hdev, bool secure);
/* TODO: remove hw_queues_lock after moving to scheduler code */
spinlock_t hw_queues_lock;
......@@ -162,13 +159,34 @@ struct goya_device {
u32 hw_cap_initialized;
};
void goya_get_fixed_properties(struct hl_device *hdev);
int goya_mmu_init(struct hl_device *hdev);
void goya_init_dma_qmans(struct hl_device *hdev);
void goya_init_mme_qmans(struct hl_device *hdev);
void goya_init_tpc_qmans(struct hl_device *hdev);
int goya_init_cpu_queues(struct hl_device *hdev);
void goya_init_security(struct hl_device *hdev);
int goya_late_init(struct hl_device *hdev);
void goya_late_fini(struct hl_device *hdev);
void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val);
void goya_update_eq_ci(struct hl_device *hdev, u32 val);
void goya_restore_phase_topology(struct hl_device *hdev);
int goya_context_switch(struct hl_device *hdev, u32 asid);
int goya_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus,
u8 i2c_addr, u8 i2c_reg, u32 *val);
int goya_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus,
u8 i2c_addr, u8 i2c_reg, u32 val);
void goya_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state);
int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id);
int goya_test_queues(struct hl_device *hdev);
int goya_test_cpu_queue(struct hl_device *hdev);
int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
u32 timeout, long *result);
long goya_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr);
long goya_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr);
long goya_get_current(struct hl_device *hdev, int sensor_index, u32 attr);
......@@ -176,33 +194,31 @@ long goya_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr);
long goya_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr);
void goya_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
long value);
void goya_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state);
u64 goya_get_max_power(struct hl_device *hdev);
void goya_set_max_power(struct hl_device *hdev, u64 value);
void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
void goya_add_device_attr(struct hl_device *hdev,
struct attribute_group *dev_attr_grp);
int goya_armcp_info_get(struct hl_device *hdev);
void goya_init_security(struct hl_device *hdev);
int goya_debug_coresight(struct hl_device *hdev, void *data);
u64 goya_get_max_power(struct hl_device *hdev);
void goya_set_max_power(struct hl_device *hdev, u64 value);
int goya_test_queues(struct hl_device *hdev);
void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
int goya_mmu_clear_pgt_range(struct hl_device *hdev);
int goya_mmu_set_dram_default_page(struct hl_device *hdev);
void goya_late_fini(struct hl_device *hdev);
int goya_suspend(struct hl_device *hdev);
int goya_resume(struct hl_device *hdev);
void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val);
void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry);
void *goya_get_events_stat(struct hl_device *hdev, u32 *size);
void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr,
u32 cq_val, u32 msix_vec);
int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser);
void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
dma_addr_t *dma_handle, u16 *queue_len);
dma_addr_t *dma_handle, u16 *queue_len);
u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt);
int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id);
int goya_send_heartbeat(struct hl_device *hdev);
void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle);
......
......@@ -459,10 +459,14 @@ static int goya_config_bmon(struct hl_device *hdev,
if (!input)
return -EINVAL;
WREG32(base_reg + 0x208, lower_32_bits(input->addr_range0));
WREG32(base_reg + 0x20C, upper_32_bits(input->addr_range0));
WREG32(base_reg + 0x248, lower_32_bits(input->addr_range1));
WREG32(base_reg + 0x24C, upper_32_bits(input->addr_range1));
WREG32(base_reg + 0x200, lower_32_bits(input->start_addr0));
WREG32(base_reg + 0x204, upper_32_bits(input->start_addr0));
WREG32(base_reg + 0x208, lower_32_bits(input->addr_mask0));
WREG32(base_reg + 0x20C, upper_32_bits(input->addr_mask0));
WREG32(base_reg + 0x240, lower_32_bits(input->start_addr1));
WREG32(base_reg + 0x244, upper_32_bits(input->start_addr1));
WREG32(base_reg + 0x248, lower_32_bits(input->addr_mask1));
WREG32(base_reg + 0x24C, upper_32_bits(input->addr_mask1));
WREG32(base_reg + 0x224, 0);
WREG32(base_reg + 0x234, 0);
WREG32(base_reg + 0x30C, input->bw_win);
......@@ -482,8 +486,12 @@ static int goya_config_bmon(struct hl_device *hdev,
WREG32(base_reg + 0x100, 0x11);
WREG32(base_reg + 0x304, 0x1);
} else {
WREG32(base_reg + 0x200, 0);
WREG32(base_reg + 0x204, 0);
WREG32(base_reg + 0x208, 0xFFFFFFFF);
WREG32(base_reg + 0x20C, 0xFFFFFFFF);
WREG32(base_reg + 0x240, 0);
WREG32(base_reg + 0x244, 0);
WREG32(base_reg + 0x248, 0xFFFFFFFF);
WREG32(base_reg + 0x24C, 0xFFFFFFFF);
WREG32(base_reg + 0x224, 0xFFFFFFFF);
......
......@@ -11,8 +11,6 @@
#include "include/armcp_if.h"
#include "include/qman_if.h"
#define pr_fmt(fmt) "habanalabs: " fmt
#include <linux/cdev.h>
#include <linux/iopoll.h>
#include <linux/irqreturn.h>
......@@ -137,8 +135,6 @@ enum hl_device_hw_state {
* @dram_user_base_address: DRAM physical start address for user access.
* @dram_size: DRAM total size.
* @dram_pci_bar_size: size of PCI bar towards DRAM.
* @host_phys_base_address: base physical address of host memory for
* transactions that the device generates.
* @max_power_default: max power of the device after reset
* @va_space_host_start_address: base address of virtual memory range for
* mapping host memory.
......@@ -186,7 +182,6 @@ struct asic_fixed_properties {
u64 dram_user_base_address;
u64 dram_size;
u64 dram_pci_bar_size;
u64 host_phys_base_address;
u64 max_power_default;
u64 va_space_host_start_address;
u64 va_space_host_end_address;
......@@ -323,6 +318,18 @@ struct hl_cs_job;
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
#define HL_CPU_PKT_SHIFT 5
#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT)
#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1))
#define HL_CPU_MAX_PKTS_IN_CB 32
#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \
HL_CPU_MAX_PKTS_IN_CB)
#define HL_CPU_CB_QUEUE_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE)
/* KMD <-> ArmCP shared memory size (EQ + PQ + CPU CB queue) */
#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_EQ_SIZE_IN_BYTES + \
HL_QUEUE_SIZE_IN_BYTES + \
HL_CPU_CB_QUEUE_SIZE)
/**
* struct hl_hw_queue - describes a H/W transport queue.
......@@ -443,19 +450,19 @@ enum hl_pll_frequency {
* @cb_mmap: maps a CB.
* @ring_doorbell: increment PI on a given QMAN.
* @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed.
* @dma_alloc_coherent: Allocate coherent DMA memory by calling
* dma_alloc_coherent(). This is ASIC function because its
* implementation is not trivial when the driver is loaded
* in simulation mode (not upstreamed).
* @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent().
* This is ASIC function because its implementation is not
* trivial when the driver is loaded in simulation mode
* (not upstreamed).
* @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling
* dma_alloc_coherent(). This is ASIC function because
* its implementation is not trivial when the driver
* is loaded in simulation mode (not upstreamed).
* @asic_dma_free_coherent: Free coherent DMA memory by calling
* dma_free_coherent(). This is ASIC function because
* its implementation is not trivial when the driver
* is loaded in simulation mode (not upstreamed).
* @get_int_queue_base: get the internal queue base address.
* @test_queues: run simple test on all queues for sanity check.
* @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
* size of allocation is HL_DMA_POOL_BLK_SIZE.
* @dma_pool_free: free small DMA allocation from pool.
* @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
* size of allocation is HL_DMA_POOL_BLK_SIZE.
* @asic_dma_pool_free: free small DMA allocation from pool.
* @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
* @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
* @hl_dma_unmap_sg: DMA unmap scatter-gather list.
......@@ -489,8 +496,11 @@ enum hl_pll_frequency {
* @send_cpu_message: send buffer to ArmCP.
* @get_hw_state: retrieve the H/W state
* @pci_bars_map: Map PCI BARs.
* @set_dram_bar_base: Set DRAM BAR to map specific device address.
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
* old address the bar pointed to or U64_MAX for failure
* @init_iatu: Initialize the iATU unit inside the PCI controller.
* @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support.
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
......@@ -508,27 +518,27 @@ struct hl_asic_funcs {
u64 kaddress, phys_addr_t paddress, u32 size);
void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, gfp_t flag);
void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
void *cpu_addr, dma_addr_t dma_handle);
void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
dma_addr_t *dma_handle, u16 *queue_len);
int (*test_queues)(struct hl_device *hdev);
void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size,
void* (*asic_dma_pool_zalloc)(struct hl_device *hdev, size_t size,
gfp_t mem_flags, dma_addr_t *dma_handle);
void (*dma_pool_free)(struct hl_device *hdev, void *vaddr,
void (*asic_dma_pool_free)(struct hl_device *hdev, void *vaddr,
dma_addr_t dma_addr);
void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev,
size_t size, dma_addr_t *dma_handle);
void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
size_t size, void *vaddr);
void (*hl_dma_unmap_sg)(struct hl_device *hdev,
struct scatterlist *sg, int nents,
struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
int (*asic_dma_map_sg)(struct hl_device *hdev,
struct scatterlist *sg, int nents,
struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
struct sg_table *sgt);
......@@ -564,8 +574,10 @@ struct hl_asic_funcs {
u16 len, u32 timeout, long *result);
enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
int (*pci_bars_map)(struct hl_device *hdev);
int (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
int (*init_iatu)(struct hl_device *hdev);
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
};
......@@ -613,12 +625,13 @@ struct hl_va_range {
* DRAM mapping.
* @cs_lock: spinlock to protect cs_sequence.
* @dram_phys_mem: amount of used physical DRAM memory by this context.
* @thread_restore_token: token to prevent multiple threads of the same context
* from running the restore phase. Only one thread
* should run it.
* @thread_restore_wait_token: token to prevent the threads that didn't run
* the restore phase from moving to their execution
* phase before the restore phase has finished.
* @thread_ctx_switch_token: token to prevent multiple threads of the same
* context from running the context switch phase.
* Only a single thread should run it.
* @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
* the context switch phase from moving to their
* execution phase before the context switch phase
* has finished.
* @asid: context's unique address space ID in the device's MMU.
*/
struct hl_ctx {
......@@ -638,8 +651,8 @@ struct hl_ctx {
u64 *dram_default_hops;
spinlock_t cs_lock;
atomic64_t dram_phys_mem;
atomic_t thread_restore_token;
u32 thread_restore_wait_token;
atomic_t thread_ctx_switch_token;
u32 thread_ctx_switch_wait_token;
u32 asid;
};
......@@ -766,8 +779,6 @@ struct hl_cs_job {
* @patched_cb_size: the size of the CB after parsing.
* @ext_queue: whether the job is for external queue or internal queue.
* @job_id: the id of the related job inside the related CS.
* @use_virt_addr: whether to treat the addresses in the CB as virtual during
* parsing.
*/
struct hl_cs_parser {
struct hl_cb *user_cb;
......@@ -780,7 +791,6 @@ struct hl_cs_parser {
u32 patched_cb_size;
u8 ext_queue;
u8 job_id;
u8 use_virt_addr;
};
......@@ -1009,13 +1019,10 @@ struct hl_dbg_device_entry {
u32 hl_rreg(struct hl_device *hdev, u32 reg);
void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us)
#define RREG32(reg) hl_rreg(hdev, (reg))
#define WREG32(reg, v) hl_wreg(hdev, (reg), (v))
#define RREG32(reg) hdev->asic_funcs->rreg(hdev, (reg))
#define WREG32(reg, v) hdev->asic_funcs->wreg(hdev, (reg), (v))
#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \
hl_rreg(hdev, (reg)))
hdev->asic_funcs->rreg(hdev, (reg)))
#define WREG32_P(reg, val, mask) \
do { \
......@@ -1033,6 +1040,30 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
(val) << REG_FIELD_SHIFT(reg, field))
#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
({ \
ktime_t __timeout; \
/* timeout should be longer when working with simulator */ \
if (hdev->pdev) \
__timeout = ktime_add_us(ktime_get(), timeout_us); \
else \
__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
might_sleep_if(sleep_us); \
for (;;) { \
(val) = RREG32(addr); \
if (cond) \
break; \
if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
(val) = RREG32(addr); \
break; \
} \
if (sleep_us) \
usleep_range((sleep_us >> 2) + 1, sleep_us); \
} \
(cond) ? 0 : -ETIMEDOUT; \
})
#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \
if (buf) \
snprintf(buf, size, fmt, ##__VA_ARGS__); \
......@@ -1418,7 +1449,8 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
u64 addr);
int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
u64 dram_base_address, u64 host_phys_size);
u64 dram_base_address, u64 host_phys_base_address,
u64 host_phys_size);
int hl_pci_init(struct hl_device *hdev, u8 dma_mask);
void hl_pci_fini(struct hl_device *hdev);
int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
......
......@@ -6,6 +6,8 @@
*
*/
#define pr_fmt(fmt) "habanalabs: " fmt
#include "habanalabs.h"
#include <linux/pci.h>
......
......@@ -82,7 +82,7 @@ static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
bd += hl_pi_2_offset(q->pi);
bd->ctl = __cpu_to_le32(ctl);
bd->len = __cpu_to_le32(len);
bd->ptr = __cpu_to_le64(ptr + hdev->asic_prop.host_phys_base_address);
bd->ptr = __cpu_to_le64(ptr);
q->pi = hl_queue_inc_ptr(q->pi);
hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
......@@ -263,9 +263,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
* checked in hl_queue_sanity_checks
*/
cq = &hdev->completion_queue[q->hw_queue_id];
cq_addr = cq->bus_address +
hdev->asic_prop.host_phys_base_address;
cq_addr += cq->pi * sizeof(struct hl_cq_entry);
cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
cq_addr,
......@@ -415,14 +413,20 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
}
static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
struct hl_hw_queue *q)
struct hl_hw_queue *q, bool is_cpu_queue)
{
void *p;
int rc;
p = hdev->asic_funcs->dma_alloc_coherent(hdev,
HL_QUEUE_SIZE_IN_BYTES,
&q->bus_address, GFP_KERNEL | __GFP_ZERO);
if (is_cpu_queue)
p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
HL_QUEUE_SIZE_IN_BYTES,
&q->bus_address);
else
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
HL_QUEUE_SIZE_IN_BYTES,
&q->bus_address,
GFP_KERNEL | __GFP_ZERO);
if (!p)
return -ENOMEM;
......@@ -446,8 +450,15 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
return 0;
free_queue:
hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address, q->bus_address);
if (is_cpu_queue)
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address);
else
hdev->asic_funcs->asic_dma_free_coherent(hdev,
HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address,
q->bus_address);
return rc;
}
......@@ -474,12 +485,12 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
return ext_and_cpu_hw_queue_init(hdev, q);
return ext_and_cpu_hw_queue_init(hdev, q, true);
}
static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
return ext_and_cpu_hw_queue_init(hdev, q);
return ext_and_cpu_hw_queue_init(hdev, q, false);
}
/*
......@@ -569,8 +580,15 @@ static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
kfree(q->shadow_queue);
hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address, q->bus_address);
if (q->queue_type == QUEUE_TYPE_CPU)
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address);
else
hdev->asic_funcs->asic_dma_free_coherent(hdev,
HL_QUEUE_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address,
q->bus_address);
}
int hl_hw_queues_create(struct hl_device *hdev)
......
......@@ -300,14 +300,6 @@ enum armcp_pwm_attributes {
armcp_pwm_enable
};
#define HL_CPU_PKT_SHIFT 5
#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT)
#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1))
#define HL_CPU_MAX_PKTS_IN_CB 32
#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \
HL_CPU_MAX_PKTS_IN_CB)
#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE)
/* Event Queue Packets */
struct eq_generic_event {
......
......@@ -222,7 +222,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
&q->bus_address, GFP_KERNEL | __GFP_ZERO);
if (!p)
return -ENOMEM;
......@@ -248,7 +248,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
*/
void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q)
{
hdev->asic_funcs->dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address, q->bus_address);
}
......@@ -284,8 +284,9 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q)
BUILD_BUG_ON(HL_EQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_EQ_SIZE_IN_BYTES,
&q->bus_address, GFP_KERNEL | __GFP_ZERO);
p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
HL_EQ_SIZE_IN_BYTES,
&q->bus_address);
if (!p)
return -ENOMEM;
......@@ -308,8 +309,9 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q)
{
flush_workqueue(hdev->eq_wq);
hdev->asic_funcs->dma_free_coherent(hdev, HL_EQ_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address, q->bus_address);
hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
HL_EQ_SIZE_IN_BYTES,
(void *) (uintptr_t) q->kernel_address);
}
void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q)
......
......@@ -759,10 +759,6 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
for (i = 0 ; i < phys_pg_pack->npages ; i++) {
paddr = phys_pg_pack->pages[i];
/* For accessing the host we need to turn on bit 39 */
if (phys_pg_pack->created_from_userptr)
paddr += hdev->asic_prop.host_phys_base_address;
rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size);
if (rc) {
dev_err(hdev->dev,
......
......@@ -236,6 +236,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
* @hdev: Pointer to hl_device structure.
* @sram_base_address: SRAM base address.
* @dram_base_address: DRAM base address.
* @host_phys_base_address: Base physical address of host memory for device
* transactions.
* @host_phys_size: Size of host memory for device transactions.
*
* This is needed in case the firmware doesn't initialize the iATU.
......@@ -243,7 +245,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
* Return: 0 on success, negative value for failure.
*/
int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
u64 dram_base_address, u64 host_phys_size)
u64 dram_base_address, u64 host_phys_base_address,
u64 host_phys_size)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 host_phys_end_addr;
......@@ -259,14 +262,17 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
/* Point to DRAM */
if (!hdev->asic_funcs->set_dram_bar_base)
return -EINVAL;
rc |= hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address);
if (hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address) ==
U64_MAX)
return -EIO;
/* Outbound Region 0 - Point to Host */
host_phys_end_addr = prop->host_phys_base_address + host_phys_size - 1;
host_phys_end_addr = host_phys_base_address + host_phys_size - 1;
rc |= hl_pci_iatu_write(hdev, 0x008,
lower_32_bits(prop->host_phys_base_address));
lower_32_bits(host_phys_base_address));
rc |= hl_pci_iatu_write(hdev, 0x00C,
upper_32_bits(prop->host_phys_base_address));
upper_32_bits(host_phys_base_address));
rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr));
rc |= hl_pci_iatu_write(hdev, 0x014, 0);
rc |= hl_pci_iatu_write(hdev, 0x018, 0);
......
......@@ -374,9 +374,12 @@ struct hl_debug_params_stm {
};
struct hl_debug_params_bmon {
/* Transaction address filter */
__u64 addr_range0;
__u64 addr_range1;
/* Two address ranges that the user can request to filter */
__u64 start_addr0;
__u64 addr_mask0;
__u64 start_addr1;
__u64 addr_mask1;
/* Capture window configuration */
__u32 bw_win;
......