Commit f2b0b25a, authored by Alexey Brodkin, committed by Vineet Gupta

ARCv2: Support IO Coherency and permutations involving L1 and L2 caches

On an ARCv2 CPU the following configurations are possible,
each of which affects cache handling for data exchanged with peripherals
via DMA:
 [1] Only L1 cache exists
 [2] Both L1 and L2 exist, but no IO coherency unit
 [3] L1, L2 caches and IO coherency unit exist

The current implementation takes care of [1] and [2].
Moreover, support for [2] is implemented with a run-time check
for SLC existence, which is suboptimal.

This patch introduces support for [3] and reworks DMA ops
usage. Instead of doing a run-time check every time a particular
DMA op is executed, we now have 3 different implementations of
DMA ops and select the appropriate one during init.

To support IOC we need to:
 [a] Implement empty DMA ops, because IOC takes care of cache
     coherency with DMAed data
 [b] Route dma_alloc_coherent() via dma_alloc_noncoherent()
     This is required to make IOC work in the first place and also
     serves as an optimization, as LD/ST to coherent buffers can be
     serviced from caches w/o going all the way to memory
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
[vgupta:
  -Added some comments about IOC gains
  -Marked dma ops as static,
  -Massaged changelog a bit]
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
parent 2a440168
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#define ARC_REG_RTT_BCR 0xF2 #define ARC_REG_RTT_BCR 0xF2
#define ARC_REG_IRQ_BCR 0xF3 #define ARC_REG_IRQ_BCR 0xF3
#define ARC_REG_SMART_BCR 0xFF #define ARC_REG_SMART_BCR 0xFF
#define ARC_REG_CLUSTER_BCR 0xcf
/* status32 Bits Positions */ /* status32 Bits Positions */
#define STATUS_AE_BIT 5 /* Exception active */ #define STATUS_AE_BIT 5 /* Exception active */
......
...@@ -53,6 +53,8 @@ extern void arc_cache_init(void); ...@@ -53,6 +53,8 @@ extern void arc_cache_init(void);
extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
extern void read_decode_cache_bcr(void); extern void read_decode_cache_bcr(void);
extern int ioc_exists;
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
/* Instruction cache related Auxiliary registers */ /* Instruction cache related Auxiliary registers */
...@@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void); ...@@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void);
#define SLC_CTRL_BUSY 0x100 #define SLC_CTRL_BUSY 0x100
#define SLC_CTRL_RGN_OP_INV 0x200 #define SLC_CTRL_RGN_OP_INV 0x200
/* IO coherency related Auxiliary registers */
#define ARC_REG_IO_COH_ENABLE 0x500
#define ARC_REG_IO_COH_PARTIAL 0x501
#define ARC_REG_IO_COH_AP0_BASE 0x508
#define ARC_REG_IO_COH_AP0_SIZE 0x509
#endif /* _ASM_CACHE_H */ #endif /* _ASM_CACHE_H */
...@@ -22,10 +22,15 @@ ...@@ -22,10 +22,15 @@
#include <asm/setup.h> #include <asm/setup.h>
static int l2_line_sz; static int l2_line_sz;
int ioc_exists;
void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr,
unsigned long sz, const int cacheop); unsigned long sz, const int cacheop);
void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz);
void (*__dma_cache_inv)(unsigned long start, unsigned long sz);
void (*__dma_cache_wback)(unsigned long start, unsigned long sz);
char *arc_cache_mumbojumbo(int c, char *buf, int len) char *arc_cache_mumbojumbo(int c, char *buf, int len)
{ {
int n = 0; int n = 0;
...@@ -50,6 +55,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) ...@@ -50,6 +55,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len)
n += scnprintf(buf + n, len - n, n += scnprintf(buf + n, len - n,
"SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len);
if (ioc_exists)
n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n");
return buf; return buf;
} }
...@@ -80,6 +88,14 @@ void read_decode_cache_bcr(void) ...@@ -80,6 +88,14 @@ void read_decode_cache_bcr(void)
#endif #endif
} slc_cfg; } slc_cfg;
struct bcr_clust_cfg {
#ifdef CONFIG_CPU_BIG_ENDIAN
unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8;
#else
unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7;
#endif
} cbcr;
p_ic = &cpuinfo_arc700[cpu].icache; p_ic = &cpuinfo_arc700[cpu].icache;
READ_BCR(ARC_REG_IC_BCR, ibcr); READ_BCR(ARC_REG_IC_BCR, ibcr);
...@@ -133,6 +149,10 @@ void read_decode_cache_bcr(void) ...@@ -133,6 +149,10 @@ void read_decode_cache_bcr(void)
p_slc->sz_k = 128 << slc_cfg.sz; p_slc->sz_k = 128 << slc_cfg.sz;
l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64;
} }
READ_BCR(ARC_REG_CLUSTER_BCR, cbcr);
if (cbcr.c)
ioc_exists = 1;
} }
/* /*
...@@ -516,11 +536,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op) ...@@ -516,11 +536,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
#endif #endif
} }
static inline int need_slc_flush(void)
{
return is_isa_arcv2() && l2_line_sz;
}
/*********************************************************** /***********************************************************
* Exported APIs * Exported APIs
*/ */
...@@ -569,30 +584,74 @@ void flush_dcache_page(struct page *page) ...@@ -569,30 +584,74 @@ void flush_dcache_page(struct page *page)
} }
EXPORT_SYMBOL(flush_dcache_page); EXPORT_SYMBOL(flush_dcache_page);
void dma_cache_wback_inv(unsigned long start, unsigned long sz) /*
* DMA ops for systems with L1 cache only
* Make memory coherent with L1 cache by flushing/invalidating L1 lines
*/
static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz)
{ {
__dc_line_op_k(start, sz, OP_FLUSH_N_INV); __dc_line_op_k(start, sz, OP_FLUSH_N_INV);
}
if (need_slc_flush()) static void __dma_cache_inv_l1(unsigned long start, unsigned long sz)
slc_op(start, sz, OP_FLUSH_N_INV); {
__dc_line_op_k(start, sz, OP_INV);
} }
EXPORT_SYMBOL(dma_cache_wback_inv);
void dma_cache_inv(unsigned long start, unsigned long sz) static void __dma_cache_wback_l1(unsigned long start, unsigned long sz)
{
__dc_line_op_k(start, sz, OP_FLUSH);
}
/*
* DMA ops for systems with both L1 and L2 caches, but without IOC
* Both L1 and L2 lines need to be explicitly flushed/invalidated
*/
/*
 * Apply OP_FLUSH_N_INV to the region described by start/sz: first to L1
 * via __dc_line_op_k(), then to the SLC via slc_op().
 * arc_cache_init() installs this as __dma_cache_wback_inv when
 * is_isa_arcv2() && l2_line_sz holds and the IOC branch was not taken.
 */
static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz)
{
__dc_line_op_k(start, sz, OP_FLUSH_N_INV);
slc_op(start, sz, OP_FLUSH_N_INV);
}
static void __dma_cache_inv_slc(unsigned long start, unsigned long sz)
{ {
__dc_line_op_k(start, sz, OP_INV); __dc_line_op_k(start, sz, OP_INV);
slc_op(start, sz, OP_INV);
}
if (need_slc_flush()) static void __dma_cache_wback_slc(unsigned long start, unsigned long sz)
slc_op(start, sz, OP_INV); {
__dc_line_op_k(start, sz, OP_FLUSH);
slc_op(start, sz, OP_FLUSH);
}
/*
* DMA ops for systems with IOC
* IOC hardware snoops all DMA traffic keeping the caches consistent with
* memory - eliding need for any explicit cache maintenance of DMA buffers
*/
/*
 * Intentionally empty bodies. arc_cache_init() stores these in the
 * __dma_cache_wback_inv / __dma_cache_inv / __dma_cache_wback pointers
 * when is_isa_arcv2() && ioc_exists, turning the exported dma_cache_*()
 * calls into no-ops.
 */
static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {}
static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {}
static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {}
/*
* Exported DMA API
*/
/*
 * Exported entry point: forwards start/sz to whichever implementation
 * arc_cache_init() stored in the __dma_cache_wback_inv pointer (the _ioc,
 * _slc or _l1 variant).
 * NOTE(review): the pointer is defined without an initializer, so a call
 * made before arc_cache_init() would go through a NULL pointer - confirm
 * there are no such early callers.
 */
void dma_cache_wback_inv(unsigned long start, unsigned long sz)
{
__dma_cache_wback_inv(start, sz);
}
EXPORT_SYMBOL(dma_cache_wback_inv);
void dma_cache_inv(unsigned long start, unsigned long sz)
{
__dma_cache_inv(start, sz);
} }
EXPORT_SYMBOL(dma_cache_inv); EXPORT_SYMBOL(dma_cache_inv);
void dma_cache_wback(unsigned long start, unsigned long sz) void dma_cache_wback(unsigned long start, unsigned long sz)
{ {
__dc_line_op_k(start, sz, OP_FLUSH); __dma_cache_wback(start, sz);
if (need_slc_flush())
slc_op(start, sz, OP_FLUSH);
} }
EXPORT_SYMBOL(dma_cache_wback); EXPORT_SYMBOL(dma_cache_wback);
...@@ -848,4 +907,27 @@ void arc_cache_init(void) ...@@ -848,4 +907,27 @@ void arc_cache_init(void)
panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n"); panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n");
} }
} }
if (is_isa_arcv2() && ioc_exists) {
/* IO coherency base - 0x8z */
write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000);
/* IO coherency aperture size - 512MB: 0x8z-0xAz */
write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11);
/* Enable partial writes */
write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1);
/* Enable IO coherency */
write_aux_reg(ARC_REG_IO_COH_ENABLE, 1);
__dma_cache_wback_inv = __dma_cache_wback_inv_ioc;
__dma_cache_inv = __dma_cache_inv_ioc;
__dma_cache_wback = __dma_cache_wback_ioc;
} else if (is_isa_arcv2() && l2_line_sz) {
__dma_cache_wback_inv = __dma_cache_wback_inv_slc;
__dma_cache_inv = __dma_cache_inv_slc;
__dma_cache_wback = __dma_cache_wback_slc;
} else {
__dma_cache_wback_inv = __dma_cache_wback_inv_l1;
__dma_cache_inv = __dma_cache_inv_l1;
__dma_cache_wback = __dma_cache_wback_l1;
}
} }
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/dma-debug.h> #include <linux/dma-debug.h>
#include <linux/export.h> #include <linux/export.h>
#include <asm/cache.h>
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
/* /*
...@@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size, ...@@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size,
{ {
void *paddr, *kvaddr; void *paddr, *kvaddr;
/*
* IOC relies on all data (even coherent DMA data) being in cache
* Thus allocate normal cached memory
*
* The gains with IOC are two pronged:
* -For streaming data, elides needs for cache maintenance, saving
* cycles in flush code, and bus bandwidth as all the lines of a
* buffer need to be flushed out to memory
* -For coherent data, Read/Write to buffers terminate early in cache
* (vs. always going to memory - thus are faster)
*/
if (ioc_exists)
return dma_alloc_noncoherent(dev, size, dma_handle, gfp);
/* This is linear addr (0x8000_0000 based) */ /* This is linear addr (0x8000_0000 based) */
paddr = alloc_pages_exact(size, gfp); paddr = alloc_pages_exact(size, gfp);
if (!paddr) if (!paddr)
...@@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent); ...@@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, void dma_free_coherent(struct device *dev, size_t size, void *kvaddr,
dma_addr_t dma_handle) dma_addr_t dma_handle)
{ {
if (ioc_exists)
return dma_free_noncoherent(dev, size, kvaddr, dma_handle);
iounmap((void __force __iomem *)kvaddr); iounmap((void __force __iomem *)kvaddr);
free_pages_exact((void *)dma_handle, size); free_pages_exact((void *)dma_handle, size);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment