Commit 17f2c308 authored by Palmer Dabbelt

Merge patch series "riscv: enable EFFICIENT_UNALIGNED_ACCESS and DCACHE_WORD_ACCESS"

Jisheng Zhang <jszhang@kernel.org> says:

Some riscv implementations, such as T-HEAD's C906, C908, C910 and C920,
support efficient unaligned access. For performance reasons we want to
enable HAVE_EFFICIENT_UNALIGNED_ACCESS on these platforms. To avoid
performance regressions on platforms without efficient unaligned access,
HAVE_EFFICIENT_UNALIGNED_ACCESS can't be selected globally.

To solve this problem, runtime code patching based on the detected
unaligned-access speed would be a good solution. But that's not easy: it
involves a lot of work to modify various subsystems such as net, mm, lib
and so on. That can be done step by step.

So let's take an easier approach: add support for efficient unaligned
access and hide it behind NONPORTABLE.

Patch 1 introduces RISCV_EFFICIENT_UNALIGNED_ACCESS, which depends on
NONPORTABLE. If users know at configuration time that the kernel will only
run on hardware with efficient unaligned access, they can enable it.
Obviously, a generic unified kernel Image shouldn't enable it.
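
For example, on a build known to target only such hardware, the relevant
.config fragment would look like this (illustrative; NONPORTABLE already
exists and RISCV_EFFICIENT_UNALIGNED_ACCESS is added by this series):

 CONFIG_NONPORTABLE=y
 CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS=y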

Patch 2 adds support for DCACHE_WORD_ACCESS when both MMU and
RISCV_EFFICIENT_UNALIGNED_ACCESS are enabled.
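
To illustrate what DCACHE_WORD_ACCESS buys, here is a minimal userspace
sketch (not the kernel's code; word_strlen, ONES and HIGHS are names made
up for this example, and it assumes 64-bit little-endian as on riscv64) of
the word-at-a-time idea the dcache path walk relies on: load a whole word
per iteration and use a bit trick to find the terminating NUL instead of
checking one byte at a time. This only pays off when unaligned word loads
are cheap.

 /* Minimal sketch of word-at-a-time string scanning (userspace, illustrative). */
 #include <stdio.h>
 #include <string.h>

 #define ONES  0x0101010101010101UL      /* 0x01 in every byte */
 #define HIGHS 0x8080808080808080UL      /* 0x80 in every byte */

 static size_t word_strlen(const char *s)
 {
         size_t len = 0;

         for (;;) {
                 unsigned long v, mask;

                 /* one 8-byte load per iteration; cheap if unaligned loads are fast */
                 memcpy(&v, s + len, sizeof(v));

                 /* classic "has zero byte" trick; lowest set bit marks the NUL */
                 mask = (v - ONES) & ~v & HIGHS;
                 if (mask)
                         return len + (__builtin_ctzl(mask) >> 3);
                 len += sizeof(v);
         }
 }

 int main(void)
 {
         /* padded buffer so the 8-byte loads never run past the allocation */
         char buf[24] = "12345678123456781";

         printf("%zu\n", word_strlen(buf));      /* prints 17 */
         return 0;
 }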

The test program and steps below show how much performance can be improved:

 $ cat tt.c
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>

 #define ITERATIONS 1000000

 #define PATH "123456781234567812345678123456781"

 int main(void)
 {
         unsigned long i;
         struct stat buf;

         for (i = 0; i < ITERATIONS; i++)
                 stat(PATH, &buf);

         return 0;
 }

 $ gcc -O2 tt.c
 $ touch 123456781234567812345678123456781
 $ time ./a.out

In my testing on T-HEAD C910 platforms, the performance of the test above
improves by about 7.5%.

* b4-shazam-merge:
  riscv: select DCACHE_WORD_ACCESS for efficient unaligned access HW
  riscv: introduce RISCV_EFFICIENT_UNALIGNED_ACCESS

Link: https://lore.kernel.org/r/20231225044207.3821-1-jszhang@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
parents cb51bfee d0fdc20b
@@ -652,6 +652,20 @@ config RISCV_MISALIGNED
	  load/store for both kernel and userspace. When disable, misaligned
	  accesses will generate SIGBUS in userspace and panic in kernel.
config RISCV_EFFICIENT_UNALIGNED_ACCESS
	bool "Assume the CPU supports fast unaligned memory accesses"
	depends on NONPORTABLE
	select DCACHE_WORD_ACCESS if MMU
	select HAVE_EFFICIENT_UNALIGNED_ACCESS
	help
	  Say Y here if you want the kernel to assume that the CPU supports
	  efficient unaligned memory accesses. When enabled, this option
	  improves the performance of the kernel on such CPUs. However, the
	  kernel will run much more slowly, or will not be able to run at all,
	  on CPUs that do not support efficient unaligned memory accesses.

	  If unsure what to do here, say N.
endmenu # "Platform type" endmenu # "Platform type"
menu "Kernel features" menu "Kernel features"
......
@@ -108,7 +108,9 @@ KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax)
# unaligned accesses. While unaligned accesses are explicitly allowed in the
# RISC-V ISA, they're emulated by machine mode traps on all extant
# architectures. It's faster to have GCC emit only aligned accesses.
ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS),y)
KBUILD_CFLAGS += $(call cc-option,-mstrict-align)
endif

ifeq ($(CONFIG_STACKPROTECTOR_PER_TASK),y)
prepare: stack_protector_prepare
...
@@ -6,6 +6,7 @@
#define EX_TYPE_FIXUP			1
#define EX_TYPE_BPF			2
#define EX_TYPE_UACCESS_ERR_ZERO	3
#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD	4

#ifdef CONFIG_MMU
@@ -47,6 +48,11 @@
#define EX_DATA_REG_ZERO_SHIFT	5
#define EX_DATA_REG_ZERO	GENMASK(9, 5)

#define EX_DATA_REG_DATA_SHIFT	0
#define EX_DATA_REG_DATA	GENMASK(4, 0)
#define EX_DATA_REG_ADDR_SHIFT	5
#define EX_DATA_REG_ADDR	GENMASK(9, 5)

#define EX_DATA_REG(reg, gpr)						\
	"((.L__gpr_num_" #gpr ") << " __stringify(EX_DATA_REG_##reg##_SHIFT) ")"
@@ -62,6 +68,15 @@
#define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err)			\
	_ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero)

#define _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(insn, fixup, data, addr)	\
	__DEFINE_ASM_GPR_NUMS						\
	__ASM_EXTABLE_RAW(#insn, #fixup,				\
			  __stringify(EX_TYPE_LOAD_UNALIGNED_ZEROPAD),	\
			  "("						\
			    EX_DATA_REG(DATA, data) " | "		\
			    EX_DATA_REG(ADDR, addr)			\
			  ")")

#endif /* __ASSEMBLY__ */

#else /* CONFIG_MMU */
...
@@ -9,6 +9,7 @@
#define _ASM_RISCV_WORD_AT_A_TIME_H

#include <asm/asm-extable.h>
#include <linux/kernel.h>

struct word_at_a_time {
@@ -45,4 +46,30 @@ static inline unsigned long find_zero(unsigned long mask)
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)

#ifdef CONFIG_DCACHE_WORD_ACCESS

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
	unsigned long ret;

	/* Load word from unaligned pointer addr */
	asm(
	"1:	" REG_L " %0, %2\n"
	"2:\n"
	_ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(1b, 2b, %0, %1)
	: "=&r" (ret)
	: "r" (addr), "m" (*(unsigned long *)addr));

	return ret;
}

#endif	/* CONFIG_DCACHE_WORD_ACCESS */

#endif /* _ASM_RISCV_WORD_AT_A_TIME_H */
@@ -27,6 +27,14 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex,
	return true;
}

static inline unsigned long regs_get_gpr(struct pt_regs *regs, unsigned int offset)
{
	if (unlikely(!offset || offset > MAX_REG_OFFSET))
		return 0;

	return *(unsigned long *)((unsigned long)regs + offset);
}

static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset,
				unsigned long val)
{
@@ -50,6 +58,27 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
	return true;
}

static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
				  struct pt_regs *regs)
{
	int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data);
	int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
	unsigned long data, addr, offset;

	addr = regs_get_gpr(regs, reg_addr * sizeof(unsigned long));

	offset = addr & 0x7UL;
	addr &= ~0x7UL;

	data = *(unsigned long *)addr >> (offset * 8);

	regs_set_gpr(regs, reg_data * sizeof(unsigned long), data);

	regs->epc = get_ex_fixup(ex);
	return true;
}

bool fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *ex;
@@ -65,6 +94,8 @@ bool fixup_exception(struct pt_regs *regs)
		return ex_handler_bpf(ex, regs);
	case EX_TYPE_UACCESS_ERR_ZERO:
		return ex_handler_uaccess_err_zero(ex, regs);
	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
		return ex_handler_load_unaligned_zeropad(ex, regs);
	}

	BUG();
...