Commit 38b89d93 authored by David Mosberger

ia64: Improve spinlock code to handle contention in a shared routine called

	with a special convention.  Various minor fixes for gcc pre-3.4.
parent 91d599a9
arch/ia64/Makefile

@@ -23,6 +23,7 @@ cflags-y := -pipe $(EXTRA) -ffixed-r13 -mfixed-range=f10-f15,f32-f127 \
 CFLAGS_KERNEL := -mconstant-gp
 GCC_VERSION=$(shell $(CC) -v 2>&1 | fgrep 'gcc version' | cut -f3 -d' ' | cut -f1 -d'.')
+GCC_MINOR_VERSION=$(shell $(CC) -v 2>&1 | fgrep 'gcc version' | cut -f3 -d' ' | cut -f2 -d'.')
 GAS_STATUS=$(shell arch/ia64/scripts/check-gas $(CC) $(OBJDUMP))
@@ -35,7 +36,14 @@ $(error Sorry, you need a newer version of the assembler, one that is built from
 endif
 ifneq ($(GCC_VERSION),2)
-cflags-y += -frename-registers --param max-inline-insns=5000
+cflags-$(CONFIG_ITANIUM) += -frename-registers
+endif
+ifeq ($(GCC_VERSION),3)
+ifeq ($(GCC_MINOR_VERSION),4)
+cflags-$(CONFIG_ITANIUM) += -mtune=merced
+cflags-$(CONFIG_MCKINLEY) += -mtune=mckinley
+endif
 endif
 cflags-$(CONFIG_ITANIUM_BSTEP_SPECIFIC) += -mb-step
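
Note: GCC_MINOR_VERSION exists so the build can tell gcc 3.4 apart from earlier 3.x releases, the same boundary the commit tests at compile time in the C headers below. A minimal sketch of that boundary as the preprocessor sees it (__GNUC__ and __GNUC_MINOR__ are standard gcc predefines; the comments are ours, not the commit's):

/* gcc predefines __GNUC__ / __GNUC_MINOR__; a "3.4 or newer" test: */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
/* gcc >= 3.4: may branch directly to ia64_spinlock_contention */
#else
/* older gcc: must go through the _pre3_4 contention entry point */
#endif
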
arch/ia64/kernel/head.S

@@ -733,3 +733,82 @@ SET_REG(b4);
 SET_REG(b5);
 #endif /* CONFIG_IA64_BRL_EMU */
+
+#ifdef CONFIG_SMP
+	/*
+	 * This routine handles spinlock contention.  It uses a non-standard calling
+	 * convention to avoid converting leaf routines into interior routines.  Because
+	 * of this special convention, there are several restrictions:
+	 *
+	 * - do not use gp relative variables, this code is called from the kernel
+	 *   and from modules, r1 is undefined.
+	 * - do not use stacked registers, the caller owns them.
+	 * - do not use the scratch stack space, the caller owns it.
+	 * - do not use any registers other than the ones listed below
+	 *
+	 * Inputs:
+	 *   ar.pfs - saved CFM of caller
+	 *   ar.ccv - 0 (and available for use)
+	 *   r28    - available for use.
+	 *   r29    - available for use.
+	 *   r30    - available for use.
+	 *   r31    - address of lock, available for use.
+	 *   b6     - return address
+	 *   p14    - available for use.
+	 *
+	 * If you patch this code to use more registers, do not forget to update
+	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
+	 */
+
+#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+
+GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
+	.prologue
+	.save ar.pfs, r0	// this code effectively has a zero frame size
+	.save rp, r28
+	.body
+	nop 0
+	nop 0
+	.restore sp		// pop existing prologue after next insn
+	mov b6 = r28
+	.prologue
+	.save ar.pfs, r0
+	.altrp b6
+	.body
+.wait:
+	// exponential backoff, kdb, lockmeter etc. go in here
+	hint @pause
+	ld4.bias r30=[r31]
+	nop 0
+	;;
+	cmp4.eq p14,p0=r30,r0
+(p14)	br.cond.sptk.few b6	// lock is now free, try to acquire
+	br.cond.sptk.few .wait
+END(ia64_spinlock_contention_pre3_4)
+
+#else
+
+GLOBAL_ENTRY(ia64_spinlock_contention)
+	.prologue
+	.altrp b6
+	.body
+.wait:
+	// exponential backoff, kdb, lockmeter etc. go in here
+	hint @pause
+	ld4.bias r30=[r31]
+	;;
+	cmp4.ne p14,p0=r30,r0
+	mov r30 = 1
+(p14)	br.cond.sptk.few .wait
+	;;
+	cmpxchg4.acq r30=[r31], r30, ar.ccv
+	;;
+	cmp4.ne p14,p0=r0,r30
+(p14)	br.cond.sptk.few .wait
+	br.ret.sptk.many b6	// lock is now taken
+END(ia64_spinlock_contention)
+
+#endif
+
+#endif /* CONFIG_SMP */
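
In C terms, the gcc-3.4 variant of the loop above amounts to the following (a sketch only: the gcc builtin __sync_val_compare_and_swap stands in for cmpxchg4.acq, and the empty loop body stands in for hint @pause):

/* sketch of ia64_spinlock_contention's algorithm (3.4+ variant) */
static void spinlock_contention_sketch(volatile unsigned int *lock)
{
	for (;;) {
		while (*lock != 0)
			;			/* ld4.bias + cmp4.ne: spin on plain reads */
		if (__sync_val_compare_and_swap(lock, 0, 1) == 0)
			return;			/* cmpxchg4.acq won: lock is now taken */
	}
}

The pre-3.4 variant differs only in that it branches back to the caller as soon as the lock reads free, and lets the caller's inline cmpxchg retry the acquire.
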
arch/ia64/kernel/mca.c

@@ -282,7 +282,7 @@ fetch_min_state (pal_min_state_area_t *ms, struct pt_regs *pt, struct switch_sta
 	dst_banked = &pt->r16;	/* r16-r31 are contiguous in struct pt_regs */
 	src_banked = ms->pmsa_bank1_gr;
 	for (i = 0; i < 16; ++i)
-		*dst_banked = *src_banked;
+		dst_banked[i] = src_banked[i];
 	pt->b0 = ms->pmsa_br0;
 	sw->b1 = ms->pmsa_br1;
@@ -339,7 +339,6 @@ init_handler_platform (sal_log_processor_info_t *proc_ptr,
 	show_min_state(&SAL_LPI_PSI_INFO(proc_ptr)->min_state_area);
 	fetch_min_state(&SAL_LPI_PSI_INFO(proc_ptr)->min_state_area, pt, sw);
 	unw_init_from_interruption(&info, current, pt, sw);
 	ia64_do_show_stack(&info, NULL);
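
The fetch_min_state change fixes a classic loop bug: the old body dereferenced dst_banked and src_banked without ever advancing them, so the same pair of slots was copied 16 times and r17-r31 never transferred. A standalone illustration (hypothetical arrays, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long src[16], dst[16] = { 0 };
	unsigned long *s = src, *d = dst;
	int i;

	for (i = 0; i < 16; i++)
		src[i] = 100 + i;

	for (i = 0; i < 16; i++)
		*d = *s;			/* old, buggy: pointers never move */
	printf("buggy: dst[1] = %lu\n", dst[1]);	/* prints 0 */

	for (i = 0; i < 16; i++)
		d[i] = s[i];			/* fixed: index both arrays */
	printf("fixed: dst[1] = %lu\n", dst[1]);	/* prints 101 */
	return 0;
}
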
arch/ia64/kernel/smpboot.c

@@ -397,13 +397,26 @@ do_boot_cpu (int sapicid, int cpu)
 	return 0;
 }
 
-unsigned long cache_decay_ticks;	/* # of ticks an idle task is considered cache-hot */
+static int __init
+decay (char *str)
+{
+	int ticks;
+	get_option (&str, &ticks);
+	cache_decay_ticks = ticks;
+	return 1;
+}
+
+__setup("decay=", decay);
+
+/*
+ * # of ticks an idle task is considered cache-hot.  Highly application-dependent.  There
+ * are apps out there which are known to suffer significantly with values >= 4.
+ */
+unsigned long cache_decay_ticks = 10;	/* equal to MIN_TIMESLICE */
+
 static void
 smp_tune_scheduling (void)
 {
-	cache_decay_ticks = 10;	/* XXX base this on PAL info and cache-bandwidth estimate */
-
 	printk(KERN_INFO "task migration cache decay timeout: %ld msecs.\n",
 	       (cache_decay_ticks + 1) * 1000 / HZ);
 }
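
Usage note: the new __setup hook means a kernel command line containing, say, decay=2 overrides the default of 10 ticks at boot. The reported value follows from the printk arithmetic: assuming HZ = 1024 (the usual ia64 setting of that era), the default of 10 ticks yields (10 + 1) * 1000 / 1024 = 10 msecs under integer division, and decay=2 yields (2 + 1) * 1000 / 1024 = 2 msecs.
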
arch/ia64/kernel/traps.c

@@ -94,7 +94,7 @@ die (const char *str, struct pt_regs *regs, long err)
 {
 	static struct {
 		spinlock_t lock;
-		int lock_owner;
+		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
 		.lock = SPIN_LOCK_UNLOCKED,
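
The int to u32 change for lock_owner is plausibly one of the commit's "minor fixes for gcc pre-3.4": lock_owner is compared against CPU ids, and an unsigned type sidesteps a signed/unsigned comparison warning. For context, a sketch of the recursion-guard pattern this struct serves (the usual kernel die() idiom, reconstructed from memory rather than from this hunk):

/* let die() nest on the same CPU without deadlocking on its own lock */
if (die.lock_owner != smp_processor_id()) {
	spin_lock_irq(&die.lock);
	die.lock_owner = smp_processor_id();
	die.lock_owner_depth = 0;
}
++die.lock_owner_depth;
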
arch/ia64/kernel/unwind.c

@@ -682,7 +682,7 @@ finish_prologue (struct unw_state_record *sr)
	 * First, resolve implicit register save locations (see Section "11.4.2.3 Rules
	 * for Using Unwind Descriptors", rule 3):
	 */
-	for (i = 0; i < (int) sizeof(unw.save_order)/sizeof(unw.save_order[0]); ++i) {
+	for (i = 0; i < (int) (sizeof(unw.save_order)/sizeof(unw.save_order[0])); ++i) {
 		reg = sr->curr.reg + unw.save_order[i];
 		if (reg->where == UNW_WHERE_GR_SAVE) {
 			reg->where = UNW_WHERE_GR;
@@ -698,7 +698,7 @@ finish_prologue (struct unw_state_record *sr)
	 */
 	if (sr->imask) {
 		unsigned char kind, mask = 0, *cp = sr->imask;
-		unsigned long t;
+		int t;
 		static const unsigned char limit[3] = {
 			UNW_REG_F31, UNW_REG_R7, UNW_REG_B5
 		};
@@ -1931,7 +1931,7 @@ init_frame_info (struct unw_frame_info *info, struct task_struct *t,
 		   "  pr 0x%lx\n"
 		   "  sw 0x%lx\n"
 		   "  sp 0x%lx\n",
-		   __FUNCTION__, (unsigned long) task, rbslimit, rbstop, stktop, stklimit,
+		   __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
 		   info->pr, (unsigned long) info->sw, info->sp);
	STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags));
 }
@@ -1944,6 +1944,8 @@ unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t,
 	init_frame_info(info, t, sw, pt->r12);
 	info->cfm_loc = &pt->cr_ifs;
+	info->unat_loc = &pt->ar_unat;
+	info->pfs_loc = &pt->ar_pfs;
 	sof = *info->cfm_loc & 0x7f;
 	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sof);
 	info->ip = pt->cr_iip + ia64_psr(pt)->ri;
@@ -1952,7 +1954,7 @@ unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t,
 		   "  bsp 0x%lx\n"
 		   "  sof 0x%lx\n"
 		   "  ip  0x%lx\n",
-		   info->bsp, sof, info->ip);
+		   __FUNCTION__, info->bsp, sof, info->ip);
 	find_save_locs(info);
 }
@@ -1970,7 +1972,7 @@ unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct
 		   "  bsp 0x%lx\n"
 		   "  sol 0x%lx\n"
 		   "  ip  0x%lx\n",
-		   info->bsp, sol, info->ip);
+		   __FUNCTION__, info->bsp, sol, info->ip);
 	find_save_locs(info);
 }
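
A decoding note for the sof line in the context above: cr_ifs holds the interrupted frame's CFM, which packs the frame layout into its low bits. Per the IA-64 architecture definition (reference values, not code from this commit):

/* CFM field layout (IA-64 architecture definition) */
unsigned long sof = cfm & 0x7f;		/* bits 0-6:   size of frame            */
unsigned long sol = (cfm >> 7) & 0x7f;	/* bits 7-13:  size of locals           */
unsigned long sor = (cfm >> 14) & 0xf;	/* bits 14-17: rotating size, in 8-reg units */

This is why unw_init_from_interruption computes sof = *info->cfm_loc & 0x7f and then backs bsp up by sof registers to locate the interrupted frame's base.
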
include/asm-ia64/spinlock.h

@@ -22,26 +22,72 @@ typedef struct {
 #define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
 #define spin_lock_init(x)	((x)->lock = 0)
 
-#define DEBUG_SPIN_LOCK	0
-
-#if DEBUG_SPIN_LOCK
-
-#include <ia64intrin.h>
-
-#define _raw_spin_lock(x)							\
-do {										\
-	unsigned long _timeout = 1000000000;					\
-	volatile unsigned int _old = 0, _new = 1, *_ptr = &((x)->lock);		\
-	do {									\
-		if (_timeout-- == 0) {						\
-			extern void dump_stack (void);				\
-			printk("kernel DEADLOCK at %s:%d?\n", __FILE__, __LINE__); \
-			dump_stack();						\
-		}								\
-	} while (__sync_val_compare_and_swap(_ptr, _old, _new) != _old);	\
-} while (0)
+#define NEW_LOCK
+#ifdef NEW_LOCK
+
+/*
+ * Try to get the lock.  If we fail to get the lock, make a non-standard call to
+ * ia64_spinlock_contention().  We do not use a normal call because that would force all
+ * callers of spin_lock() to be non-leaf routines.  Instead, ia64_spinlock_contention() is
+ * carefully coded to touch only those registers that spin_lock() marks "clobbered".
+ */
+
+#define IA64_SPINLOCK_CLOBBERS "ar.pfs", "p14", "r28", "r29", "r30", "b6", "memory"
+
+static inline void
+_raw_spin_lock (spinlock_t *lock)
+{
+	register volatile unsigned int *ptr asm ("r31") = &lock->lock;
+
+#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+# ifdef CONFIG_ITANIUM
+	/* don't use brl on Itanium... */
+	asm volatile ("{\n\t"
+		      "  mov ar.ccv = r0\n\t"
+		      "  mov r28 = ip\n\t"
+		      "  mov r30 = 1;;\n\t"
+		      "}\n\t"
+		      "cmpxchg4.acq r30 = [%1], r30, ar.ccv\n\t"
+		      "movl r29 = ia64_spinlock_contention_pre3_4;;\n\t"
+		      "cmp4.ne p14, p0 = r30, r0\n\t"
+		      "mov b6 = r29;;\n"
+		      "(p14) br.cond.spnt.many b6"
+		      : "=r"(ptr) : "r"(ptr) : IA64_SPINLOCK_CLOBBERS);
+# else
+	asm volatile ("{\n\t"
+		      "  mov ar.ccv = r0\n\t"
+		      "  mov r28 = ip\n\t"
+		      "  mov r30 = 1;;\n\t"
+		      "}\n\t"
+		      "cmpxchg4.acq r30 = [%1], r30, ar.ccv;;\n\t"
+		      "cmp4.ne p14, p0 = r30, r0\n"
+		      "(p14) brl.cond.spnt.many ia64_spinlock_contention_pre3_4"
+		      : "=r"(ptr) : "r"(ptr) : IA64_SPINLOCK_CLOBBERS);
+# endif /* CONFIG_MCKINLEY */
+#else
+# ifdef CONFIG_ITANIUM
+	/* don't use brl on Itanium... */
+	/* mis-declare, so we get the entry-point, not its function descriptor: */
+	asm volatile ("mov r30 = 1\n\t"
+		      "mov ar.ccv = r0;;\n\t"
+		      "cmpxchg4.acq r30 = [%0], r30, ar.ccv\n\t"
+		      "movl r29 = ia64_spinlock_contention;;\n\t"
+		      "cmp4.ne p14, p0 = r30, r0\n\t"
+		      "mov b6 = r29;;\n"
+		      "(p14) br.call.spnt.many b6 = b6"
+		      : "=r"(ptr) : "r"(ptr) : IA64_SPINLOCK_CLOBBERS);
+# else
+	asm volatile ("mov r30 = 1\n\t"
+		      "mov ar.ccv = r0;;\n\t"
+		      "cmpxchg4.acq r30 = [%0], r30, ar.ccv;;\n\t"
+		      "cmp4.ne p14, p0 = r30, r0\n\t"
+		      "(p14) brl.call.spnt.many b6=ia64_spinlock_contention"
+		      : "=r"(ptr) : "r"(ptr) : IA64_SPINLOCK_CLOBBERS);
+# endif /* CONFIG_MCKINLEY */
+#endif
+}
 
-#else
+#else /* !NEW_LOCK */
 
 /*
  * Streamlined test_and_set_bit(0, (x)).  We use test-and-test-and-set
@@ -64,7 +110,7 @@ do {									\
 	";;\n"								\
	:: "r"(&(x)->lock) : "ar.ccv", "p7", "r2", "r29", "memory")
 
-#endif /* !DEBUG_SPIN_LOCK */
+#endif /* !NEW_LOCK */
 
 #define spin_is_locked(x)	((x)->lock != 0)
 #define _raw_spin_unlock(x)	do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
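
For readers who do not speak ia64 assembly, the new _raw_spin_lock amounts to the following portable pseudo-C (a sketch: the real code must pin the lock address to r31 and touch only the registers named in IA64_SPINLOCK_CLOBBERS, which is precisely why it is written as inline asm; __sync_val_compare_and_swap stands in for cmpxchg4.acq):

static inline void raw_spin_lock_sketch(volatile unsigned int *lock)
{
	/* fast path: a single atomic 0 -> 1 swap, inlined at every call site */
	while (__sync_val_compare_and_swap(lock, 0, 1) != 0) {
		/* slow path, kept out of line in ia64_spinlock_contention:
		   spin on cheap plain reads until the lock looks free */
		while (*lock != 0)
			;	/* hint @pause */
	}
}

The point of the exercise: because the out-of-line contention path clobbers only the listed registers, gcc can continue to treat every spin_lock() caller as a leaf routine instead of spilling state for a full function call.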