Commit ca92c573 authored by Andi Kleen's avatar Andi Kleen Committed by Stephen Hemminger

[PATCH] Intel x86-64 support merge

This has all the x86-64 specific changes for Intel Prescott/Nocona
support.

It requires a few minor changes outside arch/x86_64, which I am sending
separately.

This patch is needed to boot a 64-bit kernel on a 64-bit capable
Prescott machine.

The ugliest part is probably the swiotlb code.  In fact the code for
that is not even included, but just reused from IA64.  swiotlb
implements the PCI DMA API using bounce buffering.  I don't like this at
all, but there was no other way to support non DAC capable hardware
(like IDE or USB) on machines with >3GB.  Please redirect all flames for
that to the Intel chipset designers.

ChangeLog:
- Add Kconfig options for PSC
- Add support to reuse microcode driver from i386 (Suresh B Siddha)
- Try to optimize for the selected CPU
- Fix early CPUID check for Intel CPUs (Suresh B Siddha)
- Fix GDT to use the configured cache line size for padding
- Support monitor/mwait idle loop
- Support HyperThreading
- Support Intel CPUID flags
- Remove all 3dnow prefetches
- Add alternative() for the prefetchw prefetch inline.
- Include P4 driver in oprofile
- Support Intel NOPs in alternative
parent 857bf13c
...@@ -89,6 +89,9 @@ config MK8 ...@@ -89,6 +89,9 @@ config MK8
help help
Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
config MPSC
bool "Prescott/Nocona"
config GENERIC_CPU config GENERIC_CPU
bool "Generic-x86-64" bool "Generic-x86-64"
help help
...@@ -101,11 +104,13 @@ endchoice ...@@ -101,11 +104,13 @@ endchoice
# #
config X86_L1_CACHE_BYTES config X86_L1_CACHE_BYTES
int int
default "64" default "128" if GENERIC_CPU || MPSC
default "64" if MK8
config X86_L1_CACHE_SHIFT config X86_L1_CACHE_SHIFT
int int
default "6" default "7" if GENERIC_CPU || MPSC
default "6" if MK8
config X86_TSC config X86_TSC
bool bool
...@@ -115,6 +120,23 @@ config X86_GOOD_APIC ...@@ -115,6 +120,23 @@ config X86_GOOD_APIC
bool bool
default y default y
config MICROCODE
tristate "/dev/cpu/microcode - Intel CPU microcode support"
---help---
If you say Y here, you will be
able to update the microcode on Intel processors. You will
obviously need the actual microcode binary data itself which is
not shipped with the Linux kernel.
For latest news and information on obtaining all the required
ingredients for this driver, check:
<http://www.urbanmyth.org/microcode/>.
To compile this driver as a module, choose M here: the
module will be called microcode.
If you use modprobe or kmod you may also want to add the line
'alias char-major-10-184 microcode' to your /etc/modules.conf file.
config X86_MSR config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support" tristate "/dev/cpu/*/msr - Model-specific register support"
help help
...@@ -132,6 +154,11 @@ config X86_CPUID ...@@ -132,6 +154,11 @@ config X86_CPUID
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid. /dev/cpu/31/cpuid.
config X86_HT
bool
depends on SMP
default y
config MATH_EMULATION config MATH_EMULATION
bool bool
...@@ -256,9 +283,13 @@ config GART_IOMMU ...@@ -256,9 +283,13 @@ config GART_IOMMU
Normally the kernel will take the right choice by itself. Normally the kernel will take the right choice by itself.
If unsure say Y If unsure say Y
config SWIOTLB
select GART_IOMMU
bool "Software IOTLB support"
config DUMMY_IOMMU config DUMMY_IOMMU
bool bool
depends on !GART_IOMMU depends on !GART_IOMMU && !SWIOTLB
default y default y
help help
Don't use IOMMU code. This will cause problems when you have more than 4GB Don't use IOMMU code. This will cause problems when you have more than 4GB
......
...@@ -39,6 +39,10 @@ LDFLAGS_vmlinux := -e stext ...@@ -39,6 +39,10 @@ LDFLAGS_vmlinux := -e stext
check_gcc = $(shell if $(CC) $(1) -S -o /dev/null -xc /dev/null > /dev/null 2>&1 ; then echo "$(1)"; else echo "$(2)"; fi) check_gcc = $(shell if $(CC) $(1) -S -o /dev/null -xc /dev/null > /dev/null 2>&1 ; then echo "$(1)"; else echo "$(2)"; fi)
cflags-$(CONFIG_MK8) += $(call check_gcc,-march=k8,)
cflags-$(CONFIG_MPSC) += $(call check_gcc,-march=pentium4,)
CFLAGS += $(cflags-y)
CFLAGS += -mno-red-zone CFLAGS += -mno-red-zone
CFLAGS += -mcmodel=kernel CFLAGS += -mcmodel=kernel
CFLAGS += -pipe CFLAGS += -pipe
......
...@@ -292,8 +292,9 @@ loader_ok: ...@@ -292,8 +292,9 @@ loader_ok:
/* minimum CPUID flags for x86-64 */ /* minimum CPUID flags for x86-64 */
/* see http://www.x86-64.org/lists/discuss/msg02971.html */ /* see http://www.x86-64.org/lists/discuss/msg02971.html */
#define SSE_MASK ((1<<25)|(1<<26)) #define SSE_MASK ((1<<25)|(1<<26))
#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|(1<<11)| \ #define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\
(1<<13)|(1<<15)|(1<<24)|(1<<29)) (1<<13)|(1<<15)|(1<<24))
#define REQUIRED_MASK2 (1<<29)
pushfl /* standard way to check for cpuid */ pushfl /* standard way to check for cpuid */
popl %eax popl %eax
...@@ -305,10 +306,10 @@ loader_ok: ...@@ -305,10 +306,10 @@ loader_ok:
popl %eax popl %eax
cmpl %eax,%ebx cmpl %eax,%ebx
jz no_longmode /* cpu has no cpuid */ jz no_longmode /* cpu has no cpuid */
movl $0x80000000,%eax movl $0x0,%eax
cpuid cpuid
cmpl $0x80000001,%eax cmpl $0x1,%eax
jb no_longmode /* no extended cpuid */ jb no_longmode /* no cpuid 1 */
xor %di,%di xor %di,%di
cmpl $0x68747541,%ebx /* AuthenticAMD */ cmpl $0x68747541,%ebx /* AuthenticAMD */
jnz noamd jnz noamd
...@@ -318,11 +319,20 @@ loader_ok: ...@@ -318,11 +319,20 @@ loader_ok:
jnz noamd jnz noamd
mov $1,%di /* cpu is from AMD */ mov $1,%di /* cpu is from AMD */
noamd: noamd:
movl $0x80000001,%eax movl $0x1,%eax
cpuid cpuid
andl $REQUIRED_MASK1,%edx andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx xorl $REQUIRED_MASK1,%edx
jnz no_longmode jnz no_longmode
movl $0x80000000,%eax
cpuid
cmpl $0x80000001,%eax
jb no_longmode /* no extended cpuid */
movl $0x80000001,%eax
cpuid
andl $REQUIRED_MASK2,%edx
xorl $REQUIRED_MASK2,%edx
jnz no_longmode
sse_test: sse_test:
movl $1,%eax movl $1,%eax
cpuid cpuid
......
...@@ -12,6 +12,7 @@ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ...@@ -12,6 +12,7 @@ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
...@@ -22,6 +23,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ ...@@ -22,6 +23,7 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULES) += module.o
...@@ -30,4 +32,5 @@ obj-y += topology.o ...@@ -30,4 +32,5 @@ obj-y += topology.o
bootflag-y += ../../i386/kernel/bootflag.o bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/mach-default/topology.o topology-y += ../../i386/mach-default/topology.o
swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
microcode-$(CONFIG_MICROCODE) += ../../i386/kernel/microcode.o
...@@ -24,6 +24,8 @@ ...@@ -24,6 +24,8 @@
#include <asm/proto.h> #include <asm/proto.h>
#include <asm/pci-direct.h> #include <asm/pci-direct.h>
int iommu_aperture;
int fallback_aper_order __initdata = 1; /* 64MB */ int fallback_aper_order __initdata = 1; /* 64MB */
int fallback_aper_force __initdata = 0; int fallback_aper_force __initdata = 0;
...@@ -206,6 +208,8 @@ void __init iommu_hole_init(void) ...@@ -206,6 +208,8 @@ void __init iommu_hole_init(void)
if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
continue; continue;
iommu_aperture = 1;;
aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order; aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <asm/segment.h> #include <asm/segment.h>
#include <asm/page.h> #include <asm/page.h>
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/cache.h>
/* we are not able to switch in one step to the final KERNEL ADRESS SPACE /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
* because we need identity-mapped pages on setup so define __START_KERNEL to * because we need identity-mapped pages on setup so define __START_KERNEL to
...@@ -322,7 +323,6 @@ gdt: ...@@ -322,7 +323,6 @@ gdt:
.endr .endr
#endif #endif
.align 64 /* cacheline aligned */
ENTRY(gdt_table32) ENTRY(gdt_table32)
.quad 0x0000000000000000 /* This one is magic */ .quad 0x0000000000000000 /* This one is magic */
.quad 0x0000000000000000 /* unused */ .quad 0x0000000000000000 /* unused */
...@@ -334,7 +334,7 @@ gdt32_end: ...@@ -334,7 +334,7 @@ gdt32_end:
* Also sysret mandates a special GDT layout * Also sysret mandates a special GDT layout
*/ */
.align 64 /* cacheline aligned, keep this synchronized with asm/desc.h */ .align L1_CACHE_BYTES
/* The TLS descriptors are currently at a different place compared to i386. /* The TLS descriptors are currently at a different place compared to i386.
Hopefully nobody expects them at a fixed place (Wine?) */ Hopefully nobody expects them at a fixed place (Wine?) */
...@@ -354,18 +354,13 @@ ENTRY(cpu_gdt_table) ...@@ -354,18 +354,13 @@ ENTRY(cpu_gdt_table)
.quad 0 /* unused now */ .quad 0 /* unused now */
.quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
/* base must be patched for real base address. */ /* base must be patched for real base address. */
/* This should be a multiple of the cache line size */
gdt_end: gdt_end:
.globl gdt_end /* asm/segment.h:GDT_ENTRIES must match this */
/* This should be a multiple of the cache line size */
/* GDTs of other CPUs */ /* GDTs of other CPUs: */
#ifdef CONFIG_SMP .fill (L1_CACHE_BYTES * NR_CPUS) - (gdt_end - cpu_gdt_table)
.rept NR_CPUS-1
.quad 0,0,0,0,0,0,0,0,0,0,0
.endr
#endif
.align 64 .align L1_CACHE_BYTES
ENTRY(idt_table) ENTRY(idt_table)
.rept 256 .rept 256
.quad 0 .quad 0
......
...@@ -354,6 +354,11 @@ dma_addr_t pci_map_single(struct pci_dev *dev, void *addr, size_t size, int dir) ...@@ -354,6 +354,11 @@ dma_addr_t pci_map_single(struct pci_dev *dev, void *addr, size_t size, int dir)
BUG_ON(dir == PCI_DMA_NONE); BUG_ON(dir == PCI_DMA_NONE);
#ifdef CONFIG_SWIOTLB
if (swiotlb)
return swiotlb_map_single(&dev->dev,addr,size,dir);
#endif
phys_mem = virt_to_phys(addr); phys_mem = virt_to_phys(addr);
if (!need_iommu(dev, phys_mem, size)) if (!need_iommu(dev, phys_mem, size))
return phys_mem; return phys_mem;
...@@ -460,6 +465,12 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir) ...@@ -460,6 +465,12 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
BUG_ON(dir == PCI_DMA_NONE); BUG_ON(dir == PCI_DMA_NONE);
if (nents == 0) if (nents == 0)
return 0; return 0;
#ifdef CONFIG_SWIOTLB
if (swiotlb)
return swiotlb_map_sg(&dev->dev,sg,nents,dir);
#endif
out = 0; out = 0;
start = 0; start = 0;
for (i = 0; i < nents; i++) { for (i = 0; i < nents; i++) {
...@@ -520,6 +531,14 @@ void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, ...@@ -520,6 +531,14 @@ void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
unsigned long iommu_page; unsigned long iommu_page;
int npages; int npages;
int i; int i;
#ifdef CONFIG_SWIOTLB
if (swiotlb) {
swiotlb_unmap_single(&hwdev->dev,dma_addr,size,direction);
return;
}
#endif
if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
dma_addr >= iommu_bus_base + iommu_size) dma_addr >= iommu_bus_base + iommu_size)
return; return;
...@@ -570,7 +589,7 @@ int pci_dma_supported(struct pci_dev *dev, u64 mask) ...@@ -570,7 +589,7 @@ int pci_dma_supported(struct pci_dev *dev, u64 mask)
return 0; return 0;
} }
if (no_iommu && (mask < (end_pfn << PAGE_SHIFT))) if (no_iommu && (mask < (end_pfn << PAGE_SHIFT)) && !swiotlb)
return 0; return 0;
return 1; return 1;
...@@ -680,6 +699,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) ...@@ -680,6 +699,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
return 0; return 0;
nommu: nommu:
/* Should not happen anymore */
printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.");
return -1; return -1;
...@@ -694,6 +714,7 @@ static int __init pci_iommu_init(void) ...@@ -694,6 +714,7 @@ static int __init pci_iommu_init(void)
unsigned long iommu_start; unsigned long iommu_start;
struct pci_dev *dev; struct pci_dev *dev;
#ifndef CONFIG_AGP_AMD64 #ifndef CONFIG_AGP_AMD64
no_agp = 1; no_agp = 1;
#else #else
...@@ -704,7 +725,14 @@ static int __init pci_iommu_init(void) ...@@ -704,7 +725,14 @@ static int __init pci_iommu_init(void)
(agp_copy_info(&info) < 0); (agp_copy_info(&info) < 0);
#endif #endif
if (no_iommu || (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT)) { if (swiotlb) {
no_iommu = 1;
printk(KERN_INFO "PCI-DMA: Using SWIOTLB :-(\n");
return -1;
}
if (no_iommu || (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
!iommu_aperture) {
printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
no_iommu = 1; no_iommu = 1;
return -1; return -1;
......
...@@ -140,6 +140,52 @@ void cpu_idle (void) ...@@ -140,6 +140,52 @@ void cpu_idle (void)
} }
} }
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*/
static void mwait_idle(void)
{
local_irq_enable();
if (!need_resched()) {
set_thread_flag(TIF_POLLING_NRFLAG);
do {
__monitor((void *)&current_thread_info()->flags, 0, 0);
if (need_resched())
break;
__mwait(0, 0);
} while (!need_resched());
clear_thread_flag(TIF_POLLING_NRFLAG);
}
}
void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
/*
* Skip, if setup has overridden idle.
* Also, take care of system with asymmetric CPUs.
* Use, mwait_idle only if all cpus support it.
* If not, we fallback to default_idle()
*/
if (!pm_idle) {
if (!printed) {
printk("using mwait in idle threads.\n");
printed = 1;
}
pm_idle = mwait_idle;
}
return;
}
pm_idle = default_idle;
return;
}
static int __init idle_setup (char *str) static int __init idle_setup (char *str)
{ {
if (!strncmp(str, "poll", 4)) { if (!strncmp(str, "poll", 4)) {
......
...@@ -76,6 +76,9 @@ unsigned long pci_mem_start = 0x10000000; ...@@ -76,6 +76,9 @@ unsigned long pci_mem_start = 0x10000000;
unsigned long saved_video_mode; unsigned long saved_video_mode;
int swiotlb;
EXPORT_SYMBOL(swiotlb);
/* /*
* Setup options * Setup options
*/ */
...@@ -440,7 +443,6 @@ void __init setup_arch(char **cmdline_p) ...@@ -440,7 +443,6 @@ void __init setup_arch(char **cmdline_p)
} }
} }
#endif #endif
paging_init(); paging_init();
#ifndef CONFIG_SMP #ifndef CONFIG_SMP
...@@ -584,6 +586,191 @@ static int __init init_amd(struct cpuinfo_x86 *c) ...@@ -584,6 +586,191 @@ static int __init init_amd(struct cpuinfo_x86 *c)
return r; return r;
} }
static void __init detect_ht(void)
{
extern int phys_proc_id[NR_CPUS];
u32 eax, ebx, ecx, edx;
int index_lsb, index_msb, tmp;
int initial_apic_id;
int cpu = smp_processor_id();
cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
} else if (smp_num_siblings > 1) {
index_lsb = 0;
index_msb = 31;
/*
* At this point we only support two siblings per
* processor package.
*/
#define NR_SIBLINGS 2
if (smp_num_siblings != NR_SIBLINGS) {
printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
smp_num_siblings = 1;
return;
}
tmp = smp_num_siblings;
while ((tmp & 1) == 0) {
tmp >>=1 ;
index_lsb++;
}
tmp = smp_num_siblings;
while ((tmp & 0x80000000 ) == 0) {
tmp <<=1 ;
index_msb--;
}
if (index_lsb != index_msb )
index_msb++;
initial_apic_id = ebx >> 24 & 0xff;
phys_proc_id[cpu] = initial_apic_id >> index_msb;
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
phys_proc_id[cpu]);
}
}
#define LVL_1_INST 1
#define LVL_1_DATA 2
#define LVL_2 3
#define LVL_3 4
#define LVL_TRACE 5
struct _cache_table
{
unsigned char descriptor;
char cache_type;
short size;
};
/* all the cache descriptor types we care about (no TLB or trace cache entries) */
static struct _cache_table cache_table[] __initdata =
{
{ 0x06, LVL_1_INST, 8 },
{ 0x08, LVL_1_INST, 16 },
{ 0x0a, LVL_1_DATA, 8 },
{ 0x0c, LVL_1_DATA, 16 },
{ 0x22, LVL_3, 512 },
{ 0x23, LVL_3, 1024 },
{ 0x25, LVL_3, 2048 },
{ 0x29, LVL_3, 4096 },
{ 0x2c, LVL_1_DATA, 32 },
{ 0x30, LVL_1_INST, 32 },
{ 0x39, LVL_2, 128 },
{ 0x3b, LVL_2, 128 },
{ 0x3c, LVL_2, 256 },
{ 0x41, LVL_2, 128 },
{ 0x42, LVL_2, 256 },
{ 0x43, LVL_2, 512 },
{ 0x44, LVL_2, 1024 },
{ 0x45, LVL_2, 2048 },
{ 0x66, LVL_1_DATA, 8 },
{ 0x67, LVL_1_DATA, 16 },
{ 0x68, LVL_1_DATA, 32 },
{ 0x70, LVL_TRACE, 12 },
{ 0x71, LVL_TRACE, 16 },
{ 0x72, LVL_TRACE, 32 },
{ 0x79, LVL_2, 128 },
{ 0x7a, LVL_2, 256 },
{ 0x7b, LVL_2, 512 },
{ 0x7c, LVL_2, 1024 },
{ 0x82, LVL_2, 256 },
{ 0x83, LVL_2, 512 },
{ 0x84, LVL_2, 1024 },
{ 0x85, LVL_2, 2048 },
{ 0x86, LVL_2, 512 },
{ 0x87, LVL_2, 1024 },
{ 0x00, 0, 0}
};
static void __init init_intel(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
unsigned n;
select_idle_routine(c);
if (c->cpuid_level > 1) {
/* supports eax=2 call */
int i, j, n;
int regs[4];
unsigned char *dp = (unsigned char *)regs;
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
for ( i = 0 ; i < n ; i++ ) {
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
for ( j = 0 ; j < 3 ; j++ ) {
if ( regs[j] < 0 ) regs[j] = 0;
}
/* Byte 0 is level count, not a descriptor */
for ( j = 1 ; j < 16 ; j++ ) {
unsigned char des = dp[j];
unsigned char k = 0;
/* look up this descriptor in the table */
while (cache_table[k].descriptor != 0)
{
if (cache_table[k].descriptor == des) {
switch (cache_table[k].cache_type) {
case LVL_1_INST:
l1i += cache_table[k].size;
break;
case LVL_1_DATA:
l1d += cache_table[k].size;
break;
case LVL_2:
l2 += cache_table[k].size;
break;
case LVL_3:
l3 += cache_table[k].size;
break;
case LVL_TRACE:
trace += cache_table[k].size;
break;
}
break;
}
k++;
}
}
}
if (trace)
printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
else if (l1i)
printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
if (l1d)
printk(", L1 D cache: %dK\n", l1d);
else
printk("\n");
if (l2)
printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
if (l3)
printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
c->x86_cache_size = l2 ? l2 : (l1i+l1d);
}
if (cpu_has(c, X86_FEATURE_HT))
detect_ht();
n = cpuid_eax(0x80000000);
if (n >= 0x80000008) {
unsigned eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
}
}
void __init get_cpu_vendor(struct cpuinfo_x86 *c) void __init get_cpu_vendor(struct cpuinfo_x86 *c)
{ {
...@@ -591,6 +778,8 @@ void __init get_cpu_vendor(struct cpuinfo_x86 *c) ...@@ -591,6 +778,8 @@ void __init get_cpu_vendor(struct cpuinfo_x86 *c)
if (!strcmp(v, "AuthenticAMD")) if (!strcmp(v, "AuthenticAMD"))
c->x86_vendor = X86_VENDOR_AMD; c->x86_vendor = X86_VENDOR_AMD;
else if (!strcmp(v, "GenuineIntel"))
c->x86_vendor = X86_VENDOR_INTEL;
else else
c->x86_vendor = X86_VENDOR_UNKNOWN; c->x86_vendor = X86_VENDOR_UNKNOWN;
} }
...@@ -606,7 +795,7 @@ struct cpu_model_info { ...@@ -606,7 +795,7 @@ struct cpu_model_info {
*/ */
void __init identify_cpu(struct cpuinfo_x86 *c) void __init identify_cpu(struct cpuinfo_x86 *c)
{ {
int junk, i; int i;
u32 xlvl, tfms; u32 xlvl, tfms;
c->loops_per_jiffy = loops_per_jiffy; c->loops_per_jiffy = loops_per_jiffy;
...@@ -630,7 +819,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c) ...@@ -630,7 +819,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
/* Intel-defined flags: level 0x00000001 */ /* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) { if (c->cpuid_level >= 0x00000001) {
__u32 misc; __u32 misc;
cpuid(0x00000001, &tfms, &misc, &junk, cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
&c->x86_capability[0]); &c->x86_capability[0]);
c->x86 = (tfms >> 8) & 0xf; c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf; c->x86_model = (tfms >> 4) & 0xf;
...@@ -679,9 +868,13 @@ void __init identify_cpu(struct cpuinfo_x86 *c) ...@@ -679,9 +868,13 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
init_amd(c); init_amd(c);
break; break;
case X86_VENDOR_INTEL:
init_intel(c);
break;
case X86_VENDOR_UNKNOWN: case X86_VENDOR_UNKNOWN:
default: default:
/* Not much we can do here... */ display_cacheinfo(c);
break; break;
} }
...@@ -732,7 +925,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) ...@@ -732,7 +925,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
"fxsr", "sse", "sse2", "ss", NULL, "tm", "ia64", NULL, "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
/* AMD-defined */ /* AMD-defined */
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
...@@ -751,6 +944,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) ...@@ -751,6 +944,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "tm2",
"est", NULL, "cid", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
}; };
static char *x86_power_flags[] = { static char *x86_power_flags[] = {
"ts", /* temperature sensor */ "ts", /* temperature sensor */
...@@ -790,6 +989,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) ...@@ -790,6 +989,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (c->x86_cache_size >= 0) if (c->x86_cache_size >= 0)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
#ifdef CONFIG_X86_HT
if (cpu_has_ht) {
extern int phys_proc_id[NR_CPUS];
seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
seq_printf(m, "siblings\t: %d\n", smp_num_siblings);
}
#endif
seq_printf(m, seq_printf(m,
"fpu\t\t: yes\n" "fpu\t\t: yes\n"
"fpu_exception\t: yes\n" "fpu_exception\t: yes\n"
......
...@@ -53,6 +53,10 @@ ...@@ -53,6 +53,10 @@
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/proto.h> #include <asm/proto.h>
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
/* Bitmask of currently online CPUs */ /* Bitmask of currently online CPUs */
cpumask_t cpu_online_map; cpumask_t cpu_online_map;
...@@ -66,6 +70,8 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; ...@@ -66,6 +70,8 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
/* Set when the idlers are all forked */ /* Set when the idlers are all forked */
int smp_threads_ready; int smp_threads_ready;
int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
/* /*
* Trampoline 80x86 program as an array. * Trampoline 80x86 program as an array.
*/ */
...@@ -857,6 +863,34 @@ static void __init smp_boot_cpus(unsigned int max_cpus) ...@@ -857,6 +863,34 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
Dprintk("Before bogocount - setting activated=1.\n"); Dprintk("Before bogocount - setting activated=1.\n");
} }
/*
* If Hyper-Threading is available, construct cpu_sibling_map[], so
* that we can tell the sibling CPU efficiently.
*/
if (cpu_has_ht && smp_num_siblings > 1) {
for (cpu = 0; cpu < NR_CPUS; cpu++)
cpu_sibling_map[cpu] = NO_PROC_ID;
for (cpu = 0; cpu < NR_CPUS; cpu++) {
int i;
if (!cpu_isset(cpu, cpu_callout_map))
continue;
for (i = 0; i < NR_CPUS; i++) {
if (i == cpu || !cpu_isset(i, cpu_callout_map))
continue;
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_sibling_map[cpu] = i;
break;
}
}
if (cpu_sibling_map[cpu] == NO_PROC_ID) {
smp_num_siblings = 1;
printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
}
}
}
Dprintk("Boot done.\n"); Dprintk("Boot done.\n");
/* /*
......
...@@ -194,6 +194,8 @@ EXPORT_SYMBOL(atomic_dec_and_lock); ...@@ -194,6 +194,8 @@ EXPORT_SYMBOL(atomic_dec_and_lock);
EXPORT_SYMBOL(die_chain); EXPORT_SYMBOL(die_chain);
EXPORT_SYMBOL(cpu_sibling_map);
extern void do_softirq_thunk(void); extern void do_softirq_thunk(void);
EXPORT_SYMBOL_NOVERS(do_softirq_thunk); EXPORT_SYMBOL_NOVERS(do_softirq_thunk);
......
...@@ -8,11 +8,6 @@ ...@@ -8,11 +8,6 @@
.globl copy_page .globl copy_page
.p2align 4 .p2align 4
copy_page: copy_page:
prefetch (%rsi)
prefetch 1*64(%rsi)
prefetchw (%rdi)
prefetchw 1*64(%rdi)
subq $3*8,%rsp subq $3*8,%rsp
movq %rbx,(%rsp) movq %rbx,(%rsp)
movq %r12,1*8(%rsp) movq %r12,1*8(%rsp)
...@@ -32,7 +27,7 @@ copy_page: ...@@ -32,7 +27,7 @@ copy_page:
movq 48 (%rsi), %r11 movq 48 (%rsi), %r11
movq 56 (%rsi), %r12 movq 56 (%rsi), %r12
prefetch 5*64(%rsi) prefetcht0 5*64(%rsi)
movq %rax, (%rdi) movq %rax, (%rdi)
movq %rbx, 8 (%rdi) movq %rbx, 8 (%rdi)
...@@ -43,8 +38,6 @@ copy_page: ...@@ -43,8 +38,6 @@ copy_page:
movq %r11, 48 (%rdi) movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi) movq %r12, 56 (%rdi)
prefetchw 5*64(%rdi)
leaq 64 (%rsi), %rsi leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi leaq 64 (%rdi), %rdi
......
...@@ -59,15 +59,6 @@ csum_partial_copy_generic: ...@@ -59,15 +59,6 @@ csum_partial_copy_generic:
cmpl $3*64,%edx cmpl $3*64,%edx
jle .Lignore jle .Lignore
ignore
prefetch (%rdi)
ignore
prefetch 1*64(%rdi)
ignore
prefetchw (%rsi)
ignore
prefetchw 1*64(%rsi)
.Lignore: .Lignore:
subq $7*8,%rsp subq $7*8,%rsp
movq %rbx,2*8(%rsp) movq %rbx,2*8(%rsp)
...@@ -115,7 +106,7 @@ csum_partial_copy_generic: ...@@ -115,7 +106,7 @@ csum_partial_copy_generic:
movq 56(%rdi),%r13 movq 56(%rdi),%r13
ignore 2f ignore 2f
prefetch 5*64(%rdi) prefetcht0 5*64(%rdi)
2: 2:
adcq %rbx,%rax adcq %rbx,%rax
adcq %r8,%rax adcq %r8,%rax
...@@ -146,8 +137,6 @@ csum_partial_copy_generic: ...@@ -146,8 +137,6 @@ csum_partial_copy_generic:
dest dest
movq %r13,56(%rsi) movq %r13,56(%rsi)
ignore 3f
prefetchw 5*64(%rsi)
3: 3:
leaq 64(%rdi),%rdi leaq 64(%rdi),%rdi
......
...@@ -402,6 +402,13 @@ void __init mem_init(void) ...@@ -402,6 +402,13 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize; int codesize, reservedpages, datasize, initsize;
int tmp; int tmp;
#ifdef CONFIG_SWIOTLB
if (!iommu_aperture && end_pfn >= 0xffffffff>>PAGE_SHIFT) {
swiotlb_init();
swiotlb = 1;
}
#endif
/* How many end-of-memory variables you have, grandma! */ /* How many end-of-memory variables you have, grandma! */
max_low_pfn = end_pfn; max_low_pfn = end_pfn;
max_pfn = end_pfn; max_pfn = end_pfn;
......
# #
# oprofile for x86-64. # oprofile for x86-64.
# Just reuse the one from i386. The Hammer performance counters # Just reuse the one from i386.
# are similar to Athlon.
# #
obj-$(CONFIG_OPROFILE) += oprofile.o obj-$(CONFIG_OPROFILE) += oprofile.o
...@@ -13,7 +12,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ ...@@ -13,7 +12,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
timer_int.o ) timer_int.o )
OPROFILE-y := init.o OPROFILE-y := init.o
OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
op_model_ppro.o
OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#ifndef __ASM_X8664_CPUFEATURE_H #ifndef __ASM_X8664_CPUFEATURE_H
#define __ASM_X8664_CPUFEATURE_H #define __ASM_X8664_CPUFEATURE_H
#define NCAPINTS 4 /* Currently we have 4 32-bit words worth of info */ #define NCAPINTS 5 /* Currently we have 5 32-bit words worth of info */
/* Intel-defined CPU features, CPUID level 0x00000001, word 0 */ /* Intel-defined CPU features, CPUID level 0x00000001, word 0 */
#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
#define X86_FEATURE_SELFSNOOP (0*32+27) /* CPU self snoop */ #define X86_FEATURE_SELFSNOOP (0*32+27) /* CPU self snoop */
#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
#define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */ #define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */
#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ #define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
...@@ -61,6 +62,10 @@ ...@@ -61,6 +62,10 @@
#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ #define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
#define X86_FEATURE_K8_C (3*32+ 4) /* C stepping K8 */ #define X86_FEATURE_K8_C (3*32+ 4) /* C stepping K8 */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
#define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */
#define cpu_has(c, bit) test_bit(bit, (c)->x86_capability) #define cpu_has(c, bit) test_bit(bit, (c)->x86_capability)
#define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability) #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability)
...@@ -76,7 +81,7 @@ ...@@ -76,7 +81,7 @@
#define cpu_has_mmx 1 #define cpu_has_mmx 1
#define cpu_has_fxsr 1 #define cpu_has_fxsr 1
#define cpu_has_xmm 1 #define cpu_has_xmm 1
#define cpu_has_ht 0 /* you need to report the support from i386. sorry */ #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
#define cpu_has_mp 1 /* XXX */ #define cpu_has_mp 1 /* XXX */
#define cpu_has_k6_mtrr 0 #define cpu_has_k6_mtrr 0
#define cpu_has_cyrix_arr 0 #define cpu_has_cyrix_arr 0
......
...@@ -121,6 +121,9 @@ extern inline unsigned int cpuid_edx(unsigned int op) ...@@ -121,6 +121,9 @@ extern inline unsigned int cpuid_edx(unsigned int op)
return edx; return edx;
} }
#define MSR_IA32_UCODE_WRITE 0x79
#define MSR_IA32_UCODE_REV 0x8b
#endif #endif
...@@ -243,4 +246,123 @@ extern inline unsigned int cpuid_edx(unsigned int op) ...@@ -243,4 +246,123 @@ extern inline unsigned int cpuid_edx(unsigned int op)
#define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_ENABLE (1<<11)
#define MSR_IA32_APICBASE_BASE (0xfffff<<12) #define MSR_IA32_APICBASE_BASE (0xfffff<<12)
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x180
#define MSR_IA32_MCG_EBX 0x181
#define MSR_IA32_MCG_ECX 0x182
#define MSR_IA32_MCG_EDX 0x183
#define MSR_IA32_MCG_ESI 0x184
#define MSR_IA32_MCG_EDI 0x185
#define MSR_IA32_MCG_EBP 0x186
#define MSR_IA32_MCG_ESP 0x187
#define MSR_IA32_MCG_EFLAGS 0x188
#define MSR_IA32_MCG_EIP 0x189
#define MSR_IA32_MCG_RESERVED 0x18A
#define MSR_P6_EVNTSEL0 0x186
#define MSR_P6_EVNTSEL1 0x187
#define MSR_IA32_PERF_STATUS 0x198
#define MSR_IA32_PERF_CTL 0x199
#define MSR_IA32_THERM_CONTROL 0x19a
#define MSR_IA32_THERM_INTERRUPT 0x19b
#define MSR_IA32_THERM_STATUS 0x19c
#define MSR_IA32_MISC_ENABLE 0x1a0
#define MSR_IA32_DEBUGCTLMSR 0x1d9
#define MSR_IA32_LASTBRANCHFROMIP 0x1db
#define MSR_IA32_LASTBRANCHTOIP 0x1dc
#define MSR_IA32_LASTINTFROMIP 0x1dd
#define MSR_IA32_LASTINTTOIP 0x1de
#define MSR_IA32_MC0_CTL 0x400
#define MSR_IA32_MC0_STATUS 0x401
#define MSR_IA32_MC0_ADDR 0x402
#define MSR_IA32_MC0_MISC 0x403
/* Pentium IV performance counter MSRs */
#define MSR_P4_BPU_PERFCTR0 0x300
#define MSR_P4_BPU_PERFCTR1 0x301
#define MSR_P4_BPU_PERFCTR2 0x302
#define MSR_P4_BPU_PERFCTR3 0x303
#define MSR_P4_MS_PERFCTR0 0x304
#define MSR_P4_MS_PERFCTR1 0x305
#define MSR_P4_MS_PERFCTR2 0x306
#define MSR_P4_MS_PERFCTR3 0x307
#define MSR_P4_FLAME_PERFCTR0 0x308
#define MSR_P4_FLAME_PERFCTR1 0x309
#define MSR_P4_FLAME_PERFCTR2 0x30a
#define MSR_P4_FLAME_PERFCTR3 0x30b
#define MSR_P4_IQ_PERFCTR0 0x30c
#define MSR_P4_IQ_PERFCTR1 0x30d
#define MSR_P4_IQ_PERFCTR2 0x30e
#define MSR_P4_IQ_PERFCTR3 0x30f
#define MSR_P4_IQ_PERFCTR4 0x310
#define MSR_P4_IQ_PERFCTR5 0x311
#define MSR_P4_BPU_CCCR0 0x360
#define MSR_P4_BPU_CCCR1 0x361
#define MSR_P4_BPU_CCCR2 0x362
#define MSR_P4_BPU_CCCR3 0x363
#define MSR_P4_MS_CCCR0 0x364
#define MSR_P4_MS_CCCR1 0x365
#define MSR_P4_MS_CCCR2 0x366
#define MSR_P4_MS_CCCR3 0x367
#define MSR_P4_FLAME_CCCR0 0x368
#define MSR_P4_FLAME_CCCR1 0x369
#define MSR_P4_FLAME_CCCR2 0x36a
#define MSR_P4_FLAME_CCCR3 0x36b
#define MSR_P4_IQ_CCCR0 0x36c
#define MSR_P4_IQ_CCCR1 0x36d
#define MSR_P4_IQ_CCCR2 0x36e
#define MSR_P4_IQ_CCCR3 0x36f
#define MSR_P4_IQ_CCCR4 0x370
#define MSR_P4_IQ_CCCR5 0x371
#define MSR_P4_ALF_ESCR0 0x3ca
#define MSR_P4_ALF_ESCR1 0x3cb
#define MSR_P4_BPU_ESCR0 0x3b2
#define MSR_P4_BPU_ESCR1 0x3b3
#define MSR_P4_BSU_ESCR0 0x3a0
#define MSR_P4_BSU_ESCR1 0x3a1
#define MSR_P4_CRU_ESCR0 0x3b8
#define MSR_P4_CRU_ESCR1 0x3b9
#define MSR_P4_CRU_ESCR2 0x3cc
#define MSR_P4_CRU_ESCR3 0x3cd
#define MSR_P4_CRU_ESCR4 0x3e0
#define MSR_P4_CRU_ESCR5 0x3e1
#define MSR_P4_DAC_ESCR0 0x3a8
#define MSR_P4_DAC_ESCR1 0x3a9
#define MSR_P4_FIRM_ESCR0 0x3a4
#define MSR_P4_FIRM_ESCR1 0x3a5
#define MSR_P4_FLAME_ESCR0 0x3a6
#define MSR_P4_FLAME_ESCR1 0x3a7
#define MSR_P4_FSB_ESCR0 0x3a2
#define MSR_P4_FSB_ESCR1 0x3a3
#define MSR_P4_IQ_ESCR0 0x3ba
#define MSR_P4_IQ_ESCR1 0x3bb
#define MSR_P4_IS_ESCR0 0x3b4
#define MSR_P4_IS_ESCR1 0x3b5
#define MSR_P4_ITLB_ESCR0 0x3b6
#define MSR_P4_ITLB_ESCR1 0x3b7
#define MSR_P4_IX_ESCR0 0x3c8
#define MSR_P4_IX_ESCR1 0x3c9
#define MSR_P4_MOB_ESCR0 0x3aa
#define MSR_P4_MOB_ESCR1 0x3ab
#define MSR_P4_MS_ESCR0 0x3c0
#define MSR_P4_MS_ESCR1 0x3c1
#define MSR_P4_PMH_ESCR0 0x3ac
#define MSR_P4_PMH_ESCR1 0x3ad
#define MSR_P4_RAT_ESCR0 0x3bc
#define MSR_P4_RAT_ESCR1 0x3bd
#define MSR_P4_SAAT_ESCR0 0x3ae
#define MSR_P4_SAAT_ESCR1 0x3af
#define MSR_P4_SSU_ESCR0 0x3be
#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */
#define MSR_P4_TBPU_ESCR0 0x3c2
#define MSR_P4_TBPU_ESCR1 0x3c3
#define MSR_P4_TC_ESCR0 0x3c4
#define MSR_P4_TC_ESCR1 0x3c5
#define MSR_P4_U2L_ESCR0 0x3b0
#define MSR_P4_U2L_ESCR1 0x3b1
#endif #endif
...@@ -72,6 +72,23 @@ extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, ...@@ -72,6 +72,23 @@ extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
void *vaddr, dma_addr_t dma_handle); void *vaddr, dma_addr_t dma_handle);
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
extern dma_addr_t swiotlb_map_single (struct device *hwdev, void *ptr, size_t size,
int dir);
extern void swiotlb_unmap_single (struct device *hwdev, dma_addr_t dev_addr,
size_t size, int dir);
extern void swiotlb_sync_single (struct device *hwdev, dma_addr_t dev_addr,
size_t size, int dir);
extern void swiotlb_sync_sg (struct device *hwdev, struct scatterlist *sg, int nelems,
int dir);
extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
int nents, int direction);
extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
int nents, int direction);
#endif
#ifdef CONFIG_GART_IOMMU #ifdef CONFIG_GART_IOMMU
/* Map a single buffer of the indicated size for DMA in streaming mode. /* Map a single buffer of the indicated size for DMA in streaming mode.
...@@ -113,6 +130,13 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev, ...@@ -113,6 +130,13 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev,
size_t size, int direction) size_t size, int direction)
{ {
BUG_ON(direction == PCI_DMA_NONE); BUG_ON(direction == PCI_DMA_NONE);
#ifdef CONFIG_SWIOTLB
if (swiotlb)
return swiotlb_sync_single(&hwdev->dev,dma_handle,size,direction);
#endif
flush_write_buffers();
} }
static inline void pci_dma_sync_sg(struct pci_dev *hwdev, static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
...@@ -120,6 +144,12 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev, ...@@ -120,6 +144,12 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
int nelems, int direction) int nelems, int direction)
{ {
BUG_ON(direction == PCI_DMA_NONE); BUG_ON(direction == PCI_DMA_NONE);
#ifdef CONFIG_SWIOTLB
if (swiotlb)
return swiotlb_sync_sg(&hwdev->dev,sg,nelems,direction);
#endif
flush_write_buffers();
} }
/* The PCI address space does equal the physical memory /* The PCI address space does equal the physical memory
...@@ -272,4 +302,6 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev) ...@@ -272,4 +302,6 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev)
/* generic pci stuff */ /* generic pci stuff */
#include <asm-generic/pci.h> #include <asm-generic/pci.h>
#include <linux/dma-mapping.h>
#endif /* __x8664_PCI_H */ #endif /* __x8664_PCI_H */
...@@ -303,6 +303,67 @@ extern unsigned long get_wchan(struct task_struct *p); ...@@ -303,6 +303,67 @@ extern unsigned long get_wchan(struct task_struct *p);
(((struct pt_regs *)(tsk->thread.rsp0 - sizeof(struct pt_regs)))->rip) (((struct pt_regs *)(tsk->thread.rsp0 - sizeof(struct pt_regs)))->rip)
#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ #define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
struct microcode_header {
unsigned int hdrver;
unsigned int rev;
unsigned int date;
unsigned int sig;
unsigned int cksum;
unsigned int ldrver;
unsigned int pf;
unsigned int datasize;
unsigned int totalsize;
unsigned int reserved[3];
};
struct microcode {
struct microcode_header hdr;
unsigned int bits[0];
};
typedef struct microcode microcode_t;
typedef struct microcode_header microcode_header_t;
/* microcode format is extended from prescott processors */
struct extended_signature {
unsigned int sig;
unsigned int pf;
unsigned int cksum;
};
struct extended_sigtable {
unsigned int count;
unsigned int cksum;
unsigned int reserved[3];
struct extended_signature sigs[0];
};
/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
#define MICROCODE_IOCFREE _IO('6',0)
#define ASM_NOP1 K8_NOP1
#define ASM_NOP2 K8_NOP2
#define ASM_NOP3 K8_NOP3
#define ASM_NOP4 K8_NOP4
#define ASM_NOP5 K8_NOP5
#define ASM_NOP6 K8_NOP6
#define ASM_NOP7 K8_NOP7
#define ASM_NOP8 K8_NOP8
/* Opteron nops */
#define K8_NOP1 ".byte 0x90\n"
#define K8_NOP2 ".byte 0x66,0x90\n"
#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
#define K8_NOP5 K8_NOP3 K8_NOP2
#define K8_NOP6 K8_NOP3 K8_NOP3
#define K8_NOP7 K8_NOP4 K8_NOP3
#define K8_NOP8 K8_NOP4 K8_NOP4
#define ASM_NOP_MAX 8
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
extern inline void rep_nop(void) extern inline void rep_nop(void)
{ {
...@@ -318,31 +379,25 @@ extern inline void sync_core(void) ...@@ -318,31 +379,25 @@ extern inline void sync_core(void)
#define cpu_has_fpu 1 #define cpu_has_fpu 1
/* Some early Opteron versions incorrectly fault on prefetch (errata #91).
If this happens just jump back. */
#define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCH
static inline void prefetch(void *x) static inline void prefetch(void *x)
{ {
asm volatile("2: prefetcht0 %0\n1:\t" asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
".section __ex_table,\"a\"\n\t"
" .align 8\n\t"
" .quad 2b,1b\n\t"
".previous" :: "m" (*(unsigned long *)x));
} }
#define ARCH_HAS_PREFETCHW #define ARCH_HAS_PREFETCHW 1
static inline void prefetchw(void *x) static inline void prefetchw(void *x)
{ {
asm volatile("2: prefetchw %0\n1:\t" alternative_input(ASM_NOP4,
".section __ex_table,\"a\"\n\t" "prefetchw (%1)",
" .align 8\n\t" X86_FEATURE_3DNOW,
" .quad 2b,1b\n\t" "r" (x));
".previous" :: "m" (*(unsigned long *)x));
} }
#define ARCH_HAS_SPINLOCK_PREFETCH #define ARCH_HAS_SPINLOCK_PREFETCH 1
#define spin_lock_prefetch(x) prefetchw(x) #define spin_lock_prefetch(x) prefetchw(x)
#define cpu_relax() rep_nop() #define cpu_relax() rep_nop()
/* /*
...@@ -372,6 +427,23 @@ static inline void prefetchw(void *x) ...@@ -372,6 +427,23 @@ static inline void prefetchw(void *x)
outb((data), 0x23); \ outb((data), 0x23); \
} while (0) } while (0)
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax,%ecx,%edx;" */
asm volatile(
".byte 0x0f,0x01,0xc8;"
: :"a" (eax), "c" (ecx), "d"(edx));
}
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax,%ecx;" */
asm volatile(
".byte 0x0f,0x01,0xc9;"
: :"a" (eax), "c" (ecx));
}
#define stack_current() \ #define stack_current() \
({ \ ({ \
struct thread_info *ti; \ struct thread_info *ti; \
...@@ -379,25 +451,4 @@ static inline void prefetchw(void *x) ...@@ -379,25 +451,4 @@ static inline void prefetchw(void *x)
ti->task; \ ti->task; \
}) })
#define ASM_NOP1 K8_NOP1
#define ASM_NOP2 K8_NOP2
#define ASM_NOP3 K8_NOP3
#define ASM_NOP4 K8_NOP4
#define ASM_NOP5 K8_NOP5
#define ASM_NOP6 K8_NOP6
#define ASM_NOP7 K8_NOP7
#define ASM_NOP8 K8_NOP8
/* Opteron nops */
#define K8_NOP1 ".byte 0x90\n"
#define K8_NOP2 ".byte 0x66,0x90\n"
#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
#define K8_NOP5 K8_NOP3 K8_NOP2
#define K8_NOP6 K8_NOP3 K8_NOP3
#define K8_NOP7 K8_NOP4 K8_NOP3
#define K8_NOP8 K8_NOP4 K8_NOP4
#define ASM_NOP_MAX 8
#endif /* __ASM_X86_64_PROCESSOR_H */ #endif /* __ASM_X86_64_PROCESSOR_H */
...@@ -76,6 +76,10 @@ extern void check_ioapic(void); ...@@ -76,6 +76,10 @@ extern void check_ioapic(void);
extern int unhandled_signal(struct task_struct *tsk, int sig); extern int unhandled_signal(struct task_struct *tsk, int sig);
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void swiotlb_init(void);
extern int swiotlb;
extern unsigned long max_mapnr; extern unsigned long max_mapnr;
extern unsigned long end_pfn; extern unsigned long end_pfn;
extern unsigned long table_start, table_end; extern unsigned long table_start, table_end;
...@@ -92,6 +96,7 @@ extern int acpi_disabled; ...@@ -92,6 +96,7 @@ extern int acpi_disabled;
extern int fallback_aper_order; extern int fallback_aper_order;
extern int fallback_aper_force; extern int fallback_aper_force;
extern int iommu_aperture;
extern void smp_local_timer_interrupt(struct pt_regs * regs); extern void smp_local_timer_interrupt(struct pt_regs * regs);
......
#ifndef _ASM_SEGMENT_H #ifndef _ASM_SEGMENT_H
#define _ASM_SEGMENT_H #define _ASM_SEGMENT_H
#include <asm/cache.h>
#define __KERNEL_CS 0x10 #define __KERNEL_CS 0x10
#define __KERNEL_DS 0x18 #define __KERNEL_DS 0x18
...@@ -38,7 +40,7 @@ ...@@ -38,7 +40,7 @@
#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
#define IDT_ENTRIES 256 #define IDT_ENTRIES 256
#define GDT_ENTRIES 16 #define GDT_ENTRIES (L1_CACHE_BYTES / 8)
#define GDT_SIZE (GDT_ENTRIES * 8) #define GDT_SIZE (GDT_ENTRIES * 8)
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
......
...@@ -39,6 +39,7 @@ extern void smp_alloc_memory(void); ...@@ -39,6 +39,7 @@ extern void smp_alloc_memory(void);
extern cpumask_t cpu_online_map; extern cpumask_t cpu_online_map;
extern volatile unsigned long smp_invalidate_needed; extern volatile unsigned long smp_invalidate_needed;
extern int pic_mode; extern int pic_mode;
extern int smp_num_siblings;
extern void smp_flush_tlb(void); extern void smp_flush_tlb(void);
extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
extern void smp_send_reschedule(int cpu); extern void smp_send_reschedule(int cpu);
...@@ -46,7 +47,7 @@ extern void smp_invalidate_rcv(void); /* Process an NMI */ ...@@ -46,7 +47,7 @@ extern void smp_invalidate_rcv(void); /* Process an NMI */
extern void (*mtrr_hook) (void); extern void (*mtrr_hook) (void);
extern void zap_low_mappings(void); extern void zap_low_mappings(void);
void smp_stop_cpu(void); void smp_stop_cpu(void);
extern int cpu_sibling_map[];
#define SMP_TRAMPOLINE_BASE 0x6000 #define SMP_TRAMPOLINE_BASE 0x6000
......
...@@ -87,6 +87,56 @@ struct alt_instr { ...@@ -87,6 +87,56 @@ struct alt_instr {
}; };
#endif #endif
/*
* Alternative instructions for different CPU types or capabilities.
*
* This allows to use optimized instructions even on generic binary
* kernels.
*
* length of oldinstr must be longer or equal the length of newinstr
* It can be padded with nops as needed.
*
* For non barrier like inlines please define new variants
* without volatile and memory clobber.
*/
#define alternative(oldinstr, newinstr, feature) \
asm volatile ("661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
" .align 8\n" \
" .quad 661b\n" /* label */ \
" .quad 663f\n" /* new instruction */ \
" .byte %c0\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
".previous\n" \
".section .altinstr_replacement,\"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous" :: "i" (feature) : "memory")
/*
* Alternative inline assembly with input.
*
* Pecularities:
* No memory clobber here.
* Argument numbers start with 1.
* Best is to use constraints that are fixed size (like (%1) ... "r")
* If you use variable sized constraints like "m" or "g" in the
* replacement maake sure to pad to the worst case length.
*/
#define alternative_input(oldinstr, newinstr, feature, input) \
asm volatile ("661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
" .align 8\n" \
" .quad 661b\n" /* label */ \
" .quad 663f\n" /* new instruction */ \
" .byte %c0\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
".previous\n" \
".section .altinstr_replacement,\"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous" :: "i" (feature), input)
/* /*
* Clear and set 'TS' bit respectively * Clear and set 'TS' bit respectively
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment