Commit 973adfc0 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] i386 very early memory detection cleanup patch

From: "H. Peter Anvin" <hpa@zytor.com>

This patch cleans up the very early memory setup on the i386 platform.  In
particular, it removes the hard-coded 8 MB limit completely by dynamically
creating the early-boot pagetables rather than having them hard coded.
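
To make the new behaviour easier to follow, here is a rough C model of the page-table loop the patch adds to head.S: tables are built starting right after _end (at pg0) until they cover INIT_MAP_BEYOND_END bytes past the tables themselves, and each table is wired into both the identity slot and the PAGE_OFFSET slot of swapper_pg_dir.  Only the constants and the loop structure come from the patch; the 5 MB kernel end and the printfs are invented for illustration.

    #include <stdio.h>

    #define PAGE_SIZE            4096u
    #define PTRS_PER_PTE         1024u
    #define PAGE_OFFSET          0xC0000000u
    #define INIT_MAP_BEYOND_END  (128u * 1024u)   /* value used by the patch */

    int main(void)
    {
        /* Assume (for illustration only) that the kernel image ends at 5 MB;
         * pg0, the first boot page table, starts right there. */
        unsigned long pg0        = 5ul * 1024 * 1024;
        unsigned long table      = pg0;        /* phys addr of next page table */
        unsigned long next_page  = 0;          /* next physical page to map    */
        unsigned      pde        = 0;          /* identity-mapping PDE slot    */
        unsigned      kernel_pde = PAGE_OFFSET >> 22;  /* PAGE_OFFSET PDE slot */

        do {
            /* One page table serves two PDE slots, with PRESENT+RW+USER (007). */
            printf("PDE[%4u] and PDE[%4u] -> page table at 0x%08lx | 007\n",
                   pde, kernel_pde + pde, table);

            next_page += PTRS_PER_PTE * PAGE_SIZE;  /* each table maps 4 MB */
            table     += PAGE_SIZE;                 /* the table itself     */
            pde++;

            /* Loop until the mapping reaches INIT_MAP_BEYOND_END bytes past
             * the end of the page tables built so far (head.S carries the
             * 007 attribute bits along in the same comparison). */
        } while (next_page < table + INIT_MAP_BEYOND_END);

        printf("init_pg_tables_end = 0x%08lx after %u page tables\n", table, pde);
        return 0;
    }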

While I was at it, I changed head.S so that it always sets up a local GDT;
this means among other things that SMP and VISWS are no longer special
cases, and is conceptually cleaner to boot.  The VISWS people have
confirmed it works on VISWS.
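
For reference, the boot GDT that head.S now always loads contains just the two flat 4 GB descriptors 0x00cf9a000000ffff (code) and 0x00cf92000000ffff (data), the same values the trampoline relies on.  The stand-alone sketch below merely decodes those two quadwords; it is illustrative and not part of the patch.

    #include <stdint.h>
    #include <stdio.h>

    /* Pull base, limit and type out of an x86 segment descriptor. */
    static void decode(const char *name, uint64_t d)
    {
        uint32_t base  = (uint32_t)(((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24));
        uint32_t limit = (uint32_t)((d & 0xffff) | (((d >> 48) & 0xf) << 16));
        unsigned g     = (unsigned)((d >> 55) & 1);   /* limit in 4 KB units    */
        unsigned type  = (unsigned)((d >> 40) & 0xf); /* 0xa = code, 0x2 = data */

        printf("%s: base=0x%08x limit=0x%05x granularity=%s type=0x%x\n",
               name, base, limit, g ? "4K pages (4 GB span)" : "bytes", type);
    }

    int main(void)
    {
        decode("boot CS", 0x00cf9a000000ffffull);  /* kernel 4 GB code at 0 */
        decode("boot DS", 0x00cf92000000ffffull);  /* kernel 4 GB data at 0 */
        return 0;
    }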

It also uses a separate entrypoint for non-boot processors, since this is
completely kernel-internal anyway.  This eliminates the need for the
trampoline to set %bx to flag an SMP boot.  (If you think this is a bad
idea I can eliminate this change; it just seemed cleaner to me to do it
this way.)

Additionally, zero bss with rep;stosl rather than rep;stosb.
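
A minimal C analogue of that change, assuming a 4-byte-aligned BSS (the linker script aligns __bss_stop to 4 bytes, which is what makes the word-sized clear safe); the buffer and the final check are invented for the sketch:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
        /* Stand-in for the region between __bss_start and __bss_stop. */
        static unsigned char bss[4096] __attribute__((aligned(4)));
        memset(bss, 0xAA, sizeof(bss));          /* pretend it is dirty */

        /* head.S now does: subl %edi,%ecx; shrl $2,%ecx; rep; stosl
         * i.e. byte count divided by 4, then one 32-bit store per step,
         * instead of the old one-byte-per-step rep; stosb. */
        uint32_t *p = (uint32_t *)bss;
        size_t words = sizeof(bss) >> 2;
        while (words--)
            *p++ = 0;

        printf("%s\n", memchr(bss, 0xAA, sizeof(bss)) ? "clear failed"
                                                      : "bss cleared");
        return 0;
    }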
parent e47d860c
......@@ -1335,11 +1335,6 @@ config X86_BIOS_REBOOT
depends on !(X86_VISWS || X86_VOYAGER)
default y
config X86_TRAMPOLINE
bool
depends on SMP || X86_VISWS
default y
config PC
bool
depends on X86 && !EMBEDDED
......
......@@ -150,10 +150,8 @@ int main(int argc, char ** argv)
sz = sb.st_size;
fprintf (stderr, "System is %d kB\n", sz/1024);
sys_size = (sz + 15) / 16;
/* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
die("System is too big. Try using %smodules.",
is_big_kernel ? "" : "bzImage or ");
if (!is_big_kernel && sys_size > DEF_SYSSIZE)
die("System is too big. Try using bzImage or modules.");
while (sz > 0) {
int l, n;
......
......@@ -1212,5 +1212,4 @@ CONFIG_CRC32=y
CONFIG_X86_SMP=y
CONFIG_X86_HT=y
CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
CONFIG_PC=y
......@@ -18,8 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
......
......@@ -4,9 +4,11 @@
* to extract and format the required data.
*/
#include <linux/sched.h>
#include <linux/signal.h>
#include <asm/ucontext.h>
#include "sigframe.h"
#include <asm/fixmap.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
......@@ -28,4 +30,6 @@ void foo(void)
DEFINE(RT_SIGFRAME_sigcontext,
offsetof (struct rt_sigframe, uc.uc_mcontext));
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
}
......@@ -17,7 +17,7 @@
#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm_offsets.h>
#define OLD_CL_MAGIC_ADDR 0x90020
#define OLD_CL_MAGIC 0xA33F
......@@ -40,49 +40,89 @@
#define X86_VENDOR_ID CPU_PARAMS+36 /* offset dependent on NCAPINTS */
/*
* Initialize page tables
* This is how much memory *in addition to the memory covered up to
* and including _end* we need mapped initially. We need one bit for
* each possible page, but only in low memory, which means
* 2^32/4096/8 = 128K worst case (4G/4G split.)
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
*/
#define INIT_PAGE_TABLES \
movl $pg0 - __PAGE_OFFSET, %edi; \
/* "007" doesn't mean with license to kill, but PRESENT+RW+USER */ \
movl $007, %eax; \
2: stosl; \
add $0x1000, %eax; \
cmp $empty_zero_page - __PAGE_OFFSET, %edi; \
jne 2b;
#define INIT_MAP_BEYOND_END (128*1024)
/*
* swapper_pg_dir is the main page directory, address 0x00101000
*
* On entry, %esi points to the real-mode code as a 32-bit pointer.
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
* %esi points to the real-mode code as a 32-bit pointer.
* CS and DS must be 4 GB flat segments, but we don't depend on
* any particular GDT layout, because we load our own as soon as we
* can.
*/
ENTRY(startup_32)
#ifdef CONFIG_X86_VISWS
/*
* On SGI Visual Workstations boot CPU starts in protected mode.
* Set segments to known values.
*/
orw %bx, %bx
jnz 1f
INIT_PAGE_TABLES
movl $swapper_pg_dir - __PAGE_OFFSET, %eax
movl %eax, %cr3
lgdt boot_gdt
1:
#endif
cld
lgdt boot_gdt_descr - __PAGE_OFFSET
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
/*
* Set segments to known values
* Initialize page tables. This creates a PDE and a set of page
* tables, which are located immediately beyond _end. The variable
* init_pg_tables_end is set up to point to the first "safe" location.
*
* Warning: don't use %esi or the stack in this code. However, %esp
* can be used as a GPR if you really need it...
*/
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $(pg0 - __PAGE_OFFSET), %edi
movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
10:
leal 0x007(%edi),%ecx /* Create PDE entry */
movl %ecx,(%edx) /* Store identity PDE entry */
movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
addl $4,%edx
movl $1024, %ecx
11:
stosl
addl $0x1000,%eax
loop 11b
/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
cmpl %ebp,%eax
jb 10b
movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
#ifdef CONFIG_SMP
xorl %ebx,%ebx /* This is the boot CPU (BSP) */
jmp 3f
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
* we know the trampoline has already loaded the boot_gdt_table GDT
* for us.
*/
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
#ifdef CONFIG_SMP
orw %bx,%bx
jz 1f
xorl %ebx,%ebx
incl %ebx /* This is a secondary processor (AP) */
/*
* New page tables may be in 4Mbyte page mode and may
......@@ -99,37 +139,40 @@ ENTRY(startup_32)
* not yet offset PAGE_OFFSET..
*/
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
cmpl $0,cr4_bits
je 3f
movl cr4_bits,%edx
andl %edx,%edx
jz 3f
movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
orl cr4_bits,%eax
orl %edx,%eax
movl %eax,%cr4
jmp 3f
1:
#endif
INIT_PAGE_TABLES
3:
#endif /* CONFIG_SMP */
/*
* Enable paging
*/
3:
movl $swapper_pg_dir-__PAGE_OFFSET,%eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $0x80000000,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
jmp 1f /* flush the prefetch-queue */
1:
movl $1f,%eax
jmp *%eax /* make sure eip is relocated */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
/* Set up the stack pointer */
lss stack_start,%esp
#ifdef CONFIG_SMP
orw %bx,%bx
jz 1f /* Initial CPU cleans BSS */
/*
* Initialize eflags. Some BIOS's leave bits like NT set. This would
* confuse the debugger if this code is traced.
* XXX - best to initialize before switching to protected mode.
*/
pushl $0
popfl
#ifdef CONFIG_SMP
andl %ebx,%ebx
jz 1f /* Initial CPU cleans BSS */
jmp checkCPUtype
1:
#endif /* CONFIG_SMP */
......@@ -142,21 +185,15 @@ ENTRY(startup_32)
movl $__bss_start,%edi
movl $__bss_stop,%ecx
subl %edi,%ecx
rep
stosb
shrl $2,%ecx
rep ; stosl
/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
call setup_idt
/*
* Initialize eflags. Some BIOS's leave bits like NT set. This would
* confuse the debugger if this code is traced.
* XXX - best to initialize before switching to protected mode.
*/
pushl $0
popfl
/*
* Copy bootup parameters out of the way. First 2kB of
* _empty_zero_page is for boot parameters, second 2kB
......@@ -273,7 +310,7 @@ is386: movl $2,%ecx # set MP
call initialize_secondary
jmp L6
1:
#endif
#endif /* CONFIG_SMP */
call start_kernel
L6:
jmp L6 # main should never return here, but
......@@ -309,6 +346,8 @@ check_x87:
* and the kernel moved to PAGE_OFFSET. Interrupts
* are enabled elsewhere, when we can be relatively
* sure everything is ok.
*
* Warning: %esi is live across this function.
*/
setup_idt:
lea ignore_int,%edx
......@@ -332,7 +371,7 @@ ENTRY(stack_start)
/* This is the default interrupt "handler" :-) */
int_msg:
.asciz "Unknown interrupt\n"
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
ALIGN
ignore_int:
cld
......@@ -344,9 +383,13 @@ ignore_int:
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
pushl 16(%esp)
pushl 24(%esp)
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
call printk
popl %eax
addl $(5*4),%esp
popl %ds
popl %es
popl %edx
......@@ -361,10 +404,17 @@ ignore_int:
* segment size, and 32-bit linear address value:
*/
.globl boot_gdt_descr
.globl idt_descr
.globl cpu_gdt_descr
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt_table - __PAGE_OFFSET
.word 0 # 32-bit align idt_desc.address
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
......@@ -379,41 +429,25 @@ cpu_gdt_descr:
.fill NR_CPUS-1,8,0 # space for the other GDT descriptors
/*
* This is initialized to create an identity-mapping at 0-8M (for bootup
* purposes) and another mapping of the 0-8M area at virtual address
* PAGE_OFFSET.
* swapper_pg_dir is the main page directory, address 0x00101000
*
* This is initialized to create an identity-mapping at 0 (for bootup
* purposes) and another mapping at virtual address PAGE_OFFSET. The
* values put here should be all invalid (zero); the valid
* entries are created dynamically at boot time.
*
* The code creates enough page tables to map 0-_end, the page tables
* themselves, plus INIT_MAP_BEYOND_END bytes; see comment at beginning.
*/
.org 0x1000
ENTRY(swapper_pg_dir)
.long 0x00102007
.long 0x00103007
.fill BOOT_USER_PGD_PTRS-2,4,0
/* default: 766 entries */
.long 0x00102007
.long 0x00103007
/* default: 254 entries */
.fill BOOT_KERNEL_PGD_PTRS-2,4,0
.fill 1024,4,0
/*
* The page tables are initialized to only 8MB here - the final page
* tables are set up later depending on memory size.
*/
.org 0x2000
ENTRY(pg0)
.org 0x3000
ENTRY(pg1)
/*
* empty_zero_page must immediately follow the page tables ! (The
* initialization loop counts until empty_zero_page)
*/
.org 0x4000
ENTRY(empty_zero_page)
.fill 4096,1,0
.org 0x5000
.org 0x3000
/*
* Real beginning of normal "text" segment
*/
......@@ -427,21 +461,20 @@ ENTRY(_stext)
*/
.data
/*
* The Global Descriptor Table contains 28 quadwords, per-CPU.
*/
#if defined(CONFIG_SMP) || defined(CONFIG_X86_VISWS)
/*
* The boot_gdt_table must mirror the equivalent in setup.S and is
* used only by the trampoline for booting other CPUs
* used only for booting.
*/
.align L1_CACHE_BYTES
ENTRY(boot_gdt_table)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
#endif
.align L1_CACHE_BYTES
/*
* The Global Descriptor Table contains 28 quadwords, per-CPU.
*/
.align PAGE_SIZE_asm
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* 0x0b reserved */
......@@ -488,4 +521,3 @@ ENTRY(cpu_gdt_table)
#ifdef CONFIG_SMP
.fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */
#endif
......@@ -50,6 +50,11 @@
#include "setup_arch_pre.h"
#include "mach_resources.h"
/* This value is set up by the early boot code to point to the value
immediately after the boot time page tables. It contains a *physical*
address, and must not be in the .bss segment! */
unsigned long init_pg_tables_end __initdata = ~0UL;
int disable_pse __initdata = 0;
static inline char * __init machine_specific_memory_setup(void);
......@@ -115,7 +120,6 @@ extern void early_cpu_init(void);
extern void dmi_scan_machine(void);
extern void generic_apic_probe(char *);
extern int root_mountflags;
extern char _end[];
unsigned long saved_videomode;
......@@ -790,7 +794,7 @@ static unsigned long __init setup_memory(void)
* partially used pages are not usable - thus
* we are rounding upwards:
*/
start_pfn = PFN_UP(__pa(_end));
start_pfn = PFN_UP(init_pg_tables_end);
find_max_pfn();
......@@ -1102,7 +1106,7 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = (unsigned long) _end;
init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
......
......@@ -23,9 +23,13 @@
* and IP is zero. Thus, data addresses need to be absolute
* (no relocation) and are taken with regard to r_base.
*
* If you work on this file, check the object module with objdump
* --full-contents --reloc to make sure there are no relocation
* entries except for the gdt one..
* If you work on this file, check the object module with
* objdump --reloc to make sure there are no relocation
* entries except for:
*
* TYPE VALUE
* R_386_32 startup_32_smp
* R_386_32 boot_gdt_table
*/
#include <linux/linkage.h>
......@@ -42,7 +46,6 @@ r_base = .
mov %cs, %ax # Code and data in the same place
mov %ax, %ds
mov $1, %bx # Flag an SMP trampoline
cli # We should be safe anyway
movl $0xA5A5A5A5, trampoline_data - r_base
......@@ -54,22 +57,18 @@ r_base = .
xor %ax, %ax
inc %ax # protected mode (PE) bit
lmsw %ax # into protected mode
jmp flush_instr
flush_instr:
ljmpl $__BOOT_CS, $0x00100000
# jump to startup_32 in arch/i386/kernel/head.S
boot_idt:
.word 0 # idt limit = 0
.word 0, 0 # idt base = 0L
# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
#
# NOTE: here we actually use CPU#0's GDT - but that is OK, we reload
# the proper GDT shortly after booting up the secondary CPUs.
#
ENTRY(boot_gdt)
# These need to be in the same 64K segment as the above;
# hence we don't use the boot_gdt_descr defined in head.S
boot_gdt:
.word __BOOT_DS + 7 # gdt limit
.long boot_gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU)
.long boot_gdt_table-__PAGE_OFFSET # gdt base
boot_idt:
.word 0 # idt limit = 0
.long 0 # idt base = 0L
.globl trampoline_end
trampoline_end:
......@@ -105,10 +105,15 @@ SECTIONS
__bss_start = .; /* BSS */
.bss : { *(.bss) }
. = ALIGN(4);
__bss_stop = .;
_end = . ;
/* This is where the kernel creates the early boot page tables */
. = ALIGN(4096);
pg0 = .;
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
......
......@@ -66,7 +66,7 @@ extern void find_max_pfn(void);
extern void one_highpage_init(struct page *, int, int);
extern struct e820map e820;
extern char _end;
extern unsigned long init_pg_tables_end;
extern unsigned long highend_pfn, highstart_pfn;
extern unsigned long max_low_pfn;
extern unsigned long totalram_pages;
......@@ -237,7 +237,7 @@ unsigned long __init setup_memory(void)
reserve_pages = calculate_numa_remap_pages();
/* partially used pages are not usable - thus round upwards */
system_start_pfn = min_low_pfn = PFN_UP(__pa(&_end));
system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
find_max_pfn();
system_max_low_pfn = max_low_pfn = find_max_low_pfn();
......
......@@ -173,8 +173,8 @@ extern unsigned long __PAGE_KERNEL;
*/
#undef TEST_VERIFY_AREA
/* page table for 0-4MB for everybody */
extern unsigned long pg0[1024];
/* The boot page tables (all created as a single array) */
extern unsigned long pg0[];
#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0)
......