Commit 06955392 authored by Changbin Du, committed by Jonathan Corbet

Documentation: x86: convert exception-tables.txt to reST

This converts the plain text documentation to reStructuredText format and
adds it to the Sphinx TOC tree. No essential content change.
Signed-off-by: Changbin Du <changbin.du@gmail.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
parent 848942cb
Kernel level exception handling in Linux .. SPDX-License-Identifier: GPL-2.0
Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com>
===============================
Kernel level exception handling
===============================
Commentary by Joerg Pommnitz <joerg@raleigh.ibm.com>
When a process runs in kernel mode, it often has to access user When a process runs in kernel mode, it often has to access user
mode memory whose address has been passed by an untrusted program. mode memory whose address has been passed by an untrusted program.
...@@ -25,9 +30,9 @@ How does this work? ...@@ -25,9 +30,9 @@ How does this work?
Whenever the kernel tries to access an address that is currently not Whenever the kernel tries to access an address that is currently not
accessible, the CPU generates a page fault exception and calls the accessible, the CPU generates a page fault exception and calls the
page fault handler page fault handler::
void do_page_fault(struct pt_regs *regs, unsigned long error_code) void do_page_fault(struct pt_regs *regs, unsigned long error_code)
in arch/x86/mm/fault.c. The parameters on the stack are set up by in arch/x86/mm/fault.c. The parameters on the stack are set up by
the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter
...@@ -57,73 +62,74 @@ as an example. The definition is somewhat hard to follow, so let's peek at ...@@ -57,73 +62,74 @@ as an example. The definition is somewhat hard to follow, so let's peek at
the code generated by the preprocessor and the compiler. I selected the code generated by the preprocessor and the compiler. I selected
the get_user call in drivers/char/sysrq.c for a detailed examination. the get_user call in drivers/char/sysrq.c for a detailed examination.
The original code in sysrq.c line 587: The original code in sysrq.c line 587::
get_user(c, buf); get_user(c, buf);
The preprocessor output (edited to become somewhat readable): The preprocessor output (edited to become somewhat readable)::
( (
{ {
long __gu_err = - 14 , __gu_val = 0; long __gu_err = - 14 , __gu_val = 0;
const __typeof__(*( ( buf ) )) *__gu_addr = ((buf)); const __typeof__(*( ( buf ) )) *__gu_addr = ((buf));
if (((((0 + current_set[0])->tss.segment) == 0x18 ) || if (((((0 + current_set[0])->tss.segment) == 0x18 ) ||
(((sizeof(*(buf))) <= 0xC0000000UL) && (((sizeof(*(buf))) <= 0xC0000000UL) &&
((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf))))))) ((unsigned long)(__gu_addr ) <= 0xC0000000UL - (sizeof(*(buf)))))))
do { do {
__gu_err = 0; __gu_err = 0;
switch ((sizeof(*(buf)))) { switch ((sizeof(*(buf)))) {
case 1: case 1:
__asm__ __volatile__( __asm__ __volatile__(
"1: mov" "b" " %2,%" "b" "1\n" "1: mov" "b" " %2,%" "b" "1\n"
"2:\n" "2:\n"
".section .fixup,\"ax\"\n" ".section .fixup,\"ax\"\n"
"3: movl %3,%0\n" "3: movl %3,%0\n"
" xor" "b" " %" "b" "1,%" "b" "1\n" " xor" "b" " %" "b" "1,%" "b" "1\n"
" jmp 2b\n" " jmp 2b\n"
".section __ex_table,\"a\"\n" ".section __ex_table,\"a\"\n"
" .align 4\n" " .align 4\n"
" .long 1b,3b\n" " .long 1b,3b\n"
".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *) ".text" : "=r"(__gu_err), "=q" (__gu_val): "m"((*(struct __large_struct *)
( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ; ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )) ;
break; break;
case 2: case 2:
__asm__ __volatile__( __asm__ __volatile__(
"1: mov" "w" " %2,%" "w" "1\n" "1: mov" "w" " %2,%" "w" "1\n"
"2:\n" "2:\n"
".section .fixup,\"ax\"\n" ".section .fixup,\"ax\"\n"
"3: movl %3,%0\n" "3: movl %3,%0\n"
" xor" "w" " %" "w" "1,%" "w" "1\n" " xor" "w" " %" "w" "1,%" "w" "1\n"
" jmp 2b\n" " jmp 2b\n"
".section __ex_table,\"a\"\n" ".section __ex_table,\"a\"\n"
" .align 4\n" " .align 4\n"
" .long 1b,3b\n" " .long 1b,3b\n"
".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err )); ( __gu_addr )) ), "i"(- 14 ), "0"( __gu_err ));
break; break;
case 4: case 4:
__asm__ __volatile__( __asm__ __volatile__(
"1: mov" "l" " %2,%" "" "1\n" "1: mov" "l" " %2,%" "" "1\n"
"2:\n" "2:\n"
".section .fixup,\"ax\"\n" ".section .fixup,\"ax\"\n"
"3: movl %3,%0\n" "3: movl %3,%0\n"
" xor" "l" " %" "" "1,%" "" "1\n" " xor" "l" " %" "" "1,%" "" "1\n"
" jmp 2b\n" " jmp 2b\n"
".section __ex_table,\"a\"\n" ".section __ex_table,\"a\"\n"
" .align 4\n" " .long 1b,3b\n" " .align 4\n" " .long 1b,3b\n"
".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *) ".text" : "=r"(__gu_err), "=r" (__gu_val) : "m"((*(struct __large_struct *)
( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err)); ( __gu_addr )) ), "i"(- 14 ), "0"(__gu_err));
break; break;
default: default:
(__gu_val) = __get_user_bad(); (__gu_val) = __get_user_bad();
} }
} while (0) ; } while (0) ;
((c)) = (__typeof__(*((buf))))__gu_val; ((c)) = (__typeof__(*((buf))))__gu_val;
__gu_err; __gu_err;
} }
); );
WOW! Black GCC/assembly magic. This is impossible to follow, so let's WOW! Black GCC/assembly magic. This is impossible to follow, so let's
see what code gcc generates: see what code gcc generates::
> xorl %edx,%edx > xorl %edx,%edx
> movl current_set,%eax > movl current_set,%eax
...@@ -154,7 +160,7 @@ understand. Can we? The actual user access is quite obvious. Thanks ...@@ -154,7 +160,7 @@ understand. Can we? The actual user access is quite obvious. Thanks
to the unified address space we can just access the address in user to the unified address space we can just access the address in user
memory. But what does the .section stuff do????? memory. But what does the .section stuff do?????
To understand this we have to look at the final kernel: To understand this we have to look at the final kernel::
> objdump --section-headers vmlinux > objdump --section-headers vmlinux
> >
...@@ -181,7 +187,7 @@ To understand this we have to look at the final kernel: ...@@ -181,7 +187,7 @@ To understand this we have to look at the final kernel:
There are obviously 2 non standard ELF sections in the generated object There are obviously 2 non standard ELF sections in the generated object
file. But first we want to find out what happened to our code in the file. But first we want to find out what happened to our code in the
final kernel executable: final kernel executable::
> objdump --disassemble --section=.text vmlinux > objdump --disassemble --section=.text vmlinux
> >
...@@ -199,7 +205,7 @@ final kernel executable: ...@@ -199,7 +205,7 @@ final kernel executable:
The whole user memory access is reduced to 10 x86 machine instructions. The whole user memory access is reduced to 10 x86 machine instructions.
The instructions bracketed in the .section directives are no longer The instructions bracketed in the .section directives are no longer
in the normal execution path. They are located in a different section in the normal execution path. They are located in a different section
of the executable file: of the executable file::
> objdump --disassemble --section=.fixup vmlinux > objdump --disassemble --section=.fixup vmlinux
> >
...@@ -207,14 +213,15 @@ of the executable file: ...@@ -207,14 +213,15 @@ of the executable file:
> c0199ffa <.fixup+10ba> xorb %dl,%dl > c0199ffa <.fixup+10ba> xorb %dl,%dl
> c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3> > c0199ffc <.fixup+10bc> jmp c017e7a7 <do_con_write+e3>
And finally: And finally::
> objdump --full-contents --section=__ex_table vmlinux > objdump --full-contents --section=__ex_table vmlinux
> >
> c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................ > c01aa7c4 93c017c0 e09f19c0 97c017c0 99c017c0 ................
> c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................ > c01aa7d4 f6c217c0 e99f19c0 a5e717c0 f59f19c0 ................
> c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................ > c01aa7e4 080a18c0 01a019c0 0a0a18c0 04a019c0 ................
or in human readable byte order: or in human readable byte order::
> c01aa7c4 c017c093 c0199fe0 c017c097 c017c099 ................ > c01aa7c4 c017c093 c0199fe0 c017c097 c017c099 ................
> c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................ > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
...@@ -222,18 +229,22 @@ or in human readable byte order: ...@@ -222,18 +229,22 @@ or in human readable byte order:
this is the interesting part! this is the interesting part!
> c01aa7e4 c0180a08 c019a001 c0180a0a c019a004 ................ > c01aa7e4 c0180a08 c019a001 c0180a0a c019a004 ................
What happened? The assembly directives What happened? The assembly directives::
.section .fixup,"ax" .section .fixup,"ax"
.section __ex_table,"a" .section __ex_table,"a"
told the assembler to move the following code to the specified told the assembler to move the following code to the specified
sections in the ELF object file. So the instructions sections in the ELF object file. So the instructions::
3: movl $-14,%eax
xorb %dl,%dl 3: movl $-14,%eax
jmp 2b xorb %dl,%dl
ended up in the .fixup section of the object file and the addresses jmp 2b
ended up in the .fixup section of the object file and the addresses::
.long 1b,3b .long 1b,3b
ended up in the __ex_table section of the object file. 1b and 3b ended up in the __ex_table section of the object file. 1b and 3b
are local labels. The local label 1b (1b stands for next label 1 are local labels. The local label 1b (1b stands for next label 1
backward) is the address of the instruction that might fault, i.e. backward) is the address of the instruction that might fault, i.e.
...@@ -246,35 +257,39 @@ the fault, in our case the actual value is c0199ff5: ...@@ -246,35 +257,39 @@ the fault, in our case the actual value is c0199ff5:
the original assembly code: > 3: movl $-14,%eax the original assembly code: > 3: movl $-14,%eax
and linked in vmlinux : > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax and linked in vmlinux : > c0199ff5 <.fixup+10b5> movl $0xfffffff2,%eax
The assembly code The assembly code::
> .section __ex_table,"a" > .section __ex_table,"a"
> .align 4 > .align 4
> .long 1b,3b > .long 1b,3b
becomes the value pair becomes the value pair::
> c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................ > c01aa7d4 c017c2f6 c0199fe9 c017e7a5 c0199ff5 ................
^this is ^this is ^this is ^this is
1b 3b 1b 3b
c017e7a5,c0199ff5 in the exception table of the kernel. c017e7a5,c0199ff5 in the exception table of the kernel.
So, what actually happens if a fault from kernel mode with no suitable So, what actually happens if a fault from kernel mode with no suitable
vma occurs? vma occurs?
1.) access to invalid address: #. access to invalid address::
> c017e7a5 <do_con_write+e1> movb (%ebx),%dl
2.) MMU generates exception > c017e7a5 <do_con_write+e1> movb (%ebx),%dl
3.) CPU calls do_page_fault #. MMU generates exception
4.) do_page_fault calls search_exception_table (regs->eip == c017e7a5); #. CPU calls do_page_fault
5.) search_exception_table looks up the address c017e7a5 in the #. do_page_fault calls search_exception_table (regs->eip == c017e7a5);
exception table (i.e. the contents of the ELF section __ex_table) #. search_exception_table looks up the address c017e7a5 in the
and returns the address of the associated fault handle code c0199ff5. exception table (i.e. the contents of the ELF section __ex_table)
6.) do_page_fault modifies its own return address to point to the fault and returns the address of the associated fault handle code c0199ff5.
handle code and returns. #. do_page_fault modifies its own return address to point to the fault
7.) execution continues in the fault handling code. handle code and returns.
8.) 8a) EAX becomes -EFAULT (== -14) #. execution continues in the fault handling code.
8b) DL becomes zero (the value we "read" from user space) #. a) EAX becomes -EFAULT (== -14)
8c) execution continues at local label 2 (address of the b) DL becomes zero (the value we "read" from user space)
instruction immediately after the faulting user access). c) execution continues at local label 2 (address of the
instruction immediately after the faulting user access).
The steps 8a to 8c in a certain way emulate the faulting instruction. The steps 8a to 8c in a certain way emulate the faulting instruction.
...@@ -295,14 +310,15 @@ Things changed when 64-bit support was added to x86 Linux. Rather than ...@@ -295,14 +310,15 @@ Things changed when 64-bit support was added to x86 Linux. Rather than
double the size of the exception table by expanding the two entries double the size of the exception table by expanding the two entries
from 32 bits to 64 bits, a clever trick was used to store addresses from 32 bits to 64 bits, a clever trick was used to store addresses
as relative offsets from the table itself. The assembly code changed as relative offsets from the table itself. The assembly code changed
from: from::
.long 1b,3b
to: .long 1b,3b
.long (from) - . to:
.long (to) - . .long (from) - .
.long (to) - .
and the C-code that uses these values converts back to absolute addresses and the C-code that uses these values converts back to absolute addresses
like this: like this::
ex_insn_addr(const struct exception_table_entry *x) ex_insn_addr(const struct exception_table_entry *x)
{ {
...@@ -313,15 +329,18 @@ In v4.6 the exception table entry was expanded with a new field "handler". ...@@ -313,15 +329,18 @@ In v4.6 the exception table entry was expanded with a new field "handler".
This is also 32-bits wide and contains a third relative function This is also 32-bits wide and contains a third relative function
pointer which points to one of: pointer which points to one of:
1) int ex_handler_default(const struct exception_table_entry *fixup) 1) ``int ex_handler_default(const struct exception_table_entry *fixup)``
This is the legacy case that just jumps to the fixup code This is the legacy case that just jumps to the fixup code
2) int ex_handler_fault(const struct exception_table_entry *fixup)
This case provides the fault number of the trap that occurred at 2) ``int ex_handler_fault(const struct exception_table_entry *fixup)``
entry->insn. It is used to distinguish page faults from machine This case provides the fault number of the trap that occurred at
check. entry->insn. It is used to distinguish page faults from machine
3) int ex_handler_ext(const struct exception_table_entry *fixup) check.
This case is used for uaccess_err ... we need to set a flag
in the task structure. Before the handler functions existed this 3) ``int ex_handler_ext(const struct exception_table_entry *fixup)``
case was handled by adding a large offset to the fixup to tag This case is used for uaccess_err ... we need to set a flag
it as special. in the task structure. Before the handler functions existed this
case was handled by adding a large offset to the fixup to tag
it as special.
More functions can easily be added. More functions can easily be added.
...@@ -10,3 +10,4 @@ x86-specific Documentation ...@@ -10,3 +10,4 @@ x86-specific Documentation
boot boot
topology topology
exception-tables
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment