Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
linux
Commits
e9d08567
Commit
e9d08567
authored
Oct 22, 2010
by
Andi Kleen
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'hwpoison-cleanups' into hwpoison
parents
df27570f
a08c80eb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
41 deletions
+32
-41
mm/memory-failure.c
mm/memory-failure.c
+32
-41
No files found.
mm/memory-failure.c
View file @
e9d08567
...
...
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a 2 bit ECC memory or cache
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
*
* In addition there is a "soft offline" entry point that allows us to stop
* using not-yet-corrupted-but-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
* here is that we can access any page asynchronous to other VM
* users, because memory failures could happen anytime and anywhere,
* possibly violating some of their assumptions. This is why this code
* has to be extremely careful. Generally it tries to use normal locking
* rules, as in get the standard locks, even if that means the
* error handling takes potentially a long time.
*
* The operation to map back from RMAP chains to processes has to walk
* the complete process list and has non linear complexity with the number
* mappings. In short it can be quite slow. But since memory corruptions
* are rare we hope to get away with this.
* here is that we can access any page asynchronously with respect to
* other VM users, because memory failures could happen anytime and
* anywhere. This could violate some of their assumptions. This is why
* this code has to be extremely careful. Generally it tries to use
* normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
* from RMAP chains to processes has to walk the complete process list and
* has non linear complexity with the number of mappings. But since memory corruptions
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
/*
...
...
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
#define DEBUG 1
/* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
...
...
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
return
0
;
/*
* page_mapping() does not accept slab page
* page_mapping() does not accept slab pages.
*/
if
(
PageSlab
(
p
))
return
-
EINVAL
;
...
...
@@ -268,7 +272,7 @@ struct to_kill {
struct
list_head
nd
;
struct
task_struct
*
tsk
;
unsigned
long
addr
;
unsigned
addr_valid
:
1
;
char
addr_valid
;
};
/*
...
...
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if
(
tk
->
addr
==
-
EFAULT
)
{
pr_
debug
(
"MCE: Unable to find user space address %lx in %s
\n
"
,
pr_
info
(
"MCE: Unable to find user space address %lx in %s
\n
"
,
page_to_pfn
(
p
),
tsk
->
comm
);
tk
->
addr_valid
=
0
;
}
...
...
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn
,
err
);
}
else
if
(
page_has_private
(
p
)
&&
!
try_to_release_page
(
p
,
GFP_NOIO
))
{
pr_
debug
(
"MCE %#lx: failed to release buffers
\n
"
,
pfn
);
pr_
info
(
"MCE %#lx: failed to release buffers
\n
"
,
pfn
);
}
else
{
ret
=
RECOVERED
;
}
...
...
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
return
(
result
==
RECOVERED
||
result
==
DELAYED
)
?
0
:
-
EBUSY
;
}
#define N_UNMAP_TRIES 5
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
...
...
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct
address_space
*
mapping
;
LIST_HEAD
(
tokill
);
int
ret
;
int
i
;
int
kill
=
1
;
struct
page
*
hpage
=
compound_head
(
p
);
...
...
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if
(
kill
)
collect_procs
(
hpage
,
&
tokill
);
/*
* try_to_unmap can fail temporarily due to races.
* Try a few times (RED-PEN better strategy?)
*/
for
(
i
=
0
;
i
<
N_UNMAP_TRIES
;
i
++
)
{
ret
=
try_to_unmap
(
hpage
,
ttu
);
if
(
ret
==
SWAP_SUCCESS
)
break
;
pr_debug
(
"MCE %#lx: try_to_unmap retry needed %d
\n
"
,
pfn
,
ret
);
}
ret
=
try_to_unmap
(
hpage
,
ttu
);
if
(
ret
!=
SWAP_SUCCESS
)
printk
(
KERN_ERR
"MCE %#lx: failed to unmap page (mapcount=%d)
\n
"
,
pfn
,
page_mapcount
(
hpage
));
...
...
@@ -1147,7 +1138,7 @@ int unpoison_memory(unsigned long pfn)
page
=
compound_head
(
p
);
if
(
!
PageHWPoison
(
p
))
{
pr_
debug
(
"MCE: Page was already unpoisoned %#lx
\n
"
,
pfn
);
pr_
info
(
"MCE: Page was already unpoisoned %#lx
\n
"
,
pfn
);
return
0
;
}
...
...
@@ -1156,7 +1147,7 @@ int unpoison_memory(unsigned long pfn)
if
(
!
get_page_unless_zero
(
page
))
{
if
(
TestClearPageHWPoison
(
p
))
atomic_long_sub
(
nr_pages
,
&
mce_bad_pages
);
pr_
debug
(
"MCE: Software-unpoisoned free page %#lx
\n
"
,
pfn
);
pr_
info
(
"MCE: Software-unpoisoned free page %#lx
\n
"
,
pfn
);
return
0
;
}
...
...
@@ -1168,7 +1159,7 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if
(
TestClearPageHWPoison
(
page
))
{
pr_
debug
(
"MCE: Software-unpoisoned page %#lx
\n
"
,
pfn
);
pr_
info
(
"MCE: Software-unpoisoned page %#lx
\n
"
,
pfn
);
atomic_long_sub
(
nr_pages
,
&
mce_bad_pages
);
freeit
=
1
;
}
...
...
@@ -1217,12 +1208,12 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
set_migratetype_isolate
(
p
);
if
(
!
get_page_unless_zero
(
compound_head
(
p
)))
{
if
(
is_free_buddy_page
(
p
))
{
pr_
debug
(
"get_any_page: %#lx free buddy page
\n
"
,
pfn
);
pr_
info
(
"get_any_page: %#lx free buddy page
\n
"
,
pfn
);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison
(
p
);
ret
=
0
;
}
else
{
pr_
debug
(
"get_any_page: %#lx: unknown zero refcount page type %lx
\n
"
,
pr_
info
(
"get_any_page: %#lx: unknown zero refcount page type %lx
\n
"
,
pfn
,
p
->
flags
);
ret
=
-
EIO
;
}
...
...
@@ -1288,7 +1279,7 @@ int soft_offline_page(struct page *page, int flags)
goto
done
;
}
if
(
!
PageLRU
(
page
))
{
pr_
debug
(
"soft_offline: %#lx: unknown non LRU page type %lx
\n
"
,
pr_
info
(
"soft_offline: %#lx: unknown non LRU page type %lx
\n
"
,
pfn
,
page
->
flags
);
return
-
EIO
;
}
...
...
@@ -1302,7 +1293,7 @@ int soft_offline_page(struct page *page, int flags)
if
(
PageHWPoison
(
page
))
{
unlock_page
(
page
);
put_page
(
page
);
pr_
debug
(
"soft offline: %#lx page already poisoned
\n
"
,
pfn
);
pr_
info
(
"soft offline: %#lx page already poisoned
\n
"
,
pfn
);
return
-
EBUSY
;
}
...
...
@@ -1323,7 +1314,7 @@ int soft_offline_page(struct page *page, int flags)
put_page
(
page
);
if
(
ret
==
1
)
{
ret
=
0
;
pr_
debug
(
"soft_offline: %#lx: invalidated
\n
"
,
pfn
);
pr_
info
(
"soft_offline: %#lx: invalidated
\n
"
,
pfn
);
goto
done
;
}
...
...
@@ -1339,13 +1330,13 @@ int soft_offline_page(struct page *page, int flags)
list_add
(
&
page
->
lru
,
&
pagelist
);
ret
=
migrate_pages
(
&
pagelist
,
new_page
,
MPOL_MF_MOVE_ALL
,
0
);
if
(
ret
)
{
pr_
debug
(
"soft offline: %#lx: migration failed %d, type %lx
\n
"
,
pr_
info
(
"soft offline: %#lx: migration failed %d, type %lx
\n
"
,
pfn
,
ret
,
page
->
flags
);
if
(
ret
>
0
)
ret
=
-
EIO
;
}
}
else
{
pr_
debug
(
"soft offline: %#lx: isolation failed: %d, page count %d, type %lx
\n
"
,
pr_
info
(
"soft offline: %#lx: isolation failed: %d, page count %d, type %lx
\n
"
,
pfn
,
ret
,
page_count
(
page
),
page
->
flags
);
}
if
(
ret
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment