Commit cefe53f8 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] /proc/sys/vm/min_free_kbytes

From: Matthew Dobson <colpatch@us.ibm.com>

This resurrects the old /proc/sys/vm/free_pages functionality: the ability to
tell page reclaim how much free memory to maintain.

This may be needed for specialised networking applications, and it provides
an interesting way to stress the kernel: set it very low so atomic
allocations can easily fail.

Also, a 16G ppc64 box currently cruises along at 1M free memory, which is
surely too little to support high-speed networking.  We have not changed that
setting here, but it is now possible to do so.

The patch also reduces the amount of free memory which the VM will maintain
in ZONE_HIGHMEM, as it is almost always wasted memory.
parent 1b0a5d8e
...@@ -540,8 +540,6 @@ running once the system is up. ...@@ -540,8 +540,6 @@ running once the system is up.
[KNL,ACPI] Mark specific memory as reserved. [KNL,ACPI] Mark specific memory as reserved.
Region of memory to be used, from ss to ss+nn. Region of memory to be used, from ss to ss+nn.
memfrac= [KNL]
meye= [HW] Set MotionEye Camera parameters meye= [HW] Set MotionEye Camera parameters
See Documentation/video4linux/meye.txt. See Documentation/video4linux/meye.txt.
......
...@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm: ...@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_background_ratio - dirty_background_ratio
- dirty_expire_centisecs - dirty_expire_centisecs
- dirty_writeback_centisecs - dirty_writeback_centisecs
- min_free_kbytes
============================================================== ==============================================================
...@@ -74,3 +75,11 @@ The number of pages the kernel reads in at once is equal to ...@@ -74,3 +75,11 @@ The number of pages the kernel reads in at once is equal to
2 ^ page-cluster. Values above 2 ^ 5 don't make much sense 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense
for swap because we only cluster swap data in 32-page groups. for swap because we only cluster swap data in 32-page groups.
==============================================================
min_free_kbytes:
This is used to force the Linux VM to keep a minimum number
of kilobytes free. The VM uses this number to compute a pages_min
value for each lowmem zone in the system. Each lowmem zone gets
a number of reserved free pages proportional to its size.
...@@ -249,6 +249,25 @@ static inline struct zone *next_zone(struct zone *zone) ...@@ -249,6 +249,25 @@ static inline struct zone *next_zone(struct zone *zone)
#define for_each_zone(zone) \ #define for_each_zone(zone) \
for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
/**
 * is_highmem - test whether a zone is its node's ZONE_HIGHMEM zone.
 * Keeps direct references to ZONE_{DMA/NORMAL/HIGHMEM/etc} out of
 * generic code as much as possible.
 * @zone: pointer to the zone being tested
 */
static inline int is_highmem(struct zone *zone)
{
	/* A zone's index is its offset within its node's zone array */
	int idx = zone - zone->zone_pgdat->node_zones;

	return idx == ZONE_HIGHMEM;
}
/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void *, size_t *);
extern void setup_per_zone_pages_min(void);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ #define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */
#else /* !CONFIG_NUMA */ #else /* !CONFIG_NUMA */
......
...@@ -156,6 +156,7 @@ enum ...@@ -156,6 +156,7 @@ enum
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ VM_SWAPPINESS=19, /* Tendency to steal mapped memory */
VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */
}; };
......
...@@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void) ...@@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void)
lock_kernel(); lock_kernel();
printk(linux_banner); printk(linux_banner);
setup_arch(&command_line); setup_arch(&command_line);
setup_per_zone_pages_min();
setup_per_cpu_areas(); setup_per_cpu_areas();
/* /*
......
...@@ -57,6 +57,7 @@ extern char core_pattern[]; ...@@ -57,6 +57,7 @@ extern char core_pattern[];
extern int cad_pid; extern int cad_pid;
extern int pid_max; extern int pid_max;
extern int sysctl_lower_zone_protection; extern int sysctl_lower_zone_protection;
extern int min_free_kbytes;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535; static int maxolduid = 65535;
...@@ -661,6 +662,16 @@ static ctl_table vm_table[] = { ...@@ -661,6 +662,16 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec, .strategy = &sysctl_intvec,
.extra1 = &zero, .extra1 = &zero,
}, },
{
.ctl_name = VM_MIN_FREE_KBYTES,
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
.proc_handler = &min_free_kbytes_sysctl_handler,
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
{ .ctl_name = 0 } { .ctl_name = 0 }
}; };
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/topology.h> #include <linux/topology.h>
#include <linux/sysctl.h>
DECLARE_BITMAP(node_online_map, MAX_NUMNODES); DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS); DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
...@@ -47,9 +48,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; ...@@ -47,9 +48,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table); EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; int min_free_kbytes = 1024;
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
/* /*
* Temporary debugging check for pages not lying within a given zone. * Temporary debugging check for pages not lying within a given zone.
...@@ -1205,7 +1204,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, ...@@ -1205,7 +1204,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
for (j = 0; j < MAX_NR_ZONES; j++) { for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j; struct zone *zone = pgdat->node_zones + j;
unsigned long mask;
unsigned long size, realsize; unsigned long size, realsize;
unsigned long batch; unsigned long batch;
...@@ -1279,15 +1277,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, ...@@ -1279,15 +1277,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
pgdat->nr_zones = j+1; pgdat->nr_zones = j+1;
mask = (realsize / zone_balance_ratio[j]);
if (mask < zone_balance_min[j])
mask = zone_balance_min[j];
else if (mask > zone_balance_max[j])
mask = zone_balance_max[j];
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->zone_mem_map = lmem_map; zone->zone_mem_map = lmem_map;
zone->zone_start_pfn = zone_start_pfn; zone->zone_start_pfn = zone_start_pfn;
...@@ -1372,19 +1361,6 @@ void __init free_area_init(unsigned long *zones_size) ...@@ -1372,19 +1361,6 @@ void __init free_area_init(unsigned long *zones_size)
} }
#endif #endif
static int __init setup_mem_frac(char *str)
{
int j = 0;
while (get_option(&str, &zone_balance_ratio[j++]) == 2);
printk("setup_mem_frac: ");
for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
printk("\n");
return 1;
}
__setup("memfrac=", setup_mem_frac);
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
#include <linux/seq_file.h> #include <linux/seq_file.h>
...@@ -1561,3 +1537,64 @@ void __init page_alloc_init(void) ...@@ -1561,3 +1537,64 @@ void __init page_alloc_init(void)
init_page_alloc_cpu(smp_processor_id()); init_page_alloc_cpu(smp_processor_id());
register_cpu_notifier(&page_alloc_nb); register_cpu_notifier(&page_alloc_nb);
} }
/*
 * setup_per_zone_pages_min - called at boot and whenever min_free_kbytes
 * changes.  Recomputes the pages_{min,low,high} watermarks of every zone
 * so that min_free_kbytes worth of memory is kept free, distributed
 * across the lowmem zones in proportion to their size.
 */
void setup_per_zone_pages_min(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone)
		if (!is_highmem(zone))
			lowmem_pages += zone->present_pages;

	for_each_zone(zone) {
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (is_highmem(zone)) {
			/*
			 * Often, highmem doesn't need to reserve any pages.
			 * But the pages_min/low/high values are also used for
			 * batching up page reclaim activity so we need a
			 * decent value here.
			 */
			int min_pages;

			min_pages = zone->present_pages / 1024;
			if (min_pages < SWAP_CLUSTER_MAX)
				min_pages = SWAP_CLUSTER_MAX;
			if (min_pages > 128)
				min_pages = 128;
			zone->pages_min = min_pages;
		} else if (lowmem_pages) {
			/*
			 * A lowmem zone reserves a number of pages
			 * proportional to the zone's size.
			 */
			zone->pages_min = (pages_min * zone->present_pages) /
					   lowmem_pages;
		} else {
			/*
			 * No lowmem pages were found at all: avoid a divide
			 * by zero and fall back to a sane minimum.
			 */
			zone->pages_min = SWAP_CLUSTER_MAX;
		}
		zone->pages_low = zone->pages_min * 2;
		zone->pages_high = zone->pages_min * 3;
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}
/*
 * min_free_kbytes_sysctl_handler - wrapper around proc_dointvec() so that
 * setup_per_zone_pages_min() is rerun whenever min_free_kbytes changes.
 *
 * Returns 0 on success, or the error from proc_dointvec() if parsing or
 * copying the userspace value failed; the watermarks are only recomputed
 * after a successful update.
 */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
	struct file *file, void *buffer, size_t *length)
{
	int rc;

	rc = proc_dointvec(table, write, file, buffer, length);
	if (rc)
		return rc;
	setup_per_zone_pages_min();
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment