Commit 7716f383 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Per-cpu cpu usage stats are now tracked

   This currently isn't printed out in the cgroupfs interface and can
   only be accessed through e.g. BPF. Should decide on a not-too-ugly
   way to show per-cpu stats in cgroupfs

 - cpuset received some cleanups and prepatory patches for the pending
   cpus.exclusive patchset which will allow cpuset partitions to be
   created below non-partition parents, which should ease the management
   of partition cpusets

 - A lot of code and documentation cleanup patches

 - tools/testing/selftests/cgroup/test_cpuset.c added

* tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (32 commits)
  cgroup: Avoid -Wstringop-overflow warnings
  cgroup:namespace: Remove unused cgroup_namespaces_init()
  cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants
  cgroup: clean up if condition in cgroup_pidlist_start()
  cgroup: fix obsolete function name in cgroup_destroy_locked()
  Documentation: cgroup-v2.rst: Correct number of stats entries
  cgroup: fix obsolete function name above css_free_rwork_fn()
  cgroup/cpuset: fix kernel-doc
  cgroup: clean up printk()
  cgroup: fix obsolete comment above cgroup_create()
  docs: cgroup-v1: fix typo
  docs: cgroup-v1: correct the term of Page Cache organization in inode
  cgroup/misc: Store atomic64_t reads to u64
  cgroup/misc: Change counters to be explicit 64bit types
  cgroup/misc: update struct members descriptions
  cgroup: remove cgrp->kn check in css_populate_dir()
  cgroup: fix obsolete function name
  cgroup: use cached local variable parent in for loop
  cgroup: remove obsolete comment above struct cgroupstats
  cgroup: put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED
  ...
parents e987af45 78d44b82
......@@ -195,11 +195,11 @@ are not accounted. We just account pages under usual VM management.
RSS pages are accounted at page_fault unless they've already been accounted
for earlier. A file page will be accounted for as Page Cache when it's
inserted into inode (radix-tree). While it's mapped into the page tables of
inserted into inode (xarray). While it's mapped into the page tables of
processes, duplicate accounting is carefully avoided.
An RSS page is unaccounted when it's fully unmapped. A PageCache page is
unaccounted when it's removed from radix-tree. Even if RSS pages are fully
unaccounted when it's removed from xarray. Even if RSS pages are fully
unmapped (by kswapd), they may exist as SwapCache in the system until they
are really freed. Such SwapCaches are also accounted.
A swapped-in page is accounted after adding into swapcache.
......@@ -907,7 +907,7 @@ experiences some pressure. In this situation, only group C will receive the
notification, i.e. groups A and B will not receive it. This is done to avoid
excessive "broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. Group B, will receive
notification only if there are no event listers for group C.
notification only if there are no event listeners for group C.
There are three optional modes that specify different propagation behavior:
......
......@@ -1045,7 +1045,7 @@ All time durations are in microseconds.
- user_usec
- system_usec
and the following three when the controller is enabled:
and the following five when the controller is enabled:
- nr_periods
- nr_throttled
......
......@@ -5255,6 +5255,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
F: Documentation/admin-guide/cgroup-v1/cpusets.rst
F: include/linux/cpuset.h
F: kernel/cgroup/cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset_prs.sh
CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
M: Johannes Weiner <hannes@cmpxchg.org>
......
......@@ -341,6 +341,20 @@ struct cgroup_rstat_cpu {
*/
struct cgroup_base_stat last_bstat;
/*
* This field is used to record the cumulative per-cpu time of
* the cgroup and its descendants. Currently it can be read via
* eBPF/drgn etc, and we are still trying to determine how to
* expose it in the cgroupfs interface.
*/
struct cgroup_base_stat subtree_bstat;
/*
* Snapshots at the last reading. These are used to calculate the
* deltas to propagate to the per-cpu subtree_bstat.
*/
struct cgroup_base_stat last_subtree_bstat;
/*
* Child cgroups with stat updates on this cpu since the last read
* are linked on the parent's ->updated_children through
......
......@@ -31,17 +31,18 @@ struct misc_cg;
* struct misc_res: Per cgroup per misc type resource
* @max: Maximum limit on the resource.
* @usage: Current usage of the resource.
* @failed: True if charged failed for the resource in a cgroup.
* @events: Number of times, the resource limit exceeded.
*/
struct misc_res {
unsigned long max;
atomic_long_t usage;
atomic_long_t events;
u64 max;
atomic64_t usage;
atomic64_t events;
};
/**
* struct misc_cg - Miscellaneous controller's cgroup structure.
* @css: cgroup subsys state object.
* @events_file: Handle for the misc resources events file.
* @res: Array of misc resources usage in the cgroup.
*/
struct misc_cg {
......@@ -53,12 +54,10 @@ struct misc_cg {
struct misc_res res[MISC_CG_RES_TYPES];
};
unsigned long misc_cg_res_total_usage(enum misc_res_type type);
int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
unsigned long amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
unsigned long amount);
u64 misc_cg_res_total_usage(enum misc_res_type type);
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
/**
* css_misc() - Get misc cgroup from the css.
......@@ -99,27 +98,26 @@ static inline void put_misc_cg(struct misc_cg *cg)
#else /* !CONFIG_CGROUP_MISC */
static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
static inline u64 misc_cg_res_total_usage(enum misc_res_type type)
{
return 0;
}
static inline int misc_cg_set_capacity(enum misc_res_type type,
unsigned long capacity)
static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
return 0;
}
static inline int misc_cg_try_charge(enum misc_res_type type,
struct misc_cg *cg,
unsigned long amount)
u64 amount)
{
return 0;
}
static inline void misc_cg_uncharge(enum misc_res_type type,
struct misc_cg *cg,
unsigned long amount)
u64 amount)
{
}
......
......@@ -24,8 +24,6 @@
* basis. This data is shared using taskstats.
*
* Most of these states are derived by looking at the task->state value
* For the nr_io_wait state, a flag in the delay accounting structure
* indicates that the task is waiting on IO
*
* Each member is aligned to a 8 byte boundary.
*/
......
......@@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
if (l->list[mid] == pid) {
index = mid;
break;
} else if (l->list[mid] <= pid)
} else if (l->list[mid] < pid)
index = mid + 1;
else
end = mid;
......
......@@ -492,28 +492,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
return &cgrp->self;
}
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
css = cgroup_css(cgrp, ss);
if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
return css;
}
/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
......@@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css);
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
* Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
......@@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task,
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
......@@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset,
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
struct cgroup_subsys_state **template)
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
......@@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
struct cftype *cfts, *failed_cfts;
int ret;
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
if (css->flags & CSS_VISIBLE)
return 0;
if (!css->ss) {
......@@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/*
* This function may be called both before and
* after cgroup_taskset_migrate(). The two cases
* after cgroup_migrate_execute(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
......@@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
struct cgroup *cgrp, int ssid)
#ifdef CONFIG_CGROUP_SCHED
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
css = cgroup_css(cgrp, ss);
if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
return css;
}
static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
......@@ -3672,15 +3673,15 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
css_put(css);
return ret;
}
#endif
static int cpu_stat_show(struct seq_file *seq, void *v)
{
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
#endif
return ret;
}
......@@ -4350,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
return 0;
}
/**
......@@ -4373,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
if (!cfts || cfts[0].name[0] == '\0')
return 0;
......@@ -4382,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
return -ENOENT;
cgroup_lock();
ret = cgroup_rm_cftypes_locked(cfts);
cgroup_rm_cftypes_locked(cfts);
cgroup_unlock();
return ret;
return 0;
}
/**
......@@ -5337,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = {
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
* css_free_work_fn().
* css_free_rwork_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
......@@ -5581,8 +5579,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
/*
* The returned cgroup is fully initialized including its control mask, but
* it isn't associated with its kernfs_node and doesn't have the control
* mask applied.
* it doesn't have the control mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
......@@ -5908,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
* cgroup_lock_live_group(). The latter makes the csets ignored by
* cgroup_kn_lock_live(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
......@@ -5930,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
......@@ -6123,7 +6120,7 @@ int __init cgroup_init(void)
continue;
if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
pr_info("Disabling %s control group subsystem in v1 mounts\n",
ss->name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
......
This diff is collapsed.
......@@ -14,7 +14,7 @@
#include <linux/misc_cgroup.h>
#define MAX_STR "max"
#define MAX_NUM ULONG_MAX
#define MAX_NUM U64_MAX
/* Miscellaneous res name, keep it in sync with enum misc_res_type */
static const char *const misc_res_name[] = {
......@@ -37,7 +37,7 @@ static struct misc_cg root_cg;
* more than the actual capacity. We are using Limits resource distribution
* model of cgroup for miscellaneous controller.
*/
static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
static u64 misc_res_capacity[MISC_CG_RES_TYPES];
/**
* parent_misc() - Get the parent of the passed misc cgroup.
......@@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
* Context: Any context.
* Return: Current total usage of the resource.
*/
unsigned long misc_cg_res_total_usage(enum misc_res_type type)
u64 misc_cg_res_total_usage(enum misc_res_type type)
{
if (valid_type(type))
return atomic_long_read(&root_cg.res[type].usage);
return atomic64_read(&root_cg.res[type].usage);
return 0;
}
......@@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
* * %0 - Successfully registered the capacity.
* * %-EINVAL - If @type is invalid.
*/
int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
if (!valid_type(type))
return -EINVAL;
......@@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
* Context: Any context.
*/
static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
unsigned long amount)
u64 amount)
{
WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
"misc cgroup resource %s became less than 0",
misc_res_name[type]);
}
......@@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
* * -EBUSY - If max limit will be crossed or total usage will be more than the
* capacity.
*/
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
unsigned long amount)
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i, *j;
int ret;
struct misc_res *res;
int new_usage;
u64 new_usage;
if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
return -EINVAL;
......@@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
for (i = cg; i; i = parent_misc(i)) {
res = &i->res[type];
new_usage = atomic_long_add_return(amount, &res->usage);
new_usage = atomic64_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
ret = -EBUSY;
......@@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
err_charge:
for (j = i; j; j = parent_misc(j)) {
atomic_long_inc(&j->res[type].events);
atomic64_inc(&j->res[type].events);
cgroup_file_notify(&j->events_file);
}
......@@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
*
* Context: Any context.
*/
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
unsigned long amount)
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
{
struct misc_cg *i;
......@@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
{
int i;
struct misc_cg *cg = css_misc(seq_css(sf));
unsigned long max;
u64 max;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
if (READ_ONCE(misc_res_capacity[i])) {
......@@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
if (max == MAX_NUM)
seq_printf(sf, "%s max\n", misc_res_name[i]);
else
seq_printf(sf, "%s %lu\n", misc_res_name[i],
seq_printf(sf, "%s %llu\n", misc_res_name[i],
max);
}
}
......@@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
* Return:
* * >= 0 - Number of bytes processed in the input.
* * -EINVAL - If buf is not valid.
* * -ERANGE - If number is bigger than the unsigned long capacity.
* * -ERANGE - If number is bigger than the u64 capacity.
*/
static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct misc_cg *cg;
unsigned long max;
u64 max;
int ret = 0, i;
enum misc_res_type type = MISC_CG_RES_TYPES;
char *token;
......@@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
if (!strcmp(MAX_STR, buf)) {
max = MAX_NUM;
} else {
ret = kstrtoul(buf, 0, &max);
ret = kstrtou64(buf, 0, &max);
if (ret)
return ret;
}
......@@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
static int misc_cg_current_show(struct seq_file *sf, void *v)
{
int i;
unsigned long usage;
u64 usage;
struct misc_cg *cg = css_misc(seq_css(sf));
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
usage = atomic_long_read(&cg->res[i].usage);
usage = atomic64_read(&cg->res[i].usage);
if (READ_ONCE(misc_res_capacity[i]) || usage)
seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
}
return 0;
......@@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
static int misc_cg_capacity_show(struct seq_file *sf, void *v)
{
int i;
unsigned long cap;
u64 cap;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
cap = READ_ONCE(misc_res_capacity[i]);
if (cap)
seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
}
return 0;
......@@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
static int misc_events_show(struct seq_file *sf, void *v)
{
struct misc_cg *cg = css_misc(seq_css(sf));
unsigned long events, i;
u64 events;
int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
events = atomic_long_read(&cg->res[i].events);
events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
......@@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
WRITE_ONCE(cg->res[i].max, MAX_NUM);
atomic_long_set(&cg->res[i].usage, 0);
atomic64_set(&cg->res[i].usage, 0);
}
return &cg->css;
......
......@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
.install = cgroupns_install,
.owner = cgroupns_owner,
};
static __init int cgroup_namespaces_init(void)
{
return 0;
}
subsys_initcall(cgroup_namespaces_init);
......@@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *prstatc;
struct cgroup_base_stat delta;
unsigned seq;
......@@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
/* propagate per-cpu delta to cgroup and per-cpu global statistics */
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
/* propagate global delta to parent (unless that's root) */
/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
delta = rstatc->subtree_bstat;
prstatc = cgroup_rstat_cpu(parent, cpu);
cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
}
}
......
......@@ -5,5 +5,6 @@ test_freezer
test_kmem
test_kill
test_cpu
test_cpuset
test_zswap
wait_inotify
......@@ -12,6 +12,7 @@ TEST_GEN_PROGS += test_core
TEST_GEN_PROGS += test_freezer
TEST_GEN_PROGS += test_kill
TEST_GEN_PROGS += test_cpu
TEST_GEN_PROGS += test_cpuset
TEST_GEN_PROGS += test_zswap
LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
......@@ -24,4 +25,5 @@ $(OUTPUT)/test_core: cgroup_util.c
$(OUTPUT)/test_freezer: cgroup_util.c
$(OUTPUT)/test_kill: cgroup_util.c
$(OUTPUT)/test_cpu: cgroup_util.c
$(OUTPUT)/test_cpuset: cgroup_util.c
$(OUTPUT)/test_zswap: cgroup_util.c
......@@ -286,6 +286,8 @@ int cg_destroy(const char *cgroup)
{
int ret;
if (!cgroup)
return 0;
retry:
ret = rmdir(cgroup);
if (ret && errno == EBUSY) {
......
......@@ -11,6 +11,8 @@
#define USEC_PER_SEC 1000000L
#define NSEC_PER_SEC 1000000000L
#define TEST_UID 65534 /* usually nobody, any !root is fine */
/*
* Checks if two given values differ by less than err% of their sum.
*/
......
......@@ -683,7 +683,7 @@ static int test_cgcore_thread_migration(const char *root)
*/
static int test_cgcore_lesser_euid_open(const char *root)
{
const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
const uid_t test_euid = TEST_UID;
int ret = KSFT_FAIL;
char *cg_test_a = NULL, *cg_test_b = NULL;
char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/limits.h>
#include <signal.h>
#include "../kselftest.h"
#include "cgroup_util.h"
static int idle_process_fn(const char *cgroup, void *arg)
{
(void)pause();
return 0;
}
static int do_migration_fn(const char *cgroup, void *arg)
{
int object_pid = (int)(size_t)arg;
if (setuid(TEST_UID))
return EXIT_FAILURE;
// XXX checking /proc/$pid/cgroup would be quicker than wait
if (cg_enter(cgroup, object_pid) ||
cg_wait_for_proc_count(cgroup, 1))
return EXIT_FAILURE;
return EXIT_SUCCESS;
}
static int do_controller_fn(const char *cgroup, void *arg)
{
const char *child = cgroup;
const char *parent = arg;
if (setuid(TEST_UID))
return EXIT_FAILURE;
if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
return EXIT_FAILURE;
if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
return EXIT_FAILURE;
if (cg_read_strstr(child, "cgroup.controllers", "cpuset"))
return EXIT_FAILURE;
if (cg_write(parent, "cgroup.subtree_control", "-cpuset"))
return EXIT_FAILURE;
if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
return EXIT_FAILURE;
return EXIT_SUCCESS;
}
/*
* Migrate a process between two sibling cgroups.
* The success should only depend on the parent cgroup permissions and not the
* migrated process itself (cpuset controller is in place because it uses
* security_task_setscheduler() in cgroup v1).
*
* Deliberately don't set cpuset.cpus in children to avoid definining migration
* permissions between two different cpusets.
*/
static int test_cpuset_perms_object(const char *root, bool allow)
{
char *parent = NULL, *child_src = NULL, *child_dst = NULL;
char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL;
const uid_t test_euid = TEST_UID;
int object_pid = 0;
int ret = KSFT_FAIL;
parent = cg_name(root, "cpuset_test_0");
if (!parent)
goto cleanup;
parent_procs = cg_name(parent, "cgroup.procs");
if (!parent_procs)
goto cleanup;
if (cg_create(parent))
goto cleanup;
child_src = cg_name(parent, "cpuset_test_1");
if (!child_src)
goto cleanup;
child_src_procs = cg_name(child_src, "cgroup.procs");
if (!child_src_procs)
goto cleanup;
if (cg_create(child_src))
goto cleanup;
child_dst = cg_name(parent, "cpuset_test_2");
if (!child_dst)
goto cleanup;
child_dst_procs = cg_name(child_dst, "cgroup.procs");
if (!child_dst_procs)
goto cleanup;
if (cg_create(child_dst))
goto cleanup;
if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
goto cleanup;
if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") ||
cg_read_strstr(child_dst, "cgroup.controllers", "cpuset"))
goto cleanup;
/* Enable permissions along src->dst tree path */
if (chown(child_src_procs, test_euid, -1) ||
chown(child_dst_procs, test_euid, -1))
goto cleanup;
if (allow && chown(parent_procs, test_euid, -1))
goto cleanup;
/* Fork a privileged child as a test object */
object_pid = cg_run_nowait(child_src, idle_process_fn, NULL);
if (object_pid < 0)
goto cleanup;
/* Carry out migration in a child process that can drop all privileges
* (including capabilities), the main process must remain privileged for
* cleanup.
* Child process's cgroup is irrelevant but we place it into child_dst
* as hacky way to pass information about migration target to the child.
*/
if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS))
goto cleanup;
ret = KSFT_PASS;
cleanup:
if (object_pid > 0) {
(void)kill(object_pid, SIGTERM);
(void)clone_reap(object_pid, WEXITED);
}
cg_destroy(child_dst);
free(child_dst_procs);
free(child_dst);
cg_destroy(child_src);
free(child_src_procs);
free(child_src);
cg_destroy(parent);
free(parent_procs);
free(parent);
return ret;
}
static int test_cpuset_perms_object_allow(const char *root)
{
return test_cpuset_perms_object(root, true);
}
static int test_cpuset_perms_object_deny(const char *root)
{
return test_cpuset_perms_object(root, false);
}
/*
* Migrate a process between parent and child implicitely
* Implicit migration happens when a controller is enabled/disabled.
*
*/
static int test_cpuset_perms_subtree(const char *root)
{
char *parent = NULL, *child = NULL;
char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL;
const uid_t test_euid = TEST_UID;
int object_pid = 0;
int ret = KSFT_FAIL;
parent = cg_name(root, "cpuset_test_0");
if (!parent)
goto cleanup;
parent_procs = cg_name(parent, "cgroup.procs");
if (!parent_procs)
goto cleanup;
parent_subctl = cg_name(parent, "cgroup.subtree_control");
if (!parent_subctl)
goto cleanup;
if (cg_create(parent))
goto cleanup;
child = cg_name(parent, "cpuset_test_1");
if (!child)
goto cleanup;
child_procs = cg_name(child, "cgroup.procs");
if (!child_procs)
goto cleanup;
if (cg_create(child))
goto cleanup;
/* Enable permissions as in a delegated subtree */
if (chown(parent_procs, test_euid, -1) ||
chown(parent_subctl, test_euid, -1) ||
chown(child_procs, test_euid, -1))
goto cleanup;
/* Put a privileged child in the subtree and modify controller state
* from an unprivileged process, the main process remains privileged
* for cleanup.
* The unprivileged child runs in subtree too to avoid parent and
* internal-node constraing violation.
*/
object_pid = cg_run_nowait(child, idle_process_fn, NULL);
if (object_pid < 0)
goto cleanup;
if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS)
goto cleanup;
ret = KSFT_PASS;
cleanup:
if (object_pid > 0) {
(void)kill(object_pid, SIGTERM);
(void)clone_reap(object_pid, WEXITED);
}
cg_destroy(child);
free(child_procs);
free(child);
cg_destroy(parent);
free(parent_subctl);
free(parent_procs);
free(parent);
return ret;
}
#define T(x) { x, #x }
struct cpuset_test {
int (*fn)(const char *root);
const char *name;
} tests[] = {
T(test_cpuset_perms_object_allow),
T(test_cpuset_perms_object_deny),
T(test_cpuset_perms_subtree),
};
#undef T
int main(int argc, char *argv[])
{
char root[PATH_MAX];
int i, ret = EXIT_SUCCESS;
if (cg_find_unified_root(root, sizeof(root)))
ksft_exit_skip("cgroup v2 isn't mounted\n");
if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset"))
if (cg_write(root, "cgroup.subtree_control", "+cpuset"))
ksft_exit_skip("Failed to set cpuset controller\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
switch (tests[i].fn(root)) {
case KSFT_PASS:
ksft_test_result_pass("%s\n", tests[i].name);
break;
case KSFT_SKIP:
ksft_test_result_skip("%s\n", tests[i].name);
break;
default:
ret = EXIT_FAILURE;
ksft_test_result_fail("%s\n", tests[i].name);
break;
}
}
return ret;
}
......@@ -10,7 +10,7 @@
skip_test() {
echo "$1"
echo "Test SKIPPED"
exit 0
exit 4 # ksft_skip
}
[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment