Commit 1b96a41b authored by Linus Torvalds

Merge branch 'for-5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:
 "There are several notable changes here:

   - Single thread migrating itself has been optimized so that it
     doesn't need threadgroup rwsem anymore.

   - Freezer optimization to avoid unnecessary frozen state changes.

   - cgroup ID unification so that cgroup fs ino is the only unique ID
     used for the cgroup and can be used to directly look up live
     cgroups through filehandle interface on 64bit ino archs. On 32bit
     archs, cgroup fs ino is still the only ID in use but it is only
     unique when combined with gen.

   - selftest and other changes"

* 'for-5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (24 commits)
  writeback: fix -Wformat compilation warnings
  docs: cgroup: mm: Fix spelling of "list"
  cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root()
  cgroup: use cgrp->kn->id as the cgroup ID
  kernfs: use 64bit inos if ino_t is 64bit
  kernfs: implement custom exportfs ops and fid type
  kernfs: combine ino/id lookup functions into kernfs_find_and_get_node_by_id()
  kernfs: convert kernfs_node->id from union kernfs_node_id to u64
  kernfs: kernfs_find_and_get_node_by_ino() should only look up activated nodes
  kernfs: use dumber locking for kernfs_find_and_get_node_by_ino()
  netprio: use css ID instead of cgroup ID
  writeback: use ino_t for inodes in tracepoints
  kernfs: fix ino wrap-around detection
  kselftests: cgroup: Avoid the reuse of fd after it is deallocated
  cgroup: freezer: don't change task and cgroups status unnecessarily
  cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency
  cgroup: remove cgroup_enable_task_cg_lists() optimization
  cgroup: pids: use atomic64_t for pids->limit
  selftests: cgroup: Run test_core under interfering stress
  selftests: cgroup: Add task migration tests
  ...
parents 9391edee 40363cf1
...@@ -1334,7 +1334,7 @@ PAGE_SIZE multiple when read back. ...@@ -1334,7 +1334,7 @@ PAGE_SIZE multiple when read back.
pgdeactivate pgdeactivate
Amount of pages moved to the inactive LRU lis Amount of pages moved to the inactive LRU list
pglazyfree pglazyfree
......
...@@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn) ...@@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn)
struct kernfs_node *parent; struct kernfs_node *parent;
struct kernfs_root *root; struct kernfs_root *root;
/*
* kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino
* depends on this to filter reused stale node
*/
if (!kn || !atomic_dec_and_test(&kn->count)) if (!kn || !atomic_dec_and_test(&kn->count))
return; return;
root = kernfs_root(kn); root = kernfs_root(kn);
...@@ -536,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) ...@@ -536,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn)
kmem_cache_free(kernfs_iattrs_cache, kn->iattr); kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
} }
spin_lock(&kernfs_idr_lock); spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, kn->id.ino); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock); spin_unlock(&kernfs_idr_lock);
kmem_cache_free(kernfs_node_cache, kn); kmem_cache_free(kernfs_node_cache, kn);
...@@ -621,8 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, ...@@ -621,8 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
unsigned flags) unsigned flags)
{ {
struct kernfs_node *kn; struct kernfs_node *kn;
u32 gen; u32 id_highbits;
int cursor;
int ret; int ret;
name = kstrdup_const(name, GFP_KERNEL); name = kstrdup_const(name, GFP_KERNEL);
...@@ -635,23 +630,19 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, ...@@ -635,23 +630,19 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
idr_preload(GFP_KERNEL); idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock); spin_lock(&kernfs_idr_lock);
cursor = idr_get_cursor(&root->ino_idr);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
if (ret >= 0 && ret < cursor) if (ret >= 0 && ret < root->last_id_lowbits)
root->next_generation++; root->id_highbits++;
gen = root->next_generation; id_highbits = root->id_highbits;
root->last_id_lowbits = ret;
spin_unlock(&kernfs_idr_lock); spin_unlock(&kernfs_idr_lock);
idr_preload_end(); idr_preload_end();
if (ret < 0) if (ret < 0)
goto err_out2; goto err_out2;
kn->id.ino = ret;
kn->id.generation = gen;
/* kn->id = (u64)id_highbits << 32 | ret;
* set ino first. This RELEASE is paired with atomic_inc_not_zero in
* kernfs_find_and_get_node_by_ino atomic_set(&kn->count, 1);
*/
atomic_set_release(&kn->count, 1);
atomic_set(&kn->active, KN_DEACTIVATED_BIAS); atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb); RB_CLEAR_NODE(&kn->rb);
...@@ -680,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, ...@@ -680,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
return kn; return kn;
err_out3: err_out3:
idr_remove(&root->ino_idr, kn->id.ino); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
err_out2: err_out2:
kmem_cache_free(kernfs_node_cache, kn); kmem_cache_free(kernfs_node_cache, kn);
err_out1: err_out1:
...@@ -705,50 +696,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, ...@@ -705,50 +696,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
} }
/* /*
* kernfs_find_and_get_node_by_ino - get kernfs_node from inode number * kernfs_find_and_get_node_by_id - get kernfs_node from node id
* @root: the kernfs root * @root: the kernfs root
* @ino: inode number * @id: the target node id
*
* @id's lower 32bits encode ino and upper gen. If the gen portion is
* zero, all generations are matched.
* *
* RETURNS: * RETURNS:
* NULL on failure. Return a kernfs node with reference counter incremented * NULL on failure. Return a kernfs node with reference counter incremented
*/ */
struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
unsigned int ino) u64 id)
{ {
struct kernfs_node *kn; struct kernfs_node *kn;
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
rcu_read_lock(); spin_lock(&kernfs_idr_lock);
kn = idr_find(&root->ino_idr, ino);
kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn) if (!kn)
goto out; goto err_unlock;
/* if (sizeof(ino_t) >= sizeof(u64)) {
* Since kernfs_node is freed in RCU, it's possible an old node for ino /* we looked up with the low 32bits, compare the whole */
* is freed, but reused before RCU grace period. But a freed node (see if (kernfs_ino(kn) != ino)
* kernfs_put) or an incompletedly initialized node (see goto err_unlock;
* __kernfs_new_node) should have 'count' 0. We can use this fact to } else {
* filter out such node. /* 0 matches all generations */
*/ if (unlikely(gen && kernfs_gen(kn) != gen))
if (!atomic_inc_not_zero(&kn->count)) { goto err_unlock;
kn = NULL;
goto out;
} }
/* /*
* The node could be a new node or a reused node. If it's a new node, * ACTIVATED is protected with kernfs_mutex but it was clear when
* we are ok. If it's reused because of RCU (because of * @kn was added to idr and we just wanna see it set. No need to
* SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' * grab kernfs_mutex.
* before 'count'. So if 'count' is uptodate, 'ino' should be uptodate,
* hence we can use 'ino' to filter stale node.
*/ */
if (kn->id.ino != ino) if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
goto out; !atomic_inc_not_zero(&kn->count)))
rcu_read_unlock(); goto err_unlock;
spin_unlock(&kernfs_idr_lock);
return kn; return kn;
out: err_unlock:
rcu_read_unlock(); spin_unlock(&kernfs_idr_lock);
kernfs_put(kn);
return NULL; return NULL;
} }
...@@ -962,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, ...@@ -962,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
idr_init(&root->ino_idr); idr_init(&root->ino_idr);
INIT_LIST_HEAD(&root->supers); INIT_LIST_HEAD(&root->supers);
root->next_generation = 1;
/*
* On 64bit ino setups, id is ino. On 32bit, low 32bits are ino.
* High bits generation. The starting value for both ino and
* generation is 1. Initialize upper 32bit allocation
* accordingly.
*/
if (sizeof(ino_t) >= sizeof(u64))
root->id_highbits = 0;
else
root->id_highbits = 1;
kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
...@@ -1678,7 +1681,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) ...@@ -1678,7 +1681,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
const char *name = pos->name; const char *name = pos->name;
unsigned int type = dt_type(pos); unsigned int type = dt_type(pos);
int len = strlen(name); int len = strlen(name);
ino_t ino = pos->id.ino; ino_t ino = kernfs_ino(pos);
ctx->pos = pos->hash; ctx->pos = pos->hash;
file->private_data = pos; file->private_data = pos;
......
...@@ -892,7 +892,7 @@ static void kernfs_notify_workfn(struct work_struct *work) ...@@ -892,7 +892,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
* have the matching @file available. Look up the inodes * have the matching @file available. Look up the inodes
* and generate the events manually. * and generate the events manually.
*/ */
inode = ilookup(info->sb, kn->id.ino); inode = ilookup(info->sb, kernfs_ino(kn));
if (!inode) if (!inode)
continue; continue;
...@@ -901,7 +901,7 @@ static void kernfs_notify_workfn(struct work_struct *work) ...@@ -901,7 +901,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
if (parent) { if (parent) {
struct inode *p_inode; struct inode *p_inode;
p_inode = ilookup(info->sb, parent->id.ino); p_inode = ilookup(info->sb, kernfs_ino(parent));
if (p_inode) { if (p_inode) {
fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
inode, FSNOTIFY_EVENT_INODE, &name, 0); inode, FSNOTIFY_EVENT_INODE, &name, 0);
......
...@@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) ...@@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
inode->i_private = kn; inode->i_private = kn;
inode->i_mapping->a_ops = &kernfs_aops; inode->i_mapping->a_ops = &kernfs_aops;
inode->i_op = &kernfs_iops; inode->i_op = &kernfs_iops;
inode->i_generation = kn->id.generation; inode->i_generation = kernfs_gen(kn);
set_default_inode_attr(inode, kn->mode); set_default_inode_attr(inode, kn->mode);
kernfs_refresh_inode(kn, inode); kernfs_refresh_inode(kn, inode);
...@@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) ...@@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{ {
struct inode *inode; struct inode *inode;
inode = iget_locked(sb, kn->id.ino); inode = iget_locked(sb, kernfs_ino(kn));
if (inode && (inode->i_state & I_NEW)) if (inode && (inode->i_state & I_NEW))
kernfs_init_inode(kn, inode); kernfs_init_inode(kn, inode);
......
...@@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, ...@@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode, const char *name, umode_t mode,
kuid_t uid, kgid_t gid, kuid_t uid, kgid_t gid,
unsigned flags); unsigned flags);
struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root,
unsigned int ino);
/* /*
* file.c * file.c
......
...@@ -53,63 +53,85 @@ const struct super_operations kernfs_sops = { ...@@ -53,63 +53,85 @@ const struct super_operations kernfs_sops = {
.show_path = kernfs_sop_show_path, .show_path = kernfs_sop_show_path,
}; };
/* static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
* Similar to kernfs_fh_get_inode, this one gets kernfs node from inode struct inode *parent)
* number and generation
*/
struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root,
const union kernfs_node_id *id)
{ {
struct kernfs_node *kn; struct kernfs_node *kn = inode->i_private;
kn = kernfs_find_and_get_node_by_ino(root, id->ino); if (*max_len < 2) {
if (!kn) *max_len = 2;
return NULL; return FILEID_INVALID;
if (kn->id.generation != id->generation) {
kernfs_put(kn);
return NULL;
} }
return kn;
*max_len = 2;
*(u64 *)fh = kn->id;
return FILEID_KERNFS;
} }
static struct inode *kernfs_fh_get_inode(struct super_block *sb, static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
u64 ino, u32 generation) struct fid *fid, int fh_len,
int fh_type, bool get_parent)
{ {
struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_super_info *info = kernfs_info(sb);
struct inode *inode;
struct kernfs_node *kn; struct kernfs_node *kn;
struct inode *inode;
u64 id;
if (ino == 0) if (fh_len < 2)
return ERR_PTR(-ESTALE); return NULL;
switch (fh_type) {
case FILEID_KERNFS:
id = *(u64 *)fid;
break;
case FILEID_INO32_GEN:
case FILEID_INO32_GEN_PARENT:
/*
* blk_log_action() exposes "LOW32,HIGH32" pair without
* type and userland can call us with generic fid
* constructed from them. Combine it back to ID. See
* blk_log_action().
*/
id = ((u64)fid->i32.gen << 32) | fid->i32.ino;
break;
default:
return NULL;
}
kn = kernfs_find_and_get_node_by_ino(info->root, ino); kn = kernfs_find_and_get_node_by_id(info->root, id);
if (!kn) if (!kn)
return ERR_PTR(-ESTALE); return ERR_PTR(-ESTALE);
if (get_parent) {
struct kernfs_node *parent;
parent = kernfs_get_parent(kn);
kernfs_put(kn);
kn = parent;
if (!kn)
return ERR_PTR(-ESTALE);
}
inode = kernfs_get_inode(sb, kn); inode = kernfs_get_inode(sb, kn);
kernfs_put(kn); kernfs_put(kn);
if (!inode) if (!inode)
return ERR_PTR(-ESTALE); return ERR_PTR(-ESTALE);
if (generation && inode->i_generation != generation) { return d_obtain_alias(inode);
/* we didn't find the right inode.. */
iput(inode);
return ERR_PTR(-ESTALE);
}
return inode;
} }
static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, static struct dentry *kernfs_fh_to_dentry(struct super_block *sb,
int fh_len, int fh_type) struct fid *fid, int fh_len,
int fh_type)
{ {
return generic_fh_to_dentry(sb, fid, fh_len, fh_type, return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false);
kernfs_fh_get_inode);
} }
static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, static struct dentry *kernfs_fh_to_parent(struct super_block *sb,
int fh_len, int fh_type) struct fid *fid, int fh_len,
int fh_type)
{ {
return generic_fh_to_parent(sb, fid, fh_len, fh_type, return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true);
kernfs_fh_get_inode);
} }
static struct dentry *kernfs_get_parent_dentry(struct dentry *child) static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
...@@ -120,6 +142,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child) ...@@ -120,6 +142,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
} }
static const struct export_operations kernfs_export_ops = { static const struct export_operations kernfs_export_ops = {
.encode_fh = kernfs_encode_fh,
.fh_to_dentry = kernfs_fh_to_dentry, .fh_to_dentry = kernfs_fh_to_dentry,
.fh_to_parent = kernfs_fh_to_parent, .fh_to_parent = kernfs_fh_to_parent,
.get_parent = kernfs_get_parent_dentry, .get_parent = kernfs_get_parent_dentry,
...@@ -363,18 +386,9 @@ void kernfs_kill_sb(struct super_block *sb) ...@@ -363,18 +386,9 @@ void kernfs_kill_sb(struct super_block *sb)
void __init kernfs_init(void) void __init kernfs_init(void)
{ {
/*
* the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino
* can access the slab lock free. This could introduce stale nodes,
* please see how kernfs_find_and_get_node_by_ino filters out stale
* nodes.
*/
kernfs_node_cache = kmem_cache_create("kernfs_node_cache", kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
sizeof(struct kernfs_node), sizeof(struct kernfs_node),
0, 0, SLAB_PANIC, NULL);
SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
NULL);
/* Creates slab cache for kernfs inode attributes */ /* Creates slab cache for kernfs inode attributes */
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
......
...@@ -354,16 +354,6 @@ struct cgroup { ...@@ -354,16 +354,6 @@ struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */ unsigned long flags; /* "unsigned long" so bitops work */
/*
* idr allocated in-hierarchy ID.
*
* ID 0 is not used, the ID of the root cgroup is always 1, and a
* new cgroup will be assigned with a smallest available ID.
*
* Allocating/Removing ID must be protected by cgroup_mutex.
*/
int id;
/* /*
* The depth this cgroup is at. The root is at depth zero and each * The depth this cgroup is at. The root is at depth zero and each
* step down the hierarchy increments the level. This along with * step down the hierarchy increments the level. This along with
...@@ -458,7 +448,7 @@ struct cgroup { ...@@ -458,7 +448,7 @@ struct cgroup {
struct list_head rstat_css_list; struct list_head rstat_css_list;
/* cgroup basic resource statistics */ /* cgroup basic resource statistics */
struct cgroup_base_stat pending_bstat; /* pending from children */ struct cgroup_base_stat last_bstat;
struct cgroup_base_stat bstat; struct cgroup_base_stat bstat;
struct prev_cputime prev_cputime; /* for printing out cputime */ struct prev_cputime prev_cputime; /* for printing out cputime */
...@@ -488,7 +478,7 @@ struct cgroup { ...@@ -488,7 +478,7 @@ struct cgroup {
struct cgroup_freezer_state freezer; struct cgroup_freezer_state freezer;
/* ids of the ancestors at each level including self */ /* ids of the ancestors at each level including self */
int ancestor_ids[]; u64 ancestor_ids[];
}; };
/* /*
...@@ -509,7 +499,7 @@ struct cgroup_root { ...@@ -509,7 +499,7 @@ struct cgroup_root {
struct cgroup cgrp; struct cgroup cgrp;
/* for cgrp->ancestor_ids[0] */ /* for cgrp->ancestor_ids[0] */
int cgrp_ancestor_id_storage; u64 cgrp_ancestor_id_storage;
/* Number of cgroups in the hierarchy, used only for /proc/cgroups */ /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
atomic_t nr_cgrps; atomic_t nr_cgrps;
...@@ -520,9 +510,6 @@ struct cgroup_root { ...@@ -520,9 +510,6 @@ struct cgroup_root {
/* Hierarchy-specific flags */ /* Hierarchy-specific flags */
unsigned int flags; unsigned int flags;
/* IDs for cgroups in this hierarchy */
struct idr cgroup_idr;
/* The path to use for release notifications. */ /* The path to use for release notifications. */
char release_agent_path[PATH_MAX]; char release_agent_path[PATH_MAX];
......
...@@ -150,7 +150,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, ...@@ -150,7 +150,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp); struct cgroup_subsys_state **dst_cssp);
void cgroup_enable_task_cg_lists(void);
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it); struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it);
...@@ -305,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); ...@@ -305,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it);
* Inline functions. * Inline functions.
*/ */
static inline u64 cgroup_id(struct cgroup *cgrp)
{
return cgrp->kn->id;
}
/** /**
* css_get - obtain a reference on the specified css * css_get - obtain a reference on the specified css
* @css: target css * @css: target css
...@@ -566,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, ...@@ -566,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
{ {
if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
return false; return false;
return cgrp->ancestor_ids[ancestor->level] == ancestor->id; return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
} }
/** /**
...@@ -617,7 +621,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) ...@@ -617,7 +621,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp)
/* returns ino associated with a cgroup */ /* returns ino associated with a cgroup */
static inline ino_t cgroup_ino(struct cgroup *cgrp) static inline ino_t cgroup_ino(struct cgroup *cgrp)
{ {
return cgrp->kn->id.ino; return kernfs_ino(cgrp->kn);
} }
/* cft/css accessors for cftype->write() operation */ /* cft/css accessors for cftype->write() operation */
...@@ -688,18 +692,13 @@ static inline void cgroup_kthread_ready(void) ...@@ -688,18 +692,13 @@ static inline void cgroup_kthread_ready(void)
current->no_cgroup_migration = 0; current->no_cgroup_migration = 0;
} }
static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
{
return &cgrp->kn->id;
}
void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
char *buf, size_t buflen);
#else /* !CONFIG_CGROUPS */ #else /* !CONFIG_CGROUPS */
struct cgroup_subsys_state; struct cgroup_subsys_state;
struct cgroup; struct cgroup;
/* !CONFIG_CGROUPS stub: cgroup support is compiled out, always report ID 1 */
static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from, static inline int cgroup_attach_task_all(struct task_struct *from,
...@@ -719,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } ...@@ -719,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; } static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {} static inline void cgroup_kthread_ready(void) {}
static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
{
return NULL;
}
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{ {
...@@ -740,8 +735,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, ...@@ -740,8 +735,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
return true; return true;
} }
static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
char *buf, size_t buflen) {} {}
#endif /* !CONFIG_CGROUPS */ #endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUPS #ifdef CONFIG_CGROUPS
......
...@@ -104,6 +104,11 @@ enum fid_type { ...@@ -104,6 +104,11 @@ enum fid_type {
*/ */
FILEID_LUSTRE = 0x97, FILEID_LUSTRE = 0x97,
/*
* 64 bit unique kernfs id
*/
FILEID_KERNFS = 0xfe,
/* /*
* Filesystems must not use 0xff file ID. * Filesystems must not use 0xff file ID.
*/ */
......
...@@ -104,21 +104,6 @@ struct kernfs_elem_attr { ...@@ -104,21 +104,6 @@ struct kernfs_elem_attr {
struct kernfs_node *notify_next; /* for kernfs_notify() */ struct kernfs_node *notify_next; /* for kernfs_notify() */
}; };
/* represent a kernfs node */
union kernfs_node_id {
struct {
/*
* blktrace will export this struct as a simplified 'struct
* fid' (which is a big data struction), so userspace can use
* it to find kernfs node. The layout must match the first two
* fields of 'struct fid' exactly.
*/
u32 ino;
u32 generation;
};
u64 id;
};
/* /*
* kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs_node - the building block of kernfs hierarchy. Each and every
* kernfs node is represented by single kernfs_node. Most fields are * kernfs node is represented by single kernfs_node. Most fields are
...@@ -155,7 +140,12 @@ struct kernfs_node { ...@@ -155,7 +140,12 @@ struct kernfs_node {
void *priv; void *priv;
union kernfs_node_id id; /*
* 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit,
* the low 32bits are ino and upper generation.
*/
u64 id;
unsigned short flags; unsigned short flags;
umode_t mode; umode_t mode;
struct kernfs_iattrs *iattr; struct kernfs_iattrs *iattr;
...@@ -187,7 +177,8 @@ struct kernfs_root { ...@@ -187,7 +177,8 @@ struct kernfs_root {
/* private fields, do not use outside kernfs proper */ /* private fields, do not use outside kernfs proper */
struct idr ino_idr; struct idr ino_idr;
u32 next_generation; u32 last_id_lowbits;
u32 id_highbits;
struct kernfs_syscall_ops *syscall_ops; struct kernfs_syscall_ops *syscall_ops;
/* list of kernfs_super_info of this root, protected by kernfs_mutex */ /* list of kernfs_super_info of this root, protected by kernfs_mutex */
...@@ -291,6 +282,34 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) ...@@ -291,6 +282,34 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
return kn->flags & KERNFS_TYPE_MASK; return kn->flags & KERNFS_TYPE_MASK;
} }
/*
 * kernfs_id_ino - extract the inode number from a 64bit kernfs node id
 * @id: kernfs node id
 *
 * If ino_t is at least as wide as u64, the whole @id is the ino and is
 * returned unchanged. Otherwise only the low 32 bits of @id are the ino.
 */
static inline ino_t kernfs_id_ino(u64 id)
{
	/* narrow ino_t: the ino lives in the low 32 bits */
	if (sizeof(ino_t) < sizeof(u64))
		return (u32)id;

	/* wide ino_t: id and ino are one and the same */
	return id;
}
/*
 * kernfs_id_gen - extract the generation number from a 64bit kernfs node id
 * @id: kernfs node id
 *
 * If ino_t is narrower than u64, the generation is the high 32 bits of
 * @id. Otherwise the generation is fixed at 1.
 */
static inline u32 kernfs_id_gen(u64 id)
{
	/* narrow ino_t: the generation lives in the high 32 bits */
	if (sizeof(ino_t) < sizeof(u64))
		return id >> 32;

	/* wide ino_t: no generation is encoded, report the constant 1 */
	return 1;
}
static inline ino_t kernfs_ino(struct kernfs_node *kn)
{
return kernfs_id_ino(kn->id);
}
static inline ino_t kernfs_gen(struct kernfs_node *kn)
{
return kernfs_id_gen(kn->id);
}
/** /**
* kernfs_enable_ns - enable namespace under a directory * kernfs_enable_ns - enable namespace under a directory
* @kn: directory of interest, should be empty * @kn: directory of interest, should be empty
...@@ -382,8 +401,8 @@ void kernfs_kill_sb(struct super_block *sb); ...@@ -382,8 +401,8 @@ void kernfs_kill_sb(struct super_block *sb);
void kernfs_init(void); void kernfs_init(void);
struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
const union kernfs_node_id *id); u64 id);
#else /* CONFIG_KERNFS */ #else /* CONFIG_KERNFS */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
......
...@@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p) ...@@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
rcu_read_lock(); rcu_read_lock();
css = task_css(p, net_prio_cgrp_id); css = task_css(p, net_prio_cgrp_id);
idx = css->cgroup->id; idx = css->id;
rcu_read_unlock(); rcu_read_unlock();
return idx; return idx;
} }
......
...@@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, ...@@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup,
TP_fast_assign( TP_fast_assign(
__entry->root = cgrp->root->hierarchy_id; __entry->root = cgrp->root->hierarchy_id;
__entry->id = cgrp->id; __entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level; __entry->level = cgrp->level;
__assign_str(path, path); __assign_str(path, path);
), ),
...@@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, ...@@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
TP_fast_assign( TP_fast_assign(
__entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_root = dst_cgrp->root->hierarchy_id;
__entry->dst_id = dst_cgrp->id; __entry->dst_id = cgroup_id(dst_cgrp);
__entry->dst_level = dst_cgrp->level; __entry->dst_level = dst_cgrp->level;
__assign_str(dst_path, path); __assign_str(dst_path, path);
__entry->pid = task->pid; __entry->pid = task->pid;
...@@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, ...@@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event,
TP_fast_assign( TP_fast_assign(
__entry->root = cgrp->root->hierarchy_id; __entry->root = cgrp->root->hierarchy_id;
__entry->id = cgrp->id; __entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level; __entry->level = cgrp->level;
__assign_str(path, path); __assign_str(path, path);
__entry->val = val; __entry->val = val;
......
...@@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, ...@@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(writeback_page_template,
TP_STRUCT__entry ( TP_STRUCT__entry (
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(pgoff_t, index) __field(pgoff_t, index)
), ),
...@@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, ...@@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(writeback_page_template,
TP_printk("bdi %s: ino=%lu index=%lu", TP_printk("bdi %s: ino=%lu index=%lu",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
__entry->index __entry->index
) )
); );
...@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ...@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
TP_STRUCT__entry ( TP_STRUCT__entry (
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned long, state) __field(unsigned long, state)
__field(unsigned long, flags) __field(unsigned long, flags)
), ),
...@@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ...@@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
TP_printk("bdi %s: ino=%lu state=%s flags=%s", TP_printk("bdi %s: ino=%lu state=%s flags=%s",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
show_inode_state(__entry->state), show_inode_state(__entry->state),
show_inode_state(__entry->flags) show_inode_state(__entry->flags)
) )
...@@ -150,28 +150,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, ...@@ -150,28 +150,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
#ifdef CREATE_TRACE_POINTS #ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{ {
return wb->memcg_css->cgroup->kn->id.ino; return cgroup_ino(wb->memcg_css->cgroup);
} }
static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{ {
if (wbc->wb) if (wbc->wb)
return __trace_wb_assign_cgroup(wbc->wb); return __trace_wb_assign_cgroup(wbc->wb);
else else
return -1U; return 1;
} }
#else /* CONFIG_CGROUP_WRITEBACK */ #else /* CONFIG_CGROUP_WRITEBACK */
static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{ {
return -1U; return 1;
} }
static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{ {
return -1U; return 1;
} }
#endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CONFIG_CGROUP_WRITEBACK */
...@@ -187,8 +187,8 @@ TRACE_EVENT(inode_foreign_history, ...@@ -187,8 +187,8 @@ TRACE_EVENT(inode_foreign_history,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
__field(unsigned int, history) __field(unsigned int, history)
), ),
...@@ -199,10 +199,10 @@ TRACE_EVENT(inode_foreign_history, ...@@ -199,10 +199,10 @@ TRACE_EVENT(inode_foreign_history,
__entry->history = history; __entry->history = history;
), ),
TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x", TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
__entry->cgroup_ino, (unsigned long)__entry->cgroup_ino,
__entry->history __entry->history
) )
); );
...@@ -216,9 +216,9 @@ TRACE_EVENT(inode_switch_wbs, ...@@ -216,9 +216,9 @@ TRACE_EVENT(inode_switch_wbs,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned int, old_cgroup_ino) __field(ino_t, old_cgroup_ino)
__field(unsigned int, new_cgroup_ino) __field(ino_t, new_cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -228,11 +228,11 @@ TRACE_EVENT(inode_switch_wbs, ...@@ -228,11 +228,11 @@ TRACE_EVENT(inode_switch_wbs,
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
), ),
TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u", TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
__entry->old_cgroup_ino, (unsigned long)__entry->old_cgroup_ino,
__entry->new_cgroup_ino (unsigned long)__entry->new_cgroup_ino
) )
); );
...@@ -245,10 +245,10 @@ TRACE_EVENT(track_foreign_dirty, ...@@ -245,10 +245,10 @@ TRACE_EVENT(track_foreign_dirty,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(u64, bdi_id) __field(u64, bdi_id)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned int, memcg_id) __field(unsigned int, memcg_id)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
__field(unsigned int, page_cgroup_ino) __field(ino_t, page_cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -260,16 +260,16 @@ TRACE_EVENT(track_foreign_dirty, ...@@ -260,16 +260,16 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0; __entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id; __entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
__entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
), ),
TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u", TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->bdi_id, __entry->bdi_id,
__entry->ino, (unsigned long)__entry->ino,
__entry->memcg_id, __entry->memcg_id,
__entry->cgroup_ino, (unsigned long)__entry->cgroup_ino,
__entry->page_cgroup_ino (unsigned long)__entry->page_cgroup_ino
) )
); );
...@@ -282,7 +282,7 @@ TRACE_EVENT(flush_foreign, ...@@ -282,7 +282,7 @@ TRACE_EVENT(flush_foreign,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
__field(unsigned int, frn_bdi_id) __field(unsigned int, frn_bdi_id)
__field(unsigned int, frn_memcg_id) __field(unsigned int, frn_memcg_id)
), ),
...@@ -294,9 +294,9 @@ TRACE_EVENT(flush_foreign, ...@@ -294,9 +294,9 @@ TRACE_EVENT(flush_foreign,
__entry->frn_memcg_id = frn_memcg_id; __entry->frn_memcg_id = frn_memcg_id;
), ),
TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u", TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
__entry->name, __entry->name,
__entry->cgroup_ino, (unsigned long)__entry->cgroup_ino,
__entry->frn_bdi_id, __entry->frn_bdi_id,
__entry->frn_memcg_id __entry->frn_memcg_id
) )
...@@ -311,9 +311,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ...@@ -311,9 +311,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
TP_STRUCT__entry ( TP_STRUCT__entry (
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(int, sync_mode) __field(int, sync_mode)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -324,11 +324,11 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ...@@ -324,11 +324,11 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
__entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
), ),
TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u", TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
__entry->sync_mode, __entry->sync_mode,
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, ...@@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__field(int, range_cyclic) __field(int, range_cyclic)
__field(int, for_background) __field(int, for_background)
__field(int, reason) __field(int, reason)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
strscpy_pad(__entry->name, strscpy_pad(__entry->name,
...@@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, ...@@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
), ),
TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
"kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u", "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
__entry->name, __entry->name,
MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
__entry->nr_pages, __entry->nr_pages,
...@@ -383,7 +383,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, ...@@ -383,7 +383,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__entry->range_cyclic, __entry->range_cyclic,
__entry->for_background, __entry->for_background,
__print_symbolic(__entry->reason, WB_WORK_REASON), __print_symbolic(__entry->reason, WB_WORK_REASON),
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
#define DEFINE_WRITEBACK_WORK_EVENT(name) \ #define DEFINE_WRITEBACK_WORK_EVENT(name) \
...@@ -413,15 +413,15 @@ DECLARE_EVENT_CLASS(writeback_class, ...@@ -413,15 +413,15 @@ DECLARE_EVENT_CLASS(writeback_class,
TP_ARGS(wb), TP_ARGS(wb),
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
), ),
TP_printk("bdi %s: cgroup_ino=%u", TP_printk("bdi %s: cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
#define DEFINE_WRITEBACK_EVENT(name) \ #define DEFINE_WRITEBACK_EVENT(name) \
...@@ -459,7 +459,7 @@ DECLARE_EVENT_CLASS(wbc_class, ...@@ -459,7 +459,7 @@ DECLARE_EVENT_CLASS(wbc_class,
__field(int, range_cyclic) __field(int, range_cyclic)
__field(long, range_start) __field(long, range_start)
__field(long, range_end) __field(long, range_end)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -478,7 +478,7 @@ DECLARE_EVENT_CLASS(wbc_class, ...@@ -478,7 +478,7 @@ DECLARE_EVENT_CLASS(wbc_class,
TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
"bgrd=%d reclm=%d cyclic=%d " "bgrd=%d reclm=%d cyclic=%d "
"start=0x%lx end=0x%lx cgroup_ino=%u", "start=0x%lx end=0x%lx cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->nr_to_write, __entry->nr_to_write,
__entry->pages_skipped, __entry->pages_skipped,
...@@ -489,7 +489,7 @@ DECLARE_EVENT_CLASS(wbc_class, ...@@ -489,7 +489,7 @@ DECLARE_EVENT_CLASS(wbc_class,
__entry->range_cyclic, __entry->range_cyclic,
__entry->range_start, __entry->range_start,
__entry->range_end, __entry->range_end,
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
) )
...@@ -510,7 +510,7 @@ TRACE_EVENT(writeback_queue_io, ...@@ -510,7 +510,7 @@ TRACE_EVENT(writeback_queue_io,
__field(long, age) __field(long, age)
__field(int, moved) __field(int, moved)
__field(int, reason) __field(int, reason)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
unsigned long *older_than_this = work->older_than_this; unsigned long *older_than_this = work->older_than_this;
...@@ -522,13 +522,13 @@ TRACE_EVENT(writeback_queue_io, ...@@ -522,13 +522,13 @@ TRACE_EVENT(writeback_queue_io,
__entry->reason = work->reason; __entry->reason = work->reason;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
), ),
TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u", TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->older, /* older_than_this in jiffies */ __entry->older, /* older_than_this in jiffies */
__entry->age, /* older_than_this in relative milliseconds */ __entry->age, /* older_than_this in relative milliseconds */
__entry->moved, __entry->moved,
__print_symbolic(__entry->reason, WB_WORK_REASON), __print_symbolic(__entry->reason, WB_WORK_REASON),
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -596,7 +596,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ...@@ -596,7 +596,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
__field(unsigned long, dirty_ratelimit) __field(unsigned long, dirty_ratelimit)
__field(unsigned long, task_ratelimit) __field(unsigned long, task_ratelimit)
__field(unsigned long, balanced_dirty_ratelimit) __field(unsigned long, balanced_dirty_ratelimit)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -614,7 +614,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ...@@ -614,7 +614,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
TP_printk("bdi %s: " TP_printk("bdi %s: "
"write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu "
"balanced_dirty_ratelimit=%lu cgroup_ino=%u", "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
__entry->bdi, __entry->bdi,
__entry->write_bw, /* write bandwidth */ __entry->write_bw, /* write bandwidth */
__entry->avg_write_bw, /* avg write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */
...@@ -622,7 +622,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ...@@ -622,7 +622,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
__entry->dirty_ratelimit, /* base ratelimit */ __entry->dirty_ratelimit, /* base ratelimit */
__entry->task_ratelimit, /* ratelimit with position control */ __entry->task_ratelimit, /* ratelimit with position control */
__entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -660,7 +660,7 @@ TRACE_EVENT(balance_dirty_pages, ...@@ -660,7 +660,7 @@ TRACE_EVENT(balance_dirty_pages,
__field( long, pause) __field( long, pause)
__field(unsigned long, period) __field(unsigned long, period)
__field( long, think) __field( long, think)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -692,7 +692,7 @@ TRACE_EVENT(balance_dirty_pages, ...@@ -692,7 +692,7 @@ TRACE_EVENT(balance_dirty_pages,
"bdi_setpoint=%lu bdi_dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu "
"dirty_ratelimit=%lu task_ratelimit=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu "
"dirtied=%u dirtied_pause=%u " "dirtied=%u dirtied_pause=%u "
"paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u", "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
__entry->bdi, __entry->bdi,
__entry->limit, __entry->limit,
__entry->setpoint, __entry->setpoint,
...@@ -707,7 +707,7 @@ TRACE_EVENT(balance_dirty_pages, ...@@ -707,7 +707,7 @@ TRACE_EVENT(balance_dirty_pages,
__entry->pause, /* ms */ __entry->pause, /* ms */
__entry->period, /* ms */ __entry->period, /* ms */
__entry->think, /* ms */ __entry->think, /* ms */
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -718,10 +718,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ...@@ -718,10 +718,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned long, state) __field(unsigned long, state)
__field(unsigned long, dirtied_when) __field(unsigned long, dirtied_when)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -733,13 +733,13 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ...@@ -733,13 +733,13 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
__entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode));
), ),
TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u", TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
show_inode_state(__entry->state), show_inode_state(__entry->state),
__entry->dirtied_when, __entry->dirtied_when,
(jiffies - __entry->dirtied_when) / HZ, (jiffies - __entry->dirtied_when) / HZ,
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -789,13 +789,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ...@@ -789,13 +789,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
TP_STRUCT__entry( TP_STRUCT__entry(
__array(char, name, 32) __array(char, name, 32)
__field(unsigned long, ino) __field(ino_t, ino)
__field(unsigned long, state) __field(unsigned long, state)
__field(unsigned long, dirtied_when) __field(unsigned long, dirtied_when)
__field(unsigned long, writeback_index) __field(unsigned long, writeback_index)
__field(long, nr_to_write) __field(long, nr_to_write)
__field(unsigned long, wrote) __field(unsigned long, wrote)
__field(unsigned int, cgroup_ino) __field(ino_t, cgroup_ino)
), ),
TP_fast_assign( TP_fast_assign(
...@@ -811,16 +811,16 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ...@@ -811,16 +811,16 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
), ),
TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
"index=%lu to_write=%ld wrote=%lu cgroup_ino=%u", "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
__entry->name, __entry->name,
__entry->ino, (unsigned long)__entry->ino,
show_inode_state(__entry->state), show_inode_state(__entry->state),
__entry->dirtied_when, __entry->dirtied_when,
(jiffies - __entry->dirtied_when) / HZ, (jiffies - __entry->dirtied_when) / HZ,
__entry->writeback_index, __entry->writeback_index,
__entry->nr_to_write, __entry->nr_to_write,
__entry->wrote, __entry->wrote,
__entry->cgroup_ino (unsigned long)__entry->cgroup_ino
) )
); );
...@@ -845,7 +845,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, ...@@ -845,7 +845,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
TP_STRUCT__entry( TP_STRUCT__entry(
__field( dev_t, dev ) __field( dev_t, dev )
__field(unsigned long, ino ) __field( ino_t, ino )
__field(unsigned long, state ) __field(unsigned long, state )
__field( __u16, mode ) __field( __u16, mode )
__field(unsigned long, dirtied_when ) __field(unsigned long, dirtied_when )
...@@ -861,7 +861,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, ...@@ -861,7 +861,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino, __entry->dirtied_when, (unsigned long)__entry->ino, __entry->dirtied_when,
show_inode_state(__entry->state), __entry->mode) show_inode_state(__entry->state), __entry->mode)
); );
......
...@@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) ...@@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id)
{ {
struct cgroup *cgrp = task_dfl_cgroup(current); struct cgroup *cgrp = task_dfl_cgroup(current);
return cgrp->kn->id.id; return cgroup_id(cgrp);
} }
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
......
...@@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, ...@@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
return; return;
storage->key.attach_type = type; storage->key.attach_type = type;
storage->key.cgroup_inode_id = cgroup->kn->id.id; storage->key.cgroup_inode_id = cgroup_id(cgroup);
map = storage->map; map = storage->map;
......
...@@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, ...@@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup); bool threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem); __acquires(&cgroup_threadgroup_rwsem);
void cgroup_procs_write_finish(struct task_struct *task) void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem); __releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp); void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
......
...@@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ...@@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task; struct task_struct *task;
const struct cred *cred, *tcred; const struct cred *cred, *tcred;
ssize_t ret; ssize_t ret;
bool locked;
cgrp = cgroup_kn_lock_live(of->kn, false); cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp) if (!cgrp)
return -ENODEV; return -ENODEV;
task = cgroup_procs_write_start(buf, threadgroup); task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task); ret = PTR_ERR_OR_ZERO(task);
if (ret) if (ret)
goto out_unlock; goto out_unlock;
...@@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ...@@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup); ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish: out_finish:
cgroup_procs_write_finish(task); cgroup_procs_write_finish(task, locked);
out_unlock: out_unlock:
cgroup_kn_unlock(of->kn); cgroup_kn_unlock(of->kn);
......
...@@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task, ...@@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task,
/* /*
* We are synchronized through cgroup_threadgroup_rwsem * We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race * against PF_EXITING setting such that we can't race
* against cgroup_exit() changing the css_set to * against cgroup_exit()/cgroup_free() dropping the css_set.
* init_css_set and dropping the old one.
*/ */
WARN_ON_ONCE(task->flags & PF_EXITING); WARN_ON_ONCE(task->flags & PF_EXITING);
...@@ -1309,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) ...@@ -1309,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root)
{ {
if (root) { kfree(root);
idr_destroy(&root->cgroup_idr);
kfree(root);
}
} }
static void cgroup_destroy_root(struct cgroup_root *root) static void cgroup_destroy_root(struct cgroup_root *root)
...@@ -1374,6 +1370,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) ...@@ -1374,6 +1370,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
cset = current->nsproxy->cgroup_ns->root_cset; cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) { if (cset == &init_css_set) {
res = &root->cgrp; res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else { } else {
struct cgrp_cset_link *link; struct cgrp_cset_link *link;
...@@ -1430,9 +1428,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, ...@@ -1430,9 +1428,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root) struct cgroup_root *root)
{ {
/* /*
* No need to lock the task - since we hold cgroup_mutex the * No need to lock the task - since we hold css_set_lock the
* task can't change groups, so the only thing that can happen * task can't change groups.
* is that it exits and its css is set back to init_css_set.
*/ */
return cset_cgroup_from_root(task_css_set(task), root); return cset_cgroup_from_root(task_css_set(task), root);
} }
...@@ -1883,65 +1880,6 @@ static int cgroup_reconfigure(struct fs_context *fc) ...@@ -1883,65 +1880,6 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0; return 0;
} }
/*
* To reduce the fork() overhead for systems that are not actually using
* their cgroups capability, we don't maintain the lists running through
* each css_set to its tasks until we see the list actually used - in other
* words after the first mount.
*/
static bool use_task_css_set_links __read_mostly;
void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
spin_lock_irq(&css_set_lock);
if (use_task_css_set_links)
goto out_unlock;
use_task_css_set_links = true;
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
*
* Interrupts were already disabled while acquiring
* the css_set_lock, so we do not need to disable it
* again when acquiring the sighand->siglock here.
*/
spin_lock(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
if (!css_set_populated(cset))
css_set_update_populated(cset, true);
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
cset->nr_tasks++;
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
out_unlock:
spin_unlock_irq(&css_set_lock);
read_unlock(&tasklist_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp) static void init_cgroup_housekeeping(struct cgroup *cgrp)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
...@@ -1976,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) ...@@ -1976,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
atomic_set(&root->nr_cgrps, 1); atomic_set(&root->nr_cgrps, 1);
cgrp->root = root; cgrp->root = root;
init_cgroup_housekeeping(cgrp); init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
root->flags = ctx->flags; root->flags = ctx->flags;
if (ctx->release_agent) if (ctx->release_agent)
...@@ -1997,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ...@@ -1997,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_mutex);
ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL); 0, GFP_KERNEL);
if (ret) if (ret)
...@@ -2035,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ...@@ -2035,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto exit_root_id; goto exit_root_id;
} }
root_cgrp->kn = root->kf_root->kn; root_cgrp->kn = root->kf_root->kn;
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
ret = css_populate_dir(&root_cgrp->self); ret = css_populate_dir(&root_cgrp->self);
if (ret) if (ret)
...@@ -2188,13 +2121,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) ...@@ -2188,13 +2121,6 @@ static int cgroup_init_fs_context(struct fs_context *fc)
if (!ctx) if (!ctx)
return -ENOMEM; return -ENOMEM;
/*
* The first time anyone tries to mount a cgroup, enable the list
* linking each css_set to its tasks and fix up all existing tasks.
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
ctx->ns = current->nsproxy->cgroup_ns; ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns); get_cgroup_ns(ctx->ns);
fc->fs_private = &ctx->kfc; fc->fs_private = &ctx->kfc;
...@@ -2372,9 +2298,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, ...@@ -2372,9 +2298,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (task->flags & PF_EXITING) if (task->flags & PF_EXITING)
return; return;
/* leave @task alone if post_fork() hasn't linked it yet */ /* cgroup_threadgroup_rwsem protects racing against forks */
if (list_empty(&task->cg_list)) WARN_ON_ONCE(list_empty(&task->cg_list));
return;
cset = task_css_set(task); cset = task_css_set(task);
if (!cset->mg_src_cgrp) if (!cset->mg_src_cgrp)
...@@ -2825,7 +2750,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, ...@@ -2825,7 +2750,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
return ret; return ret;
} }
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem) __acquires(&cgroup_threadgroup_rwsem)
{ {
struct task_struct *tsk; struct task_struct *tsk;
...@@ -2834,7 +2760,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) ...@@ -2834,7 +2760,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem); /*
* If we migrate a single thread, we don't care about threadgroup
* stability. If the thread is `current`, it won't exit(2) under our
* hands or change PID through exec(2). We exclude
* cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
* callers by cgroup_mutex.
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
if (pid || threadgroup) {
percpu_down_write(&cgroup_threadgroup_rwsem);
*locked = true;
} else {
*locked = false;
}
rcu_read_lock(); rcu_read_lock();
if (pid) { if (pid) {
...@@ -2865,13 +2805,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) ...@@ -2865,13 +2805,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
goto out_unlock_rcu; goto out_unlock_rcu;
out_unlock_threadgroup: out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem); if (*locked) {
percpu_up_write(&cgroup_threadgroup_rwsem);
*locked = false;
}
out_unlock_rcu: out_unlock_rcu:
rcu_read_unlock(); rcu_read_unlock();
return tsk; return tsk;
} }
void cgroup_procs_write_finish(struct task_struct *task) void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
...@@ -2880,7 +2823,8 @@ void cgroup_procs_write_finish(struct task_struct *task) ...@@ -2880,7 +2823,8 @@ void cgroup_procs_write_finish(struct task_struct *task)
/* release reference from cgroup_procs_write_start() */ /* release reference from cgroup_procs_write_start() */
put_task_struct(task); put_task_struct(task);
percpu_up_write(&cgroup_threadgroup_rwsem); if (locked)
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid) for_each_subsys(ss, ssid)
if (ss->post_attach) if (ss->post_attach)
ss->post_attach(); ss->post_attach();
...@@ -3601,22 +3545,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) ...@@ -3601,22 +3545,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
#ifdef CONFIG_PSI #ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{ {
struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_IO); return psi_show(seq, psi, PSI_IO);
} }
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{ {
struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_MEM); return psi_show(seq, psi, PSI_MEM);
} }
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{ {
struct cgroup *cgroup = seq_css(seq)->cgroup; struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_CPU); return psi_show(seq, psi, PSI_CPU);
} }
...@@ -4568,9 +4512,6 @@ static void css_task_iter_advance(struct css_task_iter *it) ...@@ -4568,9 +4512,6 @@ static void css_task_iter_advance(struct css_task_iter *it)
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it) struct css_task_iter *it)
{ {
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
memset(it, 0, sizeof(*it)); memset(it, 0, sizeof(*it));
spin_lock_irq(&css_set_lock); spin_lock_irq(&css_set_lock);
...@@ -4755,12 +4696,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ...@@ -4755,12 +4696,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
struct cgroup *src_cgrp, *dst_cgrp; struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task; struct task_struct *task;
ssize_t ret; ssize_t ret;
bool locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false); dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp) if (!dst_cgrp)
return -ENODEV; return -ENODEV;
task = cgroup_procs_write_start(buf, true); task = cgroup_procs_write_start(buf, true, &locked);
ret = PTR_ERR_OR_ZERO(task); ret = PTR_ERR_OR_ZERO(task);
if (ret) if (ret)
goto out_unlock; goto out_unlock;
...@@ -4778,7 +4720,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ...@@ -4778,7 +4720,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(dst_cgrp, task, true); ret = cgroup_attach_task(dst_cgrp, task, true);
out_finish: out_finish:
cgroup_procs_write_finish(task); cgroup_procs_write_finish(task, locked);
out_unlock: out_unlock:
cgroup_kn_unlock(of->kn); cgroup_kn_unlock(of->kn);
...@@ -4796,6 +4738,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ...@@ -4796,6 +4738,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
struct cgroup *src_cgrp, *dst_cgrp; struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task; struct task_struct *task;
ssize_t ret; ssize_t ret;
bool locked;
buf = strstrip(buf); buf = strstrip(buf);
...@@ -4803,7 +4746,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ...@@ -4803,7 +4746,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
if (!dst_cgrp) if (!dst_cgrp)
return -ENODEV; return -ENODEV;
task = cgroup_procs_write_start(buf, false); task = cgroup_procs_write_start(buf, false, &locked);
ret = PTR_ERR_OR_ZERO(task); ret = PTR_ERR_OR_ZERO(task);
if (ret) if (ret)
goto out_unlock; goto out_unlock;
...@@ -4827,7 +4770,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ...@@ -4827,7 +4770,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(dst_cgrp, task, false); ret = cgroup_attach_task(dst_cgrp, task, false);
out_finish: out_finish:
cgroup_procs_write_finish(task); cgroup_procs_write_finish(task, locked);
out_unlock: out_unlock:
cgroup_kn_unlock(of->kn); cgroup_kn_unlock(of->kn);
...@@ -5037,9 +4980,6 @@ static void css_release_work_fn(struct work_struct *work) ...@@ -5037,9 +4980,6 @@ static void css_release_work_fn(struct work_struct *work)
tcgrp->nr_dying_descendants--; tcgrp->nr_dying_descendants--;
spin_unlock_irq(&css_set_lock); spin_unlock_irq(&css_set_lock);
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
/* /*
* There are two control paths which try to determine * There are two control paths which try to determine
* cgroup from dentry without going through kernfs - * cgroup from dentry without going through kernfs -
...@@ -5204,10 +5144,12 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, ...@@ -5204,10 +5144,12 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
* it isn't associated with its kernfs_node and doesn't have the control * it isn't associated with its kernfs_node and doesn't have the control
* mask applied. * mask applied.
*/ */
static struct cgroup *cgroup_create(struct cgroup *parent) static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
{ {
struct cgroup_root *root = parent->root; struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp; struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
int level = parent->level + 1; int level = parent->level + 1;
int ret; int ret;
...@@ -5227,15 +5169,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ...@@ -5227,15 +5169,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
goto out_cancel_ref; goto out_cancel_ref;
} }
/* /* create the directory */
* Temporarily set the pointer to NULL, so idr_find() won't return kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
* a half-baked cgroup. if (IS_ERR(kn)) {
*/ ret = PTR_ERR(kn);
cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_stat_exit; goto out_stat_exit;
} }
cgrp->kn = kn;
init_cgroup_housekeeping(cgrp); init_cgroup_housekeeping(cgrp);
...@@ -5245,7 +5185,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ...@@ -5245,7 +5185,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
ret = psi_cgroup_alloc(cgrp); ret = psi_cgroup_alloc(cgrp);
if (ret) if (ret)
goto out_idr_free; goto out_kernfs_remove;
ret = cgroup_bpf_inherit(cgrp); ret = cgroup_bpf_inherit(cgrp);
if (ret) if (ret)
...@@ -5269,7 +5209,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ...@@ -5269,7 +5209,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
spin_lock_irq(&css_set_lock); spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
if (tcgrp != cgrp) { if (tcgrp != cgrp) {
tcgrp->nr_descendants++; tcgrp->nr_descendants++;
...@@ -5298,12 +5238,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ...@@ -5298,12 +5238,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
atomic_inc(&root->nr_cgrps); atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent); cgroup_get_live(parent);
/*
* @cgrp is now fully operational. If something fails after this
* point, it'll be released via the normal destruction path.
*/
cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
/* /*
* On the default hierarchy, a child doesn't automatically inherit * On the default hierarchy, a child doesn't automatically inherit
* subtree_control from the parent. Each is configured manually. * subtree_control from the parent. Each is configured manually.
...@@ -5317,8 +5251,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ...@@ -5317,8 +5251,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
out_psi_free: out_psi_free:
psi_cgroup_free(cgrp); psi_cgroup_free(cgrp);
out_idr_free: out_kernfs_remove:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id); kernfs_remove(cgrp->kn);
out_stat_exit: out_stat_exit:
if (cgroup_on_dfl(parent)) if (cgroup_on_dfl(parent))
cgroup_rstat_exit(cgrp); cgroup_rstat_exit(cgrp);
...@@ -5355,7 +5289,6 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent) ...@@ -5355,7 +5289,6 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{ {
struct cgroup *parent, *cgrp; struct cgroup *parent, *cgrp;
struct kernfs_node *kn;
int ret; int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
...@@ -5371,27 +5304,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) ...@@ -5371,27 +5304,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
goto out_unlock; goto out_unlock;
} }
cgrp = cgroup_create(parent); cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) { if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp); ret = PTR_ERR(cgrp);
goto out_unlock; goto out_unlock;
} }
/* create the directory */
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_destroy;
}
cgrp->kn = kn;
/* /*
* This extra ref will be put in cgroup_free_fn() and guarantees * This extra ref will be put in cgroup_free_fn() and guarantees
* that @cgrp->kn is always accessible. * that @cgrp->kn is always accessible.
*/ */
kernfs_get(kn); kernfs_get(cgrp->kn);
ret = cgroup_kn_set_ugid(kn); ret = cgroup_kn_set_ugid(cgrp->kn);
if (ret) if (ret)
goto out_destroy; goto out_destroy;
...@@ -5406,7 +5331,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) ...@@ -5406,7 +5331,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
TRACE_CGROUP_PATH(mkdir, cgrp); TRACE_CGROUP_PATH(mkdir, cgrp);
/* let's create and online css's */ /* let's create and online css's */
kernfs_activate(kn); kernfs_activate(cgrp->kn);
ret = 0; ret = 0;
goto out_unlock; goto out_unlock;
...@@ -5836,12 +5761,11 @@ static int __init cgroup_wq_init(void) ...@@ -5836,12 +5761,11 @@ static int __init cgroup_wq_init(void)
} }
core_initcall(cgroup_wq_init); core_initcall(cgroup_wq_init);
void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
char *buf, size_t buflen)
{ {
struct kernfs_node *kn; struct kernfs_node *kn;
kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn) if (!kn)
return; return;
kernfs_path(kn, buf, buflen); kernfs_path(kn, buf, buflen);
...@@ -6002,62 +5926,38 @@ void cgroup_cancel_fork(struct task_struct *child) ...@@ -6002,62 +5926,38 @@ void cgroup_cancel_fork(struct task_struct *child)
void cgroup_post_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child)
{ {
struct cgroup_subsys *ss; struct cgroup_subsys *ss;
struct css_set *cset;
int i; int i;
spin_lock_irq(&css_set_lock);
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset = task_css_set(current); /* current is @child's parent */
get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
/* /*
* This may race against cgroup_enable_task_cg_lists(). As that * If the cgroup has to be frozen, the new task has too. Let's set
* function sets use_task_css_set_links before grabbing * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
* tasklist_lock and we just went through tasklist_lock to add * frozen state.
* @child, it's guaranteed that either we see the set
* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
* css_set. Grabbing css_set_lock guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
* Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
*/ */
if (use_task_css_set_links) { if (unlikely(cgroup_task_freeze(child))) {
struct css_set *cset; spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
spin_lock_irq(&css_set_lock); child->jobctl |= JOBCTL_TRAP_FREEZE;
cset = task_css_set(current); spin_unlock(&child->sighand->siglock);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
/* /*
* If the cgroup has to be frozen, the new task has too. * Calling cgroup_update_frozen() isn't required here,
* Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get * because it will be called anyway a bit later from
* the task into the frozen state. * do_freezer_trap(). So we avoid cgroup's transient switch
* from the frozen state and back.
*/ */
if (unlikely(cgroup_task_freeze(child))) {
spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
child->jobctl |= JOBCTL_TRAP_FREEZE;
spin_unlock(&child->sighand->siglock);
/*
* Calling cgroup_update_frozen() isn't required here,
* because it will be called anyway a bit later
* from do_freezer_trap(). So we avoid cgroup's
* transient switch from the frozen state and back.
*/
}
spin_unlock_irq(&css_set_lock);
} }
spin_unlock_irq(&css_set_lock);
/* /*
* Call ss->fork(). This must happen after @child is linked on * Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork() * css_set; otherwise, @child might change state between ->fork()
...@@ -6072,20 +5972,8 @@ void cgroup_post_fork(struct task_struct *child) ...@@ -6072,20 +5972,8 @@ void cgroup_post_fork(struct task_struct *child)
* cgroup_exit - detach cgroup from exiting task * cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process * @tsk: pointer to task_struct of exiting process
* *
* Description: Detach cgroup from @tsk and release it. * Description: Detach cgroup from @tsk.
* *
* Note that cgroups marked notify_on_release force every task in
* them to take the global cgroup_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
* call cgroup_exit() while the task is still competent to handle
* notify_on_release(), then leave the task attached to the root cgroup in
* each hierarchy for the remainder of its exit. No need to bother with
* init_css_set refcnting. init_css_set never goes away and we can't race
* with migration path - PF_EXITING is visible to migration path.
*/ */
void cgroup_exit(struct task_struct *tsk) void cgroup_exit(struct task_struct *tsk)
{ {
...@@ -6093,26 +5981,19 @@ void cgroup_exit(struct task_struct *tsk) ...@@ -6093,26 +5981,19 @@ void cgroup_exit(struct task_struct *tsk)
struct css_set *cset; struct css_set *cset;
int i; int i;
/* spin_lock_irq(&css_set_lock);
* Unlink from @tsk from its css_set. As migration path can't race
* with us, we can check css_set and cg_list without synchronization.
*/
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) { WARN_ON_ONCE(list_empty(&tsk->cg_list));
spin_lock_irq(&css_set_lock); cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false); css_set_move_task(tsk, cset, NULL, false);
list_add_tail(&tsk->cg_list, &cset->dying_tasks); list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--; cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk)); WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(cgroup_task_freeze(tsk))) if (unlikely(cgroup_task_freeze(tsk)))
cgroup_update_frozen(task_dfl_cgroup(tsk)); cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock); spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
/* see cgroup_post_fork() for details */ /* see cgroup_post_fork() for details */
do_each_subsys_mask(ss, i, have_exit_callback) { do_each_subsys_mask(ss, i, have_exit_callback) {
...@@ -6129,12 +6010,10 @@ void cgroup_release(struct task_struct *task) ...@@ -6129,12 +6010,10 @@ void cgroup_release(struct task_struct *task)
ss->release(task); ss->release(task);
} while_each_subsys_mask(); } while_each_subsys_mask();
if (use_task_css_set_links) { spin_lock_irq(&css_set_lock);
spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task);
css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list);
list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock);
spin_unlock_irq(&css_set_lock);
}
} }
void cgroup_free(struct task_struct *task) void cgroup_free(struct task_struct *task)
......
...@@ -929,8 +929,6 @@ static void rebuild_root_domains(void) ...@@ -929,8 +929,6 @@ static void rebuild_root_domains(void)
lockdep_assert_cpus_held(); lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex); lockdep_assert_held(&sched_domains_mutex);
cgroup_enable_task_cg_lists();
rcu_read_lock(); rcu_read_lock();
/* /*
......
...@@ -230,6 +230,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, ...@@ -230,6 +230,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
if (task->flags & PF_KTHREAD) if (task->flags & PF_KTHREAD)
return; return;
/*
* It's not necessary to do changes if both of the src and dst cgroups
* are not freezing and task is not frozen.
*/
if (!test_bit(CGRP_FREEZE, &src->flags) &&
!test_bit(CGRP_FREEZE, &dst->flags) &&
!task->frozen)
return;
/* /*
* Adjust counters of freezing and frozen tasks. * Adjust counters of freezing and frozen tasks.
* Note, that if the task is frozen, but the destination cgroup is not * Note, that if the task is frozen, but the destination cgroup is not
......
...@@ -45,7 +45,7 @@ struct pids_cgroup { ...@@ -45,7 +45,7 @@ struct pids_cgroup {
* %PIDS_MAX = (%PID_MAX_LIMIT + 1). * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
*/ */
atomic64_t counter; atomic64_t counter;
int64_t limit; atomic64_t limit;
/* Handle for "pids.events" */ /* Handle for "pids.events" */
struct cgroup_file events_file; struct cgroup_file events_file;
...@@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) ...@@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids) if (!pids)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
pids->limit = PIDS_MAX;
atomic64_set(&pids->counter, 0); atomic64_set(&pids->counter, 0);
atomic64_set(&pids->limit, PIDS_MAX);
atomic64_set(&pids->events_limit, 0); atomic64_set(&pids->events_limit, 0);
return &pids->css; return &pids->css;
} }
...@@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) ...@@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
for (p = pids; parent_pids(p); p = parent_pids(p)) { for (p = pids; parent_pids(p); p = parent_pids(p)) {
int64_t new = atomic64_add_return(num, &p->counter); int64_t new = atomic64_add_return(num, &p->counter);
int64_t limit = atomic64_read(&p->limit);
/* /*
* Since new is capped to the maximum number of pid_t, if * Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never * p->limit is %PIDS_MAX then we know that this test will never
* fail. * fail.
*/ */
if (new > p->limit) if (new > limit)
goto revert; goto revert;
} }
...@@ -277,7 +278,7 @@ static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, ...@@ -277,7 +278,7 @@ static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
* Limit updates don't need to be mutex'd, since it isn't * Limit updates don't need to be mutex'd, since it isn't
* critical that any racing fork()s follow the new limit. * critical that any racing fork()s follow the new limit.
*/ */
pids->limit = limit; atomic64_set(&pids->limit, limit);
return nbytes; return nbytes;
} }
...@@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) ...@@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v)
{ {
struct cgroup_subsys_state *css = seq_css(sf); struct cgroup_subsys_state *css = seq_css(sf);
struct pids_cgroup *pids = css_pids(css); struct pids_cgroup *pids = css_pids(css);
int64_t limit = pids->limit; int64_t limit = atomic64_read(&pids->limit);
if (limit >= PIDS_MAX) if (limit >= PIDS_MAX)
seq_printf(sf, "%s\n", PIDS_MAX_STR); seq_printf(sf, "%s\n", PIDS_MAX_STR);
......
...@@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) ...@@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void)
* Functions for cgroup basic resource statistics implemented on top of * Functions for cgroup basic resource statistics implemented on top of
* rstat. * rstat.
*/ */
static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
struct cgroup_base_stat *src_bstat) struct cgroup_base_stat *src_bstat)
{ {
dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
} }
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
struct cgroup_base_stat *src_bstat)
{
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{ {
struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; struct cgroup_base_stat cur, delta;
struct task_cputime cputime;
struct cgroup_base_stat delta;
unsigned seq; unsigned seq;
/* fetch the current per-cpu values */ /* fetch the current per-cpu values */
do { do {
seq = __u64_stats_fetch_begin(&rstatc->bsync); seq = __u64_stats_fetch_begin(&rstatc->bsync);
cputime = rstatc->bstat.cputime; cur.cputime = rstatc->bstat.cputime;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* calculate the delta to propagate */ /* propagate percpu delta to global */
delta.cputime.utime = cputime.utime - last_cputime->utime; delta = cur;
delta.cputime.stime = cputime.stime - last_cputime->stime; cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - cgroup_base_stat_add(&cgrp->bstat, &delta);
last_cputime->sum_exec_runtime; cgroup_base_stat_add(&rstatc->last_bstat, &delta);
*last_cputime = cputime;
/* propagate global delta to parent */
/* transfer the pending stat into delta */ if (parent) {
cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); delta = cgrp->bstat;
memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
/* propagate delta into the global stat and the parent's pending */ cgroup_base_stat_add(&cgrp->last_bstat, &delta);
cgroup_base_stat_accumulate(&cgrp->bstat, &delta); }
if (parent)
cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
} }
static struct cgroup_rstat_cpu * static struct cgroup_rstat_cpu *
......
...@@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); ...@@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void);
* Send out a notify message. * Send out a notify message.
*/ */
static void trace_note(struct blk_trace *bt, pid_t pid, int action, static void trace_note(struct blk_trace *bt, pid_t pid, int action,
const void *data, size_t len, const void *data, size_t len, u64 cgid)
union kernfs_node_id *cgid)
{ {
struct blk_io_trace *t; struct blk_io_trace *t;
struct ring_buffer_event *event = NULL; struct ring_buffer_event *event = NULL;
...@@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, ...@@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0; int pc = 0;
int cpu = smp_processor_id(); int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled; bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) { if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer; buffer = blk_tr->trace_buffer.buffer;
...@@ -100,8 +99,8 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, ...@@ -100,8 +99,8 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
t->pid = pid; t->pid = pid;
t->cpu = cpu; t->cpu = cpu;
t->pdu_len = len + cgid_len; t->pdu_len = len + cgid_len;
if (cgid) if (cgid_len)
memcpy((void *)t + sizeof(*t), cgid, cgid_len); memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
memcpy((void *) t + sizeof(*t) + cgid_len, data, len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer) if (blk_tracer)
...@@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) ...@@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk)
spin_lock_irqsave(&running_trace_lock, flags); spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) { list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
sizeof(tsk->comm), NULL); sizeof(tsk->comm), 0);
} }
spin_unlock_irqrestore(&running_trace_lock, flags); spin_unlock_irqrestore(&running_trace_lock, flags);
} }
...@@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) ...@@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt)
words[1] = now.tv_nsec; words[1] = now.tv_nsec;
local_irq_save(flags); local_irq_save(flags);
trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
local_irq_restore(flags); local_irq_restore(flags);
} }
...@@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, ...@@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
blkcg = NULL; blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP #ifdef CONFIG_BLK_CGROUP
trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else #else
trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0);
#endif #endif
local_irq_restore(flags); local_irq_restore(flags);
} }
...@@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), ...@@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
*/ */
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len, int op, int op_flags, u32 what, int error, int pdu_len,
void *pdu_data, union kernfs_node_id *cgid) void *pdu_data, u64 cgid)
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL; struct ring_buffer_event *event = NULL;
...@@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, ...@@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
pid_t pid; pid_t pid;
int cpu, pc = 0; int cpu, pc = 0;
bool blk_tracer = blk_tracer_enabled; bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return; return;
...@@ -294,7 +293,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, ...@@ -294,7 +293,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
t->pdu_len = pdu_len + cgid_len; t->pdu_len = pdu_len + cgid_len;
if (cgid_len) if (cgid_len)
memcpy((void *)t + sizeof(*t), cgid, cgid_len); memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
if (pdu_len) if (pdu_len)
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
...@@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) ...@@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q)
} }
#ifdef CONFIG_BLK_CGROUP #ifdef CONFIG_BLK_CGROUP
static union kernfs_node_id * static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{ {
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt = q->blk_trace;
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return NULL; return 0;
if (!bio->bi_blkg) if (!bio->bi_blkg)
return NULL; return 0;
return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); return cgroup_id(bio_blkcg(bio)->css.cgroup);
} }
#else #else
static union kernfs_node_id * u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{ {
return NULL; return 0;
} }
#endif #endif
static union kernfs_node_id * static u64
blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
{ {
if (!rq->bio) if (!rq->bio)
return NULL; return 0;
/* Use the first bio */ /* Use the first bio */
return blk_trace_bio_get_cgid(q, rq->bio); return blk_trace_bio_get_cgid(q, rq->bio);
} }
...@@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) ...@@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
* *
**/ **/
static void blk_add_trace_rq(struct request *rq, int error, static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what, unsigned int nr_bytes, u32 what, u64 cgid)
union kernfs_node_id *cgid)
{ {
struct blk_trace *bt = rq->q->blk_trace; struct blk_trace *bt = rq->q->blk_trace;
...@@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, ...@@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore,
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
NULL, NULL); NULL, 0);
} }
} }
...@@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, ...@@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL, NULL); 0, 0, NULL, 0);
} }
} }
...@@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) ...@@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace; struct blk_trace *bt = q->blk_trace;
if (bt) if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
} }
static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
...@@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, ...@@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else else
what = BLK_TA_UNPLUG_TIMER; what = BLK_TA_UNPLUG_TIMER;
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
} }
} }
...@@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) ...@@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
{ {
return (void *)(te_blk_io_trace(ent) + 1) + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0);
(has_cg ? sizeof(union kernfs_node_id) : 0);
} }
static inline const void *cgid_start(const struct trace_entry *ent) static inline u64 t_cgid(const struct trace_entry *ent)
{ {
return (void *)(te_blk_io_trace(ent) + 1); return *(u64 *)(te_blk_io_trace(ent) + 1);
} }
static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{ {
return te_blk_io_trace(ent)->pdu_len - return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0);
(has_cg ? sizeof(union kernfs_node_id) : 0);
} }
static inline u32 t_action(const struct trace_entry *ent) static inline u32 t_action(const struct trace_entry *ent)
...@@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, ...@@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
fill_rwbs(rwbs, t); fill_rwbs(rwbs, t);
if (has_cg) { if (has_cg) {
const union kernfs_node_id *id = cgid_start(iter->ent); u64 id = t_cgid(iter->ent);
if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
char blkcg_name_buf[NAME_MAX + 1] = "<...>"; char blkcg_name_buf[NAME_MAX + 1] = "<...>";
...@@ -1267,11 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, ...@@ -1267,11 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
MAJOR(t->device), MINOR(t->device), MAJOR(t->device), MINOR(t->device),
blkcg_name_buf, act, rwbs); blkcg_name_buf, act, rwbs);
} else } else {
/*
* The cgid portion used to be "INO,GEN". Userland
* builds a FILEID_INO32_GEN fid out of them and
* opens the cgroup using open_by_handle_at(2).
* While 32bit ino setups are still the same, 64bit
* ones now use the 64bit ino as the whole ID and
* no longer use generation.
*
* Regardless of the content, always output
* "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
* be mapped back to @id on both 64 and 32bit ino
* setups. See __kernfs_fh_to_dentry().
*/
trace_seq_printf(&iter->seq, trace_seq_printf(&iter->seq,
"%3d,%-3d %x,%-x %2s %3s ", "%3d,%-3d %llx,%-llx %2s %3s ",
MAJOR(t->device), MINOR(t->device), MAJOR(t->device), MINOR(t->device),
id->ino, id->generation, act, rwbs); id & U32_MAX, id >> 32, act, rwbs);
}
} else } else
trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
MAJOR(t->device), MINOR(t->device), act, rwbs); MAJOR(t->device), MINOR(t->device), act, rwbs);
......
...@@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) ...@@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
return 0; return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return cgrp->kn->id.id; return cgroup_id(cgrp);
} }
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
...@@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ...@@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
if (!ancestor) if (!ancestor)
return 0; return 0;
return ancestor->kn->id.id; return cgroup_id(ancestor);
} }
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
......
...@@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) ...@@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx)
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
{ {
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
int id = css->cgroup->id; int id = css->id;
if (map && id < map->priomap_len) if (map && id < map->priomap_len)
return map->priomap[id]; return map->priomap[id];
...@@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, ...@@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css,
struct net_device *dev, u32 prio) struct net_device *dev, u32 prio)
{ {
struct netprio_map *map; struct netprio_map *map;
int id = css->cgroup->id; int id = css->id;
int ret; int ret;
/* avoid extending priomap for zero writes */ /* avoid extending priomap for zero writes */
...@@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) ...@@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
{ {
return css->cgroup->id; return css->id;
} }
static int read_priomap(struct seq_file *sf, void *v) static int read_priomap(struct seq_file *sf, void *v)
...@@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset) ...@@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css; struct cgroup_subsys_state *css;
cgroup_taskset_for_each(p, css, tset) { cgroup_taskset_for_each(p, css, tset) {
void *v = (void *)(unsigned long)css->cgroup->id; void *v = (void *)(unsigned long)css->id;
task_lock(p); task_lock(p);
iterate_fd(p->files, 0, update_netprio, v); iterate_fd(p->files, 0, update_netprio, v);
......
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
CFLAGS += -Wall CFLAGS += -Wall -pthread
all: all:
TEST_FILES := with_stress.sh
TEST_PROGS := test_stress.sh
TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS = test_memcontrol
TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_core
TEST_GEN_PROGS += test_freezer TEST_GEN_PROGS += test_freezer
......
...@@ -158,6 +158,22 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) ...@@ -158,6 +158,22 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key)
return atol(ptr + strlen(key)); return atol(ptr + strlen(key));
} }
/*
 * Count the lines of a cgroup control file.
 *
 * Reads @control of @cgroup through cg_read() into a PAGE_SIZE stack buffer
 * and counts the newline-separated tokens found in it.  Because strtok()
 * collapses consecutive delimiters, empty lines are not counted.
 *
 * Returns the number of lines, or -1 if cg_read() reports a failure.
 */
long cg_read_lc(const char *cgroup, const char *control)
{
	char buf[PAGE_SIZE];
	long nr_lines = 0;
	char *tok;

	if (cg_read(cgroup, control, buf, sizeof(buf)) != 0)
		return -1;

	tok = strtok(buf, "\n");
	while (tok != NULL) {
		nr_lines++;
		tok = strtok(NULL, "\n");
	}

	return nr_lines;
}
int cg_write(const char *cgroup, const char *control, char *buf) int cg_write(const char *cgroup, const char *control, char *buf)
{ {
char path[PATH_MAX]; char path[PATH_MAX];
...@@ -282,10 +298,12 @@ int cg_enter(const char *cgroup, int pid) ...@@ -282,10 +298,12 @@ int cg_enter(const char *cgroup, int pid)
int cg_enter_current(const char *cgroup) int cg_enter_current(const char *cgroup)
{ {
char pidbuf[64]; return cg_write(cgroup, "cgroup.procs", "0");
}
snprintf(pidbuf, sizeof(pidbuf), "%d", getpid()); int cg_enter_current_thread(const char *cgroup)
return cg_write(cgroup, "cgroup.procs", pidbuf); {
return cg_write(cgroup, "cgroup.threads", "0");
} }
int cg_run(const char *cgroup, int cg_run(const char *cgroup,
...@@ -410,11 +428,25 @@ int set_oom_adj_score(int pid, int score) ...@@ -410,11 +428,25 @@ int set_oom_adj_score(int pid, int score)
return 0; return 0;
} }
char proc_read_text(int pid, const char *item, char *buf, size_t size) ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{ {
char path[PATH_MAX]; char path[PATH_MAX];
snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); if (!pid)
snprintf(path, sizeof(path), "/proc/%s/%s",
thread ? "thread-self" : "self", item);
else
snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);
return read_text(path, buf, size); return read_text(path, buf, size);
} }
/*
 * Check whether a /proc file contains a given substring.
 *
 * The file is read with proc_read_text(@pid, @thread, @item, ...) into a
 * PAGE_SIZE stack buffer and searched for @needle.
 *
 * Returns 0 if @needle is present, -1 if the read fails or @needle is
 * not found.
 */
int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
{
	char text[PAGE_SIZE];

	if (proc_read_text(pid, thread, item, text, sizeof(text)) < 0)
		return -1;

	if (strstr(text, needle) == NULL)
		return -1;

	return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#include <stdbool.h>
#include <stdlib.h> #include <stdlib.h>
#define PAGE_SIZE 4096 #define PAGE_SIZE 4096
...@@ -29,12 +30,14 @@ extern int cg_read_strstr(const char *cgroup, const char *control, ...@@ -29,12 +30,14 @@ extern int cg_read_strstr(const char *cgroup, const char *control,
const char *needle); const char *needle);
extern long cg_read_long(const char *cgroup, const char *control); extern long cg_read_long(const char *cgroup, const char *control);
long cg_read_key_long(const char *cgroup, const char *control, const char *key); long cg_read_key_long(const char *cgroup, const char *control, const char *key);
extern long cg_read_lc(const char *cgroup, const char *control);
extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_write(const char *cgroup, const char *control, char *buf);
extern int cg_run(const char *cgroup, extern int cg_run(const char *cgroup,
int (*fn)(const char *cgroup, void *arg), int (*fn)(const char *cgroup, void *arg),
void *arg); void *arg);
extern int cg_enter(const char *cgroup, int pid); extern int cg_enter(const char *cgroup, int pid);
extern int cg_enter_current(const char *cgroup); extern int cg_enter_current(const char *cgroup);
extern int cg_enter_current_thread(const char *cgroup);
extern int cg_run_nowait(const char *cgroup, extern int cg_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg), int (*fn)(const char *cgroup, void *arg),
void *arg); void *arg);
...@@ -45,4 +48,5 @@ extern int is_swap_enabled(void); ...@@ -45,4 +48,5 @@ extern int is_swap_enabled(void);
extern int set_oom_adj_score(int pid, int score); extern int set_oom_adj_score(int pid, int score);
extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup); extern int cg_killall(const char *cgroup);
extern char proc_read_text(int pid, const char *item, char *buf, size_t size); extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
...@@ -5,6 +5,9 @@ ...@@ -5,6 +5,9 @@
#include <unistd.h> #include <unistd.h>
#include <stdio.h> #include <stdio.h>
#include <errno.h> #include <errno.h>
#include <signal.h>
#include <string.h>
#include <pthread.h>
#include "../kselftest.h" #include "../kselftest.h"
#include "cgroup_util.h" #include "cgroup_util.h"
...@@ -354,6 +357,147 @@ static int test_cgcore_internal_process_constraint(const char *root) ...@@ -354,6 +357,147 @@ static int test_cgcore_internal_process_constraint(const char *root)
return ret; return ret;
} }
/*
 * Thread body that only blocks in pause().  Whatever pause() returns is
 * handed back as the thread's exit value; @arg is unused.
 */
static void *dummy_thread_fn(void *arg)
{
	int rc = pause();

	return (void *)(size_t)rc;
}
/*
 * Test threadgroup migration.
 * All threads of a process are migrated together.
 *
 * The calling process enters cg_src, spawns n_threads idle threads, then
 * moves itself to cg_dst via cgroup.procs.  The test passes when
 * cg_dst/cgroup.threads lists n_threads + 1 entries (the spawned threads
 * plus the calling thread).
 *
 * Returns KSFT_PASS on success, KSFT_FAIL otherwise.
 */
static int test_cgcore_proc_migration(const char *root)
{
	int ret = KSFT_FAIL;
	/*
	 * c_threads counts the threads actually created.  It must start at 0:
	 * the cleanup path reads it even when we bail out before the creation
	 * loop has run.
	 */
	int t, c_threads = 0, n_threads = 13;
	char *src = NULL, *dst = NULL;
	pthread_t threads[n_threads];

	src = cg_name(root, "cg_src");
	dst = cg_name(root, "cg_dst");
	if (!src || !dst)
		goto cleanup;

	if (cg_create(src))
		goto cleanup;
	if (cg_create(dst))
		goto cleanup;

	if (cg_enter_current(src))
		goto cleanup;

	for (c_threads = 0; c_threads < n_threads; ++c_threads) {
		if (pthread_create(&threads[c_threads], NULL, dummy_thread_fn, NULL))
			goto cleanup;
	}

	/* A failed migration is a test failure in its own right. */
	if (cg_enter_current(dst))
		goto cleanup;
	if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* Only the first c_threads slots of threads[] hold valid handles. */
	for (t = 0; t < c_threads; ++t) {
		pthread_cancel(threads[t]);
	}

	for (t = 0; t < c_threads; ++t) {
		pthread_join(threads[t], NULL);
	}

	/* Leave the test cgroups before trying to remove them. */
	cg_enter_current(root);

	if (dst)
		cg_destroy(dst);
	if (src)
		cg_destroy(src);
	free(dst);
	free(src);
	return ret;
}
/*
 * Thread body for the single-thread migration test.
 *
 * @arg is a char *[3]: entry 0 is the cgroup root path, entries 1 and 2 are
 * the two cgroups to bounce between.  On every iteration the thread writes
 * itself into one of the two cgroups (alternating) and then checks that its
 * own "cgroup" proc file contains "0::<path relative to the root>".
 *
 * Returns NULL when all iterations matched, (void *)-1 on the first mismatch
 * or read failure.
 */
static void *migrating_thread_fn(void *arg)
{
	char **grps = arg;
	char expected[3][PATH_MAX];
	size_t root_len = strlen(grps[0]);
	int n_iterations = 1000;
	int g, i;

	/* Slot 0 is left unused so the indices line up with grps[]. */
	for (g = 1; g < 3; ++g)
		snprintf(expected[g], sizeof(expected[g]), "0::%s", grps[g] + root_len);

	for (i = 0; i < n_iterations; ++i) {
		int target = (i % 2) + 1;

		cg_enter_current_thread(grps[target]);
		if (proc_read_strstr(0, 1, "cgroup", expected[target]))
			return (void *)-1;
	}

	return NULL;
}
/*
 * Test single thread migration.
 * Threaded cgroups allow successful migration of a thread.
 *
 * Creates cg_dom with two children, cg_src and cg_dst, switches both children
 * to the "threaded" type and enters cg_src.  A worker thread then migrates
 * itself back and forth between the two children.  Once it has finished, the
 * calling thread's own "cgroup" proc file must still name cg_src.
 *
 * Returns KSFT_PASS on success, KSFT_FAIL otherwise.
 */
static int test_cgcore_thread_migration(const char *root)
{
	char *grps[3] = { (char *)root, NULL, NULL };
	char expected[PATH_MAX];
	char *dom = NULL;
	void *worker_ret;
	pthread_t worker;
	int ret = KSFT_FAIL;

	dom = cg_name(root, "cg_dom");
	grps[1] = cg_name(root, "cg_dom/cg_src");
	grps[2] = cg_name(root, "cg_dom/cg_dst");
	if (!(grps[1] && grps[2] && dom))
		goto cleanup;

	if (cg_create(dom) || cg_create(grps[1]) || cg_create(grps[2]))
		goto cleanup;

	if (cg_write(grps[1], "cgroup.type", "threaded") ||
	    cg_write(grps[2], "cgroup.type", "threaded"))
		goto cleanup;

	if (cg_enter_current(grps[1]))
		goto cleanup;

	if (pthread_create(&worker, NULL, migrating_thread_fn, grps))
		goto cleanup;

	/* A non-NULL exit value means the worker saw an unexpected cgroup. */
	if (pthread_join(worker, &worker_ret) || worker_ret)
		goto cleanup;

	/* The worker's migrations must not have moved the calling thread. */
	snprintf(expected, sizeof(expected), "0::%s", grps[1] + strlen(grps[0]));
	if (proc_read_strstr(0, 1, "cgroup", expected))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* Leave the test cgroups before removing them, children first. */
	cg_enter_current(root);
	if (grps[2])
		cg_destroy(grps[2]);
	if (grps[1])
		cg_destroy(grps[1]);
	if (dom)
		cg_destroy(dom);
	free(grps[2]);
	free(grps[1]);
	free(dom);
	return ret;
}
#define T(x) { x, #x } #define T(x) { x, #x }
struct corecg_test { struct corecg_test {
int (*fn)(const char *root); int (*fn)(const char *root);
...@@ -366,6 +510,8 @@ struct corecg_test { ...@@ -366,6 +510,8 @@ struct corecg_test {
T(test_cgcore_parent_becomes_threaded), T(test_cgcore_parent_becomes_threaded),
T(test_cgcore_invalid_domain), T(test_cgcore_invalid_domain),
T(test_cgcore_populated), T(test_cgcore_populated),
T(test_cgcore_proc_migration),
T(test_cgcore_thread_migration),
}; };
#undef T #undef T
......
...@@ -72,6 +72,7 @@ static int cg_prepare_for_wait(const char *cgroup) ...@@ -72,6 +72,7 @@ static int cg_prepare_for_wait(const char *cgroup)
if (ret == -1) { if (ret == -1) {
debug("Error: inotify_add_watch() failed\n"); debug("Error: inotify_add_watch() failed\n");
close(fd); close(fd);
fd = -1;
} }
return fd; return fd;
...@@ -701,7 +702,7 @@ static int proc_check_stopped(int pid) ...@@ -701,7 +702,7 @@ static int proc_check_stopped(int pid)
char buf[PAGE_SIZE]; char buf[PAGE_SIZE];
int len; int len;
len = proc_read_text(pid, "stat", buf, sizeof(buf)); len = proc_read_text(pid, 0, "stat", buf, sizeof(buf));
if (len == -1) { if (len == -1) {
debug("Can't get %d stat\n", pid); debug("Can't get %d stat\n", pid);
return -1; return -1;
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Run ./test_core through with_stress.sh with the "subsys" and "fork"
# stressors (-s) enabled.
./with_stress.sh -s subsys -s fork ./test_core
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
# Fork stressor: loops forever, running /usr/bin/true and then sleeping
# 0.01s on every iteration.
# NOTE(review): the absolute path presumably forces an external process to be
# spawned instead of using the shell builtin `true` -- confirm before changing.
stress_fork()
{
while true ; do
/usr/bin/true
sleep 0.01
done
}
# Subsystem stressor: loops forever, alternately writing "+<controller>" and
# "-<controller>" to $sysfs/cgroup.subtree_control, starting with "+".
# Uses the globals $subsys_ctrl and $sysfs.
stress_subsys()
{
	local verb=+

	while true ; do
		echo $verb$subsys_ctrl >$sysfs/cgroup.subtree_control

		# flip between enabling and disabling for the next round
		if [ $verb = "+" ] ; then
			verb=-
		else
			verb=+
		fi

		# incommensurable period with other stresses
		sleep 0.011
	done
}
# Locate the cgroup2 mount point and verify the controller can be toggled.
#
# Sets the global $sysfs to the third field of the first line printed by
# `mount -t cgroup2`.  Exits with $ksft_skip (after a message on stderr) when
# that is not a directory, or when $subsys_ctrl cannot be enabled and then
# disabled again in $sysfs/cgroup.subtree_control.
init_and_check()
{
	sysfs=$(mount -t cgroup2 | head -1 | awk '{ print $3 }')

	[ -d "$sysfs" ] || {
		echo "Skipping: cgroup2 is not mounted" >&2
		exit $ksft_skip
	}

	echo +$subsys_ctrl >$sysfs/cgroup.subtree_control || {
		echo "Skipping: cannot enable $subsys_ctrl in $sysfs" >&2
		exit $ksft_skip
	}

	echo -$subsys_ctrl >$sysfs/cgroup.subtree_control || {
		echo "Skipping: cannot disable $subsys_ctrl in $sysfs" >&2
		exit $ksft_skip
	}
}
# Main driver: parse options, start the requested stressors in the background,
# run the given command repeatedly until it fails or $duration seconds have
# elapsed, then stop the stressors and exit with the command's last status.
declare -a stresses
declare -a stress_pids
duration=5
rc=0
subsys_ctrl=cpuset
sysfs=

while getopts c:d:hs: opt; do
	case $opt in
	c)
		subsys_ctrl=$OPTARG
		;;
	d)
		duration=$OPTARG
		;;
	h)
		echo "Usage $0 [ -s stress ] ... [ -d duration ] [-c controller] cmd args .."
		echo -e "\t default duration $duration seconds"
		echo -e "\t default controller $subsys_ctrl"
		exit
		;;
	s)
		# each -s NAME selects the shell function stress_NAME
		func=stress_$OPTARG
		if [ "x$(type -t "$func")" != "xfunction" ] ; then
			echo "Unknown stress $OPTARG"
			exit 1
		fi
		stresses+=("$func")
		;;
	esac
done
shift $((OPTIND - 1))

init_and_check

for s in "${stresses[@]}" ; do
	"$s" &
	stress_pids+=("$!")
done

time=0
start=$(date +%s)

while [ "$time" -lt "$duration" ] ; do
	# "$@" keeps every argument of the command intact; an unquoted $*
	# would re-split arguments containing whitespace and expand globs.
	"$@"
	rc=$?
	[ $rc -eq 0 ] || break
	time=$(($(date +%s) - start))
done

for pid in "${stress_pids[@]}" ; do
	kill -SIGTERM "$pid"
	wait "$pid"
done

exit $rc
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment