[PATCH] thread-exec-2.5.34-B1, BK-curr

This implements one of the last missing POSIX threading details - exec() semantics. Previous kernels had code that tried to handle it, but that code had a number of disadvantages:

- it only worked if the exec()-ing thread was the thread group leader, creating an asymmetry. This does not work if the thread group leader has exited already.
- it was racy: it sent a SIGKILL to every thread in the group but did not wait for them to actually process the SIGKILL. It did a yield() but that is not enough. All 'other' threads have to finish processing before we can continue with the exec().

This adds the same logic, but extended with the following enhancements:

- works from non-leader threads just as much as the thread group leader.
- waits for all other threads to exit before continuing with the exec().
- reuses the PID of the group.

It would perhaps be a more generic approach to add a new syscall, sys_ungroup() - which would do largely what de_thread() does in this patch. But it's not really needed now - posix_spawn() is currently implemented via starting a non-CLONE_THREAD helper thread that does a sys_exec(). There's no API currently that needs a direct exec() from a thread - but it could be created (such as pthread_exec_np()). It would have the advantage of not having to go through a helper thread, but the difference is minimal.

[PATCH] thread-exec-2.5.34-B1, BK-curr
This implements one of the last missing POSIX threading details - exec() semantics. Previous kernels had code that tried to handle it, but that code had a number of disadvantages:

- it only worked if the exec()-ing thread was the thread group leader, creating an asymmetry. This does not work if the thread group leader has exited already.
- it was racy: it sent a SIGKILL to every thread in the group but did not wait for them to actually process the SIGKILL. It did a yield() but that is not enough. All 'other' threads have to finish processing before we can continue with the exec().

This adds the same logic, but extended with the following enhancements:

- works from non-leader threads just as much as the thread group leader.
- waits for all other threads to exit before continuing with the exec().
- reuses the PID of the group.

It would perhaps be a more generic approach to add a new syscall, sys_ungroup() - which would do largely what de_thread() does in this patch. But it's not really needed now - posix_spawn() is currently implemented via starting a non-CLONE_THREAD helper thread that does a sys_exec(). There's no API currently that needs a direct exec() from a thread - but it could be created (such as pthread_exec_np()). It would have the advantage of not having to go through a helper thread, but the difference is minimal.
63540cea · Ingo Molnar · Linus Torvalds · 7cd0a691 · 63540cea · 63540cea
Commit 63540cea authored Sep 14, 2002 by Ingo Molnar Committed by Linus Torvalds Sep 14, 2002
Showing with 145 additions and 48 deletions

fs/exec.c fs/exec.c +130 -47

include/linux/sched.h include/linux/sched.h +3 -0

kernel/exit.c kernel/exit.c +1 -1

kernel/fork.c kernel/fork.c +1 -0

kernel/signal.c kernel/signal.c +10 -0

No files found.
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -40,6 +40,7 @@
 #define __NO_VERSION__
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/proc_fs.h>

 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
@@ -493,52 +494,151 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }

+static struct dentry *clean_proc_dentry(struct task_struct *p)
+{
+	struct dentry *proc_dentry = p->proc_dentry;
+
+	if (proc_dentry) {
+		spin_lock(&dcache_lock);
+		if (!list_empty(&proc_dentry->d_hash)) {
+			dget_locked(proc_dentry);
+			list_del_init(&proc_dentry->d_hash);
+		} else
+			proc_dentry = NULL;
+		spin_unlock(&dcache_lock);
+	}
+	return proc_dentry;
+}
+
+static inline void put_proc_dentry(struct dentry *dentry)
+{
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		dput(dentry);
+	}
+}
+
 /*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGNAL option to clone().)
 */
- 
-static inline int make_private_signals(void)
+static inline int de_thread(struct signal_struct *oldsig)
 {
-	struct signal_struct * newsig;
-
-	remove_thread_group(current, current->sig);
+	struct signal_struct *newsig;
+	int count;

 	if (atomic_read(&current->sig->count) <= 1)
 		return 0;
+
 	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
-	if (newsig == NULL)
+	if (!newsig)
 		return -ENOMEM;
+
+	if (list_empty(&current->thread_group))
+		goto out;
+	/*
+	 * Kill all other threads in the thread group:
+	 */
+	spin_lock_irq(&oldsig->siglock);
+	if (oldsig->group_exit) {
+		/*
+		 * Another group action in progress, just
+		 * return so that the signal is processed.
+		 */
+		spin_unlock_irq(&oldsig->siglock);
+		kmem_cache_free(sigact_cachep, newsig);
+		return -EAGAIN;
+	}
+	oldsig->group_exit = 1;
+	__broadcast_thread_group(current, SIGKILL);
+
+	/*
+	 * Account for the thread group leader hanging around:
+	 */
+	count = 2;
+	if (current->pid == current->tgid)
+		count = 1;
+	while (atomic_read(&oldsig->count) > count) {
+		oldsig->group_exit_task = current;
+		current->state = TASK_UNINTERRUPTIBLE;
+		spin_unlock_irq(&oldsig->siglock);
+		schedule();
+		spin_lock_irq(&oldsig->siglock);
+		if (oldsig->group_exit_task)
+			BUG();
+	}
+	spin_unlock_irq(&oldsig->siglock);
+
+	/*
+	 * At this point all other threads have exited, all we have to
+	 * do is to wait for the thread group leader to become inactive,
+	 * and to assume its PID:
+	 */
+	if (current->pid != current->tgid) {
+		struct task_struct *leader = current->group_leader;
+		struct dentry *proc_dentry1, *proc_dentry2;
+		unsigned long state;
+
+		wait_task_inactive(leader);
+
+		write_lock_irq(&tasklist_lock);
+		proc_dentry1 = clean_proc_dentry(current);
+		proc_dentry2 = clean_proc_dentry(leader);
+
+		if (leader->tgid != current->tgid)
+			BUG();
+		if (current->pid == current->tgid)
+			BUG();
+		/*
+		 * An exec() starts a new thread group with the
+		 * TGID of the previous thread group. Rehash the
+		 * two threads with a switched PID, and release
+		 * the former thread group leader:
+		 */
+		unhash_pid(current);
+		unhash_pid(leader);
+		leader->pid = leader->tgid = current->pid;
+		current->pid = current->tgid;
+		hash_pid(current);
+		hash_pid(leader);
+		
+		list_add_tail(&current->tasks, &init_task.tasks);
+		state = leader->state;
+		write_unlock_irq(&tasklist_lock);
+
+		if (state == TASK_ZOMBIE)
+			release_task(leader);
+
+		put_proc_dentry(proc_dentry1);
+		put_proc_dentry(proc_dentry2);
+        }
+
+out:
 	spin_lock_init(&newsig->siglock);
 	atomic_set(&newsig->count, 1);
 	newsig->group_exit = 0;
 	newsig->group_exit_code = 0;
+	newsig->group_exit_task = NULL;
 	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
 	init_sigpending(&newsig->shared_pending);

+	remove_thread_group(current, current->sig);
 	spin_lock_irq(&current->sigmask_lock);
 	current->sig = newsig;
 	spin_unlock_irq(&current->sigmask_lock);
-	return 0;
-}
-	
-/*
- * If make_private_signals() made a copy of the signal table, decrement the
- * refcount of the original table, and free it if necessary.
- * We don't do that in make_private_signals() so that we can back off
- * in flush_old_exec() if an error occurs after calling make_private_signals().
- */

-static inline void release_old_signals(struct signal_struct * oldsig)
-{
-	if (current->sig == oldsig)
-		return;
 	if (atomic_dec_and_test(&oldsig->count))
 		kmem_cache_free(sigact_cachep, oldsig);
-}

+	if (!list_empty(&current->thread_group))
+		BUG();
+	if (current->tgid != current->pid)
+		BUG();
+	return 0;
+}
+	
 /*
 * These functions flushes out all traces of the currently running executable
 * so that a new one can be started
@@ -572,44 +672,27 @@ static inline void flush_old_files(struct files_struct * files)
 	write_unlock(&files->file_lock);
 }

-/*
- * An execve() will automatically "de-thread" the process.
- * - if a master thread (PID==TGID) is doing this, then all subsidiary threads
- *   will be killed (otherwise there will end up being two independent thread
- *   groups with the same TGID).
- * - if a subsidary thread is doing this, then it just leaves the thread group
- */
-static void de_thread(struct task_struct *tsk)
-{
-	if (!list_empty(&tsk->thread_group))
-		BUG();
-	/* An exec() starts a new thread group: */
-	tsk->tgid = tsk->pid;
-}
-
 int flush_old_exec(struct linux_binprm * bprm)
 {
 	char * name;
 	int i, ch, retval;
-	struct signal_struct * oldsig;
-
-	/*
-	 * Make sure we have a private signal table
-	 */
-	oldsig = current->sig;
-	retval = make_private_signals();
-	if (retval) goto flush_failed;
+	struct signal_struct * oldsig = current->sig;

 	/* 
 	 * Release all of the old mmap stuff
 	 */
 	retval = exec_mmap(bprm->mm);
-	if (retval) goto mmap_failed;
+	if (retval)
+		goto mmap_failed;
+	/*
+	 * Make sure we have a private signal table and that
+	 * we are unassociated from the previous thread group.
+	 */
+	retval = de_thread(oldsig);
+	if (retval)
+		goto flush_failed;

 	/* This is the point of no return */
-	de_thread(current);
-
-	release_old_signals(oldsig);

 	current->sas_ss_sp = current->sas_ss_size = 0;


--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -222,6 +222,8 @@ struct signal_struct {
 	/* thread group exit support */
 	int			group_exit;
 	int			group_exit_code;
+
+	struct task_struct	*group_exit_task;
 };

 /*
@@ -552,6 +554,7 @@ extern int dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t
 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
 			      sigset_t *mask);
 extern void unblock_all_signals(void);
+extern void release_task(struct task_struct * p);
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,7 @@ static struct dentry * __unhash_process(struct task_struct *p)
 	return proc_dentry;
 }

-static void release_task(struct task_struct * p)
+void release_task(struct task_struct * p)
 {
 	struct dentry *proc_dentry;


--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -633,6 +633,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	atomic_set(&sig->count, 1);
 	sig->group_exit = 0;
 	sig->group_exit_code = 0;
+	sig->group_exit_task = NULL;
 	memcpy(sig->action, current->sig->action, sizeof(sig->action));
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);

--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -273,6 +273,15 @@ void __exit_sighand(struct task_struct *tsk)
 		kmem_cache_free(sigact_cachep, sig);
 	} else {
 		struct task_struct *leader = tsk->group_leader;
+
+		/*
+		 * If there is any task waiting for the group exit
+		 * then notify it:
+		 */
+		if (sig->group_exit_task && atomic_read(&sig->count) <= 2) {
+			wake_up_process(sig->group_exit_task);
+			sig->group_exit_task = NULL;
+		}
 		/*
 		 * If we are the last non-leader member of the thread
 		 * group, and the leader is zombie, then notify the
@@ -283,6 +292,7 @@ void __exit_sighand(struct task_struct *tsk)
 		 */
 		if (atomic_read(&sig->count) == 1 &&
 					leader->state == TASK_ZOMBIE) {
+
 			__remove_thread_group(tsk, sig);
 			spin_unlock(&sig->siglock);
 			do_notify_parent(leader, leader->exit_signal);