Commit 1a82e9e1 authored by Manfred Spraul, committed by Linus Torvalds

ipc/sem: separate wait-for-zero and alter tasks into separate queues

Introduce separate queues for operations that do not modify the
semaphore values.  Advantages:

 - Simpler logic in check_restart().
 - Faster update_queue(): Right now, all wait-for-zero operations are
   always tested, even if the semaphore value is not 0.
 - wait-for-zero operations get priority again, as in Linux <= 3.0.9
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f5c936c0
...@@ -15,7 +15,10 @@ struct sem_array { ...@@ -15,7 +15,10 @@ struct sem_array {
time_t sem_otime; /* last semop time */ time_t sem_otime; /* last semop time */
time_t sem_ctime; /* last change time */ time_t sem_ctime; /* last change time */
struct sem *sem_base; /* ptr to first semaphore in array */ struct sem *sem_base; /* ptr to first semaphore in array */
struct list_head sem_pending; /* pending operations to be processed */ struct list_head pending_alter; /* pending operations */
/* that alter the array */
struct list_head pending_const; /* pending complex operations */
/* that do not alter semvals */
struct list_head list_id; /* undo requests on this array */ struct list_head list_id; /* undo requests on this array */
int sem_nsems; /* no. of semaphores in array */ int sem_nsems; /* no. of semaphores in array */
int complex_count; /* pending complex operations */ int complex_count; /* pending complex operations */
......
...@@ -95,7 +95,10 @@ struct sem { ...@@ -95,7 +95,10 @@ struct sem {
int semval; /* current value */ int semval; /* current value */
int sempid; /* pid of last operation */ int sempid; /* pid of last operation */
spinlock_t lock; /* spinlock for fine-grained semtimedop */ spinlock_t lock; /* spinlock for fine-grained semtimedop */
struct list_head sem_pending; /* pending single-sop operations */ struct list_head pending_alter; /* pending single-sop operations */
/* that alter the semaphore */
struct list_head pending_const; /* pending single-sop operations */
/* that do not alter the semaphore*/
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* One queue for each sleeping process in the system. */ /* One queue for each sleeping process in the system. */
...@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); ...@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
/* /*
* linked list protection: * linked list protection:
* sem_undo.id_next, * sem_undo.id_next,
* sem_array.sem_pending{,last}, * sem_array.pending{_alter,_const},
* sem_array.sem_undo: sem_lock() for read/write * sem_array.sem_undo: sem_lock() for read/write
* sem_undo.proc_next: only "current" is allowed to read/write that field. * sem_undo.proc_next: only "current" is allowed to read/write that field.
* *
...@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) ...@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
* Without the check/retry algorithm a lockless wakeup is possible: * Without the check/retry algorithm a lockless wakeup is possible:
* - queue.status is initialized to -EINTR before blocking. * - queue.status is initialized to -EINTR before blocking.
* - wakeup is performed by * - wakeup is performed by
* * unlinking the queue entry from sma->sem_pending * * unlinking the queue entry from the pending list
* * setting queue.status to IN_WAKEUP * * setting queue.status to IN_WAKEUP
* This is the notification for the blocked thread that a * This is the notification for the blocked thread that a
* result value is imminent. * result value is imminent.
...@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
sma->sem_base = (struct sem *) &sma[1]; sma->sem_base = (struct sem *) &sma[1];
for (i = 0; i < nsems; i++) { for (i = 0; i < nsems; i++) {
INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
spin_lock_init(&sma->sem_base[i].lock); spin_lock_init(&sma->sem_base[i].lock);
} }
sma->complex_count = 0; sma->complex_count = 0;
INIT_LIST_HEAD(&sma->sem_pending); INIT_LIST_HEAD(&sma->pending_alter);
INIT_LIST_HEAD(&sma->pending_const);
INIT_LIST_HEAD(&sma->list_id); INIT_LIST_HEAD(&sma->list_id);
sma->sem_nsems = nsems; sma->sem_nsems = nsems;
sma->sem_ctime = get_seconds(); sma->sem_ctime = get_seconds();
...@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) ...@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
* update_queue is O(N^2) when it restarts scanning the whole queue of * update_queue is O(N^2) when it restarts scanning the whole queue of
* waiting operations. Therefore this function checks if the restart is * waiting operations. Therefore this function checks if the restart is
* really necessary. It is called after a previously waiting operation * really necessary. It is called after a previously waiting operation
* was completed. * modified the array.
* Note that wait-for-zero operations are handled without restart.
*/ */
static int check_restart(struct sem_array *sma, struct sem_queue *q) static int check_restart(struct sem_array *sma, struct sem_queue *q)
{ {
struct sem *curr; /* pending complex alter operations are too difficult to analyse */
struct sem_queue *h; if (!list_empty(&sma->pending_alter))
/* if the operation didn't modify the array, then no restart */
if (q->alter == 0)
return 0;
/* pending complex operations are too difficult to analyse */
if (sma->complex_count)
return 1; return 1;
/* we were a sleeping complex operation. Too difficult */ /* we were a sleeping complex operation. Too difficult */
if (q->nsops > 1) if (q->nsops > 1)
return 1; return 1;
curr = sma->sem_base + q->sops[0].sem_num; /* It is impossible that someone waits for the new value:
* - complex operations always restart.
* - wait-for-zero operations are handled separately.
* - q is a previously sleeping simple operation that
* altered the array. It must be a decrement, because
* simple increments never sleep.
* - If there are older (higher priority) decrements
* in the queue, then they have observed the original
* semval value and couldn't proceed. The operation
* decremented to value - thus they won't proceed either.
*/
return 0;
}
/* No-one waits on this queue */ /**
if (list_empty(&curr->sem_pending)) * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
return 0; * @sma: semaphore array.
* @semnum: semaphore that was modified.
* @pt: list head for the tasks that must be woken up.
*
* wake_const_ops must be called after a semaphore in a semaphore array
* was set to 0. If complex const operations are pending, wake_const_ops must
* be called with semnum = -1, as well as with the number of each modified
* semaphore.
* The tasks that must be woken up are added to @pt. The return code
* is stored in q->pid.
* The function returns 1 if at least one operation was completed successfully.
*/
static int wake_const_ops(struct sem_array *sma, int semnum,
struct list_head *pt)
{
struct sem_queue *q;
struct list_head *walk;
struct list_head *pending_list;
int semop_completed = 0;
if (semnum == -1)
pending_list = &sma->pending_const;
else
pending_list = &sma->sem_base[semnum].pending_const;
/* the new semaphore value */ walk = pending_list->next;
if (curr->semval) { while (walk != pending_list) {
/* It is impossible that someone waits for the new value: int error;
* - q is a previously sleeping simple operation that
* altered the array. It must be a decrement, because q = container_of(walk, struct sem_queue, list);
* simple increments never sleep. walk = walk->next;
* - The value is not 0, thus wait-for-zero won't proceed.
* - If there are older (higher priority) decrements error = try_atomic_semop(sma, q->sops, q->nsops,
* in the queue, then they have observed the original q->undo, q->pid);
* semval value and couldn't proceed. The operation
* decremented to value - thus they won't proceed either. if (error <= 0) {
/* operation completed, remove from queue & wakeup */
unlink_queue(sma, q);
wake_up_sem_queue_prepare(pt, q, error);
if (error == 0)
semop_completed = 1;
}
}
return semop_completed;
}
/**
* do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
* @sma: semaphore array
* @sops: operations that were performed
* @nsops: number of operations
* @pt: list head of the tasks that must be woken up.
*
* do_smart_wakeup_zero() checks all required queues for wait-for-zero
* operations, based on the actual changes that were performed on the
* semaphore array.
* The function returns 1 if at least one operation was completed successfully.
*/
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
int nsops, struct list_head *pt)
{
int i;
int semop_completed = 0;
int got_zero = 0;
/* first: the per-semaphore queues, if known */
if (sops) {
for (i = 0; i < nsops; i++) {
int num = sops[i].sem_num;
if (sma->sem_base[num].semval == 0) {
got_zero = 1;
semop_completed |= wake_const_ops(sma, num, pt);
}
}
} else {
/*
* No sops means modified semaphores not known.
* Assume all were changed.
*/ */
BUG_ON(q->sops[0].sem_op >= 0); for (i = 0; i < sma->sem_nsems; i++) {
return 0; if (sma->sem_base[i].semval == 0) {
got_zero = 1;
semop_completed |= wake_const_ops(sma, i, pt);
}
}
} }
/* /*
* semval is 0. Check if there are wait-for-zero semops. * If one of the modified semaphores got 0,
* They must be the first entries in the per-semaphore queue * then check the global queue, too.
*/ */
h = list_first_entry(&curr->sem_pending, struct sem_queue, list); if (got_zero)
BUG_ON(h->nsops != 1); semop_completed |= wake_const_ops(sma, -1, pt);
BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
/* Yes, there is a wait-for-zero semop. Restart */ return semop_completed;
if (h->sops[0].sem_op == 0)
return 1;
/* Again - no-one is waiting for the new value. */
return 0;
} }
...@@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) ...@@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
* semaphore. * semaphore.
* The tasks that must be woken up are added to @pt. The return code * The tasks that must be woken up are added to @pt. The return code
* is stored in q->pid. * is stored in q->pid.
* The function internally checks if const operations can now succeed.
*
* The function returns 1 if at least one semop was completed successfully. * The function returns 1 if at least one semop was completed successfully.
*/ */
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
...@@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) ...@@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
int semop_completed = 0; int semop_completed = 0;
if (semnum == -1) if (semnum == -1)
pending_list = &sma->sem_pending; pending_list = &sma->pending_alter;
else else
pending_list = &sma->sem_base[semnum].sem_pending; pending_list = &sma->sem_base[semnum].pending_alter;
again: again:
walk = pending_list->next; walk = pending_list->next;
...@@ -702,13 +781,12 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) ...@@ -702,13 +781,12 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
/* If we are scanning the single sop, per-semaphore list of /* If we are scanning the single sop, per-semaphore list of
* one semaphore and that semaphore is 0, then it is not * one semaphore and that semaphore is 0, then it is not
* necessary to scan the "alter" entries: simple increments * necessary to scan further: simple increments
* that affect only one entry succeed immediately and cannot * that affect only one entry succeed immediately and cannot
* be in the per semaphore pending queue, and decrements * be in the per semaphore pending queue, and decrements
* cannot be successful if the value is already 0. * cannot be successful if the value is already 0.
*/ */
if (semnum != -1 && sma->sem_base[semnum].semval == 0 && if (semnum != -1 && sma->sem_base[semnum].semval == 0)
q->alter)
break; break;
error = try_atomic_semop(sma, q->sops, q->nsops, error = try_atomic_semop(sma, q->sops, q->nsops,
...@@ -724,6 +802,7 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) ...@@ -724,6 +802,7 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
restart = 0; restart = 0;
} else { } else {
semop_completed = 1; semop_completed = 1;
do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
restart = check_restart(sma, q); restart = check_restart(sma, q);
} }
...@@ -742,8 +821,8 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) ...@@ -742,8 +821,8 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
* @otime: force setting otime * @otime: force setting otime
* @pt: list head of the tasks that must be woken up. * @pt: list head of the tasks that must be woken up.
* *
* do_smart_update() does the required called to update_queue, based on the * do_smart_update() does the required calls to update_queue and wakeup_zero,
* actual changes that were performed on the semaphore array. * based on the actual changes that were performed on the semaphore array.
* Note that the function does not do the actual wake-up: the caller is * Note that the function does not do the actual wake-up: the caller is
* responsible for calling wake_up_sem_queue_do(@pt). * responsible for calling wake_up_sem_queue_do(@pt).
* It is safe to perform this call after dropping all locks. * It is safe to perform this call after dropping all locks.
...@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop ...@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
int i; int i;
int progress; int progress;
otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
progress = 1; progress = 1;
retry_global: retry_global:
if (sma->complex_count) { if (sma->complex_count) {
...@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) ...@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum)
struct sem_queue * q; struct sem_queue * q;
semncnt = 0; semncnt = 0;
list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
struct sembuf * sops = q->sops; struct sembuf * sops = q->sops;
BUG_ON(sops->sem_num != semnum); BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT)) if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
semncnt++; semncnt++;
} }
list_for_each_entry(q, &sma->sem_pending, list) { list_for_each_entry(q, &sma->pending_alter, list) {
struct sembuf * sops = q->sops; struct sembuf * sops = q->sops;
int nsops = q->nsops; int nsops = q->nsops;
int i; int i;
...@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) ...@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum)
struct sem_queue * q; struct sem_queue * q;
semzcnt = 0; semzcnt = 0;
list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
struct sembuf * sops = q->sops; struct sembuf * sops = q->sops;
BUG_ON(sops->sem_num != semnum); BUG_ON(sops->sem_num != semnum);
if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT)) if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
semzcnt++; semzcnt++;
} }
list_for_each_entry(q, &sma->sem_pending, list) { list_for_each_entry(q, &sma->pending_const, list) {
struct sembuf * sops = q->sops; struct sembuf * sops = q->sops;
int nsops = q->nsops; int nsops = q->nsops;
int i; int i;
...@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) ...@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
/* Wake up all pending processes and let them fail with EIDRM. */ /* Wake up all pending processes and let them fail with EIDRM. */
INIT_LIST_HEAD(&tasks); INIT_LIST_HEAD(&tasks);
list_for_each_entry_safe(q, tq, &sma->sem_pending, list) { list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
}
list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
} }
for (i = 0; i < sma->sem_nsems; i++) { for (i = 0; i < sma->sem_nsems; i++) {
struct sem *sem = sma->sem_base + i; struct sem *sem = sma->sem_base + i;
list_for_each_entry_safe(q, tq, &sem->sem_pending, list) { list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
}
list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
unlink_queue(sma, q); unlink_queue(sma, q);
wake_up_sem_queue_prepare(&tasks, q, -EIDRM); wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
} }
...@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, ...@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
curr = &sma->sem_base[sops->sem_num]; curr = &sma->sem_base[sops->sem_num];
if (alter) if (alter)
list_add_tail(&queue.list, &curr->sem_pending); list_add_tail(&queue.list, &curr->pending_alter);
else else
list_add(&queue.list, &curr->sem_pending); list_add_tail(&queue.list, &curr->pending_const);
} else { } else {
if (alter) if (alter)
list_add_tail(&queue.list, &sma->sem_pending); list_add_tail(&queue.list, &sma->pending_alter);
else else
list_add(&queue.list, &sma->sem_pending); list_add_tail(&queue.list, &sma->pending_const);
sma->complex_count++; sma->complex_count++;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment