Commit 63b05203 authored by Suparna Bhattacharya, committed by Linus Torvalds

[PATCH] AIO: retry infrastructure fixes and enhancements

From: Daniel McNeil <daniel@osdl.org>
From: Chris Mason <mason@suse.com>

 AIO: retry infrastructure fixes and enhancements

 Reorganises, comments and fixes the AIO retry logic. Fixes 
 and enhancements include:

   - Split iocb setup and execution in io_submit
        (also fixes io_submit error reporting)
   - Use aio workqueue instead of keventd for retries
   - Default high level retry methods
   - Subtle use_mm/unuse_mm fix
   - Code commenting
   - Fix aio process hang on EINVAL (Daniel McNeil)
   - Hold the context lock across unuse_mm
   - Acquire task_lock in use_mm()
   - Allow fops to override the retry method with their own
   - Elevated ref count for AIO retries (Daniel McNeil)
   - set_fs needed when calling use_mm
   - Flush workqueue on __put_ioctx (Chris Mason)
   - Fix io_cancel to work with retries (Chris Mason)
   - Read-immediate option for socket/pipe retry support

 Note on default high-level retry methods support
 ================================================

 High-level retry methods allow an AIO request to be executed as a series of
 non-blocking iterations, where each iteration retries the remaining part of
 the request from where the last iteration left off, by reissuing the
 corresponding AIO fop routine with modified arguments representing the
 remaining I/O.  The retries are "kicked" via the AIO waitqueue callback
 aio_wake_function(), which replaces the default wait queue entry used for
 blocking waits.

 The high level retry infrastructure is responsible for running the
 iterations in the mm context (address space) of the caller, and ensures that
 only one retry instance is active at a given time, thus relieving the fops
 themselves of having to deal with potential races of that sort.
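
 To make the iteration idea concrete, here is an editorial sketch (not part
 of this patch) of how a retry-capable fop method might be structured.  The
 helper do_partial_read() is hypothetical and stands in for the non-blocking
 part of a real read path; the completion convention at the end is
 simplified.

	/*
	 * Hypothetical sketch only: one iteration of a high-level-retry
	 * style aio read.  If it would block, it returns -EIOCBRETRY and
	 * is re-kicked later via aio_wake_function().
	 */
	static ssize_t example_aio_read_retry(struct kiocb *iocb)
	{
		ssize_t ret;

		/* do_partial_read() is a made-up helper: it transfers as
		 * much as it can without blocking.  If it would block, it
		 * is assumed to queue the caller's wait entry (here
		 * iocb->ki_wait, via current->io_wait) on the waitqueue it
		 * would have slept on and return -EAGAIN. */
		ret = do_partial_read(iocb->ki_filp, iocb->ki_buf,
				      iocb->ki_left, &iocb->ki_pos);
		if (ret == -EAGAIN)
			return -EIOCBRETRY;	/* retried when kicked */
		if (ret < 0)
			return ret;		/* hard error */

		/* remember where this iteration got to, so the next one
		 * is issued with arguments for the remaining I/O only */
		iocb->ki_buf += ret;
		iocb->ki_left -= ret;

		if (iocb->ki_left)
			return -EIOCBRETRY;	/* more work remains */
		return iocb->ki_nbytes;		/* request fully done */
	}
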
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 86b9159a
This diff is collapsed.
@@ -52,7 +52,7 @@ struct kiocb {
 	struct file		*ki_filp;
 	struct kioctx		*ki_ctx;	/* may be NULL for sync ops */
 	int			(*ki_cancel)(struct kiocb *, struct io_event *);
-	long			(*ki_retry)(struct kiocb *);
+	ssize_t			(*ki_retry)(struct kiocb *);
 	void			(*ki_dtor)(struct kiocb *);
 	struct list_head	ki_list;	/* the aio core uses this
@@ -64,6 +64,16 @@ struct kiocb {
 	} ki_obj;
 	__u64			ki_user_data;	/* user's data for completion */
 	loff_t			ki_pos;
+	/* State that we remember to be able to restart/retry */
+	unsigned short		ki_opcode;
+	size_t			ki_nbytes;	/* copy of iocb->aio_nbytes */
+	char			*ki_buf;	/* remaining iocb->aio_buf */
+	size_t			ki_left;	/* remaining bytes */
+	wait_queue_t		ki_wait;
+	long			ki_retried;	/* just for testing */
+	long			ki_kicked;	/* just for testing */
+	long			ki_queued;	/* just for testing */
+
 	void			*private;
 };
@@ -79,6 +89,8 @@ struct kiocb {
 		(x)->ki_cancel = NULL;		\
 		(x)->ki_dtor = NULL;		\
 		(x)->ki_obj.tsk = tsk;		\
+		(x)->ki_user_data = 0;		\
+		init_wait((&(x)->ki_wait));	\
 	} while (0)

 #define AIO_RING_MAGIC	0xa10a10a1
@@ -161,6 +173,20 @@ int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 #define get_ioctx(kioctx)	do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
 #define put_ioctx(kioctx)	do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)

+#define in_aio() !is_sync_wait(current->io_wait)
+/* may be used for debugging */
+#define warn_if_async()							\
+do {									\
+	if (in_aio()) {							\
+		printk(KERN_ERR "%s(%s:%d) called in async context!\n",\
+			__FUNCTION__, __FILE__, __LINE__);		\
+		dump_stack();						\
+	}								\
+} while (0)
+
+#define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait)
+#define is_retried_kiocb(iocb) ((iocb)->ki_retried > 1)
+
 #include <linux/aio_abi.h>

 static inline struct kiocb *list_kiocb(struct list_head *h)
...
@@ -22,6 +22,7 @@
 #define EBADTYPE	527	/* Type not supported by server */
 #define EJUKEBOX	528	/* Request initiated, but will not complete before timeout */
 #define EIOCBQUEUED	529	/* iocb queued, will get completion event */
+#define EIOCBRETRY	530	/* iocb queued, will trigger a retry */

 #endif
...
@@ -561,7 +561,13 @@ struct task_struct {
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+/*
+ * current io wait handle: wait queue entry to use for io waits
+ * If this thread is processing aio, this points at the waitqueue
+ * inside the currently handled kiocb. It may be NULL (i.e. default
+ * to a stack based synchronous wait) if its doing sync IO.
+ */
+	wait_queue_t *io_wait;
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;		/* could be shared with used_math */
...
@@ -80,6 +80,15 @@ static inline int waitqueue_active(wait_queue_head_t *q)
 	return !list_empty(&q->task_list);
 }

+/*
+ * Used to distinguish between sync and async io wait context:
+ * sync i/o typically specifies a NULL wait queue entry or a wait
+ * queue entry bound to a task (current task) to wake up.
+ * aio specifies a wait queue entry with an async notification
+ * callback routine, not associated with any task.
+ */
+#define is_sync_wait(wait)	(!(wait) || ((wait)->task))
+
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
...
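
[Editorial note, not part of the patch: the sched.h and wait.h pieces above
fit together roughly as sketched below -- while running a retry, the aio
core is expected to point current->io_wait at the kiocb's ki_wait entry, so
that any waitqueue the fop would block on receives the async callback entry
instead of a task-bound one, and the prepare_to_wait() changes in the next
hunks then skip set_current_state() for such async waits.]

	/* Illustrative sketch only (names are from this patch) */
	static ssize_t run_one_retry(struct kiocb *iocb)
	{
		ssize_t ret;

		current->io_wait = &iocb->ki_wait;	/* async wait context */
		ret = iocb->ki_retry(iocb);
		current->io_wait = NULL;		/* back to sync waits */
		return ret;
	}
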
@@ -152,7 +152,12 @@ void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
 	spin_lock_irqsave(&q->lock, flags);
 	if (list_empty(&wait->task_list))
 		__add_wait_queue(q, wait);
-	set_current_state(state);
+	/*
+	 * don't alter the task state if this is just going to
+	 * queue an async wait queue callback
+	 */
+	if (is_sync_wait(wait))
+		set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
@@ -167,7 +172,12 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 	spin_lock_irqsave(&q->lock, flags);
 	if (list_empty(&wait->task_list))
 		__add_wait_queue_tail(q, wait);
-	set_current_state(state);
+	/*
+	 * don't alter the task state if this is just going to
+	 * queue an async wait queue callback
+	 */
+	if (is_sync_wait(wait))
+		set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
@@ -965,6 +975,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->io_wait = NULL;
 	p->audit_context = NULL;
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_copy(p->mempolicy);
...