Commit 1c706086 authored by Dave Kleikamp's avatar Dave Kleikamp

Rework JFS's inode locking

In order for JFS to be able to quiesce the current activity, while
blocking new transactions, the locking needed some rework.  New
transactions are stopped in the functions txBegin or txBeginAnon,
where the rdwrlock (IREAD_LOCK/IWRITE_LOCK) may be held.  Dirty
inodes may need to be committed while new transactions are blocked
here, so another lock is introduced (commit_sem) which is taken after
txBegin/txBeginAnon is called.  This ensures that the proper
serialization takes place, without the write_inode method needing to
grab the rdwrlock.

In addition, the use of IWRITE_LOCK and IREAD_LOCK has been removed
from directory inodes.  The serialization done by the VFS using i_sem
is sufficient to avoid races.

This patch removes JFS's dependency on down_write_trylock.
parent 88859354
......@@ -38,9 +38,7 @@ int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
return rc;
IWRITE_LOCK(inode);
rc |= jfs_commit_inode(inode, 1);
IWRITE_UNLOCK(inode);
return rc ? -EIO : 0;
}
......@@ -64,10 +62,19 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
do {
tid = txBegin(ip->i_sb, 0);
/*
* The commit_sem cannot be taken before txBegin.
* txBegin may block and there is a chance the inode
* could be marked dirty and need to be committed
* before txBegin unblocks
*/
down(&JFS_IP(ip)->commit_sem);
newsize = xtTruncate(tid, ip, length,
COMMIT_TRUNCATE | COMMIT_PWMAP);
if (newsize < 0) {
txEnd(tid);
up(&JFS_IP(ip)->commit_sem);
break;
}
......@@ -76,6 +83,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
txCommit(tid, 1, &ip, 0);
txEnd(tid);
up(&JFS_IP(ip)->commit_sem);
} while (newsize > length); /* Truncate isn't always atomic */
}
......
......@@ -107,8 +107,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
}
tid = txBegin(inode->i_sb, COMMIT_INODE);
down(&JFS_IP(inode)->commit_sem);
rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0);
txEnd(tid);
up(&JFS_IP(inode)->commit_sem);
return -rc;
}
......@@ -123,25 +125,19 @@ void jfs_write_inode(struct inode *inode, int wait)
!test_cflag(COMMIT_Dirty, inode))
return;
IWRITE_LOCK(inode);
if (jfs_commit_inode(inode, wait)) {
jERROR(1, ("jfs_write_inode: jfs_commit_inode failed!\n"));
}
IWRITE_UNLOCK(inode);
}
void jfs_delete_inode(struct inode *inode)
{
jFYI(1, ("In jfs_delete_inode, inode = 0x%p\n", inode));
IWRITE_LOCK(inode);
if (test_cflag(COMMIT_Freewmap, inode))
freeZeroLink(inode);
diFree(inode);
IWRITE_UNLOCK(inode);
clear_inode(inode);
}
......@@ -203,8 +199,7 @@ static int jfs_get_block(struct inode *ip, sector_t lblock,
if ((no_size_check ||
((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size)) &&
(xtLookup
(ip, lblock64, 1, &xflag, &xaddr, &xlen, no_size_check)
(xtLookup(ip, lblock64, 1, &xflag, &xaddr, &xlen, no_size_check)
== 0) && xlen) {
if (xflag & XAD_NOTRECORDED) {
if (!create)
......@@ -241,8 +236,7 @@ static int jfs_get_block(struct inode *ip, sector_t lblock,
* Allocate a new block
*/
#ifdef _JFS_4K
if ((rc =
extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
goto unlock;
rc = extAlloc(ip, 1, lblock64, &xad, FALSE);
if (rc)
......
......@@ -96,6 +96,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
/* This blocks if we are low on resources */
txBeginAnon(ip->i_sb);
/* Avoid race with jfs_commit_inode() */
down(&JFS_IP(ip)->commit_sem);
/* validate extent length */
if (xlen > MAXXLEN)
xlen = MAXXLEN;
......@@ -138,8 +141,8 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
* is smaller than the number of blocks per page.
*/
nxlen = xlen;
if ((rc =
extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) {
if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) {
up(&JFS_IP(ip)->commit_sem);
return (rc);
}
......@@ -160,6 +163,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
*/
if (rc) {
dbFree(ip, nxaddr, nxlen);
up(&JFS_IP(ip)->commit_sem);
return (rc);
}
......@@ -174,6 +178,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
mark_inode_dirty(ip);
up(&JFS_IP(ip)->commit_sem);
/*
* COMMIT_SyncList flags an anonymous tlock on page that is on
* sync list.
......@@ -217,6 +222,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
/* This blocks if we are low on resources */
txBeginAnon(ip->i_sb);
down(&JFS_IP(ip)->commit_sem);
/* validate extent length */
if (nxlen > MAXXLEN)
nxlen = MAXXLEN;
......@@ -235,7 +241,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
xp->flag = 0;
if ((rc = xtUpdate(0, ip, xp)))
return (rc);
goto exit;
}
/* try to allocated the request number of blocks for the
......@@ -247,7 +253,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
* space as to satisfy the extend page.
*/
if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
return (rc);
goto exit;
delta = nxlen - xlen;
......@@ -284,7 +290,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
/* extend the extent */
if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
dbFree(ip, xaddr + xlen, delta);
return (rc);
goto exit;
}
} else {
/*
......@@ -294,7 +300,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
*/
if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
dbFree(ip, nxaddr, nxlen);
return (rc);
goto exit;
}
}
......@@ -325,8 +331,9 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
xp->flag = xflag;
mark_inode_dirty(ip);
return (0);
exit:
up(&JFS_IP(ip)->commit_sem);
return (rc);
}
......@@ -423,19 +430,13 @@ int extRecord(struct inode *ip, xad_t * xp)
txBeginAnon(ip->i_sb);
/* update the extent */
if ((rc = xtUpdate(0, ip, xp)))
return (rc);
#ifdef _STILL_TO_PORT
/* no longer abnr */
cp->cm_abnr = FALSE;
down(&JFS_IP(ip)->commit_sem);
/* mark the cbuf as modified */
cp->cm_modified = TRUE;
#endif /* _STILL_TO_PORT */
/* update the extent */
rc = xtUpdate(0, ip, xp);
return (0);
up(&JFS_IP(ip)->commit_sem);
return (rc);
}
......
......@@ -19,6 +19,7 @@
#ifndef _H_JFS_INCORE
#define _H_JFS_INCORE
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <asm/bitops.h>
#include "jfs_types.h"
......@@ -30,14 +31,6 @@
*/
#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */
/*
* Due to header ordering problems this can't be in jfs_lock.h
*/
typedef struct jfs_rwlock {
struct rw_semaphore rw_sem;
atomic_t in_use; /* for hacked implementation of trylock */
} jfs_rwlock_t;
/*
* JFS-private inode information
*/
......@@ -62,7 +55,19 @@ struct jfs_inode_info {
lid_t atltail; /* anonymous tlock list tail */
struct list_head anon_inode_list; /* inodes having anonymous txns */
struct list_head mp_list; /* metapages in inode's address space */
jfs_rwlock_t rdwrlock; /* read/write lock */
/*
* rdwrlock serializes xtree between reads & writes and synchronizes
* changes to special inodes. It's use would be redundant on
* directories since the i_sem taken in the VFS is sufficient.
*/
struct rw_semaphore rdwrlock;
/*
* commit_sem serializes transaction processing on an inode.
* It must be taken after beginning a transaction (txBegin), since
* dirty inodes may be committed while a new transaction on the
* inode is blocked in txBegin or TxBeginAnon
*/
struct semaphore commit_sem;
lid_t xtlid; /* lid of xtree lock on directory */
union {
struct {
......@@ -87,6 +92,12 @@ struct jfs_inode_info {
#define i_dtroot u.dir._dtroot
#define i_inline u.link._inline
#define IREAD_LOCK(ip) down_read(&JFS_IP(ip)->rdwrlock)
#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock)
#define IWRITE_LOCK(ip) down_write(&JFS_IP(ip)->rdwrlock)
#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock)
/*
* cflag
*/
......
......@@ -91,40 +91,3 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
return inode;
}
/*
* NAME: iwritelocklist()
*
* FUNCTION: Lock multiple inodes in sorted order to avoid deadlock
*
*/
void iwritelocklist(int n, ...)
{
va_list ilist;
struct inode *sort[4];
struct inode *ip;
int k, m;
va_start(ilist, n);
for (k = 0; k < n; k++)
sort[k] = va_arg(ilist, struct inode *);
va_end(ilist);
/* Bubble sort in descending order */
do {
m = 0;
for (k = 0; k < n; k++)
if ((k + 1) < n
&& sort[k + 1]->i_ino > sort[k]->i_ino) {
ip = sort[k];
sort[k] = sort[k + 1];
sort[k + 1] = ip;
m++;
}
} while (m);
/* Lock them */
for (k = 0; k < n; k++) {
IWRITE_LOCK(sort[k]);
}
}
......@@ -24,63 +24,7 @@
/*
* jfs_lock.h
*
* JFS lock definition for globally referenced locks
*/
/* readers/writer lock: thread-thread */
/*
* RW semaphores do not currently have a trylock function. Since the
* implementation varies by platform, I have implemented a platform-independent
* wrapper around the rw_semaphore routines. If this turns out to be the best
* way of avoiding our locking problems, I will push to get a trylock
* implemented in the kernel, but I'd rather find a way to avoid having to
* use it.
*/
#define RDWRLOCK_T jfs_rwlock_t
static inline void RDWRLOCK_INIT(jfs_rwlock_t * Lock)
{
init_rwsem(&Lock->rw_sem);
atomic_set(&Lock->in_use, 0);
}
static inline void READ_LOCK(jfs_rwlock_t * Lock)
{
atomic_inc(&Lock->in_use);
down_read(&Lock->rw_sem);
}
static inline void READ_UNLOCK(jfs_rwlock_t * Lock)
{
up_read(&Lock->rw_sem);
atomic_dec(&Lock->in_use);
}
static inline void WRITE_LOCK(jfs_rwlock_t * Lock)
{
atomic_inc(&Lock->in_use);
down_write(&Lock->rw_sem);
}
static inline int WRITE_TRYLOCK(jfs_rwlock_t * Lock)
{
if (atomic_read(&Lock->in_use))
return 0;
WRITE_LOCK(Lock);
return 1;
}
static inline void WRITE_UNLOCK(jfs_rwlock_t * Lock)
{
up_write(&Lock->rw_sem);
atomic_dec(&Lock->in_use);
}
#define IREAD_LOCK(ip) READ_LOCK(&JFS_IP(ip)->rdwrlock)
#define IREAD_UNLOCK(ip) READ_UNLOCK(&JFS_IP(ip)->rdwrlock)
#define IWRITE_LOCK(ip) WRITE_LOCK(&JFS_IP(ip)->rdwrlock)
#define IWRITE_TRYLOCK(ip) WRITE_TRYLOCK(&JFS_IP(ip)->rdwrlock)
#define IWRITE_UNLOCK(ip) WRITE_UNLOCK(&JFS_IP(ip)->rdwrlock)
#define IWRITE_LOCK_LIST iwritelocklist
extern void iwritelocklist(int, ...);
/*
* Conditional sleep where condition is protected by spinlock
......
......@@ -1524,8 +1524,6 @@ static int lmLogShutdown(log_t * log)
*
* RETURN: 0 - success
* errors returned by vms_iowait().
*
* serialization: IWRITE_LOCK(log inode) held on entry/exit
*/
static int lmLogFileSystem(log_t * log, char *uuid, int activate)
{
......
......@@ -2898,6 +2898,8 @@ int jfs_sync(void)
{
struct inode *ip;
struct jfs_inode_info *jfs_ip;
int rc;
tid_t tid;
lock_kernel();
......@@ -2927,17 +2929,19 @@ int jfs_sync(void)
ip = &jfs_ip->vfs_inode;
/*
* We must release the TXN_LOCK since our
* IWRITE_TRYLOCK implementation may still block
* down_trylock returns 0 on success. This is
* inconsistent with spin_trylock.
*/
TXN_UNLOCK();
if (IWRITE_TRYLOCK(ip)) {
if (! down_trylock(&jfs_ip->commit_sem)) {
/*
* inode will be removed from anonymous list
* when it is committed
*/
jfs_commit_inode(ip, 0);
IWRITE_UNLOCK(ip);
TXN_UNLOCK();
tid = txBegin(ip->i_sb, COMMIT_INODE);
rc = txCommit(tid, 1, &ip, 0);
txEnd(tid);
up(&jfs_ip->commit_sem);
/*
* Just to be safe. I don't know how
* long we can run without blocking
......@@ -2945,17 +2949,11 @@ int jfs_sync(void)
cond_resched();
TXN_LOCK();
} else {
/* We can't get the write lock. It may
/* We can't get the commit semaphore. It may
* be held by a thread waiting for tlock's
* so let's not block here. Save it to
* put back on the anon_list.
*/
/*
* We released TXN_LOCK, let's make sure
* this inode is still there
*/
TXN_LOCK();
if (TxAnchor.anon_list.next !=
&jfs_ip->anon_inode_list)
continue;
......
This diff is collapsed.
......@@ -387,7 +387,8 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
SLAB_CTOR_CONSTRUCTOR) {
INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
INIT_LIST_HEAD(&jfs_ip->mp_list);
RDWRLOCK_INIT(&jfs_ip->rdwrlock);
init_rwsem(&jfs_ip->rdwrlock);
init_MUTEX(&jfs_ip->commit_sem);
inode_init_once(&jfs_ip->vfs_inode);
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment