Merge nathans@xfs.org:/export/hose/bkroot/xfs-linux-2.6

into sgi.com:/source2/xfs-linux-2.6

Merge nathans@xfs.org:/export/hose/bkroot/xfs-linux-2.6
into sgi.com:/source2/xfs-linux-2.6
9004fd8a · Nathan Scott · 961c380c · 2a6d76e4 · 9004fd8a · 961c380c
Commit 9004fd8a authored Mar 04, 2004 by Nathan Scott
32 changed files
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -130,7 +130,6 @@ xfs-$(CONFIG_XFS_TRACE)		+= xfs_dir2_trace.o

 # Objects in linux/
 xfs-y				+= $(addprefix linux/, \
-				   mrlock.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
 				   xfs_file.o \

--- a/fs/xfs/linux/mrlock.c
+++ b/fs/xfs/linux/mrlock.c
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#include <linux/time.h>
-#include <linux/sched.h>
-#include <asm/system.h>
-#include <linux/interrupt.h>
-#include <asm/current.h>
-
-#include "mrlock.h"
-
-
-#if USE_RW_WAIT_QUEUE_SPINLOCK
-# define wq_write_lock	write_lock
-#else
-# define wq_write_lock	spin_lock
-#endif
-
-/*
- * We don't seem to need lock_type (only one supported), name, or
- * sequence. But, XFS will pass it so let's leave them here for now.
- */
-/* ARGSUSED */
-void
-mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
-{
-	mrp->mr_count = 0;
-	mrp->mr_reads_waiting = 0;
-	mrp->mr_writes_waiting = 0;
-	init_waitqueue_head(&mrp->mr_readerq);
-	init_waitqueue_head(&mrp->mr_writerq);
-	mrp->mr_lock = SPIN_LOCK_UNLOCKED;
-}
-
-/*
- * Macros to lock/unlock the mrlock_t.
- */
-
-#define MRLOCK(m)		spin_lock(&(m)->mr_lock);
-#define MRUNLOCK(m)		spin_unlock(&(m)->mr_lock);
-
-
-/*
- * lock_wait should never be called in an interrupt thread.
- *
- * mrlocks can sleep (i.e. call schedule) and so they can't ever
- * be called from an interrupt thread.
- *
- * threads that wake-up should also never be invoked from interrupt threads.
- *
- * But, waitqueue_lock is locked from interrupt threads - and we are
- * called with interrupts disabled, so it is all OK.
- */
-
-/* ARGSUSED */
-void
-lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
-{
-	DECLARE_WAITQUEUE( wait, current );
-
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-
-	spin_lock(&q->lock);
-	if (rw) {
-		__add_wait_queue_tail(q, &wait);
-	} else {
-		__add_wait_queue(q, &wait);
-	}
-
-	spin_unlock(&q->lock);
-	spin_unlock(lock);
-
-	schedule();
-
-	spin_lock(&q->lock);
-	__remove_wait_queue(q, &wait);
-	spin_unlock(&q->lock);
-
-	spin_lock(lock);
-
-	/* return with lock held */
-}
-
-/* ARGSUSED */
-void
-mrfree(mrlock_t *mrp)
-{
-}
-
-/* ARGSUSED */
-void
-mrlock(mrlock_t *mrp, int type, int flags)
-{
-	if (type == MR_ACCESS)
-		mraccess(mrp);
-	else
-		mrupdate(mrp);
-}
-
-/* ARGSUSED */
-void
-mraccessf(mrlock_t *mrp, int flags)
-{
-	MRLOCK(mrp);
-	if(mrp->mr_writes_waiting > 0) {
-		mrp->mr_reads_waiting++;
-		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
-		mrp->mr_reads_waiting--;
-	}
-	while (mrp->mr_count < 0) {
-		mrp->mr_reads_waiting++;
-		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
-		mrp->mr_reads_waiting--;
-	}
-	mrp->mr_count++;
-	MRUNLOCK(mrp);
-}
-
-/* ARGSUSED */
-void
-mrupdatef(mrlock_t *mrp, int flags)
-{
-	MRLOCK(mrp);
-	while(mrp->mr_count) {
-		mrp->mr_writes_waiting++;
-		lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
-		mrp->mr_writes_waiting--;
-	}
-
-	mrp->mr_count = -1; /* writer on it */
-	MRUNLOCK(mrp);
-}
-
-int
-mrtryaccess(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-	/*
-	 * If anyone is waiting for update access or the lock is held for update
-	 * fail the request.
-	 */
-	if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
-		MRUNLOCK(mrp);
-		return 0;
-	}
-	mrp->mr_count++;
-	MRUNLOCK(mrp);
-	return 1;
-}
-
-int
-mrtrypromote(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-
-	if(mrp->mr_count == 1) { /* We are the only thread with the lock */
-		mrp->mr_count = -1; /* writer on it */
-		MRUNLOCK(mrp);
-		return 1;
-	}
-
-	MRUNLOCK(mrp);
-	return 0;
-}
-
-int
-mrtryupdate(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-
-	if(mrp->mr_count) {
-		MRUNLOCK(mrp);
-		return 0;
-	}
-
-	mrp->mr_count = -1; /* writer on it */
-	MRUNLOCK(mrp);
-	return 1;
-}
-
-static __inline__ void mrwake(mrlock_t *mrp)
-{
-	/*
-	 * First, if the count is now 0, we need to wake-up anyone waiting.
-	 */
-	if (!mrp->mr_count) {
-		if (mrp->mr_writes_waiting) {	/* Wake-up first writer waiting */
-			wake_up(&mrp->mr_writerq);
-		} else if (mrp->mr_reads_waiting) {	/* Wakeup any readers waiting */
-			wake_up(&mrp->mr_readerq);
-		}
-	}
-}
-
-void
-mraccunlock(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-	mrp->mr_count--;
-	mrwake(mrp);
-	MRUNLOCK(mrp);
-}
-
-void
-mrunlock(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-	if (mrp->mr_count < 0) {
-		mrp->mr_count = 0;
-	} else {
-		mrp->mr_count--;
-	}
-	mrwake(mrp);
-	MRUNLOCK(mrp);
-}
-
-int
-ismrlocked(mrlock_t *mrp, int type)	/* No need to lock since info can change */
-{
-	if (type == MR_ACCESS)
-		return (mrp->mr_count > 0); /* Read lock */
-	else if (type == MR_UPDATE)
-		return (mrp->mr_count < 0); /* Write lock */
-	else if (type == (MR_UPDATE | MR_ACCESS))
-		return (mrp->mr_count);	/* Any type of lock held */
-	else /* Any waiters */
-		return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
-}
-
-/*
- * Demote from update to access. We better be the only thread with the
- * lock in update mode so it should be easy to set to 1.
- * Wake-up any readers waiting.
- */
-
-void
-mrdemote(mrlock_t *mrp)
-{
-	MRLOCK(mrp);
-	mrp->mr_count = 1;
-	if (mrp->mr_reads_waiting) {	/* Wakeup all readers waiting */
-		wake_up(&mrp->mr_readerq);
-	}
-	MRUNLOCK(mrp);
-}
--- a/fs/xfs/linux/mrlock.h
+++ b/fs/xfs/linux/mrlock.h
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -32,56 +32,73 @@
 #ifndef __XFS_SUPPORT_MRLOCK_H__
 #define __XFS_SUPPORT_MRLOCK_H__

-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/rwsem.h>

-/*
- * Implement mrlocks on Linux that work for XFS.
- *
- * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
- * use read_lock, write_lock, ... see spinlock.h.
- */
+enum { MR_NONE, MR_ACCESS, MR_UPDATE };

-typedef struct mrlock_s {
-	int			mr_count;
-	unsigned short		mr_reads_waiting;
-	unsigned short		mr_writes_waiting;
-	wait_queue_head_t	mr_readerq;
-	wait_queue_head_t	mr_writerq;
-	spinlock_t		mr_lock;
+typedef struct {
+	struct rw_semaphore	mr_lock;
+	int			mr_writer;
 } mrlock_t;

-#define MR_ACCESS	1
-#define MR_UPDATE	2
+#define mrinit(mrp, name)	\
+	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
+#define mrfree(mrp)		do { } while (0)
+#define mraccess(mrp)		mraccessf(mrp, 0)
+#define mrupdate(mrp)		mrupdatef(mrp, 0)

-#define MRLOCK_BARRIER		0x1
-#define MRLOCK_ALLOW_EQUAL_PRI	0x8
+static inline void mraccessf(mrlock_t *mrp, int flags)
+{
+	down_read(&mrp->mr_lock);
+}

-/*
- * mraccessf/mrupdatef take flags to be passed in while sleeping;
- * only PLTWAIT is currently supported.
- */
+static inline void mrupdatef(mrlock_t *mrp, int flags)
+{
+	down_write(&mrp->mr_lock);
+	mrp->mr_writer = 1;
+}

-extern void	mraccessf(mrlock_t *, int);
-extern void	mrupdatef(mrlock_t *, int);
-extern void     mrlock(mrlock_t *, int, int);
-extern void     mrunlock(mrlock_t *);
-extern void     mraccunlock(mrlock_t *);
-extern int      mrtryupdate(mrlock_t *);
-extern int      mrtryaccess(mrlock_t *);
-extern int	mrtrypromote(mrlock_t *);
-extern void     mrdemote(mrlock_t *);
+static inline int mrtryaccess(mrlock_t *mrp)
+{
+	return down_read_trylock(&mrp->mr_lock);
+}

-extern int	ismrlocked(mrlock_t *, int);
-extern void     mrlock_init(mrlock_t *, int type, char *name, long sequence);
-extern void     mrfree(mrlock_t *);
+static inline int mrtryupdate(mrlock_t *mrp)
+{
+	if (!down_write_trylock(&mrp->mr_lock))
+		return 0;
+	mrp->mr_writer = 1;
+	return 1;
+}

-#define mrinit(mrp, name)	mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
-#define mraccess(mrp)		mraccessf(mrp, 0) /* grab for READ/ACCESS */
-#define mrupdate(mrp)		mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */
-#define mrislocked_access(mrp)	((mrp)->mr_count > 0)
-#define mrislocked_update(mrp)	((mrp)->mr_count < 0)
+static inline void mrunlock(mrlock_t *mrp)
+{
+	if (mrp->mr_writer) {
+		mrp->mr_writer = 0;
+		up_write(&mrp->mr_lock);
+	} else {
+		up_read(&mrp->mr_lock);
+	}
+}
+
+static inline void mrdemote(mrlock_t *mrp)
+{
+	mrp->mr_writer = 0;
+	downgrade_write(&mrp->mr_lock);
+}
+
+/*
+ * Debug-only routine.  Without some platform-specific asm code, we can
+ * now only answer requests regarding whether we hold the lock for write
+ * (reader state is outside our visibility, we only track writer state).
+ * Note: means !ismrlocked would give false positives, so don't do that.
+ */
+static inline int ismrlocked(mrlock_t *mrp, int type)
+{
+	if (type == MR_UPDATE)
+		return mrp->mr_writer;
+	return 1;
+}

 #endif /* __XFS_SUPPORT_MRLOCK_H__ */
--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -54,8 +54,54 @@
 #include "xfs_iomap.h"
 #include <linux/mpage.h>

-STATIC void convert_page(struct inode *, struct page *,
-			xfs_iomap_t *, void *, int, int);
+STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
+STATIC void xfs_convert_page(struct inode *, struct page *,
+				xfs_iomap_t *, void *, int, int);
+
+#if defined(XFS_RW_TRACE)
+void
+xfs_page_trace(
+	int		tag,
+	struct inode	*inode,
+	struct page	*page,
+	int		mask)
+{
+	xfs_inode_t	*ip;
+	bhv_desc_t	*bdp;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	loff_t		isize = i_size_read(inode);
+	loff_t		offset = page->index << PAGE_CACHE_SHIFT;
+	int		delalloc = -1, unmapped = -1, unwritten = -1;
+
+	if (page_has_buffers(page))
+		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
+
+	bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
+	ip = XFS_BHVTOI(bdp);
+	if (!ip->i_rwtrace)
+		return;
+
+	ktrace_enter(ip->i_rwtrace,
+		(void *)((unsigned long)tag),
+		(void *)ip,
+		(void *)inode,
+		(void *)page,
+		(void *)((unsigned long)mask),
+		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
+		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(isize & 0xffffffff)),
+		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(offset & 0xffffffff)),
+		(void *)((unsigned long)delalloc),
+		(void *)((unsigned long)unmapped),
+		(void *)((unsigned long)unwritten),
+		(void *)NULL,
+		(void *)NULL);
+}
+#else
+#define xfs_page_trace(tag, inode, page, mask)
+#endif

 void
 linvfs_unwritten_done(
@@ -121,7 +167,7 @@ linvfs_unwritten_convert_direct(
 }

 STATIC int
-map_blocks(
+xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	ssize_t			count,
@@ -151,12 +197,11 @@ map_blocks(
 }

 /*
- * match_offset_to_mapping
 * Finds the corresponding mapping in block @map array of the
 * given @offset within a @page.
 */
 STATIC xfs_iomap_t *
-match_offset_to_mapping(
+xfs_offset_to_map(
 	struct page		*page,
 	xfs_iomap_t		*iomapp,
 	unsigned long		offset)
@@ -177,7 +222,7 @@ match_offset_to_mapping(
 }

 STATIC void
-map_buffer_at_offset(
+xfs_map_at_offset(
 	struct page		*page,
 	struct buffer_head	*bh,
 	unsigned long		offset,
@@ -218,7 +263,7 @@ map_buffer_at_offset(
 * in units of filesystem blocks.
 */
 STATIC struct page *
-probe_unwritten_page(
+xfs_probe_unwritten_page(
 	struct address_space	*mapping,
 	unsigned long		index,
 	xfs_iomap_t		*iomapp,
@@ -244,11 +289,11 @@ probe_unwritten_page(
 		do {
 			if (!buffer_unwritten(bh))
 				break;
-			if (!match_offset_to_mapping(page, iomapp, p_offset))
+			if (!xfs_offset_to_map(page, iomapp, p_offset))
 				break;
 			if (p_offset >= max_offset)
 				break;
-			map_buffer_at_offset(page, bh, p_offset, bbits, iomapp);
+			xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
 			set_buffer_unwritten_io(bh);
 			bh->b_private = pb;
 			p_offset += bh->b_size;
@@ -269,7 +314,7 @@ probe_unwritten_page(
 * yet - clustering for mmap write case.
 */
 STATIC unsigned int
-probe_unmapped_page(
+xfs_probe_unmapped_page(
 	struct address_space	*mapping,
 	unsigned long		index,
 	unsigned int		pg_offset)
@@ -305,7 +350,7 @@ probe_unmapped_page(
 }

 STATIC unsigned int
-probe_unmapped_cluster(
+xfs_probe_unmapped_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
 	struct buffer_head	*bh,
@@ -330,7 +375,7 @@ probe_unmapped_cluster(
 		/* Prune this back to avoid pathological behavior */
 		tloff = min(tlast, startpage->index + 64);
 		for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
-			len = probe_unmapped_page(mapping, tindex,
+			len = xfs_probe_unmapped_page(mapping, tindex,
 							PAGE_CACHE_SIZE);
 			if (!len)
 				return total;
@@ -338,7 +383,8 @@ probe_unmapped_cluster(
 		}
 		if (tindex == tlast &&
 		    (tloff = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-			total += probe_unmapped_page(mapping, tindex, tloff);
+			total += xfs_probe_unmapped_page(mapping,
+							tindex, tloff);
 		}
 	}
 	return total;
@@ -350,7 +396,7 @@ probe_unmapped_cluster(
 * reference count.
 */
 STATIC struct page *
-probe_delalloc_page(
+xfs_probe_delalloc_page(
 	struct inode		*inode,
 	unsigned long		index)
 {
@@ -386,7 +432,7 @@ probe_delalloc_page(
 }

 STATIC int
-map_unwritten(
+xfs_map_unwritten(
 	struct inode		*inode,
 	struct page		*start_page,
 	struct buffer_head	*head,
@@ -434,22 +480,16 @@ map_unwritten(
 	do {
 		if (!buffer_unwritten(bh))
 			break;
-		tmp = match_offset_to_mapping(start_page, iomapp, p_offset);
+		tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
 		if (!tmp)
 			break;
-		map_buffer_at_offset(start_page, bh, p_offset, block_bits, iomapp);
+		xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
 		set_buffer_unwritten_io(bh);
 		bh->b_private = pb;
 		p_offset += bh->b_size;
 		nblocks++;
 	} while ((bh = bh->b_this_page) != head);

-	if (unlikely(nblocks == 0)) {
-		printk("XFS: bad unwritten extent map: bh=0x%p, iomapp=0x%p\n",
-		       curr, iomapp);
-		BUG();
-	}
-
 	atomic_add(nblocks, &pb->pb_io_remaining);

 	/* If we reached the end of the page, map forwards in any
@@ -465,13 +505,15 @@ map_unwritten(
 		tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
 		tloff = min(tlast, tloff);
 		for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
-			page = probe_unwritten_page(mapping, tindex, iomapp, pb,
+			page = xfs_probe_unwritten_page(mapping,
+						tindex, iomapp, pb,
 						PAGE_CACHE_SIZE, &bs, bbits);
 			if (!page)
 				break;
 			nblocks += bs;
 			atomic_add(bs, &pb->pb_io_remaining);
-			convert_page(inode, page, iomapp, pb, startio, all_bh);
+			xfs_convert_page(inode, page, iomapp, pb,
+							startio, all_bh);
 			/* stop if converting the next page might add
 			 * enough blocks that the corresponding byte
 			 * count won't fit in our ulong page buf length */
@@ -481,12 +523,14 @@ map_unwritten(

 		if (tindex == tlast &&
 		    (tloff = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
-			page = probe_unwritten_page(mapping, tindex, iomapp, pb,
+			page = xfs_probe_unwritten_page(mapping,
+							tindex, iomapp, pb,
 							tloff, &bs, bbits);
 			if (page) {
 				nblocks += bs;
 				atomic_add(bs, &pb->pb_io_remaining);
-				convert_page(inode, page, iomapp, pb, startio, all_bh);
+				xfs_convert_page(inode, page, iomapp, pb,
+							startio, all_bh);
 				if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
 					goto enough;
 			}
@@ -513,7 +557,7 @@ map_unwritten(
 }

 STATIC void
-submit_page(
+xfs_submit_page(
 	struct page		*page,
 	struct buffer_head	*bh_arr[],
 	int			cnt)
@@ -549,7 +593,7 @@ submit_page(
 * that the page has no mapping at all.
 */
 STATIC void
-convert_page(
+xfs_convert_page(
 	struct inode		*inode,
 	struct page		*page,
 	xfs_iomap_t		*iomapp,
@@ -582,7 +626,7 @@ convert_page(
 			}
 			continue;
 		}
-		tmp = match_offset_to_mapping(page, mp, offset);
+		tmp = xfs_offset_to_map(page, mp, offset);
 		if (!tmp)
 			continue;
 		ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
@@ -594,10 +638,10 @@ convert_page(
 		 */
 		if (buffer_unwritten(bh) && !bh->b_end_io) {
 			ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
-			map_unwritten(inode, page, head, bh,
+			xfs_map_unwritten(inode, page, head, bh,
 					offset, bbits, tmp, startio, all_bh);
 		} else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
-			map_buffer_at_offset(page, bh, offset, bbits, tmp);
+			xfs_map_at_offset(page, bh, offset, bbits, tmp);
 			if (buffer_unwritten(bh)) {
 				set_buffer_unwritten_io(bh);
 				bh->b_private = private;
@@ -614,7 +658,7 @@ convert_page(
 	} while (i++, (bh = bh->b_this_page) != head);

 	if (startio) {
-		submit_page(page, bh_arr, index);
+		xfs_submit_page(page, bh_arr, index);
 	} else {
 		unlock_page(page);
 	}
@@ -625,7 +669,7 @@ convert_page(
 * by mp and following the start page.
 */
 STATIC void
-cluster_write(
+xfs_cluster_write(
 	struct inode		*inode,
 	unsigned long		tindex,
 	xfs_iomap_t		*iomapp,
@@ -637,10 +681,10 @@ cluster_write(

 	tlast = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
 	for (; tindex < tlast; tindex++) {
-		page = probe_delalloc_page(inode, tindex);
+		page = xfs_probe_delalloc_page(inode, tindex);
 		if (!page)
 			break;
-		convert_page(inode, page, iomapp, NULL, startio, all_bh);
+		xfs_convert_page(inode, page, iomapp, NULL, startio, all_bh);
 	}
 }

@@ -664,7 +708,7 @@ cluster_write(
 */

 STATIC int
-page_state_convert(
+xfs_page_state_convert(
 	struct inode	*inode,
 	struct page	*page,
 	int		startio,
@@ -707,7 +751,7 @@ page_state_convert(
 			continue;

 		if (iomp) {
-			iomp = match_offset_to_mapping(page, &iomap, p_offset);
+			iomp = xfs_offset_to_map(page, &iomap, p_offset);
 		}

 		/*
@@ -716,17 +760,17 @@ page_state_convert(
 		 */
 		if (buffer_unwritten(bh)) {
 			if (!iomp) {
-				err = map_blocks(inode, offset, len, &iomap,
+				err = xfs_map_blocks(inode, offset, len, &iomap,
 						BMAPI_READ|BMAPI_IGNSTATE);
 				if (err) {
 					goto error;
 				}
-				iomp = match_offset_to_mapping(page, &iomap,
+				iomp = xfs_offset_to_map(page, &iomap,
 								p_offset);
 			}
 			if (iomp && startio) {
 				if (!bh->b_end_io) {
-					err = map_unwritten(inode, page,
+					err = xfs_map_unwritten(inode, page,
 							head, bh, p_offset,
 							inode->i_blkbits, iomp,
 							startio, unmapped);
@@ -743,17 +787,17 @@ page_state_convert(
 		 */
 		} else if (buffer_delay(bh)) {
 			if (!iomp) {
-				err = map_blocks(inode, offset, len, &iomap,
-					BMAPI_ALLOCATE | flags);
+				err = xfs_map_blocks(inode, offset, len, &iomap,
+						BMAPI_ALLOCATE | flags);
 				if (err) {
 					goto error;
 				}
-				iomp = match_offset_to_mapping(page, &iomap,
+				iomp = xfs_offset_to_map(page, &iomap,
 								p_offset);
 			}
 			if (iomp) {
-				map_buffer_at_offset(page, bh, p_offset,
-					inode->i_blkbits, iomp);
+				xfs_map_at_offset(page, bh, p_offset,
+						inode->i_blkbits, iomp);
 				if (startio) {
 					bh_arr[cnt++] = bh;
 				} else {
@@ -775,19 +819,19 @@ page_state_convert(
 				 * need to write the whole page out.
 				 */
 				if (!iomp) {
-					size = probe_unmapped_cluster(
+					size = xfs_probe_unmapped_cluster(
 							inode, page, bh, head);
-					err = map_blocks(inode, offset,
-						size, &iomap,
-						BMAPI_WRITE | BMAPI_MMAP);
+					err = xfs_map_blocks(inode, offset,
+							size, &iomap,
+							BMAPI_WRITE|BMAPI_MMAP);
 					if (err) {
 						goto error;
 					}
-					iomp = match_offset_to_mapping(page, &iomap,
+					iomp = xfs_offset_to_map(page, &iomap,
 								     p_offset);
 				}
 				if (iomp) {
-					map_buffer_at_offset(page,
+					xfs_map_at_offset(page,
 							bh, p_offset,
 							inode->i_blkbits, iomp);
 					if (startio) {
@@ -814,10 +858,10 @@ page_state_convert(
 		SetPageUptodate(page);

 	if (startio)
-		submit_page(page, bh_arr, cnt);
+		xfs_submit_page(page, bh_arr, cnt);

 	if (iomp)
-		cluster_write(inode, page->index + 1, iomp, startio, unmapped);
+		xfs_cluster_write(inode, page->index + 1, iomp, startio, unmapped);

 	return page_dirty;

@@ -1031,7 +1075,7 @@ linvfs_readpages(
 }

 STATIC void
-count_page_state(
+xfs_count_page_state(
 	struct page		*page,
 	int			*delalloc,
 	int			*unmapped,
@@ -1085,18 +1129,21 @@ linvfs_writepage(
 	int			delalloc, unmapped, unwritten;
 	struct inode		*inode = page->mapping->host;

+	xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
+
 	/*
 	 * We need a transaction if:
 	 *  1. There are delalloc buffers on the page
-	 *  2. The page is upto date and we have unmapped buffers
-	 *  3. The page is upto date and we have no buffers
+	 *  2. The page is uptodate and we have unmapped buffers
+	 *  3. The page is uptodate and we have no buffers
 	 *  4. There are unwritten buffers on the page
 	 */
+
 	if (!page_has_buffers(page)) {
 		unmapped = 1;
 		need_trans = 1;
 	} else {
-		count_page_state(page, &delalloc, &unmapped, &unwritten);
+		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
 		if (!PageUptodate(page))
 			unmapped = 0;
 		need_trans = delalloc + unmapped + unwritten;
@@ -1122,7 +1169,7 @@ linvfs_writepage(
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
 	 */
-	error = page_state_convert(inode, page, 1, unmapped);
+	error = xfs_page_state_convert(inode, page, 1, unmapped);
 	if (error == -EAGAIN)
 		goto out_fail;
 	if (unlikely(error < 0))
@@ -1166,7 +1213,9 @@ linvfs_release_page(
 	struct inode		*inode = page->mapping->host;
 	int			dirty, delalloc, unmapped, unwritten;

-	count_page_state(page, &delalloc, &unmapped, &unwritten);
+	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
+
+	xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
 	if (!delalloc && !unwritten)
 		goto free_buffers;

@@ -1185,7 +1234,7 @@ linvfs_release_page(
 	 * Never need to allocate space here - we will always
 	 * come back to writepage in that case.
 	 */
-	dirty = page_state_convert(inode, page, 0, 0);
+	dirty = xfs_page_state_convert(inode, page, 0, 0);
 	if (dirty == 0 && !unwritten)
 		goto free_buffers;
 	return 0;

--- a/fs/xfs/linux/xfs_buf.c
+++ b/fs/xfs/linux/xfs_buf.c
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -59,17 +59,7 @@
 #include <linux/suspend.h>
 #include <linux/percpu.h>

-#include <support/ktrace.h>
-#include <support/debug.h>
-#include "kmem.h"
-
-#include "xfs_types.h"
-#include "xfs_cred.h"
-#include "xfs_lrw.h"
-#include "xfs_buf.h"
-
-#define BBSHIFT		9
-#define BN_ALIGN_MASK	((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
+#include "xfs_linux.h"

 #ifndef GFP_READAHEAD
 #define GFP_READAHEAD	(__GFP_NOWARN|__GFP_NORETRY)
@@ -85,60 +75,6 @@ STATIC void pagebuf_delwri_queue(page_buf_t *, int);
 STATIC struct workqueue_struct *pagebuf_logio_workqueue;
 STATIC struct workqueue_struct *pagebuf_dataio_workqueue;

-/*
- * Pagebuf module configuration parameters, exported via
- * /proc/sys/vm/pagebuf
- */
-
-typedef struct pb_sysctl_val {
-	int	min;
-	int	val;
-	int	max;
-} pb_sysctl_val_t;
-
-struct {
-	pb_sysctl_val_t	flush_interval;	/* interval between runs of the
-					 * delwri flush daemon.  */
-	pb_sysctl_val_t	age_buffer;	/* time for buffer to age before
-					 * we flush it.  */
-	pb_sysctl_val_t	stats_clear;	/* clear the pagebuf stats */
-	pb_sysctl_val_t	debug;		/* debug tracing on or off */
-} pb_params = {
-			  /*	MIN	DFLT	MAX	*/
-	.flush_interval	= {	HZ/2,	HZ,	30*HZ	},
-	.age_buffer	= {	1*HZ,	15*HZ,	300*HZ	},
-	.stats_clear	= {	0,	0,	1	},
-	.debug		= {	0,	0,	1	},
-};
-
-enum {
-	PB_FLUSH_INT = 1,
-	PB_FLUSH_AGE = 2,
-	PB_STATS_CLEAR = 3,
-	PB_DEBUG = 4,
-};
-
-/*
- * Pagebuf statistics variables
- */
-
-struct pbstats {
-	u_int32_t	pb_get;
-	u_int32_t	pb_create;
-	u_int32_t	pb_get_locked;
-	u_int32_t	pb_get_locked_waited;
-	u_int32_t	pb_busy_locked;
-	u_int32_t	pb_miss_locked;
-	u_int32_t	pb_page_retries;
-	u_int32_t	pb_page_found;
-	u_int32_t	pb_get_read;
-} pbstats;
-DEFINE_PER_CPU(struct pbstats, pbstats);
-
-/* We don't disable preempt, not too worried about poking the
- * wrong cpu's stat for now */
-#define PB_STATS_INC(count)	(__get_cpu_var(pbstats).count++)
-
 /*
 * Pagebuf debugging
 */
@@ -151,8 +87,6 @@ pagebuf_trace(
 	void		*data,
 	void		*ra)
 {
-	if (!pb_params.debug.val)
-		return;
 	ktrace_enter(pagebuf_trace_buf,
 		pb, id,
 		(void *)(unsigned long)pb->pb_flags,
@@ -326,7 +260,7 @@ _pagebuf_initialize(
 	atomic_set(&pb->pb_pin_count, 0);
 	init_waitqueue_head(&pb->pb_waiters);

-	PB_STATS_INC(pb_create);
+	XFS_STATS_INC(pb_create);
 	PB_TRACE(pb, "initialize", target);
 }

@@ -382,25 +316,13 @@ _pagebuf_freepages(
 *	pagebuf_free releases the specified buffer.  The modification
 *	state of any associated pages is left unchanged.
 */
-STATIC void
-__pagebuf_free(
+void
+pagebuf_free(
 	page_buf_t		*pb)
 {
-	pb_hash_t		*hash = pb_hash(pb);
-
 	PB_TRACE(pb, "free", 0);
-
-	spin_lock(&hash->pb_hash_lock);
-	/*
-	 * Someone grabbed a reference while we weren't looking,
-	 * try again later.
-	 */
-	if (unlikely(atomic_read(&pb->pb_hold))) {
-		spin_unlock(&hash->pb_hash_lock);
-		return;
-	} else if (!list_empty(&pb->pb_hash_list))
-		list_del_init(&pb->pb_hash_list);
-	spin_unlock(&hash->pb_hash_lock);
+	
+	ASSERT(list_empty(&pb->pb_hash_list));

 	/* release any virtual mapping */ ;
 	if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
@@ -429,17 +351,6 @@ __pagebuf_free(
 	pagebuf_deallocate(pb);
 }

-void
-pagebuf_free(
-	page_buf_t		*pb)
-{
-	if (unlikely(!atomic_dec_and_test(&pb->pb_hold))) {
-		printk(KERN_ERR "XFS: freeing inuse buffer!\n");
-		dump_stack();
-	} else
-		__pagebuf_free(pb);
-}
-
 /*
 *	_pagebuf_lookup_pages
 *
@@ -513,13 +424,13 @@ _pagebuf_lookup_pages(
 					       "possibly deadlocking in %s\n",
 					       __FUNCTION__);
 				}
-				PB_STATS_INC(pb_page_retries);
+				XFS_STATS_INC(pb_page_retries);
 				pagebuf_daemon_wakeup();
 				current->state = TASK_UNINTERRUPTIBLE;
 				schedule_timeout(10);
 				goto retry;
 			}
-			PB_STATS_INC(pb_page_found);
+			XFS_STATS_INC(pb_page_found);
 			mark_page_accessed(page);
 			pb->pb_pages[pi] = page;
 		} else {
@@ -565,6 +476,7 @@ _pagebuf_lookup_pages(
 		}
 	}

+	pb->pb_flags |= _PBF_PAGECACHE;
 mapit:
 	pb->pb_flags |= _PBF_MEM_ALLOCATED;
 	if (all_mapped) {
@@ -649,8 +561,7 @@ _pagebuf_find(				/* find buffer for block	*/

 		if (pb->pb_target == target &&
 		    pb->pb_file_offset == range_base &&
-		    pb->pb_buffer_length == range_length &&
-		    atomic_read(&pb->pb_hold)) {
+		    pb->pb_buffer_length == range_length) {
 			/* If we look at something bring it to the
 			 * front of the list for next time
 			 */
@@ -667,7 +578,7 @@ _pagebuf_find(				/* find buffer for block	*/
 		new_pb->pb_hash_index = hval;
 		list_add(&new_pb->pb_hash_list, &h->pb_hash);
 	} else {
-		PB_STATS_INC(pb_miss_locked);
+		XFS_STATS_INC(pb_miss_locked);
 	}

 	spin_unlock(&h->pb_hash_lock);
@@ -686,7 +597,7 @@ _pagebuf_find(				/* find buffer for block	*/
 			/* wait for buffer ownership */
 			PB_TRACE(pb, "get_lock", 0);
 			pagebuf_lock(pb);
-			PB_STATS_INC(pb_get_locked_waited);
+			XFS_STATS_INC(pb_get_locked_waited);
 		} else {
 			/* We asked for a trylock and failed, no need
 			 * to look at file offset and length here, we
@@ -696,7 +607,7 @@ _pagebuf_find(				/* find buffer for block	*/
 			 */

 			pagebuf_rele(pb);
-			PB_STATS_INC(pb_busy_locked);
+			XFS_STATS_INC(pb_busy_locked);
 			return (NULL);
 		}
 	} else {
@@ -711,7 +622,7 @@ _pagebuf_find(				/* find buffer for block	*/
 				_PBF_MEM_ALLOCATED | \
 				_PBF_MEM_SLAB;
 	PB_TRACE(pb, "got_lock", 0);
-	PB_STATS_INC(pb_get_locked);
+	XFS_STATS_INC(pb_get_locked);
 	return (pb);
 }

@@ -767,7 +678,7 @@ pagebuf_get(				/* allocate a buffer		*/
 			return (NULL);
 	}

-	PB_STATS_INC(pb_get);
+	XFS_STATS_INC(pb_get);

 	/* fill in any missing pages */
 	error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
@@ -787,7 +698,7 @@ pagebuf_get(				/* allocate a buffer		*/
 	if (flags & PBF_READ) {
 		if (PBF_NOT_DONE(pb)) {
 			PB_TRACE(pb, "get_read", (unsigned long)flags);
-			PB_STATS_INC(pb_get_read);
+			XFS_STATS_INC(pb_get_read);
 			pagebuf_iostart(pb, flags);
 		} else if (flags & PBF_ASYNC) {
 			PB_TRACE(pb, "get_read_async", (unsigned long)flags);
@@ -1007,16 +918,21 @@ void
 pagebuf_rele(
 	page_buf_t		*pb)
 {
+	pb_hash_t		*hash = pb_hash(pb);
+
 	PB_TRACE(pb, "rele", pb->pb_relse);

-	if (atomic_dec_and_test(&pb->pb_hold)) {
+	if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) {
 		int		do_free = 1;

 		if (pb->pb_relse) {
 			atomic_inc(&pb->pb_hold);
+			spin_unlock(&hash->pb_hash_lock);
 			(*(pb->pb_relse)) (pb);
+			spin_lock(&hash->pb_hash_lock);
 			do_free = 0;
 		}
+
 		if (pb->pb_flags & PBF_DELWRI) {
 			pb->pb_flags |= PBF_ASYNC;
 			atomic_inc(&pb->pb_hold);
@@ -1027,7 +943,11 @@ pagebuf_rele(
 		}

 		if (do_free) {
-			__pagebuf_free(pb);
+			list_del_init(&pb->pb_hash_list);
+			spin_unlock(&hash->pb_hash_lock);
+			pagebuf_free(pb);
+		} else {
+			spin_unlock(&hash->pb_hash_lock);
 		}
 	}
 }
@@ -1282,7 +1202,7 @@ pagebuf_iostart(			/* start I/O on a buffer	  */
 	page_buf_t		*pb,	/* buffer to start		  */
 	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
 					/* PBF_WRITE, PBF_DELWRI,	  */
-					/* PBF_SYNC, PBF_DONT_BLOCK	  */
+					/* PBF_DONT_BLOCK		  */
 {
 	int			status = 0;

@@ -1290,16 +1210,15 @@ pagebuf_iostart(			/* start I/O on a buffer	  */

 	if (flags & PBF_DELWRI) {
 		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
-		pb->pb_flags |= flags &
-				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+		pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
 		pagebuf_delwri_queue(pb, 1);
 		return status;
 	}

-	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \
-			PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
+			PBF_READ_AHEAD | PBF_RUN_QUEUES);
 	pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
-			PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+			PBF_READ_AHEAD | PBF_RUN_QUEUES);

 	BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL);

@@ -1655,7 +1574,7 @@ pagebuf_delwri_queue(
 	}

 	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
-	pb->pb_flushtime = jiffies + pb_params.age_buffer.val;
+	pb->pb_flushtime = jiffies + xfs_age_buffer;
 	spin_unlock(&pbd_delwrite_lock);

 	if (unlock)
@@ -1703,7 +1622,7 @@ pagebuf_daemon(
 	struct list_head	*curr, *next, tmp;

 	/*  Set up the thread  */
-	daemonize("pagebufd");
+	daemonize("xfsbufd");
 	current->flags |= PF_MEMALLOC;

 	pagebuf_daemon_task = current;
@@ -1717,7 +1636,7 @@ pagebuf_daemon(
 			refrigerator(PF_IOTHREAD);

 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(pb_params.flush_interval.val);
+		schedule_timeout(xfs_flush_interval);

 		spin_lock(&pbd_delwrite_lock);

@@ -1876,112 +1795,6 @@ pagebuf_daemon_stop(void)
 	destroy_workqueue(pagebuf_dataio_workqueue);
 }

-
-/*
- * Pagebuf sysctl interface
- */
-
-STATIC int
-pb_stats_clear_handler(
-	ctl_table		*ctl,
-	int			write,
-	struct file		*filp,
-	void			*buffer,
-	size_t			*lenp)
-{
-	int			c, ret;
-	int			*valp = ctl->data;
-
-	ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp);
-
-	if (!ret && write && *valp) {
-		printk("XFS Clearing pbstats\n");
-		for (c = 0; c < NR_CPUS; c++) {
-			if (!cpu_possible(c)) continue;
-				memset(&per_cpu(pbstats, c), 0,
-				       sizeof(struct pbstats));
-		}
-		pb_params.stats_clear.val = 0;
-	}
-
-	return ret;
-}
-
-STATIC struct ctl_table_header *pagebuf_table_header;
-
-STATIC ctl_table pagebuf_table[] = {
-	{PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&pb_params.flush_interval.min, &pb_params.flush_interval.max},
-
-	{PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL, 
-	&pb_params.age_buffer.min, &pb_params.age_buffer.max},
-
-	{PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val,
-	sizeof(int), 0644, NULL, &pb_stats_clear_handler,
-	&sysctl_intvec, NULL, 
-	&pb_params.stats_clear.min, &pb_params.stats_clear.max},
-
-#ifdef PAGEBUF_TRACE
-	{PB_DEBUG, "debug", &pb_params.debug.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL, 
-	&pb_params.debug.min, &pb_params.debug.max},
-#endif
-	{0}
-};
-
-STATIC ctl_table pagebuf_dir_table[] = {
-	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
-	{0}
-};
-
-STATIC ctl_table pagebuf_root_table[] = {
-	{CTL_VM, "vm",  NULL, 0, 0555, pagebuf_dir_table},
-	{0}
-};
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-pagebuf_readstats(
-	char			*buffer,
-	char			**start,
-	off_t			offset,
-	int			count,
-	int			*eof,
-	void			*data)
-{
-	int			c, i, len, val;
-
-	len = 0;
-	len += sprintf(buffer + len, "pagebuf");
-	for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) {
-		val = 0;
-		for (c = 0 ; c < NR_CPUS; c++) {
-			if (!cpu_possible(c)) continue;
-			val += *(((u_int32_t*)&per_cpu(pbstats, c) + i));
-		}
-		len += sprintf(buffer + len, " %u", val);
-	}
-	buffer[len++] = '\n';
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
-}
-#endif  /* CONFIG_PROC_FS */
-
 /*
 *	Initialization and Termination
 */
@@ -1991,14 +1804,6 @@ pagebuf_init(void)
 {
 	int			i;

-	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
-
-#ifdef CONFIG_PROC_FS
-	if (proc_mkdir("fs/pagebuf", 0))
-		create_proc_read_entry(
-			"fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
-#endif
-
 	pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0,
 			SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (pagebuf_cache == NULL) {
@@ -2036,10 +1841,4 @@ pagebuf_terminate(void)
 #endif

 	kmem_cache_destroy(pagebuf_cache);
-
-	unregister_sysctl_table(pagebuf_table_header);
-#ifdef  CONFIG_PROC_FS
-	remove_proc_entry("fs/pagebuf/stat", NULL);
-	remove_proc_entry("fs/pagebuf", NULL);
-#endif
 }
--- a/fs/xfs/linux/xfs_buf.h
+++ b/fs/xfs/linux/xfs_buf.h
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -76,7 +76,6 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
 	PBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
 	PBF_NONE = (1 << 5),    /* buffer not read at all                  */
 	PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
-	PBF_SYNC = (1 << 8),    /* force updates to disk                   */
 	PBF_STALE = (1 << 10),	/* buffer has been staled, do not find it  */
 	PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory  */
 	PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad  */
@@ -87,6 +86,7 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
 	PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread	   */

 	/* flags used only internally */
+	_PBF_PAGECACHE = (1 << 16),	/* backed by pagecache		   */
 	_PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped	   */
 	_PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated	   */
 	_PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated  */
@@ -260,7 +260,7 @@ extern int pagebuf_iostart(		/* start I/O on a buffer	*/
 		page_buf_t *,		/* buffer to start		*/
 		page_buf_flags_t);	/* PBF_LOCK, PBF_ASYNC,		*/
 					/* PBF_READ, PBF_WRITE,		*/
-					/* PBF_DELWRI, PBF_SYNC		*/
+					/* PBF_DELWRI			*/

 extern int pagebuf_iorequest(		/* start real I/O		*/
 		page_buf_t *);		/* buffer to convey to device	*/
@@ -355,7 +355,7 @@ extern void pagebuf_trace(

 #define XFS_BUF_BFLAGS(x)	((x)->pb_flags)
 #define XFS_BUF_ZEROFLAGS(x)	\
-	((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
+	((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI))

 #define XFS_BUF_STALE(x)	((x)->pb_flags |= XFS_B_STALE)
 #define XFS_BUF_UNSTALE(x)	((x)->pb_flags &= ~XFS_B_STALE)
@@ -558,7 +558,6 @@ static inline int	XFS_bwrite(page_buf_t *pb)
 	int	iowait = (pb->pb_flags & PBF_ASYNC) == 0;
 	int	error = 0;

-	pb->pb_flags |= PBF_SYNC;
 	if (!iowait)
 		pb->pb_flags |= PBF_RUN_QUEUES;


--- a/fs/xfs/linux/xfs_globals.c
+++ b/fs/xfs/linux/xfs_globals.c
@@ -61,6 +61,8 @@ xfs_param_t xfs_params = {
 	.inherit_sync	= {	0,	1,	1	},
 	.inherit_nodump	= {	0,	1,	1	},
 	.inherit_noatim = {	0,	1,	1	},
+	.flush_interval	= {	HZ/2,	HZ,	30*HZ	},
+	.age_buffer	= {	1*HZ,	15*HZ,	300*HZ	},
 };

 /*

--- a/fs/xfs/linux/xfs_ioctl.c
+++ b/fs/xfs/linux/xfs_ioctl.c
@@ -699,9 +699,7 @@ xfs_ioctl(

 		error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
 							NULL);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_GETBMAP:
@@ -733,9 +731,7 @@ xfs_ioctl(

 	case XFS_IOC_SWAPEXT: {
 		error = xfs_swapext((struct xfs_swapext *)arg);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_FSCOUNTS: {
@@ -763,6 +759,8 @@ xfs_ioctl(
 		/* input parameter is passed in resblks field of structure */
 		in = inout.resblks;
 		error = xfs_reserve_blocks(mp, &in, &inout);
+		if (error)
+			return -error;

 		if (copy_to_user((char *)arg, &inout, sizeof(inout)))
 			return -XFS_ERROR(EFAULT);
@@ -795,9 +793,7 @@ xfs_ioctl(
 			return -XFS_ERROR(EFAULT);

 		error = xfs_growfs_data(mp, &in);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_FSGROWFSLOG: {
@@ -810,9 +806,7 @@ xfs_ioctl(
 			return -XFS_ERROR(EFAULT);

 		error = xfs_growfs_log(mp, &in);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_FSGROWFSRT: {
@@ -825,9 +819,7 @@ xfs_ioctl(
 			return -XFS_ERROR(EFAULT);

 		error = xfs_growfs_rt(mp, &in);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_FREEZE:
@@ -842,6 +834,19 @@ xfs_ioctl(
 		xfs_fs_thaw(mp);
 		return 0;

+	case XFS_IOC_GOINGDOWN: {
+		__uint32_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(in, (__uint32_t *)arg))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_fs_goingdown(mp, in);
+		return -error;
+	}
+
 	case XFS_IOC_ERROR_INJECTION: {
 		xfs_error_injection_t in;

@@ -849,9 +854,7 @@ xfs_ioctl(
 			return -XFS_ERROR(EFAULT);

 		error = xfs_errortag_add(in.errtag, mp);
-		if (error)
-			return -error;
-		return 0;
+		return -error;
 	}

 	case XFS_IOC_ERROR_CLEARALL:

--- a/fs/xfs/linux/xfs_iops.c
+++ b/fs/xfs/linux/xfs_iops.c
@@ -541,7 +541,6 @@ linvfs_setattr(
 	if (error)
 		return(-error);	/* Positive error up from XFS */
 	if (ia_valid & ATTR_SIZE) {
-		i_size_write(inode, vattr.va_size);
 		error = vmtruncate(inode, attr->ia_size);
 	}

@@ -631,8 +630,7 @@ linvfs_listxattr(

 	if (!size)
 		xflags |= ATTR_KERNOVAL;
-	if (capable(CAP_SYS_ADMIN))
-		xflags |= ATTR_KERNFULLS;
+	xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;

 	error = attr_generic_list(vp, data, size, xflags, &result);
 	if (error < 0)

--- a/fs/xfs/linux/xfs_linux.h
+++ b/fs/xfs/linux/xfs_linux.h
@@ -138,6 +138,8 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define xfs_inherit_sync	xfs_params.inherit_sync.val
 #define xfs_inherit_nodump	xfs_params.inherit_nodump.val
 #define xfs_inherit_noatime	xfs_params.inherit_noatim.val
+#define xfs_flush_interval	xfs_params.flush_interval.val
+#define xfs_age_buffer		xfs_params.age_buffer.val

 #define current_cpu()		smp_processor_id()
 #define current_pid()		(current->pid)

--- a/fs/xfs/linux/xfs_lrw.c
+++ b/fs/xfs/linux/xfs_lrw.c
@@ -283,7 +283,6 @@ xfs_read(
 	ip = XFS_BHVTOI(bdp);
 	vp = BHV_TO_VNODE(bdp);
 	mp = ip->i_mount;
-	vn_trace_entry(vp, "xfs_read", (inst_t *)__return_address);

 	XFS_STATS_INC(xs_read_calls);

@@ -345,6 +344,8 @@ xfs_read(
 		}
 	}

+	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
+				iovp, segs, *offset, ioflags);
 	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

@@ -377,7 +378,6 @@ xfs_sendfile(
 	ip = XFS_BHVTOI(bdp);
 	vp = BHV_TO_VNODE(bdp);
 	mp = ip->i_mount;
-	vn_trace_entry(vp, "xfs_sendfile", (inst_t *)__return_address);

 	XFS_STATS_INC(xs_read_calls);

@@ -405,6 +405,8 @@ xfs_sendfile(
 			return -error;
 		}
 	}
+	xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
+				target, count, *offset, ioflags);
 	ret = generic_file_sendfile(filp, offset, count, actor, target);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

@@ -658,7 +660,6 @@ xfs_write(
 	XFS_STATS_INC(xs_write_calls);

 	vp = BHV_TO_VNODE(bdp);
-	vn_trace_entry(vp, "xfs_write", (inst_t *)__return_address);
 	xip = XFS_BHVTOI(bdp);

 	/* START copy & waste from filemap.c */
@@ -678,7 +679,7 @@ xfs_write(
 	if (size == 0)
 		return 0;

-	io = &(xip->i_iocore);
+	io = &xip->i_iocore;
 	mp = io->io_mount;

 	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
@@ -729,11 +730,12 @@ xfs_write(
 	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 	    !(ioflags & IO_INVIS) && !eventsent)) {
 		loff_t		savedsize = *offset;
+		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);

 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
 				      *offset, size,
-				      FILP_DELAY_FLAG(file), &locktype);
+				      dmflags, &locktype);
 		if (error) {
 			xfs_iunlock(xip, iolock);
 			return -error;

--- a/fs/xfs/linux/xfs_lrw.h
+++ b/fs/xfs/linux/xfs_lrw.h
@@ -45,9 +45,7 @@ struct xfs_iomap;
 /*
 * Defines for the trace mechanisms in xfs_lrw.c.
 */
-#define	XFS_RW_KTRACE_SIZE	64
-#define	XFS_STRAT_KTRACE_SIZE	64
-#define	XFS_STRAT_GTRACE_SIZE	512
+#define	XFS_RW_KTRACE_SIZE	128

 #define	XFS_READ_ENTER		1
 #define	XFS_WRITE_ENTER		2
@@ -69,6 +67,12 @@ struct xfs_iomap;
 #define	XFS_INVAL_CACHED	18
 #define	XFS_DIORD_ENTER		19
 #define	XFS_DIOWR_ENTER		20
+#define	XFS_SENDFILE_ENTER	21
+#define	XFS_WRITEPAGE_ENTER	22
+#define	XFS_RELEASEPAGE_ENTER	23
+#define	XFS_IOMAP_ALLOC_ENTER	24
+#define	XFS_IOMAP_ALLOC_MAP	25
+#define	XFS_IOMAP_UNWRITTEN	26
 extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
 			const struct iovec *, size_t, loff_t, int);
 extern void xfs_inval_cached_trace(struct xfs_iocore *,

--- a/fs/xfs/linux/xfs_stats.c
+++ b/fs/xfs/linux/xfs_stats.c
@@ -67,6 +67,7 @@ xfs_read_xfsstats(
 		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
 		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
 		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+		{ "buf",		XFSSTAT_END_BUF			},
 	};

 	/* Loop over all stats groups */

--- a/fs/xfs/linux/xfs_stats.h
+++ b/fs/xfs/linux/xfs_stats.h
@@ -122,6 +122,16 @@ struct xfsstats {
 	__uint32_t		vn_reclaim;	/* # times vn_reclaim called */
 	__uint32_t		vn_remove;	/* # times vn_remove called */
 	__uint32_t		vn_free;	/* # times vn_free called */
+#define XFSSTAT_END_BUF			(XFSSTAT_END_VNODE_OPS+9)
+	__uint32_t		pb_get;
+	__uint32_t		pb_create;
+	__uint32_t		pb_get_locked;
+	__uint32_t		pb_get_locked_waited;
+	__uint32_t		pb_busy_locked;
+	__uint32_t		pb_miss_locked;
+	__uint32_t		pb_page_retries;
+	__uint32_t		pb_page_found;
+	__uint32_t		pb_get_read;
 /* Extra precision counters */
 	__uint64_t		xs_xstrat_bytes;
 	__uint64_t		xs_write_bytes;

--- a/fs/xfs/linux/xfs_super.c
+++ b/fs/xfs/linux/xfs_super.c
@@ -453,7 +453,7 @@ syncd(void *arg)
 	vfs_t			*vfsp = (vfs_t *) arg;
 	int			error;

-	daemonize("xfs_syncd");
+	daemonize("xfssyncd");

 	vfsp->vfs_sync_task = current;
 	wmb();

--- a/fs/xfs/linux/xfs_super.h
+++ b/fs/xfs/linux/xfs_super.h
@@ -61,7 +61,7 @@
 #endif

 #ifdef CONFIG_XFS_SECURITY
-# define XFS_SECURITY_STRING	"security attrs, "
+# define XFS_SECURITY_STRING	"security attributes, "
 # define ENOSECURITY		0
 #else
 # define XFS_SECURITY_STRING

--- a/fs/xfs/linux/xfs_sysctl.c
+++ b/fs/xfs/linux/xfs_sysctl.c
@@ -117,6 +117,16 @@ STATIC ctl_table xfs_table[] = {
 	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
 	&sysctl_intvec, NULL,
 	&xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max},
+	
+	{XFS_FLUSH_INTERVAL, "flush_interval", &xfs_params.flush_interval.val,
+	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	&sysctl_intvec, NULL,
+	&xfs_params.flush_interval.min, &xfs_params.flush_interval.max},
+
+	{XFS_AGE_BUFFER, "age_buffer", &xfs_params.age_buffer.val,
+	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	&sysctl_intvec, NULL,
+	&xfs_params.age_buffer.min, &xfs_params.age_buffer.max},

 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS

--- a/fs/xfs/linux/xfs_sysctl.h
+++ b/fs/xfs/linux/xfs_sysctl.h
@@ -58,6 +58,10 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t inherit_sync;	/* Inherit the "sync" inode flag. */
 	xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
 	xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
+	xfs_sysctl_val_t flush_interval;/* interval between runs of the
+					 * delwri flush daemon.  */
+	xfs_sysctl_val_t age_buffer;	/* time for buffer to age before
+					 * we flush it.  */
 } xfs_param_t;

 /*
@@ -86,6 +90,8 @@ enum {
 	XFS_INHERIT_SYNC = 13,
 	XFS_INHERIT_NODUMP = 14,
 	XFS_INHERIT_NOATIME = 15,
+	XFS_FLUSH_INTERVAL = 16,
+	XFS_AGE_BUFFER = 17,
 };

 extern xfs_param_t	xfs_params;

--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -780,14 +780,8 @@ xfs_alloc_ag_vextent_near(
 	/*
 	 * Randomly don't execute the first algorithm.
 	 */
-	static int	seed;		/* randomizing seed value */
 	int		dofirst;	/* set to do first algorithm */
-	timespec_t	now;		/* current time */

-	if (!seed) {
-		nanotime(&now);
-		seed = (int)now.tv_sec ^ (int)now.tv_nsec;
-	}
 	dofirst = random() & 1;
 #endif
 	/*

--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -91,10 +91,14 @@ extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
 #define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
 #define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */
 #define ATTR_SYSTEM	0x0100	/* use attrs in system (pseudo) namespace */
+
 #define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 #define ATTR_KERNAMELS	0x4000	/* [kernel] list attr names (simple list) */
-#define ATTR_KERNFULLS	0x8000	/* [kernel] full attr list, ie. root+user */
+
+#define ATTR_KERNORMALS	0x0800	/* [kernel] normal attr list: user+secure */
+#define ATTR_KERNROOTLS	0x8000	/* [kernel] include root in the attr list */
+#define ATTR_KERNFULLS	(ATTR_KERNORMALS|ATTR_KERNROOTLS)

 /*
 * The maximum size (into the kernel or returned from the kernel) of an

--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -460,9 +460,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 				i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
 			attrnames_t	*namesp;

+			if (((context->flags & ATTR_SECURE) != 0) !=
+			    ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
+			    !(context->flags & ATTR_KERNORMALS)) {
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+				continue;
+			}
 			if (((context->flags & ATTR_ROOT) != 0) !=
 			    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
-			    !(context->flags & ATTR_KERNFULLS)) {
+			    !(context->flags & ATTR_KERNROOTLS)) {
 				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 				continue;
 			}
@@ -511,9 +517,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			kmem_free(sbuf, sbsize);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
+		if (((context->flags & ATTR_SECURE) != 0) !=
+		    ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
+		    !(context->flags & ATTR_KERNORMALS)) {
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+			continue;
+		}
 		if (((context->flags & ATTR_ROOT) != 0) !=
 		    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
-		    !(context->flags & ATTR_KERNFULLS)) {
+		    !(context->flags & ATTR_KERNROOTLS)) {
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 			continue;
 		}
@@ -2309,9 +2321,13 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)

 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;		/* skip incomplete entries */
+		if (((context->flags & ATTR_SECURE) != 0) !=
+		    ((entry->flags & XFS_ATTR_SECURE) != 0) &&
+		    !(context->flags & ATTR_KERNORMALS))
+			continue;		/* skip non-matching entries */
 		if (((context->flags & ATTR_ROOT) != 0) !=
 		    ((entry->flags & XFS_ATTR_ROOT) != 0) &&
-		    !(context->flags & ATTR_KERNFULLS))
+		    !(context->flags & ATTR_KERNROOTLS))
 			continue;		/* skip non-matching entries */

 		namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :

--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -57,10 +57,10 @@ struct xfs_mount_args {
 	int	flags;		/* flags -> see XFSMNT_... macros below */
 	int	logbufs;	/* Number of log buffers, -1 to default */
 	int	logbufsize;	/* Size of log buffers, -1 to default */
-	char	fsname[MAXNAMELEN];	/* data device name */
-	char	rtname[MAXNAMELEN];	/* realtime device filename */
-	char	logname[MAXNAMELEN];	/* journal device filename */
-	char	mtpt[MAXNAMELEN];	/* filesystem mount point */
+	char	fsname[MAXNAMELEN+1];	/* data device name */
+	char	rtname[MAXNAMELEN+1];	/* realtime device filename */
+	char	logname[MAXNAMELEN+1];	/* journal device filename */
+	char	mtpt[MAXNAMELEN+1];	/* filesystem mount point */
 	int	sunit;		/* stripe unit (BBs) */
 	int	swidth;		/* stripe width (BBs), multiple of sunit */
 	uchar_t iosizelog;	/* log2 of the preferred I/O size */

--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -165,6 +165,27 @@ typedef enum {

 #define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
 #define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
+#define DM_FLAGS_ISEM		0x004	/* thread holds i_sem */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,21)
+/* i_alloc_sem was added in 2.4.22-pre1 */
+#define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
+#define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
+#endif
+#endif
+
+/*
+ *	Based on IO_ISDIRECT, decide which semaphore-held flag to pass.
+ */
+#ifdef DM_FLAGS_IALLOCSEM_RD
+#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
+			      DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM)
+#define DM_SEM_FLAG_WR	(DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM)
+#else
+#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
+			      0 : DM_FLAGS_ISEM)
+#define DM_SEM_FLAG_WR	(DM_FLAGS_ISEM)
+#endif

 /*
 *	Macros to turn caller specified delay/block flags into

--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -437,6 +437,12 @@ typedef struct xfs_handle {

 #define FSHSIZE		sizeof(fsid_t)

+/*
+ * Flags for the XFS_IOC_GOINGDOWN ("going down") operation
+ */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT		0x0	/* going down */
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH		0x2	/* flush neither log nor data */

 /*
 * ioctl commands that replace IRIX fcntl()'s
@@ -490,6 +496,7 @@ typedef struct xfs_handle {
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
 #define XFS_IOC_FSGEOMETRY	     _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_GOINGDOWN	     _IOR ('X', 125, __uint32_t)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */



--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -626,3 +626,28 @@ xfs_fs_thaw(
 	xfs_finish_freeze(mp);
 	return 0;
 }
+
+int
+xfs_fs_goingdown(
+	xfs_mount_t	*mp,
+	__uint32_t	inflags)
+{
+	switch (inflags)
+	{
+	case XFS_FSOP_GOING_FLAGS_DEFAULT:
+		xfs_fs_freeze(mp);
+		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+		xfs_fs_thaw(mp);
+		break;
+	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
+		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+		break;
+	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
+		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
+		break;
+	default:
+		return XFS_ERROR(EINVAL);
+	}
+
+	return 0;
+}
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -67,4 +67,9 @@ int
 xfs_fs_thaw(
 	xfs_mount_t		*mp);

+int
+xfs_fs_goingdown(
+	xfs_mount_t		*mp,
+	__uint32_t		inflags);
+
 #endif	/* __XFS_FSOPS_H__ */
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -69,6 +69,76 @@
 #include "xfs_utils.h"
 #include "xfs_iomap.h"

+#if defined(XFS_RW_TRACE)
+void
+xfs_iomap_enter_trace(
+	int		tag,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	ssize_t		count)
+{
+	xfs_inode_t	*ip = XFS_IO_INODE(io);
+
+	if (!ip->i_rwtrace)
+		return;
+
+	ktrace_enter(ip->i_rwtrace,
+		(void *)((unsigned long)tag),
+		(void *)ip,
+		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
+		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(offset & 0xffffffff)),
+		(void *)((unsigned long)count),
+		(void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(io->io_new_size & 0xffffffff)),
+		(void *)NULL,
+		(void *)NULL,
+		(void *)NULL,
+		(void *)NULL,
+		(void *)NULL,
+		(void *)NULL,
+		(void *)NULL);
+}
+
+void
+xfs_iomap_map_trace(
+	int		tag,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	ssize_t		count,
+	xfs_iomap_t	*iomapp,
+	xfs_bmbt_irec_t	*imapp,
+	int		flags)
+{
+	xfs_inode_t	*ip = XFS_IO_INODE(io);
+
+	if (!ip->i_rwtrace)
+		return;
+
+	ktrace_enter(ip->i_rwtrace,
+		(void *)((unsigned long)tag),
+		(void *)ip,
+		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
+		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(offset & 0xffffffff)),
+		(void *)((unsigned long)count),
+		(void *)((unsigned long)flags),
+		(void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
+		(void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
+		(void *)((unsigned long)(iomapp->iomap_delta)),
+		(void *)((unsigned long)(iomapp->iomap_bsize)),
+		(void *)((unsigned long)(iomapp->iomap_bn)),
+		(void *)(__psint_t)(imapp->br_startoff),
+		(void *)((unsigned long)(imapp->br_blockcount)),
+		(void *)(__psint_t)(imapp->br_startblock));
+}
+#else
+#define xfs_iomap_enter_trace(tag, io, offset, count)
+#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
+#endif
+
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
 						<< mp->m_writeio_log)
 #define XFS_STRAT_WRITE_IMAPS	2
@@ -149,17 +219,20 @@ xfs_iomap(
 		(BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
 		 BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
 	case BMAPI_READ:
+		xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
 		lockmode = XFS_LCK_MAP_SHARED(mp, io);
 		bmapi_flags = XFS_BMAPI_ENTIRE;
 		if (flags & BMAPI_IGNSTATE)
 			bmapi_flags |= XFS_BMAPI_IGSTATE;
 		break;
 	case BMAPI_WRITE:
+		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
 		lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
 		bmapi_flags = 0;
 		XFS_ILOCK(mp, io, lockmode);
 		break;
 	case BMAPI_ALLOCATE:
+		xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count);
 		lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
 		bmapi_flags = XFS_BMAPI_ENTIRE;
 		/* Attempt non-blocking lock */
@@ -201,8 +274,11 @@ xfs_iomap(
 	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
 	case BMAPI_WRITE:
 		/* If we found an extent, return it */
-		if (nimaps && (imap.br_startblock != HOLESTARTBLOCK))
+		if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) {
+			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
+					offset, count, iomapp, &imap, flags);
 			break;
+		}

 		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
 			error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
@@ -211,6 +287,10 @@ xfs_iomap(
 			error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
 					flags, &imap, &nimaps);
 		}
+		if (!error) {
+			xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io,
+					offset, count, iomapp, &imap, flags);
+		}
 		iomap_flags = IOMAP_NEW;
 		break;
 	case BMAPI_ALLOCATE:
@@ -218,8 +298,11 @@ xfs_iomap(
 		XFS_IUNLOCK(mp, io, lockmode);
 		lockmode = 0;

-		if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock))
+		if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
+			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
+					offset, count, iomapp, &imap, flags);
 			break;
+		}

 		error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
 		break;
@@ -309,7 +392,6 @@ xfs_iomap_write_direct(
 	 * Make sure that the dquots are there. This doesn't hold
 	 * the ilock across a disk read.
 	 */
-
 	error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
 	if (error)
 		return XFS_ERROR(error);
@@ -540,8 +622,9 @@ xfs_iomap_write_delay(
 	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
 	 * then we must have run out of space.
 	 */
-
 	if (nimaps == 0) {
+		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
+					io, offset, count);
 		if (xfs_flush_space(ip, &fsynced, &ioflag))
 			return XFS_ERROR(ENOSPC);

@@ -584,7 +667,6 @@ xfs_iomap_write_allocate(
 	/*
 	 * Make sure that the dquots are there.
 	 */
-
 	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
 		return XFS_ERROR(error);

@@ -612,7 +694,6 @@ xfs_iomap_write_allocate(
 					XFS_WRITE_LOG_RES(mp),
 					0, XFS_TRANS_PERM_LOG_RES,
 					XFS_WRITE_LOG_COUNT);
-
 			if (error == ENOSPC) {
 				error = xfs_trans_reserve(tp, 0,
 						XFS_WRITE_LOG_RES(mp),
@@ -653,19 +734,16 @@ xfs_iomap_write_allocate(
 			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
 					XFS_BMAPI_WRITE, &first_block, 1,
 					imap, &nimaps, &free_list);
-
 			if (error)
 				goto trans_cancel;

 			error = xfs_bmap_finish(&tp, &free_list,
 					first_block, &committed);
-
 			if (error)
 				goto trans_cancel;

 			error = xfs_trans_commit(tp,
 					XFS_TRANS_RELEASE_LOG_RES, NULL);
-
 			if (error)
 				goto error0;

@@ -725,6 +803,9 @@ xfs_iomap_write_unwritten(
 	xfs_fsblock_t	firstfsb;
 	xfs_bmap_free_t	free_list;

+	xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
+				&ip->i_iocore, offset, count);
+
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	count_fsb = XFS_B_TO_FSB(mp, count);


--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -759,8 +759,9 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 	/* Also an invalid lsn.  1 implies that we aren't passing in a valid
 	 * tail_lsn.
 	 */
-	if (tail_lsn != 1)
+	if (tail_lsn != 1) {
 		log->l_tail_lsn = tail_lsn;
+	}

 	if ((tic = log->l_write_headq)) {
 #ifdef DEBUG
@@ -866,10 +867,11 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)

 	tail_lsn = xfs_trans_tail_ail(mp);
 	s = GRANT_LOCK(log);
-	if (tail_lsn != 0)
+	if (tail_lsn != 0) {
 		log->l_tail_lsn = tail_lsn;
-	else
+	} else {
 		tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
+	}
 	GRANT_UNLOCK(log, s);

 	return tail_lsn;
@@ -921,10 +923,8 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
 		 * In this case we just want to return the size of the
 		 * log as the amount of space left.
 		 */
-/* This assert does not take into account padding from striped log writes *
 		ASSERT((tail_cycle == (cycle + 1)) ||
 		       ((bytes + log->l_roundoff) >= tail_bytes));
-*/
 		free_bytes = log->l_logsize;
 	}
 	return free_bytes;
@@ -1183,14 +1183,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	log->l_grant_reserve_cycle = 1;
 	log->l_grant_write_cycle = 1;

-	if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
-		if (mp->m_sb.sb_logsunit <= 1) {
-			log->l_stripemask = 1;
-		} else {
-			log->l_stripemask = 1 <<
-				xfs_highbit32(mp->m_sb.sb_logsunit >> BBSHIFT);
-		}
-	}
 	if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
 		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
 		ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
@@ -1401,45 +1393,35 @@ xlog_sync(xlog_t		*log,
 	xfs_caddr_t	dptr;		/* pointer to byte sized element */
 	xfs_buf_t	*bp;
 	int		i, ops;
-	uint		roundup;
 	uint		count;		/* byte count of bwrite */
+	uint		count_init;	/* initial count before roundup */
 	int		split = 0;	/* split write into two regions */
 	int		error;

 	XFS_STATS_INC(xs_log_writes);
 	ASSERT(iclog->ic_refcnt == 0);

-	/* Round out the log write size */
-	if (iclog->ic_offset & BBMASK) {
-		/* count of 0 is already accounted for up in
-		 * xlog_state_sync_all().  Once in this routine,
-		 * operations on the iclog are single threaded.
-		 *
-		 * Difference between rounded up size and size
-		 */
-		count = iclog->ic_offset & BBMASK;
-		iclog->ic_roundoff += BBSIZE - count;
-	}
-	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
-		unsigned sunit = BTOBB(log->l_mp->m_sb.sb_logsunit);
-		if (!sunit)
-			sunit = 1;
+	/* Add for LR header */
+	count_init = log->l_iclog_hsize + iclog->ic_offset;

-		count = BTOBB(log->l_iclog_hsize + iclog->ic_offset);
-		if (count & (sunit - 1)) {
-			roundup = sunit - (count & (sunit - 1));
-		} else {
-			roundup = 0;
-		}
-		iclog->ic_offset += BBTOB(roundup);
+	/* Round out the log write size */
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
+	    log->l_mp->m_sb.sb_logsunit > 1) {
+		/* we have a v2 stripe unit to use */
+		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+	} else {
+		count = BBTOB(BTOBB(count_init));
 	}
-
+	iclog->ic_roundoff = count - count_init;
 	log->l_roundoff += iclog->ic_roundoff;

 	xlog_pack_data(log, iclog);       /* put cycle number in every block */

 	/* real byte length */
-	INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
+	INT_SET(iclog->ic_header.h_len, 
+		ARCH_CONVERT,
+		iclog->ic_offset + iclog->ic_roundoff);
+
 	/* put ops count in correct order */
 	ops = iclog->ic_header.h_num_logops;
 	INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
@@ -1449,12 +1431,6 @@ xlog_sync(xlog_t		*log,
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
 	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT));

-	/* Count is already rounded up to a BBSIZE above */
-	count = iclog->ic_offset + iclog->ic_roundoff;
-	ASSERT((count & BBMASK) == 0);
-
-	/* Add for LR header */
-	count += log->l_iclog_hsize;
 	XFS_STATS_ADD(xs_log_blocks, BTOBB(count));

 	/* Do we need to split this write into 2 parts? */
@@ -2783,8 +2759,6 @@ xlog_state_switch_iclogs(xlog_t		*log,
 			 xlog_in_core_t *iclog,
 			 int		eventual_size)
 {
-	uint roundup;
-
 	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
 	if (!eventual_size)
 		eventual_size = iclog->ic_offset;
@@ -2797,14 +2771,10 @@ xlog_state_switch_iclogs(xlog_t		*log,
 	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);

 	/* Round up to next log-sunit */
-	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
-		if (log->l_curr_block & (log->l_stripemask - 1)) {
-			roundup = log->l_stripemask -
-				(log->l_curr_block & (log->l_stripemask - 1));
-		} else {
-			roundup = 0;
-		}
-		log->l_curr_block += roundup;
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
+	    log->l_mp->m_sb.sb_logsunit > 1) {
+		__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
 	}

 	if (log->l_curr_block >= log->l_logBBsize) {

--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -63,6 +63,9 @@ int xlog_btolrbb(int b);
 #else
 #define XLOG_BTOLRBB(b)		(((b)+XLOG_RECORD_BSIZE-1) >> XLOG_RECORD_BSHIFT)
 #endif
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit) /* number of sb_logsunit-sized units needed to hold b (rounds up); divides by sb_logsunit, so it must be nonzero */
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) /* inverse scaling: su units times sb_logsunit */

 #define XLOG_HEADER_SIZE	512

@@ -531,7 +534,6 @@ typedef struct log {
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
 	struct xfs_buf_cancel	**l_buf_cancel_table;
-	int			l_stripemask;	/* log stripe mask */
 	int			l_iclog_hsize;  /* size of iclog header */
 	int			l_iclog_heads;  /* # of iclog header sectors */
 	uint			l_sectbb_log;   /* log2 of sector size in BBs */

--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3416,6 +3416,7 @@ xlog_unpack_data_checksum(
 {
 	uint			*up = (uint *)dp;
 	uint			chksum = 0;
+	int			i;

 	/* divide length by 4 to get # words */
 	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
@@ -3476,7 +3477,7 @@ xlog_valid_rec_header(
 	xlog_rec_header_t	*rhead,
 	xfs_daddr_t		blkno)
 {
-	int			bblks;
+	int			hlen;

 	if (unlikely(
 	    (INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
@@ -3495,8 +3496,8 @@ xlog_valid_rec_header(
 	}

 	/* LR body must have data or it wouldn't have been written */
-	bblks = INT_GET(rhead->h_len, ARCH_CONVERT);
-	if (unlikely( bblks <= 0 || bblks > INT_MAX )) {
+	hlen = INT_GET(rhead->h_len, ARCH_CONVERT);
+	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
 		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
 				XFS_ERRLEVEL_LOW, log->l_mp);
 		return XFS_ERROR(EFSCORRUPTED);
@@ -3658,7 +3659,7 @@ xlog_do_recovery_pass(
 				error = xlog_bread(log, 0, wrapped_hblks, hbp);
 				if (error)
 					goto bread_err2;
-				XFS_BUF_SET_PTR(hbp, bufaddr, hblks);
+				XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
 				if (!offset)
 					offset = xlog_align(log, 0,
 							wrapped_hblks, hbp);
@@ -3716,8 +3717,7 @@ xlog_do_recovery_pass(
 				if ((error = xlog_bread(log, wrapped_hblks,
 						bblks - split_bblks, dbp)))
 					goto bread_err2;
-				XFS_BUF_SET_PTR(dbp, bufaddr,
-						XLOG_BIG_RECORD_BSIZE);
+				XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
 				if (!offset)
 					offset = xlog_align(log, wrapped_hblks,
 						bblks - split_bblks, dbp);
@@ -4042,7 +4042,7 @@ xlog_recover_check_summary(
 				XFS_FSS_TO_BB(mp, 1), 0);
 		if (XFS_BUF_ISERROR(agibp)) {
 			xfs_ioerror_alert("xlog_recover_check_summary(agi)",
-					  log->l_mp, agibp, agidaddr);
+					  mp, agibp, agidaddr);
 		}
 		agip = XFS_BUF_TO_AGI(agibp);
 		ASSERT(XFS_AGI_MAGIC ==
@@ -4058,7 +4058,8 @@ xlog_recover_check_summary(

 	sbbp = xfs_getsb(mp, 0);
 #ifdef XFS_LOUD_RECOVERY
-	sbp = XFS_BUF_TO_SBP(sbbp);
+	sbp = &mp->m_sb;
+	xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, ARCH_CONVERT, XFS_SB_ALL_BITS);
 	cmn_err(CE_NOTE,
 		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
 		sbp->sb_icount, itotal);

--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -675,6 +675,7 @@ xfs_mountfs(
 				error = XFS_ERROR(EINVAL);
 				goto error1;
 			}
+			mp->m_dalign = mp->m_swidth = 0;
 		} else {
 			/*
 			 * Convert the stripe unit and width to FSBs.

--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -413,8 +413,9 @@ xfs_setattr(
 	} else {
 		if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
 		    !(flags & ATTR_DMI)) {
+			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
-				vap->va_size, 0, AT_DELAY_FLAG(flags), NULL);
+				vap->va_size, 0, dmflags, NULL);
 			if (code) {
 				lock_flags = 0;
 				goto error_return;