/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include <xfs.h>

/*
 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
 * which clears the setuid and setgid bits when a file is written.
 */
int
xfs_write_clear_setuid(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_trans_t	*tp;
	int		error;

	mp = ip->i_mount;
	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
	if ((error = xfs_trans_reserve(tp, 0,
				       XFS_WRITEID_LOG_RES(mp),
				       0, 0, 0))) {
		xfs_trans_cancel(tp, 0);
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	ip->i_d.di_mode &= ~ISUID;

	/*
	 * Note that we don't have to worry about mandatory
	 * file locking being disabled here because we only
	 * clear the ISGID bit if the Group execute bit is
	 * on, but if it was on then mandatory locking wouldn't
	 * have been enabled.
	 */
	if (ip->i_d.di_mode & (IEXEC >> 3)) {
		ip->i_d.di_mode &= ~ISGID;
	}
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp, 0, NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
}

/*
 * Force a shutdown of the filesystem instantly while keeping
 * the filesystem consistent. We don't do an unmount here; just shut down
 * the shop and make sure that absolutely nothing persistent happens to
 * this filesystem after this point.
 */
void
xfs_do_force_shutdown(
	bhv_desc_t	*bdp,
	int		flags,
	char		*fname,
	int		lnnum)
{
	int		logerror;
	xfs_mount_t	*mp;

	mp = XFS_BHVTOM(bdp);
	logerror = flags & XFS_LOG_IO_ERROR;

	if (!(flags & XFS_FORCE_UMOUNT)) {
		cmn_err(CE_NOTE,
	"xfs_force_shutdown(%s,0x%x) called from line %d of file %s. Return address = 0x%x",
			mp->m_fsname, flags, lnnum, fname, __return_address);
	}
	/*
	 * No need to duplicate efforts.
	 */
	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
		return;

	/*
	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
	 * queue up anybody new on the log reservations, and wakes up
	 * everybody who's sleeping on log reservations and tells
	 * them the bad news.
	 */
	if (xfs_log_force_umount(mp, logerror))
		return;

	if (flags & XFS_CORRUPT_INCORE) {
		cmn_err(CE_ALERT,
	"Corruption of in-memory data detected. Shutting down filesystem: %s",
			mp->m_fsname);
	} else if (!(flags & XFS_FORCE_UMOUNT)) {
		if (logerror) {
			cmn_err(CE_ALERT,
	"Log I/O Error Detected. Shutting down filesystem: %s",
				mp->m_fsname);
		} else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
			cmn_err(CE_ALERT,
	"I/O Error Detected. Shutting down filesystem: %s",
				mp->m_fsname);
		}
	}
	if (!(flags & XFS_FORCE_UMOUNT)) {
		cmn_err(CE_ALERT,
	"Please umount the filesystem, and rectify the problem(s)");
	}
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call biodone
 * so that the proper iodone callbacks get called.
 */
int
xfs_bioerror(
	xfs_buf_t	*bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 */
	xfs_buftrace("XFS IOERROR", bp);
	XFS_BUF_ERROR(bp, EIO);
	/*
	 * We're calling biodone, so delete the B_DONE flag. Either way
	 * we have to call the iodone callback, and calling biodone
	 * probably is the best way since it takes care of
	 * GRIO as well.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_UNDONE(bp);
	XFS_BUF_STALE(bp);

	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
	xfs_biodone(bp);

	return (EIO);
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the biodone call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
int
xfs_bioerror_relse(
	xfs_buf_t	*bp)
{
	int64_t		fl;

	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);

	xfs_buftrace("XFS IOERRELSE", bp);
	fl = XFS_BUF_BFLAGS(bp);
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_DONE(bp);
	XFS_BUF_STALE(bp);
	XFS_BUF_CLR_IODONE_FUNC(bp);
	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
	if (!(fl & XFS_B_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		XFS_BUF_ERROR(bp, EIO);
		XFS_BUF_V_IODONESEMA(bp);
	} else {
		xfs_buf_relse(bp);
	}
	return (EIO);
}
/*
 * Prints out an ALERT message about I/O error.
 */
void
xfs_ioerror_alert(
	char			*func,
	struct xfs_mount	*mp,
	xfs_buf_t		*bp,
	xfs_daddr_t		blkno)
{
	cmn_err(CE_ALERT,
 "I/O error in filesystem (\"%s\") meta-data dev 0x%x block 0x%llx\n"
 "	(\"%s\") error %d buf count %u",
		(!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
		XFS_BUF_TARGET_DEV(bp),
		(__uint64_t)blkno,
		func,
		XFS_BUF_GETERROR(bp),
		XFS_BUF_COUNT(bp));
}

/*
 * This isn't an absolute requirement, but it is
 * just a good idea to call xfs_read_buf instead of
 * directly doing a read_buf call. For one, we shouldn't
 * be doing this disk read if we are in SHUTDOWN state anyway,
 * so this stops that from happening. Secondly, this does all
 * the error checking stuff and the brelse if appropriate for
 * the caller, so the code can be a little leaner.
 */
int
xfs_read_buf(
	struct xfs_mount *mp,
	xfs_buftarg_t	*target,
	xfs_daddr_t	blkno,
	int		len,
	uint		flags,
	xfs_buf_t	**bpp)
{
	xfs_buf_t	*bp;
	int		error;

	if (flags)
		bp = xfs_buf_read_flags(target, blkno, len, flags);
	else
		bp = xfs_buf_read(target, blkno, len, flags);
	if (!bp)
		return XFS_ERROR(EIO);
	error = XFS_BUF_GETERROR(bp);
	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
		*bpp = bp;
	} else {
		*bpp = NULL;
		if (error) {
			xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
		} else {
			error = XFS_ERROR(EIO);
		}
		if (bp) {
			XFS_BUF_UNDONE(bp);
			XFS_BUF_UNDELAYWRITE(bp);
			XFS_BUF_STALE(bp);
			/*
			 * brelse clears B_ERROR and b_error
			 */
			xfs_buf_relse(bp);
		}
	}
	return (error);
}

/*
 * Wrapper around bwrite() so that we can trap
 * write errors, and act accordingly.
 */
int
xfs_bwrite(
	struct xfs_mount *mp,
	struct xfs_buf	*bp)
{
	int		error;

	/*
	 * XXXsup how does this work for quotas.
	 */
	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
	XFS_BUF_SET_FSPRIVATE3(bp, mp);
	XFS_BUF_WRITE(bp);

	if ((error = XFS_bwrite(bp))) {
		ASSERT(mp);
		/*
		 * Cannot put a buftrace here since if the buffer is not
		 * B_HOLD then we will brelse() the buffer before returning
		 * from bwrite and we could be tracing a buffer that has
		 * been reused.
		 */
		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
	}
	return (error);
}

/*
 * xfs_inval_cached_pages()
 * This routine is responsible for keeping direct I/O and buffered I/O
 * somewhat coherent. From here we make sure that we're at least
 * temporarily holding the inode I/O lock exclusively and then call
 * the page cache to flush and invalidate any cached pages. If there
 * are no cached pages this routine will be very quick.
 */
void
xfs_inval_cached_pages(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	xfs_off_t	offset,
	int		write,
	int		relock)
{
	xfs_mount_t	*mp;

	if (!VN_CACHED(vp)) {
		return;
	}

	mp = io->io_mount;

	/*
	 * We need to get the I/O lock exclusively in order
	 * to safely invalidate pages and mappings.
	 */
	if (relock) {
		XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
		XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
	}

	/* Writing beyond EOF creates a hole that must be zeroed */
	if (write && (offset > XFS_SIZE(mp, io))) {
		xfs_fsize_t	isize;

		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
		isize = XFS_SIZE(mp, io);
		if (offset > isize) {
			xfs_zero_eof(vp, io, offset, isize, offset);
		}
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
	}

	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
	if (relock) {
		XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
	}
}

spinlock_t	xfs_refcache_lock = SPIN_LOCK_UNLOCKED;
xfs_inode_t	**xfs_refcache;
int		xfs_refcache_size;
int		xfs_refcache_index;
int		xfs_refcache_busy;
int		xfs_refcache_count;

/*
 * Timer callback to mark SB dirty, make sure we keep purging refcache
 */
void
xfs_refcache_sbdirty(struct super_block *sb)
{
	sb->s_dirt = 1;
}

/*
 * Insert the given inode into the reference cache.
 */
void
xfs_refcache_insert(
	xfs_inode_t	*ip)
{
	vnode_t		*vp;
	xfs_inode_t	*release_ip;
	xfs_inode_t	**refcache;

	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE));

	/*
	 * If an unmount is busy blowing entries out of the cache,
	 * then don't bother.
	 */
	if (xfs_refcache_busy) {
		return;
	}

	/*
	 * If we tuned the refcache down to zero, don't do anything.
	 */
	if (!xfs_refcache_size) {
		return;
	}

	/*
	 * The inode is already in the refcache, so don't bother
	 * with it.
	 */
	if (ip->i_refcache != NULL) {
		return;
	}

	vp = XFS_ITOV(ip);
	/* ASSERT(vp->v_count > 0); */
	VN_HOLD(vp);

	/*
	 * We allocate the reference cache on use so that we don't
	 * waste the memory on systems not being used as NFS servers.
	 */
	if (xfs_refcache == NULL) {
		refcache = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
						       sizeof(xfs_inode_t *),
						       KM_SLEEP);
	} else {
		refcache = NULL;
	}

	spin_lock(&xfs_refcache_lock);

	/*
	 * If we allocated memory for the refcache above and it still
	 * needs it, then use the memory we allocated. Otherwise we'll
	 * free the memory below.
	 */
	if (refcache != NULL) {
		if (xfs_refcache == NULL) {
			xfs_refcache = refcache;
			refcache = NULL;
		}
	}

	/*
	 * If an unmount is busy clearing out the cache, don't add new
	 * entries to it.
	 */
	if (xfs_refcache_busy) {
		spin_unlock(&xfs_refcache_lock);
		VN_RELE(vp);
		/*
		 * If we allocated memory for the refcache above but someone
		 * else beat us to using it, then free the memory now.
		 */
		if (refcache != NULL) {
			kmem_free(refcache,
				  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
		}
		return;
	}
	release_ip = xfs_refcache[xfs_refcache_index];
	if (release_ip != NULL) {
		release_ip->i_refcache = NULL;
		xfs_refcache_count--;
		ASSERT(xfs_refcache_count >= 0);
	}
	xfs_refcache[xfs_refcache_index] = ip;
	ASSERT(ip->i_refcache == NULL);
	ip->i_refcache = &(xfs_refcache[xfs_refcache_index]);
	xfs_refcache_count++;
	ASSERT(xfs_refcache_count <= xfs_refcache_size);
	xfs_refcache_index++;
	if (xfs_refcache_index == xfs_refcache_size) {
		xfs_refcache_index = 0;
	}
	spin_unlock(&xfs_refcache_lock);

	/*
	 * Save the pointer to the inode to be released so that we can
	 * VN_RELE it once we've dropped our inode locks in xfs_rwunlock().
	 * The pointer may be NULL, but that's OK.
	 */
	ip->i_release = release_ip;

	/*
	 * If we allocated memory for the refcache above but someone
	 * else beat us to using it, then free the memory now.
	 */
	if (refcache != NULL) {
		kmem_free(refcache,
			  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
	}
	return;
}

/*
 * If the given inode is in the reference cache, purge its entry and
 * release the reference on the vnode.
 */
void
xfs_refcache_purge_ip(
	xfs_inode_t	*ip)
{
	vnode_t	*vp;
	int	error;

	/*
	 * If we're not pointing to our entry in the cache, then
	 * we must not be in the cache.
	 */
	if (ip->i_refcache == NULL) {
		return;
	}

	spin_lock(&xfs_refcache_lock);
	if (ip->i_refcache == NULL) {
		spin_unlock(&xfs_refcache_lock);
		return;
	}

	/*
	 * Clear both our pointer to the cache entry and its pointer
	 * back to us.
	 */
	ASSERT(*(ip->i_refcache) == ip);
	*(ip->i_refcache) = NULL;
	ip->i_refcache = NULL;
	xfs_refcache_count--;
	ASSERT(xfs_refcache_count >= 0);
	spin_unlock(&xfs_refcache_lock);

	vp = XFS_ITOV(ip);
	/* ASSERT(vp->v_count > 1); */
	VOP_RELEASE(vp, error);
	VN_RELE(vp);

	return;
}
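/*
 * Illustrative sketch only (hypothetical unlock path, not part of this
 * file): the deferred-release protocol around xfs_refcache_insert().
 * Because the insert runs with the inode I/O lock held, it does not drop
 * the reference on the inode it displaces; instead it parks that inode
 * in ip->i_release, and the comment in xfs_refcache_insert() expects
 * xfs_rwunlock() to release it after the locks are dropped, along the
 * lines of:
 *
 *	release_ip = ip->i_release;
 *	ip->i_release = NULL;
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 *	if (release_ip != NULL) {
 *		vp = XFS_ITOV(release_ip);
 *		VOP_RELEASE(vp, error);
 *		VN_RELE(vp);
 *	}
 */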
/*
 * This is called from the XFS unmount code to purge all entries for the
 * given mount from the cache. It uses the refcache busy counter to
 * make sure that new entries are not added to the cache as we purge them.
 */
void
xfs_refcache_purge_mp(
	xfs_mount_t	*mp)
{
	vnode_t		*vp;
	int		error, i;
	xfs_inode_t	*ip;

	if (xfs_refcache == NULL) {
		return;
	}

	spin_lock(&xfs_refcache_lock);
	/*
	 * Bumping the busy counter keeps new entries from being added
	 * to the cache. We use a counter since multiple unmounts could
	 * be in here simultaneously.
	 */
	xfs_refcache_busy++;

	for (i = 0; i < xfs_refcache_size; i++) {
		ip = xfs_refcache[i];
		if ((ip != NULL) && (ip->i_mount == mp)) {
			xfs_refcache[i] = NULL;
			ip->i_refcache = NULL;
			xfs_refcache_count--;
			ASSERT(xfs_refcache_count >= 0);
			spin_unlock(&xfs_refcache_lock);
			vp = XFS_ITOV(ip);
			VOP_RELEASE(vp, error);
			VN_RELE(vp);
			spin_lock(&xfs_refcache_lock);
		}
	}

	xfs_refcache_busy--;
	ASSERT(xfs_refcache_busy >= 0);

	spin_unlock(&xfs_refcache_lock);
}

/*
 * This is called from the XFS sync code to ensure that the refcache
 * is emptied out over time. We purge a small number of entries with
 * each call.
 */
void
xfs_refcache_purge_some(xfs_mount_t *mp)
{
	int		error, i;
	xfs_inode_t	*ip;
	int		iplist_index;
	xfs_inode_t	**iplist;
	int		purge_count;

	if ((xfs_refcache == NULL) || (xfs_refcache_count == 0)) {
		return;
	}

	iplist_index = 0;
	purge_count = xfs_params.refcache_purge;
	iplist = (xfs_inode_t **)kmem_zalloc(purge_count *
					     sizeof(xfs_inode_t *), KM_SLEEP);

	spin_lock(&xfs_refcache_lock);

	/*
	 * Store any inodes we find in the next several entries
	 * into the iplist array to be released after dropping
	 * the spinlock. We always start looking from the currently
	 * oldest place in the cache. We move the refcache index
	 * forward as we go so that we are sure to eventually clear
	 * out the entire cache when the system goes idle.
	 */
	for (i = 0; i < purge_count; i++) {
		ip = xfs_refcache[xfs_refcache_index];
		if (ip != NULL) {
			xfs_refcache[xfs_refcache_index] = NULL;
			ip->i_refcache = NULL;
			xfs_refcache_count--;
			ASSERT(xfs_refcache_count >= 0);
			iplist[iplist_index] = ip;
			iplist_index++;
		}
		xfs_refcache_index++;
		if (xfs_refcache_index == xfs_refcache_size) {
			xfs_refcache_index = 0;
		}
	}

	spin_unlock(&xfs_refcache_lock);

	/*
	 * If there are still entries in the refcache, set the timer to
	 * mark the SB dirty to make sure that we hit sync even if the
	 * filesystem is idle, so that we'll purge some more later.
	 */
	if (xfs_refcache_count) {
		del_timer_sync(&mp->m_sbdirty_timer);
		mp->m_sbdirty_timer.data =
			(unsigned long)LINVFS_GET_IP(XFS_ITOV(mp->m_rootip))->i_sb;
		mp->m_sbdirty_timer.expires = jiffies + 2*HZ;
		add_timer(&mp->m_sbdirty_timer);
	}

	/*
	 * Now drop the inodes we collected.
	 */
	for (i = 0; i < iplist_index; i++) {
		VOP_RELEASE(XFS_ITOV(iplist[i]), error);
		VN_RELE(XFS_ITOV(iplist[i]));
	}

	kmem_free(iplist, purge_count * sizeof(xfs_inode_t *));
}

/*
 * This is called when the refcache is dynamically resized
 * via a sysctl.
 *
 * If the new size is smaller than the old size, purge all
 * entries in slots greater than the new size, and move
 * the index if necessary.
 *
 * If the refcache hasn't even been allocated yet, or the
 * new size is larger than the old size, just set the value
 * of xfs_refcache_size.
 */
void
xfs_refcache_resize(int xfs_refcache_new_size)
{
	int		i;
	xfs_inode_t	*ip;
	int		iplist_index = 0;
	xfs_inode_t	**iplist;
	int		error;

	/*
	 * If the new size is smaller than the current size,
	 * purge entries to create a smaller cache, and
	 * reposition the index if necessary.
	 * Don't bother if there is no refcache yet.
	 */
	if (xfs_refcache && (xfs_refcache_new_size < xfs_refcache_size)) {
		iplist = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
				sizeof(xfs_inode_t *), KM_SLEEP);

		spin_lock(&xfs_refcache_lock);

		for (i = xfs_refcache_new_size; i < xfs_refcache_size; i++) {
			ip = xfs_refcache[i];
			if (ip != NULL) {
				xfs_refcache[i] = NULL;
				ip->i_refcache = NULL;
				xfs_refcache_count--;
				ASSERT(xfs_refcache_count >= 0);
				iplist[iplist_index] = ip;
				iplist_index++;
			}
		}

		xfs_refcache_size = xfs_refcache_new_size;

		/*
		 * Move the index to the beginning of the cache if it's now
		 * past the end.
		 */
		if (xfs_refcache_index >= xfs_refcache_new_size)
			xfs_refcache_index = 0;

		spin_unlock(&xfs_refcache_lock);

		/*
		 * Now drop the inodes we collected.
		 */
		for (i = 0; i < iplist_index; i++) {
			VOP_RELEASE(XFS_ITOV(iplist[i]), error);
			VN_RELE(XFS_ITOV(iplist[i]));
		}

		kmem_free(iplist, XFS_REFCACHE_SIZE_MAX *
			  sizeof(xfs_inode_t *));
	} else {
		spin_lock(&xfs_refcache_lock);
		xfs_refcache_size = xfs_refcache_new_size;
		spin_unlock(&xfs_refcache_lock);
	}
}
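/*
 * Illustrative sketch only (hypothetical sysctl handler, not part of
 * this file): per the comment above, xfs_refcache_resize() is driven
 * from a sysctl. Since the cache array itself is always allocated at
 * XFS_REFCACHE_SIZE_MAX entries, a handler would typically clamp the
 * administrator-supplied value to that bound before applying it:
 *
 *	if (new_size < 0)
 *		new_size = 0;
 *	if (new_size > XFS_REFCACHE_SIZE_MAX)
 *		new_size = XFS_REFCACHE_SIZE_MAX;
 *	xfs_refcache_resize(new_size);
 */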