balloc.c 45.2 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/*
 *  linux/fs/ext3/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/config.h>
15
#include <linux/time.h>
Linus Torvalds's avatar
Linus Torvalds committed
16 17 18 19 20
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
21
#include <linux/buffer_head.h>
Linus Torvalds's avatar
Linus Torvalds committed
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48

/*
 * balloc.c contains the blocks allocation and deallocation routines
 */

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
 * when a file system is mounted (see ext3_read_super).
 */


#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)

struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
					     unsigned int block_group,
					     struct buffer_head ** bh)
{
	unsigned long group_desc;
	unsigned long desc;
	struct ext3_group_desc * gdp;

49
	if (block_group >= EXT3_SB(sb)->s_groups_count) {
Linus Torvalds's avatar
Linus Torvalds committed
50 51 52
		ext3_error (sb, "ext3_get_group_desc",
			    "block_group >= groups_count - "
			    "block_group = %d, groups_count = %lu",
53
			    block_group, EXT3_SB(sb)->s_groups_count);
Linus Torvalds's avatar
Linus Torvalds committed
54 55 56

		return NULL;
	}
57
	smp_rmb();
58

Jan Kara's avatar
Jan Kara committed
59 60
	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
	desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
61
	if (!EXT3_SB(sb)->s_group_desc[group_desc]) {
Linus Torvalds's avatar
Linus Torvalds committed
62 63 64 65 66 67
		ext3_error (sb, "ext3_get_group_desc",
			    "Group descriptor not loaded - "
			    "block_group = %d, group_desc = %lu, desc = %lu",
			     block_group, group_desc, desc);
		return NULL;
	}
68

Linus Torvalds's avatar
Linus Torvalds committed
69
	gdp = (struct ext3_group_desc *) 
70
	      EXT3_SB(sb)->s_group_desc[group_desc]->b_data;
Linus Torvalds's avatar
Linus Torvalds committed
71
	if (bh)
72
		*bh = EXT3_SB(sb)->s_group_desc[group_desc];
Linus Torvalds's avatar
Linus Torvalds committed
73 74 75 76 77 78 79
	return gdp + desc;
}

/*
 * Read the bitmap for a given block_group, reading into the specified 
 * slot in the superblock's bitmap cache.
 *
80
 * Return buffer_head on success or NULL in case of failure.
Linus Torvalds's avatar
Linus Torvalds committed
81
 */
82 83
static struct buffer_head *
read_block_bitmap(struct super_block *sb, unsigned int block_group)
Linus Torvalds's avatar
Linus Torvalds committed
84
{
85
	struct ext3_group_desc * desc;
Linus Torvalds's avatar
Linus Torvalds committed
86
	struct buffer_head * bh = NULL;
87

88 89
	desc = ext3_get_group_desc (sb, block_group, NULL);
	if (!desc)
Linus Torvalds's avatar
Linus Torvalds committed
90
		goto error_out;
91 92
	bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
	if (!bh)
Linus Torvalds's avatar
Linus Torvalds committed
93 94
		ext3_error (sb, "read_block_bitmap",
			    "Cannot read block bitmap - "
95 96
			    "block_group = %d, block_bitmap = %u",
			    block_group, le32_to_cpu(desc->bg_block_bitmap));
Linus Torvalds's avatar
Linus Torvalds committed
97
error_out:
98
	return bh;
Linus Torvalds's avatar
Linus Torvalds committed
99
}
Mingming Cao's avatar
Mingming Cao committed
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
/*
 * The reservation window structure operations
 * --------------------------------------------
 * Operations include:
 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
 *
 * We use sorted double linked list for the per-filesystem reservation
 * window list. (like in vm_region).
 *
 * Initially, we keep those small operations in the abstract functions,
 * so later if we need a better searching tree than double linked-list,
 * we could easily switch to that without changing too much
 * code.
 */
#if 0
static void __rsv_window_dump(struct rb_root *root, int verbose,
			      const char *fn)
{
	struct rb_node *n;
119
	struct ext3_reserve_window_node *rsv, *prev;
Mingming Cao's avatar
Mingming Cao committed
120 121 122 123 124 125 126 127 128
	int bad;

restart:
	n = rb_first(root);
	bad = 0;
	prev = NULL;

	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
	while (n) {
129
		rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
Mingming Cao's avatar
Mingming Cao committed
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
		if (verbose)
			printk("reservation window 0x%p "
			       "start:  %d, end:  %d\n",
			       rsv, rsv->rsv_start, rsv->rsv_end);
		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
			printk("Bad reservation %p (start >= end)\n",
			       rsv);
			bad = 1;
		}
		if (prev && prev->rsv_end >= rsv->rsv_start) {
			printk("Bad reservation %p (prev->end >= start)\n",
			       rsv);
			bad = 1;
		}
		if (bad) {
			if (!verbose) {
				printk("Restarting reservation walk in verbose mode\n");
				verbose = 1;
				goto restart;
			}
		}
		n = rb_next(n);
		prev = rsv;
	}
	printk("Window map complete.\n");
	if (bad)
		BUG();
}
#define rsv_window_dump(root, verbose) \
	__rsv_window_dump((root), (verbose), __FUNCTION__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif

static int
165
goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
Mingming Cao's avatar
Mingming Cao committed
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
			unsigned int group, struct super_block * sb)
{
	unsigned long group_first_block, group_last_block;

	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
				group * EXT3_BLOCKS_PER_GROUP(sb);
	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;

	if ((rsv->_rsv_start > group_last_block) ||
	    (rsv->_rsv_end < group_first_block))
		return 0;
	if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
		|| (goal + group_first_block > rsv->_rsv_end)))
		return 0;
	return 1;
}

/*
 * Find the reserved window which includes the goal, or the previous one
 * if the goal is not in any window.
 * Returns NULL if there are no windows or if all windows start after the goal.
 */
188 189
static struct ext3_reserve_window_node *
search_reserve_window(struct rb_root *root, unsigned long goal)
Mingming Cao's avatar
Mingming Cao committed
190 191
{
	struct rb_node *n = root->rb_node;
192
	struct ext3_reserve_window_node *rsv;
Mingming Cao's avatar
Mingming Cao committed
193 194 195 196

	if (!n)
		return NULL;

197
	do {
198
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
Mingming Cao's avatar
Mingming Cao committed
199 200 201 202 203 204 205

		if (goal < rsv->rsv_start)
			n = n->rb_left;
		else if (goal > rsv->rsv_end)
			n = n->rb_right;
		else
			return rsv;
206
	} while (n);
Mingming Cao's avatar
Mingming Cao committed
207 208 209 210 211 212 213 214
	/*
	 * We've fallen off the end of the tree: the goal wasn't inside
	 * any particular node.  OK, the previous node must be to one
	 * side of the interval containing the goal.  If it's the RHS,
	 * we need to back up one.
	 */
	if (rsv->rsv_start > goal) {
		n = rb_prev(&rsv->rsv_node);
215
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
Mingming Cao's avatar
Mingming Cao committed
216 217 218 219
	}
	return rsv;
}

220 221
void ext3_rsv_window_add(struct super_block *sb,
		    struct ext3_reserve_window_node *rsv)
Mingming Cao's avatar
Mingming Cao committed
222 223 224 225 226 227 228
{
	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
	struct rb_node *node = &rsv->rsv_node;
	unsigned int start = rsv->rsv_start;

	struct rb_node ** p = &root->rb_node;
	struct rb_node * parent = NULL;
229
	struct ext3_reserve_window_node *this;
Mingming Cao's avatar
Mingming Cao committed
230 231 232 233

	while (*p)
	{
		parent = *p;
234
		this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
Mingming Cao's avatar
Mingming Cao committed
235 236 237 238 239 240 241 242 243 244 245 246 247 248

		if (start < this->rsv_start)
			p = &(*p)->rb_left;
		else if (start > this->rsv_end)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
}

static void rsv_window_remove(struct super_block *sb,
249
			      struct ext3_reserve_window_node *rsv)
Mingming Cao's avatar
Mingming Cao committed
250 251 252 253 254 255 256
{
	rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	atomic_set(&rsv->rsv_alloc_hit, 0);
	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}

257
static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
Mingming Cao's avatar
Mingming Cao committed
258 259 260 261 262 263 264 265
{
	/* a valid reservation end block could not be 0 */
	return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
}

void ext3_discard_reservation(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
266
	struct ext3_reserve_window_node *rsv = &ei->i_rsv_window;
Mingming Cao's avatar
Mingming Cao committed
267 268 269 270 271 272 273 274
	spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;

	if (!rsv_is_empty(&rsv->rsv_window)) {
		spin_lock(rsv_lock);
		rsv_window_remove(inode->i_sb, rsv);
		spin_unlock(rsv_lock);
	}
}
Linus Torvalds's avatar
Linus Torvalds committed
275 276

/* Free given blocks, update quota and i_blocks field */
277 278 279
void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
			 unsigned long block, unsigned long count,
			 int *pdquot_freed_blocks)
Linus Torvalds's avatar
Linus Torvalds committed
280
{
281
	struct buffer_head *bitmap_bh = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
282 283 284 285 286 287 288
	struct buffer_head *gd_bh;
	unsigned long block_group;
	unsigned long bit;
	unsigned long i;
	unsigned long overflow;
	struct ext3_group_desc * gdp;
	struct ext3_super_block * es;
289
	struct ext3_sb_info *sbi;
Linus Torvalds's avatar
Linus Torvalds committed
290 291
	int err = 0, ret;

292
	*pdquot_freed_blocks = 0;
293
	sbi = EXT3_SB(sb);
294
	es = EXT3_SB(sb)->s_es;
295 296 297
	if (block < le32_to_cpu(es->s_first_data_block) ||
	    block + count < block ||
	    block + count > le32_to_cpu(es->s_blocks_count)) {
Linus Torvalds's avatar
Linus Torvalds committed
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks not in datazone - "
			    "block = %lu, count = %lu", block, count);
		goto error_return;
	}

	ext3_debug ("freeing block %lu\n", block);

do_more:
	overflow = 0;
	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
		      EXT3_BLOCKS_PER_GROUP(sb);
	bit = (block - le32_to_cpu(es->s_first_data_block)) %
		      EXT3_BLOCKS_PER_GROUP(sb);
	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
		count -= overflow;
	}
320 321 322
	brelse(bitmap_bh);
	bitmap_bh = read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
Linus Torvalds's avatar
Linus Torvalds committed
323 324 325 326 327 328 329 330
		goto error_return;
	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
	if (!gdp)
		goto error_return;

	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
331
		      EXT3_SB(sb)->s_itb_per_group) ||
Linus Torvalds's avatar
Linus Torvalds committed
332
	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
333
		      EXT3_SB(sb)->s_itb_per_group))
Linus Torvalds's avatar
Linus Torvalds committed
334 335 336 337 338 339 340 341 342 343 344
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks in system zones - "
			    "Block = %lu, count = %lu",
			    block, count);

	/*
	 * We are about to start releasing blocks in the bitmap,
	 * so we need undo access.
	 */
	/* @@@ check errors */
	BUFFER_TRACE(bitmap_bh, "getting undo access");
345
	err = ext3_journal_get_undo_access(handle, bitmap_bh, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
346 347
	if (err)
		goto error_return;
348

Linus Torvalds's avatar
Linus Torvalds committed
349 350 351 352 353 354
	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
355
	err = ext3_journal_get_write_access(handle, gd_bh);
Linus Torvalds's avatar
Linus Torvalds committed
356 357 358
	if (err)
		goto error_return;

359
	jbd_lock_bh_state(bitmap_bh);
360

Linus Torvalds's avatar
Linus Torvalds committed
361 362 363 364 365
	for (i = 0; i < count; i++) {
		/*
		 * An HJ special.  This is expensive...
		 */
#ifdef CONFIG_JBD_DEBUG
366
		jbd_unlock_bh_state(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
367 368
		{
			struct buffer_head *debug_bh;
369
			debug_bh = sb_find_get_block(sb, block + i);
Linus Torvalds's avatar
Linus Torvalds committed
370 371 372 373 374 375 376 377 378
			if (debug_bh) {
				BUFFER_TRACE(debug_bh, "Deleted!");
				if (!bh2jh(bitmap_bh)->b_committed_data)
					BUFFER_TRACE(debug_bh,
						"No commited data in bitmap");
				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
				__brelse(debug_bh);
			}
		}
379
		jbd_lock_bh_state(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
380
#endif
381 382 383 384 385
		if (need_resched()) {
			jbd_unlock_bh_state(bitmap_bh);
			cond_resched();
			jbd_lock_bh_state(bitmap_bh);
		}
Linus Torvalds's avatar
Linus Torvalds committed
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
		/* @@@ This prevents newly-allocated data from being
		 * freed and then reallocated within the same
		 * transaction. 
		 * 
		 * Ideally we would want to allow that to happen, but to
		 * do so requires making journal_forget() capable of
		 * revoking the queued write of a data block, which
		 * implies blocking on the journal lock.  *forget()
		 * cannot block due to truncate races.
		 *
		 * Eventually we can fix this by making journal_forget()
		 * return a status indicating whether or not it was able
		 * to revoke the buffer.  On successful revoke, it is
		 * safe not to set the allocation bit in the committed
		 * bitmap, because we know that there is no outstanding
		 * activity on the buffer any more and so it is safe to
		 * reallocate it.  
		 */
404
		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
Linus Torvalds's avatar
Linus Torvalds committed
405 406
		J_ASSERT_BH(bitmap_bh,
				bh2jh(bitmap_bh)->b_committed_data != NULL);
407 408 409 410 411 412 413 414 415 416 417
		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
				bh2jh(bitmap_bh)->b_committed_data);

		/*
		 * We clear the bit in the bitmap after setting the committed
		 * data bit, because this is the reverse order to that which
		 * the allocator uses.
		 */
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
						bit + i, bitmap_bh->b_data)) {
418 419 420 421
			jbd_unlock_bh_state(bitmap_bh);
			ext3_error(sb, __FUNCTION__,
				"bit already cleared for block %lu", block + i);
			jbd_lock_bh_state(bitmap_bh);
422 423
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
424
			(*pdquot_freed_blocks)++;
425
		}
Linus Torvalds's avatar
Linus Torvalds committed
426
	}
427
	jbd_unlock_bh_state(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
428

429
	spin_lock(sb_bgl_lock(sbi, block_group));
430 431
	gdp->bg_free_blocks_count =
		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) +
432
			*pdquot_freed_blocks);
433 434
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
435

Linus Torvalds's avatar
Linus Torvalds committed
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext3_journal_dirty_metadata(handle, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext3_journal_dirty_metadata(handle, gd_bh);
	if (!err) err = ret;

	if (overflow && !err) {
		block += count;
		count = overflow;
		goto do_more;
	}
	sb->s_dirt = 1;
error_return:
452
	brelse(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
453
	ext3_std_error(sb, err);
454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469
	return;
}

/* Free given blocks, update quota and i_blocks field */
void ext3_free_blocks(handle_t *handle, struct inode *inode,
			unsigned long block, unsigned long count)
{
	struct super_block * sb;
	int dquot_freed_blocks;

	sb = inode->i_sb;
	if (!sb) {
		printk ("ext3_free_blocks: nonexistent device");
		return;
	}
	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
Linus Torvalds's avatar
Linus Torvalds committed
470 471 472 473 474
	if (dquot_freed_blocks)
		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
	return;
}

475 476
/*
 * For ext3 allocations, we must not reuse any blocks which are
Linus Torvalds's avatar
Linus Torvalds committed
477 478 479 480 481 482 483 484 485 486 487 488 489 490
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete. 
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
Mingming Cao's avatar
Mingming Cao committed
491
static int ext3_test_allocatable(int nr, struct buffer_head *bh)
Linus Torvalds's avatar
Linus Torvalds committed
492
{
493 494 495
	int ret;
	struct journal_head *jh = bh2jh(bh);

Linus Torvalds's avatar
Linus Torvalds committed
496 497
	if (ext3_test_bit(nr, bh->b_data))
		return 0;
498 499 500 501 502 503 504 505

	jbd_lock_bh_state(bh);
	if (!jh->b_committed_data)
		ret = 1;
	else
		ret = !ext3_test_bit(nr, jh->b_committed_data);
	jbd_unlock_bh_state(bh);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
506 507
}

Mingming Cao's avatar
Mingming Cao committed
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
static int
bitmap_search_next_usable_block(int start, struct buffer_head *bh,
					int maxblocks)
{
	int next;
	struct journal_head *jh = bh2jh(bh);

	/*
	 * The bitmap search --- search forward alternately through the actual
	 * bitmap and the last-committed copy until we find a bit free in
	 * both
	 */
	while (start < maxblocks) {
		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
		if (next >= maxblocks)
			return -1;
		if (ext3_test_allocatable(next, bh))
			return next;
		jbd_lock_bh_state(bh);
		if (jh->b_committed_data)
			start = ext3_find_next_zero_bit(jh->b_committed_data,
						 	maxblocks, next);
		jbd_unlock_bh_state(bh);
	}
	return -1;
}

Linus Torvalds's avatar
Linus Torvalds committed
535 536 537 538 539 540 541
/*
 * Find an allocatable block in a bitmap.  We honour both the bitmap and
 * its last-committed copy (if that exists), and perform the "most
 * appropriate allocation" algorithm of looking for a free block near
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
542 543
static int
find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
Linus Torvalds's avatar
Linus Torvalds committed
544 545 546
{
	int here, next;
	char *p, *r;
547

Linus Torvalds's avatar
Linus Torvalds committed
548 549 550 551 552 553 554 555 556 557
	if (start > 0) {
		/*
		 * The goal was occupied; search forward for a free 
		 * block within the next XX blocks.
		 *
		 * end_goal is more or less random, but it has to be
		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
		 * next 64-bit boundary is simple..
		 */
		int end_goal = (start + 63) & ~63;
Mingming Cao's avatar
Mingming Cao committed
558 559
		if (end_goal > maxblocks)
			end_goal = maxblocks;
Linus Torvalds's avatar
Linus Torvalds committed
560
		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
561
		if (here < end_goal && ext3_test_allocatable(here, bh))
Linus Torvalds's avatar
Linus Torvalds committed
562
			return here;
563
		ext3_debug("Bit not found near goal\n");
Linus Torvalds's avatar
Linus Torvalds committed
564
	}
565

Linus Torvalds's avatar
Linus Torvalds committed
566 567 568
	here = start;
	if (here < 0)
		here = 0;
569

570
	p = ((char *)bh->b_data) + (here >> 3);
Linus Torvalds's avatar
Linus Torvalds committed
571
	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
572
	next = (r - ((char *)bh->b_data)) << 3;
573

Mingming Cao's avatar
Mingming Cao committed
574
	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
Linus Torvalds's avatar
Linus Torvalds committed
575
		return next;
576

577 578 579 580 581
	/*
	 * The bitmap search --- search forward alternately through the actual
	 * bitmap and the last-committed copy until we find a bit free in
	 * both
	 */
Mingming Cao's avatar
Mingming Cao committed
582 583
	here = bitmap_search_next_usable_block(here, bh, maxblocks);
	return here;
Linus Torvalds's avatar
Linus Torvalds committed
584 585
}

586 587 588 589 590 591 592 593 594 595
/*
 * We think we can allocate this block in this bitmap.  Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
 * block since we saw that is was not marked in b_committed_data.  If it _was_
 * allocated and freed then clear the bit in the bitmap again and return
 * zero (failure).
 */
static inline int
claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
{
596 597 598
	struct journal_head *jh = bh2jh(bh);
	int ret;

599 600
	if (ext3_set_bit_atomic(lock, block, bh->b_data))
		return 0;
601 602
	jbd_lock_bh_state(bh);
	if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
603
		ext3_clear_bit_atomic(lock, block, bh->b_data);
604 605 606
		ret = 0;
	} else {
		ret = 1;
607
	}
608 609
	jbd_unlock_bh_state(bh);
	return ret;
610 611 612 613 614 615 616 617 618
}

/*
 * If we failed to allocate the desired block then we may end up crossing to a
 * new bitmap.  In that case we must release write access to the old one via
 * ext3_journal_release_buffer(), else we'll run out of credits.
 */
static int
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
619
	struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv)
620
{
Mingming Cao's avatar
Mingming Cao committed
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646
	int group_first_block, start, end;

	/* we do allocation within the reservation window if we have a window */
	if (my_rsv) {
		group_first_block =
			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
			group * EXT3_BLOCKS_PER_GROUP(sb);
		if (my_rsv->_rsv_start >= group_first_block)
			start = my_rsv->_rsv_start - group_first_block;
		else
			/* reservation window cross group boundary */
			start = 0;
		end = my_rsv->_rsv_end - group_first_block + 1;
		if (end > EXT3_BLOCKS_PER_GROUP(sb))
			/* reservation window crosses group boundary */
			end = EXT3_BLOCKS_PER_GROUP(sb);
		if ((start <= goal) && (goal < end))
			start = goal;
		else
			goal = -1;
	} else {
		if (goal > 0)
			start = goal;
		else
			start = 0;
		end = EXT3_BLOCKS_PER_GROUP(sb);
647
	}
648

Mingming Cao's avatar
Mingming Cao committed
649 650
	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));

651 652
repeat:
	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
Mingming Cao's avatar
Mingming Cao committed
653
		goal = find_next_usable_block(start, bitmap_bh, end);
654 655
		if (goal < 0)
			goto fail_access;
Mingming Cao's avatar
Mingming Cao committed
656 657 658 659 660 661 662 663 664
		if (!my_rsv) {
			int i;

			for (i = 0; i < 7 && goal > start &&
					ext3_test_allocatable(goal - 1,
								bitmap_bh);
					i++, goal--)
				;
		}
665
	}
Mingming Cao's avatar
Mingming Cao committed
666
	start = goal;
667

668
	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
669 670 671 672
		/*
		 * The block was allocated by another thread, or it was
		 * allocated and then freed by another thread
		 */
Mingming Cao's avatar
Mingming Cao committed
673
		start++;
674
		goal++;
Mingming Cao's avatar
Mingming Cao committed
675
		if (start >= end)
676
			goto fail_access;
677 678
		goto repeat;
	}
Mingming Cao's avatar
Mingming Cao committed
679 680 681 682
	return goal;
fail_access:
	return -1;
}
683

Mingming Cao's avatar
Mingming Cao committed
684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718
/**
 * 	find_next_reservable_window():
 *		find a reservable space within the given range.
 *		It does not allocate the reservation window for now:
 *		alloc_new_reservation() will do the work later.
 *
 * 	@search_head: the head of the searching list;
 *		This is not necessarily the list head of the whole filesystem
 *
 *		We have both head and start_block to assist the search
 *		for the reservable space. The list starts from head,
 *		but we will shift to the place where start_block is,
 *		then start from there, when looking for a reservable space.
 *
 * 	@size: the target new reservation window size
 *
 * 	@group_first_block: the first block we consider to start
 *			the real search from
 *
 * 	@last_block:
 *		the maximum block number that our goal reservable space
 *		could start from. This is normally the last block in this
 *		group. The search will end when we found the start of next
 *		possible reservable space is out of this boundary.
 *		This could handle the cross boundary reservation window
 *		request.
 *
 * 	basically we search from the given range, rather than the whole
 * 	reservation double linked list, (start_block, last_block)
 * 	to find a free region that is of my size and has not
 * 	been reserved.
 *
 *	on succeed, it returns the reservation window to be appended to.
 *	failed, return NULL.
 */
719 720
static struct ext3_reserve_window_node *find_next_reservable_window(
				struct ext3_reserve_window_node *search_head,
Mingming Cao's avatar
Mingming Cao committed
721 722 723 724
				unsigned long size, int *start_block,
				int last_block)
{
	struct rb_node *next;
725
	struct ext3_reserve_window_node *rsv, *prev;
Mingming Cao's avatar
Mingming Cao committed
726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
	int cur;

	/* TODO: make the start of the reservation window byte-aligned */
	/* cur = *start_block & ~7;*/
	cur = *start_block;
	rsv = search_head;
	if (!rsv)
		return NULL;

	while (1) {
		if (cur <= rsv->rsv_end)
			cur = rsv->rsv_end + 1;

		/* TODO?
		 * in the case we could not find a reservable space
		 * that is what is expected, during the re-search, we could
		 * remember what's the largest reservable space we could have
		 * and return that one.
		 *
		 * For now it will fail if we could not find the reservable
		 * space with expected-size (or more)...
		 */
		if (cur > last_block)
			return NULL;		/* fail */

		prev = rsv;
		next = rb_next(&rsv->rsv_node);
753
		rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node);
Mingming Cao's avatar
Mingming Cao committed
754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819

		/*
		 * Reached the last reservation, we can just append to the
		 * previous one.
		 */
		if (!next)
			break;

		if (cur + size <= rsv->rsv_start) {
			/*
			 * Found a reserveable space big enough.  We could
			 * have a reservation across the group boundary here
		 	 */
			break;
		}
	}
	/*
	 * we come here either :
	 * when we reach the end of the whole list,
	 * and there is empty reservable space after last entry in the list.
	 * append it to the end of the list.
	 *
	 * or we found one reservable space in the middle of the list,
	 * return the reservation window that we could append to.
	 * succeed.
	 */
	*start_block = cur;
	return prev;
}

/**
 * 	alloc_new_reservation()--allocate a new reservation window
 *
 *		To make a new reservation, we search part of the filesystem
 *		reservation list (the list that inside the group). We try to
 *		allocate a new reservation window near the allocation goal,
 *		or the beginning of the group, if there is no goal.
 *
 *		We first find a reservable space after the goal, then from
 *		there, we check the bitmap for the first free block after
 *		it. If there is no free block until the end of group, then the
 *		whole group is full, we failed. Otherwise, check if the free
 *		block is inside the expected reservable space, if so, we
 *		succeed.
 *		If the first free block is outside the reservable space, then
 *		start from the first free block, we search for next available
 *		space, and go on.
 *
 *	on succeed, a new reservation will be found and inserted into the list
 *	It contains at least one free block, and it does not overlap with other
 *	reservation windows.
 *
 *	failed: we failed to find a reservation window in this group
 *
 *	@rsv: the reservation
 *
 *	@goal: The goal (group-relative).  It is where the search for a
 *		free reservable space should start from.
 *		if we have a goal(goal >0 ), then start from there,
 *		no goal(goal = -1), we start from the first block
 *		of the group.
 *
 *	@sb: the super block
 *	@group: the group we are trying to allocate in
 *	@bitmap_bh: the block group block bitmap
 */
820
static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
Mingming Cao's avatar
Mingming Cao committed
821 822 823
		int goal, struct super_block *sb,
		unsigned int group, struct buffer_head *bitmap_bh)
{
824
	struct ext3_reserve_window_node *search_head;
Mingming Cao's avatar
Mingming Cao committed
825 826 827
	int group_first_block, group_end_block, start_block;
	int first_free_block;
	int reservable_space_start;
828
	struct ext3_reserve_window_node *prev_rsv;
Mingming Cao's avatar
Mingming Cao committed
829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948
	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
	unsigned long size;

	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
				group * EXT3_BLOCKS_PER_GROUP(sb);
	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;

	if (goal < 0)
		start_block = group_first_block;
	else
		start_block = goal + group_first_block;

	size = atomic_read(&my_rsv->rsv_goal_size);
	if (!rsv_is_empty(&my_rsv->rsv_window)) {
		/*
		 * if the old reservation is cross group boundary
		 * and if the goal is inside the old reservation window,
		 * we will come here when we just failed to allocate from
		 * the first part of the window. We still have another part
		 * that belongs to the next group. In this case, there is no
		 * point to discard our window and try to allocate a new one
		 * in this group(which will fail). we should
		 * keep the reservation window, just simply move on.
		 *
		 * Maybe we could shift the start block of the reservation
		 * window to the first block of next group.
		 */

		if ((my_rsv->rsv_start <= group_end_block) &&
				(my_rsv->rsv_end > group_end_block) &&
				(start_block >= my_rsv->rsv_start))
			return -1;

		if ((atomic_read(&my_rsv->rsv_alloc_hit) >
		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
			/*
			 * if we previously allocation hit ration is greater than half
			 * we double the size of reservation window next time
			 * otherwise keep the same
			 */
			size = size * 2;
			if (size > EXT3_MAX_RESERVE_BLOCKS)
				size = EXT3_MAX_RESERVE_BLOCKS;
			atomic_set(&my_rsv->rsv_goal_size, size);
		}
	}
	/*
	 * shift the search start to the window near the goal block
	 */
	search_head = search_reserve_window(fs_rsv_root, start_block);

	/*
	 * find_next_reservable_window() simply finds a reservable window
	 * inside the given range(start_block, group_end_block).
	 *
	 * To make sure the reservation window has a free bit inside it, we
	 * need to check the bitmap after we found a reservable window.
	 */
retry:
	prev_rsv = find_next_reservable_window(search_head, size,
						&start_block, group_end_block);
	if (prev_rsv == NULL)
		goto failed;
	reservable_space_start = start_block;
	/*
	 * On success, find_next_reservable_window() returns the
	 * reservation window where there is a reservable space after it.
	 * Before we reserve this reservable space, we need
	 * to make sure there is at least a free block inside this region.
	 *
	 * searching the first free bit on the block bitmap and copy of
	 * last committed bitmap alternatively, until we found a allocatable
	 * block. Search start from the start block of the reservable space
	 * we just found.
	 */
	first_free_block = bitmap_search_next_usable_block(
			reservable_space_start - group_first_block,
			bitmap_bh, group_end_block - group_first_block + 1);

	if (first_free_block < 0) {
		/*
		 * no free block left on the bitmap, no point
		 * to reserve the space. return failed.
		 */
		goto failed;
	}
	start_block = first_free_block + group_first_block;
	/*
	 * check if the first free block is within the
	 * free space we just found
	 */
	if ((start_block >= reservable_space_start) &&
	  (start_block < reservable_space_start + size))
		goto found_rsv_window;
	/*
	 * if the first free bit we found is out of the reservable space
	 * this means there is no free block on the reservable space
	 * we should continue search for next reservable space,
	 * start from where the free block is,
	 * we also shift the list head to where we stopped last time
	 */
	search_head = prev_rsv;
	goto retry;

found_rsv_window:
	/*
	 * great! the reservable space contains some free blocks.
	 * if the search returns that we should add the new
	 * window just next to where the old window, we don't
 	 * need to remove the old window first then add it to the
	 * same place, just update the new start and new end.
	 */
	if (my_rsv != prev_rsv)  {
		if (!rsv_is_empty(&my_rsv->rsv_window))
			rsv_window_remove(sb, my_rsv);
	}
	my_rsv->rsv_start = reservable_space_start;
	my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
	atomic_set(&my_rsv->rsv_alloc_hit, 0);
	if (my_rsv != prev_rsv)  {
949
		ext3_rsv_window_add(sb, my_rsv);
Mingming Cao's avatar
Mingming Cao committed
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986
	}
	return 0;		/* succeed */
failed:
	/*
	 * failed to find a new reservation window in the current
	 * group, remove the current(stale) reservation window
	 * if there is any
	 */
	if (!rsv_is_empty(&my_rsv->rsv_window))
		rsv_window_remove(sb, my_rsv);
	return -1;		/* failed */
}

/*
 * This is the main function used to allocate a new block and its reservation
 * window.
 *
 * Each time when a new block allocation is need, first try to allocate from
 * its own reservation.  If it does not have a reservation window, instead of
 * looking for a free bit on bitmap first, then look up the reservation list to
 * see if it is inside somebody else's reservation window, we try to allocate a
 * reservation window for it starting from the goal first. Then do the block
 * allocation within the reservation window.
 *
 * This will avoid keeping on searching the reservation list again and
 * again when someboday is looking for a free block (without
 * reservation), and there are lots of free blocks, but they are all
 * being reserved.
 *
 * We use a sorted double linked list for the per-filesystem reservation list.
 * The insert, remove and find a free space(non-reserved) operations for the
 * sorted double linked list should be fast.
 *
 */
static int
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
			unsigned int group, struct buffer_head *bitmap_bh,
987
			int goal, struct ext3_reserve_window_node * my_rsv,
Mingming Cao's avatar
Mingming Cao committed
988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
			int *errp)
{
	spinlock_t *rsv_lock;
	unsigned long group_first_block;
	int ret = 0;
	int fatal;
	int credits = 0;

	*errp = 0;

	/*
	 * Make sure we use undo access for the bitmap, because it is critical
	 * that we do the frozen_data COW on bitmap buffers in all cases even
	 * if the buffer is in BJ_Forget state in the committing transaction.
	 */
	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
	fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
1005 1006
	if (fatal) {
		*errp = fatal;
Mingming Cao's avatar
Mingming Cao committed
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045
		return -1;
	}

	/*
	 * we don't deal with reservation when
	 * filesystem is mounted without reservation
	 * or the file is not a regular file
	 * or last attempt to allocate a block with reservation turned on failed
	 */
	if (my_rsv == NULL ) {
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
		goto out;
	}
	rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
	/*
	 * goal is a group relative block number (if there is a goal)
	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
	 * first block is a filesystem wide block number
	 * first block is the block number of the first block in this group
	 */
	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
			group * EXT3_BLOCKS_PER_GROUP(sb);

	/*
	 * Basically we will allocate a new block from inode's reservation
	 * window.
	 *
	 * We need to allocate a new reservation window, if:
	 * a) inode does not have a reservation window; or
	 * b) last attempt to allocate a block from existing reservation
	 *    failed; or
	 * c) we come here with a goal and with a reservation window
	 *
	 * We do not need to allocate a new reservation window if we come here
	 * at the beginning with a goal and the goal is inside the window, or
	 * we don't have a goal but already have a reservation window.
	 * then we could go to allocate from the reservation window directly.
	 */
	while (1) {
1046
		struct ext3_reserve_window rsv_copy;
Mingming Cao's avatar
Mingming Cao committed
1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
		unsigned int seq;

		do {
			seq = read_seqbegin(&my_rsv->rsv_seqlock);
			rsv_copy._rsv_start = my_rsv->rsv_start;
			rsv_copy._rsv_end = my_rsv->rsv_end;
		} while (read_seqretry(&my_rsv->rsv_seqlock, seq));

		if (rsv_is_empty(&rsv_copy) || (ret < 0) ||
			!goal_in_my_reservation(&rsv_copy, goal, group, sb)) {
			spin_lock(rsv_lock);
			write_seqlock(&my_rsv->rsv_seqlock);
			ret = alloc_new_reservation(my_rsv, goal, sb,
							group, bitmap_bh);
			rsv_copy._rsv_start = my_rsv->rsv_start;
			rsv_copy._rsv_end = my_rsv->rsv_end;
			write_sequnlock(&my_rsv->rsv_seqlock);
			spin_unlock(rsv_lock);
			if (ret < 0)
				break;			/* failed */

			if (!goal_in_my_reservation(&rsv_copy, goal, group, sb))
				goal = -1;
		}
		if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
		    || (rsv_copy._rsv_end < group_first_block))
			BUG();
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
					   &rsv_copy);
		if (ret >= 0) {
			if (!read_seqretry(&my_rsv->rsv_seqlock, seq))
				atomic_inc(&my_rsv->rsv_alloc_hit);
			break;				/* succeed */
		}
	}
out:
	if (ret >= 0) {
		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
					"bitmap block");
		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
		if (fatal) {
			*errp = fatal;
			return -1;
		}
		return ret;
1092
	}
1093 1094 1095

	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
	ext3_journal_release_buffer(handle, bitmap_bh, credits);
Mingming Cao's avatar
Mingming Cao committed
1096
	return ret;
1097 1098
}

1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
{
	int free_blocks, root_blocks;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
		sbi->s_resuid != current->fsuid &&
		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
		return 0;
	}
	return 1;
}

/*
 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
 * for the current or commiting transaction to complete, and then
 * return TRUE.
 */
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
	if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}

Linus Torvalds's avatar
Linus Torvalds committed
1129 1130 1131 1132 1133 1134 1135 1136
/*
 * ext3_new_block uses a goal block to assist allocation.  If the goal is
 * free, or there is a free block within 32 blocks of the goal, that block
 * is allocated.  Otherwise a forward search is made for a free block; within 
 * each block group the search first looks for an entire free byte in the block
 * bitmap, and then for any free bit if that fails.
 * This function also updates quota and i_blocks field.
 */
Mingming Cao's avatar
Mingming Cao committed
1137 1138
int ext3_new_block(handle_t *handle, struct inode *inode,
			unsigned long goal, int *errp)
Linus Torvalds's avatar
Linus Torvalds committed
1139
{
Mingming Cao's avatar
Mingming Cao committed
1140 1141 1142 1143 1144 1145 1146
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gdp_bh;
	int group_no;
	int goal_group;
	int ret_block;
	int bgi;			/* blockgroup iteration index */
	int target_block;
Linus Torvalds's avatar
Linus Torvalds committed
1147
	int fatal = 0, err;
1148
	int performed_allocation = 0;
1149
	int free_blocks;
1150 1151 1152
	struct super_block *sb;
	struct ext3_group_desc *gdp;
	struct ext3_super_block *es;
1153
	struct ext3_sb_info *sbi;
1154 1155
	struct ext3_reserve_window_node *my_rsv = NULL;
	struct ext3_reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window;
Mingming Cao's avatar
Mingming Cao committed
1156
	unsigned short windowsz = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1157
#ifdef EXT3FS_DEBUG
1158
	static int goal_hits, goal_attempts;
Linus Torvalds's avatar
Linus Torvalds committed
1159
#endif
1160 1161
	unsigned long ngroups;

Linus Torvalds's avatar
Linus Torvalds committed
1162 1163 1164
	*errp = -ENOSPC;
	sb = inode->i_sb;
	if (!sb) {
1165
		printk("ext3_new_block: nonexistent device");
Linus Torvalds's avatar
Linus Torvalds committed
1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
		return 0;
	}

	/*
	 * Check quota for allocation of this block.
	 */
	if (DQUOT_ALLOC_BLOCK(inode, 1)) {
		*errp = -EDQUOT;
		return 0;
	}

1177
	sbi = EXT3_SB(sb);
1178
	es = EXT3_SB(sb)->s_es;
1179
	ext3_debug("goal=%lu.\n", goal);
Mingming Cao's avatar
Mingming Cao committed
1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
	/*
	 * Allocate a block from reservation only when
	 * filesystem is mounted with reservation(default,-o reservation), and
	 * it's a regular file, and
	 * the desired window size is greater than 0 (One could use ioctl
	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
	 * reservation on that particular file)
	 */
	windowsz = atomic_read(&rsv->rsv_goal_size);
	if (test_opt(sb, RESERVATION) &&
		S_ISREG(inode->i_mode) && (windowsz > 0))
		my_rsv = rsv;
1192
	if (!ext3_has_free_blocks(sbi)) {
1193
		*errp = -ENOSPC;
1194
		goto out;
1195 1196
	}

Linus Torvalds's avatar
Linus Torvalds committed
1197 1198 1199 1200 1201 1202
	/*
	 * First, test whether the goal block is free.
	 */
	if (goal < le32_to_cpu(es->s_first_data_block) ||
	    goal >= le32_to_cpu(es->s_blocks_count))
		goal = le32_to_cpu(es->s_first_data_block);
1203
	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
Linus Torvalds's avatar
Linus Torvalds committed
1204
			EXT3_BLOCKS_PER_GROUP(sb);
1205
	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
Linus Torvalds's avatar
Linus Torvalds committed
1206 1207 1208
	if (!gdp)
		goto io_error;

Mingming Cao's avatar
Mingming Cao committed
1209 1210
	goal_group = group_no;
retry:
1211 1212
	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
	if (free_blocks > 0) {
1213
		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
Linus Torvalds's avatar
Linus Torvalds committed
1214
				EXT3_BLOCKS_PER_GROUP(sb));
1215 1216
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
1217
			goto io_error;
Mingming Cao's avatar
Mingming Cao committed
1218 1219
		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
					bitmap_bh, ret_block, my_rsv, &fatal);
1220 1221
		if (fatal)
			goto out;
1222
		if (ret_block >= 0)
1223
			goto allocated;
Linus Torvalds's avatar
Linus Torvalds committed
1224
	}
1225

1226 1227 1228
	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();

Linus Torvalds's avatar
Linus Torvalds committed
1229 1230 1231 1232
	/*
	 * Now search the rest of the groups.  We assume that 
	 * i and gdp correctly point to the last group visited.
	 */
1233
	for (bgi = 0; bgi < ngroups; bgi++) {
1234
		group_no++;
1235
		if (group_no >= ngroups)
1236 1237
			group_no = 0;
		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
Linus Torvalds's avatar
Linus Torvalds committed
1238 1239 1240 1241
		if (!gdp) {
			*errp = -EIO;
			goto out;
		}
1242
		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
Mingming Cao's avatar
Mingming Cao committed
1243 1244 1245 1246 1247 1248
		/*
		 * skip this group if the number of
		 * free blocks is less than half of the reservation
		 * window size.
		 */
		if (free_blocks <= (windowsz/2))
1249 1250 1251 1252 1253 1254
			continue;

		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
Mingming Cao's avatar
Mingming Cao committed
1255 1256
		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
					bitmap_bh, -1, my_rsv, &fatal);
1257 1258 1259 1260 1261
		if (fatal)
			goto out;
		if (ret_block >= 0) 
			goto allocated;
	}
Mingming Cao's avatar
Mingming Cao committed
1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273
	/*
	 * We may end up a bogus ealier ENOSPC error due to
	 * filesystem is "full" of reservations, but
	 * there maybe indeed free blocks avaliable on disk
	 * In this case, we just forget about the reservations
	 * just do block allocation as without reservations.
	 */
	if (my_rsv) {
		my_rsv = NULL;
		group_no = goal_group;
		goto retry;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1274
	/* No space left on the device */
1275
	*errp = -ENOSPC;
1276
	goto out;
Linus Torvalds's avatar
Linus Torvalds committed
1277

1278
allocated:
Linus Torvalds's avatar
Linus Torvalds committed
1279

1280 1281
	ext3_debug("using block group %d(%d)\n",
			group_no, gdp->bg_free_blocks_count);
Linus Torvalds's avatar
Linus Torvalds committed
1282

1283 1284 1285 1286
	BUFFER_TRACE(gdp_bh, "get_write_access");
	fatal = ext3_journal_get_write_access(handle, gdp_bh);
	if (fatal)
		goto out;
Linus Torvalds's avatar
Linus Torvalds committed
1287

1288
	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
Linus Torvalds's avatar
Linus Torvalds committed
1289 1290
				+ le32_to_cpu(es->s_first_data_block);

1291 1292 1293
	if (target_block == le32_to_cpu(gdp->bg_block_bitmap) ||
	    target_block == le32_to_cpu(gdp->bg_inode_bitmap) ||
	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
1294
		      EXT3_SB(sb)->s_itb_per_group))
1295
		ext3_error(sb, "ext3_new_block",
Linus Torvalds's avatar
Linus Torvalds committed
1296
			    "Allocating block in system zone - "
1297
			    "block = %u", target_block);
Linus Torvalds's avatar
Linus Torvalds committed
1298

1299
	performed_allocation = 1;
Linus Torvalds's avatar
Linus Torvalds committed
1300 1301 1302 1303 1304 1305

#ifdef CONFIG_JBD_DEBUG
	{
		struct buffer_head *debug_bh;

		/* Record bitmap buffer state in the newly allocated block */
1306
		debug_bh = sb_find_get_block(sb, target_block);
Linus Torvalds's avatar
Linus Torvalds committed
1307 1308
		if (debug_bh) {
			BUFFER_TRACE(debug_bh, "state when allocated");
1309
			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
Linus Torvalds's avatar
Linus Torvalds committed
1310 1311 1312
			brelse(debug_bh);
		}
	}
1313
	jbd_lock_bh_state(bitmap_bh);
1314
	spin_lock(sb_bgl_lock(sbi, group_no));
1315 1316 1317 1318 1319 1320 1321
	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
		if (ext3_test_bit(ret_block,
				bh2jh(bitmap_bh)->b_committed_data)) {
			printk("%s: block was unexpectedly set in "
				"b_committed_data\n", __FUNCTION__);
		}
	}
1322
	ext3_debug("found bit %d\n", ret_block);
1323
	spin_unlock(sb_bgl_lock(sbi, group_no));
1324 1325
	jbd_unlock_bh_state(bitmap_bh);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
1326

1327 1328
	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
	ret_block = target_block;
Linus Torvalds's avatar
Linus Torvalds committed
1329

1330 1331
	if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
		ext3_error(sb, "ext3_new_block",
Linus Torvalds's avatar
Linus Torvalds committed
1332
			    "block(%d) >= blocks count(%d) - "
1333 1334
			    "block_group = %d, es == %p ", ret_block,
			le32_to_cpu(es->s_blocks_count), group_no, es);
Linus Torvalds's avatar
Linus Torvalds committed
1335 1336 1337 1338 1339 1340 1341 1342
		goto out;
	}

	/*
	 * It is up to the caller to add the new buffer to a journal
	 * list of some description.  We don't know in advance whether
	 * the caller wants to use it as metadata or data.
	 */
1343 1344
	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
			ret_block, goal_hits, goal_attempts);
Linus Torvalds's avatar
Linus Torvalds committed
1345

1346
	spin_lock(sb_bgl_lock(sbi, group_no));
Linus Torvalds's avatar
Linus Torvalds committed
1347 1348
	gdp->bg_free_blocks_count =
			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
1349 1350
	spin_unlock(sb_bgl_lock(sbi, group_no));
	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
Linus Torvalds's avatar
Linus Torvalds committed
1351

1352 1353 1354 1355 1356
	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
	err = ext3_journal_dirty_metadata(handle, gdp_bh);
	if (!fatal)
		fatal = err;

Linus Torvalds's avatar
Linus Torvalds committed
1357 1358 1359 1360 1361
	sb->s_dirt = 1;
	if (fatal)
		goto out;

	*errp = 0;
1362 1363
	brelse(bitmap_bh);
	return ret_block;
1364

Linus Torvalds's avatar
Linus Torvalds committed
1365 1366 1367 1368 1369 1370 1371
io_error:
	*errp = -EIO;
out:
	if (fatal) {
		*errp = fatal;
		ext3_std_error(sb, fatal);
	}
1372 1373 1374 1375 1376
	/*
	 * Undo the block allocation
	 */
	if (!performed_allocation)
		DQUOT_FREE_BLOCK(inode, 1);
1377
	brelse(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
1378 1379 1380
	return 0;
}

1381
unsigned long ext3_count_free_blocks(struct super_block *sb)
Linus Torvalds's avatar
Linus Torvalds committed
1382
{
1383 1384 1385
	unsigned long desc_count;
	struct ext3_group_desc *gdp;
	int i;
1386
	unsigned long ngroups;
Linus Torvalds's avatar
Linus Torvalds committed
1387
#ifdef EXT3FS_DEBUG
1388
	struct ext3_super_block *es;
1389
	unsigned long bitmap_count, x;
1390
	struct buffer_head *bitmap_bh = NULL;
1391

1392
	lock_super(sb);
1393
	es = EXT3_SB(sb)->s_es;
Linus Torvalds's avatar
Linus Torvalds committed
1394 1395 1396
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
1397
	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
1398
		gdp = ext3_get_group_desc(sb, i, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
1399 1400 1401
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1402 1403 1404
		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
Linus Torvalds's avatar
Linus Torvalds committed
1405
			continue;
1406

1407 1408
		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
		printk("group %d: stored = %d, counted = %lu\n",
Linus Torvalds's avatar
Linus Torvalds committed
1409 1410 1411
			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
1412
	brelse(bitmap_bh);
1413
	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
Linus Torvalds's avatar
Linus Torvalds committed
1414
	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
1415
	unlock_super(sb);
Linus Torvalds's avatar
Linus Torvalds committed
1416 1417
	return bitmap_count;
#else
1418
	desc_count = 0;
1419 1420 1421
	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();
	for (i = 0; i < ngroups; i++) {
1422 1423 1424 1425 1426 1427 1428
		gdp = ext3_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
	}

	return desc_count;
Linus Torvalds's avatar
Linus Torvalds committed
1429 1430 1431
#endif
}

1432
static inline int block_in_use(unsigned long block,
Linus Torvalds's avatar
Linus Torvalds committed
1433 1434 1435 1436
				struct super_block * sb,
				unsigned char * map)
{
	return ext3_test_bit ((block -
1437
		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
Linus Torvalds's avatar
Linus Torvalds committed
1438 1439 1440 1441 1442
			 EXT3_BLOCKS_PER_GROUP(sb), map);
}

static inline int test_root(int a, int b)
{
Jan Kara's avatar
Jan Kara committed
1443 1444 1445 1446 1447
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
Linus Torvalds's avatar
Linus Torvalds committed
1448 1449
}

Adrian Bunk's avatar
Adrian Bunk committed
1450
static int ext3_group_sparse(int group)
Linus Torvalds's avatar
Linus Torvalds committed
1451
{
Jan Kara's avatar
Jan Kara committed
1452 1453
	if (group <= 1)
		return 1;
Linus Torvalds's avatar
Linus Torvalds committed
1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494
	return (test_root(group, 3) || test_root(group, 5) ||
		test_root(group, 7));
}

/**
 *	ext3_bg_has_super - number of blocks used by the superblock in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the superblock (primary or backup)
 *	in this group.  Currently this will be only 0 or 1.
 */
int ext3_bg_has_super(struct super_block *sb, int group)
{
	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
	    !ext3_group_sparse(group))
		return 0;
	return 1;
}

/**
 *	ext3_bg_num_gdb - number of blocks used by the group table in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the group descriptor table
 *	(primary or backup) in this group.  In the future there may be a
 *	different number of descriptor blocks in each group.
 */
unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
{
	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
	    !ext3_group_sparse(group))
		return 0;
	return EXT3_SB(sb)->s_gdb_count;
}

#ifdef CONFIG_EXT3_CHECK
/* Called at mount-time, super-block is locked */
void ext3_check_blocks_bitmap (struct super_block * sb)
{
1495
	struct ext3_super_block *es;
Linus Torvalds's avatar
Linus Torvalds committed
1496 1497
	unsigned long desc_count, bitmap_count, x, j;
	unsigned long desc_blocks;
1498 1499
	struct buffer_head *bitmap_bh = NULL;
	struct ext3_group_desc *gdp;
Linus Torvalds's avatar
Linus Torvalds committed
1500 1501
	int i;

1502
	es = EXT3_SB(sb)->s_es;
Linus Torvalds's avatar
Linus Torvalds committed
1503 1504 1505
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
1506
	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
Linus Torvalds's avatar
Linus Torvalds committed
1507 1508 1509 1510
		gdp = ext3_get_group_desc (sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1511 1512 1513
		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
Linus Torvalds's avatar
Linus Torvalds committed
1514 1515
			continue;

1516 1517
		if (ext3_bg_has_super(sb, i) &&
				!ext3_test_bit(0, bitmap_bh->b_data))
Linus Torvalds's avatar
Linus Torvalds committed
1518 1519 1520 1521 1522
			ext3_error(sb, __FUNCTION__,
				   "Superblock in group %d is marked free", i);

		desc_blocks = ext3_bg_num_gdb(sb, i);
		for (j = 0; j < desc_blocks; j++)
1523
			if (!ext3_test_bit(j + 1, bitmap_bh->b_data))
Linus Torvalds's avatar
Linus Torvalds committed
1524 1525 1526 1527 1528
				ext3_error(sb, __FUNCTION__,
					   "Descriptor block #%ld in group "
					   "%d is marked free", j, i);

		if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
1529
						sb, bitmap_bh->b_data))
Linus Torvalds's avatar
Linus Torvalds committed
1530 1531 1532 1533 1534
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Block bitmap for group %d is marked free",
				    i);

		if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
1535
						sb, bitmap_bh->b_data))
Linus Torvalds's avatar
Linus Torvalds committed
1536 1537 1538 1539
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Inode bitmap for group %d is marked free",
				    i);

1540
		for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++)
Linus Torvalds's avatar
Linus Torvalds committed
1541
			if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
1542
							sb, bitmap_bh->b_data))
Linus Torvalds's avatar
Linus Torvalds committed
1543 1544 1545 1546
				ext3_error (sb, "ext3_check_blocks_bitmap",
					    "Block #%d of the inode table in "
					    "group %d is marked free", j, i);

1547
		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
Linus Torvalds's avatar
Linus Torvalds committed
1548 1549 1550 1551 1552 1553 1554
		if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
			ext3_error (sb, "ext3_check_blocks_bitmap",
				    "Wrong free blocks count for group %d, "
				    "stored = %d, counted = %lu", i,
				    le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
1555
	brelse(bitmap_bh);
Linus Torvalds's avatar
Linus Torvalds committed
1556 1557 1558 1559 1560 1561 1562 1563
	if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
		ext3_error (sb, "ext3_check_blocks_bitmap",
			"Wrong free blocks count in super block, "
			"stored = %lu, counted = %lu",
			(unsigned long)le32_to_cpu(es->s_free_blocks_count),
			bitmap_count);
}
#endif