/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

/* When flushed, dirty blocks are searched in neighborhoods of this size, and
flushed along with the original page. */

#define BUF_FLUSH_AREA		ut_min(BUF_READ_AHEAD_AREA,\
		buf_pool->curr_size / 16)

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
		/* out: TRUE if ok */

/************************************************************************
Inserts a modified block into the flush list. */

void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in: block which is modified */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* The flush list is kept sorted by oldest_modification in
	descending order: the new head may not be older than the
	current head. */
	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
		|| (ut_dulint_cmp(
			(UT_LIST_GET_FIRST(buf_pool->flush_list))
			->oldest_modification,
			block->oldest_modification) <= 0));

	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */

void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	buf_block_t*	prev_b;
	buf_block_t*	b;

#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */

	prev_b = NULL;
	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* Skip past all blocks whose oldest_modification is newer, so
	that the list stays sorted in descending lsn order. */
	while (b && (ut_dulint_cmp(b->oldest_modification,
					block->oldest_modification) > 0)) {
		prev_b = b;
		b = UT_LIST_GET_NEXT(flush_list, b);
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
	} else {
		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
								block);
	}

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED allowed. */

ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace immediately */
	buf_block_t*	block)	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE and in the LRU list */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	if (block->state != BUF_BLOCK_FILE_PAGE) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
"  InnoDB: Error: buffer block state %lu in the LRU list!\n",
			(ulong)block->state);
		ut_print_buf(stderr, block, sizeof(buf_block_t));

		return(FALSE);
	}

	/* The block is replaceable only if it is clean (no pending
	modification), not buffer-fixed, and not under I/O. */
	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
		|| (block->buf_fix_count != 0)
		|| (block->io_fix != 0)) {

		return(FALSE);
	}

	return(TRUE);
}

/************************************************************************
Returns TRUE if the block is modified and ready for flushing. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
				/* out: TRUE if can flush immediately */
	buf_block_t*	block,	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE */
	ulint		flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* The block must be dirty and not already under I/O. */
	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
						&& (block->io_fix == 0)) {
		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (block->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence
			not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}

/************************************************************************
Updates the flush system data structures when a write is completed. */

void
buf_flush_write_complete(
/*=====================*/
	buf_block_t*	block)	/* in: pointer to the block in question */
{
	ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	block->oldest_modification = ut_dulint_zero;

	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);

	ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));

	(buf_pool->n_flush[block->flush_type])--;

	if (block->flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(block);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
		buf_pool->n_flush[block->flush_type]); */

	if ((buf_pool->n_flush[block->flush_type] == 0)
		&& (buf_pool->init_flush[block->flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[block->flush_type]);
	}
}

/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	buf_block_t*	block;
	byte*		write_buf;
	ulint		len;
	ulint		len2;
	ulint		i;

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		os_aio_simulated_wake_handler_threads();

		return;
	}
239

osku's avatar
osku committed
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	for (i = 0; i < trx_doublewrite->first_free; i++) {

		block = trx_doublewrite->buf_block_arr[i];
256
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
osku's avatar
osku committed
257

258 259
		/* TODO: page_zip */

260 261 262
		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
				block->frame + (UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
263 264
			ut_print_timestamp(stderr);
			fprintf(stderr,
osku's avatar
osku committed
265 266 267
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the buffer pool\n"
"InnoDB: before posting to the doublewrite buffer.\n");
268
		}
osku's avatar
osku committed
269

marko's avatar
marko committed
270
		if (!block->check_index_page_at_flush) {
marko's avatar
marko committed
271 272
		} else if (page_is_comp(block->frame)) {
			if (UNIV_UNLIKELY(!page_simple_validate_new(
marko's avatar
marko committed
273 274
						block->frame))) {
corrupted_page:
275
				buf_page_print(block->frame, 0);
osku's avatar
osku committed
276

marko's avatar
marko committed
277 278
				ut_print_timestamp(stderr);
				fprintf(stderr,
osku's avatar
osku committed
279 280 281 282
	"  InnoDB: Apparent corruption of an index page n:o %lu in space %lu\n"
	"InnoDB: to be written to data file. We intentionally crash server\n"
	"InnoDB: to prevent corrupt data from ending up in data\n"
	"InnoDB: files.\n",
marko's avatar
marko committed
283 284
					(ulong) block->offset,
					(ulong) block->space);
osku's avatar
osku committed
285

marko's avatar
marko committed
286 287
				ut_error;
			}
marko's avatar
marko committed
288 289 290 291
		} else if (UNIV_UNLIKELY(!page_simple_validate_old(
						block->frame))) {

			goto corrupted_page;
osku's avatar
osku committed
292 293 294
		}
	}

295 296 297 298
	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written+= trx_doublewrite->first_free;
	srv_dblwr_writes++;

osku's avatar
osku committed
299 300 301 302 303
	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	} else {
		len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
	}
304

osku's avatar
osku committed
305 306 307
	fil_io(OS_FILE_WRITE,
		TRUE, TRX_SYS_SPACE,
		trx_doublewrite->block1, 0, len,
308 309
			(void*)trx_doublewrite->write_buf, NULL);

osku's avatar
osku committed
310 311
	write_buf = trx_doublewrite->write_buf;

312
	/* TODO: page_zip */
313
	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
314 315 316
		if (UNIV_UNLIKELY(memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			write_buf + len2 + (UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
osku's avatar
osku committed
317 318 319 320 321 322 323 324 325 326
			ut_print_timestamp(stderr);
			fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = (trx_doublewrite->first_free
			- TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;
327

osku's avatar
osku committed
328 329 330
		fil_io(OS_FILE_WRITE,
			TRUE, TRX_SYS_SPACE,
			trx_doublewrite->block2, 0, len,
331 332
			(void*)(trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
osku's avatar
osku committed
333 334 335
			NULL);

		write_buf = trx_doublewrite->write_buf
336
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
337
		/* TODO: page_zip */
osku's avatar
osku committed
338 339
		for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
						len2 += UNIV_PAGE_SIZE) {
340 341 342 343 344 345
			if (UNIV_UNLIKELY(memcmp(write_buf + len2
					+ (FIL_PAGE_LSN + 4),
					write_buf + len2
					+ (UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
					4))) {
osku's avatar
osku committed
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
				ut_print_timestamp(stderr);
				fprintf(stderr,
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the doublewrite block2.\n");
			}
		}
	}

	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		block = trx_doublewrite->buf_block_arr[i];
364
		/* TODO: page_zip */
365 366 367
		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
				block->frame + (UNIV_PAGE_SIZE
				- FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
368 369
			ut_print_timestamp(stderr);
			fprintf(stderr,
osku's avatar
osku committed
370 371 372 373
"  InnoDB: ERROR: The page to be written seems corrupt!\n"
"InnoDB: The lsn fields do not match! Noticed in the buffer pool\n"
"InnoDB: after posting and flushing the doublewrite buffer.\n"
"InnoDB: Page buf fix count %lu, io fix %lu, state %lu\n",
374 375 376 377
				(ulong)block->buf_fix_count,
				(ulong)block->io_fix,
				(ulong)block->state);
		}
osku's avatar
osku committed
378 379 380 381
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
382
					(void*)block->frame, (void*)block);
osku's avatar
osku committed
383
	}
384

osku's avatar
osku committed
385 386 387 388 389 390
	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
391 392
	the OS */

osku's avatar
osku committed
393 394 395 396 397 398 399 400 401 402
	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

403
	mutex_exit(&(trx_doublewrite->mutex));
osku's avatar
osku committed
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
}

/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
	/* Acquire the doublewrite mutex; if the buffer is already full,
	flush it out and retry until there is room for this page. */
	for (;;) {
		mutex_enter(&(trx_doublewrite->mutex));

		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (trx_doublewrite->first_free
				< 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {

			break;
		}

		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();
	}

	/* Copy the page image into the next free doublewrite slot and
	remember which block it came from. */
	ut_memcpy(trx_doublewrite->write_buf
				+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
			block->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

	trx_doublewrite->first_free++;

	/* If this page filled the buffer, flush it immediately. */
	if (trx_doublewrite->first_free
				>= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Initializes a page for writing to the tablespace. */

void
buf_flush_init_for_writing(
/*=======================*/
456 457
	byte*	page,		/* in/out: page */
	void*	page_zip_,	/* in/out: compressed page, or NULL */
osku's avatar
osku committed
458 459 460
	dulint	newest_lsn,	/* in: newest modification lsn to the page */
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
461
{
462
	page_zip_des_t*	page_zip = page_zip_;
marko's avatar
marko committed
463
	ulint		zip_size = fil_space_get_zip_size(space);
464

465 466
	if (zip_size && zip_size != ULINT_UNDEFINED) {
		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
467 468 469
		case FIL_PAGE_TYPE_ZBLOB:
			ut_ad(!page_zip);
			mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
470
			mach_write_to_4(page + FIL_PAGE_ZBLOB_SPACE_ID, space);
471 472 473
			mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
			mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
marko's avatar
marko committed
474
					? page_zip_calc_checksum(
marko's avatar
marko committed
475
							page, zip_size)
476 477
					: BUF_NO_CHECKSUM_MAGIC);
			return;
478 479 480
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
481 482
		case FIL_PAGE_TYPE_XDES:
			/* This is essentially an uncompressed page. */
483
			break;
484 485
		case FIL_PAGE_INDEX:
			ut_a(zip_size == page_zip->size);
marko's avatar
marko committed
486 487 488 489 490
			mach_write_to_4(page
					+ FIL_PAGE_OFFSET, page_no);
			mach_write_to_4(page
					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
					space);
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_OFFSET, page_no);
			mach_write_to_8(page_zip->data
					+ FIL_PAGE_LSN, newest_lsn);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
					space);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
						page_zip->data, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		default:
			ut_error;
508 509 510
		}
	}

osku's avatar
osku committed
511 512 513 514 515 516 517 518
	/* Write the newest modification lsn to the page header and trailer */
	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
								newest_lsn);
	/* Write the page number and the space id */

	mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
519
	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space);
osku's avatar
osku committed
520 521 522 523 524

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums ?
525
		buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC);
osku's avatar
osku committed
526 527 528 529 530 531 532 533

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
					srv_use_checksums ?
534
		buf_calc_page_old_checksum(page) : BUF_NO_CHECKSUM_MAGIC);
osku's avatar
osku committed
535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567
}

/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

#ifdef UNIV_IBUF_DEBUG
	ut_a(ibuf_count_get(block->space, block->offset) == 0);
#endif
	ut_ad(!ut_dulint_is_zero(block->newest_modification));

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs(
	"Warning: cannot force log to disk if UNIV_LOG_DEBUG is defined!\n"
	"Crash recovery will not work!\n",
			stderr);
	}
#else
	/* Force the log to the disk before writing the modified block
	(write-ahead logging invariant). */
	log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	buf_flush_init_for_writing(block->frame,
			buf_block_get_page_zip(block),
			block->newest_modification,
			block->space, block->offset);
	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
					(void*)block->frame, (void*)block);
	} else {
		buf_flush_post_to_doublewrite_buf(block);
	}
}

/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
				/* out: 1 if a page was flushed, 0 otherwise */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or
				BUF_FLUSH_SINGLE_PAGE */
{
	buf_block_t*	block;
	ibool		locked;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
				|| flush_type == BUF_FLUSH_SINGLE_PAGE);

	mutex_enter(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

	if (flush_type == BUF_FLUSH_LIST
		&& block && buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		locked = FALSE;

		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		if (block->buf_fix_count == 0) {
			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

			locked = TRUE;
		}

		mutex_exit(&(buf_pool->mutex));

		if (!locked) {
			/* Flush buffered writes first so that the simulated
			aio thread cannot be stuck behind them while we wait
			for the latch. */
			buf_flush_buffered_writes();

			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
		}

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
				"Flushing page space %lu, page no %lu \n",
				(ulong) block->space, (ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_LRU && block
			&& buf_flush_ready_for_flush(block, flush_type)) {

		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because in the if-condition above we require
		the page not to be bufferfixed (in function
		..._ready_for_flush). */

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(&(buf_pool->mutex));

		buf_flush_write_block_low(block);

		return(1);

	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
			&& buf_flush_ready_for_flush(block, flush_type)) {

		block->io_fix = BUF_IO_WRITE;

		/* If AWE is enabled and the page is not mapped to a frame,
		then map it */

		if (block->frame == NULL) {
			ut_a(srv_use_awe);

			/* We set second parameter TRUE because the block is
			in the LRU list and we must put it to
			awe_LRU_free_mapped list once mapped to a frame */

			buf_awe_map_page_to_frame(block, TRUE);
		}

		block->flush_type = flush_type;

		if (buf_pool->n_flush[block->flush_type] == 0) {

			os_event_reset(buf_pool->no_flush[block->flush_type]);
		}

		(buf_pool->n_flush[flush_type])++;

		mutex_exit(&(buf_pool->mutex));

		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);

#ifdef UNIV_DEBUG
		if (buf_debug_prints) {
			fprintf(stderr,
			"Flushing single page space %lu, page no %lu \n",
						(ulong) block->space,
						(ulong) block->offset);
		}
#endif /* UNIV_DEBUG */

		buf_flush_write_block_low(block);

		return(1);
	} else {
		mutex_exit(&(buf_pool->mutex));

		return(0);
	}
}

/***************************************************************
Flushes to disk all flushable pages within the flush area. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
				/* out: number of pages flushed */
	ulint	space,		/* in: space id */
	ulint	offset,		/* in: page offset */
	ulint	flush_type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	buf_block_t*	block;
	ulint		low, high;
	ulint		count		= 0;
	ulint		i;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	/* Compute the neighborhood [low, high) around offset. */
	low = (offset / BUF_FLUSH_AREA) * BUF_FLUSH_AREA;
	high = (offset / BUF_FLUSH_AREA + 1) * BUF_FLUSH_AREA;

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	}

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	mutex_enter(&(buf_pool->mutex));

	for (i = low; i < high; i++) {

		block = buf_page_hash_get(space, i);
		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

		if (block && flush_type == BUF_FLUSH_LRU && i != offset
			&& !block->old) {

			/* We avoid flushing 'non-old' blocks in an LRU flush,
			because the flushed blocks are soon freed */

			continue;
		}

		if (block && buf_flush_ready_for_flush(block, flush_type)
		    && (i == offset || block->buf_fix_count == 0)) {
			/* We only try to flush those neighbors != offset
			where the buf fix count is zero, as we then know that
			we probably can latch the page without a semaphore
			wait. Semaphore waits are expensive because we must
			flush the doublewrite buffer before we start
			waiting. */

			mutex_exit(&(buf_pool->mutex));

			/* Note: as we release the buf_pool mutex above, in
			buf_flush_try_page we cannot be sure the page is still
			in a flushable state: therefore we check it again
			inside that function. */

			count += buf_flush_try_page(space, i, flush_type);

			mutex_enter(&(buf_pool->mutex));
		}
	}

	mutex_exit(&(buf_pool->mutex));

	return(count);
}

/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */

ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum mumber of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in the case BUF_FLUSH_LIST all blocks whose
				oldest_modification is smaller than this
				should be flushed (if their number does not
				exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	ulint		old_page_count;
	ulint		space;
	ulint		offset;
	ibool		found;

	ut_ad((flush_type == BUF_FLUSH_LRU)
					|| (flush_type == BUF_FLUSH_LIST));
	ut_ad((flush_type != BUF_FLUSH_LIST)
					|| sync_thread_levels_empty_gen(TRUE));
	mutex_enter(&(buf_pool->mutex));

	if ((buf_pool->n_flush[flush_type] > 0)
		|| (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	(buf_pool->init_flush)[flush_type] = TRUE;

	for (;;) {
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			block = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!block
				|| (ut_dulint_cmp(block->oldest_modification,
						lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		found = FALSE;

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		while ((block != NULL) && !found) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				found = TRUE;
				space = block->space;
				offset = block->offset;

				mutex_exit(&(buf_pool->mutex));

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count +=
					buf_flush_try_neighbors(space, offset,
								flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				mutex_enter(&(buf_pool->mutex));

			} else if (flush_type == BUF_FLUSH_LRU) {

				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!found) {
			break;
		}
	}

	(buf_pool->init_flush)[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
		&& (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
			|| flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	if (page_count != ULINT_UNDEFINED) {
		srv_buf_pool_flushed += page_count;
	}

	return(page_count);
}

/**********************************************************************
Waits until a flush batch of the given type ends */

void
buf_flush_wait_batch_end(
/*=====================*/
	ulint	type)	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);

	/* The event is set when no flush batch of this type is
	running; block until then. */
	os_event_wait(buf_pool->no_flush[type]);
}

/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
			/* out: number of blocks which should be flushed
			from the end of the LRU list */
{
	buf_block_t*	block;
	ulint		n_replaceable;
	ulint		distance	= 0;
1025

osku's avatar
osku committed
1026 1027 1028 1029 1030 1031 1032
	mutex_enter(&(buf_pool->mutex));

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	block = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((block != NULL)
1033 1034 1035
		&& (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
			+ BUF_FLUSH_EXTRA_MARGIN)
		&& (distance < BUF_LRU_FREE_SEARCH_LEN)) {
osku's avatar
osku committed
1036 1037 1038 1039 1040 1041

		if (buf_flush_ready_for_replace(block)) {
			n_replaceable++;
		}

		distance++;
1042

osku's avatar
osku committed
1043 1044
		block = UT_LIST_GET_PREV(LRU, block);
	}
1045

osku's avatar
osku committed
1046 1047 1048 1049 1050 1051
	mutex_exit(&(buf_pool->mutex));

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

		return(0);
	}
1052

osku's avatar
osku committed
1053 1054 1055 1056 1057 1058 1059 1060 1061
	return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
							- n_replaceable);
}

/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
1062
immediately, without waiting. */
osku's avatar
osku committed
1063 1064 1065 1066 1067 1068 1069 1070 1071

void
buf_flush_free_margin(void)
/*=======================*/
{
	ulint	n_to_flush;
	ulint	n_flushed;

	n_to_flush = buf_flush_LRU_recommendation();
1072

osku's avatar
osku committed
1073 1074 1075 1076 1077 1078
	if (n_to_flush > 0) {
		n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
							ut_dulint_zero);
		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */
1079 1080

			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
osku's avatar
osku committed
1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
		}
	}
}

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
		/* out: TRUE if ok */
{
	buf_block_t*	block;
	dulint		om;
1095

osku's avatar
osku committed
1096 1097 1098 1099 1100 1101 1102 1103
	UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list);

	block = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (block != NULL) {
		om = block->oldest_modification;
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
		ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0);
1104

osku's avatar
osku committed
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124
		block = UT_LIST_GET_NEXT(flush_list, block);

		if (block) {
			ut_a(ut_dulint_cmp(om, block->oldest_modification)
									>= 0);
		}
	}

	return(TRUE);
}

/**********************************************************************
Validates the flush list. */

ibool
buf_flush_validate(void)
/*====================*/
		/* out: TRUE if ok */
{
	ibool	ret;
1125

osku's avatar
osku committed
1126 1127 1128
	mutex_enter(&(buf_pool->mutex));

	ret = buf_flush_validate_low();
1129

osku's avatar
osku committed
1130 1131 1132 1133
	mutex_exit(&(buf_pool->mutex));

	return(ret);
}