/*****************************************************************************

Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing depends not only on how many
dirty pages we have in the buffer pool but also on how much redo the
workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex.  Updated by buf_flush_stat_update(). */
static buf_flush_stat_t	buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint		buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t	buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t	buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
// static ulint buf_lru_flush_page_count = 0;

/* @} */
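
/* Illustrative sketch only, not part of the original code: once per
second buf_flush_stat_update() can fold the last second's activity into
the round-robin history declared above along these lines.  Here
interval_redo and interval_flushed are hypothetical locals holding the
redo bytes generated and the pages flushed during the interval that just
ended, and buf_flush_stat_t is assumed to carry matching redo and
n_flushed members:

	buf_flush_stat_t*	slot;

	slot = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	buf_flush_stat_sum.redo      += interval_redo - slot->redo;
	buf_flush_stat_sum.n_flushed += interval_flushed - slot->n_flushed;

	slot->redo      = interval_redo;
	slot->n_flushed = interval_flushed;
	buf_flush_stat_arr_ind = (buf_flush_stat_arr_ind + 1)
				 % BUF_FLUSH_STAT_N_INTERVAL;

The running sum then always covers the last BUF_FLUSH_STAT_N_INTERVAL
one-second samples, so the average redo generation rate is roughly
buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL bytes per second. */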

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */

/******************************************************************//**
Validates the flush list some of the time.
@return	TRUE if ok or the check was skipped */
static
ibool
buf_flush_validate_skip(
/*====================*/
	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
{
/** Try buf_flush_validate_low() every this many times */
# define BUF_FLUSH_VALIDATE_SKIP	23

	/** The buf_flush_validate_low() call skip counter.
	Use a signed type because of the race condition below. */
	static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;

	/* There is a race condition below, but it does not matter,
	because this call is only for heuristic purposes. We want to
	reduce the call frequency of the costly buf_flush_validate_low()
	check in debug builds. */
	if (--buf_flush_validate_count > 0) {
		return(TRUE);
	}

	buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
	return(buf_flush_validate_low(buf_pool));
}
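
/* With BUF_FLUSH_VALIDATE_SKIP defined as 23 above, only every 23rd
call of buf_flush_validate_skip() actually runs the expensive
buf_flush_validate_low() check; the benign race on the shared counter
noted above only changes which call that is. */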
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/******************************************************************//**
Inserts a block into the flush_rbt and returns a pointer to its
predecessor, or NULL if there is no predecessor. The ordering is
maintained on the basis of the <oldest_modification, space, offset> key.
@return	pointer to the predecessor or NULL if no predecessor. */
static
buf_page_t*
buf_flush_insert_in_flush_rbt(
/*==========================*/
	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
{
	const ib_rbt_node_t*	c_node;
	const ib_rbt_node_t*	p_node;
	buf_page_t*		prev = NULL;
	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	/* Insert this buffer into the rbt. */
	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
	ut_a(c_node != NULL);

	/* Get the predecessor. */
	p_node = rbt_prev(buf_pool->flush_rbt, c_node);

	if (p_node != NULL) {
		buf_page_t**	value;
		value = rbt_value(buf_page_t*, p_node);
		prev = *value;
		ut_a(prev != NULL);
	}

	return(prev);
}

/*********************************************************//**
Delete a bpage from the flush_rbt. */
static
void
buf_flush_delete_from_flush_rbt(
/*============================*/
	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
{
#ifdef UNIV_DEBUG
	ibool		ret = FALSE;
#endif /* UNIV_DEBUG */
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_flush_list_mutex_own(buf_pool));

#ifdef UNIV_DEBUG
	ret =
#endif /* UNIV_DEBUG */
	rbt_delete(buf_pool->flush_rbt, &bpage);

	ut_ad(ret);
}

/*****************************************************************//**
Compare two modified blocks in the buffer pool. The key for comparison
is:
key = <oldest_modification, space, offset>
This comparison is used to maintain the ordering of blocks in the
buf_pool->flush_rbt.
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
identify the blocks.
@return	 < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
static
int
buf_flush_block_cmp(
/*================*/
	const void*	p1,		/*!< in: block1 */
	const void*	p2)		/*!< in: block2 */
{
	int			ret;
	const buf_page_t*	b1 = *(const buf_page_t**) p1;
	const buf_page_t*	b2 = *(const buf_page_t**) p2;
#ifdef UNIV_DEBUG
	buf_pool_t*		buf_pool = buf_pool_from_bpage(b1);
#endif /* UNIV_DEBUG */

	ut_ad(b1 != NULL);
	ut_ad(b2 != NULL);

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	ut_ad(b1->in_flush_list);
	ut_ad(b2->in_flush_list);

	if (b2->oldest_modification > b1->oldest_modification) {
		return(1);
	} else if (b2->oldest_modification < b1->oldest_modification) {
		return(-1);
	}

	/* If oldest_modification is same then decide on the space. */
	ret = (int)(b2->space - b1->space);

	/* Or else decide ordering on the offset field. */
	return(ret ? ret : (int)(b2->offset - b1->offset));
}
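
/* Worked example, for illustration only: two dirty pages modified at
the same oldest_modification LSN but located at (space 0, offset 2) and
(space 0, offset 5) compare as unequal, so both fit in the flush_rbt;
buf_flush_block_cmp() returns 0 only when all three key fields
(oldest_modification, space and offset) are equal. */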

/********************************************************************//**
Initialize the red-black tree to speed up insertions into the flush_list
during recovery process. Should be called at the start of recovery
process before any page has been read/written. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
	ulint	i;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

		/* Create red black tree for speedy insertions in flush list. */
		buf_pool->flush_rbt = rbt_create(
			sizeof(buf_page_t*), buf_flush_block_cmp);

		buf_flush_list_mutex_exit(buf_pool);
	}
}

/********************************************************************//**
Frees up the red-black tree. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
	ulint	i;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		rbt_free(buf_pool->flush_rbt);
		buf_pool->flush_rbt = NULL;

		buf_flush_list_mutex_exit(buf_pool);
	}
}

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	buf_block_t*	block,		/*!< in/out: block which is modified */
	ib_uint64_t	lsn)		/*!< in: oldest modification */
{
	ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(log_flush_order_mutex_own());
	ut_ad(mutex_own(&block->mutex));

	buf_flush_list_mutex_enter(buf_pool);

	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
		  <= lsn));

	/* If we are in the recovery then we need to update the flush
	red-black tree as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_list_mutex_exit(buf_pool);
		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
		return;
	}

	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_ad(!block->page.in_flush_list);

	ut_d(block->page.in_flush_list = TRUE);
	block->page.oldest_modification = lsn;
	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);

#ifdef UNIV_DEBUG_VALGRIND
	{
		ulint	zip_size = buf_block_get_zip_size(block);

		if (UNIV_UNLIKELY(zip_size)) {
			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
		} else {
			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
		}
	}
#endif /* UNIV_DEBUG_VALGRIND */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_skip(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_block_t*	block,		/*!< in/out: block which is modified */
	ib_uint64_t	lsn)		/*!< in: oldest modification */
{
	buf_page_t*	prev_b;
	buf_page_t*	b;

	ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(log_flush_order_mutex_own());
	ut_ad(mutex_own(&block->mutex));
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	buf_flush_list_mutex_enter(buf_pool);

	/* The field in_LRU_list is protected by buf_pool->mutex, which
	we are not holding.  However, while a block is in the flush
	list, it is dirty and cannot be discarded, neither from the
	page_hash nor from the LRU list.  At most, the uncompressed
	page frame of a compressed block may be discarded or created
	(copying the block->page to or from a buf_page_t that is
	dynamically allocated from buf_buddy_alloc()).  Because those
	transitions hold block->mutex and the flush list mutex (via
	buf_flush_relocate_on_flush_list()), there is no possibility
	of a race condition in the assertions below. */
	ut_ad(block->page.in_LRU_list);
	ut_ad(block->page.in_page_hash);
	/* buf_buddy_block_register() will take a block in the
	BUF_BLOCK_MEMORY state, not a file page. */
	ut_ad(!block->page.in_zip_hash);

	ut_ad(!block->page.in_flush_list);
	ut_d(block->page.in_flush_list = TRUE);
	block->page.oldest_modification = lsn;

#ifdef UNIV_DEBUG_VALGRIND
	{
		ulint	zip_size = buf_block_get_zip_size(block);

		if (UNIV_UNLIKELY(zip_size)) {
			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
		} else {
			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
		}
	}
#endif /* UNIV_DEBUG_VALGRIND */

	prev_b = NULL;

	/* For the most part when this function is called the flush_rbt
	should not be NULL. In a very rare boundary case it is possible
	that the flush_rbt has already been freed by the recovery thread
	before the last page was hooked up in the flush_list by the
	io-handler thread. In that case we'll just do a simple
	linear search in the else block. */
	if (buf_pool->flush_rbt) {

		prev_b = buf_flush_insert_in_flush_rbt(&block->page);

	} else {

		b = UT_LIST_GET_FIRST(buf_pool->flush_list);

		while (b && b->oldest_modification
		       > block->page.oldest_modification) {
			ut_ad(b->in_flush_list);
			prev_b = b;
			b = UT_LIST_GET_NEXT(flush_list, b);
		}
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
	} else {
		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
				     prev_b, &block->page);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
@return	TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
	buf_page_t*	bpage)	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) and in the LRU list */
{
#ifdef UNIV_DEBUG
	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	//ut_ad(buf_pool_mutex_own(buf_pool));
#endif
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	//ut_ad(bpage->in_LRU_list);

	if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {

		return((bpage->oldest_modification == 0 || bpage->space_was_being_deleted)
		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
		       && bpage->buf_fix_count == 0);
	}

	/* the caller is permitted not to own the LRU mutex here */
/*
	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Error: buffer block state %lu"
		" in the LRU list!\n",
		(ulong) buf_page_get_state(bpage));
	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
	putc('\n', stderr);
*/

	return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return	TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
	buf_page_t*	bpage,	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) */
	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
#ifdef UNIV_DEBUG
	//buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	//ut_ad(buf_pool_mutex_own(buf_pool));
#endif
	//ut_a(buf_page_in_file(bpage));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (buf_page_in_file(bpage) && bpage->oldest_modification != 0
	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
		ut_ad(bpage->in_flush_list);

		if (bpage->space_was_being_deleted) {
			/* should be removed from flush_list here */
			/* because buf_flush_try_neighbors() cannot flush without fil_space_get_size(space) */
			buf_flush_remove(bpage);
			return(FALSE);
		}

		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (bpage->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence
			not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}

/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_flush_list);

	buf_flush_list_mutex_enter(buf_pool);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* Clean compressed pages should not be on the flush list */
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		return;
	case BUF_BLOCK_ZIP_DIRTY:
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		buf_LRU_insert_zip_clean(bpage);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
		break;
	case BUF_BLOCK_FILE_PAGE:
		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
		break;
	}

	/* If the flush_rbt is active then delete from there as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);
	}

	/* Must be done after we have removed it from the flush_rbt
	because we assert on in_flush_list in comparison function. */
	ut_d(bpage->in_flush_list = FALSE);

	bpage->oldest_modification = 0;

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_skip(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);
}

/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage have already been
copied to dpage.
IMPORTANT: When this function is called bpage and dpage are not
exact copies of each other. For example, they both will have different
::state. Also the ::list pointers in dpage may be stale. We need to
use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to the dpage and the flush list manipulation
below. */
UNIV_INTERN
void
buf_flush_relocate_on_flush_list(
/*=============================*/
	buf_page_t*	bpage,	/*!< in/out: control block being moved */
	buf_page_t*	dpage)	/*!< in/out: destination block */
{
	buf_page_t*	prev;
	buf_page_t* 	prev_b = NULL;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	//ut_ad(buf_pool_mutex_own(buf_pool));
	/* Must reside in the same buffer pool. */
	ut_ad(buf_pool == buf_pool_from_bpage(dpage));

	ut_ad(mutex_own(buf_page_get_mutex(bpage)));

	buf_flush_list_mutex_enter(buf_pool);

	/* FIXME: At this point we have both buf_pool and flush_list
	mutexes. Theoretically removal of a block from flush list is
	only covered by flush_list mutex but currently we do
	have buf_pool mutex in buf_flush_remove() therefore this block
	is guaranteed to be in the flush list. We need to check if
	this will work without the assumption of block removing code
	having the buf_pool mutex. */
	ut_ad(bpage->in_flush_list);
	ut_ad(dpage->in_flush_list);

	/* If recovery is active we must swap the control blocks in
	the flush_rbt as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);
		prev_b = buf_flush_insert_in_flush_rbt(dpage);
	}

	/* Must be done after we have removed it from the flush_rbt
	because we assert on in_flush_list in comparison function. */
	ut_d(bpage->in_flush_list = FALSE);

	prev = UT_LIST_GET_PREV(flush_list, bpage);
	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);

	if (prev) {
		ut_ad(prev->in_flush_list);
		UT_LIST_INSERT_AFTER(
			flush_list,
			buf_pool->flush_list,
			prev, dpage);
	} else {
		UT_LIST_ADD_FIRST(
			flush_list,
			buf_pool->flush_list,
			dpage);
	}

	/* Just an extra check. Previous in flush_list
	should be the same control block as in flush_rbt. */
	ut_a(!buf_pool->flush_rbt || prev_b == prev);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
{
	enum buf_flush	flush_type;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(bpage);

	buf_flush_remove(bpage);

	flush_type = buf_page_get_flush_type(bpage);
	buf_pool->n_flush[flush_type]--;

	if (flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(bpage);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[flush_type]); */

	if (buf_pool->n_flush[flush_type] == 0
	    && buf_pool->init_flush[flush_type] == FALSE) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}
}

/********************************************************************//**
Flush a batch of writes to the datafiles that have already been
written by the OS. */
static
void
buf_flush_sync_datafiles(void)
/*==========================*/
{
	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */
	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
	the OS */
	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */
	fil_flush_file_spaces(FIL_TABLESPACE);

	return;
}

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	byte*		write_buf;
	ulint		len;
	ulint		len2;
	ulint		i;
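
	/* Descriptive overview of the steps below, added for clarity:
	(1) sanity-check the buffered pages and write the doublewrite
	memory buffer to the two doublewrite blocks on disk with
	synchronous i/o, (2) fil_flush() the doublewrite space so those
	copies are durable, (3) write each page to its real position in
	its tablespace with asynchronous i/o, (4) sync the data files,
	and only then (5) mark the doublewrite memory buffer reusable by
	resetting first_free. */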

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		/* Sync the writes to the disk. */
		buf_flush_sync_datafiles();
		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	for (i = 0; i < trx_doublewrite->first_free; i++) {

		const buf_block_t*	block;

		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
		    || block->page.zip.data) {
			/* No simple validate for compressed pages exists. */
			continue;
		}

		if (UNIV_UNLIKELY
		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
			    block->frame + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			    4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");
		}

		if (!block->check_index_page_at_flush) {
		} else if (page_is_comp(block->frame)) {
			if (UNIV_UNLIKELY
			    (!page_simple_validate_new(block->frame))) {
corrupted_page:
				buf_page_print(block->frame, 0);

				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Apparent corruption of an"
					" index page n:o %lu in space %lu\n"
					"InnoDB: to be written to data file."
					" We intentionally crash server\n"
					"InnoDB: to prevent corrupt data"
					" from ending up in data\n"
					"InnoDB: files.\n",
					(ulong) buf_block_get_page_no(block),
					(ulong) buf_block_get_space(block));

				ut_error;
			}
		} else if (UNIV_UNLIKELY
			   (!page_simple_validate_old(block->frame))) {

			goto corrupted_page;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written+= trx_doublewrite->first_free;
	srv_dblwr_writes++;

	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf;
	i = 0;

	fil_io(OS_FILE_WRITE, TRUE,
	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
	       trx_doublewrite->block1, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2
			    + (UNIV_PAGE_SIZE
			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		goto flush;
	}

	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		* UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf
		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

	fil_io(OS_FILE_WRITE, TRUE,
	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
	       trx_doublewrite->block2, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2
			    + (UNIV_PAGE_SIZE
			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be"
				" written seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in"
				" the doublewrite block2.\n");
		}
	}

flush:
	/* Now flush the doublewrite buffer data to disk */

	fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE, FALSE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		ut_a(buf_page_in_file(&block->page));
		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			       FALSE, buf_page_get_space(&block->page),
			       buf_page_get_zip_size(&block->page),
			       buf_page_get_page_no(&block->page), 0,
			       buf_page_get_zip_size(&block->page),
			       (void*)block->page.zip.data,
			       (void*)block);

			/* Increment the counter of I/O operations used
			for selecting LRU policy. */
			buf_LRU_stat_inc_io();

			continue;
		}

		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
					 block->frame
					 + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
					 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->page.buf_fix_count,
				(ulong)buf_block_get_io_fix(block),
				(ulong)buf_block_get_state(block));
		}

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_block_get_space(block), 0,
		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);

		/* Increment the counter of I/O operations used
		for selecting LRU policy. */
		buf_LRU_stat_inc_io();
	}

	/* Sync the writes to the disk. */
	buf_flush_sync_datafiles();

	/* We can now reuse the doublewrite memory buffer: */
	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}

/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
{
	ulint	zip_size;
try_again:
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(buf_page_in_file(bpage));

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		goto try_again;
	}

	zip_size = buf_page_get_zip_size(bpage);

	if (UNIV_UNLIKELY(zip_size)) {
		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
		/* Copy the compressed page and clear the rest. */
		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       bpage->zip.data, zip_size);
		memset(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
				   UNIV_PAGE_SIZE);

		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
	}

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
	byte*		page,		/*!< in/out: page */
	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
					to the page */
{
	ut_ad(page);

	if (page_zip_) {
		page_zip_des_t*	page_zip = page_zip_;
		ulint		zip_size = page_zip_get_size(page_zip);
		ut_ad(zip_size);
		ut_ad(ut_is_2pow(zip_size));
		ut_ad(zip_size <= UNIV_PAGE_SIZE);

		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
		case FIL_PAGE_TYPE_ALLOCATED:
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
		case FIL_PAGE_TYPE_XDES:
			/* These are essentially uncompressed pages. */
			memcpy(page_zip->data, page, zip_size);
			/* fall through */
		case FIL_PAGE_TYPE_ZBLOB:
		case FIL_PAGE_TYPE_ZBLOB2:
		case FIL_PAGE_INDEX:
			mach_write_to_8(page_zip->data
					+ FIL_PAGE_LSN, newest_lsn);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
						page_zip->data, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		}

		ut_print_timestamp(stderr);
		fputs("  InnoDB: ERROR: The compressed page to be written"
		      " seems corrupt:", stderr);
		ut_print_buf(stderr, page, zip_size);
		fputs("\nInnoDB: Possibly older version of the page:", stderr);
		ut_print_buf(stderr, page_zip->data, zip_size);
		putc('\n', stderr);
		ut_error;
	}

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			newest_lsn);

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			srv_use_checksums
1067 1068 1069
			? (!srv_fast_checksum
			   ? buf_calc_page_new_checksum(page)
			   : buf_calc_page_new_checksum_32(page))
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082
			: BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			srv_use_checksums
			? buf_calc_page_old_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);
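
	/* Resulting page layout, sketched for illustration (the offsets
	are the standard ones used above):

	  FIL_PAGE_SPACE_OR_CHKSUM                 new formula checksum
	  FIL_PAGE_LSN                             newest_lsn (8 bytes)
	  ...
	  UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM:
	    bytes 0..3                             old formula checksum
	    bytes 4..7                             low 4 bytes of newest_lsn

	The low 4 bytes of the lsn thus appear both in the header and in
	the trailer, which is exactly what the doublewrite code above
	compares when it checks FIL_PAGE_LSN + 4 against the end of the
	page. */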
}

#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
{
	ulint	zip_size	= buf_page_get_zip_size(bpage);
	page_t*	frame		= NULL;

#ifdef UNIV_DEBUG
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	//ut_ad(!buf_pool_mutex_own(buf_pool));
#endif

#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

	ut_ad(buf_page_in_file(bpage));

	/* We are not holding buf_pool->mutex or block_mutex here.
	Nevertheless, it is safe to access bpage, because it is
	io_fixed and oldest_modification != 0.  Thus, it cannot be
	relocated in the buffer pool or removed from flush_list or
	LRU_list. */
	//ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
	ut_ad(!buf_flush_list_mutex_own(buf_pool));
	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
	ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
	ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",
		      stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		break;
	case BUF_BLOCK_ZIP_DIRTY:
		frame = bpage->zip.data;
		if (UNIV_LIKELY(srv_use_checksums)) {
			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			     == page_zip_calc_checksum(frame, zip_size));
		}
		mach_write_to_8(frame + FIL_PAGE_LSN,
				bpage->newest_modification);
		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = bpage->zip.data;
		if (!frame) {
			frame = ((buf_block_t*) bpage)->frame;
		}

		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
					   bpage->zip.data
					   ? &bpage->zip : NULL,
					   bpage->newest_modification);
		break;
	}

	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_page_get_space(bpage), zip_size,
		       buf_page_get_page_no(bpage), 0,
		       zip_size ? zip_size : UNIV_PAGE_SIZE,
		       frame, bpage);
	} else {
		buf_flush_post_to_doublewrite_buf(bpage);
	}
}

# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: buf_pool->mutex and block->mutex must be held upon entering this
function, and they will be released by this function after flushing.
This is loosely based on buf_flush_batch() and buf_flush_page().
@return TRUE if the page was flushed and the mutexes released */
UNIV_INTERN
ibool
buf_flush_page_try(
/*===============*/
	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
	buf_block_t*	block)		/*!< in/out: buffer control block */
{
	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_ad(mutex_own(&block->mutex));

	if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
		return(FALSE);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
	    || buf_pool->init_flush[BUF_FLUSH_LRU]) {
		buf_pool_mutex_exit(buf_pool);
		/* There is already a flush batch of the same type running */
		return(FALSE);
	}

	buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;

	buf_page_set_io_fix(&block->page, BUF_IO_WRITE);

	buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);

	if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {

		os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
	}

	/* VERY IMPORTANT:
	Because any thread may call the LRU flush, even when owning
	locks on pages, to avoid deadlocks, we must make sure that the
	s-lock is acquired on the page without waiting: this is
	accomplished because buf_flush_ready_for_flush() must hold,
	and that requires the page not to be bufferfixed. */

	rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);

	/* Note that the s-latch is acquired before releasing the
	buf_pool mutex: this ensures that the latch is acquired
	immediately. */

	mutex_exit(&block->mutex);
	buf_pool_mutex_exit(buf_pool);

	/* Even though block is not protected by any mutex at this
	point, it is safe to access block, because it is io_fixed and
	oldest_modification != 0.  Thus, it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */

	buf_flush_write_block_low(&block->page);

	buf_pool_mutex_enter(buf_pool);
	buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;

	if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
		/* The running flush batch has ended */
		os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
	}

	buf_pool_mutex_exit(buf_pool);
	buf_flush_buffered_writes();

	return(TRUE);
}
# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */

/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_page_t*	bpage,		/*!< in: buffer control block */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	mutex_t*	block_mutex;
	ibool		is_uncompressed;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
	//ut_ad(buf_pool_mutex_own(buf_pool));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(&buf_pool->page_hash_latch, RW_LOCK_SHARED));
#endif
	ut_ad(buf_page_in_file(bpage));

	block_mutex = buf_page_get_mutex(bpage);
	ut_ad(mutex_own(block_mutex));

	buf_pool_mutex_enter(buf_pool);
	rw_lock_s_unlock(&buf_pool->page_hash_latch);

	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

	buf_page_set_io_fix(bpage, BUF_IO_WRITE);

	buf_page_set_flush_type(bpage, flush_type);

	if (buf_pool->n_flush[flush_type] == 0) {

		os_event_reset(buf_pool->no_flush[flush_type]);
	}

	buf_pool->n_flush[flush_type]++;

	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));

	switch (flush_type) {
		ibool	is_s_latched;
	case BUF_FLUSH_LIST:
		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		is_s_latched = (bpage->buf_fix_count == 0);
		if (is_s_latched && is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		mutex_exit(block_mutex);
		buf_pool_mutex_exit(buf_pool);

		/* Even though bpage is not protected by any mutex at
		this point, it is safe to access bpage, because it is
		io_fixed and oldest_modification != 0.  Thus, it
		cannot be relocated in the buffer pool or removed from
		flush_list or LRU_list. */

		if (!is_s_latched) {
			buf_flush_buffered_writes();

			if (is_uncompressed) {
				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
						   ->lock, BUF_IO_WRITE);
			}
		}

		break;

	case BUF_FLUSH_LRU:
		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because buf_flush_ready_for_flush() must hold,
		and that requires the page not to be bufferfixed. */

		if (is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(block_mutex);
		buf_pool_mutex_exit(buf_pool);
		break;

	default:
		ut_error;
	}

	/* Even though bpage is not protected by any mutex at this
	point, it is safe to access bpage, because it is io_fixed and
	oldest_modification != 0.  Thus, it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Flushing %u space %u page %u\n",
			flush_type, bpage->space, bpage->offset);
	}
#endif /* UNIV_DEBUG */
	buf_flush_write_block_low(bpage);
}

/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return	number of pages flushed */
static
ulint
buf_flush_try_neighbors(
/*====================*/
	ulint		space,		/*!< in: space id */
	ulint		offset,		/*!< in: page offset */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST */
	ulint		n_flushed,	/*!< in: number of pages
					flushed so far in this batch */
	ulint		n_to_flush)	/*!< in: maximum number of pages
					we are allowed to flush */
{
	ulint		i;
	ulint		low;
	ulint		high;
	ulint		count = 0;
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
	    || !srv_flush_neighbor_pages) {
		/* If there is little space, or if flushing of neighbor
		pages is disabled, it is better not to flush any block
		except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	} else {
		/* When flushed, dirty blocks are searched in
		neighborhoods of this size, and flushed along with the
		original page. */

		ulint	buf_flush_area;

		buf_flush_area	= ut_min(
			BUF_READ_AHEAD_AREA(buf_pool),
			buf_pool->curr_size / 16);

		low = (offset / buf_flush_area) * buf_flush_area;
		high = (offset / buf_flush_area + 1) * buf_flush_area;
	}
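
	/* Illustration (hypothetical numbers): if
	BUF_READ_AHEAD_AREA(buf_pool) is 64 pages and
	buf_pool->curr_size / 16 is larger, then buf_flush_area = 64.
	For offset = 1000 this gives low = (1000 / 64) * 64 = 960 and
	high = 1024, i.e. the flush area is the 64-page block that
	contains the original victim page. */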

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	for (i = low; i < high; i++) {

		buf_page_t*	bpage;

		if ((count + n_flushed) >= n_to_flush) {

			/* We have already flushed enough pages and
			should call it a day. There is, however, one
			exception. If the page whose neighbors we
			are flushing has not been flushed yet then
			we'll try to flush the victim that we
			selected originally. */
			if (i <= offset) {
				i = offset;
			} else {
				break;
			}
		}
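
		/* Illustration of the exception above (hypothetical
		numbers): with low = 960, high = 1024, offset = 1000 and
		the quota already reached at i = 970, i is moved forward
		to offset = 1000 so that the originally selected victim
		still gets a chance to be flushed; on the next iteration
		i > offset holds and the loop breaks. */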

		buf_pool = buf_pool_get(space, i);

		//buf_pool_mutex_enter(buf_pool);
		rw_lock_s_lock(&buf_pool->page_hash_latch);

		/* We only want to flush pages from this buffer pool. */
		bpage = buf_page_hash_get(buf_pool, space, i);

		if (!bpage) {

			//buf_pool_mutex_exit(buf_pool);
			rw_lock_s_unlock(&buf_pool->page_hash_latch);
			continue;
		}

		ut_a(buf_page_in_file(bpage));

		/* We avoid flushing 'non-old' blocks in an LRU flush,
		because the flushed blocks are soon freed */

		if (flush_type != BUF_FLUSH_LRU
		    || i == offset
		    || buf_page_is_old(bpage)) {
			mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);

			if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)
			    && (i == offset || !bpage->buf_fix_count)) {
				/* We only try to flush those
				neighbors != offset where the buf fix
				count is zero, as we then know that we
				probably can latch the page without a
				semaphore wait. Semaphore waits are
				expensive because we must flush the
				doublewrite buffer before we start
				waiting. */

				buf_flush_page(buf_pool, bpage, flush_type);
				ut_ad(!mutex_own(block_mutex));
				ut_ad(!buf_pool_mutex_own(buf_pool));
				count++;
				continue;
			} else if (block_mutex) {
				mutex_exit(block_mutex);
			}
		}
		//buf_pool_mutex_exit(buf_pool);
		rw_lock_s_unlock(&buf_pool->page_hash_latch);
	}

	return(count);
}

/********************************************************************//**
Check if the block is modified and ready for flushing. If the block
is ready to flush then flush the page and try to flush its neighbors.

@return	TRUE if the buf_pool mutex was not released during this function.
This does not guarantee that any pages were written.
The number of pages written is added to *count. */
static
ibool
buf_flush_page_and_try_neighbors(
/*=============================*/
	buf_page_t*	bpage,		/*!< in: buffer control block,
					must be
					buf_page_in_file(bpage) */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	ulint		n_to_flush,	/*!< in: number of pages to
					flush */
	ulint*		count)		/*!< in/out: number of pages
					flushed */
{
	mutex_t*	block_mutex;
	ibool		flushed = FALSE;
#ifdef UNIV_DEBUG
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
#endif /* UNIV_DEBUG */

	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(flush_type != BUF_FLUSH_LRU
	      || mutex_own(&buf_pool->LRU_list_mutex));

	block_mutex = buf_page_get_mutex_enter(bpage);

	//ut_a(buf_page_in_file(bpage));

	if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)) {
		ulint		space;
		ulint		offset;
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_bpage(bpage);

		//buf_pool_mutex_exit(buf_pool);
		if (flush_type == BUF_FLUSH_LRU) {
			mutex_exit(&buf_pool->LRU_list_mutex);
		}

		/* These fields are protected by both the
		buffer pool mutex and block mutex. */
		space = buf_page_get_space(bpage);
		offset = buf_page_get_page_no(bpage);

		mutex_exit(block_mutex);

		/* Try to flush also all the neighbors */
		*count += buf_flush_try_neighbors(space,
						  offset,
						  flush_type,
						  *count,
						  n_to_flush);

		//buf_pool_mutex_enter(buf_pool);
		if (flush_type == BUF_FLUSH_LRU) {
			mutex_enter(&buf_pool->LRU_list_mutex);
		}
		flushed = TRUE;
	} else if (block_mutex) {
		mutex_exit(block_mutex);
	}

	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(flush_type != BUF_FLUSH_LRU
	      || mutex_own(&buf_pool->LRU_list_mutex));

	return(flushed);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
In the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it
cannot end up waiting for these latches!
@return number of blocks for which the write request was queued. */
static
ulint
buf_flush_LRU_list_batch(
/*=====================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		max)		/*!< in: max of blocks to flush */
{
	buf_page_t*	bpage;
	ulint		count = 0;

	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));

	do {
		/* Start from the end of the list looking for a
		suitable block to be flushed. */
		bpage = UT_LIST_GET_LAST(buf_pool->LRU);

		/* Iterate backwards over the LRU list until we either
		flush a page or reach the start of the list. */
		while (bpage != NULL
		       && !buf_flush_page_and_try_neighbors(
				bpage, BUF_FLUSH_LRU, max, &count)) {

			bpage = UT_LIST_GET_PREV(LRU, bpage);
		}
	} while (bpage != NULL && count < max);

	/* We keep track of all flushes happening as part of LRU
	flush. When estimating the desired rate at which flush_list
	should be flushed, we factor in this value. */
	buf_lru_flush_page_count += count;

	//ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(mutex_own(&buf_pool->LRU_list_mutex));

	return(count);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already
running */
static
ulint
buf_flush_flush_list_batch(
/*=======================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n,		/*!< in: wished minimum number
					of blocks flushed (it is not
					guaranteed that the actual
					number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< all blocks whose
					oldest_modification is smaller
					than this should be flushed (if
					their number does not exceed
					min_n) */
{
	ulint		len;
	buf_page_t*	bpage;
	buf_page_t*	prev_bpage = NULL;
	ulint		count = 0;

	//ut_ad(buf_pool_mutex_own(buf_pool));

	/* If we have flushed enough, leave the loop */
	do {
		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		buf_flush_list_mutex_enter(buf_pool);

		/* We use len here because theoretically insertions can
		happen in the flush_list below while we are traversing
		it for a suitable candidate for flushing. We'd like to
		set a limit on how far we are willing to traverse
		the list. */
		len = UT_LIST_GET_LEN(buf_pool->flush_list);
		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);

		if (bpage) {
			ut_a(bpage->oldest_modification > 0);
			prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
		}

		if (!bpage || bpage->oldest_modification >= lsn_limit) {

			/* We have flushed enough */
			buf_flush_list_mutex_exit(buf_pool);
			break;
		}

		ut_a(bpage->oldest_modification > 0);

		ut_ad(bpage->in_flush_list);

		buf_flush_list_mutex_exit(buf_pool);

		/* The list may change during the flushing and we cannot
		safely preserve within this function a pointer to a
		block in the list! */
		while (bpage != NULL
		       && len > 0
		       && !buf_flush_page_and_try_neighbors(
				bpage, BUF_FLUSH_LIST, min_n, &count)) {

			buf_flush_list_mutex_enter(buf_pool);

			/* If we are here, it means that buf_pool->mutex
			was not released in buf_flush_page_and_try_neighbors()
			above, and this guarantees that bpage didn't get
			relocated since we released the flush_list
			mutex above. There is a chance, however, that
			the bpage got removed from flush_list (not
			currently possible because flush_list_remove()
			also obtains buf_pool mutex but that may change
			in future). To avoid this scenario we check
			the oldest_modification and if it is zero
			we start all over again. */
			if (bpage->oldest_modification == 0) {
				buf_flush_list_mutex_exit(buf_pool);
				break;
			}

			bpage = UT_LIST_GET_PREV(flush_list, bpage);

			//ut_ad(!bpage || bpage->in_flush_list);
			if (bpage != prev_bpage) {
				/* The list changed underneath us; restart the search. */
				buf_flush_list_mutex_exit(buf_pool);
				break;
			}
			if (bpage) {
				prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
			}

			buf_flush_list_mutex_exit(buf_pool);

			--len;
		}

	} while (count < min_n && bpage != NULL && len > 0);

	//ut_ad(buf_pool_mutex_own(buf_pool));

	return(count);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
static
ulint
buf_flush_batch(
/*============*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
					latches on pages */
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
					all blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	ulint		count	= 0;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */

	//buf_pool_mutex_enter(buf_pool);

	/* Note: The buffer pool mutex is released and reacquired within
	the flush functions. */
	switch(flush_type) {
	case BUF_FLUSH_LRU:
		mutex_enter(&buf_pool->LRU_list_mutex);
		count = buf_flush_LRU_list_batch(buf_pool, min_n);
		mutex_exit(&buf_pool->LRU_list_mutex);
		break;
	case BUF_FLUSH_LIST:
		count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
		break;
	default:
		ut_error;
	}

	//buf_pool_mutex_exit(buf_pool);

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += count;

	return(count);
}

/******************************************************************//**
Gather the aggregated stats for both flush list and LRU list flushing */
static
void
buf_flush_common(
/*=============*/
	enum buf_flush	flush_type,	/*!< in: type of flush */
	ulint		page_count)	/*!< in: number of pages flushed */
{
	buf_flush_buffered_writes();

	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	if (flush_type == BUF_FLUSH_LRU) {
		/* We keep track of all flushes happening as part of LRU
		flush. When estimating the desired rate at which flush_list
		should be flushed we factor in this value. */
		buf_lru_flush_page_count += page_count;
	}
}

/******************************************************************//**
Start a buffer flush batch for LRU or flush list */
static
ibool
buf_flush_start(
/*============*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_flush[flush_type] > 0
	   || buf_pool->init_flush[flush_type] == TRUE) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit(buf_pool);

		return(FALSE);
	}

	buf_pool->init_flush[flush_type] = TRUE;

	buf_pool_mutex_exit(buf_pool);

	return(TRUE);
}

/******************************************************************//**
End a buffer flush batch for LRU or flush list */
static
void
buf_flush_end(
/*==========*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	buf_pool_mutex_enter(buf_pool);

	buf_pool->init_flush[flush_type] = FALSE;

	if (buf_pool->n_flush[flush_type] == 0) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	buf_pool_mutex_exit(buf_pool);
}

/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	type)		/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
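
	/* A NULL buf_pool means: wait for the batch to end in every
	buffer pool instance; otherwise wait only on the given
	instance. The waits are bracketed with thd_wait_begin() /
	thd_wait_end() so the thread is accounted as waiting for
	disk I/O. */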

	if (buf_pool == NULL) {
		ulint	i;

		for (i = 0; i < srv_buf_pool_instances; ++i) {
			buf_pool_t*	buf_pool;

			buf_pool = buf_pool_from_array(i);

			thd_wait_begin(NULL, THD_WAIT_DISKIO);
			os_event_wait(buf_pool->no_flush[type]);
			thd_wait_end(NULL);
		}
	} else {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
		os_event_wait(buf_pool->no_flush[type]);
		thd_wait_end(NULL);
	}
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
NOTE: The calling thread may own latches to pages: to avoid deadlocks,
this function must be written so that it cannot end up waiting for these
latches!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_LRU(
/*==========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n)		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
{
	ulint		page_count;

	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
		return(ULINT_UNDEFINED);
	}

	page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);

	buf_flush_end(buf_pool, BUF_FLUSH_LRU);

	buf_flush_common(BUF_FLUSH_LRU, page_count);

	return(page_count);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush list of
all buffer pool instances.
NOTE: The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_list(
/*===========*/
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case BUF_FLUSH_LIST all
					blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	ulint		i;
	ulint		total_page_count = 0;
	ibool		skipped = FALSE;

	if (min_n != ULINT_MAX) {
		/* Ensure that flushing is spread evenly amongst the
		buffer pool instances. When min_n is ULINT_MAX
		we need to flush everything up to the lsn limit
		so no limit here. */
		min_n = (min_n + srv_buf_pool_instances - 1)
			 / srv_buf_pool_instances;
	}
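
	/* Illustration (hypothetical numbers): with min_n = 100 and
	srv_buf_pool_instances = 8, each instance is asked to flush
	(100 + 8 - 1) / 8 = 13 pages, i.e. up to 104 pages in total;
	rounding up ensures that the sum is never less than min_n. */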

	/* Flush to lsn_limit in all buffer pool instances */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;
		ulint		page_count = 0;

		buf_pool = buf_pool_from_array(i);

		if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
			/* We have two choices here. If lsn_limit was
			specified then skipping an instance of buffer
			pool means we cannot guarantee that all pages
			up to lsn_limit have been flushed. We can
			return right now with failure or we can try
			to flush remaining buffer pools up to the
			lsn_limit. We attempt to flush other buffer
			pools based on the assumption that it will
			help in the retry which will follow the
			failure. */
			skipped = TRUE;

			continue;
		}

		page_count = buf_flush_batch(
			buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);

		buf_flush_end(buf_pool, BUF_FLUSH_LIST);

		buf_flush_common(BUF_FLUSH_LIST, page_count);

		total_page_count += page_count;
	}

	return(lsn_limit != IB_ULONGLONG_MAX && skipped
	       ? ULINT_UNDEFINED : total_page_count);
}
 
/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(
/*=========================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	buf_page_t*	bpage;
	ulint		n_replaceable;
	ulint		distance	= 0;
	ibool		have_LRU_mutex = FALSE;

	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) {
		have_LRU_mutex = TRUE;
	}
retry:
	//buf_pool_mutex_enter(buf_pool);
	if (have_LRU_mutex)
		mutex_enter(&buf_pool->LRU_list_mutex);

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	bpage = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((bpage != NULL)
	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
		   + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
	       && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {

		mutex_t* block_mutex;
		if (!bpage->in_LRU_list) {
			/* restart from the list end; this is optimistic */
			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
			continue;
		}
		block_mutex = buf_page_get_mutex_enter(bpage);

		if (block_mutex && buf_flush_ready_for_replace(bpage)) {
			n_replaceable++;
		}

		if (block_mutex) {
			mutex_exit(block_mutex);
		}

		distance++;

		bpage = UT_LIST_GET_PREV(LRU, bpage);
	}

	//buf_pool_mutex_exit(buf_pool);
	if (have_LRU_mutex)
		mutex_exit(&buf_pool->LRU_list_mutex);

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {

		return(0);
	} else if (!have_LRU_mutex) {
		/* Re-check under the LRU list mutex for an exact count */
		have_LRU_mutex = TRUE;
		distance = 0;
		goto retry;
	}
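
	/* Illustration of the recommendation below (hypothetical
	numbers): if the free-block margin is 160 pages, the extra
	margin is 260 pages and the scan above found only
	n_replaceable = 100 (less than the free-block margin, so we
	did not return 0 above), the recommendation is
	160 + 260 - 100 = 320 pages. The actual margins depend on the
	buffer pool size. */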

	return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
	       + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
	       - n_replaceable);
}

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is also called by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(
/*==================*/
	buf_pool_t*	buf_pool,		/*!< in: Buffer pool instance */
	ibool		wait)			/*!< in: whether to wait for a
						running LRU flush batch to end */
{
	ulint	n_to_flush;

	n_to_flush = buf_flush_LRU_recommendation(buf_pool);

	if (n_to_flush > 0) {
		ulint	n_flushed;

		n_flushed = buf_flush_LRU(buf_pool, n_to_flush);

		if (wait && n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */

			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
		}
	}
}

/*********************************************************************//**
Flushes pages from the end of all the LRU lists. */
UNIV_INTERN
void
buf_flush_free_margins(
/*========================*/
	ibool	wait)		/*!< in: whether to wait for running
				LRU flush batches to end */
{
	ulint	i;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_free_margin(buf_pool, wait);
	}
}

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;

	lsn = log_get_lsn();
	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		and return. */
		buf_flush_stat_cur.redo = lsn;
		return;
	}

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		    - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
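
	/* The array is a circular window over the last
	BUF_FLUSH_STAT_N_INTERVAL intervals: adding this interval's
	value and subtracting the slot that is about to be overwritten
	keeps the running sum equal to the sum of the window without
	rescanning the array. For example (hypothetical numbers), if
	the slot being replaced held redo = 5 and this interval has
	lsn_diff = 7, buf_flush_stat_sum.redo grows by 7 - 5 = 2. */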

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}

/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return	number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
	ulint		i;
	lint		rate;
	ulint		redo_avg;
	ulint		n_dirty = 0;
	ulint		n_flush_req;
	ulint		lru_flush_avg;
	ib_uint64_t	lsn = log_get_lsn();
	ulint		log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get total number of dirty pages. It is OK to access
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
	}

	/* An overflow can happen if we generate more than 2^32 bytes
	of redo in this interval i.e.: 4G of redo in 1 second. We can
	safely consider this as infinity because if we ever come close
	to 4G we'll start a synchronous flush of dirty pages. */
	/* redo_avg below is average at which redo is generated in
	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
	interval. */
	redo_avg = (ulint) (buf_flush_stat_sum.redo
			    / BUF_FLUSH_STAT_N_INTERVAL
			    + (lsn - buf_flush_stat_cur.redo));

	/* An overflow can happen possibly if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
	unlikely scenario. Even when this happens it means that our
	flush rate will be off the mark. It won't affect correctness
	of any subsystem. */
	/* lru_flush_avg below is rate at which pages are flushed as
	part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
	number of pages flushed in the current interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
			/ BUF_FLUSH_STAT_N_INTERVAL
			+ (buf_lru_flush_page_count
			   - buf_flush_stat_cur.n_flushed);

	n_flush_req = (n_dirty * redo_avg) / log_capacity;

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the
	LRU list */
	rate = n_flush_req - lru_flush_avg;
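
	/* Worked example (hypothetical numbers): with n_dirty = 1000
	pages, redo_avg = 1000000 bytes/s and log_capacity = 100000000
	bytes, n_flush_req = 1000 * 1000000 / 100000000 = 10 pages/s.
	If lru_flush_avg = 4 pages/s, the desired flush_list rate is
	10 - 4 = 6 pages/s; a negative difference is clamped to 0
	below. */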
	return(rate > 0 ? (ulint) rate : 0);
}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	buf_page_t*		bpage;
	const ib_rbt_node_t*	rnode = NULL;

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
			 ut_ad(ut_list_node_313->in_flush_list));

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode i.e.: flush_rbt != NULL
	then each block in the flush_list must also be present
	in the flush_rbt. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		rnode = rbt_first(buf_pool->flush_rbt);
	}

	while (bpage != NULL) {
		const ib_uint64_t om = bpage->oldest_modification;

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		ut_ad(bpage->in_flush_list);

		/* A page in buf_pool->flush_list can be in
		BUF_BLOCK_REMOVE_HASH state. This happens when a page
		is in the middle of being relocated. In that case the
		original descriptor can have this state and still be
		in the flush list waiting to acquire the
		buf_pool->flush_list_mutex to complete the relocation. */
		ut_a(buf_page_in_file(bpage)
		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
		ut_a(om > 0);

		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
			buf_page_t** prpage;

			ut_a(rnode);
			prpage = rbt_value(buf_page_t*, rnode);

			ut_a(*prpage);
			ut_a(*prpage == bpage);
			rnode = rbt_next(buf_pool->flush_rbt, rnode);
		}

		bpage = UT_LIST_GET_NEXT(flush_list, bpage);

		ut_a(!bpage || om >= bpage->oldest_modification);
	}

	/* By this time we must have exhausted the traversal of
	flush_rbt (if active) as well. */
	ut_a(rnode == NULL);

	return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
UNIV_INTERN
ibool
buf_flush_validate(
/*===============*/
	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
{
	ibool	ret;

	buf_flush_list_mutex_enter(buf_pool);

	ret = buf_flush_validate_low(buf_pool);

	buf_flush_list_mutex_exit(buf_pool);

	return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */