local merge

bbc2940b · Tor Didriksen · 35c5d31f · d259243a · bbc2940b · bbc2940b
Commit bbc2940b authored Aug 29, 2011 by Tor Didriksen
35 changed files
--- a/mysql-test/suite/innodb_plugin/r/innodb-index.result
+++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result
@@ -1024,6 +1024,15 @@ INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r);
 UPDATE t1 SET a=1000;
 DELETE FROM t1;
 DROP TABLE t1;
+CREATE TABLE bug12547647(
+a INT NOT NULL, b BLOB NOT NULL, c TEXT,
+PRIMARY KEY (b(10), a), INDEX (c(10))
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731));
+COMMIT;
+UPDATE bug12547647 SET c = REPEAT('b',16928);
+ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs
+DROP TABLE bug12547647;
 set global innodb_file_per_table=0;
 set global innodb_file_format=Antelope;
 set global innodb_file_format_check=Antelope;

--- a/mysql-test/suite/innodb_plugin/t/innodb-index.test
+++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test
@@ -480,6 +480,19 @@ DELETE FROM t1;
 -- sleep 10
 DROP TABLE t1;

+# Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE
+CREATE TABLE bug12547647(
+a INT NOT NULL, b BLOB NOT NULL, c TEXT,
+PRIMARY KEY (b(10), a), INDEX (c(10))
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+
+INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731));
+COMMIT;
+# The following used to cause infinite undo log allocation.
+--error ER_TOO_BIG_ROWSIZE
+UPDATE bug12547647 SET c = REPEAT('b',16928);
+DROP TABLE bug12547647;
+
 eval set global innodb_file_per_table=$per_table;
 eval set global innodb_file_format=$format;
 eval set global innodb_file_format_check=$format;

--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -300,29 +300,30 @@ btr_page_alloc_for_ibuf(
 /******************************************************************
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents! */
-
-page_t*
-btr_page_alloc(
-/*===========*/
-					/* out: new allocated page, x-latched;
-					NULL if out of space */
+static
+ulint
+btr_page_alloc_low(
+/*===============*/
+					/* out: allocated page number,
+					FIL_NULL if out of space */
 	dict_index_t*	index,		/* in: index */
 	ulint		hint_page_no,	/* in: hint of a good page */
 	byte		file_direction,	/* in: direction where a possible
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr)		/* in: mtr */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/* in/out: mini-transaction
+					in which the page should be
+					initialized (may be the same
+					as mtr), or NULL if it should
+					not be initialized (the page
+					at hint was previously freed
+					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
-	page_t*		new_page;
-	ulint		new_page_no;
-
-	if (index->type & DICT_IBUF) {
-
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}

 	root = btr_root_get(index, mtr);

@@ -336,19 +337,61 @@ btr_page_alloc(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */

-	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-						   file_direction, TRUE, mtr);
+	return(fseg_alloc_free_page_general(seg_header, hint_page_no,
+					    file_direction, TRUE,
+					    mtr, init_mtr));
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents! */
+
+page_t*
+btr_page_alloc(
+/*===========*/
+					/* out:	new allocated page, x-latched;
+					NULL if out of space */
+	dict_index_t*	index,		/* in: index */
+	ulint		hint_page_no,	/* in: hint of a good page */
+	byte		file_direction,	/* in: direction where a possible
+					page split is made */
+	ulint		level,		/* in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/* in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+{
+	page_t*		new_page;
+	ulint		new_page_no;
+
+	if (index->type & DICT_IBUF) {
+
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
+
+	new_page_no = btr_page_alloc_low(
+		index, hint_page_no, file_direction, level, mtr, init_mtr);
+
 	if (new_page_no == FIL_NULL) {

 		return(NULL);
 	}

 	new_page = buf_page_get(dict_index_get_space(index), new_page_no,
-				RW_X_LATCH, mtr);
+				RW_X_LATCH, init_mtr);
 #ifdef UNIV_SYNC_DEBUG
 	buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
 #endif /* UNIV_SYNC_DEBUG */

+	if (mtr->freed_clust_leaf) {
+		/* The memo stores block pointers, not page frames. */
+		buf_block_t*	block = buf_block_align(new_page);
+		mtr_memo_release(mtr, block, MTR_MEMO_FREE_CLUST_LEAF);
+		ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_FREE_CLUST_LEAF));
+	}
+	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_page);
 }

@@ -464,6 +507,16 @@ btr_page_free_low(
 	page_no = buf_frame_get_page_no(page);

 	fseg_free_page(seg_header, space, page_no, mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+				MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }

 /******************************************************************
@@ -479,13 +532,144 @@ btr_page_free(
 {
 	ulint		level;

+	ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
 	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
 				MTR_MEMO_PAGE_X_FIX));
 	level = btr_page_get_level(page, mtr);

 	btr_page_free_low(index, page, level, mtr);
+
+	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
+	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
+				MTR_MEMO_PAGE_X_FIX));
+
+	if (level == 0 && (index->type & DICT_CLUSTERED)) {
+		/* We may have to call btr_mark_freed_leaves() to
+		temporarily mark the block nonfree for invoking
+		btr_store_big_rec_extern_fields() after an
+		update. Remember that the block was freed. */
+		mtr->freed_clust_leaf = TRUE;
+		mtr_memo_push(mtr, buf_block_align(page),
+			      MTR_MEMO_FREE_CLUST_LEAF);
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
 }

+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/* in/out: clustered index */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	ibool		nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */
+{
+	/* This is loosely based on mtr_memo_release(). */
+
+	ulint	offset;
+
+	ut_ad(index->type & DICT_CLUSTERED);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	if (!mtr->freed_clust_leaf) {
+		return; /* no leaf page was tagged in this mtr */
+	}
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) { /* from the last memo slot to the first */
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(buf_block_get_space(block)
+		     == dict_index_get_space(index));
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
+
+		if (nonfree) {
+			/* Allocate the same page again, uninitialized. */
+			ulint	page_no;
+			page_no = btr_page_alloc_low(
+				index, buf_block_get_page_no(block),
+				FSP_NO_DIR, 0, mtr, NULL);
+			ut_a(page_no == buf_block_get_page_no(block));
+		} else {
+			/* Assert that the page is allocated and free it. */
+			btr_page_free_low(index, buf_block_get_frame(block),
+					  0, mtr);
+		}
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+See btr_mark_freed_leaves(). */
+
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+			/* out: TRUE; a violation fails ut_a() */
+	mtr_t*	mtr)	/* in: mini-transaction */
+{
+	ulint	offset;
+
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) { /* from the last memo slot to the first */
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		ut_a(mtr->freed_clust_leaf); /* a tagged slot implies the flag */
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
+	}
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
 /******************************************************************
 Sets the child node file address in a node pointer. */
 UNIV_INLINE
@@ -1015,7 +1199,7 @@ btr_root_raise_and_insert(
 	a node pointer to the new page, and then splitting the new page. */

 	new_page = btr_page_alloc(index, 0, FSP_NO_DIR,
-				  btr_page_get_level(root, mtr), mtr);
+				  btr_page_get_level(root, mtr), mtr, mtr);

 	btr_page_create(new_page, index, mtr);

@@ -1636,7 +1820,7 @@ btr_page_split_and_insert(

 	/* 2. Allocate a new page to the index */
 	new_page = btr_page_alloc(cursor->index, hint_page_no, direction,
-				  btr_page_get_level(page, mtr), mtr);
+				  btr_page_get_level(page, mtr), mtr, mtr);
 	btr_page_create(new_page, cursor->index, mtr);

 	/* 3. Calculate the first record on the upper half-page, and the

--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -2051,43 +2051,6 @@ btr_cur_pessimistic_update(
 	return(err);
 }

-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/* in: cursor */
-	mtr_t*		mtr)	/* in/out: mini-transaction */
-{
-	buf_block_t*	block;
-
-	block = buf_block_align(btr_cur_get_rec(cursor));
-
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
-				MTR_MEMO_X_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	/* Keep the locks across the mtr_commit(mtr). */
-	rw_lock_x_lock(dict_index_get_lock(cursor->index));
-	rw_lock_x_lock(&block->lock);
-	mutex_enter(&block->mutex);
-#ifdef UNIV_SYNC_DEBUG
-	buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
-#else
-	buf_block_buf_fix_inc(block);
-#endif
-	mutex_exit(&block->mutex);
-	/* Write out the redo log. */
-	mtr_commit(mtr);
-	mtr_start(mtr);
-	/* Reassociate the locks with the mini-transaction.
-	They will be released on mtr_commit(mtr). */
-	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
-		      MTR_MEMO_X_LOCK);
-	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
-}
-
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/

 /********************************************************************
@@ -3494,6 +3457,11 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
+	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
 					containing the latch to rec and to the
 					tree */
@@ -3514,6 +3482,8 @@ btr_store_big_rec_extern_fields(
 	ulint	i;
 	mtr_t	mtr;

+	ut_ad(local_mtr);
+	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
@@ -3523,6 +3493,25 @@ btr_store_big_rec_extern_fields(

 	space_id = buf_frame_get_space_id(rec);

+	if (alloc_mtr) {
+		/* Because alloc_mtr will be committed after
+		mtr, it is possible that the tablespace has been
+		extended when the B-tree record was updated or
+		inserted, or it will be extended while allocating
+		pages for big_rec.
+
+		TODO: In mtr (not alloc_mtr), write a redo log record
+		about extending the tablespace to its current size,
+		and remember the current size. Whenever the tablespace
+		grows as pages are allocated, write further redo log
+		records to mtr. (Currently tablespace extension is not
+		covered by the redo log. If it were, the record would
+		only be written to alloc_mtr, which is committed after
+		mtr.) */
+	} else {
+		alloc_mtr = &mtr;
+	}
+
 	/* We have to create a file segment to the tablespace
 	for each field and put the pointer to the field in rec */

@@ -3549,7 +3538,7 @@ btr_store_big_rec_extern_fields(
 			}

 			page = btr_page_alloc(index, hint_page_no,
-					      FSP_NO_DIR, 0, &mtr);
+					      FSP_NO_DIR, 0, alloc_mtr, &mtr);
 			if (page == NULL) {

 				mtr_commit(&mtr);
@@ -3603,37 +3592,42 @@ btr_store_big_rec_extern_fields(

 			extern_len -= store_len;

+			if (alloc_mtr == &mtr) {
 #ifdef UNIV_SYNC_DEBUG
-			rec_page =
+				rec_page =
 #endif /* UNIV_SYNC_DEBUG */
-			buf_page_get(space_id,
-				     buf_frame_get_page_no(data),
-				     RW_X_LATCH, &mtr);
+					buf_page_get(
+						space_id,
+						buf_frame_get_page_no(data),
+						RW_X_LATCH, &mtr);
 #ifdef UNIV_SYNC_DEBUG
-			buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
+				buf_page_dbg_add_level(
+					rec_page, SYNC_NO_ORDER_CHECK);
 #endif /* UNIV_SYNC_DEBUG */
+			}
+
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0,
-					 MLOG_4BYTES, &mtr);
+					 MLOG_4BYTES, alloc_mtr);
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
 					 big_rec_vec->fields[i].len
 					 - extern_len,
-					 MLOG_4BYTES, &mtr);
+					 MLOG_4BYTES, alloc_mtr);

 			if (prev_page_no == FIL_NULL) {
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_SPACE_ID,
 						 space_id,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);

 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_PAGE_NO,
 						 page_no,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);

 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_OFFSET,
 						 FIL_PAGE_DATA,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);

 				/* Set the bit denoting that this field
 				in rec is stored externally */
@@ -3641,7 +3635,7 @@ btr_store_big_rec_extern_fields(
 				rec_set_nth_field_extern_bit(
 					rec, index,
 					big_rec_vec->fields[i].field_no,
-					TRUE, &mtr);
+					TRUE, alloc_mtr);
 			}

 			prev_page_no = page_no;

--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -1008,29 +1008,6 @@ buf_page_peek_block(
 	return(block);
 }

-/************************************************************************
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/* in: space id */
-	ulint	offset)	/* in: page number */
-{
-	buf_block_t*	block;
-
-	mutex_enter_fast(&(buf_pool->mutex));
-
-	block = buf_page_hash_get(space, offset);
-
-	if (block) {
-		block->check_index_page_at_flush = FALSE;
-	}
-
-	mutex_exit(&(buf_pool->mutex));
-}
-
 /************************************************************************
 Returns the current state of is_hashed of a page. FALSE if the page is
 not in the pool. NOTE that this operation does not fix the page in the

--- a/storage/innobase/fsp/fsp0fsp.c
+++ b/storage/innobase/fsp/fsp0fsp.c
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -379,7 +379,11 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr);		/* in: mtr */
+	mtr_t*		mtr,		/* in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr);	/* in/out: mini-transaction
+					for x-latching and initializing
+					the page */
 /******************************************************************
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -402,6 +406,31 @@ btr_page_free_low(
 	page_t*		page,	/* in: page to be freed, x-latched */
 	ulint		level,	/* in: page level */
 	mtr_t*		mtr);	/* in: mtr */
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. Between the
+btr_store_big_rec_extern_fields() and mtr_commit() we have to
+mark the pages free again, so that no pages will be leaked. */
+
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/* in/out: clustered index */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	ibool		nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+See btr_mark_freed_leaves(). */
+
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+			/* out: TRUE if valid */
+	mtr_t*	mtr);	/* in: mini-transaction */
+#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*****************************************************************
 Prints size info of a B-tree. */

--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -252,15 +252,6 @@ btr_cur_pessimistic_update(
 				updates */
 	que_thr_t*	thr,	/* in: query thread */
 	mtr_t*		mtr);	/* in: mtr */
-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/* in: cursor */
-	mtr_t*		mtr);	/* in/out: mini-transaction */
 /***************************************************************
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -471,6 +462,11 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
+	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 	mtr_t*		local_mtr);	/* in: mtr containing the latch to
 					rec and to the tree */
 /***********************************************************************

--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -294,15 +294,6 @@ buf_page_peek_block(
 	ulint	space,	/* in: space id */
 	ulint	offset);/* in: page number */
 /************************************************************************
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/* in: space id */
-	ulint	offset);/* in: page number */
-/************************************************************************
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.
 This function should be called when we free a file page and want the
 debug version to check that it is not accessed any more unless

--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -167,7 +167,7 @@ fseg_alloc_free_page_general(
 /*=========================*/
 				/* out: allocated page offset, FIL_NULL if no
 				page could be allocated */
-	fseg_header_t*	seg_header,/* in: segment header */
+	fseg_header_t*	seg_header,/* in/out: segment header */
 	ulint		hint,	/* in: hint of which page would be desirable */
 	byte		direction,/* in: if the new page is needed because
 				of an index page split, and records are
@@ -179,7 +179,11 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr);	/* in: mtr handle */
+	mtr_t*		mtr,	/* in/out: mini-transaction */
+	mtr_t*		init_mtr);/* in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
 /**************************************************************************
 Reserves free pages from a tablespace. All mini-transactions which may
 use several pages from the tablespace should call this function beforehand

--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -36,6 +36,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
+/* The mini-transaction freed a clustered index leaf page. */
+#define MTR_MEMO_FREE_CLUST_LEAF	57

 /* Log item types: we have made them to be of the type 'byte'
 for the compiler to warn if val and type parameters are switched
@@ -325,9 +327,12 @@ struct mtr_struct{
 	ulint		state;	/* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
 	dyn_array_t	memo;	/* memo stack for locks etc. */
 	dyn_array_t	log;	/* mini-transaction log */
-	ibool		modifications;
+	unsigned	modifications:1;
 				/* TRUE if the mtr made modifications to
 				buffer pool pages */
+	unsigned	freed_clust_leaf:1;
+				/* TRUE if MTR_MEMO_FREE_CLUST_LEAF
+				was pushed to the memo of this mtr */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

--- a/storage/innobase/include/mtr0mtr.ic
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -26,6 +26,7 @@ mtr_start(

 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
+	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;

 #ifdef UNIV_DEBUG
@@ -50,7 +51,8 @@ mtr_memo_push(

 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_X_LOCK);
+	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);


--- a/storage/innobase/mtr/mtr0mtr.c
+++ b/storage/innobase/mtr/mtr0mtr.c
@@ -53,17 +53,13 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
-#ifdef UNIV_DEBUG
-		} else if (type == MTR_MEMO_X_LOCK) {
-			rw_lock_x_unlock((rw_lock_t*)object);
-		} else {
-			ut_ad(type == MTR_MEMO_MODIFY);
+		} else if (type != MTR_MEMO_X_LOCK) {
+			ut_ad(type == MTR_MEMO_MODIFY
+			      || type == MTR_MEMO_FREE_CLUST_LEAF);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
-#else
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
-#endif
 		}
 	}


--- a/storage/innobase/row/row0ins.c
+++ b/storage/innobase/row/row0ins.c
@@ -2089,15 +2089,20 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns while still x-latching
-				index->lock and block->lock. We have
-				to mtr_commit(mtr) first, so that the
-				redo log will be written in the
-				correct order. Otherwise, we would run
-				into trouble on crash recovery if mtr
-				freed B-tree pages on which some of
-				the big_rec fields will be written. */
-				btr_cur_mtr_commit_and_start(&cursor, &mtr);
+				columns, but allocate the pages and
+				write the pointers using the
+				mini-transaction of the record update.
+				If any pages were freed in the update,
+				temporarily mark them allocated so
+				that off-page columns will not
+				overwrite them. We must do this,
+				because we will write the redo log for
+				the BLOB writes before writing the
+				redo log for the record update. Thus,
+				redo log application at crash recovery
+				will see BLOBs being written to free pages. */
+
+				btr_mark_freed_leaves(index, &mtr, TRUE);

 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(rec, index, offsets,
@@ -2105,7 +2110,8 @@ row_ins_index_entry_low(
 							  &heap);

 				err = btr_store_big_rec_extern_fields(
-					index, rec, offsets, big_rec, &mtr);
+					index, rec, offsets, big_rec,
+					&mtr, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2118,6 +2124,9 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
+				/* Free the pages again
+				in order to avoid a leak. */
+				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2165,7 +2174,8 @@ row_ins_index_entry_low(
 					  ULINT_UNDEFINED, &heap);

 		err = btr_store_big_rec_extern_fields(index, rec,
-						      offsets, big_rec, &mtr);
+						      offsets, big_rec,
+						      NULL, &mtr);
 stored_big_rec:
 		if (modify) {
 			dtuple_big_rec_free(big_rec);

--- a/storage/innobase/row/row0row.c
+++ b/storage/innobase/row/row0row.c
@@ -212,23 +212,27 @@ row_build(
 	}

 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	/* This condition can occur during crash recovery before
-	trx_rollback_or_clean_all_without_sess() has completed
-	execution.
-
-	This condition is possible if the server crashed
-	during an insert or update before
-	btr_store_big_rec_extern_fields() did mtr_commit() all
-	BLOB pointers to the clustered index record.
-
-	If the record contains a null BLOB pointer, look up the
-	transaction that holds the implicit lock on this record, and
-	assert that it is active. (In this version of InnoDB, we
-	cannot assert that it was recovered, because there is no
-	trx->is_recovered field.) */
-
-	ut_a(!rec_offs_any_null_extern(rec, offsets)
-	     || trx_assert_active(row_get_rec_trx_id(rec, index, offsets)));
+	if (rec_offs_any_null_extern(rec, offsets)) {
+		/* This condition can occur during crash recovery
+		before trx_rollback_or_clean_all_without_sess() has
+		completed execution.
+
+		This condition is possible if the server crashed
+		during an insert or update before
+		btr_store_big_rec_extern_fields() did mtr_commit() all
+		BLOB pointers to the clustered index record.
+
+		If the record contains a null BLOB pointer, look up the
+		transaction that holds the implicit lock on this record, and
+		assert that it is active. (In this version of InnoDB, we
+		cannot assert that it was recovered, because there is no
+		trx->is_recovered field.) */
+
+		ut_a(trx_assert_active(
+			     row_get_rec_trx_id(rec, index, offsets)));
+		ut_a(trx_undo_roll_ptr_is_insert(
+			     row_get_rec_roll_ptr(rec, index, offsets)));
+	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

 	if (type != ROW_COPY_POINTERS) {

--- a/storage/innobase/row/row0upd.c
+++ b/storage/innobase/row/row0upd.c
@@ -1591,21 +1591,22 @@ row_upd_clust_rec(
 		*offsets_ = (sizeof offsets_) / sizeof *offsets_;

 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns while still
-		x-latching index->lock and block->lock. We have to
-		mtr_commit(mtr) first, so that the redo log will be
-		written in the correct order. Otherwise, we would run
-		into trouble on crash recovery if mtr freed B-tree
-		pages on which some of the big_rec fields will be
-		written. */
-		btr_cur_mtr_commit_and_start(btr_cur, mtr);
-
+		/* Write out the externally stored columns, but
+		allocate the pages and write the pointers using the
+		mini-transaction of the record update. If any pages
+		were freed in the update, temporarily mark them
+		allocated so that off-page columns will not overwrite
+		them. We must do this, because we write the redo log
+		for the BLOB writes before writing the redo log for
+		the record update. */
+
+		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			big_rec, mtr);
+			big_rec, mtr, mtr);
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -1618,6 +1619,8 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
+		/* Free the pages again in order to avoid a leak. */
+		btr_mark_freed_leaves(index, mtr, FALSE);
 	}

 	mtr_commit(mtr);

--- a/storage/innobase/trx/trx0undo.c
+++ b/storage/innobase/trx/trx0undo.c
@@ -864,7 +864,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr);
+					       TRUE, mtr, mtr);

 	fil_space_release_free_extents(undo->space, n_reserved);


--- a/storage/innodb_plugin/ChangeLog
+++ b/storage/innodb_plugin/ChangeLog
+2011-08-29	The InnoDB Team
+
+	* btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c,
+	include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h,
+	include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c,
+	row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c:
+	Fix Bug#12704861 Corruption after a crash during BLOB update
+	and other regressions from the fix of Bug#12612184
+
+2011-08-23	The InnoDB Team
+
+	* include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c:
+	Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE
+
 2011-08-15	The InnoDB Team

 	* btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c,

--- a/storage/innodb_plugin/btr/btr0btr.c
+++ b/storage/innodb_plugin/btr/btr0btr.c
@@ -906,28 +906,29 @@ btr_page_alloc_for_ibuf(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@return	new allocated block, x-latched; NULL if out of space */
-UNIV_INTERN
-buf_block_t*
-btr_page_alloc(
-/*===========*/
+@return	allocated page number, FIL_NULL if out of space */
+static __attribute__((nonnull(1,5), warn_unused_result))
+ulint
+btr_page_alloc_low(
+/*===============*/
 	dict_index_t*	index,		/*!< in: index */
 	ulint		hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr)		/*!< in: mtr */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					in which the page should be
+					initialized (may be the same
+					as mtr), or NULL if it should
+					not be initialized (the page
+					at hint was previously freed
+					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
-	buf_block_t*	new_block;
-	ulint		new_page_no;
-
-	if (dict_index_is_ibuf(index)) {
-
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}

 	root = btr_root_get(index, mtr);

@@ -941,8 +942,42 @@ btr_page_alloc(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */

-	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-						   file_direction, TRUE, mtr);
+	return(fseg_alloc_free_page_general(
+		       seg_header, hint_page_no, file_direction,
+		       TRUE, mtr, init_mtr));
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return	new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+{
+	buf_block_t*	new_block;
+	ulint		new_page_no;
+
+	if (dict_index_is_ibuf(index)) {
+
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
+
+	new_page_no = btr_page_alloc_low(
+		index, hint_page_no, file_direction, level, mtr, init_mtr);
+
 	if (new_page_no == FIL_NULL) {

 		return(NULL);
@@ -950,9 +985,16 @@ btr_page_alloc(

 	new_block = buf_page_get(dict_index_get_space(index),
 				 dict_table_zip_size(index->table),
-				 new_page_no, RW_X_LATCH, mtr);
+				 new_page_no, RW_X_LATCH, init_mtr);
 	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);

+	if (mtr->freed_clust_leaf) {
+		mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF);
+		ut_ad(!mtr_memo_contains(mtr, new_block,
+					 MTR_MEMO_FREE_CLUST_LEAF));
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_block);
 }

@@ -1065,6 +1107,15 @@ btr_page_free_low(
 	fseg_free_page(seg_header,
 		       buf_block_get_space(block),
 		       buf_block_get_page_no(block), mtr);
+
+	/* The page was marked free in the allocation bitmap, but it
+	should remain buffer-fixed until mtr_commit(mtr) or until it
+	is explicitly freed from the mini-transaction. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* TODO: Discard any operations on the page from the redo log
+	and remove the block from the flush list and the buffer pool.
+	This would free up buffer pool earlier and reduce writes to
+	both the tablespace and the redo log. */
 }

 /**************************************************************//**
@@ -1078,13 +1129,140 @@ btr_page_free(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	ulint		level;
-
-	level = btr_page_get_level(buf_block_get_frame(block), mtr);
+	const page_t*	page	= buf_block_get_frame(block);
+	ulint		level	= btr_page_get_level(page, mtr);

+	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
 	btr_page_free_low(index, block, level, mtr);
+
+	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	if (level == 0 && dict_index_is_clust(index)) {
+		/* We may have to call btr_mark_freed_leaves() to
+		temporarily mark the block nonfree for invoking
+		btr_store_big_rec_extern_fields_func() after an
+		update. Remember that the block was freed. */
+		mtr->freed_clust_leaf = TRUE;
+		mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF);
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
 }

+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. After
+btr_store_big_rec_extern_fields() and before mtr_commit(), we have to
+mark the pages free again, so that no pages will be leaked. */
+UNIV_INTERN
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: clustered index */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
+{
+	/* This is loosely based on mtr_memo_release(). */
+
+	ulint	offset;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	if (!mtr->freed_clust_leaf) {
+		return;
+	}
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		mtr_memo_slot_t*	slot;
+		buf_block_t*		block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(buf_block_get_space(block)
+		     == dict_index_get_space(index));
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(page_is_leaf(buf_block_get_frame(block)));
+
+		if (nonfree) {
+			/* Allocate the same page again. */
+			ulint	page_no;
+			page_no = btr_page_alloc_low(
+				index, buf_block_get_page_no(block),
+				FSP_NO_DIR, 0, mtr, NULL);
+			ut_a(page_no == buf_block_get_page_no(block));
+		} else {
+			/* Assert that the page is allocated and free it. */
+			btr_page_free_low(index, block, 0, mtr);
+		}
+	}
+
+	ut_ad(btr_freed_leaves_validate(mtr));
+}
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+@see btr_mark_freed_leaves()
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+	ulint	offset;
+
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	offset = dyn_array_get_data_size(&mtr->memo);
+
+	while (offset > 0) {
+		const mtr_memo_slot_t*	slot;
+		const buf_block_t*	block;
+
+		offset -= sizeof *slot;
+
+		slot = dyn_array_get_element(&mtr->memo, offset);
+
+		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
+			continue;
+		}
+
+		ut_a(mtr->freed_clust_leaf);
+		/* Because btr_page_alloc() does invoke
+		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
+		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
+		memo must still be clustered index leaf tree pages. */
+		block = slot->object;
+		ut_a(fil_page_get_type(buf_block_get_frame(block))
+		     == FIL_PAGE_INDEX);
+		ut_a(page_is_leaf(buf_block_get_frame(block)));
+	}
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
 /**************************************************************//**
 Sets the child node file address in a node pointer. */
 UNIV_INLINE
@@ -1806,7 +1984,7 @@ btr_root_raise_and_insert(

 	level = btr_page_get_level(root, mtr);

-	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
+	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	ut_a(!new_page_zip == !root_page_zip);
@@ -2542,7 +2720,7 @@ btr_page_split_and_insert(

 	/* 2. Allocate a new page to the index */
 	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
-				   btr_page_get_level(page, mtr), mtr);
+				   btr_page_get_level(page, mtr), mtr, mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	btr_page_create(new_block, new_page_zip, cursor->index,

--- a/storage/innodb_plugin/btr/btr0cur.c
+++ b/storage/innodb_plugin/btr/btr0cur.c
@@ -2414,39 +2414,6 @@ btr_cur_pessimistic_update(
 	return(err);
 }

-/**************************************************************//**
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-UNIV_INTERN
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	buf_block_t*	block;
-
-	block = btr_cur_get_block(cursor);
-
-	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
-				MTR_MEMO_X_LOCK));
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	/* Keep the locks across the mtr_commit(mtr). */
-	rw_lock_x_lock(dict_index_get_lock(cursor->index));
-	rw_lock_x_lock(&block->lock);
-	mutex_enter(&block->mutex);
-	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
-	mutex_exit(&block->mutex);
-	/* Write out the redo log. */
-	mtr_commit(mtr);
-	mtr_start(mtr);
-	/* Reassociate the locks with the mini-transaction.
-	They will be released on mtr_commit(mtr). */
-	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
-		      MTR_MEMO_X_LOCK);
-	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
-}
-
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/

 /****************************************************************//**
@@ -3901,6 +3868,9 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -3909,9 +3879,11 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					to be stored externally */
-
+	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
 {
 	ulint	rec_page_no;
 	byte*	field_ref;
@@ -3930,6 +3902,9 @@ btr_store_big_rec_extern_fields_func(

 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(rec_offs_any_extern(offsets));
+	ut_ad(local_mtr);
+	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
+	ut_ad(!update_in_place || alloc_mtr);
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
 	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
@@ -3945,6 +3920,25 @@ btr_store_big_rec_extern_fields_func(
 	rec_page_no = buf_block_get_page_no(rec_block);
 	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);

+	if (alloc_mtr) {
+		/* Because alloc_mtr will be committed after
+		mtr, it is possible that the tablespace has been
+		extended when the B-tree record was updated or
+		inserted, or it will be extended while allocating
+		pages for big_rec.
+
+		TODO: In mtr (not alloc_mtr), write a redo log record
+		about extending the tablespace to its current size,
+		and remember the current size. Whenever the tablespace
+		grows as pages are allocated, write further redo log
+		records to mtr. (Currently tablespace extension is not
+		covered by the redo log. If it were, the record would
+		only be written to alloc_mtr, which is committed after
+		mtr.) */
+	} else {
+		alloc_mtr = &mtr;
+	}
+
 	if (UNIV_LIKELY_NULL(page_zip)) {
 		int	err;

@@ -4021,7 +4015,7 @@ btr_store_big_rec_extern_fields_func(
 			}

 			block = btr_page_alloc(index, hint_page_no,
-					       FSP_NO_DIR, 0, &mtr);
+					       FSP_NO_DIR, 0, alloc_mtr, &mtr);
 			if (UNIV_UNLIKELY(block == NULL)) {

 				mtr_commit(&mtr);
@@ -4148,11 +4142,15 @@ btr_store_big_rec_extern_fields_func(
 					goto next_zip_page;
 				}

-				rec_block = buf_page_get(space_id, zip_size,
-							 rec_page_no,
-							 RW_X_LATCH, &mtr);
-				buf_block_dbg_add_level(rec_block,
-							SYNC_NO_ORDER_CHECK);
+				if (alloc_mtr == &mtr) {
+					rec_block = buf_page_get(
+						space_id, zip_size,
+						rec_page_no,
+						RW_X_LATCH, &mtr);
+					buf_block_dbg_add_level(
+						rec_block,
+						SYNC_NO_ORDER_CHECK);
+				}

 				if (err == Z_STREAM_END) {
 					mach_write_to_4(field_ref
@@ -4186,7 +4184,8 @@ btr_store_big_rec_extern_fields_func(

 				page_zip_write_blob_ptr(
 					page_zip, rec, index, offsets,
-					big_rec_vec->fields[i].field_no, &mtr);
+					big_rec_vec->fields[i].field_no,
+					alloc_mtr);

 next_zip_page:
 				prev_page_no = page_no;
@@ -4231,19 +4230,23 @@ btr_store_big_rec_extern_fields_func(

 				extern_len -= store_len;

-				rec_block = buf_page_get(space_id, zip_size,
-							 rec_page_no,
-							 RW_X_LATCH, &mtr);
-				buf_block_dbg_add_level(rec_block,
-							SYNC_NO_ORDER_CHECK);
+				if (alloc_mtr == &mtr) {
+					rec_block = buf_page_get(
+						space_id, zip_size,
+						rec_page_no,
+						RW_X_LATCH, &mtr);
+					buf_block_dbg_add_level(
+						rec_block,
+						SYNC_NO_ORDER_CHECK);
+				}

 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);
 				mlog_write_ulint(field_ref
 						 + BTR_EXTERN_LEN + 4,
 						 big_rec_vec->fields[i].len
 						 - extern_len,
-						 MLOG_4BYTES, &mtr);
+						 MLOG_4BYTES, alloc_mtr);

 				if (prev_page_no == FIL_NULL) {
 					btr_blob_dbg_add_blob(
@@ -4253,18 +4256,19 @@ btr_store_big_rec_extern_fields_func(

 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_SPACE_ID,
-							 space_id,
-							 MLOG_4BYTES, &mtr);
+							 space_id, MLOG_4BYTES,
+							 alloc_mtr);

 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_PAGE_NO,
-							 page_no,
-							 MLOG_4BYTES, &mtr);
+							 page_no, MLOG_4BYTES,
+							 alloc_mtr);

 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_OFFSET,
 							 FIL_PAGE_DATA,
-							 MLOG_4BYTES, &mtr);
+							 MLOG_4BYTES,
+							 alloc_mtr);
 				}

 				prev_page_no = page_no;

--- a/storage/innodb_plugin/buf/buf0buf.c
+++ b/storage/innodb_plugin/buf/buf0buf.c
@@ -1174,29 +1174,6 @@ buf_page_set_accessed_make_young(
 	}
 }

-/********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset)	/*!< in: page number */
-{
-	buf_block_t*	block;
-
-	buf_pool_mutex_enter();
-
-	block = (buf_block_t*) buf_page_hash_get(space, offset);
-
-	if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
-		block->check_index_page_at_flush = FALSE;
-	}
-
-	buf_pool_mutex_exit();
-}
-
 /********************************************************************//**
 Returns the current state of is_hashed of a page. FALSE if the page is
 not in the pool. NOTE that this operation does not fix the page in the

--- a/storage/innodb_plugin/fsp/fsp0fsp.c
+++ b/storage/innodb_plugin/fsp/fsp0fsp.c
--- a/storage/innodb_plugin/include/btr0btr.h
+++ b/storage/innodb_plugin/include/btr0btr.h
@@ -557,7 +557,12 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr);		/*!< in: mtr */
+	mtr_t*		mtr,		/*!< in/out: mini-transaction
+					for the allocation */
+	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
+					for x-latching and initializing
+					the page */
+	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -580,6 +585,33 @@ btr_page_free_low(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	ulint		level,	/*!< in: page level */
 	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
+For invoking btr_store_big_rec_extern_fields() after an update,
+we must temporarily mark freed clustered index pages allocated, so
+that off-page columns will not be allocated from them. After
+btr_store_big_rec_extern_fields() and before mtr_commit(), we have to
+mark the pages free again, so that no pages will be leaked. */
+UNIV_INTERN
+void
+btr_mark_freed_leaves(
+/*==================*/
+	dict_index_t*	index,	/*!< in/out: clustered index */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
+@see btr_mark_freed_leaves()
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_freed_leaves_validate(
+/*======================*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+	__attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*************************************************************//**
 Prints size info of a B-tree. */

--- a/storage/innodb_plugin/include/btr0cur.h
+++ b/storage/innodb_plugin/include/btr0cur.h
@@ -326,16 +326,6 @@ btr_cur_pessimistic_update(
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr);	/*!< in: mtr; must be committed before
 				latching any further pages */
-/*****************************************************************
-Commits and restarts a mini-transaction so that it will retain an
-x-lock on index->lock and the cursor page. */
-UNIV_INTERN
-void
-btr_cur_mtr_commit_and_start(
-/*=========================*/
-	btr_cur_t*	cursor,	/*!< in: cursor */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	__attribute__((nonnull));
 /***********************************************************//**
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -540,6 +530,8 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -548,9 +540,12 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					to be stored externally */
-	__attribute__((nonnull));
+	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+					in an update, local_mtr for
+					allocating BLOB pages and
+					updating BLOB pointers; alloc_mtr
+					must not have freed any leaf pages */
+	__attribute__((nonnull(1,2,3,4,5), warn_unused_result));

 /** Stores the fields in big_rec_vec to the tablespace and puts pointers to
 them in rec.  The extern flags in rec will have to be set beforehand.
@@ -559,21 +554,22 @@ file segment of the index tree.
 @param index	in: clustered index; MUST be X-latched by mtr
 @param b	in/out: block containing rec; MUST be X-latched by mtr
 @param rec	in/out: clustered index record
-@param offsets	in: rec_get_offsets(rec, index);
+@param offs	in: rec_get_offsets(rec, index);
 		the "external storage" flags in offsets will not be adjusted
+@param big	in: vector containing fields to be stored externally
 @param mtr	in: mini-transaction that holds x-latch on index and b
 @param upd	in: TRUE if the record is updated in place (not delete+insert)
-@param big	in: vector containing fields to be stored externally
+@param rmtr	in/out: in updates, the mini-transaction that holds rec
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 #ifdef UNIV_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr)
 #elif defined UNIV_BLOB_LIGHT_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr)
 #else
-# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big)
+# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr)
 #endif

 /*******************************************************************//**

--- a/storage/innodb_plugin/include/buf0buf.h
+++ b/storage/innodb_plugin/include/buf0buf.h
@@ -372,15 +372,6 @@ buf_page_peek(
 /*==========*/
 	ulint	space,	/*!< in: space id */
 	ulint	offset);/*!< in: page number */
-/********************************************************************//**
-Resets the check_index_page_at_flush field of a page if found in the buffer
-pool. */
-UNIV_INTERN
-void
-buf_reset_check_index_page_at_flush(
-/*================================*/
-	ulint	space,	/*!< in: space id */
-	ulint	offset);/*!< in: page number */
 #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
 /********************************************************************//**
 Sets file_page_was_freed TRUE if the page is found in the buffer pool.

--- a/storage/innodb_plugin/include/fsp0fsp.h
+++ b/storage/innodb_plugin/include/fsp0fsp.h
 /*****************************************************************************

-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -176,19 +176,18 @@ fseg_n_reserved_pages(
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
 file space fragmentation.
-@return	the allocated page offset FIL_NULL if no page could be allocated */
-UNIV_INTERN
-ulint
-fseg_alloc_free_page(
-/*=================*/
-	fseg_header_t*	seg_header, /*!< in: segment header */
-	ulint		hint,	/*!< in: hint of which page would be desirable */
-	byte		direction, /*!< in: if the new page is needed because
+@param[in,out] seg_header	segment header
+@param[in] hint			hint of which page would be desirable
+@param[in] direction		if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
-				FSP_UP, FSP_NO_DIR */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+				FSP_UP, FSP_NO_DIR
+@param[in,out] mtr		mini-transaction
+@return	the allocated page offset, FIL_NULL if no page could be allocated */
+#define fseg_alloc_free_page(seg_header, hint, direction, mtr)		\
+	fseg_alloc_free_page_general(seg_header, hint, direction,	\
+				     FALSE, mtr, mtr)
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -198,7 +197,7 @@ UNIV_INTERN
 ulint
 fseg_alloc_free_page_general(
 /*=========================*/
-	fseg_header_t*	seg_header,/*!< in: segment header */
+	fseg_header_t*	seg_header,/*!< in/out: segment header */
 	ulint		hint,	/*!< in: hint of which page would be desirable */
 	byte		direction,/*!< in: if the new page is needed because
 				of an index page split, and records are
@@ -210,7 +209,12 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr);	/*!< in: mtr handle */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
+				in which the page should be initialized,
+				or NULL if this is a "fake allocation" of
+				a page that was previously freed in mtr */
+	__attribute__((warn_unused_result, nonnull(1,5)));
 /**********************************************************************//**
 Reserves free pages from a tablespace. All mini-transactions which may
 use several pages from the tablespace should call this function beforehand

--- a/storage/innodb_plugin/include/mtr0mtr.h
+++ b/storage/innodb_plugin/include/mtr0mtr.h
 /*****************************************************************************

-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -53,6 +53,8 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
+/** The mini-transaction freed a clustered index leaf page. */
+#define MTR_MEMO_FREE_CLUST_LEAF	57

 /** @name Log item types
 The log items are declared 'byte' so that the compiler can warn if val
@@ -387,9 +389,12 @@ struct mtr_struct{
 #endif
 	dyn_array_t	memo;	/*!< memo stack for locks etc. */
 	dyn_array_t	log;	/*!< mini-transaction log */
-	ibool		modifications;
-				/* TRUE if the mtr made modifications to
-				buffer pool pages */
+	unsigned	modifications:1;
+				/*!< TRUE if the mini-transaction
+				modified buffer pool pages */
+	unsigned	freed_clust_leaf:1;
+				/*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF
+				was logged in the mini-transaction */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

--- a/storage/innodb_plugin/include/mtr0mtr.ic
+++ b/storage/innodb_plugin/include/mtr0mtr.ic
 /*****************************************************************************

-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -44,6 +44,7 @@ mtr_start(

 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
+	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;

 	ut_d(mtr->state = MTR_ACTIVE);
@@ -67,7 +68,8 @@ mtr_memo_push(

 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_X_LOCK);
+	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_ACTIVE);

--- a/storage/innodb_plugin/include/trx0undo.h
+++ b/storage/innodb_plugin/include/trx0undo.h
 /*****************************************************************************

-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -204,17 +204,51 @@ trx_undo_add_page(
 	mtr_t*		mtr);	/*!< in: mtr which does not have a latch to any
 				undo log page; the caller must have reserved
 				the rollback segment mutex */
+/********************************************************************//**
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
+void
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction which does not
+				have a latch to any undo log page or which
+				has allocated the undo log page */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_free_last_page(trx,undo,mtr)	\
+	trx_undo_free_last_page_func(trx,undo,mtr)
+#else /* UNIV_DEBUG */
+# define trx_undo_free_last_page(trx,undo,mtr)	\
+	trx_undo_free_last_page_func(undo,mtr)
+#endif /* UNIV_DEBUG */
+
 /***********************************************************************//**
 Truncates an undo log from the end. This function is used during a rollback
 to free space from an undo log. */
 UNIV_INTERN
 void
-trx_undo_truncate_end(
-/*==================*/
-	trx_t*		trx,	/*!< in: transaction whose undo log it is */
-	trx_undo_t*	undo,	/*!< in: undo log */
-	undo_no_t	limit);	/*!< in: all undo records with undo number
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log */
+	undo_no_t	limit)	/*!< in: all undo records with undo number
 				>= this value should be truncated */
+	__attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_truncate_end(trx,undo,limit)		\
+	trx_undo_truncate_end_func(trx,undo,limit)
+#else /* UNIV_DEBUG */
+# define trx_undo_truncate_end(trx,undo,limit)		\
+	trx_undo_truncate_end_func(undo,limit)
+#endif /* UNIV_DEBUG */
+
 /***********************************************************************//**
 Truncates an undo log from the start. This function is used during a purge
 operation. */

--- a/storage/innodb_plugin/mtr/mtr0mtr.c
+++ b/storage/innodb_plugin/mtr/mtr0mtr.c
 /*****************************************************************************

-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -58,12 +58,11 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
-#ifdef UNIV_DEBUG
 		} else if (type != MTR_MEMO_X_LOCK) {
-			ut_ad(type == MTR_MEMO_MODIFY);
+			ut_ad(type == MTR_MEMO_MODIFY
+			      || type == MTR_MEMO_FREE_CLUST_LEAF);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
-#endif /* UNIV_DEBUG */
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
 		}

--- a/storage/innodb_plugin/row/row0ins.c
+++ b/storage/innodb_plugin/row/row0ins.c
@@ -2094,15 +2094,20 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns while still x-latching
-				index->lock and block->lock. We have
-				to mtr_commit(mtr) first, so that the
-				redo log will be written in the
-				correct order. Otherwise, we would run
-				into trouble on crash recovery if mtr
-				freed B-tree pages on which some of
-				the big_rec fields will be written. */
-				btr_cur_mtr_commit_and_start(&cursor, &mtr);
+				columns, but allocate the pages and
+				write the pointers using the
+				mini-transaction of the record update.
+				If any pages were freed in the update,
+				temporarily mark them allocated so
+				that off-page columns will not
+				overwrite them. We must do this,
+				because we will write the redo log for
+				the BLOB writes before writing the
+				redo log for the record update. Thus,
+				redo log application at crash recovery
+				will see BLOBs being written to free pages. */
+
+				btr_mark_freed_leaves(index, &mtr, TRUE);

 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(
@@ -2111,7 +2116,8 @@ row_ins_index_entry_low(

 				err = btr_store_big_rec_extern_fields(
 					index, btr_cur_get_block(&cursor),
-					rec, offsets, &mtr, FALSE, big_rec);
+					rec, offsets, big_rec, &mtr,
+					FALSE, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2124,6 +2130,9 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
+				/* Free the pages again
+				in order to avoid a leak. */
+				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2165,7 +2174,7 @@ row_ins_index_entry_low(

 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(&cursor),
-			rec, offsets, &mtr, FALSE, big_rec);
+			rec, offsets, big_rec, &mtr, FALSE, NULL);

 stored_big_rec:
 		if (modify) {

--- a/storage/innodb_plugin/row/row0row.c
+++ b/storage/innodb_plugin/row/row0row.c
@@ -243,19 +243,20 @@ row_build(
 	}

 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	/* This condition can occur during crash recovery before
-	trx_rollback_active() has completed execution.
-
-	This condition is possible if the server crashed
-	during an insert or update before
-	btr_store_big_rec_extern_fields() did mtr_commit() all
-	BLOB pointers to the clustered index record.
-
-	If the record contains a null BLOB pointer, look up the
-	transaction that holds the implicit lock on this record, and
-	assert that it was recovered (and will soon be rolled back). */
-	ut_a(!rec_offs_any_null_extern(rec, offsets)
-	     || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets)));
+	if (rec_offs_any_null_extern(rec, offsets)) {
+		/* This condition can occur during crash recovery
+		before trx_rollback_active() has completed execution.
+
+		This condition is possible if the server crashed
+		during an insert or update-by-delete-and-insert before
+		btr_store_big_rec_extern_fields() did mtr_commit() all
+		BLOB pointers to the freshly inserted clustered index
+		record. */
+		ut_a(trx_assert_recovered(
+			     row_get_rec_trx_id(rec, index, offsets)));
+		ut_a(trx_undo_roll_ptr_is_insert(
+			     row_get_rec_roll_ptr(rec, index, offsets)));
+	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

 	if (type != ROW_COPY_POINTERS) {

--- a/storage/innodb_plugin/row/row0upd.c
+++ b/storage/innodb_plugin/row/row0upd.c
@@ -1978,21 +1978,22 @@ row_upd_clust_rec(
 		rec_offs_init(offsets_);

 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns while still
-		x-latching index->lock and block->lock. We have to
-		mtr_commit(mtr) first, so that the redo log will be
-		written in the correct order. Otherwise, we would run
-		into trouble on crash recovery if mtr freed B-tree
-		pages on which some of the big_rec fields will be
-		written. */
-		btr_cur_mtr_commit_and_start(btr_cur, mtr);
-
+		/* Write out the externally stored columns, but
+		allocate the pages and write the pointers using the
+		mini-transaction of the record update. If any pages
+		were freed in the update, temporarily mark them
+		allocated so that off-page columns will not overwrite
+		them. We must do this, because we write the redo log
+		for the BLOB writes before writing the redo log for
+		the record update. */
+
+		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(btr_cur), rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			mtr, TRUE, big_rec);
+			big_rec, mtr, TRUE, mtr);
 		/* If writing big_rec fails (for example, because of
 		DB_OUT_OF_FILE_SPACE), the record will be corrupted.
 		Even if we did not update any externally stored
@@ -2002,6 +2003,8 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
+		/* Free the pages again in order to avoid a leak. */
+		btr_mark_freed_leaves(index, mtr, FALSE);
 	}

 	mtr_commit(mtr);

--- a/storage/innodb_plugin/trx/trx0rec.c
+++ b/storage/innodb_plugin/trx/trx0rec.c
@@ -1097,22 +1097,29 @@ trx_undo_rec_get_partial_row(
 #endif /* !UNIV_HOTBACKUP */

 /***********************************************************************//**
-Erases the unused undo log page end. */
-static
-void
+Erases the unused undo log page end.
+@return TRUE if the page contained something, FALSE if it was empty */
+static __attribute__((nonnull, warn_unused_result))
+ibool
 trx_undo_erase_page_end(
 /*====================*/
-	page_t*	undo_page,	/*!< in: undo page whose end to erase */
-	mtr_t*	mtr)		/*!< in: mtr */
+	page_t*	undo_page,	/*!< in/out: undo page whose end to erase */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	ulint	first_free;

 	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
 				      + TRX_UNDO_PAGE_FREE);
+	if (first_free == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+		/* This was an empty page to begin with.
+		Do nothing here; the caller should free the page. */
+		return(FALSE);
+	}
 	memset(undo_page + first_free, 0xff,
 	       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free);

 	mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+	return(TRUE);
 }

 /***********************************************************//**
@@ -1134,7 +1141,11 @@ trx_undo_parse_erase_page_end(
 		return(ptr);
 	}

-	trx_undo_erase_page_end(page, mtr);
+	if (!trx_undo_erase_page_end(page, mtr)) {
+		/* The function trx_undo_erase_page_end() should not
+		have done anything to an empty page. */
+		ut_ad(0);
+	}

 	return(ptr);
 }
@@ -1180,6 +1191,9 @@ trx_undo_report_row_operation(
 	mem_heap_t*	heap		= NULL;
 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
 	ulint*		offsets		= offsets_;
+#ifdef UNIV_DEBUG
+	int		loop_count	= 0;
+#endif /* UNIV_DEBUG */
 	rec_offs_init(offsets_);

 	ut_a(dict_index_is_clust(index));
@@ -1242,7 +1256,7 @@ trx_undo_report_row_operation(

 	mtr_start(&mtr);

-	for (;;) {
+	do {
 		buf_block_t*	undo_block;
 		page_t*		undo_page;
 		ulint		offset;
@@ -1271,7 +1285,19 @@ trx_undo_report_row_operation(
 			version the replicate page constructed using the log
 			records stays identical to the original page */

-			trx_undo_erase_page_end(undo_page, &mtr);
+			if (!trx_undo_erase_page_end(undo_page, &mtr)) {
+				/* The record did not fit on an empty
+				undo page. Discard the freshly allocated
+				page and return an error. */
+
+				mutex_enter(&rseg->mutex);
+				trx_undo_free_last_page(trx, undo, &mtr);
+				mutex_exit(&rseg->mutex);
+
+				err = DB_TOO_BIG_RECORD;
+				goto err_exit;
+			}
+
 			mtr_commit(&mtr);
 		} else {
 			/* Success */
@@ -1291,16 +1317,15 @@ trx_undo_report_row_operation(
 			*roll_ptr = trx_undo_build_roll_ptr(
 				op_type == TRX_UNDO_INSERT_OP,
 				rseg->id, page_no, offset);
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-			return(DB_SUCCESS);
+			err = DB_SUCCESS;
+			goto func_exit;
 		}

 		ut_ad(page_no == undo->last_page_no);

 		/* We have to extend the undo log by one page */

+		ut_ad(++loop_count < 2);
 		mtr_start(&mtr);

 		/* When we add a page to an undo log, this is analogous to
@@ -1312,18 +1337,19 @@ trx_undo_report_row_operation(
 		page_no = trx_undo_add_page(trx, undo, &mtr);

 		mutex_exit(&(rseg->mutex));
+	} while (UNIV_LIKELY(page_no != FIL_NULL));

-		if (UNIV_UNLIKELY(page_no == FIL_NULL)) {
-			/* Did not succeed: out of space */
+	/* Did not succeed: out of space */
+	err = DB_OUT_OF_FILE_SPACE;

-			mutex_exit(&(trx->undo_mutex));
-			mtr_commit(&mtr);
-			if (UNIV_LIKELY_NULL(heap)) {
-				mem_heap_free(heap);
-			}
-			return(DB_OUT_OF_FILE_SPACE);
-		}
+err_exit:
+	mutex_exit(&trx->undo_mutex);
+	mtr_commit(&mtr);
+func_exit:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
 	}
+	return(err);
 }

 /*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/

--- a/storage/innodb_plugin/trx/trx0undo.c
+++ b/storage/innodb_plugin/trx/trx0undo.c
 /*****************************************************************************

-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -912,7 +912,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr);
+					       TRUE, mtr, mtr);

 	fil_space_release_free_extents(undo->space, n_reserved);

@@ -998,29 +998,28 @@ trx_undo_free_page(
 }

 /********************************************************************//**
-Frees an undo log page when there is also the memory object for the undo
-log. */
-static
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
 void
-trx_undo_free_page_in_rollback(
-/*===========================*/
-	trx_t*		trx __attribute__((unused)), /*!< in: transaction */
-	trx_undo_t*	undo,	/*!< in: undo log memory copy */
-	ulint		page_no,/*!< in: page number to free: must not be the
-				header page */
-	mtr_t*		mtr)	/*!< in: mtr which does not have a latch to any
-				undo log page; the caller must have reserved
-				the rollback segment mutex */
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction */
+#endif /* UNIV_DEBUG */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction which does not
+				have a latch to any undo log page or which
+				has allocated the undo log page */
 {
-	ulint	last_page_no;
-
-	ut_ad(undo->hdr_page_no != page_no);
-	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&trx->undo_mutex));
+	ut_ad(undo->hdr_page_no != undo->last_page_no);
+	ut_ad(undo->size > 0);

-	last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space,
-					  undo->hdr_page_no, page_no, mtr);
+	undo->last_page_no = trx_undo_free_page(
+		undo->rseg, FALSE, undo->space,
+		undo->hdr_page_no, undo->last_page_no, mtr);

-	undo->last_page_no = last_page_no;
 	undo->size--;
 }

@@ -1056,9 +1055,11 @@ Truncates an undo log from the end. This function is used during a rollback
 to free space from an undo log. */
 UNIV_INTERN
 void
-trx_undo_truncate_end(
-/*==================*/
-	trx_t*		trx,	/*!< in: transaction whose undo log it is */
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+	const trx_t*	trx,	/*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
 	trx_undo_t*	undo,	/*!< in: undo log */
 	undo_no_t	limit)	/*!< in: all undo records with undo number
 				>= this value should be truncated */
@@ -1084,18 +1085,7 @@ trx_undo_truncate_end(

 		rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
 						 undo->hdr_offset);
-		for (;;) {
-			if (rec == NULL) {
-				if (last_page_no == undo->hdr_page_no) {
-
-					goto function_exit;
-				}
-
-				trx_undo_free_page_in_rollback(
-					trx, undo, last_page_no, &mtr);
-				break;
-			}
-
+		while (rec) {
 			if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit)
 			    >= 0) {
 				/* Truncate at least this record off, maybe
@@ -1110,6 +1100,14 @@ trx_undo_truncate_end(
 							 undo->hdr_offset);
 		}

+		if (last_page_no == undo->hdr_page_no) {
+
+			goto function_exit;
+		}
+
+		ut_ad(last_page_no == undo->last_page_no);
+		trx_undo_free_last_page(trx, undo, &mtr);
+
 		mtr_commit(&mtr);
 	}