// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
				       u64 now, int rw)
{
	u64 latency_capable =
		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
	/* ideally we'd be taking into account the device's variance here: */
	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
	s64 latency_over = io_latency - latency_threshold;

	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so don't
		 * bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);

		ca->congested_last = now;
	} else if (atomic_read(&ca->congested) > 0) {
		atomic_dec(&ca->congested);
	}
}

void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	atomic64_t *latency = &ca->cur_latency[rw];
	u64 now = local_clock();
	u64 io_latency = time_after64(now, submit_time)
		? now - submit_time
		: 0;
	u64 old, new, v = atomic64_read(latency);

	do {
		old = v;

		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most of
		 * the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);

	bch2_congested_acct(ca, io_latency, now, rw);

	__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}

#endif

/* Allocate, free from mempool: */

void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bio_for_each_segment_all(bv, bio, iter)
		if (bv->bv_page != ZERO_PAGE(0))
			mempool_free(bv->bv_page, &c->bio_bounce_pages);
	bio->bi_vcnt = 0;
}

static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{
	struct page *page;

	if (likely(!*using_mempool)) {
		page = alloc_page(GFP_NOFS);
		if (unlikely(!page)) {
			mutex_lock(&c->bio_bounce_pages_lock);
			*using_mempool = true;
			goto pool_alloc;

		}
	} else {
pool_alloc:
		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
	}

	return page;
}

void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
			       size_t size)
{
	bool using_mempool = false;

	while (size) {
		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
		unsigned len = min_t(size_t, PAGE_SIZE, size);

		BUG_ON(!bio_add_page(bio, page, len, 0));
		size -= len;
	}

	if (using_mempool)
		mutex_unlock(&c->bio_bounce_pages_lock);
}

/* Extent update path: */

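/*
 * Scan the existing extents that @new overwrites and work out how the update
 * will change i_sectors and on-disk sector usage, and whether disk usage can
 * increase at all - callers use this to size the disk reservation.
 */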
int bch2_sum_sector_overwrites(struct btree_trans *trans,
			       struct btree_iter *extent_iter,
			       struct bkey_i *new,
			       bool *usage_increasing,
			       s64 *i_sectors_delta,
			       s64 *disk_sectors_delta)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
	int ret = 0;

	*usage_increasing	= false;
	*i_sectors_delta	= 0;
	*disk_sectors_delta	= 0;

	bch2_trans_copy_iter(&iter, extent_iter);

	for_each_btree_key_upto_continue_norestart(iter,
				new->k.p, BTREE_ITER_slots, old, ret) {
		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
			max(bkey_start_offset(&new->k),
			    bkey_start_offset(old.k));

		*i_sectors_delta += sectors *
			(bkey_extent_is_allocation(&new->k) -
			 bkey_extent_is_allocation(old.k));

		*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
		*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
			? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
			: 0;

		if (!*usage_increasing &&
		    (new->k.p.snapshot != old.k->p.snapshot ||
		     new_replicas > bch2_bkey_replicas(c, old) ||
		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
			*usage_increasing = true;

		if (bkey_ge(old.k->p, new->k.p))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

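/*
 * Update the inode to match an extent update in the same transaction: bump
 * bi_size up to @new_i_size and apply @i_sectors_delta; the inode update is
 * only journalled when something besides bi_journal_seq actually changes (see
 * the comment in the function body).
 */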
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	/*
	 * Crazy performance optimization:
	 * Every extent update needs to also update the inode: the inode trigger
	 * will set bi->journal_seq to the journal sequence number of this
	 * transaction - for fsync.
	 *
	 * But if that's the only reason we're updating the inode (we're not
	 * updating bi_size or bi_sectors), then we don't need the inode update
	 * to be journalled - if we crash, the bi_journal_seq update will be
	 * lost, but that's fine.
	 */
	unsigned inode_update_flags = BTREE_UPDATE_nojournal;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			      SPOS(0,
				   extent_iter->pos.inode,
				   extent_iter->snapshot),
			      BTREE_ITER_cached);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ret;

	/*
	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
	 * bytes past the end of the buffer:
	 */
	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
	ret = PTR_ERR_OR_ZERO(k_mut);
	if (unlikely(ret))
		goto err;

	bkey_reassemble(k_mut, k);

	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
		k_mut = bch2_inode_to_v3(trans, k_mut);
		ret = PTR_ERR_OR_ZERO(k_mut);
		if (unlikely(ret))
			goto err;
	}

	struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);

	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
		inode->v.bi_size = cpu_to_le64(new_i_size);
		inode_update_flags = 0;
	}

	if (i_sectors_delta) {
		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
		inode_update_flags = 0;
	}

	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}

	ret = bch2_trans_update(trans, &iter, &inode->k_i,
				BTREE_UPDATE_internal_snapshot_node|
				inode_update_flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

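/*
 * Insert extent @k at @iter: trim it to what can be committed atomically,
 * take any additional disk reservation needed, update the inode's
 * i_size/i_sectors and commit, all in one transaction. On success @iter is
 * left at the end of the range just written.
 */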
int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{
	struct bpos next_pos;
	bool usage_increasing;
	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
	int ret;

	/*
	 * This traverses us the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
			&usage_increasing,
			&i_sectors_delta,
			&disk_sectors_delta);
	if (ret)
		return ret;

	if (disk_res &&
	    disk_sectors_delta > (s64) disk_res->sectors) {
		ret = bch2_disk_reservation_add(trans->c, disk_res,
					disk_sectors_delta - disk_res->sectors,
					!check_enospc || !usage_increasing
					? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			return ret;
	}

	/*
	 * Note:
	 * We always have to do an inode update - even when i_size/i_sectors
	 * aren't changing - for fsync to work properly; fsync relies on
	 * inode->bi_journal_seq which is updated by the trigger code:
	 */
	ret =   bch2_extent_update_i_size_sectors(trans, iter,
						  min(k->k.p.offset << 9, new_i_size),
						  i_sectors_delta) ?:
		bch2_trans_update(trans, iter, k, 0) ?:
		bch2_trans_commit(trans, disk_res, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc);
	if (unlikely(ret))
		return ret;

	if (i_sectors_delta_total)
		*i_sectors_delta_total += i_sectors_delta;
	bch2_btree_iter_set_pos(iter, next_pos);
	return 0;
}

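/*
 * Index update path for ordinary (non-move) writes: insert each key in
 * op->insert_keys into the extents btree via bch2_extent_update(), after
 * resolving the write's subvolume to a snapshot ID.
 */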
static int bch2_write_index_default(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct bkey_buf sk;
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	subvol_inum inum = {
		.subvol = op->subvol,
		.inum	= k->k.p.inode,
	};
	int ret;

	BUG_ON(!inum.subvol);

	bch2_bkey_buf_init(&sk);

	do {
		bch2_trans_begin(trans);

		k = bch2_keylist_front(keys);
		bch2_bkey_buf_copy(&sk, c, k);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
						  &sk.k->k.p.snapshot);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     bkey_start_pos(&sk.k->k),
				     BTREE_ITER_slots|BTREE_ITER_intent);

		ret =   bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
			bch2_extent_update(trans, inum, &iter, sk.k,
					&op->res,
					op->new_i_size, &op->i_sectors_delta,
					op->flags & BCH_WRITE_CHECK_ENOSPC);
		bch2_trans_iter_exit(trans, &iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(iter.pos, k->k.p))
			bch2_keylist_pop_front(&op->insert_keys);
		else
			bch2_cut_front(iter.pos, k);
	} while (!bch2_keylist_empty(keys));

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

/* Writes */

void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
			       enum bch_data_type type,
			       const struct bkey_i *k,
			       bool nocow)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
	struct bch_write_bio *n;

	BUG_ON(c->opts.nochanges);

	bkey_for_each_ptr(ptrs, ptr) {
		BUG_ON(!bch2_dev_exists(c, ptr->dev));

		struct bch_dev *ca = bch2_dev_bkey_exists(c, ptr->dev);

		if (to_entry(ptr + 1) < ptrs.end) {
			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));

			n->bio.bi_end_io	= wbio->bio.bi_end_io;
			n->bio.bi_private	= wbio->bio.bi_private;
			n->parent		= wbio;
			n->split		= true;
			n->bounce		= false;
			n->put_bio		= true;
			n->bio.bi_opf		= wbio->bio.bi_opf;
			bio_inc_remaining(&wbio->bio);
		} else {
			n = wbio;
			n->split		= false;
		}

		n->c			= c;
		n->dev			= ptr->dev;
		n->have_ioref		= nocow || bch2_dev_get_ioref(ca,
					type == BCH_DATA_btree ? READ : WRITE);
		n->nocow		= nocow;
		n->submit_time		= local_clock();
		n->inode_offset		= bkey_start_offset(&k->k);
		n->bio.bi_iter.bi_sector = ptr->offset;

		if (likely(n->have_ioref)) {
			this_cpu_add(ca->io_done->sectors[WRITE][type],
				     bio_sectors(&n->bio));

			bio_set_dev(&n->bio, ca->disk_sb.bdev);

			if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
				bio_endio(&n->bio);
				continue;
			}

			submit_bio(&n->bio);
		} else {
			n->bio.bi_status	= BLK_STS_REMOVED;
			bio_endio(&n->bio);
		}
	}
}

static void __bch2_write(struct bch_write_op *);

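/*
 * Final cleanup once data write and index update are both complete: drop the
 * disk reservation and write ref, free the keylist, and hand the op back to
 * the caller via op->end_io.
 */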
static void bch2_write_done(struct closure *cl)
{
	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
	struct bch_fs *c = op->c;

	EBUG_ON(op->open_buckets.nr);

	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
	bch2_disk_reservation_put(c, &op->res);

	if (!(op->flags & BCH_WRITE_MOVE))
		bch2_write_ref_put(c, BCH_WRITE_REF_write);
	bch2_keylist_free(&op->insert_keys, op->inline_keys);

	EBUG_ON(cl->parent);
	closure_debug_destroy(cl);
	if (op->end_io)
		op->end_io(op);
}

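/*
 * After an IO error, drop pointers to the failed device(s) from the keys we
 * were about to insert; returns -EIO if that would leave an extent with no
 * pointers at all.
 */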
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
	struct keylist *keys = &op->insert_keys;
	struct bkey_i *src, *dst = keys->keys, *n;

	for (src = keys->keys; src != keys->top; src = n) {
		n = bkey_next(src);

		if (bkey_extent_is_direct_data(&src->k)) {
			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
					    test_bit(ptr->dev, op->failed.d));

			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
				return -EIO;
		}

		if (dst != src)
			memmove_u64s_down(dst, src, src->k.u64s);
		dst = bkey_next(dst);
	}

	keys->top = dst;
	return 0;
}

/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:		bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct keylist *keys = &op->insert_keys;
	unsigned dev;
	int ret = 0;

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		ret = bch2_write_drop_io_error_ptrs(op);
		if (ret)
			goto err;
	}

	if (!bch2_keylist_empty(keys)) {
		u64 sectors_start = keylist_sectors(keys);

		ret = !(op->flags & BCH_WRITE_MOVE)
			? bch2_write_index_default(op)
			: bch2_data_update_index_update(op);

		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
		BUG_ON(keylist_sectors(keys) && !ret);

		op->written += sectors_start - keylist_sectors(keys);

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"%s write error while doing btree update: %s",
				op->flags & BCH_WRITE_MOVE ? "move" : "user",
				bch2_err_str(ret));
		}

		if (ret)
			goto err;
	}
out:
	/* If a bucket wasn't written, we can't erasure code it: */
	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
		bch2_open_bucket_write_error(c, &op->open_buckets, dev);

	bch2_open_buckets_put(c, &op->open_buckets);
	return;
err:
	keys->top = keys->keys;
	op->error = ret;
	op->flags |= BCH_WRITE_DONE;
	goto out;
}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
	if (state != wp->state) {
		u64 now = ktime_get_ns();

		if (wp->last_state_change &&
		    time_after64(now, wp->last_state_change))
			wp->time[wp->state] += now - wp->last_state_change;
		wp->state = state;
		wp->last_state_change = now;
	}
}

static inline void wp_update_state(struct write_point *wp, bool running)
{
	enum write_point_state state;

	state = running			 ? WRITE_POINT_running :
		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
					 : WRITE_POINT_stopped;

	__wp_update_state(wp, state);
}

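/*
 * Closure callback run when the data write completes: queue the op on its
 * write point and punt the btree index update to the write point's
 * index_update_work.
 */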
static CLOSURE_CALLBACK(bch2_write_index)
{
	closure_type(op, struct bch_write_op, cl);
	struct write_point *wp = op->wp;
	struct workqueue_struct *wq = index_update_wq(op);
	unsigned long flags;

	if ((op->flags & BCH_WRITE_DONE) &&
	    (op->flags & BCH_WRITE_MOVE))
		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);

	spin_lock_irqsave(&wp->writes_lock, flags);
	if (wp->state == WRITE_POINT_waiting_io)
		__wp_update_state(wp, WRITE_POINT_waiting_work);
	list_add_tail(&op->wp_list, &wp->writes);
	spin_unlock_irqrestore(&wp->writes_lock, flags);

	queue_work(wq, &wp->index_update_work);
}

static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
	op->wp = wp;

	if (wp->state == WRITE_POINT_stopped) {
		spin_lock_irq(&wp->writes_lock);
		__wp_update_state(wp, WRITE_POINT_waiting_io);
		spin_unlock_irq(&wp->writes_lock);
	}
}

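/*
 * Work function for a write point: pop completed write ops off wp->writes and
 * run their index updates; ops that still have data left to write are resumed
 * via __bch2_write(), the rest are finished here.
 */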
void bch2_write_point_do_index_updates(struct work_struct *work)
{
	struct write_point *wp =
		container_of(work, struct write_point, index_update_work);
	struct bch_write_op *op;

	while (1) {
		spin_lock_irq(&wp->writes_lock);
		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
		if (op)
			list_del(&op->wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_IN_WORKER;

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}

static void bch2_write_endio(struct bio *bio)
{
	struct closure *cl		= bio->bi_private;
	struct bch_write_op *op		= container_of(cl, struct bch_write_op, cl);
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_fs *c		= wbio->c;
	struct bch_dev *ca		= bch2_dev_bkey_exists(c, wbio->dev);

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
				    op->pos.inode,
				    wbio->inode_offset << 9,
				    "data write error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		set_bit(wbio->dev, op->failed.d);
		op->flags |= BCH_WRITE_IO_ERROR;
	}

	if (wbio->nocow)
		set_bit(wbio->dev, op->devs_need_flush->d);

	if (wbio->have_ioref) {
		bch2_latency_acct(ca, wbio->submit_time, WRITE);
		percpu_ref_put(&ca->io_ref);
	}

	if (wbio->bounce)
		bch2_bio_free_pages_pool(c, bio);

	if (wbio->put_bio)
		bio_put(bio);

	if (parent)
		bio_endio(&parent->bio);
	else
		closure_put(cl);
}

static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
			       struct bch_extent_crc_unpacked crc)
{
	struct bkey_i_extent *e;

	op->pos.offset += crc.uncompressed_size;

	e = bkey_extent_init(op->insert_keys.top);
	e->k.p		= op->pos;
	e->k.size	= crc.uncompressed_size;
	e->k.version	= version;

	if (crc.csum_type ||
	    crc.compression_type ||
	    crc.nonce)
		bch2_extent_crc_append(&e->k_i, crc);

	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
				       op->flags & BCH_WRITE_CACHED);

	bch2_keylist_push(&op->insert_keys);
}

static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
					struct write_point *wp,
					struct bio *src,
					bool *page_alloc_failed,
					void *buf)
{
	struct bch_write_bio *wbio;
	struct bio *bio;
	unsigned output_available =
		min(wp->sectors_free << 9, src->bi_iter.bi_size);
	unsigned pages = DIV_ROUND_UP(output_available +
				      (buf
				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
				       : 0), PAGE_SIZE);

	pages = min(pages, BIO_MAX_VECS);

	bio = bio_alloc_bioset(NULL, pages, 0,
			       GFP_NOFS, &c->bio_write);
	wbio			= wbio_init(bio);
	wbio->put_bio		= true;
	/* copy WRITE_SYNC flag */
	wbio->bio.bi_opf	= src->bi_opf;

	if (buf) {
		bch2_bio_map(bio, buf, output_available);
		return bio;
	}

	wbio->bounce		= true;

	/*
	 * We can't use mempool for more than c->sb.encoded_extent_max
	 * worth of pages, but we'd like to allocate more if we can:
	 */
	bch2_bio_alloc_pages_pool(c, bio,
				  min_t(unsigned, output_available,
					c->opts.encoded_extent_max));

	if (bio->bi_iter.bi_size < output_available)
		*page_alloc_failed =
			bch2_bio_alloc_pages(bio,
					     output_available -
					     bio->bi_iter.bi_size,
					     GFP_NOFS) != 0;

	return bio;
}

static int bch2_write_rechecksum(struct bch_fs *c,
				 struct bch_write_op *op,
				 unsigned new_csum_type)
{
	struct bio *bio = &op->wbio.bio;
	struct bch_extent_crc_unpacked new_crc;
	int ret;

	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */

	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
	    bch2_csum_type_is_encryption(new_csum_type))
		new_csum_type = op->crc.csum_type;

	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
				  NULL, &new_crc,
				  op->crc.offset, op->crc.live_size,
				  new_csum_type);
	if (ret)
		return ret;

	bio_advance(bio, op->crc.offset << 9);
	bio->bi_iter.bi_size = op->crc.live_size << 9;
	op->crc = new_crc;
	return 0;
}

static int bch2_write_decrypt(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct nonce nonce = extent_nonce(op->version, op->crc);
	struct bch_csum csum;
	int ret;

	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
		return 0;

	/*
	 * If we need to decrypt data in the write path, we'll no longer be able
	 * to verify the existing checksum (poly1305 mac, in this case) after
	 * it's decrypted - this is the last point we'll be able to reverify the
	 * checksum:
	 */
	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
		return -EIO;

	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
	op->crc.csum_type = 0;
	op->crc.csum = (struct bch_csum) { 0, 0 };
	return ret;
}

static enum prep_encoded_ret {
	PREP_ENCODED_OK,
	PREP_ENCODED_ERR,
	PREP_ENCODED_CHECKSUM_ERR,
	PREP_ENCODED_DO_WRITE,
} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
	struct bch_fs *c = op->c;
	struct bio *bio = &op->wbio.bio;

	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
		return PREP_ENCODED_OK;

	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);

	/* Can we just write the entire extent as is? */
	if (op->crc.uncompressed_size == op->crc.live_size &&
	    op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
	    op->crc.compressed_size <= wp->sectors_free &&
	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
	     op->incompressible)) {
		if (!crc_is_compressed(op->crc) &&
		    op->csum_type != op->crc.csum_type &&
		    bch2_write_rechecksum(c, op, op->csum_type) &&
		    !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		return PREP_ENCODED_DO_WRITE;
	}

	/*
	 * If the data is compressed and we couldn't write the entire extent as
	 * is, we have to decompress it:
	 */
	if (crc_is_compressed(op->crc)) {
		struct bch_csum csum;

		if (bch2_write_decrypt(op))
			return PREP_ENCODED_CHECKSUM_ERR;

		/* Last point we can still verify checksum: */
		csum = bch2_checksum_bio(c, op->crc.csum_type,
					 extent_nonce(op->version, op->crc),
					 bio);
		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
			return PREP_ENCODED_CHECKSUM_ERR;

		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
			return PREP_ENCODED_ERR;
	}

	/*
	 * No longer have compressed data after this point - data might be
	 * encrypted:
	 */

	/*
	 * If the data is checksummed and we're only writing a subset,
	 * rechecksum and adjust bio to point to currently live data:
	 */
	if ((op->crc.live_size != op->crc.uncompressed_size ||
	     op->crc.csum_type != op->csum_type) &&
	    bch2_write_rechecksum(c, op, op->csum_type) &&
	    !c->opts.no_data_io)
		return PREP_ENCODED_CHECKSUM_ERR;

	/*
	 * If we want to compress the data, it has to be decrypted:
	 */
	if ((op->compression_opt ||
	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
	     bch2_csum_type_is_encryption(op->csum_type)) &&
	    bch2_write_decrypt(op))
		return PREP_ENCODED_CHECKSUM_ERR;

	return PREP_ENCODED_OK;
}

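/*
 * Core of the data write path: carve up the source bio to fit the space
 * available at @wp, bouncing/compressing/encrypting/checksumming as needed,
 * and append one key per resulting extent to op->insert_keys. Returns > 0 if
 * there's still input left to write, 0 if done, or a negative error.
 */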
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
			     struct bio **_dst)
{
	struct bch_fs *c = op->c;
	struct bio *src = &op->wbio.bio, *dst = src;
	struct bvec_iter saved_iter;
	void *ec_buf;
	unsigned total_output = 0, total_input = 0;
	bool bounce = false;
	bool page_alloc_failed = false;
	int ret, more = 0;

	BUG_ON(!bio_sectors(src));

	ec_buf = bch2_writepoint_ec_buf(c, wp);

	switch (bch2_write_prep_encoded_data(op, wp)) {
	case PREP_ENCODED_OK:
		break;
	case PREP_ENCODED_ERR:
		ret = -EIO;
		goto err;
	case PREP_ENCODED_CHECKSUM_ERR:
		goto csum_err;
	case PREP_ENCODED_DO_WRITE:
		/* XXX look for bug here */
		if (ec_buf) {
			dst = bch2_write_bio_alloc(c, wp, src,
						   &page_alloc_failed,
						   ec_buf);
			bio_copy_data(dst, src);
			bounce = true;
		}
		init_append_extent(op, wp, op->version, op->crc);
		goto do_write;
	}

	if (ec_buf ||
	    op->compression_opt ||
	    (op->csum_type &&
	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
	    (bch2_csum_type_is_encryption(op->csum_type) &&
	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
		dst = bch2_write_bio_alloc(c, wp, src,
					   &page_alloc_failed,
					   ec_buf);
		bounce = true;
	}

	saved_iter = dst->bi_iter;

	do {
		struct bch_extent_crc_unpacked crc = { 0 };
		struct bversion version = op->version;
		size_t dst_len = 0, src_len = 0;

		if (page_alloc_failed &&
		    dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
		    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
			break;

		BUG_ON(op->compression_opt &&
		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
		       bch2_csum_type_is_encryption(op->crc.csum_type));
		BUG_ON(op->compression_opt && !bounce);

		crc.compression_type = op->incompressible
			? BCH_COMPRESSION_TYPE_incompressible
			: op->compression_opt
			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
					    op->compression_opt)
			: 0;
		if (!crc_is_compressed(crc)) {
			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);

			if (op->csum_type)
				dst_len = min_t(unsigned, dst_len,
						c->opts.encoded_extent_max);

			if (bounce) {
				swap(dst->bi_iter.bi_size, dst_len);
				bio_copy_data(dst, src);
				swap(dst->bi_iter.bi_size, dst_len);
			}

			src_len = dst_len;
		}

		BUG_ON(!src_len || !dst_len);

		if (bch2_csum_type_is_encryption(op->csum_type)) {
			if (bversion_zero(version)) {
				version.lo = atomic64_inc_return(&c->key_version);
			} else {
				crc.nonce = op->nonce;
				op->nonce += src_len >> 9;
			}
		}

		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
		    !crc_is_compressed(crc) &&
		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
		    bch2_csum_type_is_encryption(op->csum_type)) {
			u8 compression_type = crc.compression_type;
			u16 nonce = crc.nonce;
			/*
			 * Note: when we're using rechecksum(), we need to be
			 * checksumming @src because it has all the data our
			 * existing checksum covers - if we bounced (because we
			 * were trying to compress), @dst will only have the
			 * part of the data the new checksum will cover.
			 *
			 * But normally we want to be checksumming post bounce,
			 * because part of the reason for bouncing is so the
			 * data can't be modified (by userspace) while it's in
			 * flight.
			 */
			if (bch2_rechecksum_bio(c, src, version, op->crc,
					&crc, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->csum_type))
				goto csum_err;
			/*
			 * rechecksum_bio sets compression_type on crc from op->crc,
			 * this isn't always correct as sometimes we're changing
			 * an extent from uncompressed to incompressible.
			 */
			crc.compression_type = compression_type;
			crc.nonce = nonce;
		} else {
			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
			    bch2_rechecksum_bio(c, src, version, op->crc,
					NULL, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->crc.csum_type))
				goto csum_err;

			crc.compressed_size	= dst_len >> 9;
			crc.uncompressed_size	= src_len >> 9;
			crc.live_size		= src_len >> 9;

			swap(dst->bi_iter.bi_size, dst_len);
			ret = bch2_encrypt_bio(c, op->csum_type,
					       extent_nonce(version, crc), dst);
			if (ret)
				goto err;

			crc.csum = bch2_checksum_bio(c, op->csum_type,
					 extent_nonce(version, crc), dst);
			crc.csum_type = op->csum_type;
			swap(dst->bi_iter.bi_size, dst_len);
		}

		init_append_extent(op, wp, version, crc);

		if (dst != src)
			bio_advance(dst, dst_len);
		bio_advance(src, src_len);
		total_output	+= dst_len;
		total_input	+= src_len;
	} while (dst->bi_iter.bi_size &&
		 src->bi_iter.bi_size &&
		 wp->sectors_free &&
		 !bch2_keylist_realloc(&op->insert_keys,
				      op->inline_keys,
				      ARRAY_SIZE(op->inline_keys),
				      BKEY_EXTENT_U64s_MAX));

	more = src->bi_iter.bi_size != 0;

	dst->bi_iter = saved_iter;

	if (dst == src && more) {
		BUG_ON(total_output != total_input);

		dst = bio_split(src, total_input >> 9,
				GFP_NOFS, &c->bio_write);
		wbio_init(dst)->put_bio	= true;
		/* copy WRITE_SYNC flag */
		dst->bi_opf		= src->bi_opf;
	}

	dst->bi_iter.bi_size = total_output;
do_write:
	*_dst = dst;
	return more;
csum_err:
	bch_err(c, "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
		op->flags & BCH_WRITE_MOVE ? "move" : "user");
	ret = -EIO;
err:
	if (to_wbio(dst)->bounce)
		bch2_bio_free_pages_pool(c, dst);
	if (to_wbio(dst)->put_bio)
		bio_put(dst);

	return ret;
}

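/*
 * Can this extent be overwritten in place (nocow)? Only plain extents whose
 * pointers aren't checksummed, compressed or erasure coded, and which already
 * have enough replicas, qualify.
 */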
static bool bch2_extent_is_writeable(struct bch_write_op *op,
				     struct bkey_s_c k)
{
	struct bch_fs *c = op->c;
	struct bkey_s_c_extent e;
	struct extent_ptr_decoded p;
	const union bch_extent_entry *entry;
	unsigned replicas = 0;

	if (k.k->type != KEY_TYPE_extent)
		return false;

	e = bkey_s_c_to_extent(k);
	extent_for_each_ptr_decode(e, p, entry) {
		if (crc_is_encoded(p.crc) || p.has_ec)
			return false;

		replicas += bch2_extent_ptr_durability(c, &p);
	}

	return replicas >= op->opts.data_replicas;
}

static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;

	for_each_keylist_key(&op->insert_keys, k) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));

		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch2_dev_bkey_exists(c, ptr->dev);
			bch2_bucket_nocow_unlock(&c->nocow_locks,
						 PTR_BUCKET_POS(ca, ptr),
						 BUCKET_NOCOW_LOCK_UPDATE);
		}
	}
}

static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
						  struct btree_iter *iter,
						  struct bkey_i *orig,
						  struct bkey_s_c k,
						  u64 new_i_size)
{
	if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
		/* trace this */
		return 0;
	}

	struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
	int ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		return ret;

	bch2_cut_front(bkey_start_pos(&orig->k), new);
	bch2_cut_back(orig->k.p, new);

	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
	bkey_for_each_ptr(ptrs, ptr)
		ptr->unwritten = 0;

	/*
	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
	 * that was done when we kicked off the write, and here it's important
	 * that we update the extent that we wrote to - even if a snapshot has
	 * since been created. The write is still outstanding, so we're ok
	 * w.r.t. snapshot atomicity:
	 */
	return  bch2_extent_update_i_size_sectors(trans, iter,
					min(new->k.p.offset << 9, new_i_size), 0) ?:
		bch2_trans_update(trans, iter, new,
				  BTREE_UPDATE_internal_snapshot_node);
}

static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans = bch2_trans_get(c);

	for_each_keylist_key(&op->insert_keys, orig) {
		int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
				     bkey_start_pos(&orig->k), orig->k.p,
				     BTREE_ITER_intent, k,
				     NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
		}));

		if (ret && !bch2_err_matches(ret, EROFS)) {
			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);

			bch_err_inum_offset_ratelimited(c,
				insert->k.p.inode, insert->k.p.offset << 9,
				"%s write error while doing btree update: %s",
				op->flags & BCH_WRITE_MOVE ? "move" : "user",
				bch2_err_str(ret));
		}

		if (ret) {
			op->error = ret;
			break;
		}
	}

	bch2_trans_put(trans);
}

static void __bch2_nocow_write_done(struct bch_write_op *op)
{
	bch2_nocow_write_unlock(op);

	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
		op->error = -EIO;
	} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
		bch2_nocow_write_convert_unwritten(op);
}

static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
	closure_type(op, struct bch_write_op, cl);

	__bch2_nocow_write_done(op);
	bch2_write_done(cl);
}

struct bucket_to_lock {
	struct bpos		b;
	unsigned		gen;
	struct nocow_lock_bucket *l;
};

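/*
 * Nocow write path: look up the existing extent, take per-bucket nocow locks
 * and write the new data in place over the existing pointers, converting
 * unwritten extents afterwards if necessary. Falls back to the normal COW
 * path if the extent isn't writeable in place or a bucket gen turns out to be
 * stale.
 */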
static void bch2_nocow_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
	u32 snapshot;
	struct bucket_to_lock *stale_at;
	int ret;

	if (op->flags & BCH_WRITE_MOVE)
		return;

	darray_init(&buckets);
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
	if (unlikely(ret))
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(op->pos.inode, op->pos.offset, snapshot),
			     BTREE_ITER_slots);
	while (1) {
		struct bio *bio = &op->wbio.bio;

		buckets.nr = 0;

		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		/* fall back to normal cow write path? */
		if (unlikely(k.k->p.snapshot != snapshot ||
			     !bch2_extent_is_writeable(op, k)))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 k.k->u64s))
			break;

		/* Get iorefs before dropping btree locks: */
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch2_dev_bkey_exists(c, ptr->dev);
			struct bpos b = PTR_BUCKET_POS(ca, ptr);
			struct nocow_lock_bucket *l =
				bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
			prefetch(l);

			if (unlikely(!bch2_dev_get_ioref(ca, WRITE)))
				goto err_get_ioref;

			/* XXX allocating memory with btree locks held - rare */
			darray_push_gfp(&buckets, ((struct bucket_to_lock) {
						   .b = b, .gen = ptr->gen, .l = l,
						   }), GFP_KERNEL|__GFP_NOFAIL);

			if (ptr->unwritten)
				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
		}

		/* Unlock before taking nocow locks, doing IO: */
		bkey_reassemble(op->insert_keys.top, k);
		bch2_trans_unlock(trans);

		bch2_cut_front(op->pos, op->insert_keys.top);
		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);

		darray_for_each(buckets, i) {
			struct bch_dev *ca = bch2_dev_bkey_exists(c, i->b.inode);

			__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
						 bucket_to_u64(i->b),
						 BUCKET_NOCOW_LOCK_UPDATE);

			rcu_read_lock();
			bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
			rcu_read_unlock();

			if (unlikely(stale)) {
				stale_at = i;
				goto err_bucket_stale;
			}
		}

		bio = &op->wbio.bio;
		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
					GFP_KERNEL, &c->bio_write);
			wbio_init(bio)->put_bio = true;
			bio->bi_opf = op->wbio.bio.bi_opf;
		} else {
			op->flags |= BCH_WRITE_DONE;
		}

		op->pos.offset += bio_sectors(bio);
		op->written += bio_sectors(bio);

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;
		closure_get(&op->cl);
		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  op->insert_keys.top, true);

		bch2_keylist_push(&op->insert_keys);
		if (op->flags & BCH_WRITE_DONE)
			break;
		bch2_btree_iter_advance(&iter);
	}
out:
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode, op->pos.offset << 9,
			"%s: btree lookup error %s", __func__, bch2_err_str(ret));
		op->error = ret;
		op->flags |= BCH_WRITE_DONE;
	}

	bch2_trans_put(trans);
	darray_exit(&buckets);

	/* fallback to cow write path? */
	if (!(op->flags & BCH_WRITE_DONE)) {
		closure_sync(&op->cl);
		__bch2_nocow_write_done(op);
		op->insert_keys.top = op->insert_keys.keys;
	} else if (op->flags & BCH_WRITE_SYNC) {
		closure_sync(&op->cl);
		bch2_nocow_write_done(&op->cl.work);
	} else {
		/*
		 * XXX
		 * needs to run out of process context because ei_quota_lock is
		 * a mutex
		 */
		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
	}
	return;
err_get_ioref:
	darray_for_each(buckets, i)
		percpu_ref_put(&bch2_dev_bkey_exists(c, i->b.inode)->io_ref);

	/* Fall back to COW path: */
	goto out;
err_bucket_stale:
	darray_for_each(buckets, i) {
		bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
		if (i == stale_at)
			break;
	}

	/* We can retry this: */
	ret = -BCH_ERR_transaction_restart;
	goto err_get_ioref;
}

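/*
 * Main write loop: repeatedly allocate space at a write point, write out as
 * much of the remaining data as fits (bch2_write_extent()) and submit the
 * resulting bios, until the whole op has been issued or we have to block.
 * Index updates then happen either synchronously here or from the write
 * point's workqueue.
 */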
static void __bch2_write(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct write_point *wp = NULL;
	struct bio *bio = NULL;
	unsigned nofs_flags;
	int ret;

	nofs_flags = memalloc_nofs_save();

	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
		bch2_nocow_write(op);
		if (op->flags & BCH_WRITE_DONE)
			goto out_nofs_restore;
	}
again:
	memset(&op->failed, 0, sizeof(op->failed));

	do {
		struct bkey_i *key_to_write;
		unsigned key_to_write_offset = op->insert_keys.top_p -
			op->insert_keys.keys_p;

		/* +1 for possible cache device: */
		if (op->open_buckets.nr + op->nr_replicas + 1 >
		    ARRAY_SIZE(op->open_buckets.v))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					op->inline_keys,
					ARRAY_SIZE(op->inline_keys),
					BKEY_EXTENT_U64s_MAX))
			break;

		/*
		 * The copygc thread is now global, which means it's no longer
		 * freeing up space on specific disks, which means that
		 * allocations for specific disks may hang arbitrarily long:
		 */
		ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_alloc_sectors_start_trans(trans,
				op->target,
				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
				op->write_point,
				&op->devs_have,
				op->nr_replicas,
				op->nr_replicas_required,
				op->watermark,
				op->flags,
				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
				? NULL : &op->cl, &wp));
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
				break;

			goto err;
		}

		EBUG_ON(!wp);

		bch2_open_bucket_get(c, wp, &op->open_buckets);
		ret = bch2_write_extent(op, wp, &bio);

		bch2_alloc_sectors_done_inlined(c, wp);
err:
		if (ret <= 0) {
			op->flags |= BCH_WRITE_DONE;

			if (ret < 0) {
				if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
					bch_err_inum_offset_ratelimited(c,
						op->pos.inode,
						op->pos.offset << 9,
						"%s(): %s error: %s", __func__,
						op->flags & BCH_WRITE_MOVE ? "move" : "user",
						bch2_err_str(ret));
				op->error = ret;
				break;
			}
		}

		bio->bi_end_io	= bch2_write_endio;
		bio->bi_private	= &op->cl;
		bio->bi_opf |= REQ_OP_WRITE;

		closure_get(bio->bi_private);

		key_to_write = (void *) (op->insert_keys.keys_p +
					 key_to_write_offset);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_SYNC) ||
	    (!(op->flags & BCH_WRITE_DONE) &&
	     !(op->flags & BCH_WRITE_IN_WORKER))) {
		if (closure_sync_timeout(&op->cl, HZ * 10)) {
			bch2_print_allocator_stuck(c);
			closure_sync(&op->cl);
		}

		__bch2_write_index(op);

		if (!(op->flags & BCH_WRITE_DONE))
			goto again;
		bch2_write_done(&op->cl);
	} else {
		bch2_write_queue(op, wp);
		continue_at(&op->cl, bch2_write_index, NULL);
	}
out_nofs_restore:
	memalloc_nofs_restore(nofs_flags);
}

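/*
 * Small writes: instead of allocating space on disk, store the data inline in
 * a KEY_TYPE_inline_data key and insert that directly.
 */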
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
	struct bio *bio = &op->wbio.bio;
	struct bvec_iter iter;
	struct bkey_i_inline_data *id;
	unsigned sectors;
	int ret;

	memset(&op->failed, 0, sizeof(op->failed));

	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
	op->flags |= BCH_WRITE_DONE;

	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);

	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
				   ARRAY_SIZE(op->inline_keys),
				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
	if (ret) {
		op->error = ret;
		goto err;
	}

	sectors = bio_sectors(bio);
	op->pos.offset += sectors;

	id = bkey_inline_data_init(op->insert_keys.top);
	id->k.p		= op->pos;
	id->k.version	= op->version;
	id->k.size	= sectors;

	iter = bio->bi_iter;
	iter.bi_size = data_len;
	memcpy_from_bio(id->v.data, bio, iter);

	while (data_len & 7)
		id->v.data[data_len++] = '\0';
	set_bkey_val_bytes(&id->k, data_len);
	bch2_keylist_push(&op->insert_keys);

	__bch2_write_index(op);
err:
	bch2_write_done(&op->cl);
}

/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl:		&bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
CLOSURE_CALLBACK(bch2_write)
{
	closure_type(op, struct bch_write_op, cl);
	struct bio *bio = &op->wbio.bio;
	struct bch_fs *c = op->c;
	unsigned data_len;

	EBUG_ON(op->cl.parent);
	BUG_ON(!op->nr_replicas);
	BUG_ON(!op->write_point.v);
	BUG_ON(bkey_eq(op->pos, POS_MAX));

	op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
	op->start_time = local_clock();
	bch2_keylist_init(&op->insert_keys, op->inline_keys);
	wbio_init(bio)->put_bio = false;

	if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
		bch_err_inum_offset_ratelimited(c,
			op->pos.inode,
			op->pos.offset << 9,
			"%s write error: misaligned write",
			op->flags & BCH_WRITE_MOVE ? "move" : "user");
		op->error = -EIO;
		goto err;
	}

	if (c->opts.nochanges) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	if (!(op->flags & BCH_WRITE_MOVE) &&
	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
		op->error = -BCH_ERR_erofs_no_writes;
		goto err;
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
	bch2_increment_clock(c, bio_sectors(bio), WRITE);

	data_len = min_t(u64, bio->bi_iter.bi_size,
			 op->new_i_size - (op->pos.offset << 9));

	if (c->opts.inline_data &&
	    data_len <= min(block_bytes(c) / 2, 1024U)) {
		bch2_write_data_inline(op, data_len);
		return;
	}

	__bch2_write(op);
	return;
err:
	bch2_disk_reservation_put(c, &op->res);

	closure_debug_destroy(&op->cl);
	if (op->end_io)
		op->end_io(op);
}

static const char * const bch2_write_flags[] = {
#define x(f)	#f,
	BCH_WRITE_FLAGS()
#undef x
	NULL
};

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
	prt_str(out, "pos: ");
	bch2_bpos_to_text(out, op->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "started: ");
	bch2_pr_time_units(out, local_clock() - op->start_time);
	prt_newline(out);

	prt_str(out, "flags: ");
	prt_bitflags(out, bch2_write_flags, op->flags);
	prt_newline(out);

	prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));

	printbuf_indent_sub(out, 2);
}

void bch2_fs_io_write_exit(struct bch_fs *c)
{
	mempool_exit(&c->bio_bounce_pages);
	bioset_exit(&c->replica_set);
	bioset_exit(&c->bio_write);
}

int bch2_fs_io_write_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_write,   1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
	    bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
		return -BCH_ERR_ENOMEM_bio_write_init;

	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;

	return 0;
}