linux/drivers/md/bcache/movinggc.c
Kuan-Wei Chiu 48fd7ebe00 Revert "bcache: remove heap-related macros and switch to generic min_heap"
This reverts commit 866898efbb.

The generic bottom-up min_heap implementation causes performance
regression in invalidate_buckets_lru(), a hot path in bcache.  Before the
cache is fully populated, new_bucket_prio() often returns zero, leading to
many equal comparisons.  In such cases, bottom-up sift_down performs up to
2 * log2(n) comparisons, while the original top-down approach completes
with just O() comparisons, resulting in a measurable performance gap.

The performance degradation is further worsened by the non-inlined
min_heap API functions introduced in commit 92a8b224b8 ("lib/min_heap:
introduce non-inline versions of min heap API functions"), adding function
call overhead to this critical path.

As reported by Robert, bcache now suffers from latency spikes, with P100
(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. 
These regressions degrade bcache's effectiveness as a low-latency cache
layer and lead to frequent timeouts and application stalls in production
environments.

This revert aims to restore bcache's original low-latency behavior.

Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com
Fixes: 866898efbb ("bcache: remove heap-related macros and switch to generic min_heap")
Fixes: 92a8b224b8 ("lib/min_heap: introduce non-inline versions of min heap API functions")
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reported-by: Robert Pang <robertpang@google.com>
Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
Acked-by: Coly Li <colyli@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-06-19 20:48:03 -07:00

253 lines
5.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Moving/copying garbage collector
*
* Copyright 2012 Google, Inc.
*/
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include <trace/events/bcache.h>
struct moving_io {
struct closure cl;
struct keybuf_key *w;
struct data_insert_op op;
struct bbio bio;
};
static bool moving_pred(struct keybuf *buf, struct bkey *k)
{
struct cache_set *c = container_of(buf, struct cache_set,
moving_gc_keys);
unsigned int i;
for (i = 0; i < KEY_PTRS(k); i++)
if (ptr_available(c, k, i) &&
GC_MOVE(PTR_BUCKET(c, k, i)))
return true;
return false;
}
/* Moving GC - IO loop */
static CLOSURE_CALLBACK(moving_io_destructor)
{
closure_type(io, struct moving_io, cl);
kfree(io);
}
static CLOSURE_CALLBACK(write_moving_finish)
{
closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bio_free_pages(bio);
if (io->op.replace_collision)
trace_bcache_gc_copy_collision(&io->w->key);
bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
up(&io->op.c->moving_in_flight);
closure_return_with_destructor(cl, moving_io_destructor);
}
static void read_moving_endio(struct bio *bio)
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct moving_io *io = container_of(bio->bi_private,
struct moving_io, cl);
if (bio->bi_status)
io->op.status = bio->bi_status;
else if (!KEY_DIRTY(&b->key) &&
ptr_stale(io->op.c, &b->key, 0)) {
io->op.status = BLK_STS_IOERR;
}
bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move");
}
static void moving_init(struct moving_io *io)
{
struct bio *bio = &io->bio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
bio_get(bio);
bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
bio->bi_private = &io->cl;
bch_bio_map(bio, NULL);
}
static CLOSURE_CALLBACK(write_moving)
{
closure_type(io, struct moving_io, cl);
struct data_insert_op *op = &io->op;
if (!op->status) {
moving_init(io);
io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
op->write_prio = 1;
op->bio = &io->bio.bio;
op->writeback = KEY_DIRTY(&io->w->key);
op->csum = KEY_CSUM(&io->w->key);
bkey_copy(&op->replace_key, &io->w->key);
op->replace = true;
closure_call(&op->cl, bch_data_insert, NULL, cl);
}
continue_at(cl, write_moving_finish, op->wq);
}
static CLOSURE_CALLBACK(read_moving_submit)
{
closure_type(io, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
continue_at(cl, write_moving, io->op.wq);
}
static void read_moving(struct cache_set *c)
{
struct keybuf_key *w;
struct moving_io *io;
struct bio *bio;
struct closure cl;
closure_init_stack(&cl);
/* XXX: if we error, background writeback could stall indefinitely */
while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
&MAX_KEY, moving_pred);
if (!w)
break;
if (ptr_stale(c, &w->key, 0)) {
bch_keybuf_del(&c->moving_gc_keys, w);
continue;
}
io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
w->private = io;
io->w = w;
io->op.inode = KEY_INODE(&w->key);
io->op.c = c;
io->op.wq = c->moving_gc_wq;
moving_init(io);
bio = &io->bio.bio;
bio->bi_opf = REQ_OP_READ;
bio->bi_end_io = read_moving_endio;
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
goto err;
trace_bcache_gc_copy(&w->key);
down(&c->moving_in_flight);
closure_call(&io->cl, read_moving_submit, NULL, &cl);
}
if (0) {
err: if (!IS_ERR_OR_NULL(w->private))
kfree(w->private);
bch_keybuf_del(&c->moving_gc_keys, w);
}
closure_sync(&cl);
}
static bool bucket_cmp(struct bucket *l, struct bucket *r)
{
return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
}
static unsigned int bucket_heap_top(struct cache *ca)
{
struct bucket *b;
return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
}
void bch_moving_gc(struct cache_set *c)
{
struct cache *ca = c->cache;
struct bucket *b;
unsigned long sectors_to_move, reserve_sectors;
if (!c->copy_gc_enabled)
return;
mutex_lock(&c->bucket_lock);
sectors_to_move = 0;
reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
ca->heap.used = 0;
for_each_bucket(b, ca) {
if (GC_MARK(b) == GC_MARK_METADATA ||
!GC_SECTORS_USED(b) ||
GC_SECTORS_USED(b) == ca->sb.bucket_size ||
atomic_read(&b->pin))
continue;
if (!heap_full(&ca->heap)) {
sectors_to_move += GC_SECTORS_USED(b);
heap_add(&ca->heap, b, bucket_cmp);
} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
sectors_to_move -= bucket_heap_top(ca);
sectors_to_move += GC_SECTORS_USED(b);
ca->heap.data[0] = b;
heap_sift(&ca->heap, 0, bucket_cmp);
}
}
while (sectors_to_move > reserve_sectors) {
heap_pop(&ca->heap, b, bucket_cmp);
sectors_to_move -= GC_SECTORS_USED(b);
}
while (heap_pop(&ca->heap, b, bucket_cmp))
SET_GC_MOVE(b, 1);
mutex_unlock(&c->bucket_lock);
c->moving_gc_keys.last_scanned = ZERO_KEY;
read_moving(c);
}
void bch_moving_init_cache_set(struct cache_set *c)
{
bch_keybuf_init(&c->moving_gc_keys);
sema_init(&c->moving_in_flight, 64);
}