mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git (synced 2025-08-18 06:25:16 +00:00)

This reverts commit 866898efbb.

The generic bottom-up min_heap implementation causes performance regression
in invalidate_buckets_lru(), a hot path in bcache. Before the cache is fully
populated, new_bucket_prio() often returns zero, leading to many equal
comparisons. In such cases, bottom-up sift_down performs up to 2 * log2(n)
comparisons, while the original top-down approach completes with just O(1)
comparisons, resulting in a measurable performance gap.

The performance degradation is further worsened by the non-inlined min_heap
API functions introduced in commit 92a8b224b8 ("lib/min_heap: introduce
non-inline versions of min heap API functions"), adding function call
overhead to this critical path.

As reported by Robert, bcache now suffers from latency spikes, with P100
(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These
regressions degrade bcache's effectiveness as a low-latency cache layer and
lead to frequent timeouts and application stalls in production environments.

This revert aims to restore bcache's original low-latency behavior.

Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com
Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com
Fixes: 866898efbb ("bcache: remove heap-related macros and switch to generic min_heap")
Fixes: 92a8b224b8 ("lib/min_heap: introduce non-inline versions of min heap API functions")
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reported-by: Robert Pang <robertpang@google.com>
Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com
Acked-by: Coly Li <colyli@kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
253 lines
5.3 KiB
C
// SPDX-License-Identifier: GPL-2.0
/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"

#include <trace/events/bcache.h>

struct moving_io {
        struct closure          cl;
        struct keybuf_key       *w;
        struct data_insert_op   op;
        struct bbio             bio;
};

static bool moving_pred(struct keybuf *buf, struct bkey *k)
{
        struct cache_set *c = container_of(buf, struct cache_set,
                                           moving_gc_keys);
        unsigned int i;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (ptr_available(c, k, i) &&
                    GC_MOVE(PTR_BUCKET(c, k, i)))
                        return true;

        return false;
}

/* Moving GC - IO loop */

static CLOSURE_CALLBACK(moving_io_destructor)
{
        closure_type(io, struct moving_io, cl);

        kfree(io);
}

static CLOSURE_CALLBACK(write_moving_finish)
{
        closure_type(io, struct moving_io, cl);
        struct bio *bio = &io->bio.bio;

        bio_free_pages(bio);

        if (io->op.replace_collision)
                trace_bcache_gc_copy_collision(&io->w->key);

        bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);

        up(&io->op.c->moving_in_flight);

        closure_return_with_destructor(cl, moving_io_destructor);
}

static void read_moving_endio(struct bio *bio)
{
        struct bbio *b = container_of(bio, struct bbio, bio);
        struct moving_io *io = container_of(bio->bi_private,
                                            struct moving_io, cl);

        if (bio->bi_status)
                io->op.status = bio->bi_status;
        else if (!KEY_DIRTY(&b->key) &&
                 ptr_stale(io->op.c, &b->key, 0)) {
                io->op.status = BLK_STS_IOERR;
        }

        bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move");
}

static void moving_init(struct moving_io *io)
{
        struct bio *bio = &io->bio.bio;

        bio_init(bio, NULL, bio->bi_inline_vecs,
                 DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0);
        bio_get(bio);
        bio->bi_ioprio          = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);

        bio->bi_iter.bi_size    = KEY_SIZE(&io->w->key) << 9;
        bio->bi_private         = &io->cl;
        bch_bio_map(bio, NULL);
}

static CLOSURE_CALLBACK(write_moving)
{
        closure_type(io, struct moving_io, cl);
        struct data_insert_op *op = &io->op;

        if (!op->status) {
                moving_init(io);

                io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
                op->write_prio          = 1;
                op->bio                 = &io->bio.bio;

                op->writeback           = KEY_DIRTY(&io->w->key);
                op->csum                = KEY_CSUM(&io->w->key);

                bkey_copy(&op->replace_key, &io->w->key);
                op->replace             = true;

                closure_call(&op->cl, bch_data_insert, NULL, cl);
        }

        continue_at(cl, write_moving_finish, op->wq);
}

static CLOSURE_CALLBACK(read_moving_submit)
{
        closure_type(io, struct moving_io, cl);
        struct bio *bio = &io->bio.bio;

        bch_submit_bbio(bio, io->op.c, &io->w->key, 0);

        continue_at(cl, write_moving, io->op.wq);
}

static void read_moving(struct cache_set *c)
{
        struct keybuf_key *w;
        struct moving_io *io;
        struct bio *bio;
        struct closure cl;

        closure_init_stack(&cl);

        /* XXX: if we error, background writeback could stall indefinitely */

        while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
                w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
                                           &MAX_KEY, moving_pred);
                if (!w)
                        break;

                if (ptr_stale(c, &w->key, 0)) {
                        bch_keybuf_del(&c->moving_gc_keys, w);
                        continue;
                }

                io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
                                         DIV_ROUND_UP(KEY_SIZE(&w->key),
                                                      PAGE_SECTORS)),
                             GFP_KERNEL);
                if (!io)
                        goto err;

                w->private      = io;
                io->w           = w;
                io->op.inode    = KEY_INODE(&w->key);
                io->op.c        = c;
                io->op.wq       = c->moving_gc_wq;

                moving_init(io);
                bio = &io->bio.bio;

                bio->bi_opf     = REQ_OP_READ;
                bio->bi_end_io  = read_moving_endio;

                if (bch_bio_alloc_pages(bio, GFP_KERNEL))
                        goto err;

                trace_bcache_gc_copy(&w->key);

                down(&c->moving_in_flight);
                closure_call(&io->cl, read_moving_submit, NULL, &cl);
        }

        if (0) {
err:            if (!IS_ERR_OR_NULL(w->private))
                        kfree(w->private);

                bch_keybuf_del(&c->moving_gc_keys, w);
        }

        closure_sync(&cl);
}

static bool bucket_cmp(struct bucket *l, struct bucket *r)
{
        return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
}

static unsigned int bucket_heap_top(struct cache *ca)
{
        struct bucket *b;

        return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
}

void bch_moving_gc(struct cache_set *c)
{
        struct cache *ca = c->cache;
        struct bucket *b;
        unsigned long sectors_to_move, reserve_sectors;

        if (!c->copy_gc_enabled)
                return;

        mutex_lock(&c->bucket_lock);

        sectors_to_move = 0;
        reserve_sectors = ca->sb.bucket_size *
                          fifo_used(&ca->free[RESERVE_MOVINGGC]);

        ca->heap.used = 0;

        for_each_bucket(b, ca) {
                if (GC_MARK(b) == GC_MARK_METADATA ||
                    !GC_SECTORS_USED(b) ||
                    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
                    atomic_read(&b->pin))
                        continue;

                if (!heap_full(&ca->heap)) {
                        sectors_to_move += GC_SECTORS_USED(b);
                        heap_add(&ca->heap, b, bucket_cmp);
                } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
                        sectors_to_move -= bucket_heap_top(ca);
                        sectors_to_move += GC_SECTORS_USED(b);

                        ca->heap.data[0] = b;
                        heap_sift(&ca->heap, 0, bucket_cmp);
                }
        }

        while (sectors_to_move > reserve_sectors) {
                heap_pop(&ca->heap, b, bucket_cmp);
                sectors_to_move -= GC_SECTORS_USED(b);
        }

        while (heap_pop(&ca->heap, b, bucket_cmp))
                SET_GC_MOVE(b, 1);

        mutex_unlock(&c->bucket_lock);

        c->moving_gc_keys.last_scanned = ZERO_KEY;

        read_moving(c);
}

void bch_moving_init_cache_set(struct cache_set *c)
{
        bch_keybuf_init(&c->moving_gc_keys);
        sema_init(&c->moving_in_flight, 64);
}