mirror of
https://git.proxmox.com/git/mirror_ubuntu-kernels.git
synced 2025-12-07 15:16:09 +00:00
Main part of the disk accounting rewrite. This is a wholesale rewrite of the existing disk space accounting, which relies on percpu counters that are sharded by journal buffer, and rolled up and added to each journal write. With the new scheme, every set of counters is a distinct key in the accounting btree; this fixes scaling limitations of the old scheme, where counters took up space in each journal entry and required multiple percpu counters. Now, in memory accounting requires a single set of percpu counters - not multiple for each in flight journal buffer - and in the future we'll probably also have counters that don't use in memory percpu counters, they're not strictly required. An accounting update is now a normal btree update, using the btree write buffer path. At transaction commit time, we apply accounting updates to the in memory counters, which are percpu counters indexed in an eytzinger tree by the accounting key. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
144 lines
3.8 KiB
C
144 lines
3.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
|
|
#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
|
|
|
|
#include "replicas_format.h"
|
|
|
|
/*
|
|
* Disk accounting - KEY_TYPE_accounting - on disk format:
|
|
*
|
|
* Here, the key has considerably more structure than a typical key (bpos); an
|
|
* accounting key is 'struct disk_accounting_pos', which is a union with bpos.
|
|
*
|
|
* More specifically: a key is just a multiword integer (where word endianness
|
|
* matches native byte order), so we're treating bpos as an opaque 20 byte
|
|
* integer and mapping disk_accounting_pos to that.
|
|
*
|
|
* This is a type-tagged union of all our various subtypes; a disk accounting
|
|
* key can be device counters, replicas counters, et cetera - it's extensible.
|
|
*
|
|
* The value is a list of u64s or s64s; the number of counters is specific to a
|
|
* given accounting type.
|
|
*
|
|
* Unlike with other key types, updates are _deltas_, and the deltas are not
|
|
* resolved until the update to the underlying btree, done by btree write buffer
|
|
* flush or journal replay.
|
|
*
|
|
* Journal replay in particular requires special handling. The journal tracks a
|
|
* range of entries which may possibly not have been applied to the btree
|
|
* yet - it does not know definitively whether individual entries are dirty and
|
|
* still need to be applied.
|
|
*
|
|
* To handle this, we use the version field of struct bkey, and give every
|
|
* accounting update a unique version number - a total ordering in time; the
|
|
* version number is derived from the key's position in the journal. Then
|
|
* journal replay can compare the version number of the key from the journal
|
|
* with the version number of the key in the btree to determine if a key needs
|
|
* to be replayed.
|
|
*
|
|
* For this to work, we must maintain this strict time ordering of updates as
|
|
* they are flushed to the btree, both via write buffer flush and via journal
|
|
* replay. This has complications for the write buffer code while journal replay
|
|
* is still in progress; the write buffer cannot flush any accounting keys to
|
|
* the btree until journal replay has finished replaying its accounting keys, or
|
|
* the (newer) version number of the keys from the write buffer will cause
|
|
* updates from journal replay to be lost.
|
|
*/
|
|
|
|
struct bch_accounting {
|
|
struct bch_val v;
|
|
__u64 d[];
|
|
};
|
|
|
|
/*
 * NOTE(review): presumably the largest number of counters (entries in
 * bch_accounting.d[]) any accounting type may carry - confirm against users.
 */
#define BCH_ACCOUNTING_MAX_COUNTERS 3
|
|
|
|
/*
 * Table of data types, as an x-macro: each row is x(name, value).
 *
 * Users define x() before expanding BCH_DATA_TYPES() and #undef it afterwards.
 */
#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)		\
	x(unstriped,	10)

/*
 * enum bch_data_type - one BCH_DATA_<name> constant per BCH_DATA_TYPES() row.
 *
 * Each constant is bound explicitly to the value given in the table (matching
 * how enum disk_accounting_type is generated), so inserting or reordering rows
 * cannot silently renumber existing constants.
 *
 * BCH_DATA_NR follows the last row and therefore equals the last value + 1.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t = n,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};
|
|
|
|
static inline bool data_type_is_empty(enum bch_data_type type)
|
|
{
|
|
switch (type) {
|
|
case BCH_DATA_free:
|
|
case BCH_DATA_need_gc_gens:
|
|
case BCH_DATA_need_discard:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static inline bool data_type_is_hidden(enum bch_data_type type)
|
|
{
|
|
switch (type) {
|
|
case BCH_DATA_sb:
|
|
case BCH_DATA_journal:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/*
 * Table of disk accounting key types, as an x-macro: each row is
 * x(name, value).
 *
 * Users define x() before expanding BCH_DISK_ACCOUNTING_TYPES() and #undef it
 * afterwards.
 */
#define BCH_DISK_ACCOUNTING_TYPES()		\
	x(nr_inodes,		0)		\
	x(persistent_reserved,	1)		\
	x(replicas,		2)		\
	x(dev_data_type,	3)

/*
 * enum disk_accounting_type - one BCH_DISK_ACCOUNTING_<name> constant per
 * table row, each bound explicitly to the value given in the table.
 *
 * BCH_DISK_ACCOUNTING_TYPE_NR follows the last row and therefore equals the
 * last value + 1.
 */
enum disk_accounting_type {
#define x(name, val)	BCH_DISK_ACCOUNTING_##name = val,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	BCH_DISK_ACCOUNTING_TYPE_NR
};
|
|
|
|
/*
 * Per-type fields of an nr_inodes accounting key (the 'nr_inodes' member of
 * struct disk_accounting_pos). Deliberately has no members.
 *
 * Note: a struct with no members is a GNU C extension, not ISO C.
 */
struct bch_nr_inodes {
};
|
|
|
|
struct bch_persistent_reserved {
|
|
__u8 nr_replicas;
|
|
};
|
|
|
|
struct bch_dev_data_type {
|
|
__u8 dev;
|
|
__u8 data_type;
|
|
};
|
|
|
|
struct bch_dev_stripe_buckets {
|
|
__u8 dev;
|
|
};
|
|
|
|
struct disk_accounting_pos {
|
|
union {
|
|
struct {
|
|
__u8 type;
|
|
union {
|
|
struct bch_nr_inodes nr_inodes;
|
|
struct bch_persistent_reserved persistent_reserved;
|
|
struct bch_replicas_entry_v1 replicas;
|
|
struct bch_dev_data_type dev_data_type;
|
|
struct bch_dev_stripe_buckets dev_stripe_buckets;
|
|
};
|
|
};
|
|
struct bpos _pad;
|
|
};
|
|
};
|
|
|
|
#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
|