mirror of
https://git.proxmox.com/git/pve-manager
synced 2025-05-01 19:32:34 +00:00

By switching from 'ceph osd tree' to the 'ceph osd df tree' mon API equivalent , we get the same data structure with more information per OSD. One of them is the number of PGs stored on that OSD. The number of PGs per OSD is an important number, for example when trying to figure out why the performance is not as good as expected. Therefore, adding it to the OSD overview visible by default should reduce the number of times, one needs to access the CLI. Comparing runtime cost on a 3 node ceph cluster with 4 OSDs each doing 50k iterations gives: Rate osd-df-tree osd-tree osd-df-tree 9141/s -- -25% osd-tree 12136/s 33% -- So, while definitively a bit slower, but it's still in the µs range, and as such below HTTP in TLS in TCP connection setup for most users, so worth the extra useful information. Signed-off-by: Aaron Lauterer <a.lauterer@proxmox.com> [ TL: slight rewording of subject and add benchmark data ] Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
839 lines
22 KiB
Perl
839 lines
22 KiB
Perl
package PVE::API2::Ceph::OSD;
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Cwd qw(abs_path);
|
|
use IO::File;
|
|
use UUID;
|
|
|
|
use PVE::Ceph::Tools;
|
|
use PVE::Ceph::Services;
|
|
use PVE::CephConfig;
|
|
use PVE::Cluster qw(cfs_read_file cfs_write_file);
|
|
use PVE::Diskmanage;
|
|
use PVE::Storage::LVMPlugin;
|
|
use PVE::Exception qw(raise_param_exc);
|
|
use PVE::JSONSchema qw(get_standard_option);
|
|
use PVE::INotify;
|
|
use PVE::RADOS;
|
|
use PVE::RESTHandler;
|
|
use PVE::RPCEnvironment;
|
|
use PVE::Tools qw(run_command file_set_contents);
|
|
use PVE::ProcFSTools;
|
|
use PVE::Network;
|
|
|
|
use base qw(PVE::RESTHandler);
|
|
|
|
my $nodename = PVE::INotify::nodename();
|
|
|
|
my $get_osd_status = sub {
|
|
my ($rados, $osdid) = @_;
|
|
|
|
my $stat = $rados->mon_command({ prefix => 'osd dump' });
|
|
|
|
my $osdlist = $stat->{osds} || [];
|
|
|
|
my $flags = $stat->{flags} || undef;
|
|
|
|
my $osdstat;
|
|
foreach my $d (@$osdlist) {
|
|
$osdstat->{$d->{osd}} = $d if defined($d->{osd});
|
|
}
|
|
if (defined($osdid)) {
|
|
die "no such OSD '$osdid'\n" if !$osdstat->{$osdid};
|
|
return $osdstat->{$osdid};
|
|
}
|
|
|
|
return wantarray ? ($osdstat, $flags) : $osdstat;
|
|
};
|
|
|
|
my $get_osd_usage = sub {
|
|
my ($rados) = @_;
|
|
|
|
my $osdlist = $rados->mon_command({ prefix => 'pg dump', dumpcontents => [ 'osds' ]});
|
|
if (!($osdlist && ref($osdlist))) {
|
|
warn "got unknown result format for 'pg dump osds' command\n";
|
|
return [];
|
|
}
|
|
|
|
if (ref($osdlist) eq "HASH") { # since nautilus
|
|
$osdlist = $osdlist->{osd_stats};
|
|
}
|
|
|
|
my $osdstat = {};
|
|
for my $d (@$osdlist) {
|
|
$osdstat->{$d->{osd}} = $d if defined($d->{osd});
|
|
}
|
|
|
|
return $osdstat;
|
|
};
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'index',
|
|
path => '',
|
|
method => 'GET',
|
|
description => "Get Ceph osd list/tree.",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
permissions => {
|
|
check => ['perm', '/', [ 'Sys.Audit', 'Datastore.Audit' ], any => 1],
|
|
},
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
},
|
|
},
|
|
# fixme: return a list instead of extjs tree format ?
|
|
returns => {
|
|
type => "object",
|
|
items => {
|
|
type => "object",
|
|
properties => {
|
|
flags => { type => "string" },
|
|
root => {
|
|
type => "object",
|
|
description => "Tree with OSDs in the CRUSH map structure.",
|
|
},
|
|
},
|
|
},
|
|
},
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
|
|
my $rados = PVE::RADOS->new();
|
|
my $res = $rados->mon_command({ prefix => 'osd df', output_method => 'tree', });
|
|
|
|
die "no tree nodes found\n" if !($res && $res->{nodes});
|
|
|
|
my ($osdhash, $flags) = $get_osd_status->($rados);
|
|
|
|
my $osd_usage = $get_osd_usage->($rados);
|
|
|
|
my $osdmetadata_res = $rados->mon_command({ prefix => 'osd metadata' });
|
|
my $osdmetadata = { map { $_->{id} => $_ } @$osdmetadata_res };
|
|
|
|
my $hostversions = PVE::Ceph::Services::get_ceph_versions();
|
|
|
|
my $nodes = {};
|
|
my $newnodes = {};
|
|
foreach my $e (@{$res->{nodes}}) {
|
|
my ($id, $name) = $e->@{qw(id name)};
|
|
|
|
$nodes->{$id} = $e;
|
|
|
|
my $new = {
|
|
id => $id,
|
|
name => $name,
|
|
type => $e->{type}
|
|
};
|
|
|
|
foreach my $opt (qw(status crush_weight reweight device_class pgs)) {
|
|
$new->{$opt} = $e->{$opt} if defined($e->{$opt});
|
|
}
|
|
|
|
if (my $stat = $osdhash->{$id}) {
|
|
$new->{in} = $stat->{in} if defined($stat->{in});
|
|
}
|
|
|
|
if (my $stat = $osd_usage->{$id}) {
|
|
$new->{total_space} = ($stat->{kb} || 1) * 1024;
|
|
$new->{bytes_used} = ($stat->{kb_used} || 0) * 1024;
|
|
$new->{percent_used} = ($new->{bytes_used}*100)/$new->{total_space};
|
|
if (my $d = $stat->{perf_stat}) {
|
|
$new->{commit_latency_ms} = $d->{commit_latency_ms};
|
|
$new->{apply_latency_ms} = $d->{apply_latency_ms};
|
|
}
|
|
}
|
|
|
|
my $osdmd = $osdmetadata->{$id};
|
|
if ($e->{type} eq 'osd' && $osdmd) {
|
|
if ($osdmd->{bluefs}) {
|
|
$new->{osdtype} = 'bluestore';
|
|
$new->{blfsdev} = $osdmd->{bluestore_bdev_dev_node};
|
|
$new->{dbdev} = $osdmd->{bluefs_db_dev_node};
|
|
$new->{waldev} = $osdmd->{bluefs_wal_dev_node};
|
|
} else {
|
|
$new->{osdtype} = 'filestore';
|
|
}
|
|
for my $field (qw(ceph_version ceph_version_short)) {
|
|
$new->{$field} = $osdmd->{$field} if $osdmd->{$field};
|
|
}
|
|
}
|
|
|
|
$newnodes->{$id} = $new;
|
|
}
|
|
|
|
foreach my $e (@{$res->{nodes}}) {
|
|
my ($id, $name) = $e->@{qw(id name)};
|
|
my $new = $newnodes->{$id};
|
|
|
|
if ($e->{children} && scalar(@{$e->{children}})) {
|
|
$new->{children} = [];
|
|
$new->{leaf} = 0;
|
|
foreach my $cid (@{$e->{children}}) {
|
|
$nodes->{$cid}->{parent} = $id;
|
|
if ($nodes->{$cid}->{type} eq 'osd' && $e->{type} eq 'host') {
|
|
$newnodes->{$cid}->{host} = $name;
|
|
}
|
|
push @{$new->{children}}, $newnodes->{$cid};
|
|
}
|
|
} else {
|
|
$new->{leaf} = ($id >= 0) ? 1 : 0;
|
|
}
|
|
|
|
if ($name && $e->{type} eq 'host') {
|
|
$new->{version} = $hostversions->{$name}->{version}->{str};
|
|
}
|
|
}
|
|
|
|
my $realroots = [];
|
|
foreach my $e (@{$res->{nodes}}) {
|
|
my $id = $e->{id};
|
|
if (!$nodes->{$id}->{parent}) {
|
|
push @$realroots, $newnodes->{$id};
|
|
}
|
|
}
|
|
|
|
die "no root node\n" if scalar(@$realroots) < 1;
|
|
|
|
my $data = {
|
|
root => {
|
|
leaf => 0,
|
|
children => $realroots
|
|
},
|
|
};
|
|
|
|
$data->{flags} = $flags if $flags; # we want this for the noout flag
|
|
|
|
return $data;
|
|
}});
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'createosd',
|
|
path => '',
|
|
method => 'POST',
|
|
description => "Create OSD",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
dev => {
|
|
description => "Block device name.",
|
|
type => 'string',
|
|
},
|
|
db_dev => {
|
|
description => "Block device name for block.db.",
|
|
optional => 1,
|
|
type => 'string',
|
|
},
|
|
db_dev_size => {
|
|
description => "Size in GiB for block.db.",
|
|
verbose_description => "If a block.db is requested but the size is not given, ".
|
|
"will be automatically selected by: bluestore_block_db_size from the ".
|
|
"ceph database (osd or global section) or config (osd or global section)".
|
|
"in that order. If this is not available, it will be sized 10% of the size ".
|
|
"of the OSD device. Fails if the available size is not enough.",
|
|
optional => 1,
|
|
type => 'number',
|
|
default => 'bluestore_block_db_size or 10% of OSD size',
|
|
requires => 'db_dev',
|
|
minimum => 1.0,
|
|
},
|
|
wal_dev => {
|
|
description => "Block device name for block.wal.",
|
|
optional => 1,
|
|
type => 'string',
|
|
},
|
|
wal_dev_size => {
|
|
description => "Size in GiB for block.wal.",
|
|
verbose_description => "If a block.wal is requested but the size is not given, ".
|
|
"will be automatically selected by: bluestore_block_wal_size from the ".
|
|
"ceph database (osd or global section) or config (osd or global section)".
|
|
"in that order. If this is not available, it will be sized 1% of the size ".
|
|
"of the OSD device. Fails if the available size is not enough.",
|
|
optional => 1,
|
|
minimum => 0.5,
|
|
default => 'bluestore_block_wal_size or 1% of OSD size',
|
|
requires => 'wal_dev',
|
|
type => 'number',
|
|
},
|
|
encrypted => {
|
|
type => 'boolean',
|
|
optional => 1,
|
|
default => 0,
|
|
description => "Enables encryption of the OSD."
|
|
},
|
|
'crush-device-class' => {
|
|
optional => 1,
|
|
type => 'string',
|
|
description => "Set the device class of the OSD in crush."
|
|
},
|
|
},
|
|
},
|
|
returns => { type => 'string' },
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
my $rpcenv = PVE::RPCEnvironment::get();
|
|
|
|
my $authuser = $rpcenv->get_user();
|
|
|
|
# test basic requirements
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
PVE::Ceph::Tools::setup_pve_symlinks();
|
|
PVE::Ceph::Tools::check_ceph_installed('ceph_osd');
|
|
PVE::Ceph::Tools::check_ceph_installed('ceph_volume');
|
|
|
|
# extract parameter info and fail if a device is set more than once
|
|
my $devs = {};
|
|
|
|
my $ceph_conf = cfs_read_file('ceph.conf');
|
|
|
|
my $osd_network = $ceph_conf->{global}->{cluster_network};
|
|
$osd_network //= $ceph_conf->{global}->{public_network}; # fallback
|
|
|
|
if ($osd_network) { # check only if something is configured
|
|
my $cluster_net_ips = PVE::Network::get_local_ip_from_cidr($osd_network);
|
|
if (scalar(@$cluster_net_ips) < 1) {
|
|
my $osd_net_obj = PVE::Network::IP_from_cidr($osd_network);
|
|
my $osd_base_cidr = $osd_net_obj->{ip} . "/" . $osd_net_obj->{prefixlen};
|
|
|
|
die "No address from ceph cluster network (${osd_base_cidr}) found on node '$nodename'. ".
|
|
"Check your network config.\n";
|
|
}
|
|
}
|
|
|
|
for my $type ( qw(dev db_dev wal_dev) ) {
|
|
next if !$param->{$type};
|
|
|
|
my $type_dev = PVE::Diskmanage::verify_blockdev_path($param->{$type});
|
|
(my $type_devname = $type_dev) =~ s|/dev/||;
|
|
|
|
raise_param_exc({ $type => "cannot chose '$type_dev' for more than one type." })
|
|
if grep { $_->{name} eq $type_devname } values %$devs;
|
|
|
|
$devs->{$type} = {
|
|
dev => $type_dev,
|
|
name => $type_devname,
|
|
};
|
|
|
|
if (my $size = $param->{"${type}_size"}) {
|
|
$devs->{$type}->{size} = PVE::Tools::convert_size($size, 'gb' => 'b') ;
|
|
}
|
|
}
|
|
|
|
my $test_disk_requirements = sub {
|
|
my ($disklist) = @_;
|
|
|
|
my $dev = $devs->{dev}->{dev};
|
|
my $devname = $devs->{dev}->{name};
|
|
die "unable to get device info for '$dev'\n" if !$disklist->{$devname};
|
|
die "device '$dev' is already in use\n" if $disklist->{$devname}->{used};
|
|
|
|
for my $type ( qw(db_dev wal_dev) ) {
|
|
my $d = $devs->{$type};
|
|
next if !$d;
|
|
my $name = $d->{name};
|
|
my $info = $disklist->{$name};
|
|
die "unable to get device info for '$d->{dev}' for type $type\n" if !$disklist->{$name};
|
|
if (my $usage = $info->{used}) {
|
|
if ($usage eq 'partitions') {
|
|
die "device '$d->{dev}' is not GPT partitioned\n" if !$info->{gpt};
|
|
} elsif ($usage ne 'LVM') {
|
|
die "device '$d->{dev}' is already in use and has no LVM on it\n";
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
# test disk requirements early
|
|
my $devlist = [ map { $_->{name} } values %$devs ];
|
|
my $disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
|
|
$test_disk_requirements->($disklist);
|
|
|
|
# get necessary ceph infos
|
|
my $rados = PVE::RADOS->new();
|
|
my $monstat = $rados->mon_command({ prefix => 'quorum_status' });
|
|
|
|
die "unable to get fsid\n" if !$monstat->{monmap} || !$monstat->{monmap}->{fsid};
|
|
my $fsid = $monstat->{monmap}->{fsid};
|
|
$fsid = $1 if $fsid =~ m/^([0-9a-f\-]+)$/;
|
|
|
|
my $ceph_bootstrap_osd_keyring = PVE::Ceph::Tools::get_config('ceph_bootstrap_osd_keyring');
|
|
|
|
if (! -f $ceph_bootstrap_osd_keyring && $ceph_conf->{global}->{auth_client_required} eq 'cephx') {
|
|
my $bindata = $rados->mon_command({
|
|
prefix => 'auth get-or-create',
|
|
entity => 'client.bootstrap-osd',
|
|
caps => [
|
|
'mon' => 'allow profile bootstrap-osd'
|
|
],
|
|
format => 'plain',
|
|
});
|
|
file_set_contents($ceph_bootstrap_osd_keyring, $bindata);
|
|
};
|
|
|
|
# See FIXME below
|
|
my @udev_trigger_devs = ();
|
|
|
|
my $create_part_or_lv = sub {
|
|
my ($dev, $size, $type) = @_;
|
|
|
|
$size =~ m/^(\d+)$/ or die "invalid size '$size'\n";
|
|
$size = $1;
|
|
|
|
die "'$dev->{devpath}' is smaller than requested size '$size' bytes\n"
|
|
if $dev->{size} < $size;
|
|
|
|
# sgdisk and lvcreate can only sizes divisible by 512b
|
|
# so we round down to the nearest kb
|
|
$size = PVE::Tools::convert_size($size, 'b' => 'kb', 1);
|
|
|
|
if (!$dev->{used}) {
|
|
# create pv,vg,lv
|
|
|
|
my $vg = "ceph-" . UUID::uuid();
|
|
my $lv = $type . "-" . UUID::uuid();
|
|
|
|
PVE::Storage::LVMPlugin::lvm_create_volume_group($dev->{devpath}, $vg);
|
|
PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");
|
|
|
|
if (PVE::Diskmanage::is_partition($dev->{devpath})) {
|
|
eval { PVE::Diskmanage::change_parttype($dev->{devpath}, '8E00'); };
|
|
warn $@ if $@;
|
|
}
|
|
|
|
push @udev_trigger_devs, $dev->{devpath};
|
|
|
|
return "$vg/$lv";
|
|
|
|
} elsif ($dev->{used} eq 'LVM') {
|
|
# check pv/vg and create lv
|
|
|
|
my $vgs = PVE::Storage::LVMPlugin::lvm_vgs(1);
|
|
my $vg;
|
|
for my $vgname ( sort keys %$vgs ) {
|
|
next if $vgname !~ /^ceph-/;
|
|
|
|
for my $pv ( @{$vgs->{$vgname}->{pvs}} ) {
|
|
next if $pv->{name} ne $dev->{devpath};
|
|
$vg = $vgname;
|
|
last;
|
|
}
|
|
last if $vg;
|
|
}
|
|
|
|
die "no ceph vg found on '$dev->{devpath}'\n" if !$vg;
|
|
die "vg '$vg' has not enough free space\n" if $vgs->{$vg}->{free} < $size;
|
|
|
|
my $lv = $type . "-" . UUID::uuid();
|
|
|
|
PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}k");
|
|
|
|
return "$vg/$lv";
|
|
|
|
} elsif ($dev->{used} eq 'partitions' && $dev->{gpt}) {
|
|
# create new partition at the end
|
|
my $parttypes = {
|
|
'osd-db' => '30CD0809-C2B2-499C-8879-2D6B78529876',
|
|
'osd-wal' => '5CE17FCE-4087-4169-B7FF-056CC58473F9',
|
|
};
|
|
|
|
my $part = PVE::Diskmanage::append_partition($dev->{devpath}, $size * 1024);
|
|
|
|
if (my $parttype = $parttypes->{$type}) {
|
|
eval { PVE::Diskmanage::change_parttype($part, $parttype); };
|
|
warn $@ if $@;
|
|
}
|
|
|
|
push @udev_trigger_devs, $part;
|
|
return $part;
|
|
}
|
|
|
|
die "cannot use '$dev->{devpath}' for '$type'\n";
|
|
};
|
|
|
|
my $worker = sub {
|
|
my $upid = shift;
|
|
|
|
PVE::Diskmanage::locked_disk_action(sub {
|
|
# update disklist and re-test requirements
|
|
$disklist = PVE::Diskmanage::get_disks($devlist, 1, 1);
|
|
$test_disk_requirements->($disklist);
|
|
|
|
my $dev_class = $param->{'crush-device-class'};
|
|
my $cmd = ['ceph-volume', 'lvm', 'create', '--cluster-fsid', $fsid ];
|
|
push @$cmd, '--crush-device-class', $dev_class if $dev_class;
|
|
|
|
my $devname = $devs->{dev}->{name};
|
|
my $devpath = $disklist->{$devname}->{devpath};
|
|
print "create OSD on $devpath (bluestore)\n";
|
|
|
|
push @udev_trigger_devs, $devpath;
|
|
|
|
my $osd_size = $disklist->{$devname}->{size};
|
|
my $size_map = {
|
|
db => int($osd_size / 10), # 10% of OSD
|
|
wal => int($osd_size / 100), # 1% of OSD
|
|
};
|
|
|
|
my $sizes;
|
|
foreach my $type ( qw(db wal) ) {
|
|
my $fallback_size = $size_map->{$type};
|
|
my $d = $devs->{"${type}_dev"};
|
|
next if !$d;
|
|
|
|
# size was not set via api, getting from config/fallback
|
|
if (!defined($d->{size})) {
|
|
$sizes = PVE::Ceph::Tools::get_db_wal_sizes() if !$sizes;
|
|
$d->{size} = $sizes->{$type} // $fallback_size;
|
|
}
|
|
print "creating block.$type on '$d->{dev}'\n";
|
|
my $name = $d->{name};
|
|
my $part_or_lv = $create_part_or_lv->($disklist->{$name}, $d->{size}, "osd-$type");
|
|
|
|
print "using '$part_or_lv' for block.$type\n";
|
|
push @$cmd, "--block.$type", $part_or_lv;
|
|
}
|
|
|
|
push @$cmd, '--data', $devpath;
|
|
push @$cmd, '--dmcrypt' if $param->{encrypted};
|
|
|
|
PVE::Diskmanage::wipe_blockdev($devpath);
|
|
|
|
if (PVE::Diskmanage::is_partition($devpath)) {
|
|
eval { PVE::Diskmanage::change_parttype($devpath, '8E00'); };
|
|
warn $@ if $@;
|
|
}
|
|
|
|
run_command($cmd);
|
|
|
|
# FIXME: Remove once we depend on systemd >= v249.
|
|
# Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
|
|
# udev database is updated.
|
|
eval { run_command(['udevadm', 'trigger', @udev_trigger_devs]); };
|
|
warn $@ if $@;
|
|
});
|
|
};
|
|
|
|
return $rpcenv->fork_worker('cephcreateosd', $devs->{dev}->{name}, $authuser, $worker);
|
|
}});
|
|
|
|
# Check if $osdid belongs to $nodename
|
|
# $tree ... rados osd tree (passing the tree makes it easy to test)
|
|
sub osd_belongs_to_node {
|
|
my ($tree, $nodename, $osdid) = @_;
|
|
return 0 if !($tree && $tree->{nodes});
|
|
|
|
my $node_map = {};
|
|
for my $el (grep { defined($_->{type}) && $_->{type} eq 'host' } @{$tree->{nodes}}) {
|
|
my $name = $el->{name};
|
|
die "internal error: duplicate host name found '$name'\n" if $node_map->{$name};
|
|
$node_map->{$name} = $el;
|
|
}
|
|
|
|
my $osds = $node_map->{$nodename}->{children};
|
|
return 0 if !$osds;
|
|
|
|
return grep($_ == $osdid, @$osds);
|
|
}
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'destroyosd',
|
|
path => '{osdid}',
|
|
method => 'DELETE',
|
|
description => "Destroy OSD",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
osdid => {
|
|
description => 'OSD ID',
|
|
type => 'integer',
|
|
},
|
|
cleanup => {
|
|
description => "If set, we remove partition table entries.",
|
|
type => 'boolean',
|
|
optional => 1,
|
|
default => 0,
|
|
},
|
|
},
|
|
},
|
|
returns => { type => 'string' },
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
my $rpcenv = PVE::RPCEnvironment::get();
|
|
|
|
my $authuser = $rpcenv->get_user();
|
|
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
|
|
my $osdid = $param->{osdid};
|
|
my $cleanup = $param->{cleanup};
|
|
|
|
my $rados = PVE::RADOS->new();
|
|
|
|
my $osd_belongs_to_node = osd_belongs_to_node(
|
|
$rados->mon_command({ prefix => 'osd tree' }),
|
|
$param->{node},
|
|
$osdid,
|
|
);
|
|
die "OSD osd.$osdid does not belong to node $param->{node}!"
|
|
if !$osd_belongs_to_node;
|
|
|
|
# dies if osdid is unknown
|
|
my $osdstat = $get_osd_status->($rados, $osdid);
|
|
|
|
die "osd is in use (in == 1)\n" if $osdstat->{in};
|
|
#&$run_ceph_cmd(['osd', 'out', $osdid]);
|
|
|
|
die "osd is still running (up == 1)\n" if $osdstat->{up};
|
|
|
|
my $osdsection = "osd.$osdid";
|
|
|
|
my $worker = sub {
|
|
my $upid = shift;
|
|
|
|
# reopen with longer timeout
|
|
$rados = PVE::RADOS->new(timeout => PVE::Ceph::Tools::get_config('long_rados_timeout'));
|
|
|
|
print "destroy OSD $osdsection\n";
|
|
|
|
eval {
|
|
PVE::Ceph::Services::ceph_service_cmd('stop', $osdsection);
|
|
PVE::Ceph::Services::ceph_service_cmd('disable', $osdsection);
|
|
};
|
|
warn $@ if $@;
|
|
|
|
print "Remove $osdsection from the CRUSH map\n";
|
|
$rados->mon_command({ prefix => "osd crush remove", name => $osdsection, format => 'plain' });
|
|
|
|
print "Remove the $osdsection authentication key.\n";
|
|
$rados->mon_command({ prefix => "auth del", entity => $osdsection, format => 'plain' });
|
|
|
|
print "Remove OSD $osdsection\n";
|
|
$rados->mon_command({ prefix => "osd rm", ids => [ $osdsection ], format => 'plain' });
|
|
|
|
# try to unmount from standard mount point
|
|
my $mountpoint = "/var/lib/ceph/osd/ceph-$osdid";
|
|
|
|
# See FIXME below
|
|
my $udev_trigger_devs = {};
|
|
|
|
my $remove_partition = sub {
|
|
my ($part) = @_;
|
|
|
|
return if !$part || (! -b $part );
|
|
my $partnum = PVE::Diskmanage::get_partnum($part);
|
|
my $devpath = PVE::Diskmanage::get_blockdev($part);
|
|
|
|
$udev_trigger_devs->{$devpath} = 1;
|
|
|
|
PVE::Diskmanage::wipe_blockdev($part);
|
|
print "remove partition $part (disk '${devpath}', partnum $partnum)\n";
|
|
eval { run_command(['/sbin/sgdisk', '-d', $partnum, "${devpath}"]); };
|
|
warn $@ if $@;
|
|
};
|
|
|
|
my $osd_list = PVE::Ceph::Tools::ceph_volume_list();
|
|
|
|
if ($osd_list->{$osdid}) { # ceph-volume managed
|
|
|
|
eval { PVE::Ceph::Tools::ceph_volume_zap($osdid, $cleanup) };
|
|
warn $@ if $@;
|
|
|
|
if ($cleanup) {
|
|
# try to remove pvs, but do not fail if it does not work
|
|
for my $osd_part (@{$osd_list->{$osdid}}) {
|
|
for my $dev (@{$osd_part->{devices}}) {
|
|
($dev) = ($dev =~ m|^(/dev/[-_.a-zA-Z0-9\/]+)$|); #untaint
|
|
|
|
eval { run_command(['/sbin/pvremove', $dev], errfunc => sub {}) };
|
|
warn $@ if $@;
|
|
|
|
$udev_trigger_devs->{$dev} = 1;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
my $partitions_to_remove = [];
|
|
if ($cleanup) {
|
|
if (my $mp = PVE::ProcFSTools::parse_proc_mounts()) {
|
|
foreach my $line (@$mp) {
|
|
my ($dev, $path, $fstype) = @$line;
|
|
next if !($dev && $path && $fstype);
|
|
next if $dev !~ m|^/dev/|;
|
|
|
|
if ($path eq $mountpoint) {
|
|
abs_path($dev) =~ m|^(/.+)| or die "invalid dev: $dev\n";
|
|
push @$partitions_to_remove, $1;
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
|
|
foreach my $path (qw(journal block block.db block.wal)) {
|
|
abs_path("$mountpoint/$path") =~ m|^(/.+)| or die "invalid path: $path\n";
|
|
push @$partitions_to_remove, $1;
|
|
}
|
|
}
|
|
|
|
print "Unmount OSD $osdsection from $mountpoint\n";
|
|
eval { run_command(['/bin/umount', $mountpoint]); };
|
|
if (my $err = $@) {
|
|
warn $err;
|
|
} elsif ($cleanup) {
|
|
#be aware of the ceph udev rules which can remount.
|
|
foreach my $part (@$partitions_to_remove) {
|
|
$remove_partition->($part);
|
|
}
|
|
}
|
|
}
|
|
|
|
# FIXME: Remove once we depend on systemd >= v249.
|
|
# Work around udev bug https://github.com/systemd/systemd/issues/18525 to ensure the
|
|
# udev database is updated.
|
|
if ($cleanup) {
|
|
eval { run_command(['udevadm', 'trigger', keys $udev_trigger_devs->%*]); };
|
|
warn $@ if $@;
|
|
}
|
|
};
|
|
|
|
return $rpcenv->fork_worker('cephdestroyosd', $osdsection, $authuser, $worker);
|
|
}});
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'in',
|
|
path => '{osdid}/in',
|
|
method => 'POST',
|
|
description => "ceph osd in",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
permissions => {
|
|
check => ['perm', '/', [ 'Sys.Modify' ]],
|
|
},
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
osdid => {
|
|
description => 'OSD ID',
|
|
type => 'integer',
|
|
},
|
|
},
|
|
},
|
|
returns => { type => "null" },
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
|
|
my $osdid = $param->{osdid};
|
|
|
|
my $rados = PVE::RADOS->new();
|
|
|
|
$get_osd_status->($rados, $osdid); # osd exists?
|
|
|
|
my $osdsection = "osd.$osdid";
|
|
|
|
$rados->mon_command({ prefix => "osd in", ids => [ $osdsection ], format => 'plain' });
|
|
|
|
return undef;
|
|
}});
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'out',
|
|
path => '{osdid}/out',
|
|
method => 'POST',
|
|
description => "ceph osd out",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
permissions => {
|
|
check => ['perm', '/', [ 'Sys.Modify' ]],
|
|
},
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
osdid => {
|
|
description => 'OSD ID',
|
|
type => 'integer',
|
|
},
|
|
},
|
|
},
|
|
returns => { type => "null" },
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
|
|
my $osdid = $param->{osdid};
|
|
|
|
my $rados = PVE::RADOS->new();
|
|
|
|
$get_osd_status->($rados, $osdid); # osd exists?
|
|
|
|
my $osdsection = "osd.$osdid";
|
|
|
|
$rados->mon_command({ prefix => "osd out", ids => [ $osdsection ], format => 'plain' });
|
|
|
|
return undef;
|
|
}});
|
|
|
|
__PACKAGE__->register_method ({
|
|
name => 'scrub',
|
|
path => '{osdid}/scrub',
|
|
method => 'POST',
|
|
description => "Instruct the OSD to scrub.",
|
|
proxyto => 'node',
|
|
protected => 1,
|
|
permissions => {
|
|
check => ['perm', '/', [ 'Sys.Modify' ]],
|
|
},
|
|
parameters => {
|
|
additionalProperties => 0,
|
|
properties => {
|
|
node => get_standard_option('pve-node'),
|
|
osdid => {
|
|
description => 'OSD ID',
|
|
type => 'integer',
|
|
},
|
|
deep => {
|
|
description => 'If set, instructs a deep scrub instead of a normal one.',
|
|
type => 'boolean',
|
|
optional => 1,
|
|
default => 0,
|
|
},
|
|
},
|
|
},
|
|
returns => { type => "null" },
|
|
code => sub {
|
|
my ($param) = @_;
|
|
|
|
PVE::Ceph::Tools::check_ceph_inited();
|
|
|
|
my $osdid = $param->{osdid};
|
|
my $deep = $param->{deep} // 0;
|
|
|
|
my $rados = PVE::RADOS->new();
|
|
|
|
$get_osd_status->($rados, $osdid); # osd exists?
|
|
|
|
my $prefix = $deep ? 'osd deep-scrub' : 'osd scrub';
|
|
$rados->mon_command({ prefix => $prefix, who => $osdid });
|
|
|
|
return undef;
|
|
}});
|
|
|
|
1;
|