mirror of https://git.proxmox.com/git/ceph.git (synced 2025-04-28 12:54:34 +00:00)

import ceph pacific 16.2.14 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>

parent b81a1d7f97
commit a2f5a7e755
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
 # remove cmake/modules/FindPython* once 3.12 is required

 project(ceph
-  VERSION 16.2.13
+  VERSION 16.2.14
   LANGUAGES CXX C ASM)

 foreach(policy
@@ -32,6 +32,17 @@
  in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
  the restored file system is expected to have the same ID as before.

>= 16.2.14
----------

* CEPHFS: After recovering a Ceph File System following the disaster recovery
  procedure, the recovered files under the `lost+found` directory can now be
  deleted.

* `ceph mgr dump` command now displays the name of the mgr module that
  registered a RADOS client in the `name` field added to elements of the
  `active_clients` array. Previously, only the address of a module's RADOS
  client was shown in the `active_clients` array.
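
  For illustration only (field names other than `name` are a hypothetical
  sketch and may differ in the actual output), an `active_clients` entry
  might now look like::

      { "name": "devicehealth", "addrvec": [...] }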

>=16.2.12
---------

@@ -62,6 +73,65 @@
  namespaces was added to RBD in Nautilus 14.2.0 and it has been possible to
  map and unmap images in namespaces using the `image-spec` syntax since then
  but the corresponding option available in most other commands was missing.
* RGW: Compression is now supported for objects uploaded with Server-Side
  Encryption. When both are enabled, compression is applied before encryption.
* RGW: the "pubsub" functionality for storing bucket notifications inside Ceph
  has been removed, and the "pubsub" zone should no longer be used. The REST
  operations and the radosgw-admin commands for manipulating subscriptions, as
  well as those for fetching and acking the notifications, have been removed.
  If the endpoint to which the notifications are sent may be down or
  disconnected, it is recommended to use persistent notifications to guarantee
  their delivery. If the system that consumes the notifications needs to pull
  them (instead of having the notifications pushed to it), an external message
  bus (e.g. RabbitMQ, Kafka) should be used for that purpose.
* RGW: The serialized format of notification and topics has changed, so that
  new/updated topics will be unreadable by old RGWs. We recommend completing
  the RGW upgrades before creating or modifying any notification topics.
|
||||
* RBD: Trailing newline in passphrase files (`<passphrase-file>` argument in
|
||||
`rbd encryption format` command and `--encryption-passphrase-file` option
|
||||
in other commands) is no longer stripped.
|
||||
* RBD: Support for layered client-side encryption is added. Cloned images
|
||||
can now be encrypted each with its own encryption format and passphrase,
|
||||
potentially different from that of the parent image. The efficient
|
||||
copy-on-write semantics intrinsic to unformatted (regular) cloned images
|
||||
are retained.
|
||||
* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to
|
||||
`client_max_retries_on_remount_failure` and move it from mds.yaml.in to
|
||||
mds-client.yaml.in because this option was only used by MDS client from its
|
||||
birth.
|
||||
* The `perf dump` and `perf schema` commands are deprecated in favor of new
|
||||
`counter dump` and `counter schema` commands. These new commands add support
|
||||
for labeled perf counters and also emit existing unlabeled perf counters. Some
|
||||
unlabeled perf counters became labeled in this release, with more to follow in
|
||||
future releases; such converted perf counters are no longer emitted by the
|
||||
`perf dump` and `perf schema` commands.
|
||||
* `ceph mgr dump` command now outputs `last_failure_osd_epoch` and
|
||||
`active_clients` fields at the top level. Previously, these fields were
|
||||
output under `always_on_modules` field.
|
||||
* RBD: All rbd-mirror daemon perf counters became labeled and as such are now
|
||||
emitted only by the new `counter dump` and `counter schema` commands. As part
|
||||
of the conversion, many also got renamed to better disambiguate journal-based
|
||||
and snapshot-based mirroring.
|
||||
* RBD: list-watchers C++ API (`Image::list_watchers`) now clears the passed
|
||||
`std::list` before potentially appending to it, aligning with the semantics
|
||||
of the corresponding C API (`rbd_watchers_list`).
|
||||
* Telemetry: Users who are opted-in to telemetry can also opt-in to
|
||||
participating in a leaderboard in the telemetry public
|
||||
dashboards (https://telemetry-public.ceph.com/). Users can now also add a
|
||||
description of the cluster to publicly appear in the leaderboard.
|
||||
For more details, see:
|
||||
https://docs.ceph.com/en/latest/mgr/telemetry/#leaderboard
|
||||
See a sample report with `ceph telemetry preview`.
|
||||
Opt-in to telemetry with `ceph telemetry on`.
|
||||
Opt-in to the leaderboard with
|
||||
`ceph config set mgr mgr/telemetry/leaderboard true`.
|
||||
Add leaderboard description with:
|
||||
`ceph config set mgr mgr/telemetry/leaderboard_description 'Cluster description'`.
|
||||
* CEPHFS: After recovering a Ceph File System following the disaster recovery
  procedure, the recovered files under the `lost+found` directory can now be deleted.
|
||||
* core: cache-tiering is now deprecated.
|
||||
|
||||
>=16.2.8
|
||||
--------
|
||||
|
@@ -135,7 +135,7 @@
 # main package definition
 #################################################################################
 Name: ceph
-Version: 16.2.13
+Version: 16.2.14
 Release: 0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch: 2
@@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
 Group: System/Filesystems
 %endif
 URL: http://ceph.com/
-Source0: %{?_remote_tarball_prefix}ceph-16.2.13.tar.bz2
+Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
 %if 0%{?suse_version}
 # _insert_obs_source_lines_here
 ExclusiveArch: x86_64 aarch64 ppc64le s390x
@@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-16.2.13
+%autosetup -p1 -n ceph-16.2.14

 %build
 # Disable lto on systems that do not support symver attribute
@@ -1,7 +1,13 @@
-ceph (16.2.13-1focal) focal; urgency=medium
+ceph (16.2.14-1focal) focal; urgency=medium

- -- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com> Mon, 08 May 2023 20:49:59 +0000
+ -- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com> Tue, 29 Aug 2023 16:38:35 +0000
+
+ceph (16.2.14-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.io> Tue, 29 Aug 2023 15:43:56 +0000

 ceph (16.2.13-1) stable; urgency=medium
@@ -1 +1,3 @@
+lib/systemd/system/cephfs-mirror*
 usr/bin/cephfs-mirror
+usr/share/man/man8/cephfs-mirror.8
@ -43,17 +43,17 @@ monitor hosts as well as to the monitor daemons' stderr.
|
||||
Ceph daemon logs
|
||||
================
|
||||
|
||||
Logging to journald
|
||||
-------------------
|
||||
Logging to stdout
|
||||
-----------------
|
||||
|
||||
Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph daemons log to
|
||||
journald by default and Ceph logs are captured by the container runtime
|
||||
environment. They are accessible via ``journalctl``.
|
||||
Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph
|
||||
daemons log to stderr by default and Ceph logs are captured by the
|
||||
container runtime environment. By default, most systems send these
|
||||
logs to journald, which means that they are accessible via
|
||||
``journalctl``.
|
||||
|
||||
.. note:: Prior to Quincy, ceph daemons logged to stderr.
|
||||
|
||||
Example of logging to journald
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Example of logging to stdout
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For example, to view the logs for the daemon ``mon.foo`` for a cluster
|
||||
with ID ``5c5a50ae-272a-455d-99e9-32c6a013e694``, the command would be
|
||||
@ -69,11 +69,11 @@ Logging to files
|
||||
----------------
|
||||
|
||||
You can also configure Ceph daemons to log to files instead of to
|
||||
journald if you prefer logs to appear in files (as they did in earlier,
|
||||
stderr if you prefer logs to appear in files (as they did in earlier,
|
||||
pre-cephadm, pre-Octopus versions of Ceph). When Ceph logs to files,
|
||||
the logs appear in ``/var/log/ceph/<cluster-fsid>``. If you choose to
|
||||
configure Ceph to log to files instead of to journald, remember to
|
||||
configure Ceph so that it will not log to journald (the commands for
|
||||
configure Ceph to log to files instead of to stderr, remember to
|
||||
configure Ceph so that it will not log to stderr (the commands for
|
||||
this are covered below).
|
||||
|
||||
Enabling logging to files
|
||||
@ -86,10 +86,10 @@ To enable logging to files, run the following commands:
|
||||
ceph config set global log_to_file true
|
||||
ceph config set global mon_cluster_log_to_file true
|
||||
|
||||
Disabling logging to journald
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Disabling logging to stderr
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you choose to log to files, we recommend disabling logging to journald or else
|
||||
If you choose to log to files, we recommend disabling logging to stderr or else
|
||||
everything will be logged twice. Run the following commands to disable logging
|
||||
to stderr:
|
||||
|
||||
@ -97,11 +97,6 @@ to stderr:
|
||||
|
||||
ceph config set global log_to_stderr false
|
||||
ceph config set global mon_cluster_log_to_stderr false
|
||||
ceph config set global log_to_journald false
|
||||
ceph config set global mon_cluster_log_to_journald false
|
||||
|
||||
.. note:: You can change the default by passing ``--log-to-file`` when
   bootstrapping a new cluster.
|
||||
|
||||
Modifying the log retention schedule
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -558,6 +558,7 @@ For example:
|
||||
Extra Entrypoint Arguments
|
||||
==========================
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
For arguments intended for the container runtime rather than the process inside
|
||||
@ -577,6 +578,57 @@ the node-exporter service , one could apply a service spec like
|
||||
extra_entrypoint_args:
|
||||
- "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2"
|
||||
|
||||
Custom Config Files
|
||||
===================
|
||||
|
||||
Cephadm supports specifying miscellaneous config files for daemons.
|
||||
To do so, users must provide both the content of the config file and the
|
||||
location within the daemon's container at which it should be mounted. After
|
||||
applying a YAML spec with custom config files specified and having cephadm
|
||||
redeploy the daemons for which the config files are specified, these files will
|
||||
be mounted within the daemon's container at the specified location.
|
||||
|
||||
Example service spec:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: grafana
|
||||
service_name: grafana
|
||||
custom_configs:
|
||||
- mount_path: /etc/example.conf
|
||||
content: |
|
||||
setting1 = value1
|
||||
setting2 = value2
|
||||
- mount_path: /usr/share/grafana/example.cert
|
||||
content: |
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt
|
||||
ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15
|
||||
IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu
|
||||
YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg
|
||||
ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8=
|
||||
-----END PRIVATE KEY-----
|
||||
-----BEGIN CERTIFICATE-----
|
||||
V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt
|
||||
ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15
|
||||
IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu
|
||||
YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg
|
||||
ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8=
|
||||
-----END CERTIFICATE-----
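
Assuming the spec above has been saved to a file (named, purely as an example,
``grafana-custom-config.yaml``), it can be applied with:

.. prompt:: bash

   ceph orch apply -i grafana-custom-config.yaml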
|
||||
|
||||
To make these new config files actually get mounted within the containers for
the daemons, redeploy the daemons by running a command of the following form:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
ceph orch redeploy <service-name>
|
||||
|
||||
For example:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
ceph orch redeploy grafana
|
||||
|
||||
.. _orch-rm:
|
||||
|
||||
Removing a Service
|
||||
|
@ -299,13 +299,16 @@ and the metrics will not be visible in Prometheus.
|
||||
Setting up Prometheus
|
||||
-----------------------
|
||||
|
||||
Setting Prometheus Retention Time
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Setting Prometheus Retention Size and Time
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Cephadm provides the option to set the Prometheus TDSB retention time using
|
||||
a ``retention_time`` field in the Prometheus service spec. The value defaults
|
||||
to 15 days (15d). If you would like a different value, such as 1 year (1y) you
|
||||
can apply a service spec similar to:
|
||||
Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
|
||||
and ``retention_size`` values in the Prometheus service spec.
|
||||
The retention time value defaults to 15 days (15d). Users can set a different value/unit where
|
||||
supported units are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults
|
||||
to 0 (disabled). Supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
|
||||
|
||||
In the following example spec we set the retention time to 1 year and the size to 1GB.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
@ -314,6 +317,7 @@ can apply a service spec similar to:
|
||||
count: 1
|
||||
spec:
|
||||
retention_time: "1y"
|
||||
retention_size: "1GB"
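
As with any other service spec, the file holding this spec (named
``prometheus.yaml`` here only for illustration) is applied with ``ceph orch
apply``:

.. prompt:: bash #

   ceph orch apply -i prometheus.yaml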
|
||||
|
||||
.. note::
|
||||
|
||||
|
@ -308,7 +308,7 @@ Replacing an OSD
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
orch osd rm <osd_id(s)> --replace [--force]
|
||||
ceph orch osd rm <osd_id(s)> --replace [--force]
|
||||
|
||||
Example:
|
||||
|
||||
|
@ -14,6 +14,8 @@ Requirements
|
||||
|
||||
The primary (local) and secondary (remote) Ceph clusters must be version Pacific or later.
|
||||
|
||||
.. _cephfs_mirroring_creating_users:
|
||||
|
||||
Creating Users
|
||||
--------------
|
||||
|
||||
@ -42,80 +44,155 @@ Mirror daemon should be spawned using `systemctl(1)` unit files::
|
||||
|
||||
$ cephfs-mirror --id mirror --cluster site-a -f
|
||||
|
||||
.. note:: User used here is `mirror` created in the `Creating Users` section.
|
||||
.. note:: The user specified here is `mirror`, the creation of which is
|
||||
described in the :ref:`Creating Users<cephfs_mirroring_creating_users>`
|
||||
section.
|
||||
|
||||
Multiple ``cephfs-mirror`` daemons may be deployed for concurrent
|
||||
synchronization and high availability. Mirror daemons share the synchronization
|
||||
load using a simple ``M/N`` policy, where ``M`` is the number of directories
|
||||
and ``N`` is the number of ``cephfs-mirror`` daemons.
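
For example, with 12 mirrored directories and 3 ``cephfs-mirror`` daemons,
each daemon is assigned roughly 4 directories.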
|
||||
|
||||
When ``cephadm`` is used to manage a Ceph cluster, ``cephfs-mirror`` daemons can be
|
||||
deployed by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph orch apply cephfs-mirror
|
||||
|
||||
To deploy multiple mirror daemons, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph orch apply cephfs-mirror --placement=<placement-spec>
|
||||
|
||||
For example, to deploy 3 `cephfs-mirror` daemons on different hosts, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph orch apply cephfs-mirror --placement="3 host1,host2,host3"
|
||||
|
||||
Interface
|
||||
---------
|
||||
|
||||
`Mirroring` module (manager plugin) provides interfaces for managing directory snapshot
|
||||
mirroring. Manager interfaces are (mostly) wrappers around monitor commands for managing
|
||||
file system mirroring and is the recommended control interface.
|
||||
The `Mirroring` module (manager plugin) provides interfaces for managing
directory snapshot mirroring. These are (mostly) wrappers around monitor
commands for managing file system mirroring, and they are the recommended
control interface.
|
||||
|
||||
Mirroring Module
|
||||
----------------
|
||||
|
||||
The mirroring module is responsible for assigning directories to mirror daemons for
|
||||
synchronization. Multiple mirror daemons can be spawned to achieve concurrency in
|
||||
directory snapshot synchronization. When mirror daemons are spawned (or terminated)
|
||||
, the mirroring module discovers the modified set of mirror daemons and rebalances
|
||||
the directory assignment amongst the new set thus providing high-availability.
|
||||
The mirroring module is responsible for assigning directories to mirror daemons
|
||||
for synchronization. Multiple mirror daemons can be spawned to achieve
|
||||
concurrency in directory snapshot synchronization. When mirror daemons are
|
||||
spawned (or terminated), the mirroring module discovers the modified set of
|
||||
mirror daemons and rebalances directory assignments across the new set, thus
|
||||
providing high-availability.
|
||||
|
||||
.. note:: Multiple mirror daemons is currently untested. Only a single mirror daemon
|
||||
is recommended.
|
||||
.. note:: Deploying a single mirror daemon is recommended. Running multiple
|
||||
daemons is untested.
|
||||
|
||||
Mirroring module is disabled by default. To enable mirroring use::
|
||||
The mirroring module is disabled by default. To enable the mirroring module,
|
||||
run the following command:
|
||||
|
||||
$ ceph mgr module enable mirroring
|
||||
.. prompt:: bash $
|
||||
|
||||
Mirroring module provides a family of commands to control mirroring of directory
|
||||
snapshots. To add or remove directories, mirroring needs to be enabled for a given
|
||||
file system. To enable mirroring use::
|
||||
ceph mgr module enable mirroring
|
||||
|
||||
$ ceph fs snapshot mirror enable <fs_name>
|
||||
The mirroring module provides a family of commands that can be used to control
|
||||
the mirroring of directory snapshots. To add or remove directories, mirroring
|
||||
must be enabled for a given file system. To enable mirroring for a given file
|
||||
system, run a command of the following form:
|
||||
|
||||
.. note:: Mirroring module commands use `fs snapshot mirror` prefix as compared to
|
||||
the monitor commands which `fs mirror` prefix. Make sure to use module
|
||||
commands.
|
||||
.. prompt:: bash $
|
||||
|
||||
To disable mirroring, use::
|
||||
ceph fs snapshot mirror enable <fs_name>
|
||||
|
||||
$ ceph fs snapshot mirror disable <fs_name>
|
||||
.. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``.
|
||||
This distinguishes them from "monitor commands", which are prefixed with ``fs
|
||||
mirror``. Be sure (in this context) to use module commands.
|
||||
|
||||
Once mirroring is enabled, add a peer to which directory snapshots are to be mirrored.
|
||||
Peers follow `<client>@<cluster>` specification and get assigned a unique-id (UUID)
|
||||
when added. See `Creating Users` section on how to create Ceph users for mirroring.
|
||||
To disable mirroring for a given file system, run a command of the following form:
|
||||
|
||||
To add a peer use::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs snapshot mirror peer_add <fs_name> <remote_cluster_spec> [<remote_fs_name>] [<remote_mon_host>] [<cephx_key>]
|
||||
ceph fs snapshot mirror disable <fs_name>
|
||||
|
||||
`<remote_fs_name>` is optional, and defaults to `<fs_name>` (on the remote cluster).
|
||||
After mirroring is enabled, add a peer to which directory snapshots are to be
|
||||
mirrored. Peers are specified by the ``<client>@<cluster>`` format, which is
|
||||
referred to elsewhere in this document as the ``remote_cluster_spec``. Peers
|
||||
are assigned a unique-id (UUID) when added. See the :ref:`Creating
|
||||
Users<cephfs_mirroring_creating_users>` section for instructions that describe
|
||||
how to create Ceph users for mirroring.
|
||||
|
||||
This requires the remote cluster ceph configuration and user keyring to be available in
|
||||
the primary cluster. See `Bootstrap Peers` section to avoid this. `peer_add` additionally
|
||||
supports passing the remote cluster monitor address and the user key. However, bootstrapping
|
||||
a peer is the recommended way to add a peer.
|
||||
To add a peer, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs snapshot mirror peer_add <fs_name> <remote_cluster_spec> [<remote_fs_name>] [<remote_mon_host>] [<cephx_key>]
|
||||
|
||||
``<remote_cluster_spec>`` is of the format ``client.<id>@<cluster_name>``.
|
||||
|
||||
``<remote_fs_name>`` is optional, and defaults to `<fs_name>` (on the remote
|
||||
cluster).
|
||||
|
||||
For this command to succeed, the remote cluster's Ceph configuration and user
|
||||
keyring must be available in the primary cluster. For example, if a user named
|
||||
``client.mirror_remote`` is created on the remote cluster which has ``rwps``
|
||||
permissions for the remote file system named ``remote_fs`` (see `Creating
|
||||
Users`) and the remote cluster is named ``remote_ceph`` (that is, the remote
|
||||
cluster configuration file is named ``remote_ceph.conf`` on the primary
|
||||
cluster), run the following command to add the remote filesystem as a peer to
|
||||
the primary filesystem ``primary_fs``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs snapshot mirror peer_add primary_fs client.mirror_remote@remote_ceph remote_fs
|
||||
|
||||
To avoid having to maintain the remote cluster configuration file and remote
|
||||
ceph user keyring in the primary cluster, users can bootstrap a peer (which
|
||||
stores the relevant remote cluster details in the monitor config store on the
|
||||
primary cluster). See the :ref:`Bootstrap
|
||||
Peers<cephfs_mirroring_bootstrap_peers>` section.
|
||||
|
||||
The ``peer_add`` command supports passing the remote cluster monitor address
|
||||
and the user key. However, bootstrapping a peer is the recommended way to add a
|
||||
peer.
|
||||
|
||||
.. note:: Only a single peer is supported right now.
|
||||
|
||||
To remove a peer use::
|
||||
To remove a peer, run a command of the following form:
|
||||
|
||||
$ ceph fs snapshot mirror peer_remove <fs_name> <peer_uuid>
|
||||
.. prompt:: bash $
|
||||
|
||||
To list file system mirror peers use::
|
||||
ceph fs snapshot mirror peer_remove <fs_name> <peer_uuid>
|
||||
|
||||
$ ceph fs snapshot mirror peer_list <fs_name>
|
||||
To list file system mirror peers, run a command of the following form:
|
||||
|
||||
To configure a directory for mirroring, use::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs snapshot mirror add <fs_name> <path>
|
||||
ceph fs snapshot mirror peer_list <fs_name>
|
||||
|
||||
To stop a mirroring directory snapshots use::
|
||||
To configure a directory for mirroring, run a command of the following form:
|
||||
|
||||
$ ceph fs snapshot mirror remove <fs_name> <path>
|
||||
.. prompt:: bash $
|
||||
|
||||
Only absolute directory paths are allowed. Also, paths are normalized by the mirroring
|
||||
module, therfore, `/a/b/../b` is equivalent to `/a/b`.
|
||||
ceph fs snapshot mirror add <fs_name> <path>
|
||||
|
||||
To stop mirroring directory snapshots, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs snapshot mirror remove <fs_name> <path>
|
||||
|
||||
Only absolute directory paths are allowed.
|
||||
|
||||
Paths are normalized by the mirroring module. This means that ``/a/b/../b`` is
|
||||
equivalent to ``/a/b``. Paths always start from the CephFS file-system root and
|
||||
not from the host system mount point.
|
||||
|
||||
For example::
|
||||
|
||||
$ mkdir -p /d0/d1/d2
|
||||
$ ceph fs snapshot mirror add cephfs /d0/d1/d2
|
||||
@ -123,16 +200,19 @@ module, therfore, `/a/b/../b` is equivalent to `/a/b`.
|
||||
$ ceph fs snapshot mirror add cephfs /d0/d1/../d1/d2
|
||||
Error EEXIST: directory /d0/d1/d2 is already tracked
|
||||
|
||||
Once a directory is added for mirroring, its subdirectory or ancestor directories are
|
||||
disallowed to be added for mirorring::
|
||||
After a directory is added for mirroring, the additional mirroring of
|
||||
subdirectories or ancestor directories is disallowed::
|
||||
|
||||
$ ceph fs snapshot mirror add cephfs /d0/d1
|
||||
Error EINVAL: /d0/d1 is a ancestor of tracked path /d0/d1/d2
|
||||
$ ceph fs snapshot mirror add cephfs /d0/d1/d2/d3
|
||||
Error EINVAL: /d0/d1/d2/d3 is a subtree of tracked path /d0/d1/d2
|
||||
|
||||
Commands to check directory mapping (to mirror daemons) and directory distribution are
|
||||
detailed in `Mirroring Status` section.
|
||||
The :ref:`Mirroring Status<cephfs_mirroring_mirroring_status>` section contains
|
||||
information about the commands for checking the directory mapping (to mirror
|
||||
daemons) and for checking the directory distribution.
|
||||
|
||||
.. _cephfs_mirroring_bootstrap_peers:
|
||||
|
||||
Bootstrap Peers
|
||||
---------------
|
||||
@ -160,6 +240,9 @@ e.g.::
|
||||
|
||||
$ ceph fs snapshot mirror peer_bootstrap import cephfs eyJmc2lkIjogIjBkZjE3MjE3LWRmY2QtNDAzMC05MDc5LTM2Nzk4NTVkNDJlZiIsICJmaWxlc3lzdGVtIjogImJhY2t1cF9mcyIsICJ1c2VyIjogImNsaWVudC5taXJyb3JfcGVlcl9ib290c3RyYXAiLCAic2l0ZV9uYW1lIjogInNpdGUtcmVtb3RlIiwgImtleSI6ICJBUUFhcDBCZ0xtRmpOeEFBVnNyZXozai9YYUV0T2UrbUJEZlJDZz09IiwgIm1vbl9ob3N0IjogIlt2MjoxOTIuMTY4LjAuNTo0MDkxOCx2MToxOTIuMTY4LjAuNTo0MDkxOV0ifQ==
|
||||
|
||||
|
||||
.. _cephfs_mirroring_mirroring_status:
|
||||
|
||||
Mirroring Status
|
||||
----------------
|
||||
|
||||
|
@ -78,7 +78,15 @@ By default, `cephfs-top` connects to cluster name `ceph`. To use a non-default c
|
||||
|
||||
$ cephfs-top -d <seconds>
|
||||
|
||||
Interval should be greater or equal to 0.5 second. Fractional seconds are honoured.
|
||||
Refresh interval should be a positive integer.
|
||||
|
||||
To dump the metrics to stdout without creating a curses display use::
|
||||
|
||||
$ cephfs-top --dump
|
||||
|
||||
To dump the metrics of the given filesystem to stdout without creating a curses display use::
|
||||
|
||||
$ cephfs-top --dumpfs <fs_name>
|
||||
|
||||
Interactive Commands
|
||||
--------------------
|
||||
@ -104,3 +112,5 @@ The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End an
|
||||
Sample screenshot running `cephfs-top` with 2 filesystems:
|
||||
|
||||
.. image:: cephfs-top.png
|
||||
|
||||
.. note:: Minimum compatible python version for cephfs-top is 3.6.0. cephfs-top is supported on distros RHEL 8, Ubuntu 18.04, CentOS 8 and above.
|
||||
|
@ -149,8 +149,8 @@ errors.
|
||||
|
||||
::
|
||||
|
||||
cephfs-data-scan scan_extents <data pool>
|
||||
cephfs-data-scan scan_inodes <data pool>
|
||||
cephfs-data-scan scan_extents [<data pool> [<extra data pool> ...]]
|
||||
cephfs-data-scan scan_inodes [<data pool>]
|
||||
cephfs-data-scan scan_links
|
||||
|
||||
'scan_extents' and 'scan_inodes' commands may take a *very long* time
|
||||
@ -166,22 +166,22 @@ The example below shows how to run 4 workers simultaneously:
|
||||
::
|
||||
|
||||
# Worker 0
|
||||
cephfs-data-scan scan_extents --worker_n 0 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_extents --worker_n 0 --worker_m 4
|
||||
# Worker 1
|
||||
cephfs-data-scan scan_extents --worker_n 1 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_extents --worker_n 1 --worker_m 4
|
||||
# Worker 2
|
||||
cephfs-data-scan scan_extents --worker_n 2 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_extents --worker_n 2 --worker_m 4
|
||||
# Worker 3
|
||||
cephfs-data-scan scan_extents --worker_n 3 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_extents --worker_n 3 --worker_m 4
|
||||
|
||||
# Worker 0
|
||||
cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4
|
||||
# Worker 1
|
||||
cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4
|
||||
# Worker 2
|
||||
cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4
|
||||
# Worker 3
|
||||
cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4 <data pool>
|
||||
cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4
|
||||
|
||||
It is **important** to ensure that all workers have completed the
|
||||
scan_extents phase before any workers enter the scan_inodes phase.
|
||||
@ -191,8 +191,13 @@ operation to delete ancillary data geneated during recovery.
|
||||
|
||||
::
|
||||
|
||||
cephfs-data-scan cleanup <data pool>
|
||||
cephfs-data-scan cleanup [<data pool>]
|
||||
|
||||
Note that the data pool parameters for the 'scan_extents', 'scan_inodes' and
'cleanup' commands are optional: the tool can usually detect the pools
automatically, but you may still override this. The 'scan_extents' command
needs all data pools to be specified, while the 'scan_inodes' and 'cleanup'
commands need only the main data pool.
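
For example, assuming a main data pool named ``cephfs_data`` and an additional
data pool named ``cephfs_data_extra`` (both names are hypothetical), an
invocation that overrides the automatic detection would look like::

    cephfs-data-scan scan_extents cephfs_data cephfs_data_extra
    cephfs-data-scan scan_inodes cephfs_data
    cephfs-data-scan cleanup cephfs_data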
|
||||
|
||||
|
||||
Using an alternate metadata pool for recovery
|
||||
@ -250,8 +255,8 @@ Now perform the recovery of the metadata pool from the data pool:
|
||||
::
|
||||
|
||||
cephfs-data-scan init --force-init --filesystem cephfs_recovery --alternate-pool cephfs_recovery_meta
|
||||
cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem <fs_name> <data_pool>
|
||||
cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem <fs_name> --force-corrupt <data_pool>
|
||||
cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem <fs_name>
|
||||
cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem <fs_name> --force-corrupt
|
||||
cephfs-data-scan scan_links --filesystem cephfs_recovery
|
||||
|
||||
.. note::
|
||||
|
@ -3,23 +3,22 @@
|
||||
FS volumes and subvolumes
|
||||
=========================
|
||||
|
||||
The volumes
|
||||
module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a single
|
||||
source of truth for CephFS exports. The OpenStack shared
|
||||
file system service (manila_) and Ceph Container Storage Interface (CSI_)
|
||||
storage administrators among others can use the common CLI provided by the
|
||||
ceph-mgr volumes module to manage CephFS exports.
|
||||
The volumes module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a
|
||||
single source of truth for CephFS exports. The OpenStack shared file system
|
||||
service (manila_) and the Ceph Container Storage Interface (CSI_) storage
|
||||
administrators use the common CLI provided by the ceph-mgr ``volumes`` module
|
||||
to manage CephFS exports.
|
||||
|
||||
The ceph-mgr volumes module implements the following file system export
|
||||
abstactions:
|
||||
The ceph-mgr ``volumes`` module implements the following file system export
|
||||
abstractions:
|
||||
|
||||
* FS volumes, an abstraction for CephFS file systems
|
||||
|
||||
* FS subvolumes, an abstraction for independent CephFS directory trees
|
||||
|
||||
* FS subvolume groups, an abstraction for a directory level higher than FS
|
||||
subvolumes to effect policies (e.g., :doc:`/cephfs/file-layouts`) across a
|
||||
set of subvolumes
|
||||
subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`)
|
||||
across a set of subvolumes
|
||||
|
||||
Some possible use-cases for the export abstractions:
|
||||
|
||||
@ -38,67 +37,76 @@ Requirements
|
||||
mon 'allow r'
|
||||
mgr 'allow rw'
|
||||
|
||||
|
||||
FS Volumes
|
||||
----------
|
||||
|
||||
Create a volume using::
|
||||
Create a volume by running the following command:
|
||||
|
||||
$ ceph fs volume create <vol_name> [<placement>]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs volume create <vol_name> [<placement>]
|
||||
|
||||
This creates a CephFS file system and its data and metadata pools. It can also
|
||||
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator
|
||||
module (see :doc:`/mgr/orchestrator`), for example Rook.
|
||||
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator module (for
|
||||
example Rook). See :doc:`/mgr/orchestrator`.
|
||||
|
||||
<vol_name> is the volume name (an arbitrary string), and
|
||||
<placement> is an optional string that designates the hosts that should have
|
||||
an MDS running on them and, optionally, the total number of MDS daemons the cluster
|
||||
should have. For example, the
|
||||
following placement string means "deploy MDS on nodes ``host1`` and ``host2`` (one
|
||||
MDS per host):
|
||||
``<vol_name>`` is the volume name (an arbitrary string). ``<placement>`` is an
|
||||
optional string that specifies the hosts that should have an MDS running on
|
||||
them and, optionally, the total number of MDS daemons that the cluster should
|
||||
have. For example, the following placement string means "deploy MDS on nodes
|
||||
``host1`` and ``host2`` (one MDS per host)"::
|
||||
|
||||
"host1,host2"
|
||||
|
||||
and this placement specification says to deploy two MDS daemons on each of
|
||||
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the cluster):
|
||||
The following placement specification means "deploy two MDS daemons on each of
|
||||
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the
|
||||
cluster)"::
|
||||
|
||||
"4 host1,host2"
|
||||
|
||||
For more details on placement specification refer to the :ref:`orchestrator-cli-service-spec`,
|
||||
but keep in mind that specifying placement via a YAML file is not supported.
|
||||
See :ref:`orchestrator-cli-service-spec` for more on placement specification.
|
||||
Specifying placement via a YAML file is not supported.
|
||||
|
||||
To remove a volume, run the following command::
|
||||
To remove a volume, run the following command:
|
||||
|
||||
$ ceph fs volume rm <vol_name> [--yes-i-really-mean-it]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs volume rm <vol_name> [--yes-i-really-mean-it]
|
||||
|
||||
This removes a file system and its data and metadata pools. It also tries to
|
||||
remove MDS daemons using the enabled ceph-mgr orchestrator module.
|
||||
|
||||
List volumes using::
|
||||
List volumes by running the following command:
|
||||
|
||||
$ ceph fs volume ls
|
||||
.. prompt:: bash $
|
||||
|
||||
Rename a volume using::
|
||||
ceph fs volume ls
|
||||
|
||||
$ ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]
|
||||
Rename a volume by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]
|
||||
|
||||
Renaming a volume can be an expensive operation that requires the following:
|
||||
|
||||
- Rename the orchestrator-managed MDS service to match the <new_vol_name>.
|
||||
This involves launching a MDS service with <new_vol_name> and bringing down
|
||||
the MDS service with <vol_name>.
|
||||
- Rename the file system matching <vol_name> to <new_vol_name>
|
||||
- Change the application tags on the data and metadata pools of the file system
|
||||
to <new_vol_name>
|
||||
- Rename the metadata and data pools of the file system.
|
||||
- Renaming the orchestrator-managed MDS service to match the <new_vol_name>.
|
||||
This involves launching a MDS service with ``<new_vol_name>`` and bringing
|
||||
down the MDS service with ``<vol_name>``.
|
||||
- Renaming the file system matching ``<vol_name>`` to ``<new_vol_name>``.
|
||||
- Changing the application tags on the data and metadata pools of the file system
|
||||
to ``<new_vol_name>``.
|
||||
- Renaming the metadata and data pools of the file system.
|
||||
|
||||
The CephX IDs authorized for <vol_name> need to be reauthorized for <new_vol_name>. Any
|
||||
on-going operations of the clients using these IDs may be disrupted. Mirroring is
|
||||
expected to be disabled on the volume.
|
||||
The CephX IDs that are authorized for ``<vol_name>`` must be reauthorized for
|
||||
``<new_vol_name>``. Any ongoing operations of the clients using these IDs may
|
||||
be disrupted. Ensure that mirroring is disabled on the volume.
|
||||
|
||||
To fetch the information of a CephFS volume, run::
|
||||
To fetch the information of a CephFS volume, run the following command:
|
||||
|
||||
$ ceph fs volume info vol_name [--human_readable]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs volume info vol_name [--human_readable]
|
||||
|
||||
The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB.
|
||||
|
||||
@ -142,9 +150,11 @@ Sample output of the ``volume info`` command::
|
||||
FS Subvolume groups
|
||||
-------------------
|
||||
|
||||
Create a subvolume group using::
|
||||
Create a subvolume group by running the following command:
|
||||
|
||||
$ ceph fs subvolumegroup create <vol_name> <group_name> [--size <size_in_bytes>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup create <vol_name> <group_name> [--size <size_in_bytes>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
|
||||
|
||||
The command succeeds even if the subvolume group already exists.
|
||||
|
||||
@ -152,32 +162,41 @@ When creating a subvolume group you can specify its data pool layout (see
|
||||
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and
|
||||
size in bytes. The size of the subvolume group is specified by setting
|
||||
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
|
||||
is created with octal file mode '755', uid '0', gid '0' and the data pool
|
||||
is created with octal file mode ``755``, uid ``0``, gid ``0`` and the data pool
|
||||
layout of its parent directory.
|
||||
|
||||
Remove a subvolume group by running a command of the following form:
|
||||
|
||||
Remove a subvolume group using::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs subvolumegroup rm <vol_name> <group_name> [--force]
|
||||
ceph fs subvolumegroup rm <vol_name> <group_name> [--force]
|
||||
|
||||
The removal of a subvolume group fails if it is not empty or non-existent.
|
||||
'--force' flag allows the non-existent subvolume group remove command to succeed.
|
||||
The removal of a subvolume group fails if the subvolume group is not empty or
does not exist. The ``--force`` flag allows the command to succeed even if the
subvolume group does not exist.
|
||||
|
||||
|
||||
Fetch the absolute path of a subvolume group using::
|
||||
Fetch the absolute path of a subvolume group by running a command of the
|
||||
following form:
|
||||
|
||||
$ ceph fs subvolumegroup getpath <vol_name> <group_name>
|
||||
.. prompt:: bash $
|
||||
|
||||
List subvolume groups using::
|
||||
ceph fs subvolumegroup getpath <vol_name> <group_name>
|
||||
|
||||
$ ceph fs subvolumegroup ls <vol_name>
|
||||
List subvolume groups by running a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup ls <vol_name>
|
||||
|
||||
.. note:: Subvolume group snapshot feature is no longer supported in mainline CephFS (existing group
|
||||
snapshots can still be listed and deleted)
|
||||
|
||||
Fetch the metadata of a subvolume group using::
|
||||
Fetch the metadata of a subvolume group by running a command of the following form:
|
||||
|
||||
$ ceph fs subvolumegroup info <vol_name> <group_name>
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup info <vol_name> <group_name>
|
||||
|
||||
The output format is JSON and contains fields as follows:
|
||||
|
||||
@ -194,62 +213,77 @@ The output format is JSON and contains fields as follows:
|
||||
* ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ``data_pool``: data pool to which the subvolume group belongs
|
||||
|
||||
Check the presence of any subvolume group using::
|
||||
Check the presence of any subvolume group by running a command of the following form:
|
||||
|
||||
$ ceph fs subvolumegroup exist <vol_name>
|
||||
.. prompt:: bash $
|
||||
|
||||
The 'exist' command outputs:
|
||||
ceph fs subvolumegroup exist <vol_name>
|
||||
|
||||
The ``exist`` command outputs:
|
||||
|
||||
* "subvolumegroup exists": if any subvolumegroup is present
|
||||
* "no subvolumegroup exists": if no subvolumegroup is present
|
||||
|
||||
.. note:: This command checks for the presence of custom groups and not presence of the default one. To validate the emptiness of the volume, a subvolumegroup existence check alone is not sufficient. Subvolume existence also needs to be checked as there might be subvolumes in the default group.
|
||||
.. note:: This command checks for the presence of custom groups and not
|
||||
presence of the default one. To validate the emptiness of the volume, a
|
||||
subvolumegroup existence check alone is not sufficient. Subvolume existence
|
||||
also needs to be checked as there might be subvolumes in the default group.
|
||||
|
||||
Resize a subvolume group using::
|
||||
Resize a subvolume group by running a command of the following form:
|
||||
|
||||
$ ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
|
||||
.. prompt:: bash $
|
||||
|
||||
The command resizes the subvolume group quota using the size specified by ``new_size``.
|
||||
The ``--no_shrink`` flag prevents the subvolume group from shrinking below the current used
|
||||
size.
|
||||
ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
|
||||
|
||||
The subvolume group may be resized to an infinite size by passing ``inf`` or ``infinite``
|
||||
as the ``new_size``.
|
||||
The command resizes the subvolume group quota, using the size specified by
|
||||
``new_size``. The ``--no_shrink`` flag prevents the subvolume group from
|
||||
shrinking below the current used size.
|
||||
|
||||
Remove a snapshot of a subvolume group using::
|
||||
The subvolume group may be resized to an infinite size by passing ``inf`` or
|
||||
``infinite`` as the ``new_size``.
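
For example, to remove the size limit of a hypothetical subvolume group
``group_a`` in volume ``vol_a``:

.. prompt:: bash $

   ceph fs subvolumegroup resize vol_a group_a inf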
|
||||
|
||||
$ ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
|
||||
Remove a snapshot of a subvolume group by running a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
|
||||
|
||||
Supplying the ``--force`` flag allows the command to succeed when it would otherwise
|
||||
fail due to the snapshot not existing.
|
||||
fail due to the nonexistence of the snapshot.
|
||||
|
||||
List snapshots of a subvolume group using::
|
||||
List snapshots of a subvolume group by running a command of the following form:
|
||||
|
||||
$ ceph fs subvolumegroup snapshot ls <vol_name> <group_name>
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup snapshot ls <vol_name> <group_name>
|
||||
|
||||
|
||||
FS Subvolumes
|
||||
-------------
|
||||
|
||||
Create a subvolume using::
|
||||
Create a subvolume using:
|
||||
|
||||
$ ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
|
||||
|
||||
|
||||
The command succeeds even if the subvolume already exists.
|
||||
|
||||
When creating a subvolume you can specify its subvolume group, data pool layout,
|
||||
uid, gid, file mode in octal numerals, and size in bytes. The size of the subvolume is
|
||||
specified by setting a quota on it (see :doc:`/cephfs/quota`). The subvolume can be
|
||||
created in a separate RADOS namespace by specifying --namespace-isolated option. By
|
||||
default a subvolume is created within the default subvolume group, and with an octal file
|
||||
mode '755', uid of its subvolume group, gid of its subvolume group, data pool layout of
|
||||
its parent directory and no size limit.
|
||||
When creating a subvolume you can specify its subvolume group, data pool
|
||||
layout, uid, gid, file mode in octal numerals, and size in bytes. The size of
|
||||
the subvolume is specified by setting a quota on it (see :doc:`/cephfs/quota`).
|
||||
The subvolume can be created in a separate RADOS namespace by specifying the
``--namespace-isolated`` option. By default a subvolume is created within the
|
||||
default subvolume group, and with an octal file mode '755', uid of its
|
||||
subvolume group, gid of its subvolume group, data pool layout of its parent
|
||||
directory and no size limit.
|
||||
|
||||
Remove a subvolume using::
|
||||
Remove a subvolume using:
|
||||
|
||||
$ ceph fs subvolume rm <vol_name> <subvol_name> [--group_name <subvol_group_name>] [--force] [--retain-snapshots]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume rm <vol_name> <subvol_name> [--group_name <subvol_group_name>] [--force] [--retain-snapshots]
|
||||
|
||||
The command removes the subvolume and its contents. It does this in two steps.
|
||||
First, it moves the subvolume to a trash folder, and then asynchronously purges
|
||||
@ -262,44 +296,62 @@ A subvolume can be removed retaining existing snapshots of the subvolume using t
|
||||
'--retain-snapshots' option. If snapshots are retained, the subvolume is considered
|
||||
empty for all operations not involving the retained snapshots.
|
||||
|
||||
.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs subvolume create'
|
||||
.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs
|
||||
subvolume create'
|
||||
|
||||
.. note:: Retained snapshots can be used as a clone source to recreate the subvolume, or clone to a newer subvolume.
|
||||
.. note:: Retained snapshots can be used as a clone source to recreate the
|
||||
subvolume, or clone to a newer subvolume.
|
||||
|
||||
Resize a subvolume using::
|
||||
Resize a subvolume using:
|
||||
|
||||
$ ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
|
||||
.. prompt:: bash $
|
||||
|
||||
The command resizes the subvolume quota using the size specified by ``new_size``.
|
||||
The `--no_shrink`` flag prevents the subvolume from shrinking below the current used size of the subvolume.
|
||||
ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
|
||||
|
||||
The subvolume can be resized to an unlimited (but sparse) logical size by passing ``inf`` or ``infinite`` as `` new_size``.
|
||||
The command resizes the subvolume quota using the size specified by
|
||||
``new_size``. The ``--no_shrink`` flag prevents the subvolume from shrinking
|
||||
below the current used size of the subvolume.
|
||||
|
||||
Authorize cephx auth IDs, the read/read-write access to fs subvolumes::
|
||||
The subvolume can be resized to an unlimited (but sparse) logical size by
|
||||
passing ``inf`` or ``infinite`` as ``new_size``.
|
||||
|
||||
$ ceph fs subvolume authorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>] [--access_level=<access_level>]
|
||||
Authorize cephx auth IDs with read or read-write access to fs subvolumes:
|
||||
|
||||
The 'access_level' takes 'r' or 'rw' as value.
|
||||
.. prompt:: bash $
|
||||
|
||||
Deauthorize cephx auth IDs, the read/read-write access to fs subvolumes::
|
||||
ceph fs subvolume authorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>] [--access_level=<access_level>]
|
||||
|
||||
$ ceph fs subvolume deauthorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||
The ``access_level`` takes ``r`` or ``rw`` as value.
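
For example, to grant a hypothetical auth ID ``guest`` read-only access to
subvolume ``sub_a`` of volume ``vol_a``:

.. prompt:: bash $

   ceph fs subvolume authorize vol_a sub_a guest --access_level=r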
|
||||
|
||||
List cephx auth IDs authorized to access fs subvolume::
|
||||
Deauthorize cephx auth IDs, removing their read or read-write access to fs subvolumes:
|
||||
|
||||
$ ceph fs subvolume authorized_list <vol_name> <sub_name> [--group_name=<group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
Evict fs clients based on auth ID and subvolume mounted::
|
||||
ceph fs subvolume deauthorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||
|
||||
$ ceph fs subvolume evict <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||
List cephx auth IDs authorized to access fs subvolume:
|
||||
|
||||
Fetch the absolute path of a subvolume using::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs subvolume getpath <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
ceph fs subvolume authorized_list <vol_name> <sub_name> [--group_name=<group_name>]
|
||||
|
||||
Fetch the information of a subvolume using::
|
||||
Evict fs clients based on auth ID and subvolume mounted:
|
||||
|
||||
$ ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume evict <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||
|
||||
Fetch the absolute path of a subvolume using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume getpath <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Fetch the information of a subvolume using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
The output format is JSON and contains fields as follows.
|
||||
|
||||
@ -339,67 +391,93 @@ A subvolume's ``state`` is based on the current state of the subvolume and conta
|
||||
* ``complete``: subvolume is ready for all operations
|
||||
* ``snapshot-retained``: subvolume is removed but its snapshots are retained
|
||||
|
||||
List subvolumes using::
|
||||
List subvolumes using:
|
||||
|
||||
$ ceph fs subvolume ls <vol_name> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
.. note:: subvolumes that are removed but have snapshots retained, are also listed.
|
||||
ceph fs subvolume ls <vol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Check the presence of any subvolume using::
|
||||
.. note:: subvolumes that are removed but have snapshots retained, are also
|
||||
listed.
|
||||
|
||||
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
||||
Check the presence of any subvolume using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
These are the possible results of the ``exist`` command:
|
||||
|
||||
* ``subvolume exists``: if any subvolume of given group_name is present
|
||||
* ``no subvolume exists``: if no subvolume of given group_name is present
|
||||
|
||||
Set custom metadata on the subvolume as a key-value pair using::
|
||||
Set custom metadata on the subvolume as a key-value pair using:
|
||||
|
||||
$ ceph fs subvolume metadata set <vol_name> <subvol_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
.. note:: If the key_name already exists then the old value will get replaced by the new value.
|
||||
ceph fs subvolume metadata set <vol_name> <subvol_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||
|
||||
.. note:: key_name and value should be a string of ASCII characters (as specified in python's string.printable). key_name is case-insensitive and always stored in lower case.
|
||||
.. note:: If the key_name already exists then the old value will get replaced
|
||||
by the new value.
|
||||
|
||||
.. note:: Custom metadata on a subvolume is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot.
|
||||
.. note:: key_name and value should be a string of ASCII characters (as
|
||||
specified in python's string.printable). key_name is case-insensitive and
|
||||
always stored in lower case.
|
||||
|
||||
Get custom metadata set on the subvolume using the metadata key::
|
||||
.. note:: Custom metadata on a subvolume is not preserved when snapshotting the
|
||||
subvolume, and hence, is also not preserved when cloning the subvolume
|
||||
snapshot.
|
||||
|
||||
$ ceph fs subvolume metadata get <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>]
|
||||
Get custom metadata set on the subvolume using the metadata key:
|
||||
|
||||
List custom metadata (key-value pairs) set on the subvolume using::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs subvolume metadata ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
ceph fs subvolume metadata get <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Remove custom metadata set on the subvolume using the metadata key::
|
||||
List custom metadata (key-value pairs) set on the subvolume using:
|
||||
|
||||
$ ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume metadata ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Remove custom metadata set on the subvolume using the metadata key:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the ``--force`` flag allows the command to succeed even when the
metadata key does not exist.
|
||||
|
||||
Create a snapshot of a subvolume using::
|
||||
Create a snapshot of a subvolume using:
|
||||
|
||||
$ ceph fs subvolume snapshot create <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot create <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Remove a snapshot of a subvolume using::
|
||||
Remove a snapshot of a subvolume using:
|
||||
|
||||
$ ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the ``--force`` flag allows the command to succeed even when the
snapshot does not exist.
|
||||
|
||||
.. note:: if the last snapshot within a snapshot retained subvolume is removed, the subvolume is also removed
|
||||
.. note:: if the last snapshot within a snapshot retained subvolume is removed,
|
||||
the subvolume is also removed
|
||||
|
||||
List snapshots of a subvolume using::
|
||||
List snapshots of a subvolume using:
|
||||
|
||||
$ ceph fs subvolume snapshot ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
Fetch the information of a snapshot using::
|
||||
ceph fs subvolume snapshot ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
$ ceph fs subvolume snapshot info <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
Fetch the information of a snapshot using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot info <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
|
||||
The output format is JSON and contains fields as follows.
|
||||
|
||||
@ -440,27 +518,40 @@ Sample output when no snapshot clone is in progress or pending::
|
||||
"has_pending_clones": "no"
|
||||
}
|
||||
|
||||
Set custom key-value metadata on the snapshot by running::
|
||||
Set custom key-value metadata on the snapshot by running:
|
||||
|
||||
$ ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
.. note:: If the key_name already exists then the old value will get replaced by the new value.
|
||||
ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||
|
||||
.. note:: The key_name and value should be a strings of ASCII characters (as specified in Python's ``string.printable``). The key_name is case-insensitive and always stored in lowercase.
|
||||
.. note:: If the key_name already exists then the old value will get replaced
|
||||
by the new value.
|
||||
|
||||
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot.
|
||||
.. note:: The key_name and value should be strings of ASCII characters (as
   specified in Python's ``string.printable``). The key_name is
   case-insensitive and always stored in lowercase.
|
||||
|
||||
Get custom metadata set on the snapshot using the metadata key::
|
||||
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the
|
||||
subvolume, and hence is also not preserved when cloning the subvolume
|
||||
snapshot.
|
||||
|
||||
$ ceph fs subvolume snapshot metadata get <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>]
|
||||
Get custom metadata set on the snapshot using the metadata key:
|
||||
|
||||
List custom metadata (key-value pairs) set on the snapshot using::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs subvolume snapshot metadata ls <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
ceph fs subvolume snapshot metadata get <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Remove custom metadata set on the snapshot using the metadata key::
|
||||
List custom metadata (key-value pairs) set on the snapshot using:
|
||||
|
||||
$ ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot metadata ls <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Remove custom metadata set on the snapshot using the metadata key:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the ``--force`` flag allows the command to succeed even when it would
otherwise fail because the metadata key does not exist.
|
||||
@ -468,47 +559,73 @@ fail if the metadata key did not exist.
|
||||
Cloning Snapshots
|
||||
-----------------
|
||||
|
||||
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation that copies
|
||||
data from a snapshot to a subvolume. Due to this bulk copying, cloning is inefficient for very large
|
||||
data sets.
|
||||
Subvolumes can be created by cloning subvolume snapshots. Cloning is an
|
||||
asynchronous operation that copies data from a snapshot to a subvolume. Due to
|
||||
this bulk copying, cloning is inefficient for very large data sets.
|
||||
|
||||
.. note:: Removing a snapshot (source subvolume) would fail if there are pending or in progress clone operations.
|
||||
.. note:: Removing a snapshot (source subvolume) would fail if there are
|
||||
pending or in progress clone operations.
|
||||
|
||||
Protecting snapshots prior to cloning was a prerequisite in the Nautilus release, and the commands to protect/unprotect
|
||||
snapshots were introduced for this purpose. This prerequisite, and hence the commands to protect/unprotect, is being
|
||||
deprecated and may be removed from a future release.
|
||||
Protecting snapshots prior to cloning was a prerequisite in the Nautilus
|
||||
release, and the commands to protect/unprotect snapshots were introduced for
|
||||
this purpose. This prerequisite, and hence the commands to protect/unprotect,
|
||||
is being deprecated and may be removed from a future release.
|
||||
|
||||
The commands being deprecated are::
|
||||
$ ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
$ ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
The commands being deprecated are:
|
||||
|
||||
.. note:: Using the above commands will not result in an error, but they have no useful purpose.
|
||||
.. prompt:: bash #
|
||||
|
||||
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata regarding supported ``features`` to help decide if protect/unprotect of snapshots is required, based on the availability of the ``snapshot-autoprotect`` feature.
|
||||
ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
|
||||
To initiate a clone operation use::
|
||||
.. note:: Using the above commands will not result in an error, but they have
|
||||
no useful purpose.
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
|
||||
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata
|
||||
regarding supported ``features`` to help decide if protect/unprotect of
|
||||
snapshots is required, based on the availability of the
|
||||
``snapshot-autoprotect`` feature.
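For a quick check, the ``features`` list can be inspected directly. A minimal
sketch, assuming a volume ``cephfs`` and a subvolume ``subvol1`` (hypothetical
names); look for ``snapshot-autoprotect`` in the ``features`` array of the
output:

.. prompt:: bash $

   ceph fs subvolume info cephfs subvol1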
|
||||
|
||||
If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified::
|
||||
To initiate a clone operation use:
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
|
||||
.. prompt:: bash $
|
||||
|
||||
Cloned subvolumes can be a part of a different group than the source snapshot (by default, cloned subvolumes are created in default group). To clone to a particular group use::
|
||||
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
|
||||
|
||||
If a snapshot (source subvolume) is a part of a non-default group, the group name
needs to be specified:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
|
||||
|
||||
Cloned subvolumes can be a part of a different group than the source snapshot
|
||||
(by default, cloned subvolumes are created in the default group). To clone to a
|
||||
particular group use:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --target_group_name <subvol_group_name>
|
||||
|
||||
Similar to specifying a pool layout when creating a subvolume, pool layout can be specified when creating a cloned subvolume. To create a cloned subvolume with a specific pool layout use::
|
||||
Similar to specifying a pool layout when creating a subvolume, pool layout can
|
||||
be specified when creating a cloned subvolume. To create a cloned subvolume
|
||||
with a specific pool layout use:
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
|
||||
.. prompt:: bash $
|
||||
|
||||
Configure the maximum number of concurrent clones. The default is 4::
|
||||
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
|
||||
|
||||
$ ceph config set mgr mgr/volumes/max_concurrent_clones <value>
|
||||
Configure the maximum number of concurrent clones. The default is 4:
|
||||
|
||||
To check the status of a clone operation use::
|
||||
.. prompt:: bash $
|
||||
|
||||
$ ceph fs clone status <vol_name> <clone_name> [--group_name <group_name>]
|
||||
ceph config set mgr mgr/volumes/max_concurrent_clones <value>
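For example, to raise the limit to 8 and then confirm the change (the value 8
is only illustrative):

.. prompt:: bash $

   ceph config set mgr mgr/volumes/max_concurrent_clones 8
   ceph config get mgr mgr/volumes/max_concurrent_clones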
|
||||
|
||||
To check the status of a clone operation use:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs clone status <vol_name> <clone_name> [--group_name <group_name>]
|
||||
|
||||
A clone can be in one of the following states:
|
||||
|
||||
@ -538,7 +655,8 @@ Here is an example of an ``in-progress`` clone::
|
||||
}
|
||||
}
|
||||
|
||||
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
|
||||
.. note:: The ``failure`` section will be shown only if the clone's state is
|
||||
``failed`` or ``cancelled``
|
||||
|
||||
Here is an example of a ``failed`` clone::
|
||||
|
||||
@ -560,9 +678,11 @@ Here is an example of a ``failed`` clone::
|
||||
}
|
||||
}
|
||||
|
||||
(NOTE: since ``subvol1`` is in the default group, the ``source`` object's ``clone status`` does not include the group name)
|
||||
(NOTE: since ``subvol1`` is in the default group, the ``source`` object's
|
||||
``clone status`` does not include the group name)
|
||||
|
||||
.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed.
|
||||
.. note:: Cloned subvolumes are accessible only after the clone operation has
|
||||
successfully completed.
|
||||
|
||||
After a successful clone operation, ``clone status`` will look like the below::
|
||||
|
||||
@ -576,37 +696,47 @@ After a successful clone operation, ``clone status`` will look like the below::
|
||||
If a clone operation is unsuccessful, the ``state`` value will be ``failed``.
|
||||
|
||||
To retry a failed clone operation, the incomplete clone must be deleted and the
|
||||
clone operation must be issued again. To delete a partial clone use::
|
||||
clone operation must be issued again. To delete a partial clone use:
|
||||
|
||||
$ ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
|
||||
|
||||
.. note:: Cloning synchronizes only directories, regular files and symbolic
|
||||
links. Inode timestamps (access and modification times) are synchronized up
|
||||
to seconds granularity.
|
||||
|
||||
An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel
|
||||
a clone operation use the ``clone cancel`` command::
|
||||
a clone operation use the ``clone cancel`` command:
|
||||
|
||||
$ ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
|
||||
.. prompt:: bash $
|
||||
|
||||
On successful cancellation, the cloned subvolume is moved to the ``canceled``
|
||||
state::
|
||||
ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
|
||||
|
||||
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||
$ ceph fs clone cancel cephfs clone1
|
||||
$ ceph fs clone status cephfs clone1
|
||||
{
|
||||
"status": {
|
||||
"state": "canceled",
|
||||
"source": {
|
||||
"volume": "cephfs",
|
||||
"subvolume": "subvol1",
|
||||
"snapshot": "snap1"
|
||||
}
|
||||
On successful cancellation, the cloned subvolume is moved to the ``canceled`` state:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||
ceph fs clone cancel cephfs clone1
|
||||
ceph fs clone status cephfs clone1
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"status": {
|
||||
"state": "canceled",
|
||||
"source": {
|
||||
"volume": "cephfs",
|
||||
"subvolume": "subvol1",
|
||||
"snapshot": "snap1"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
.. note:: The canceled cloned may be deleted by supplying the ``--force`` option to the `fs subvolume rm` command.
|
||||
.. note:: The canceled clone may be deleted by supplying the ``--force``
   option to the ``fs subvolume rm`` command.
|
||||
|
||||
|
||||
.. _subvol-pinning:
|
||||
@ -614,28 +744,33 @@ state::
|
||||
Pinning Subvolumes and Subvolume Groups
|
||||
---------------------------------------
|
||||
|
||||
|
||||
Subvolumes and subvolume groups may be automatically pinned to ranks according
|
||||
to policies. This can distribute load across MDS ranks in predictable and
|
||||
stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning`
|
||||
for details on how pinning works.
|
||||
|
||||
Pinning is configured by::
|
||||
Pinning is configured by:
|
||||
|
||||
$ ceph fs subvolumegroup pin <vol_name> <group_name> <pin_type> <pin_setting>
|
||||
.. prompt:: bash $
|
||||
|
||||
or for subvolumes::
|
||||
ceph fs subvolumegroup pin <vol_name> <group_name> <pin_type> <pin_setting>
|
||||
|
||||
$ ceph fs subvolume pin <vol_name> <group_name> <pin_type> <pin_setting>
|
||||
or for subvolumes:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolume pin <vol_name> <group_name> <pin_type> <pin_setting>
|
||||
|
||||
Typically you will want to set subvolume group pins. The ``pin_type`` may be
|
||||
one of ``export``, ``distributed``, or ``random``. The ``pin_setting``
|
||||
corresponds to the extended attributed "value" as in the pinning documentation
|
||||
referenced above.
|
||||
|
||||
So, for example, setting a distributed pinning strategy on a subvolume group::
|
||||
So, for example, setting a distributed pinning strategy on a subvolume group:
|
||||
|
||||
$ ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1
|
||||
|
||||
This will enable the distributed subtree partitioning policy for the "csi" subvolume
group. This will cause every subvolume within the group to be automatically
|
||||
|
@ -123,7 +123,9 @@ other daemons, please see :ref:`health-checks`.
|
||||
from properly cleaning up resources used by client requests. This message
|
||||
appears if a client appears to have more than ``max_completed_requests``
|
||||
(default 100000) requests that are complete on the MDS side but haven't
|
||||
yet been accounted for in the client's *oldest tid* value.
|
||||
yet been accounted for in the client's *oldest tid* value. As a debug aid, the
last tid used by the MDS to trim completed client requests (or flush) is
included in the output of the `session ls` (or `client ls`) command.
|
||||
* ``MDS_DAMAGE``
|
||||
|
||||
Message
|
||||
@ -168,3 +170,15 @@ other daemons, please see :ref:`health-checks`.
|
||||
the actual cache size (in memory) is at least 50% greater than
|
||||
``mds_cache_memory_limit`` (default 1GB). Modify ``mds_health_cache_threshold``
|
||||
to set the warning ratio.
|
||||
|
||||
* ``MDS_CLIENTS_LAGGY``
|
||||
|
||||
Message
|
||||
"Client *ID* is laggy; not evicted because some OSD(s) is/are laggy"
|
||||
|
||||
Description
|
||||
If an OSD is laggy (due to conditions such as a network cut-off), clients may
become laggy as well (sessions may go idle or be unable to flush dirty data
for cap revokes). If ``defer_client_eviction_on_laggy_osds`` is set to true
(the default), such clients are not evicted and this health warning is
generated instead.
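If you would rather have such clients evicted even while OSDs are laggy, the
deferral behaviour can presumably be turned off by changing the option named
above; a sketch, assuming the option is settable at runtime like other MDS
options:

.. prompt:: bash $

   ceph config set mds defer_client_eviction_on_laggy_osds false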
|
||||
|
@ -501,6 +501,25 @@
|
||||
:Type: 32-bit Integer
|
||||
:Default: ``0``
|
||||
|
||||
``mds_inject_skip_replaying_inotable``
|
||||
|
||||
:Description: Ceph will skip replaying the inotable when replaying the journal,
              and the primary MDS will crash, while the replacing MDS won't.
              (For developers only.)
|
||||
|
||||
:Type: Boolean
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
``mds_kill_skip_replaying_inotable``
|
||||
|
||||
:Description: Ceph will skip replaying the inotable when replaying the journal,
              and the primary MDS will crash, while the replacing MDS won't.
              (For developers only.)
|
||||
|
||||
:Type: Boolean
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
``mds_wipe_sessions``
|
||||
|
||||
|
@ -53,7 +53,8 @@ If you have more than one FS on your Ceph cluster, use the option
|
||||
|
||||
ceph-fuse --id foo --client_fs mycephfs2 /mnt/mycephfs2
|
||||
|
||||
You may also add a ``client_fs`` setting to your ``ceph.conf``
|
||||
You may also add a ``client_fs`` setting to your ``ceph.conf``. Alternatively, the option
|
||||
``--client_mds_namespace`` is supported for backward compatibility.
|
||||
|
||||
Unmounting CephFS
|
||||
=================
|
||||
|
@ -96,6 +96,28 @@ non-default FS as follows::
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs2 -o name=fs,fs=mycephfs2
|
||||
|
||||
Backward Compatibility
|
||||
======================
|
||||
The old syntax is supported for backward compatibility.
|
||||
|
||||
To mount CephFS with the kernel driver::
|
||||
|
||||
mkdir /mnt/mycephfs
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin
|
||||
|
||||
The key-value argument right after the option ``-o`` is the CephX credential;
``name`` is the username of the CephX user we are using to mount CephFS.
|
||||
|
||||
To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs::
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
|
||||
|
||||
or
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
|
||||
|
||||
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting.
|
||||
|
||||
Unmounting CephFS
|
||||
=================
|
||||
To unmount the Ceph file system, use the ``umount`` command as usual::
|
||||
|
@ -60,6 +60,18 @@ added as comments in the sample conf. There are options to do the following:
|
||||
- enable read delegations (need at least v13.0.1 'libcephfs2' package
|
||||
and v2.6.0 stable 'nfs-ganesha' and 'nfs-ganesha-ceph' packages)
|
||||
|
||||
.. important::
|
||||
|
||||
Under certain conditions, NFS access using the CephFS FSAL fails. This
|
||||
causes an error to be thrown that reads "Input/output error". Under these
|
||||
circumstances, the application metadata must be set for the CephFS metadata
|
||||
and CephFS data pools. Do this by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool application set <cephfs_metadata_pool> cephfs <cephfs_data_pool> cephfs
|
||||
|
||||
|
||||
Configuration for libcephfs clients
|
||||
-----------------------------------
|
||||
|
||||
|
@ -143,3 +143,14 @@ The types of damage that can be reported and repaired by File System Scrub are:
|
||||
|
||||
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
||||
|
||||
Evaluate strays using recursive scrub
|
||||
=====================================
|
||||
|
||||
- In order to evaluate strays, i.e. to purge stray directories in ``~mdsdir``, use the following command::
|
||||
|
||||
ceph tell mds.<fsname>:0 scrub start ~mdsdir recursive
|
||||
|
||||
- ``~mdsdir`` is not enqueued by default when scrubbing at the CephFS root. In order to perform stray evaluation
|
||||
at root, run scrub with flags ``scrub_mdsdir`` and ``recursive``::
|
||||
|
||||
ceph tell mds.<fsname>:0 scrub start / recursive,scrub_mdsdir
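Either way, the progress of the scrub can then be followed with the scrub
status command (shown here for rank 0 of the same file system)::

    ceph tell mds.<fsname>:0 scrub status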
|
||||
|
@ -142,6 +142,19 @@ Examples::
|
||||
ceph fs snap-schedule retention add / 24h4w # add 24 hourly and 4 weekly to retention
|
||||
ceph fs snap-schedule retention remove / 7d4w # remove 7 daily and 4 weekly, leaves 24 hourly
|
||||
|
||||
.. note:: When adding a path to snap-schedule, remember to strip off the mount
   point path prefix. Paths given to snap-schedule should start at the
   appropriate CephFS file system root, not at the host file system root.
   For example, if the Ceph File System is mounted at ``/mnt`` and the path
   under which snapshots need to be taken is ``/mnt/some/path``, then the
   actual path required by snap-schedule is only ``/some/path``.
|
||||
|
||||
.. note:: The "created" field in the snap-schedule status command output is the
   timestamp at which the schedule was created. The "created" timestamp has
   nothing to do with the creation of actual snapshots. Actual snapshot
   creation is accounted for in the "created_count" field, which is a
   cumulative count of the total number of snapshots created so far.
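Both fields can be inspected at any time with the status command; a sketch,
assuming a schedule already exists on ``/some/path`` (a hypothetical path)::

    ceph fs snap-schedule status /some/path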
|
||||
|
||||
Active and inactive schedules
|
||||
-----------------------------
|
||||
Snapshot schedules can be added for a path that doesn't exist yet in the
|
||||
|
@ -188,6 +188,98 @@ You can enable dynamic debug against the CephFS module.
|
||||
|
||||
Please see: https://github.com/ceph/ceph/blob/master/src/script/kcon_all.sh
|
||||
|
||||
In-memory Log Dump
|
||||
==================
|
||||
|
||||
In-memory logs can be dumped by setting ``mds_extraordinary_events_dump_interval``
|
||||
during a lower level debugging (log level < 10). ``mds_extraordinary_events_dump_interval``
|
||||
is the interval in seconds for dumping the recent in-memory logs when there is an Extra-Ordinary event.
|
||||
|
||||
The Extra-Ordinary events are classified as:
|
||||
|
||||
* Client Eviction
|
||||
* Missed Beacon ACK from the monitors
|
||||
* Missed Internal Heartbeats
|
||||
|
||||
In-memory Log Dump is disabled by default to prevent log file bloat in a production environment.
The following commands, run in order, enable it::
|
||||
|
||||
$ ceph config set mds debug_mds <log_level>/<gather_level>
|
||||
$ ceph config set mds mds_extraordinary_events_dump_interval <seconds>
|
||||
|
||||
The ``log_level`` should be < 10 and ``gather_level`` should be >= 10 to enable in-memory log dump.
|
||||
When it is enabled, the MDS checks for the extra-ordinary events every
|
||||
``mds_extraordinary_events_dump_interval`` seconds and if any of them occurs, MDS dumps the
|
||||
in-memory logs containing the relevant event details in ceph-mds log.
|
||||
|
||||
.. note:: At higher log levels (log_level >= 10) there is no reason to dump the
   in-memory logs, and a lower gather level (gather_level < 10) is insufficient
   to gather them. Thus a log level >= 10 or a gather level < 10 in ``debug_mds``
   prevents enabling the In-memory Log Dump. In such cases, if enabling fails,
   reset ``mds_extraordinary_events_dump_interval`` to 0 before enabling it
   again with the above commands.
|
||||
|
||||
The In-memory Log Dump can be disabled using::
|
||||
|
||||
$ ceph config set mds mds_extraordinary_events_dump_interval 0
|
||||
|
||||
Filesystems Become Inaccessible After an Upgrade
|
||||
================================================
|
||||
|
||||
.. note::
|
||||
You can avoid ``operation not permitted`` errors by running this procedure
|
||||
before an upgrade. As of May 2023, it seems that ``operation not permitted``
|
||||
errors of the kind discussed here occur after upgrades after Nautilus
|
||||
(inclusive).
|
||||
|
||||
IF
|
||||
|
||||
you have CephFS file systems that have data and metadata pools that were
|
||||
created by a ``ceph fs new`` command (meaning that they were not created
|
||||
with the defaults)
|
||||
|
||||
OR
|
||||
|
||||
you have an existing CephFS file system and are upgrading to a new post-Nautilus
|
||||
major version of Ceph
|
||||
|
||||
THEN
|
||||
|
||||
in order for the documented ``ceph fs authorize...`` commands to function as
|
||||
documented (and to avoid 'operation not permitted' errors when doing file I/O
|
||||
or similar security-related problems for all users except the ``client.admin``
|
||||
user), you must first run:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool application set <your metadata pool name> cephfs metadata <your ceph fs filesystem name>
|
||||
|
||||
and
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool application set <your data pool name> cephfs data <your ceph fs filesystem name>
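For example, for a file system named ``cephfs_a`` whose pools were created
manually as ``cephfs_a_meta`` and ``cephfs_a_data`` (all three names are
hypothetical), the two commands would look like this:

.. prompt:: bash $

   ceph osd pool application set cephfs_a_meta cephfs metadata cephfs_a
   ceph osd pool application set cephfs_a_data cephfs data cephfs_a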
|
||||
|
||||
Otherwise, when the OSDs receive a request to read or write data (not the
|
||||
directory info, but file data) they will not know which Ceph file system name
|
||||
to look up. This is true also of pool names, because the 'defaults' themselves
|
||||
changed in the major releases, from::
|
||||
|
||||
data pool=fsname
|
||||
metadata pool=fsname_metadata
|
||||
|
||||
to::
|
||||
|
||||
data pool=fsname.data and
|
||||
metadata pool=fsname.meta
|
||||
|
||||
Any setup that used ``client.admin`` for all mounts did not run into this
|
||||
problem, because the admin key gave blanket permissions.
|
||||
|
||||
A temporary fix involves changing mount requests to the 'client.admin' user and
its associated key. A less drastic but only partial fix is to change the osd cap
for your user to just ``caps osd = "allow rw"`` and to delete ``tag cephfs
data=....``
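A sketch of what such a cap change might look like for a hypothetical user
``client.foo`` (note that ``ceph auth caps`` replaces all of the entity's caps
at once, so the mon and mds caps shown here are placeholders for whatever the
user already has):

.. prompt:: bash $

   ceph auth caps client.foo mon 'allow r' mds 'allow rw' osd 'allow rw'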
|
||||
|
||||
Reporting Issues
|
||||
================
|
||||
|
||||
|
@ -87,7 +87,8 @@ Optionals are represented as a presence byte, followed by the item if it exists.
|
||||
T element[present? 1 : 0]; // Only if present is non-zero.
|
||||
}
|
||||
|
||||
Optionals are used to encode ``boost::optional``.
|
||||
Optionals are used to encode ``boost::optional`` and, since introducing
|
||||
C++17 to Ceph, ``std::optional``.
|
||||
|
||||
Pair
|
||||
----
|
||||
|
@ -5,7 +5,7 @@ jerasure plugin
|
||||
Introduction
|
||||
------------
|
||||
|
||||
The parameters interpreted by the jerasure plugin are:
|
||||
The parameters interpreted by the ``jerasure`` plugin are:
|
||||
|
||||
::
|
||||
|
||||
@ -31,3 +31,5 @@ upstream repositories `http://jerasure.org/jerasure/jerasure
|
||||
`http://jerasure.org/jerasure/gf-complete
|
||||
<http://jerasure.org/jerasure/gf-complete>`_ . The difference
|
||||
between the two, if any, should match pull requests against upstream.
|
||||
Note that as of 2023, the ``jerasure.org`` web site may no longer be
|
||||
legitimate and/or associated with the original project.
|
||||
|
93
ceph/doc/dev/osd_internals/past_intervals.rst
Normal file
@ -0,0 +1,93 @@
|
||||
=============
|
||||
PastIntervals
|
||||
=============
|
||||
|
||||
Purpose
|
||||
-------
|
||||
|
||||
There are two situations where we need to consider the set of all acting-set
|
||||
OSDs for a PG back to some epoch ``e``:
|
||||
|
||||
* During peering, we need to consider the acting set for every epoch back to
|
||||
``last_epoch_started``, the last epoch in which the PG completed peering and
|
||||
became active.
|
||||
(see :doc:`/dev/osd_internals/last_epoch_started` for a detailed explanation)
|
||||
* During recovery, we need to consider the acting set for every epoch back to
|
||||
``last_epoch_clean``, the last epoch at which all of the OSDs in the acting
|
||||
set were fully recovered, and the acting set was full.
|
||||
|
||||
For either of these purposes, we could build such a set by iterating backwards
|
||||
from the current OSDMap to the relevant epoch. Instead, we maintain a structure
|
||||
PastIntervals for each PG.
|
||||
|
||||
An ``interval`` is a contiguous sequence of OSDMap epochs where the PG mapping
|
||||
didn't change. This includes changes to the acting set, the up set, the
|
||||
primary, and several other parameters fully spelled out in
|
||||
PastIntervals::check_new_interval.
|
||||
|
||||
Maintenance and Trimming
|
||||
------------------------
|
||||
|
||||
The PastIntervals structure stores a record for each ``interval`` back to
|
||||
last_epoch_clean. On each new ``interval`` (See AdvMap reactions,
|
||||
PeeringState::should_restart_peering, and PeeringState::start_peering_interval)
|
||||
each OSD with the PG will add the new ``interval`` to its local PastIntervals.
|
||||
Activation messages to OSDs which do not already have the PG contain the
|
||||
sender's PastIntervals so that the recipient needn't rebuild it. (See
|
||||
PeeringState::activate needs_past_intervals).
|
||||
|
||||
PastIntervals are trimmed in two places. First, when the primary marks the
|
||||
PG clean, it clears its past_intervals instance
|
||||
(PeeringState::try_mark_clean()). The replicas will do the same thing when
|
||||
they receive the info (See PeeringState::update_history).
|
||||
|
||||
The second, more complex, case is in PeeringState::start_peering_interval. In
|
||||
the event of a "map gap", we assume that the PG actually has gone clean, but we
|
||||
haven't received a pg_info_t with the updated ``last_epoch_clean`` value yet.
|
||||
To explain this behavior, we need to discuss OSDMap trimming.
|
||||
|
||||
OSDMap Trimming
|
||||
---------------
|
||||
|
||||
OSDMaps are created by the Monitor quorum and gossiped out to the OSDs. The
|
||||
Monitor cluster also determines when OSDs (and the Monitors) are allowed to
|
||||
trim old OSDMap epochs. For the reasons explained above in this document, the
|
||||
primary constraint is that we must retain all OSDMaps back to some epoch such
|
||||
that all PGs have been clean at that or a later epoch (min_last_epoch_clean).
|
||||
(See OSDMonitor::get_trim_to).
|
||||
|
||||
The Monitor quorum determines min_last_epoch_clean through MOSDBeacon messages
|
||||
sent periodically by each OSD. Each message contains the set of PGs for which
the OSD is primary at that moment, as well as the min_last_epoch_clean across
|
||||
that set. The Monitors track these values in OSDMonitor::last_epoch_clean.
|
||||
|
||||
There is a subtlety in the min_last_epoch_clean value used by the OSD to
|
||||
populate the MOSDBeacon. OSD::collect_pg_stats invokes PG::with_pg_stats to
|
||||
obtain the lec value, which actually uses
|
||||
pg_stat_t::get_effective_last_epoch_clean() rather than
|
||||
info.history.last_epoch_clean. If the PG is currently clean,
|
||||
pg_stat_t::get_effective_last_epoch_clean() is the current epoch rather than
|
||||
last_epoch_clean -- this works because the PG is clean at that epoch and it
|
||||
allows OSDMaps to be trimmed during periods where OSDMaps are being created
|
||||
(due to snapshot activity, perhaps), but no PGs are undergoing ``interval``
|
||||
changes.
|
||||
|
||||
Back to PastIntervals
|
||||
---------------------
|
||||
|
||||
We can now understand our second trimming case above. If OSDMaps have been
|
||||
trimmed up to epoch ``e``, we know that the PG must have been clean at some epoch
|
||||
>= ``e`` (indeed, **all** PGs must have been), so we can drop our PastIntervals.
|
||||
|
||||
This dependency also pops up in PeeringState::check_past_interval_bounds().
|
||||
PeeringState::get_required_past_interval_bounds takes as a parameter
|
||||
oldest_epoch, which comes from OSDSuperblock::cluster_osdmap_trim_lower_bound.
|
||||
We use cluster_osdmap_trim_lower_bound rather than a specific OSD's oldest_map
because an OSD does not necessarily trim all the way up to
MOSDMap::cluster_osdmap_trim_lower_bound.
|
||||
In order to avoid doing too much work at once we limit the amount of osdmaps
|
||||
trimmed using ``osd_target_transaction_size`` in OSD::trim_maps().
|
||||
For this reason, a specific OSD's oldest_map can lag behind
|
||||
OSDSuperblock::cluster_osdmap_trim_lower_bound
|
||||
for a while.
|
||||
|
||||
See https://tracker.ceph.com/issues/49689 for an example.
|
@ -12,12 +12,13 @@
|
||||
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
|
||||
OSD BlueStore is a storage back end used by OSD daemons, and
|
||||
was designed specifically for use with Ceph. BlueStore was
|
||||
introduced in the Ceph Kraken release. In the Ceph Luminous
|
||||
release, BlueStore became Ceph's default storage back end,
|
||||
supplanting FileStore. Unlike :term:`filestore`, BlueStore
|
||||
stores objects directly on Ceph block devices without any file
|
||||
system interface. Since Luminous (12.2), BlueStore has been
|
||||
Ceph's default and recommended storage back end.
|
||||
introduced in the Ceph Kraken release. The Luminous release of
|
||||
Ceph promoted BlueStore to the default OSD back end,
|
||||
supplanting FileStore. As of the Reef release, FileStore is no
|
||||
longer available as a storage backend.
|
||||
|
||||
BlueStore stores objects directly on Ceph block devices without
|
||||
a mounted file system.
|
||||
|
||||
Bucket
|
||||
In the context of :term:`RGW`, a bucket is a group of objects.
|
||||
|
@ -11,6 +11,12 @@ Ceph delivers **object, block, and file storage in one unified system**.
|
||||
Ceph project. (Click anywhere in this paragraph to read the "Basic
|
||||
Workflow" page of the Ceph Developer Guide.) <basic workflow dev guide>`.
|
||||
|
||||
.. note::
|
||||
|
||||
:ref:`If you want to make a commit to the documentation but you don't
|
||||
know how to get started, read the "Documenting Ceph" page. (Click anywhere
|
||||
in this paragraph to read the "Documenting Ceph" page.) <documenting_ceph>`.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
|
||||
|
@ -36,6 +36,22 @@ Options
|
||||
|
||||
Perform a selftest. This mode performs a sanity check of the ``stats`` module.
|
||||
|
||||
.. option:: --conffile [CONFFILE]
|
||||
|
||||
Path to cluster configuration file
|
||||
|
||||
.. option:: -d [DELAY], --delay [DELAY]
|
||||
|
||||
Refresh interval in seconds (default: 1)
|
||||
|
||||
.. option:: --dump
|
||||
|
||||
Dump the metrics to stdout
|
||||
|
||||
.. option:: --dumpfs <fs_name>
|
||||
|
||||
Dump the metrics of the given filesystem to stdout
|
||||
|
||||
Descriptions of fields
|
||||
======================
|
||||
|
||||
|
@ -110,6 +110,12 @@ Basic
|
||||
them. If an inode contains any stale file locks, read/write on the inode
|
||||
is not allowed until applications release all stale file locks.
|
||||
|
||||
:command:`fs=<fs-name>`
    Specify the non-default file system to be mounted, when using the old syntax.

:command:`mds_namespace=<fs-name>`
    A synonym of "fs=" (Deprecated).
|
||||
|
||||
Advanced
|
||||
--------
|
||||
:command:`cap_release_safety`
|
||||
@ -236,6 +242,10 @@ history::
|
||||
mount.ceph :/ /mnt/mycephfs -o name=fs_username,secretfile=/etc/ceph/fs_username.secret
|
||||
|
||||
|
||||
To mount using the old syntax::
|
||||
|
||||
mount -t ceph 192.168.0.1:/ /mnt/mycephfs
|
||||
|
||||
Availability
|
||||
============
|
||||
|
||||
|
@ -18,9 +18,11 @@ for all reporting entities are returned in text exposition format.
|
||||
Enabling prometheus output
|
||||
==========================
|
||||
|
||||
The *prometheus* module is enabled with::
|
||||
The *prometheus* module is enabled with:
|
||||
|
||||
ceph mgr module enable prometheus
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mgr module enable prometheus
|
||||
|
||||
Configuration
|
||||
-------------
|
||||
@ -36,10 +38,10 @@ configurable with ``ceph config set``, with keys
|
||||
is registered with Prometheus's `registry
|
||||
<https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
|
||||
|
||||
::
|
||||
|
||||
ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
|
||||
ceph config set mgr mgr/prometheus/server_port 9283
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
|
||||
ceph config set mgr mgr/prometheus/server_port 9283
|
||||
|
||||
.. warning::
|
||||
|
||||
@ -54,9 +56,11 @@ recommended to use 15 seconds as scrape interval, though, in some cases it
|
||||
might be useful to increase the scrape interval.
|
||||
|
||||
To set a different scrape interval in the Prometheus module, set
|
||||
``scrape_interval`` to the desired value::
|
||||
``scrape_interval`` to the desired value:
|
||||
|
||||
ceph config set mgr mgr/prometheus/scrape_interval 20
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/scrape_interval 20
|
||||
|
||||
On large clusters (>1000 OSDs), the time to fetch the metrics may become
|
||||
significant. Without the cache, the Prometheus manager module could, especially
|
||||
@ -75,35 +79,47 @@ This behavior can be configured. By default, it will return a 503 HTTP status
|
||||
code (service unavailable). You can set other options using the ``ceph config
|
||||
set`` commands.
|
||||
|
||||
To tell the module to respond with possibly stale data, set it to ``return``::
|
||||
To tell the module to respond with possibly stale data, set it to ``return``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/stale_cache_strategy return
|
||||
|
||||
To tell the module to respond with "service unavailable", set it to ``fail``::
|
||||
To tell the module to respond with "service unavailable", set it to ``fail``:
|
||||
|
||||
ceph config set mgr mgr/prometheus/stale_cache_strategy fail
|
||||
.. prompt:: bash $
|
||||
|
||||
If you are confident that you don't require the cache, you can disable it::
|
||||
ceph config set mgr mgr/prometheus/stale_cache_strategy fail
|
||||
|
||||
ceph config set mgr mgr/prometheus/cache false
|
||||
If you are confident that you don't require the cache, you can disable it:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/cache false
|
||||
|
||||
If you are using the prometheus module behind some kind of reverse proxy or
|
||||
loadbalancer, you can simplify discovering the active instance by switching
|
||||
to ``error``-mode::
|
||||
to ``error``-mode:
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_behaviour error
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_behaviour error
|
||||
|
||||
If set, the prometheus module will respond with an HTTP error when requesting ``/``
|
||||
from the standby instance. The default error code is 500, but you can configure
|
||||
the HTTP response code with::
|
||||
the HTTP response code with:
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_error_status_code 503
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_error_status_code 503
|
||||
|
||||
Valid error codes are between 400-599.
|
||||
|
||||
To switch back to the default behaviour, simply set the config key to ``default``::
|
||||
To switch back to the default behaviour, simply set the config key to ``default``:
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_behaviour default
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/standby_behaviour default
|
||||
|
||||
.. _prometheus-rbd-io-statistics:
|
||||
|
||||
@ -154,9 +170,17 @@ configuration parameter. The parameter is a comma or space separated list
|
||||
of ``pool[/namespace]`` entries. If the namespace is not specified the
|
||||
statistics are collected for all namespaces in the pool.
|
||||
|
||||
Example to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``::
|
||||
Example to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``:
|
||||
|
||||
ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN"
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN"
|
||||
|
||||
The wildcard can be used to indicate all pools or namespaces:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/rbd_stats_pools "*"
|
||||
|
||||
The module makes the list of all available images scanning the specified
|
||||
pools and namespaces and refreshes it periodically. The period is
|
||||
@ -165,9 +189,22 @@ parameter (in sec) and is 300 sec (5 minutes) by default. The module will
|
||||
force refresh earlier if it detects statistics from a previously unknown
|
||||
RBD image.
|
||||
|
||||
Example to turn up the sync interval to 10 minutes::
|
||||
Example to turn up the sync interval to 10 minutes:
|
||||
|
||||
ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600
|
||||
|
||||
Ceph daemon performance counters metrics
|
||||
-----------------------------------------
|
||||
|
||||
With the introduction of the ``ceph-exporter`` daemon, the prometheus module no longer exports Ceph daemon
perf counters as prometheus metrics by default. However, one may re-enable exporting these metrics by setting
|
||||
the module option ``exclude_perf_counters`` to ``false``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/prometheus/exclude_perf_counters false
|
||||
|
||||
Statistic names and labels
|
||||
==========================
|
||||
|
@ -153,3 +153,24 @@ completely optional, and disabled by default.::
|
||||
ceph config set mgr mgr/telemetry/description 'My first Ceph cluster'
|
||||
ceph config set mgr mgr/telemetry/channel_ident true
|
||||
|
||||
Leaderboard
|
||||
-----------
|
||||
|
||||
To participate in a leaderboard in the `public dashboards
|
||||
<https://telemetry-public.ceph.com/>`_, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/telemetry/leaderboard true
|
||||
|
||||
The leaderboard displays basic information about the cluster. This includes the
|
||||
total storage capacity and the number of OSDs. To add a description of the
|
||||
cluster, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/telemetry/leaderboard_description 'Ceph cluster for Computational Biology at the University of XYZ'
|
||||
|
||||
If the ``ident`` channel is enabled, its details will not be displayed in the
|
||||
leaderboard.
|
||||
|
||||
|
@ -1,84 +1,95 @@
|
||||
==========================
|
||||
BlueStore Config Reference
|
||||
==========================
|
||||
==================================
|
||||
BlueStore Configuration Reference
|
||||
==================================
|
||||
|
||||
Devices
|
||||
=======
|
||||
|
||||
BlueStore manages either one, two, or (in certain cases) three storage
|
||||
devices.
|
||||
BlueStore manages either one, two, or in certain cases three storage devices.
|
||||
These *devices* are "devices" in the Linux/Unix sense. This means that they are
|
||||
assets listed under ``/dev`` or ``/devices``. Each of these devices may be an
|
||||
entire storage drive, or a partition of a storage drive, or a logical volume.
|
||||
BlueStore does not create or mount a conventional file system on devices that
|
||||
it uses; BlueStore reads and writes to the devices directly in a "raw" fashion.
|
||||
|
||||
In the simplest case, BlueStore consumes a single (primary) storage device.
|
||||
The storage device is normally used as a whole, occupying the full device that
|
||||
is managed directly by BlueStore. This *primary device* is normally identified
|
||||
by a ``block`` symlink in the data directory.
|
||||
In the simplest case, BlueStore consumes all of a single storage device. This
|
||||
device is known as the *primary device*. The primary device is identified by
|
||||
the ``block`` symlink in the data directory.
|
||||
|
||||
The data directory is a ``tmpfs`` mount which gets populated (at boot time, or
|
||||
when ``ceph-volume`` activates it) with all the common OSD files that hold
|
||||
information about the OSD, like: its identifier, which cluster it belongs to,
|
||||
and its private keyring.
|
||||
The data directory is a ``tmpfs`` mount. When this data directory is booted or
|
||||
activated by ``ceph-volume``, it is populated with metadata files and links
|
||||
that hold information about the OSD: for example, the OSD's identifier, the
|
||||
name of the cluster that the OSD belongs to, and the OSD's private keyring.
|
||||
|
||||
It is also possible to deploy BlueStore across one or two additional devices:
|
||||
In more complicated cases, BlueStore is deployed across one or two additional
|
||||
devices:
|
||||
|
||||
* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data directory) can be
|
||||
used for BlueStore's internal journal or write-ahead log. It is only useful
|
||||
to use a WAL device if the device is faster than the primary device (e.g.,
|
||||
when it is on an SSD and the primary device is an HDD).
|
||||
* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data
|
||||
directory) can be used to separate out BlueStore's internal journal or
|
||||
write-ahead log. Using a WAL device is advantageous only if the WAL device
|
||||
is faster than the primary device (for example, if the WAL device is an SSD
|
||||
and the primary device is an HDD).
|
||||
* A *DB device* (identified as ``block.db`` in the data directory) can be used
|
||||
for storing BlueStore's internal metadata. BlueStore (or rather, the
|
||||
embedded RocksDB) will put as much metadata as it can on the DB device to
|
||||
improve performance. If the DB device fills up, metadata will spill back
|
||||
onto the primary device (where it would have been otherwise). Again, it is
|
||||
only helpful to provision a DB device if it is faster than the primary
|
||||
device.
|
||||
to store BlueStore's internal metadata. BlueStore (or more precisely, the
|
||||
embedded RocksDB) will put as much metadata as it can on the DB device in
|
||||
order to improve performance. If the DB device becomes full, metadata will
|
||||
spill back onto the primary device (where it would have been located in the
|
||||
absence of the DB device). Again, it is advantageous to provision a DB device
|
||||
only if it is faster than the primary device.
|
||||
|
||||
If there is only a small amount of fast storage available (e.g., less
|
||||
than a gigabyte), we recommend using it as a WAL device. If there is
|
||||
more, provisioning a DB device makes more sense. The BlueStore
|
||||
journal will always be placed on the fastest device available, so
|
||||
using a DB device will provide the same benefit that the WAL device
|
||||
would while *also* allowing additional metadata to be stored there (if
|
||||
it will fit). This means that if a DB device is specified but an explicit
|
||||
WAL device is not, the WAL will be implicitly colocated with the DB on the faster
|
||||
device.
|
||||
If there is only a small amount of fast storage available (for example, less
|
||||
than a gigabyte), we recommend using the available space as a WAL device. But
|
||||
if more fast storage is available, it makes more sense to provision a DB
|
||||
device. Because the BlueStore journal is always placed on the fastest device
|
||||
available, using a DB device provides the same benefit that using a WAL device
|
||||
would, while *also* allowing additional metadata to be stored off the primary
|
||||
device (provided that it fits). DB devices make this possible because whenever
|
||||
a DB device is specified but an explicit WAL device is not, the WAL will be
|
||||
implicitly colocated with the DB on the faster device.
|
||||
|
||||
A single-device (colocated) BlueStore OSD can be provisioned with:
|
||||
To provision a single-device (colocated) BlueStore OSD, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data <device>
|
||||
|
||||
To specify a WAL device and/or DB device:
|
||||
To specify a WAL device or DB device, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
|
||||
|
||||
.. note:: ``--data`` can be a Logical Volume using *vg/lv* notation. Other
|
||||
devices can be existing logical volumes or GPT partitions.
|
||||
.. note:: The option ``--data`` can take as its argument any of the the
|
||||
following devices: logical volumes specified using *vg/lv* notation,
|
||||
existing logical volumes, and GPT partitions.
|
||||
|
||||
|
||||
|
||||
Provisioning strategies
|
||||
-----------------------
|
||||
Although there are multiple ways to deploy a BlueStore OSD (unlike Filestore
|
||||
which had just one), there are two common arrangements that should help clarify
|
||||
the deployment strategy:
|
||||
|
||||
BlueStore differs from Filestore in that there are several ways to deploy a
|
||||
BlueStore OSD. However, the overall deployment strategy for BlueStore can be
|
||||
clarified by examining just these two common arrangements:
|
||||
|
||||
.. _bluestore-single-type-device-config:
|
||||
|
||||
**block (data) only**
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
If all devices are the same type, for example all rotational drives, and
|
||||
there are no fast devices to use for metadata, it makes sense to specify the
|
||||
block device only and to not separate ``block.db`` or ``block.wal``. The
|
||||
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like:
|
||||
If all devices are of the same type (for example, they are all HDDs), and if
|
||||
there are no fast devices available for the storage of metadata, then it makes
|
||||
sense to specify the block device only and to leave ``block.db`` and
|
||||
``block.wal`` unseparated. The :ref:`ceph-volume-lvm` command for a single
|
||||
``/dev/sda`` device is as follows:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data /dev/sda
|
||||
|
||||
If logical volumes have already been created for each device, (a single LV
|
||||
using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
|
||||
``ceph-vg/block-lv`` would look like:
|
||||
If the devices to be used for a BlueStore OSD are pre-created logical volumes,
then the :ref:`ceph-volume-lvm` call for a logical volume named
``ceph-vg/block-lv`` is as follows:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -88,15 +99,18 @@ using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
|
||||
|
||||
**block and block.db**
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
If you have a mix of fast and slow devices (SSD / NVMe and rotational),
|
||||
it is recommended to place ``block.db`` on the faster device while ``block``
|
||||
(data) lives on the slower (spinning drive).
|
||||
|
||||
You must create these volume groups and logical volumes manually as
|
||||
the ``ceph-volume`` tool is currently not able to do so automatically.
|
||||
If you have a mix of fast and slow devices (for example, SSD or HDD), then we
|
||||
recommend placing ``block.db`` on the faster device while ``block`` (that is,
|
||||
the data) is stored on the slower device (that is, the rotational drive).
|
||||
|
||||
For the below example, let us assume four rotational (``sda``, ``sdb``, ``sdc``, and ``sdd``)
|
||||
and one (fast) solid state drive (``sdx``). First create the volume groups:
|
||||
You must create these volume groups and logical volumes manually, because the
``ceph-volume`` tool is currently unable to create them automatically.
|
||||
|
||||
The following procedure illustrates the manual creation of volume groups and
|
||||
logical volumes. For this example, we shall assume four rotational drives
|
||||
(``sda``, ``sdb``, ``sdc``, and ``sdd``) and one (fast) SSD (``sdx``). First,
|
||||
to create the volume groups, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -105,7 +119,7 @@ and one (fast) solid state drive (``sdx``). First create the volume groups:
|
||||
vgcreate ceph-block-2 /dev/sdc
|
||||
vgcreate ceph-block-3 /dev/sdd
|
||||
|
||||
Now create the logical volumes for ``block``:
|
||||
Next, to create the logical volumes for ``block``, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -114,8 +128,9 @@ Now create the logical volumes for ``block``:
|
||||
lvcreate -l 100%FREE -n block-2 ceph-block-2
|
||||
lvcreate -l 100%FREE -n block-3 ceph-block-3
|
||||
|
||||
We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
|
||||
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
|
||||
Because there are four HDDs, there will be four OSDs. Supposing that there is a
|
||||
200GB SSD in ``/dev/sdx``, we can create four 50GB logical volumes by running
|
||||
the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -125,7 +140,7 @@ SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
|
||||
lvcreate -L 50GB -n db-2 ceph-db-0
|
||||
lvcreate -L 50GB -n db-3 ceph-db-0
|
||||
|
||||
Finally, create the 4 OSDs with ``ceph-volume``:
|
||||
Finally, to create the four OSDs, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -134,149 +149,153 @@ Finally, create the 4 OSDs with ``ceph-volume``:
|
||||
ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
|
||||
ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
|
||||
|
||||
These operations should end up creating four OSDs, with ``block`` on the slower
|
||||
rotational drives with a 50 GB logical volume (DB) for each on the solid state
|
||||
drive.
|
||||
After this procedure is finished, there should be four OSDs, ``block`` should
|
||||
be on the four HDDs, and each HDD should have a 50GB logical volume
|
||||
(specifically, a DB device) on the shared SSD.
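One way to verify the resulting layout afterwards is to list what
``ceph-volume`` knows about (output omitted here):

.. prompt:: bash $

   ceph-volume lvm list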
|
||||
|
||||
Sizing
|
||||
======
|
||||
When using a :ref:`mixed spinning and solid drive setup
|
||||
<bluestore-mixed-device-config>` it is important to make a large enough
|
||||
``block.db`` logical volume for BlueStore. Generally, ``block.db`` should have
|
||||
*as large as possible* logical volumes.
|
||||
When using a :ref:`mixed spinning-and-solid-drive setup
|
||||
<bluestore-mixed-device-config>`, it is important to make a large enough
|
||||
``block.db`` logical volume for BlueStore. The logical volume associated with
``block.db`` should be *as large as possible*.
|
||||
|
||||
The general recommendation is to have ``block.db`` size in between 1% to 4%
|
||||
of ``block`` size. For RGW workloads, it is recommended that the ``block.db``
|
||||
size isn't smaller than 4% of ``block``, because RGW heavily uses it to store
|
||||
metadata (omap keys). For example, if the ``block`` size is 1TB, then ``block.db`` shouldn't
|
||||
be less than 40GB. For RBD workloads, 1% to 2% of ``block`` size is usually enough.
|
||||
It is generally recommended that the size of ``block.db`` be somewhere between
|
||||
1% and 4% of the size of ``block``. For RGW workloads, it is recommended that
|
||||
the ``block.db`` be at least 4% of the ``block`` size, because RGW makes heavy
|
||||
use of ``block.db`` to store metadata (in particular, omap keys). For example,
|
||||
if the ``block`` size is 1TB, then ``block.db`` should have a size of at least
|
||||
40GB. For RBD workloads, however, ``block.db`` usually needs no more than 1% to
|
||||
2% of the ``block`` size.
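Continuing the 1TB example above, a 40GB DB logical volume on a fast device
could be created in the same way as in the provisioning example earlier (the
volume group name ``ceph-db-0`` is hypothetical):

.. prompt:: bash $

   lvcreate -L 40GB -n db-0 ceph-db-0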
|
||||
|
||||
In older releases, internal level sizes mean that the DB can fully utilize only
|
||||
specific partition / LV sizes that correspond to sums of L0, L0+L1, L1+L2,
|
||||
etc. sizes, which with default settings means roughly 3 GB, 30 GB, 300 GB, and
|
||||
so forth. Most deployments will not substantially benefit from sizing to
|
||||
accommodate L3 and higher, though DB compaction can be facilitated by doubling
|
||||
these figures to 6GB, 60GB, and 600GB.
|
||||
In older releases, internal level sizes are such that the DB can fully utilize
|
||||
only those specific partition / logical volume sizes that correspond to sums of
|
||||
L0, L0+L1, L1+L2, and so on--that is, given default settings, sizes of roughly
|
||||
3GB, 30GB, 300GB, and so on. Most deployments do not substantially benefit from
|
||||
sizing that accommodates L3 and higher, though DB compaction can be facilitated
|
||||
by doubling these figures to 6GB, 60GB, and 600GB.
|
||||
|
||||
Improvements in releases beginning with Nautilus 14.2.12 and Octopus 15.2.6
|
||||
enable better utilization of arbitrary DB device sizes, and the Pacific
|
||||
release brings experimental dynamic level support. Users of older releases may
|
||||
thus wish to plan ahead by provisioning larger DB devices today so that their
|
||||
benefits may be realized with future upgrades.
|
||||
|
||||
When *not* using a mix of fast and slow devices, it isn't required to create
|
||||
separate logical volumes for ``block.db`` (or ``block.wal``). BlueStore will
|
||||
automatically colocate these within the space of ``block``.
|
||||
Improvements in Nautilus 14.2.12, Octopus 15.2.6, and subsequent releases allow
|
||||
for better utilization of arbitrarily-sized DB devices. Moreover, the Pacific
|
||||
release brings experimental dynamic-level support. Because of these advances,
|
||||
users of older releases might want to plan ahead by provisioning larger DB
|
||||
devices today so that the benefits of scale can be realized when upgrades are
|
||||
made in the future.
|
||||
|
||||
When *not* using a mix of fast and slow devices, there is no requirement to
|
||||
create separate logical volumes for ``block.db`` or ``block.wal``. BlueStore
|
||||
will automatically colocate these devices within the space of ``block``.
|
||||
|
||||
Automatic Cache Sizing
|
||||
======================
|
||||
|
||||
BlueStore can be configured to automatically resize its caches when TCMalloc
|
||||
is configured as the memory allocator and the ``bluestore_cache_autotune``
|
||||
setting is enabled. This option is currently enabled by default. BlueStore
|
||||
will attempt to keep OSD heap memory usage under a designated target size via
|
||||
the ``osd_memory_target`` configuration option. This is a best effort
|
||||
algorithm and caches will not shrink smaller than the amount specified by
|
||||
``osd_memory_cache_min``. Cache ratios will be chosen based on a hierarchy
|
||||
of priorities. If priority information is not available, the
|
||||
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are
|
||||
used as fallbacks.
|
||||
BlueStore can be configured to automatically resize its caches, provided that
|
||||
certain conditions are met: TCMalloc must be configured as the memory allocator
|
||||
and the ``bluestore_cache_autotune`` configuration option must be enabled (note
|
||||
that it is currently enabled by default). When automatic cache sizing is in
|
||||
effect, BlueStore attempts to keep OSD heap-memory usage under a certain target
|
||||
size (as determined by ``osd_memory_target``). This approach makes use of a
|
||||
best-effort algorithm and caches do not shrink smaller than the size defined by
|
||||
the value of ``osd_memory_cache_min``. Cache ratios are selected in accordance
|
||||
with a hierarchy of priorities. But if priority information is not available,
|
||||
the values specified in the ``bluestore_cache_meta_ratio`` and
|
||||
``bluestore_cache_kv_ratio`` options are used as fallback cache ratios.
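
For example, to raise the per-OSD memory target to 8 GiB for all OSDs and then
confirm the value seen by one OSD (the value is expressed in bytes and is an
illustrative figure, not a recommendation):

.. prompt:: bash $

   ceph config set osd osd_memory_target 8589934592
   ceph config get osd.0 osd_memory_target
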
|
||||
|
||||
|
||||
Manual Cache Sizing
|
||||
===================
|
||||
|
||||
The amount of memory consumed by each OSD for BlueStore caches is
|
||||
determined by the ``bluestore_cache_size`` configuration option. If
|
||||
that config option is not set (i.e., remains at 0), there is a
|
||||
different default value that is used depending on whether an HDD or
|
||||
SSD is used for the primary device (set by the
|
||||
``bluestore_cache_size_ssd`` and ``bluestore_cache_size_hdd`` config
|
||||
options).
|
||||
The amount of memory consumed by each OSD to be used for its BlueStore cache is
|
||||
determined by the ``bluestore_cache_size`` configuration option. If that option
|
||||
has not been specified (that is, if it remains at 0), then Ceph uses a
|
||||
different configuration option to determine the default memory budget:
|
||||
``bluestore_cache_size_hdd`` if the primary device is an HDD, or
|
||||
``bluestore_cache_size_ssd`` if the primary device is an SSD.
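
For example (the sizes are illustrative only), the defaults can be overridden
centrally for all OSDs of each media type:

.. prompt:: bash $

   ceph config set osd bluestore_cache_size_ssd 4294967296   # 4 GiB
   ceph config set osd bluestore_cache_size_hdd 1073741824   # 1 GiB
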
|
||||
|
||||
BlueStore and the rest of the Ceph OSD daemon do the best they can
|
||||
to work within this memory budget. Note that on top of the configured
|
||||
cache size, there is also memory consumed by the OSD itself, and
|
||||
some additional utilization due to memory fragmentation and other
|
||||
allocator overhead.
|
||||
BlueStore and the rest of the Ceph OSD daemon make every effort to work within
|
||||
this memory budget. Note that in addition to the configured cache size, there
|
||||
is also memory consumed by the OSD itself. There is additional utilization due
|
||||
to memory fragmentation and other allocator overhead.
|
||||
|
||||
The configured cache memory budget can be used in a few different ways:
|
||||
The configured cache-memory budget can be used to store the following types of
|
||||
things:
|
||||
|
||||
* Key/Value metadata (i.e., RocksDB's internal cache)
|
||||
* Key/Value metadata (that is, RocksDB's internal cache)
|
||||
* BlueStore metadata
|
||||
* BlueStore data (i.e., recently read or written object data)
|
||||
* BlueStore data (that is, recently read or recently written object data)
|
||||
|
||||
Cache memory usage is governed by the following options:
|
||||
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``.
|
||||
The fraction of the cache devoted to data
|
||||
is governed by the effective bluestore cache size (depending on
|
||||
``bluestore_cache_size[_ssd|_hdd]`` settings and the device class of the primary
|
||||
device) as well as the meta and kv ratios.
|
||||
The data fraction can be calculated by
|
||||
``<effective_cache_size> * (1 - bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``
|
||||
Cache memory usage is governed by the configuration options
|
||||
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``. The fraction
|
||||
of the cache that is reserved for data is governed by both the effective
|
||||
BlueStore cache size (which depends on the relevant
|
||||
``bluestore_cache_size[_ssd|_hdd]`` option and the device class of the primary
|
||||
device) and the "meta" and "kv" ratios. This data fraction can be calculated
|
||||
with the following formula: ``<effective_cache_size> * (1 -
|
||||
bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``.
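
As a purely illustrative calculation (the ratio values are assumptions, not
recommendations): with an effective cache size of 3 GiB, a meta ratio of 0.45,
and a kv ratio of 0.45, the portion of the cache left for data would be::

   3 GiB * (1 - 0.45 - 0.45) = 0.3 GiB
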
|
||||
|
||||
Checksums
|
||||
=========
|
||||
|
||||
BlueStore checksums all metadata and data written to disk. Metadata
|
||||
checksumming is handled by RocksDB and uses `crc32c`. Data
|
||||
checksumming is done by BlueStore and can make use of `crc32c`,
|
||||
`xxhash32`, or `xxhash64`. The default is `crc32c` and should be
|
||||
suitable for most purposes.
|
||||
BlueStore checksums all metadata and all data written to disk. Metadata
|
||||
checksumming is handled by RocksDB and uses the `crc32c` algorithm. By
|
||||
contrast, data checksumming is handled by BlueStore and can use either
|
||||
`crc32c`, `xxhash32`, or `xxhash64`. Nonetheless, `crc32c` is the default
|
||||
checksum algorithm and it is suitable for most purposes.
|
||||
|
||||
Full data checksumming does increase the amount of metadata that
|
||||
BlueStore must store and manage. When possible, e.g., when clients
|
||||
hint that data is written and read sequentially, BlueStore will
|
||||
checksum larger blocks, but in many cases it must store a checksum
|
||||
value (usually 4 bytes) for every 4 kilobyte block of data.
|
||||
Full data checksumming increases the amount of metadata that BlueStore must
|
||||
store and manage. Whenever possible (for example, when clients hint that data
|
||||
is written and read sequentially), BlueStore will checksum larger blocks. In
|
||||
many cases, however, it must store a checksum value (usually 4 bytes) for every
|
||||
4 KB block of data.
|
||||
|
||||
It is possible to use a smaller checksum value by truncating the
|
||||
checksum to two or one byte, reducing the metadata overhead. The
|
||||
trade-off is that the probability that a random error will not be
|
||||
detected is higher with a smaller checksum, going from about one in
|
||||
four billion with a 32-bit (4 byte) checksum to one in 65,536 for a
|
||||
16-bit (2 byte) checksum or one in 256 for an 8-bit (1 byte) checksum.
|
||||
The smaller checksum values can be used by selecting `crc32c_16` or
|
||||
`crc32c_8` as the checksum algorithm.
|
||||
It is possible to obtain a smaller checksum value by truncating the checksum to
|
||||
one or two bytes and reducing the metadata overhead. A drawback of this
|
||||
approach is that it increases the probability of a random error going
|
||||
undetected: about one in four billion given a 32-bit (4 byte) checksum, 1 in
|
||||
65,536 given a 16-bit (2 byte) checksum, and 1 in 256 given an 8-bit (1 byte)
|
||||
checksum. To use the smaller checksum values, select `crc32c_16` or `crc32c_8`
|
||||
as the checksum algorithm.
|
||||
|
||||
The *checksum algorithm* can be set either via a per-pool
|
||||
``csum_type`` property or the global config option. For example:
|
||||
The *checksum algorithm* can be specified either via a per-pool ``csum_type``
|
||||
configuration option or via the global configuration option. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set <pool-name> csum_type <algorithm>
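
For example, to select the truncated 16-bit checksum on a hypothetical pool
named ``testpool``:

.. prompt:: bash $

   ceph osd pool set testpool csum_type crc32c_16
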
|
||||
|
||||
|
||||
Inline Compression
|
||||
==================
|
||||
|
||||
BlueStore supports inline compression using `snappy`, `zlib`, or
|
||||
`lz4`. Please note that the `lz4` compression plugin is not
|
||||
distributed in the official release.
|
||||
BlueStore supports inline compression using `snappy`, `zlib`, `lz4`, or `zstd`.
|
||||
|
||||
Whether data in BlueStore is compressed is determined by a combination
|
||||
of the *compression mode* and any hints associated with a write
|
||||
operation. The modes are:
|
||||
Whether data in BlueStore is compressed is determined by two factors: (1) the
|
||||
*compression mode* and (2) any client hints associated with a write operation.
|
||||
The compression modes are as follows:
|
||||
|
||||
* **none**: Never compress data.
|
||||
* **passive**: Do not compress data unless the write operation has a
|
||||
*compressible* hint set.
|
||||
* **aggressive**: Compress data unless the write operation has an
|
||||
* **aggressive**: Do compress data unless the write operation has an
|
||||
*incompressible* hint set.
|
||||
* **force**: Try to compress data no matter what.
|
||||
|
||||
For more information about the *compressible* and *incompressible* IO
|
||||
hints, see :c:func:`rados_set_alloc_hint`.
|
||||
For more information about the *compressible* and *incompressible* I/O hints,
|
||||
see :c:func:`rados_set_alloc_hint`.
|
||||
|
||||
Note that regardless of the mode, if the size of the data chunk is not
|
||||
reduced sufficiently it will not be used and the original
|
||||
(uncompressed) data will be stored. For example, if the ``bluestore
|
||||
compression required ratio`` is set to ``.7`` then the compressed data
|
||||
must be 70% of the size of the original (or smaller).
|
||||
Note that data in BlueStore will be compressed only if the data chunk will be
|
||||
sufficiently reduced in size (as determined by the ``bluestore compression
|
||||
required ratio`` setting). No matter which compression modes have been used, if
|
||||
the data chunk is too big, then it will be discarded and the original
|
||||
(uncompressed) data will be stored instead. For example, if ``bluestore
|
||||
compression required ratio`` is set to ``.7``, then data compression will take
|
||||
place only if the size of the compressed data is no more than 70% of the size
|
||||
of the original data.
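
For instance (a sketch only; the pool name is illustrative), the following
enables aggressive compression on a pool and requires at least a 30% size
reduction before compressed data is kept:

.. prompt:: bash $

   ceph osd pool set testpool compression_mode aggressive
   ceph osd pool set testpool compression_required_ratio .7
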
|
||||
|
||||
The *compression mode*, *compression algorithm*, *compression required
|
||||
ratio*, *min blob size*, and *max blob size* can be set either via a
|
||||
per-pool property or a global config option. Pool properties can be
|
||||
set with:
|
||||
The *compression mode*, *compression algorithm*, *compression required ratio*,
|
||||
*min blob size*, and *max blob size* settings can be specified either via a
|
||||
per-pool property or via a global config option. To specify pool properties,
|
||||
run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -291,192 +310,202 @@ set with:
|
||||
RocksDB Sharding
|
||||
================
|
||||
|
||||
Internally BlueStore uses multiple types of key-value data,
|
||||
stored in RocksDB. Each data type in BlueStore is assigned a
|
||||
unique prefix. Until Pacific all key-value data was stored in
|
||||
single RocksDB column family: 'default'. Since Pacific,
|
||||
BlueStore can divide this data into multiple RocksDB column
|
||||
families. When keys have similar access frequency, modification
|
||||
frequency and lifetime, BlueStore benefits from better caching
|
||||
and more precise compaction. This improves performance, and also
|
||||
requires less disk space during compaction, since each column
|
||||
family is smaller and can compact independent of others.
|
||||
BlueStore maintains several types of internal key-value data, all of which are
|
||||
stored in RocksDB. Each data type in BlueStore is assigned a unique prefix.
|
||||
Prior to the Pacific release, all key-value data was stored in a single RocksDB
|
||||
column family: 'default'. In Pacific and later releases, however, BlueStore can
|
||||
divide key-value data into several RocksDB column families. BlueStore achieves
|
||||
better caching and more precise compaction when keys are similar: specifically,
|
||||
when keys have similar access frequency, similar modification frequency, and a
|
||||
similar lifetime. Under such conditions, performance is improved and less disk
|
||||
space is required during compaction (because each column family is smaller and
|
||||
is able to compact independently of the others).
|
||||
|
||||
OSDs deployed in Pacific or later use RocksDB sharding by default.
|
||||
If Ceph is upgraded to Pacific from a previous version, sharding is off.
|
||||
OSDs deployed in Pacific or later releases use RocksDB sharding by default.
|
||||
However, if Ceph has been upgraded to Pacific or a later version from a
|
||||
previous version, sharding is disabled on any OSDs that were created before
|
||||
Pacific.
|
||||
|
||||
To enable sharding and apply the Pacific defaults, stop an OSD and run
|
||||
To enable sharding and apply the Pacific defaults to a specific OSD, stop the
|
||||
OSD and run the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-bluestore-tool \
|
||||
--path <data path> \
|
||||
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
|
||||
--sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
|
||||
reshard
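
To read back the sharding definition that is currently in effect (assuming
your release's ``ceph-bluestore-tool`` provides the ``show-sharding`` command),
something like the following can be used:

.. prompt:: bash #

   ceph-bluestore-tool --path <data path> show-sharding
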
|
||||
|
||||
|
||||
Throttling
|
||||
SPDK Usage
|
||||
==========
|
||||
|
||||
SPDK Usage
|
||||
==================
|
||||
|
||||
If you want to use the SPDK driver for NVMe devices, you must prepare your system.
|
||||
Refer to `SPDK document`__ for more details.
|
||||
To use the SPDK driver for NVMe devices, you must first prepare your system.
|
||||
See `SPDK document`__.
|
||||
|
||||
.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
|
||||
|
||||
SPDK offers a script to configure the device automatically. Users can run the
|
||||
script as root:
|
||||
SPDK offers a script that will configure the device automatically. Run this
|
||||
script with root permissions:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo src/spdk/scripts/setup.sh
|
||||
|
||||
You will need to specify the subject NVMe device's device selector with
|
||||
the "spdk:" prefix for ``bluestore_block_path``.
|
||||
You will need to specify the subject NVMe device's device selector with the
|
||||
"spdk:" prefix for ``bluestore_block_path``.
|
||||
|
||||
For example, you can find the device selector of an Intel PCIe SSD with:
|
||||
In the following example, you first find the device selector of an Intel NVMe
|
||||
SSD by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
lspci -mm -n -D -d 8086:0953
|
||||
|
||||
The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF``.
|
||||
The form of the device selector is either ``DDDD:BB:DD.FF`` or
|
||||
``DDDD.BB.DD.FF``.
|
||||
|
||||
and then set::
|
||||
Next, supposing that ``0000:01:00.0`` is the device selector found in the
|
||||
output of the ``lspci`` command, you can specify the device selector by running
|
||||
the following command::
|
||||
|
||||
bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0"
|
||||
bluestore_block_path = "spdk:trtype:pcie traddr:0000:01:00.0"
|
||||
|
||||
Where ``0000:01:00.0`` is the device selector found in the output of ``lspci``
|
||||
command above.
|
||||
|
||||
You may also specify a remote NVMeoF target over the TCP transport as in the
|
||||
You may also specify a remote NVMeoF target over the TCP transport, as in the
|
||||
following example::
|
||||
|
||||
bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
|
||||
bluestore_block_path = "spdk:trtype:tcp traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
|
||||
|
||||
To run multiple SPDK instances per node, you must specify the
|
||||
amount of dpdk memory in MB that each instance will use, to make sure each
|
||||
instance uses its own DPDK memory.
|
||||
To run multiple SPDK instances per node, you must make sure each instance uses
|
||||
its own DPDK memory by specifying for each instance the amount of DPDK memory
|
||||
(in MB) that the instance will use.
|
||||
|
||||
In most cases, a single device can be used for data, DB, and WAL. We describe
|
||||
In most cases, a single device can be used for data, DB, and WAL. We describe
|
||||
this strategy as *colocating* these components. Be sure to enter the below
|
||||
settings to ensure that all IOs are issued through SPDK.::
|
||||
settings to ensure that all I/Os are issued through SPDK::
|
||||
|
||||
bluestore_block_db_path = ""
|
||||
bluestore_block_db_size = 0
|
||||
bluestore_block_wal_path = ""
|
||||
bluestore_block_wal_size = 0
|
||||
|
||||
Otherwise, the current implementation will populate the SPDK map files with
|
||||
kernel file system symbols and will use the kernel driver to issue DB/WAL IO.
|
||||
If these settings are not entered, then the current implementation will
|
||||
populate the SPDK map files with kernel file system symbols and will use the
|
||||
kernel driver to issue DB/WAL I/Os.
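
Putting the above together, a minimal sketch of the relevant OSD options for a
colocated SPDK OSD might look like the following in ``ceph.conf`` (the PCIe
address is the illustrative one from the earlier example)::

    [osd]
    bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0"
    bluestore_block_db_path = ""
    bluestore_block_db_size = 0
    bluestore_block_wal_path = ""
    bluestore_block_wal_size = 0
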
|
||||
|
||||
Minimum Allocation Size
|
||||
========================
|
||||
=======================
|
||||
|
||||
There is a configured minimum amount of storage that BlueStore will allocate on
|
||||
an OSD. In practice, this is the least amount of capacity that a RADOS object
|
||||
can consume. The value of `bluestore_min_alloc_size` is derived from the
|
||||
value of `bluestore_min_alloc_size_hdd` or `bluestore_min_alloc_size_ssd`
|
||||
depending on the OSD's ``rotational`` attribute. This means that when an OSD
|
||||
is created on an HDD, BlueStore will be initialized with the current value
|
||||
of `bluestore_min_alloc_size_hdd`, and SSD OSDs (including NVMe devices)
|
||||
with the value of `bluestore_min_alloc_size_ssd`.
|
||||
There is a configured minimum amount of storage that BlueStore allocates on an
|
||||
underlying storage device. In practice, this is the least amount of capacity
|
||||
that even a tiny RADOS object can consume on each OSD's primary device. The
|
||||
configuration option in question-- ``bluestore_min_alloc_size`` --derives
|
||||
its value from the value of either ``bluestore_min_alloc_size_hdd`` or
|
||||
``bluestore_min_alloc_size_ssd``, depending on the OSD's ``rotational``
|
||||
attribute. Thus if an OSD is created on an HDD, BlueStore is initialized with
|
||||
the current value of ``bluestore_min_alloc_size_hdd``; but with SSD OSDs
|
||||
(including NVMe devices), Bluestore is initialized with the current value of
|
||||
``bluestore_min_alloc_size_ssd``.
|
||||
|
||||
Through the Mimic release, the default values were 64KB and 16KB for rotational
|
||||
(HDD) and non-rotational (SSD) media respectively. Octopus changed the default
|
||||
for SSD (non-rotational) media to 4KB, and Pacific changed the default for HDD
|
||||
(rotational) media to 4KB as well.
|
||||
In Mimic and earlier releases, the default values were 64KB for rotational
|
||||
media (HDD) and 16KB for non-rotational media (SSD). The Octopus release
|
||||
changed the default value for non-rotational media (SSD) to 4KB, and the
|
||||
Pacific release changed the default value for rotational media (HDD) to 4KB.
|
||||
|
||||
These changes were driven by space amplification experienced by Ceph RADOS
|
||||
GateWay (RGW) deployments that host large numbers of small files
|
||||
These changes were driven by space amplification that was experienced by Ceph
|
||||
RADOS GateWay (RGW) deployments that hosted large numbers of small files
|
||||
(S3/Swift objects).
|
||||
|
||||
For example, when an RGW client stores a 1KB S3 object, it is written to a
|
||||
single RADOS object. With the default `min_alloc_size` value, 4KB of
|
||||
underlying drive space is allocated. This means that roughly
|
||||
(4KB - 1KB) == 3KB is allocated but never used, which corresponds to 300%
|
||||
overhead or 25% efficiency. Similarly, a 5KB user object will be stored
|
||||
as one 4KB and one 1KB RADOS object, again stranding 4KB of device capacity,
|
||||
though in this case the overhead is a much smaller percentage. Think of this
|
||||
in terms of the remainder from a modulus operation. The overhead *percentage*
|
||||
thus decreases rapidly as user object size increases.
|
||||
For example, when an RGW client stores a 1 KB S3 object, that object is written
|
||||
to a single RADOS object. In accordance with the default
|
||||
``min_alloc_size`` value, 4 KB of underlying drive space is allocated.
|
||||
This means that roughly 3 KB (that is, 4 KB minus 1 KB) is allocated but never
|
||||
used: this corresponds to 300% overhead or 25% efficiency. Similarly, a 5 KB
|
||||
user object will be stored as two RADOS objects, a 4 KB RADOS object and a 1 KB
|
||||
RADOS object, with the result that 4KB of device capacity is stranded. In this
|
||||
case, however, the overhead percentage is much smaller. Think of this in terms
|
||||
of the remainder from a modulus operation. The overhead *percentage* thus
|
||||
decreases rapidly as object size increases.
|
||||
|
||||
An easily missed additional subtlety is that this
|
||||
takes place for *each* replica. So when using the default three copies of
|
||||
data (3R), a 1KB S3 object actually consumes roughly 9KB of storage device
|
||||
capacity. If erasure coding (EC) is used instead of replication, the
|
||||
amplification may be even higher: for a ``k=4,m=2`` pool, our 1KB S3 object
|
||||
will allocate (6 * 4KB) = 24KB of device capacity.
|
||||
There is an additional subtlety that is easily missed: the amplification
|
||||
phenomenon just described takes place for *each* replica. For example, when
|
||||
using the default of three copies of data (3R), a 1 KB S3 object actually
|
||||
strands roughly 9 KB of storage device capacity. If erasure coding (EC) is used
|
||||
instead of replication, the amplification might be even higher: for a ``k=4,
|
||||
m=2`` pool, our 1 KB S3 object allocates 24 KB (that is, 4 KB multiplied by 6)
|
||||
of device capacity.
|
||||
|
||||
When an RGW bucket pool contains many relatively large user objects, the effect
|
||||
of this phenomenon is often negligible, but should be considered for deployments
|
||||
that expect a significant fraction of relatively small objects.
|
||||
of this phenomenon is often negligible. However, with deployments that can
|
||||
expect a significant fraction of relatively small user objects, the effect
|
||||
should be taken into consideration.
|
||||
|
||||
The 4KB default value aligns well with conventional HDD and SSD devices. Some
|
||||
new coarse-IU (Indirection Unit) QLC SSDs however perform and wear best
|
||||
when `bluestore_min_alloc_size_ssd`
|
||||
is set at OSD creation to match the device's IU: 8KB, 16KB, or even 64KB.
|
||||
These novel storage drives allow one to achieve read performance competitive
|
||||
with conventional TLC SSDs and write performance faster than HDDs, with
|
||||
high density and lower cost than TLC SSDs.
|
||||
The 4KB default value aligns well with conventional HDD and SSD devices.
|
||||
However, certain novel coarse-IU (Indirection Unit) QLC SSDs perform and wear
|
||||
best when ``bluestore_min_alloc_size_ssd`` is specified at OSD creation
|
||||
to match the device's IU: this might be 8KB, 16KB, or even 64KB. These novel
|
||||
storage drives can achieve read performance that is competitive with that of
|
||||
conventional TLC SSDs and write performance that is faster than that of HDDs,
|
||||
with higher density and lower cost than TLC SSDs.
|
||||
|
||||
Note that when creating OSDs on these devices, one must carefully apply the
|
||||
non-default value only to appropriate devices, and not to conventional SSD and
|
||||
HDD devices. This may be done through careful ordering of OSD creation, custom
|
||||
OSD device classes, and especially by the use of central configuration _masks_.
|
||||
Note that when creating OSDs on these novel devices, one must be careful to
|
||||
apply the non-default value only to appropriate devices, and not to
|
||||
conventional HDD and SSD devices. Error can be avoided through careful ordering
|
||||
of OSD creation, with custom OSD device classes, and especially by the use of
|
||||
central configuration *masks*.
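
For example (the device class name and the value are illustrative only), a
central configuration mask scoped to a custom CRUSH device class can confine
the non-default value to the coarse-IU drives. Because the setting is read only
at OSD creation, the mask must be in place before those OSDs are created:

.. prompt:: bash #

   ceph config set osd/class:qlc bluestore_min_alloc_size_ssd 16384
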
|
||||
|
||||
Quincy and later releases add
|
||||
the `bluestore_use_optimal_io_size_for_min_alloc_size`
|
||||
option that enables automatic discovery of the appropriate value as each OSD is
|
||||
created. Note that the use of ``bcache``, ``OpenCAS``, ``dmcrypt``,
|
||||
``ATA over Ethernet``, `iSCSI`, or other device layering / abstraction
|
||||
technologies may confound the determination of appropriate values. OSDs
|
||||
deployed on top of VMware storage have been reported to also
|
||||
sometimes report a ``rotational`` attribute that does not match the underlying
|
||||
hardware.
|
||||
In Quincy and later releases, you can use the
|
||||
``bluestore_use_optimal_io_size_for_min_alloc_size`` option to allow
|
||||
automatic discovery of the correct value as each OSD is created. Note that the
|
||||
use of ``bcache``, ``OpenCAS``, ``dmcrypt``, ``ATA over Ethernet``, `iSCSI`, or
|
||||
other device-layering and abstraction technologies might confound the
|
||||
determination of correct values. Moreover, OSDs deployed on top of VMware
|
||||
storage have sometimes been found to report a ``rotational`` attribute that
|
||||
does not match the underlying hardware.
|
||||
|
||||
We suggest inspecting such OSDs at startup via logs and admin sockets to ensure that
|
||||
behavior is appropriate. Note that this also may not work as desired with
|
||||
older kernels. You can check for this by examining the presence and value
|
||||
of ``/sys/block/<drive>/queue/optimal_io_size``.
|
||||
We suggest inspecting such OSDs at startup via logs and admin sockets in order
|
||||
to ensure that their behavior is correct. Be aware that this kind of inspection
|
||||
might not work as expected with older kernels. To check for this issue,
|
||||
examine the presence and value of ``/sys/block/<drive>/queue/optimal_io_size``.
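
For example (the drive name is illustrative):

.. prompt:: bash $

   cat /sys/block/sda/queue/optimal_io_size   # a value of 0 means no optimal I/O size is reported
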
|
||||
|
||||
You may also inspect a given OSD:
|
||||
.. note:: When running Reef or a later Ceph release, the ``min_alloc_size``
|
||||
baked into each OSD is conveniently reported by ``ceph osd metadata``.
|
||||
|
||||
To inspect a specific OSD, run the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph osd metadata osd.1701 | grep rotational
|
||||
ceph osd metadata osd.1701 | egrep rotational\|alloc
|
||||
|
||||
This space amplification may manifest as an unusually high ratio of raw to
|
||||
stored data reported by ``ceph df``. ``ceph osd df`` may also report
|
||||
anomalously high ``%USE`` / ``VAR`` values when
|
||||
compared to other, ostensibly identical OSDs. A pool using OSDs with
|
||||
mismatched ``min_alloc_size`` values may experience unexpected balancer
|
||||
behavior as well.
|
||||
This space amplification might manifest as an unusually high ratio of raw to
|
||||
stored data as reported by ``ceph df``. There might also be ``%USE`` / ``VAR``
|
||||
values reported by ``ceph osd df`` that are unusually high in comparison to
|
||||
other, ostensibly identical, OSDs. Finally, there might be unexpected balancer
|
||||
behavior in pools that use OSDs that have mismatched ``min_alloc_size`` values.
|
||||
|
||||
Note that this BlueStore attribute takes effect *only* at OSD creation; if
|
||||
changed later, a given OSD's behavior will not change unless / until it is
|
||||
destroyed and redeployed with the appropriate option value(s). Upgrading
|
||||
to a later Ceph release will *not* change the value used by OSDs deployed
|
||||
under older releases or with other settings.
|
||||
This BlueStore attribute takes effect *only* at OSD creation; if the attribute
|
||||
is changed later, a specific OSD's behavior will not change unless and until
|
||||
the OSD is destroyed and redeployed with the appropriate option value(s).
|
||||
Upgrading to a later Ceph release will *not* change the value used by OSDs that
|
||||
were deployed under older releases or with other settings.
|
||||
|
||||
DSA (Data Streaming Accelerator Usage)
|
||||
DSA (Data Streaming Accelerator) Usage
|
||||
======================================
|
||||
|
||||
If you want to use the DML library to drive DSA device for offloading
|
||||
read/write operations on Persist memory in Bluestore. You need to install
|
||||
`DML`_ and `idxd-config`_ library in your machine with SPR (Sapphire Rapids) CPU.
|
||||
If you want to use the DML library to drive the DSA device for offloading
|
||||
read/write operations on persistent memory (PMEM) in BlueStore, you need to
|
||||
install `DML`_ and the `idxd-config`_ library. This will work only on machines
|
||||
that have a SPR (Sapphire Rapids) CPU.
|
||||
|
||||
.. _DML: https://github.com/intel/DML
|
||||
.. _idxd-config: https://github.com/intel/idxd-config
|
||||
|
||||
After installing the DML software, you need to configure the shared
|
||||
work queues (WQs) with the following WQ configuration example via accel-config tool:
|
||||
After installing the DML software, configure the shared work queues (WQs) with
|
||||
reference to the following WQ configuration example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
|
||||
accel-config config-engine dsa0/engine0.1 --group-id=1
|
||||
accel-config enable-device dsa0
|
||||
accel-config enable-wq dsa0/wq0.1
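
To verify the result, the configured devices and work queues can be listed (a
sketch; the output format varies by ``accel-config`` version):

.. prompt:: bash $

   accel-config list
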
|
||||
|
@ -218,4 +218,4 @@ If you need to allow multiple clusters to exist on the same host, use
|
||||
.. _Hardware Recommendations: ../../../start/hardware-recommendations
|
||||
.. _Network Configuration Reference: ../network-config-ref
|
||||
.. _OSD Config Reference: ../osd-config-ref
|
||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interactio
|
||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
|
||||
|
@ -2,8 +2,14 @@
|
||||
Filestore Config Reference
|
||||
============================
|
||||
|
||||
The Filestore back end is no longer the default when creating new OSDs,
|
||||
though Filestore OSDs are still supported.
|
||||
.. note:: Since the Luminous release of Ceph, Filestore has not been Ceph's
|
||||
default storage back end. Since the Luminous release of Ceph, BlueStore has
|
||||
been Ceph's default storage back end. However, Filestore OSDs are still
|
||||
supported. See :ref:`OSD Back Ends
|
||||
<rados_config_storage_devices_osd_backends>`. See :ref:`BlueStore Migration
|
||||
<rados_operations_bluestore_migration>` for instructions explaining how to
|
||||
replace an existing Filestore back end with a BlueStore back end.
|
||||
|
||||
|
||||
``filestore debug omap check``
|
||||
|
||||
@ -18,26 +24,31 @@ though Filestore OSDs are still supported.
|
||||
Extended Attributes
|
||||
===================
|
||||
|
||||
Extended Attributes (XATTRs) are important for Filestore OSDs.
|
||||
Some file systems have limits on the number of bytes that can be stored in XATTRs.
|
||||
Additionally, in some cases, the file system may not be as fast as an alternative
|
||||
method of storing XATTRs. The following settings may help improve performance
|
||||
by using a method of storing XATTRs that is extrinsic to the underlying file system.
|
||||
Extended Attributes (XATTRs) are important for Filestore OSDs. However, certain
|
||||
disadvantages can occur when the underlying file system is used for the storage
|
||||
of XATTRs: some file systems have limits on the number of bytes that can be
|
||||
stored in XATTRs, and your file system might in some cases therefore run slower
|
||||
than would an alternative method of storing XATTRs. For this reason, a method
|
||||
of storing XATTRs extrinsic to the underlying file system might improve
|
||||
performance. To implement such an extrinsic method, refer to the following
|
||||
settings.
|
||||
|
||||
Ceph XATTRs are stored as ``inline xattr``, using the XATTRs provided
|
||||
by the underlying file system, if it does not impose a size limit. If
|
||||
there is a size limit (4KB total on ext4, for instance), some Ceph
|
||||
XATTRs will be stored in a key/value database when either the
|
||||
If the underlying file system has no size limit, then Ceph XATTRs are stored as
|
||||
``inline xattr``, using the XATTRs provided by the file system. But if there is
|
||||
a size limit (for example, ext4 imposes a limit of 4 KB total), then some Ceph
|
||||
XATTRs will be stored in a key/value database when the limit is reached. More
|
||||
precisely, this begins to occur when either the
|
||||
``filestore_max_inline_xattr_size`` or ``filestore_max_inline_xattrs``
|
||||
threshold is reached.
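
As a sketch (the values shown are illustrative, not recommendations), these
thresholds can be adjusted centrally with the usual configuration commands:

.. prompt:: bash $

   ceph config set osd filestore_max_inline_xattr_size 65536
   ceph config set osd filestore_max_inline_xattrs 10
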
|
||||
|
||||
|
||||
``filestore_max_inline_xattr_size``
|
||||
|
||||
:Description: The maximum size of an XATTR stored in the file system (i.e., XFS,
|
||||
Btrfs, EXT4, etc.) per object. Should not be larger than the
|
||||
file system can handle. Default value of 0 means to use the value
|
||||
specific to the underlying file system.
|
||||
:Description: Defines the maximum size per object of an XATTR that can be
|
||||
stored in the file system (for example, XFS, Btrfs, ext4). The
|
||||
specified size should not be larger than the file system can
|
||||
handle. Using the default value of 0 instructs Filestore to use
|
||||
the value specific to the file system.
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``0``
|
||||
@ -45,8 +56,9 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattr_size_xfs``
|
||||
|
||||
:Description: The maximum size of an XATTR stored in the XFS file system.
|
||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
||||
:Description: Defines the maximum size of an XATTR that can be stored in the
|
||||
XFS file system. This setting is used only if
|
||||
``filestore_max_inline_xattr_size`` == 0.
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``65536``
|
||||
@ -54,8 +66,9 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattr_size_btrfs``
|
||||
|
||||
:Description: The maximum size of an XATTR stored in the Btrfs file system.
|
||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
||||
:Description: Defines the maximum size of an XATTR that can be stored in the
|
||||
Btrfs file system. This setting is used only if
|
||||
``filestore_max_inline_xattr_size`` == 0.
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``2048``
|
||||
@ -63,8 +76,8 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattr_size_other``
|
||||
|
||||
:Description: The maximum size of an XATTR stored in other file systems.
|
||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
||||
:Description: Defines the maximum size of an XATTR that can be stored in other file systems.
|
||||
This setting is used only if ``filestore_max_inline_xattr_size`` == 0.
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``512``
|
||||
@ -72,9 +85,8 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattrs``
|
||||
|
||||
:Description: The maximum number of XATTRs stored in the file system per object.
|
||||
Default value of 0 means to use the value specific to the
|
||||
underlying file system.
|
||||
:Description: Defines the maximum number of XATTRs per object that can be stored in the file system.
|
||||
Using the default value of 0 instructs Filestore to use the value specific to the file system.
|
||||
:Type: 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``0``
|
||||
@ -82,8 +94,8 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattrs_xfs``
|
||||
|
||||
:Description: The maximum number of XATTRs stored in the XFS file system per object.
|
||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Description: Defines the maximum number of XATTRs per object that can be stored in the XFS file system.
|
||||
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Type: 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``10``
|
||||
@ -91,8 +103,8 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattrs_btrfs``
|
||||
|
||||
:Description: The maximum number of XATTRs stored in the Btrfs file system per object.
|
||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Description: Defines the maximum number of XATTRs per object that can be stored in the Btrfs file system.
|
||||
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Type: 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``10``
|
||||
@ -100,8 +112,8 @@ threshold is reached.
|
||||
|
||||
``filestore_max_inline_xattrs_other``
|
||||
|
||||
:Description: The maximum number of XATTRs stored in other file systems per object.
|
||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Description: Defines the maximum number of XATTRs per object that can be stored in other file systems.
|
||||
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||
:Type: 32-bit Integer
|
||||
:Required: No
|
||||
:Default: ``2``
|
||||
@ -111,18 +123,19 @@ threshold is reached.
|
||||
Synchronization Intervals
|
||||
=========================
|
||||
|
||||
Filestore needs to periodically quiesce writes and synchronize the
|
||||
file system, which creates a consistent commit point. It can then free journal
|
||||
entries up to the commit point. Synchronizing more frequently tends to reduce
|
||||
the time required to perform synchronization, and reduces the amount of data
|
||||
that needs to remain in the journal. Less frequent synchronization allows the
|
||||
backing file system to coalesce small writes and metadata updates more
|
||||
optimally, potentially resulting in more efficient synchronization at the
|
||||
expense of potentially increasing tail latency.
|
||||
Filestore must periodically quiesce writes and synchronize the file system.
|
||||
Each synchronization creates a consistent commit point. When the commit point
|
||||
is created, Filestore is able to free all journal entries up to that point.
|
||||
More-frequent synchronization tends to reduce both synchronization time and
|
||||
the amount of data that needs to remain in the journal. Less-frequent
|
||||
synchronization allows the backing file system to coalesce small writes and
|
||||
metadata updates, potentially increasing synchronization
|
||||
efficiency but also potentially increasing tail latency.
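
For example (the values are illustrative), the synchronization window can be
widened on a cluster whose backing file system benefits from coalescing:

.. prompt:: bash $

   ceph config set osd filestore_min_sync_interval .05
   ceph config set osd filestore_max_sync_interval 10
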
|
||||
|
||||
|
||||
``filestore_max_sync_interval``
|
||||
|
||||
:Description: The maximum interval in seconds for synchronizing Filestore.
|
||||
:Description: Defines the maximum interval (in seconds) for synchronizing Filestore.
|
||||
:Type: Double
|
||||
:Required: No
|
||||
:Default: ``5``
|
||||
@ -130,7 +143,7 @@ expense of potentially increasing tail latency.
|
||||
|
||||
``filestore_min_sync_interval``
|
||||
|
||||
:Description: The minimum interval in seconds for synchronizing Filestore.
|
||||
:Description: Defines the minimum interval (in seconds) for synchronizing Filestore.
|
||||
:Type: Double
|
||||
:Required: No
|
||||
:Default: ``.01``
|
||||
@ -142,14 +155,14 @@ Flusher
|
||||
=======
|
||||
|
||||
The Filestore flusher forces data from large writes to be written out using
|
||||
``sync_file_range`` before the sync in order to (hopefully) reduce the cost of
|
||||
the eventual sync. In practice, disabling 'filestore_flusher' seems to improve
|
||||
performance in some cases.
|
||||
``sync_file_range`` prior to the synchronization.
|
||||
Ideally, this action reduces the cost of the eventual synchronization. In practice, however, disabling
|
||||
'filestore_flusher' seems in some cases to improve performance.
|
||||
|
||||
|
||||
``filestore_flusher``
|
||||
|
||||
:Description: Enables the filestore flusher.
|
||||
:Description: Enables the Filestore flusher.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -158,7 +171,7 @@ performance in some cases.
|
||||
|
||||
``filestore_flusher_max_fds``
|
||||
|
||||
:Description: Sets the maximum number of file descriptors for the flusher.
|
||||
:Description: Defines the maximum number of file descriptors for the flusher.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``512``
|
||||
@ -176,7 +189,7 @@ performance in some cases.
|
||||
|
||||
``filestore_fsync_flushes_journal_data``
|
||||
|
||||
:Description: Flush journal data during file system synchronization.
|
||||
:Description: Flushes journal data during file-system synchronization.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -187,11 +200,11 @@ performance in some cases.
|
||||
Queue
|
||||
=====
|
||||
|
||||
The following settings provide limits on the size of the Filestore queue.
|
||||
The following settings define limits on the size of the Filestore queue:
|
||||
|
||||
``filestore_queue_max_ops``
|
||||
|
||||
:Description: Defines the maximum number of in progress operations the file store accepts before blocking on queuing new operations.
|
||||
:Description: Defines the maximum number of in-progress operations that Filestore accepts before it blocks the queueing of any new operations.
|
||||
:Type: Integer
|
||||
:Required: No. Minimal impact on performance.
|
||||
:Default: ``50``
|
||||
@ -199,23 +212,20 @@ The following settings provide limits on the size of the Filestore queue.
|
||||
|
||||
``filestore_queue_max_bytes``
|
||||
|
||||
:Description: The maximum number of bytes for an operation.
|
||||
:Description: Defines the maximum number of bytes permitted per operation.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``100 << 20``
|
||||
|
||||
|
||||
|
||||
|
||||
.. index:: filestore; timeouts
|
||||
|
||||
Timeouts
|
||||
========
|
||||
|
||||
|
||||
``filestore_op_threads``
|
||||
|
||||
:Description: The number of file system operation threads that execute in parallel.
|
||||
:Description: Defines the number of file-system operation threads that execute in parallel.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``2``
|
||||
@ -223,7 +233,7 @@ Timeouts
|
||||
|
||||
``filestore_op_thread_timeout``
|
||||
|
||||
:Description: The timeout for a file system operation thread (in seconds).
|
||||
:Description: Defines the timeout (in seconds) for a file-system operation thread.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``60``
|
||||
@ -231,7 +241,7 @@ Timeouts
|
||||
|
||||
``filestore_op_thread_suicide_timeout``
|
||||
|
||||
:Description: The timeout for a commit operation before cancelling the commit (in seconds).
|
||||
:Description: Defines the timeout (in seconds) for a commit operation before the commit is cancelled.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``180``
|
||||
@ -245,17 +255,17 @@ B-Tree Filesystem
|
||||
|
||||
``filestore_btrfs_snap``
|
||||
|
||||
:Description: Enable snapshots for a ``btrfs`` filestore.
|
||||
:Description: Enables snapshots for a ``btrfs`` Filestore.
|
||||
:Type: Boolean
|
||||
:Required: No. Only used for ``btrfs``.
|
||||
:Required: No. Used only for ``btrfs``.
|
||||
:Default: ``true``
|
||||
|
||||
|
||||
``filestore_btrfs_clone_range``
|
||||
|
||||
:Description: Enable cloning ranges for a ``btrfs`` filestore.
|
||||
:Description: Enables cloning ranges for a ``btrfs`` Filestore.
|
||||
:Type: Boolean
|
||||
:Required: No. Only used for ``btrfs``.
|
||||
:Required: No. Used only for ``btrfs``.
|
||||
:Default: ``true``
|
||||
|
||||
|
||||
@ -267,7 +277,7 @@ Journal
|
||||
|
||||
``filestore_journal_parallel``
|
||||
|
||||
:Description: Enables parallel journaling, default for Btrfs.
|
||||
:Description: Enables parallel journaling, default for ``btrfs``.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -275,7 +285,7 @@ Journal
|
||||
|
||||
``filestore_journal_writeahead``
|
||||
|
||||
:Description: Enables writeahead journaling, default for XFS.
|
||||
:Description: Enables write-ahead journaling, default for XFS.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -283,7 +293,7 @@ Journal
|
||||
|
||||
``filestore_journal_trailing``
|
||||
|
||||
:Description: Deprecated, never use.
|
||||
:Description: Deprecated. **Never use.**
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -295,8 +305,8 @@ Misc
|
||||
|
||||
``filestore_merge_threshold``
|
||||
|
||||
:Description: Min number of files in a subdir before merging into parent
|
||||
NOTE: A negative value means to disable subdir merging
|
||||
:Description: Defines the minimum number of files permitted in a subdirectory before the subdirectory is merged into its parent directory.
|
||||
NOTE: A negative value means that subdirectory merging is disabled.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``-10``
|
||||
@ -305,8 +315,8 @@ Misc
|
||||
``filestore_split_multiple``
|
||||
|
||||
:Description: ``(filestore_split_multiple * abs(filestore_merge_threshold) + (rand() % filestore_split_rand_factor)) * 16``
|
||||
is the maximum number of files in a subdirectory before
|
||||
splitting into child directories.
|
||||
is the maximum number of files permitted in a subdirectory
|
||||
before the subdirectory is split into child directories.
|
||||
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
@ -316,10 +326,10 @@ Misc
|
||||
``filestore_split_rand_factor``
|
||||
|
||||
:Description: A random factor added to the split threshold to avoid
|
||||
too many (expensive) Filestore splits occurring at once. See
|
||||
``filestore_split_multiple`` for details.
|
||||
This can only be changed offline for an existing OSD,
|
||||
via the ``ceph-objectstore-tool apply-layout-settings`` command.
|
||||
too many (expensive) Filestore splits occurring at the same time.
|
||||
For details, see ``filestore_split_multiple``.
|
||||
To change this setting for an existing OSD, it is necessary to take the OSD
|
||||
offline before running the ``ceph-objectstore-tool apply-layout-settings`` command.
|
||||
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Required: No
|
||||
@ -328,7 +338,7 @@ Misc
|
||||
|
||||
``filestore_update_to``
|
||||
|
||||
:Description: Limits Filestore auto upgrade to specified version.
|
||||
:Description: Limits automatic upgrades to a specified version of Filestore. Useful in cases in which you want to avoid upgrading to a specific version.
|
||||
:Type: Integer
|
||||
:Required: No
|
||||
:Default: ``1000``
|
||||
@ -336,7 +346,7 @@ Misc
|
||||
|
||||
``filestore_blackhole``
|
||||
|
||||
:Description: Drop any new transactions on the floor.
|
||||
:Description: Drops any new transactions on the floor, similar to redirecting to NULL.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -344,7 +354,7 @@ Misc
|
||||
|
||||
``filestore_dump_file``
|
||||
|
||||
:Description: File onto which store transaction dumps.
|
||||
:Description: Defines the file that transaction dumps are stored on.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -352,7 +362,7 @@ Misc
|
||||
|
||||
``filestore_kill_at``
|
||||
|
||||
:Description: inject a failure at the n'th opportunity
|
||||
:Description: Injects a failure at the *n*\th opportunity.
|
||||
:Type: String
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
@ -360,8 +370,7 @@ Misc
|
||||
|
||||
``filestore_fail_eio``
|
||||
|
||||
:Description: Fail/Crash on eio.
|
||||
:Description: Fail/Crash on EIO.
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``true``
|
||||
|
||||
|
@ -16,24 +16,29 @@ consistent, but you can add, remove or replace a monitor in a cluster. See
|
||||
Background
|
||||
==========
|
||||
|
||||
Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`, which means a
|
||||
:term:`Ceph Client` can determine the location of all Ceph Monitors, Ceph OSD
|
||||
Daemons, and Ceph Metadata Servers just by connecting to one Ceph Monitor and
|
||||
retrieving a current cluster map. Before Ceph Clients can read from or write to
|
||||
Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor
|
||||
first. With a current copy of the cluster map and the CRUSH algorithm, a Ceph
|
||||
Client can compute the location for any object. The ability to compute object
|
||||
locations allows a Ceph Client to talk directly to Ceph OSD Daemons, which is a
|
||||
very important aspect of Ceph's high scalability and performance. See
|
||||
`Scalability and High Availability`_ for additional details.
|
||||
Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`.
|
||||
|
||||
The primary role of the Ceph Monitor is to maintain a master copy of the cluster
|
||||
map. Ceph Monitors also provide authentication and logging services. Ceph
|
||||
Monitors write all changes in the monitor services to a single Paxos instance,
|
||||
and Paxos writes the changes to a key/value store for strong consistency. Ceph
|
||||
Monitors can query the most recent version of the cluster map during sync
|
||||
operations. Ceph Monitors leverage the key/value store's snapshots and iterators
|
||||
(using leveldb) to perform store-wide synchronization.
|
||||
The maintenance by Ceph Monitors of a :term:`Cluster Map` makes it possible for
|
||||
a :term:`Ceph Client` to determine the location of all Ceph Monitors, Ceph OSD
|
||||
Daemons, and Ceph Metadata Servers by connecting to one Ceph Monitor and
|
||||
retrieving a current cluster map. Before Ceph Clients can read from or write to
|
||||
Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor.
|
||||
When a Ceph client has a current copy of the cluster map and the CRUSH
|
||||
algorithm, it can compute the location for any RADOS object within the
|
||||
cluster. This ability to compute the locations of objects makes it possible for
|
||||
Ceph Clients to talk directly to Ceph OSD Daemons. This direct communication
|
||||
with Ceph OSD Daemons represents an improvement upon traditional storage
|
||||
architectures in which clients were required to communicate with a central
|
||||
component, and that improvement contributes to Ceph's high scalability and
|
||||
performance. See `Scalability and High Availability`_ for additional details.
|
||||
|
||||
The Ceph Monitor's primary function is to maintain a master copy of the cluster
|
||||
map. Monitors also provide authentication and logging services. All changes in
|
||||
the monitor services are written by the Ceph Monitor to a single Paxos
|
||||
instance, and Paxos writes the changes to a key/value store for strong
|
||||
consistency. Ceph Monitors are able to query the most recent version of the
|
||||
cluster map during sync operations, and they use the key/value store's
|
||||
snapshots and iterators (using leveldb) to perform store-wide synchronization.
|
||||
|
||||
.. ditaa::
|
||||
/-------------\ /-------------\
|
||||
@ -56,12 +61,6 @@ operations. Ceph Monitors leverage the key/value store's snapshots and iterators
|
||||
| cCCC |*---------------------+
|
||||
\-------------/
|
||||
|
||||
|
||||
.. deprecated:: version 0.58
|
||||
|
||||
In Ceph versions 0.58 and earlier, Ceph Monitors use a Paxos instance for
|
||||
each service and store the map as a file.
|
||||
|
||||
.. index:: Ceph Monitor; cluster map
|
||||
|
||||
Cluster Maps
|
||||
|
@ -25,6 +25,7 @@ There are two Ceph daemons that store data on devices:
|
||||
additional monitoring and providing interfaces to external
|
||||
monitoring and management systems.
|
||||
|
||||
.. _rados_config_storage_devices_osd_backends:
|
||||
|
||||
OSD Back Ends
|
||||
=============
|
||||
|
@ -3,14 +3,15 @@
|
||||
Balancer
|
||||
========
|
||||
|
||||
The *balancer* can optimize the placement of PGs across OSDs in
|
||||
order to achieve a balanced distribution, either automatically or in a
|
||||
supervised fashion.
|
||||
The *balancer* can optimize the allocation of placement groups (PGs) across
|
||||
OSDs in order to achieve a balanced distribution. The balancer can operate
|
||||
either automatically or in a supervised fashion.
|
||||
|
||||
|
||||
Status
|
||||
------
|
||||
|
||||
The current status of the balancer can be checked at any time with:
|
||||
To check the current status of the balancer, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -20,70 +21,78 @@ The current status of the balancer can be checked at any time with:
|
||||
Automatic balancing
|
||||
-------------------
|
||||
|
||||
The automatic balancing feature is enabled by default in ``upmap``
|
||||
mode. Please refer to :ref:`upmap` for more details. The balancer can be
|
||||
turned off with:
|
||||
When the balancer is in ``upmap`` mode, the automatic balancing feature is
|
||||
enabled by default. For more details, see :ref:`upmap`. To disable the
|
||||
balancer, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer off
|
||||
|
||||
The balancer mode can be changed to ``crush-compat`` mode, which is
|
||||
backward compatible with older clients, and will make small changes to
|
||||
the data distribution over time to ensure that OSDs are equally utilized.
|
||||
The balancer mode can be changed from ``upmap`` mode to ``crush-compat`` mode.
|
||||
``crush-compat`` mode is backward compatible with older clients. In
|
||||
``crush-compat`` mode, the balancer automatically makes small changes to the
|
||||
data distribution in order to ensure that OSDs are utilized equally.
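
For example, to switch the mode to ``crush-compat`` and then (re)enable
automatic balancing:

.. prompt:: bash $

   ceph balancer mode crush-compat
   ceph balancer on
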
|
||||
|
||||
|
||||
Throttling
|
||||
----------
|
||||
|
||||
No adjustments will be made to the PG distribution if the cluster is
|
||||
degraded (e.g., because an OSD has failed and the system has not yet
|
||||
healed itself).
|
||||
If the cluster is degraded (that is, if an OSD has failed and the system hasn't
|
||||
healed itself yet), then the balancer will not make any adjustments to the PG
|
||||
distribution.
|
||||
|
||||
When the cluster is healthy, the balancer will throttle its changes
|
||||
such that the percentage of PGs that are misplaced (i.e., that need to
|
||||
be moved) is below a threshold of (by default) 5%. The
|
||||
``target_max_misplaced_ratio`` threshold can be adjusted with:
|
||||
When the cluster is healthy, the balancer will incrementally move a small
|
||||
fraction of unbalanced PGs in order to improve distribution. This fraction
|
||||
will not exceed a certain threshold that defaults to 5%. To adjust this
|
||||
``target_max_misplaced_ratio`` threshold setting, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr target_max_misplaced_ratio .07 # 7%
|
||||
|
||||
Set the number of seconds to sleep in between runs of the automatic balancer:
|
||||
The balancer sleeps between runs. To set the number of seconds for this
|
||||
interval of sleep, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/sleep_interval 60
|
||||
|
||||
Set the time of day to begin automatic balancing in HHMM format:
|
||||
To set the time of day (in HHMM format) at which automatic balancing begins,
|
||||
run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/begin_time 0000
|
||||
|
||||
Set the time of day to finish automatic balancing in HHMM format:
|
||||
To set the time of day (in HHMM format) at which automatic balancing ends, run
|
||||
the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/end_time 2359
|
||||
|
||||
Restrict automatic balancing to this day of the week or later.
|
||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
||||
Automatic balancing can be restricted to certain days of the week. To restrict
|
||||
it to a specific day of the week or later (as with crontab, ``0`` is Sunday,
|
||||
``1`` is Monday, and so on), run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/begin_weekday 0
|
||||
|
||||
Restrict automatic balancing to this day of the week or earlier.
|
||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
||||
To restrict automatic balancing to a specific day of the week or earlier
|
||||
(again, ``0`` is Sunday, ``1`` is Monday, and so on), run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/end_weekday 6
|
||||
|
||||
Pool IDs to which the automatic balancing will be limited.
|
||||
The default for this is an empty string, meaning all pools will be balanced.
|
||||
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command:
|
||||
Automatic balancing can be restricted to certain pools. By default, the value
|
||||
of this setting is an empty string, so that all pools are automatically
|
||||
balanced. To restrict automatic balancing to specific pools, retrieve their
|
||||
numeric pool IDs (by running the :command:`ceph osd pool ls detail` command),
|
||||
and then run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -93,43 +102,41 @@ The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` c
|
||||
Modes
|
||||
-----
|
||||
|
||||
There are currently two supported balancer modes:
|
||||
There are two supported balancer modes:
|
||||
|
||||
#. **crush-compat**. The CRUSH compat mode uses the compat weight-set
|
||||
feature (introduced in Luminous) to manage an alternative set of
|
||||
weights for devices in the CRUSH hierarchy. The normal weights
|
||||
should remain set to the size of the device to reflect the target
|
||||
amount of data that we want to store on the device. The balancer
|
||||
then optimizes the weight-set values, adjusting them up or down in
|
||||
small increments, in order to achieve a distribution that matches
|
||||
the target distribution as closely as possible. (Because PG
|
||||
placement is a pseudorandom process, there is a natural amount of
|
||||
variation in the placement; by optimizing the weights we
|
||||
counter-act that natural variation.)
|
||||
#. **crush-compat**. This mode uses the compat weight-set feature (introduced
|
||||
in Luminous) to manage an alternative set of weights for devices in the
|
||||
CRUSH hierarchy. When the balancer is operating in this mode, the normal
|
||||
weights should remain set to the size of the device in order to reflect the
|
||||
target amount of data intended to be stored on the device. The balancer will
|
||||
then optimize the weight-set values, adjusting them up or down in small
|
||||
increments, in order to achieve a distribution that matches the target
|
||||
distribution as closely as possible. (Because PG placement is a pseudorandom
|
||||
process, it is subject to a natural amount of variation; optimizing the
|
||||
weights serves to counteract that natural variation.)
|
||||
|
||||
Notably, this mode is *fully backwards compatible* with older
|
||||
clients: when an OSDMap and CRUSH map is shared with older clients,
|
||||
we present the optimized weights as the "real" weights.
|
||||
Note that this mode is *fully backward compatible* with older clients: when
|
||||
an OSD Map and CRUSH map are shared with older clients, Ceph presents the
|
||||
optimized weights as the "real" weights.
|
||||
|
||||
The primary restriction of this mode is that the balancer cannot
|
||||
handle multiple CRUSH hierarchies with different placement rules if
|
||||
the subtrees of the hierarchy share any OSDs. (This is normally
|
||||
not the case, and is generally not a recommended configuration
|
||||
because it is hard to manage the space utilization on the shared
|
||||
OSDs.)
|
||||
The primary limitation of this mode is that the balancer cannot handle
|
||||
multiple CRUSH hierarchies with different placement rules if the subtrees of
|
||||
the hierarchy share any OSDs. (Such sharing of OSDs is not typical and,
|
||||
because of the difficulty of managing the space utilization on the shared
|
||||
OSDs, is generally not recommended.)
|
||||
|
||||
#. **upmap**. Starting with Luminous, the OSDMap can store explicit
|
||||
mappings for individual OSDs as exceptions to the normal CRUSH
|
||||
placement calculation. These `upmap` entries provide fine-grained
|
||||
control over the PG mapping. This CRUSH mode will optimize the
|
||||
placement of individual PGs in order to achieve a balanced
|
||||
distribution. In most cases, this distribution is "perfect," with
|
||||
an equal number of PGs on each OSD (+/-1 PG, since they might not
|
||||
divide evenly).
|
||||
#. **upmap**. In Luminous and later releases, the OSDMap can store explicit
|
||||
mappings for individual OSDs as exceptions to the normal CRUSH placement
|
||||
calculation. These ``upmap`` entries provide fine-grained control over the
|
||||
PG mapping. This balancer mode optimizes the placement of individual PGs in
|
||||
order to achieve a balanced distribution. In most cases, the resulting
|
||||
distribution is nearly perfect: that is, there is an equal number of PGs on
|
||||
each OSD (±1 PG, since the total number might not divide evenly).
|
||||
|
||||
Note that using upmap requires that all clients be Luminous or newer.
|
||||
To use ``upmap``, all clients must be Luminous or newer.
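One way to confirm that this requirement is met (a sketch, not a required step in this documentation) is to review the releases of connected clients and, if appropriate, require at least Luminous-level clients before relying on ``upmap``:

.. prompt:: bash $

   ceph features
   ceph osd set-require-min-compat-client luminous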
|
||||
|
||||
The default mode is ``upmap``. The mode can be adjusted with:
|
||||
The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
|
||||
running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -138,69 +145,77 @@ The default mode is ``upmap``. The mode can be adjusted with:
|
||||
Supervised optimization
|
||||
-----------------------
|
||||
|
||||
The balancer operation is broken into a few distinct phases:
|
||||
Supervised use of the balancer can be understood in terms of three distinct
|
||||
phases:
|
||||
|
||||
#. building a *plan*
|
||||
#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
|
||||
#. executing the *plan*
|
||||
#. building a plan
|
||||
#. evaluating the quality of the data distribution, either for the current PG
|
||||
distribution or for the PG distribution that would result after executing a
|
||||
plan
|
||||
#. executing the plan
|
||||
|
||||
To evaluate and score the current distribution:
|
||||
To evaluate the current distribution, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval
|
||||
|
||||
You can also evaluate the distribution for a single pool with:
|
||||
To evaluate the distribution for a single pool, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval <pool-name>
|
||||
|
||||
Greater detail for the evaluation can be seen with:
|
||||
To see the evaluation in greater detail, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval-verbose ...
|
||||
|
||||
The balancer can generate a plan, using the currently configured mode, with:
|
||||
|
||||
To instruct the balancer to generate a plan (using the currently configured
|
||||
mode), make up a name (any useful identifying string) for the plan, and run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer optimize <plan-name>
|
||||
|
||||
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with:
|
||||
To see the contents of a plan, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer show <plan-name>
|
||||
|
||||
All plans can be shown with:
|
||||
To display all plans, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer ls
|
||||
|
||||
Old plans can be discarded with:
|
||||
To discard an old plan, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer rm <plan-name>
|
||||
|
||||
Currently recorded plans are shown as part of the status command:
|
||||
To see currently recorded plans, examine the output of the following status
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer status
|
||||
|
||||
The quality of the distribution that would result after executing a plan can be calculated with:
|
||||
To evaluate the distribution that would result from executing a specific plan,
|
||||
run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval <plan-name>
|
||||
|
||||
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with:
|
||||
If a plan is expected to improve the distribution (that is, the plan's score is
|
||||
lower than the current cluster state's score), you can execute that plan by
|
||||
running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer execute <plan-name>
|
||||
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _rados_operations_bluestore_migration:
|
||||
|
||||
=====================
|
||||
BlueStore Migration
|
||||
=====================
|
||||
|
@ -1,6 +1,10 @@
|
||||
===============
|
||||
Cache Tiering
|
||||
===============
|
||||
.. warning:: Cache tiering has been deprecated in the Reef release as it
|
||||
has lacked a maintainer for a very long time. This does not mean
|
||||
it will certainly be removed, but we may choose to remove it
|
||||
without much further notice.
|
||||
|
||||
A cache tier provides Ceph Clients with better I/O performance for a subset of
|
||||
the data stored in a backing storage tier. Cache tiering involves creating a
|
||||
|
@ -315,7 +315,7 @@ the hierarchy is visible as a separate column (labeled either
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
ceph osd crush tree
|
||||
|
||||
When both *compat* and *per-pool* weight sets are in use, data
|
||||
placement for a particular pool will use its own per-pool weight set
|
||||
|
@ -2,40 +2,44 @@
|
||||
Data Placement Overview
|
||||
=========================
|
||||
|
||||
Ceph stores, replicates and rebalances data objects across a RADOS cluster
|
||||
dynamically. With many different users storing objects in different pools for
|
||||
different purposes on countless OSDs, Ceph operations require some data
|
||||
placement planning. The main data placement planning concepts in Ceph include:
|
||||
Ceph stores, replicates, and rebalances data objects across a RADOS cluster
|
||||
dynamically. Because different users store objects in different pools for
|
||||
different purposes on many OSDs, Ceph operations require a certain amount of
|
||||
data-placement planning. The main data-placement planning concepts in Ceph
|
||||
include:
|
||||
|
||||
- **Pools:** Ceph stores data within pools, which are logical groups for storing
|
||||
objects. Pools manage the number of placement groups, the number of replicas,
|
||||
and the CRUSH rule for the pool. To store data in a pool, you must have
|
||||
an authenticated user with permissions for the pool. Ceph can snapshot pools.
|
||||
See `Pools`_ for additional details.
|
||||
- **Pools:** Ceph stores data within pools, which are logical groups used for
|
||||
storing objects. Pools manage the number of placement groups, the number of
|
||||
replicas, and the CRUSH rule for the pool. To store data in a pool, it is
|
||||
necessary to be an authenticated user with permissions for the pool. Ceph is
|
||||
able to make snapshots of pools. For additional details, see `Pools`_.
|
||||
|
||||
- **Placement Groups:** Ceph maps objects to placement groups (PGs).
|
||||
Placement groups (PGs) are shards or fragments of a logical object pool
|
||||
that place objects as a group into OSDs. Placement groups reduce the amount
|
||||
of per-object metadata when Ceph stores the data in OSDs. A larger number of
|
||||
placement groups (e.g., 100 per OSD) leads to better balancing. See
|
||||
`Placement Groups`_ for additional details.
|
||||
- **Placement Groups:** Ceph maps objects to placement groups. Placement
|
||||
groups (PGs) are shards or fragments of a logical object pool that place
|
||||
objects as a group into OSDs. Placement groups reduce the amount of
|
||||
per-object metadata that is necessary for Ceph to store the data in OSDs. A
|
||||
greater number of placement groups (for example, 100 PGs per OSD as compared
|
||||
with 50 PGs per OSD) leads to better balancing.
|
||||
|
||||
- **CRUSH Maps:** CRUSH is a big part of what allows Ceph to scale without
|
||||
performance bottlenecks, without limitations to scalability, and without a
|
||||
single point of failure. CRUSH maps provide the physical topology of the
|
||||
cluster to the CRUSH algorithm to determine where the data for an object
|
||||
and its replicas should be stored, and how to do so across failure domains
|
||||
for added data safety among other things. See `CRUSH Maps`_ for additional
|
||||
details.
|
||||
- **CRUSH Maps:** CRUSH plays a major role in allowing Ceph to scale while
|
||||
avoiding certain pitfalls, such as performance bottlenecks, limitations to
|
||||
scalability, and single points of failure. CRUSH maps provide the physical
|
||||
topology of the cluster to the CRUSH algorithm, so that it can determine both
|
||||
(1) where the data for an object and its replicas should be stored and (2)
|
||||
how to store that data across failure domains so as to improve data safety.
|
||||
For additional details, see `CRUSH Maps`_.
|
||||
|
||||
- **Balancer:** The balancer is a feature that will automatically optimize the
|
||||
distribution of PGs across devices to achieve a balanced data distribution,
|
||||
maximizing the amount of data that can be stored in the cluster and evenly
|
||||
distributing the workload across OSDs.
|
||||
- **Balancer:** The balancer is a feature that automatically optimizes the
|
||||
distribution of placement groups across devices in order to achieve a
|
||||
balanced data distribution, in order to maximize the amount of data that can
|
||||
be stored in the cluster, and in order to evenly distribute the workload
|
||||
across OSDs.
|
||||
|
||||
When you initially set up a test cluster, you can use the default values. Once
|
||||
you begin planning for a large Ceph cluster, refer to pools, placement groups
|
||||
and CRUSH for data placement operations.
|
||||
It is possible to use the default values for each of the above components.
|
||||
Default values are recommended for a test cluster's initial setup. However,
|
||||
when planning a large Ceph cluster, values should be customized for
|
||||
data-placement operations with reference to the different roles played by
|
||||
pools, placement groups, and CRUSH.
|
||||
|
||||
.. _Pools: ../pools
|
||||
.. _Placement Groups: ../placement-groups
|
||||
|
@ -3,28 +3,32 @@
|
||||
Device Management
|
||||
=================
|
||||
|
||||
Ceph tracks which hardware storage devices (e.g., HDDs, SSDs) are consumed by
|
||||
which daemons, and collects health metrics about those devices in order to
|
||||
provide tools to predict and/or automatically respond to hardware failure.
|
||||
Device management allows Ceph to address hardware failure. Ceph tracks hardware
|
||||
storage devices (HDDs, SSDs) to see which devices are managed by which daemons.
|
||||
Ceph also collects health metrics about these devices. By doing so, Ceph can
|
||||
provide tools that predict hardware failure and can automatically respond to
|
||||
hardware failure.
|
||||
|
||||
Device tracking
|
||||
---------------
|
||||
|
||||
You can query which storage devices are in use with:
|
||||
To see a list of the storage devices that are in use, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device ls
|
||||
|
||||
You can also list devices by daemon or by host:
|
||||
Alternatively, to list devices by daemon or by host, run a command of one of
|
||||
the following forms:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device ls-by-daemon <daemon>
|
||||
ceph device ls-by-host <host>
|
||||
|
||||
For any individual device, you can query information about its
|
||||
location and how it is being consumed with:
|
||||
To see information about the location of a specific device and about how the
|
||||
device is being consumed, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -33,103 +37,107 @@ location and how it is being consumed with:
|
||||
Identifying physical devices
|
||||
----------------------------
|
||||
|
||||
You can blink the drive LEDs on hardware enclosures to make the replacement of
|
||||
failed disks easy and less error-prone. Use the following command::
|
||||
To make the replacement of failed disks easier and less error-prone, you can
|
||||
(in some cases) "blink" the drive's LEDs on hardware enclosures by running a
|
||||
command of the following form::
|
||||
|
||||
device light on|off <devid> [ident|fault] [--force]
|
||||
|
||||
The ``<devid>`` parameter is the device identification. You can obtain this
|
||||
information using the following command:
|
||||
.. note:: Using this command to blink the lights might not work. Whether it
|
||||
works will depend upon such factors as your kernel revision, your SES
|
||||
firmware, or the setup of your HBA.
|
||||
|
||||
The ``<devid>`` parameter is the device identification. To retrieve this
|
||||
information, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device ls
|
||||
|
||||
The ``[ident|fault]`` parameter is used to set the kind of light to blink.
|
||||
By default, the `identification` light is used.
|
||||
The ``[ident|fault]`` parameter determines which kind of light will blink. By
|
||||
default, the `identification` light is used.
|
||||
|
||||
.. note::
|
||||
This command needs the Cephadm or the Rook `orchestrator <https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_ module enabled.
|
||||
The orchestrator module enabled is shown by executing the following command:
|
||||
.. note:: This command works only if the Cephadm or the Rook `orchestrator
|
||||
<https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_
|
||||
module is enabled. To see which orchestrator module is enabled, run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph orch status
|
||||
|
||||
The command behind the scene to blink the drive LEDs is `lsmcli`. If you need
|
||||
to customize this command you can configure this via a Jinja2 template::
|
||||
The command that makes the drive's LEDs blink is `lsmcli`. To customize this
|
||||
command, configure it via a Jinja2 template by running commands of the
|
||||
following forms::
|
||||
|
||||
ceph config-key set mgr/cephadm/blink_device_light_cmd "<template>"
|
||||
ceph config-key set mgr/cephadm/<host>/blink_device_light_cmd "lsmcli local-disk-{{ ident_fault }}-led-{{'on' if on else 'off'}} --path '{{ path or dev }}'"
|
||||
|
||||
The Jinja2 template is rendered using the following arguments:
|
||||
The following arguments can be used to customize the Jinja2 template:
|
||||
|
||||
* ``on``
|
||||
A boolean value.
|
||||
* ``ident_fault``
|
||||
A string containing `ident` or `fault`.
|
||||
A string that contains `ident` or `fault`.
|
||||
* ``dev``
|
||||
A string containing the device ID, e.g. `SanDisk_X400_M.2_2280_512GB_162924424784`.
|
||||
A string that contains the device ID: for example, `SanDisk_X400_M.2_2280_512GB_162924424784`.
|
||||
* ``path``
|
||||
A string containing the device path, e.g. `/dev/sda`.
|
||||
A string that contains the device path: for example, `/dev/sda`.
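For illustration only, rendering the default template shown above with ``on=True``, ``ident_fault='ident'``, and ``path='/dev/sda'`` would produce a command along these lines::

   lsmcli local-disk-ident-led-on --path '/dev/sda'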
|
||||
|
||||
.. _enabling-monitoring:
|
||||
|
||||
Enabling monitoring
|
||||
-------------------
|
||||
|
||||
Ceph can also monitor health metrics associated with your device. For
|
||||
example, SATA hard disks implement a standard called SMART that
|
||||
provides a wide range of internal metrics about the device's usage and
|
||||
health, like the number of hours powered on, number of power cycles,
|
||||
or unrecoverable read errors. Other device types like SAS and NVMe
|
||||
implement a similar set of metrics (via slightly different standards).
|
||||
All of these can be collected by Ceph via the ``smartctl`` tool.
|
||||
Ceph can also monitor the health metrics associated with your device. For
|
||||
example, SATA drives implement a standard called SMART that provides a wide
|
||||
range of internal metrics about the device's usage and health (for example: the
|
||||
number of hours powered on, the number of power cycles, the number of
|
||||
unrecoverable read errors). Other device types such as SAS and NVMe present a
|
||||
similar set of metrics (via slightly different standards). All of these
|
||||
metrics can be collected by Ceph via the ``smartctl`` tool.
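For reference, the same SMART data can also be inspected directly on a host with the ``smartctl`` tool itself; the device path below is only an example:

.. prompt:: bash $

   sudo smartctl --all /dev/sda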
|
||||
|
||||
You can enable or disable health monitoring with:
|
||||
You can enable or disable health monitoring by running one of the following
|
||||
commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device monitoring on
|
||||
|
||||
or:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device monitoring off
|
||||
|
||||
|
||||
Scraping
|
||||
--------
|
||||
|
||||
If monitoring is enabled, metrics will automatically be scraped at regular intervals. That interval can be configured with:
|
||||
If monitoring is enabled, device metrics will be scraped automatically at
|
||||
regular intervals. To configure that interval, run a command of the following
|
||||
form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
|
||||
|
||||
The default is to scrape once every 24 hours.
|
||||
By default, device metrics are scraped once every 24 hours.
|
||||
|
||||
You can manually trigger a scrape of all devices with:
|
||||
To manually scrape all devices, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device scrape-health-metrics
|
||||
|
||||
A single device can be scraped with:
|
||||
To scrape a single device, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device scrape-health-metrics <device-id>
|
||||
|
||||
Or a single daemon's devices can be scraped with:
|
||||
To scrape a single daemon's devices, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device scrape-daemon-health-metrics <who>
|
||||
|
||||
The stored health metrics for a device can be retrieved (optionally
|
||||
for a specific timestamp) with:
|
||||
To retrieve the stored health metrics for a device (optionally for a specific
|
||||
timestamp), run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -138,71 +146,82 @@ for a specific timestamp) with:
|
||||
Failure prediction
|
||||
------------------
|
||||
|
||||
Ceph can predict life expectancy and device failures based on the
|
||||
health metrics it collects. There are three modes:
|
||||
Ceph can predict drive life expectancy and device failures by analyzing the
|
||||
health metrics that it collects. The prediction modes are as follows:
|
||||
|
||||
* *none*: disable device failure prediction.
|
||||
* *local*: use a pre-trained prediction model from the ceph-mgr daemon
|
||||
* *local*: use a pre-trained prediction model from the ``ceph-mgr`` daemon.
|
||||
|
||||
The prediction mode can be configured with:
|
||||
To configure the prediction mode, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set global device_failure_prediction_mode <mode>
|
||||
|
||||
Prediction normally runs in the background on a periodic basis, so it
|
||||
may take some time before life expectancy values are populated. You
|
||||
can see the life expectancy of all devices in output from:
|
||||
Under normal conditions, failure prediction runs periodically in the
|
||||
background. For this reason, life expectancy values might be populated only
|
||||
after a significant amount of time has passed. The life expectancy of all
|
||||
devices is displayed in the output of the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device ls
|
||||
|
||||
You can also query the metadata for a specific device with:
|
||||
To see the metadata of a specific device, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device info <devid>
|
||||
|
||||
You can explicitly force prediction of a device's life expectancy with:
|
||||
To explicitly force prediction of a specific device's life expectancy, run a
|
||||
command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device predict-life-expectancy <devid>
|
||||
|
||||
If you are not using Ceph's internal device failure prediction but
|
||||
have some external source of information about device failures, you
|
||||
can inform Ceph of a device's life expectancy with:
|
||||
In addition to Ceph's internal device failure prediction, you might have an
|
||||
external source of information about device failures. To inform Ceph of a
|
||||
specific device's life expectancy, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device set-life-expectancy <devid> <from> [<to>]
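For example (a sketch only: the device ID is reused from earlier in this page, the dates are placeholders, and the exact timestamp format accepted may differ by release; consult the command's built-in help), an externally predicted failure window could be recorded as follows:

.. prompt:: bash $

   ceph device set-life-expectancy SanDisk_X400_M.2_2280_512GB_162924424784 2024-01-01 2024-06-30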
|
||||
|
||||
Life expectancies are expressed as a time interval so that
|
||||
uncertainty can be expressed in the form of a wide interval. The
|
||||
interval end can also be left unspecified.
|
||||
Life expectancies are expressed as a time interval. This means that the
|
||||
uncertainty of the life expectancy can be expressed in the form of a range of
|
||||
time, and perhaps a wide range of time. The interval's end can be left
|
||||
unspecified.
|
||||
|
||||
Health alerts
|
||||
-------------
|
||||
|
||||
The ``mgr/devicehealth/warn_threshold`` controls how soon an expected
|
||||
device failure must be before we generate a health warning.
|
||||
The ``mgr/devicehealth/warn_threshold`` configuration option controls the
|
||||
health check for an expected device failure. If the device is expected to fail
|
||||
within the specified time interval, an alert is raised.
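As a sketch (the value below is only an example and is assumed, like the other ``mgr/devicehealth`` options on this page, to be expressed in seconds), the threshold can be adjusted in the same way as the scrape interval:

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/warn_threshold 2419200   # four weeks, in seconds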
|
||||
|
||||
The stored life expectancy of all devices can be checked, and any
|
||||
appropriate health alerts generated, with:
|
||||
To check the stored life expectancy of all devices and generate any appropriate
|
||||
health alert, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph device check-health
|
||||
|
||||
Automatic Mitigation
|
||||
--------------------
|
||||
Automatic Migration
|
||||
-------------------
|
||||
|
||||
If the ``mgr/devicehealth/self_heal`` option is enabled (it is by
|
||||
default), then for devices that are expected to fail soon the module
|
||||
will automatically migrate data away from them by marking the devices
|
||||
"out".
|
||||
The ``mgr/devicehealth/self_heal`` option (enabled by default) automatically
|
||||
migrates data away from devices that are expected to fail soon. If this option
|
||||
is enabled, the module marks such devices ``out`` so that automatic migration
|
||||
will occur.
|
||||
|
||||
The ``mgr/devicehealth/mark_out_threshold`` controls how soon an
|
||||
expected device failure must be before we automatically mark an osd
|
||||
"out".
|
||||
.. note:: The ``mon_osd_min_up_ratio`` configuration option can help prevent
|
||||
this process from cascading to total failure. If the "self heal" module
|
||||
marks ``out`` so many OSDs that the ratio value of ``mon_osd_min_up_ratio``
|
||||
is exceeded, then the cluster raises the ``DEVICE_HEALTH_TOOMANY`` health
|
||||
check. For instructions on what to do in this situation, see
|
||||
:ref:`DEVICE_HEALTH_TOOMANY<rados_health_checks_device_health_toomany>`.
|
||||
|
||||
The ``mgr/devicehealth/mark_out_threshold`` configuration option specifies the
|
||||
time interval for automatic migration. If a device is expected to fail within
|
||||
the specified time interval, it will be automatically marked ``out``.
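A sketch of adjusting this interval follows (the value is only an example and, as with the options above, is assumed to be expressed in seconds):

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/mark_out_threshold 2419200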
|
||||
|
@ -6,9 +6,11 @@ The *jerasure* plugin is the most generic and flexible plugin, it is
|
||||
also the default for Ceph erasure coded pools.
|
||||
|
||||
The *jerasure* plugin encapsulates the `Jerasure
|
||||
<http://jerasure.org>`_ library. It is
|
||||
recommended to read the *jerasure* documentation to get a better
|
||||
understanding of the parameters.
|
||||
<https://github.com/ceph/jerasure>`_ library. It is
|
||||
recommended to read the ``jerasure`` documentation to
|
||||
understand the parameters. Note that the ``jerasure.org``
|
||||
web site as of 2023 may no longer be connected to the original
|
||||
project or legitimate.
|
||||
|
||||
Create a jerasure profile
|
||||
=========================
|
||||
|
@ -843,6 +843,8 @@ This message can be silenced by disabling self-heal behavior (that is, setting
|
||||
``mgr/devicehealth/mark_out_threshold``, or by addressing whichever condition
|
||||
is preventing data from being migrated off of the ailing OSD(s).
|
||||
|
||||
.. _rados_health_checks_device_health_toomany:
|
||||
|
||||
DEVICE_HEALTH_TOOMANY
|
||||
_____________________
|
||||
|
||||
|
@ -117,11 +117,12 @@ pseudo-random placement that takes into account the failure domains that you
|
||||
have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
|
||||
immediately adjacent OSDs in a large cluster.
|
||||
|
||||
Ceph processes a client request using the **Acting Set**, which is the set of
|
||||
OSDs that will actually handle the requests since they have a full and working
|
||||
version of a placement group shard. The set of OSDs that should contain a shard
|
||||
of a particular placement group is known as the **Up Set**, i.e. where data is
|
||||
moved/copied to (or planned to be).
|
||||
Ceph processes client requests with the **Acting Set** of OSDs: this is the set
|
||||
of OSDs that currently have a full and working version of a PG shard and that
|
||||
are therefore responsible for handling requests. By contrast, the **Up Set** is
|
||||
the set of OSDs that contain a shard of a specific PG. Data is moved or copied
|
||||
(or planned to be moved or copied) to the **Up Set**. See
|
||||
:ref:`Placement Group Concepts <rados_operations_pg_concepts>`.
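To see the Up Set and the Acting Set of a particular PG, query its mapping (the PG ID below is only an example); the output reports both sets for that PG:

.. prompt:: bash $

   ceph pg map 1.7f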
|
||||
|
||||
Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
|
||||
service requests for objects in the PG. When this kind of situation
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _rados_operations_pg_concepts:
|
||||
|
||||
==========================
|
||||
Placement Group Concepts
|
||||
==========================
|
||||
|
@ -7,209 +7,256 @@ Stretch Clusters
|
||||
|
||||
Stretch Clusters
|
||||
================
|
||||
Ceph generally expects all parts of its network and overall cluster to be
|
||||
equally reliable, with failures randomly distributed across the CRUSH map.
|
||||
So you may lose a switch that knocks out a number of OSDs, but we expect
|
||||
the remaining OSDs and monitors to route around that.
|
||||
|
||||
This is usually a good choice, but may not work well in some
|
||||
stretched cluster configurations where a significant part of your cluster
|
||||
is stuck behind a single network component. For instance, a single
|
||||
cluster which is located in multiple data centers, and you want to
|
||||
sustain the loss of a full DC.
|
||||
A stretch cluster is a cluster that has servers in geographically separated
|
||||
data centers, distributed over a WAN. Stretch clusters have LAN-like high-speed
|
||||
and low-latency connections, but limited links. Stretch clusters have a higher
|
||||
likelihood of (possibly asymmetric) network splits, and a higher likelihood of
|
||||
temporary or complete loss of an entire data center (which can represent
|
||||
one-third to one-half of the total cluster).
|
||||
|
||||
There are two standard configurations we've seen deployed, with either
|
||||
two or three data centers (or, in clouds, availability zones). With two
|
||||
zones, we expect each site to hold a copy of the data, and for a third
|
||||
site to have a tiebreaker monitor (this can be a VM or high-latency compared
|
||||
to the main sites) to pick a winner if the network connection fails and both
|
||||
DCs remain alive. For three sites, we expect a copy of the data and an equal
|
||||
number of monitors in each site.
|
||||
Ceph is designed with the expectation that all parts of its network and cluster
|
||||
will be reliable and that failures will be distributed randomly across the
|
||||
CRUSH map. Even if a switch goes down and causes the loss of many OSDs, Ceph is
|
||||
designed so that the remaining OSDs and monitors will route around such a loss.
|
||||
|
||||
Note that the standard Ceph configuration will survive MANY failures of the
|
||||
network or data centers and it will never compromise data consistency. If you
|
||||
bring back enough Ceph servers following a failure, it will recover. If you
|
||||
lose a data center, but can still form a quorum of monitors and have all the data
|
||||
available (with enough copies to satisfy pools' ``min_size``, or CRUSH rules
|
||||
that will re-replicate to meet it), Ceph will maintain availability.
|
||||
Sometimes this cannot be relied upon. If you have a "stretched-cluster"
|
||||
deployment in which much of your cluster is behind a single network component,
|
||||
you might need to use **stretch mode** to ensure data integrity.
|
||||
|
||||
What can't it handle?
|
||||
We will here consider two standard configurations: a configuration with two
|
||||
data centers (or, in clouds, two availability zones), and a configuration with
|
||||
three data centers (or, in clouds, three availability zones).
|
||||
|
||||
In the two-site configuration, Ceph expects each of the sites to hold a copy of
|
||||
the data, and Ceph also expects there to be a third site that has a tiebreaker
|
||||
monitor. This tiebreaker monitor picks a winner if the network connection fails
|
||||
and both data centers remain alive.
|
||||
|
||||
The tiebreaker monitor can be a VM. It can also have high latency relative to
|
||||
the two main sites.
|
||||
|
||||
The standard Ceph configuration is able to survive MANY network failures or
|
||||
data-center failures without ever compromising data availability. If enough
|
||||
Ceph servers are brought back following a failure, the cluster *will* recover.
|
||||
If you lose a data center but are still able to form a quorum of monitors and
|
||||
still have all the data available, Ceph will maintain availability. (This
|
||||
assumes that the cluster has enough copies to satisfy the pools' ``min_size``
|
||||
configuration option, or (failing that) that the cluster has CRUSH rules in
|
||||
place that will cause the cluster to re-replicate the data until the
|
||||
``min_size`` configuration option has been met.)
|
||||
|
||||
Stretch Cluster Issues
|
||||
======================
|
||||
No matter what happens, Ceph will not compromise on data integrity
|
||||
and consistency. If there's a failure in your network or a loss of nodes and
|
||||
you can restore service, Ceph will return to normal functionality on its own.
|
||||
|
||||
But there are scenarios where you lose data availability despite having
|
||||
enough servers available to satisfy Ceph's consistency and sizing constraints, or
|
||||
where you may be surprised to not satisfy Ceph's constraints.
|
||||
The first important category of these failures revolves around inconsistent
|
||||
networks -- if there's a netsplit, Ceph may be unable to mark OSDs down and kick
|
||||
them out of the acting PG sets despite the primary being unable to replicate data.
|
||||
If this happens, IO will not be permitted, because Ceph can't satisfy its durability
|
||||
guarantees.
|
||||
Ceph does not permit the compromise of data integrity and data consistency
|
||||
under any circumstances. When service is restored after a network failure or a
|
||||
loss of Ceph nodes, Ceph will restore itself to a state of normal functioning
|
||||
without operator intervention.
|
||||
|
||||
Ceph does not permit the compromise of data integrity or data consistency, but
|
||||
there are situations in which *data availability* is compromised. These
|
||||
situations can occur even though there are enough servers available to satisfy
|
||||
Ceph's consistency and sizing constraints. In some situations, you might
|
||||
discover that your cluster does not satisfy those constraints.
|
||||
|
||||
The first category of these failures that we will discuss involves inconsistent
|
||||
networks -- if there is a netsplit (a disconnection between two servers that
|
||||
splits the network into two pieces), Ceph might be unable to mark OSDs ``down``
|
||||
and remove them from the acting PG sets. This failure to mark OSDs ``down``
|
||||
will occur, despite the fact that the primary PG is unable to replicate data (a
|
||||
situation that, under normal non-netsplit circumstances, would result in the
|
||||
marking of affected OSDs as ``down`` and their removal from the PG). If this
|
||||
happens, Ceph will be unable to satisfy its durability guarantees and
|
||||
consequently IO will not be permitted.
|
||||
|
||||
The second category of failures that we will discuss involves the situation in
|
||||
which the constraints are not sufficient to guarantee the replication of data
|
||||
across data centers, though it might seem that the data is correctly replicated
|
||||
across data centers. For example, in a scenario in which there are two data
|
||||
centers named Data Center A and Data Center B, and the CRUSH rule targets three
|
||||
replicas and places a replica in each data center with a ``min_size`` of ``2``,
|
||||
the PG might go active with two replicas in Data Center A and zero replicas in
|
||||
Data Center B. In a situation of this kind, the loss of Data Center A means
|
||||
that the data is lost and Ceph will not be able to operate on it. This
|
||||
situation is surprisingly difficult to avoid using only standard CRUSH rules.
|
||||
|
||||
The second important category of failures is when you think you have data replicated
|
||||
across data centers, but the constraints aren't sufficient to guarantee this.
|
||||
For instance, you might have data centers A and B, and your CRUSH rule targets 3 copies
|
||||
and places a copy in each data center with a ``min_size`` of 2. The PG may go active with
|
||||
2 copies in site A and no copies in site B, which means that if you then lose site A you
|
||||
have lost data and Ceph can't operate on it. This situation is surprisingly difficult
|
||||
to avoid with standard CRUSH rules.
|
||||
|
||||
Stretch Mode
|
||||
============
|
||||
The new stretch mode is designed to handle the 2-site case. Three sites are
|
||||
just as susceptible to netsplit issues, but are much more tolerant of
|
||||
component availability outages than 2-site clusters are.
|
||||
Stretch mode is designed to handle deployments in which you cannot guarantee the
|
||||
replication of data across two data centers. This kind of situation can arise
|
||||
when the cluster's CRUSH rule specifies that three copies are to be made, but
|
||||
then a copy is placed in each data center with a ``min_size`` of 2. Under such
|
||||
conditions, a placement group can become active with two copies in the first
|
||||
data center and no copies in the second data center.
|
||||
|
||||
To enter stretch mode, you must set the location of each monitor, matching
|
||||
your CRUSH map. For instance, to place ``mon.a`` in your first data center:
|
||||
|
||||
.. prompt:: bash $
|
||||
Entering Stretch Mode
|
||||
---------------------
|
||||
|
||||
ceph mon set_location a datacenter=site1
|
||||
To enable stretch mode, you must set the location of each monitor, matching
|
||||
your CRUSH map. This procedure shows how to do this.
|
||||
|
||||
Next, generate a CRUSH rule which will place 2 copies in each data center. This
|
||||
will require editing the CRUSH map directly:
|
||||
|
||||
.. prompt:: bash $
|
||||
#. Place ``mon.a`` in your first data center:
|
||||
|
||||
ceph osd getcrushmap > crush.map.bin
|
||||
crushtool -d crush.map.bin -o crush.map.txt
|
||||
.. prompt:: bash $
|
||||
|
||||
Now edit the ``crush.map.txt`` file to add a new rule. Here
|
||||
there is only one other rule, so this is ID 1, but you may need
|
||||
to use a different rule ID. We also have two datacenter buckets
|
||||
named ``site1`` and ``site2``::
|
||||
ceph mon set_location a datacenter=site1
|
||||
|
||||
rule stretch_rule {
|
||||
id 1
|
||||
type replicated
|
||||
min_size 1
|
||||
max_size 10
|
||||
step take site1
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
step take site2
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
}
|
||||
#. Generate a CRUSH rule that places two copies in each data center.
|
||||
This requires editing the CRUSH map directly:
|
||||
|
||||
Finally, inject the CRUSH map to make the rule available to the cluster:
|
||||
.. prompt:: bash $
|
||||
|
||||
.. prompt:: bash $
|
||||
ceph osd getcrushmap > crush.map.bin
|
||||
crushtool -d crush.map.bin -o crush.map.txt
|
||||
|
||||
crushtool -c crush.map.txt -o crush2.map.bin
|
||||
ceph osd setcrushmap -i crush2.map.bin
|
||||
#. Edit the ``crush.map.txt`` file to add a new rule. Here there is only one
|
||||
other rule (``id 1``), but you might need to use a different rule ID. We
|
||||
have two data-center buckets named ``site1`` and ``site2``:
|
||||
|
||||
If you aren't already running your monitors in connectivity mode, do so with
|
||||
the instructions in `Changing Monitor Elections`_.
|
||||
::
|
||||
|
||||
rule stretch_rule {
|
||||
id 1
|
||||
min_size 1
|
||||
max_size 10
|
||||
type replicated
|
||||
step take site1
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
step take site2
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
}
|
||||
|
||||
#. Inject the CRUSH map to make the rule available to the cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -c crush.map.txt -o crush2.map.bin
|
||||
ceph osd setcrushmap -i crush2.map.bin
|
||||
|
||||
#. Run the monitors in connectivity mode. See `Changing Monitor Elections`_.
|
||||
|
||||
#. Command the cluster to enter stretch mode. In this example, ``mon.e`` is the
|
||||
tiebreaker monitor and we are splitting across data centers. The tiebreaker
|
||||
monitor must be assigned a data center that is neither ``site1`` nor
|
||||
``site2``. For this purpose you can create another data-center bucket named
|
||||
``site3`` in your CRUSH and place ``mon.e`` there:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon set_location e datacenter=site3
|
||||
ceph mon enable_stretch_mode e stretch_rule datacenter
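After these steps, it can be useful to confirm that each monitor reports the location you assigned (a verification sketch; the exact fields shown depend on your release):

.. prompt:: bash $

   ceph mon dump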
|
||||
|
||||
When stretch mode is enabled, PGs will become active only when they peer
|
||||
across data centers (or across whichever CRUSH bucket type was specified),
|
||||
assuming both are alive. Pools will increase in size from the default ``3`` to
|
||||
``4``, and two copies will be expected in each site. OSDs will be allowed to
|
||||
connect to monitors only if they are in the same data center as the monitors.
|
||||
New monitors will not be allowed to join the cluster if they do not specify a
|
||||
location.
|
||||
|
||||
If all OSDs and monitors in one of the data centers become inaccessible at once,
|
||||
the surviving data center enters a "degraded stretch mode". A warning will be
|
||||
issued, the ``min_size`` will be reduced to ``1``, and the cluster will be
|
||||
allowed to go active with the data in the single remaining site. The pool size
|
||||
does not change, so warnings will be generated that report that the pools are
|
||||
too small -- but a special stretch mode flag will prevent the OSDs from
|
||||
creating extra copies in the remaining data center. This means that the data
|
||||
center will keep only two copies, just as before.
|
||||
|
||||
When the missing data center comes back, the cluster will enter a "recovery
|
||||
stretch mode". This changes the warning and allows peering, but requires OSDs
|
||||
only from the data center that was ``up`` throughout the duration of the
|
||||
downtime. When all PGs are in a known state, and are neither degraded nor
|
||||
incomplete, the cluster transitions back to regular stretch mode, ends the
|
||||
warning, restores ``min_size`` to its original value (``2``), requires both
|
||||
sites to peer, and no longer requires the site that was up throughout the
|
||||
duration of the downtime when peering (which makes failover to the other site
|
||||
possible, if needed).
|
||||
|
||||
.. _Changing Monitor elections: ../change-mon-elections
|
||||
|
||||
And lastly, tell the cluster to enter stretch mode. Here, ``mon.e`` is the
|
||||
tiebreaker and we are splitting across data centers. ``mon.e`` should be also
|
||||
set a datacenter, that will differ from ``site1`` and ``site2``. For this
|
||||
purpose you can create another datacenter bucket named ``site3`` in your
|
||||
CRUSH and place ``mon.e`` there:
|
||||
Limitations of Stretch Mode
|
||||
===========================
|
||||
When using stretch mode, OSDs must be located at exactly two sites.
|
||||
|
||||
.. prompt:: bash $
|
||||
Two monitors should be run in each data center, plus a tiebreaker in a third
|
||||
(or in the cloud) for a total of five monitors. While in stretch mode, OSDs
|
||||
will connect only to monitors within the data center in which they are located.
|
||||
OSDs *DO NOT* connect to the tiebreaker monitor.
|
||||
|
||||
ceph mon set_location e datacenter=site3
|
||||
ceph mon enable_stretch_mode e stretch_rule datacenter
|
||||
Erasure-coded pools cannot be used with stretch mode. Attempts to use erasure
|
||||
coded pools with stretch mode will fail. Erasure coded pools cannot be created
|
||||
while in stretch mode.
|
||||
|
||||
When stretch mode is enabled, the OSDs wlll only take PGs active when
|
||||
they peer across data centers (or whatever other CRUSH bucket type
|
||||
you specified), assuming both are alive. Pools will increase in size
|
||||
from the default 3 to 4, expecting 2 copies in each site. OSDs will only
|
||||
be allowed to connect to monitors in the same data center. New monitors
|
||||
will not be allowed to join the cluster if they do not specify a location.
|
||||
To use stretch mode, you will need to create a CRUSH rule that provides two
|
||||
replicas in each data center. Ensure that there are four total replicas: two in
|
||||
each data center. If pools exist in the cluster that do not have the default
|
||||
``size`` or ``min_size``, Ceph will not enter stretch mode. An example of such
|
||||
a CRUSH rule is given above.
|
||||
|
||||
If all the OSDs and monitors from a data center become inaccessible
|
||||
at once, the surviving data center will enter a degraded stretch mode. This
|
||||
will issue a warning, reduce the min_size to 1, and allow
|
||||
the cluster to go active with data in the single remaining site. Note that
|
||||
we do not change the pool size, so you will also get warnings that the
|
||||
pools are too small -- but a special stretch mode flag will prevent the OSDs
|
||||
from creating extra copies in the remaining data center (so it will only keep
|
||||
2 copies, as before).
|
||||
Because stretch mode runs with ``min_size`` set to ``1`` (or, more directly,
|
||||
``min_size 1``), we recommend enabling stretch mode only when using OSDs on
|
||||
SSDs (including NVMe OSDs). Hybrid HDD+SSD or HDD-only OSDs are not recommended
|
||||
due to the long time it takes for them to recover after connectivity between
|
||||
data centers has been restored. Shorter recovery times reduce the potential for data loss.
|
||||
|
||||
When the missing data center comes back, the cluster will enter
|
||||
recovery stretch mode. This changes the warning and allows peering, but
|
||||
still only requires OSDs from the data center which was up the whole time.
|
||||
When all PGs are in a known state, and are neither degraded nor incomplete,
|
||||
the cluster transitions back to regular stretch mode, ends the warning,
|
||||
restores min_size to its starting value (2) and requires both sites to peer,
|
||||
and stops requiring the always-alive site when peering (so that you can fail
|
||||
over to the other site, if necessary).
|
||||
|
||||
|
||||
Stretch Mode Limitations
|
||||
========================
|
||||
As implied by the setup, stretch mode only handles 2 sites with OSDs.
|
||||
|
||||
While it is not enforced, you should run 2 monitors in each site plus
|
||||
a tiebreaker, for a total of 5. This is because OSDs can only connect
|
||||
to monitors in their own site when in stretch mode.
|
||||
|
||||
You cannot use erasure coded pools with stretch mode. If you try, it will
|
||||
refuse, and it will not allow you to create EC pools once in stretch mode.
|
||||
|
||||
You must create your own CRUSH rule which provides 2 copies in each site, and
|
||||
you must use 4 total copies with 2 in each site. If you have existing pools
|
||||
with non-default size/min_size, Ceph will object when you attempt to
|
||||
enable stretch mode.
|
||||
|
||||
Because it runs with ``min_size 1`` when degraded, you should only use stretch
|
||||
mode with all-flash OSDs. This minimizes the time needed to recover once
|
||||
connectivity is restored, and thus minimizes the potential for data loss.
|
||||
|
||||
Hopefully, future development will extend this feature to support EC pools and
|
||||
running with more than 2 full sites.
|
||||
In the future, stretch mode might support erasure-coded pools and might support
|
||||
deployments that have more than two data centers.
|
||||
|
||||
Other commands
|
||||
==============
|
||||
If your tiebreaker monitor fails for some reason, you can replace it. Turn on
|
||||
a new monitor and run:
|
||||
|
||||
Replacing a failed tiebreaker monitor
|
||||
-------------------------------------
|
||||
|
||||
Turn on a new monitor and run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon set_new_tiebreaker mon.<new_mon_name>
|
||||
|
||||
This command will protest if the new monitor is in the same location as existing
|
||||
non-tiebreaker monitors. This command WILL NOT remove the previous tiebreaker
|
||||
monitor; you should do so yourself.
|
||||
This command protests if the new monitor is in the same location as the
|
||||
existing non-tiebreaker monitors. **This command WILL NOT remove the previous
|
||||
tiebreaker monitor.** Remove the previous tiebreaker monitor yourself.
|
||||
|
||||
Also in 16.2.7, if you are writing your own tooling for deploying Ceph, you can use a new
|
||||
``--set-crush-location`` option when booting monitors, instead of running
|
||||
``ceph mon set_location``. This option accepts only a single "bucket=loc" pair, eg
|
||||
``ceph-mon --set-crush-location 'datacenter=a'``, which must match the
|
||||
bucket type you specified when running ``enable_stretch_mode``.
|
||||
Using "--set-crush-location" and not "ceph mon set_location"
|
||||
------------------------------------------------------------
|
||||
|
||||
If you write your own tooling for deploying Ceph, use the
|
||||
``--set-crush-location`` option when booting monitors instead of running ``ceph
|
||||
mon set_location``. This option accepts only a single ``bucket=loc`` pair (for
|
||||
example, ``ceph-mon --set-crush-location 'datacenter=a'``), and that pair must
|
||||
match the bucket type that was specified when running ``enable_stretch_mode``.
|
||||
|
||||
When in stretch degraded mode, the cluster will go into "recovery" mode automatically
|
||||
when the disconnected data center comes back. If that doesn't work, or you want to
|
||||
enable recovery mode early, you can invoke:
|
||||
Forcing recovery stretch mode
|
||||
-----------------------------
|
||||
|
||||
When in stretch degraded mode, the cluster will go into "recovery" mode
|
||||
automatically when the disconnected data center comes back. If that does not
|
||||
happen or you want to enable recovery mode early, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd force_recovery_stretch_mode --yes-i-really-mean-it
|
||||
|
||||
But this command should not be necessary; it is included to deal with
|
||||
unanticipated situations.
|
||||
Forcing normal stretch mode
|
||||
---------------------------
|
||||
|
||||
When in recovery mode, the cluster should go back into normal stretch mode
|
||||
when the PGs are healthy. If this doesn't happen, or you want to force the
|
||||
When in recovery mode, the cluster should go back into normal stretch mode when
|
||||
the PGs are healthy. If this fails to happen or if you want to force the
|
||||
cross-data-center peering early and are willing to risk data downtime (or have
|
||||
verified separately that all the PGs can peer, even if they aren't fully
|
||||
recovered), you can invoke:
|
||||
recovered), run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd force_healthy_stretch_mode --yes-i-really-mean-it
|
||||
|
||||
This command should not be necessary; it is included to deal with
|
||||
unanticipated situations. But you might wish to invoke it to remove
|
||||
the ``HEALTH_WARN`` state which recovery mode generates.
|
||||
This command can be used to remove the ``HEALTH_WARN`` state, which recovery
|
||||
mode generates.
|
||||
|
@ -337,45 +337,53 @@ Pool
|
||||
|
||||
A pool is a logical partition where users store data.
|
||||
In Ceph deployments, it is common to create a pool as a logical partition for
|
||||
similar types of data. For example, when deploying Ceph as a backend for
|
||||
similar types of data. For example, when deploying Ceph as a back end for
|
||||
OpenStack, a typical deployment would have pools for volumes, images, backups
|
||||
and virtual machines, and users such as ``client.glance``, ``client.cinder``,
|
||||
etc.
|
||||
and virtual machines, and such users as ``client.glance`` and ``client.cinder``.
|
||||
|
||||
Application Tags
|
||||
----------------
|
||||
|
||||
Access may be restricted to specific pools as defined by their application
|
||||
metadata. The ``*`` wildcard may be used for the ``key`` argument, the
|
||||
``value`` argument, or both. ``all`` is a synonym for ``*``.
|
||||
``value`` argument, or both. The ``all`` tag is a synonym for ``*``.
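For instance (a sketch only: the user name, application, and tag value are illustrative), a capability that limits a client to pools tagged for a particular application can be granted like this:

.. prompt:: bash $

   ceph auth get-or-create client.fsclient mon 'allow r' osd 'allow rw tag cephfs data=cephfs_a'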
|
||||
|
||||
Namespace
|
||||
---------
|
||||
|
||||
Objects within a pool can be associated to a namespace--a logical group of
|
||||
Objects within a pool can be associated to a namespace: that is, to a logical group of
|
||||
objects within the pool. A user's access to a pool can be associated with a
|
||||
namespace such that reads and writes by the user take place only within the
|
||||
namespace. Objects written to a namespace within the pool can only be accessed
|
||||
namespace so that reads and writes by the user can take place only within the
|
||||
namespace. Objects written to a namespace within the pool can be accessed only
|
||||
by users who have access to the namespace.
|
||||
|
||||
.. note:: Namespaces are primarily useful for applications written on top of
|
||||
``librados`` where the logical grouping can alleviate the need to create
|
||||
different pools. Ceph Object Gateway (in releases beginning with
|
||||
Luminous) uses namespaces for various
|
||||
metadata objects.
|
||||
``librados``. In such situations, the logical grouping provided by
|
||||
namespaces can obviate the need to create different pools. In Luminous and
|
||||
later releases, Ceph Object Gateway uses namespaces for various metadata
|
||||
objects.
|
||||
|
||||
The rationale for namespaces is that pools can be a computationally expensive
|
||||
method of segregating data sets for the purposes of authorizing separate sets
|
||||
of users. For example, a pool should have ~100 placement groups per OSD. So an
|
||||
exemplary cluster with 1000 OSDs would have 100,000 placement groups for one
|
||||
pool. Each pool would create another 100,000 placement groups in the exemplary
|
||||
cluster. By contrast, writing an object to a namespace simply associates the
|
||||
namespace to the object name without the computational overhead of a separate
|
||||
pool. Rather than creating a separate pool for a user or set of users, you may
|
||||
use a namespace. **Note:** Only available using ``librados`` at this time.
|
||||
The rationale for namespaces is this: namespaces are relatively less
|
||||
computationally expensive than pools, which can be a computationally
|
||||
expensive method of segregating data sets between different authorized users.
|
||||
|
||||
Access may be restricted to specific RADOS namespaces using the ``namespace``
|
||||
capability. Limited globbing of namespaces is supported; if the last character
|
||||
For example, a pool ought to host approximately 100 placement-group replicas
|
||||
per OSD. This means that a cluster with 1000 OSDs and three 3R replicated pools
|
||||
would have (in a single pool) 100,000 placement-group replicas, and that means
|
||||
that it has 33,333 Placement Groups.
|
||||
|
||||
By contrast, writing an object to a namespace simply associates the namespace
|
||||
to the object name without incurring the computational overhead of a separate
|
||||
pool. Instead of creating a separate pool for a user or set of users, you can
|
||||
use a namespace.
|
||||
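As a concrete sketch using the ``rados`` command-line tool (the pool name
``liverpool``, the namespace ``abbey``, and the object name are hypothetical),
writing an object into a namespace and then listing that namespace look like this:

.. prompt:: bash $

   rados -p liverpool --namespace abbey put song1 ./song1.txt  # hypothetical pool, namespace, and object
   rados -p liverpool --namespace abbey ls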
|
||||
.. note::
|
||||
|
||||
Namespaces are available only when using ``librados``.
|
||||
|
||||
|
||||
Access may be restricted to specific RADOS namespaces by use of the ``namespace``
|
||||
capability. Limited globbing of namespaces (that is, use of wildcards (``*``)) is supported: if the last character
|
||||
of the specified namespace is ``*``, then access is granted to any namespace
|
||||
starting with the provided argument.
|
||||
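As a sketch of such a capability (the client name and the namespace prefix are
hypothetical), the following grants read and write access in pool ``liverpool``
to every namespace whose name begins with ``abbey-``:

.. prompt:: bash $

   ceph auth get-or-create client.martin mon 'allow r' osd 'allow rw pool=liverpool namespace=abbey-*'  # hypothetical client and namespace prefix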
|
||||
@ -383,64 +391,60 @@ Managing Users
|
||||
==============
|
||||
|
||||
User management functionality provides Ceph Storage Cluster administrators with
|
||||
the ability to create, update and delete users directly in the Ceph Storage
|
||||
the ability to create, update, and delete users directly in the Ceph Storage
|
||||
Cluster.
|
||||
|
||||
When you create or delete users in the Ceph Storage Cluster, you may need to
|
||||
distribute keys to clients so that they can be added to keyrings. See `Keyring
|
||||
Management`_ for details.
|
||||
When you create or delete users in the Ceph Storage Cluster, you might need to
|
||||
distribute keys to clients so that they can be added to keyrings. For details, see `Keyring
|
||||
Management`_.
|
||||
|
||||
List Users
|
||||
----------
|
||||
Listing Users
|
||||
-------------
|
||||
|
||||
To list the users in your cluster, execute the following:
|
||||
To list the users in your cluster, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth ls
|
||||
ceph auth ls
|
||||
|
||||
Ceph will list out all users in your cluster. For example, in a two-node
|
||||
exemplary cluster, ``ceph auth ls`` will output something that looks like
|
||||
this::
|
||||
Ceph will list all users in your cluster. For example, in a two-node
|
||||
cluster, ``ceph auth ls`` will provide an output that resembles the following::
|
||||
|
||||
installed auth entries:
|
||||
installed auth entries:
|
||||
|
||||
osd.0
|
||||
key: AQCvCbtToC6MDhAATtuT70Sl+DymPCfDSsyV4w==
|
||||
caps: [mon] allow profile osd
|
||||
caps: [osd] allow *
|
||||
osd.1
|
||||
key: AQC4CbtTCFJBChAAVq5spj0ff4eHZICxIOVZeA==
|
||||
caps: [mon] allow profile osd
|
||||
caps: [osd] allow *
|
||||
client.admin
|
||||
key: AQBHCbtT6APDHhAA5W00cBchwkQjh3dkKsyPjw==
|
||||
caps: [mds] allow
|
||||
caps: [mon] allow *
|
||||
caps: [osd] allow *
|
||||
client.bootstrap-mds
|
||||
key: AQBICbtTOK9uGBAAdbe5zcIGHZL3T/u2g6EBww==
|
||||
caps: [mon] allow profile bootstrap-mds
|
||||
client.bootstrap-osd
|
||||
key: AQBHCbtT4GxqORAADE5u7RkpCN/oo4e5W0uBtw==
|
||||
caps: [mon] allow profile bootstrap-osd
|
||||
osd.0
|
||||
key: AQCvCbtToC6MDhAATtuT70Sl+DymPCfDSsyV4w==
|
||||
caps: [mon] allow profile osd
|
||||
caps: [osd] allow *
|
||||
osd.1
|
||||
key: AQC4CbtTCFJBChAAVq5spj0ff4eHZICxIOVZeA==
|
||||
caps: [mon] allow profile osd
|
||||
caps: [osd] allow *
|
||||
client.admin
|
||||
key: AQBHCbtT6APDHhAA5W00cBchwkQjh3dkKsyPjw==
|
||||
caps: [mds] allow
|
||||
caps: [mon] allow *
|
||||
caps: [osd] allow *
|
||||
client.bootstrap-mds
|
||||
key: AQBICbtTOK9uGBAAdbe5zcIGHZL3T/u2g6EBww==
|
||||
caps: [mon] allow profile bootstrap-mds
|
||||
client.bootstrap-osd
|
||||
key: AQBHCbtT4GxqORAADE5u7RkpCN/oo4e5W0uBtw==
|
||||
caps: [mon] allow profile bootstrap-osd
|
||||
|
||||
|
||||
Note that the ``TYPE.ID`` notation for users applies such that ``osd.0`` is a
|
||||
user of type ``osd`` and its ID is ``0``, ``client.admin`` is a user of type
|
||||
``client`` and its ID is ``admin`` (i.e., the default ``client.admin`` user).
|
||||
Note also that each entry has a ``key: <value>`` entry, and one or more
|
||||
Note that, according to the ``TYPE.ID`` notation for users, ``osd.0`` is a
|
||||
user of type ``osd`` with an ID of ``0``, and ``client.admin`` is a user of type
|
||||
``client`` with an ID of ``admin`` (that is, the default ``client.admin`` user).
|
||||
Note too that each entry has a ``key: <value>`` entry, and also has one or more
|
||||
``caps:`` entries.
|
||||
|
||||
You may use the ``-o {filename}`` option with ``ceph auth ls`` to
|
||||
save the output to a file.
|
||||
To save the output of ``ceph auth ls`` to a file, use the ``-o {filename}`` option.
|
||||
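For example (the output file name here is arbitrary):

.. prompt:: bash $

   ceph auth ls -o /tmp/auth-list.txt  # output path is only an example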
|
||||
|
||||
Get a User
|
||||
----------
|
||||
Getting a User
|
||||
--------------
|
||||
|
||||
To retrieve a specific user, key and capabilities, execute the
|
||||
following:
|
||||
To retrieve a specific user, key, and capabilities, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -452,8 +456,7 @@ For example:
|
||||
|
||||
ceph auth get client.admin
|
||||
|
||||
You may also use the ``-o {filename}`` option with ``ceph auth get`` to
|
||||
save the output to a file. Developers may also execute the following:
|
||||
To save the output of ``ceph auth get`` to a file, use the ``-o {filename}`` option. Developers may also run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -461,42 +464,49 @@ save the output to a file. Developers may also execute the following:
|
||||
|
||||
The ``auth export`` command is identical to ``auth get``.
|
||||
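For example, either of the following commands (the output path shown is only an
illustration) writes the ``client.admin`` entry in keyring format to a local file:

.. prompt:: bash $

   ceph auth get client.admin -o /tmp/ceph.client.admin.keyring     # output path is only an example
   ceph auth export client.admin -o /tmp/ceph.client.admin.keyring  # output path is only an example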
|
||||
Add a User
|
||||
----------
|
||||
.. _rados_ops_adding_a_user:
|
||||
|
||||
Adding a user creates a username (i.e., ``TYPE.ID``), a secret key and
|
||||
any capabilities included in the command you use to create the user.
|
||||
Adding a User
|
||||
-------------
|
||||
|
||||
A user's key enables the user to authenticate with the Ceph Storage Cluster.
|
||||
Adding a user creates a user name (that is, ``TYPE.ID``), a secret key, and
|
||||
any capabilities specified in the command that creates the user.
|
||||
|
||||
A user's key allows the user to authenticate with the Ceph Storage Cluster.
|
||||
The user's capabilities authorize the user to read, write, or execute on Ceph
|
||||
monitors (``mon``), Ceph OSDs (``osd``) or Ceph Metadata Servers (``mds``).
|
||||
monitors (``mon``), Ceph OSDs (``osd``) or Ceph Metadata Servers (``mds``).
|
||||
|
||||
There are a few ways to add a user:
|
||||
|
||||
- ``ceph auth add``: This command is the canonical way to add a user. It
|
||||
will create the user, generate a key and add any specified capabilities.
|
||||
will create the user, generate a key, and add any specified capabilities.
|
||||
|
||||
- ``ceph auth get-or-create``: This command is often the most convenient way
|
||||
to create a user, because it returns a keyfile format with the user name
|
||||
(in brackets) and the key. If the user already exists, this command
|
||||
simply returns the user name and key in the keyfile format. You may use the
|
||||
``-o {filename}`` option to save the output to a file.
|
||||
simply returns the user name and key in the keyfile format. To save the output to
|
||||
a file, use the ``-o {filename}`` option.
|
||||
|
||||
- ``ceph auth get-or-create-key``: This command is a convenient way to create
|
||||
a user and return the user's key (only). This is useful for clients that
|
||||
need the key only (e.g., libvirt). If the user already exists, this command
|
||||
simply returns the key. You may use the ``-o {filename}`` option to save the
|
||||
output to a file.
|
||||
a user and return the user's key and nothing else. This is useful for clients that
|
||||
need only the key (for example, libvirt). If the user already exists, this command
|
||||
simply returns the key. To save the output to
|
||||
a file, use the ``-o {filename}`` option.
|
||||
|
||||
When creating client users, you may create a user with no capabilities. A user
|
||||
It is possible, when creating client users, to create a user with no capabilities. A user
|
||||
with no capabilities is useless beyond mere authentication, because the client
|
||||
cannot retrieve the cluster map from the monitor. However, you can create a
|
||||
user with no capabilities if you wish to defer adding capabilities later using
|
||||
the ``ceph auth caps`` command.
|
||||
cannot retrieve the cluster map from the monitor. However, you might want to create a user
|
||||
with no capabilities and wait until later to add capabilities to the user by using the ``ceph auth caps`` command.
|
||||
|
||||
A typical user has at least read capabilities on the Ceph monitor and
|
||||
read and write capability on Ceph OSDs. Additionally, a user's OSD permissions
|
||||
are often restricted to accessing a particular pool:
|
||||
read and write capabilities on Ceph OSDs. A user's OSD permissions
|
||||
are often restricted so that the user can access only one particular pool.
|
||||
In the following example, the commands (1) add a client named ``john`` that has read capabilities on the Ceph monitor
|
||||
and read and write capabilities on the pool named ``liverpool``, (2) authorize a client named ``paul`` to have read capabilities on the Ceph monitor and
|
||||
read and write capabilities on the pool named ``liverpool``, (3) authorize a client named ``george`` to have read capabilities on the Ceph monitor and
|
||||
read and write capabilities on the pool named ``liverpool`` and use the keyring named ``george.keyring`` to make this authorization, and (4) authorize
|
||||
a client named ``ringo`` to have read capabilities on the Ceph monitor and read and write capabilities on the pool named ``liverpool`` and use the key
|
||||
named ``ringo.key`` to make this authorization:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -505,21 +515,19 @@ are often restricted to accessing a particular pool:
|
||||
ceph auth get-or-create client.george mon 'allow r' osd 'allow rw pool=liverpool' -o george.keyring
|
||||
ceph auth get-or-create-key client.ringo mon 'allow r' osd 'allow rw pool=liverpool' -o ringo.key
|
||||
|
||||
|
||||
.. important:: If you provide a user with capabilities to OSDs, but you DO NOT
|
||||
restrict access to particular pools, the user will have access to ALL
|
||||
pools in the cluster!
|
||||
.. important:: Any user that has capabilities on OSDs will have access to ALL pools in the cluster
|
||||
unless that user's access has been restricted to a proper subset of the pools in the cluster.
|
||||
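For example (a sketch that reuses the ``client.paul`` user from the examples above),
the first command below leaves the client with access to every pool, while the second
confines it to the ``liverpool`` pool:

.. prompt:: bash $

   ceph auth caps client.paul mon 'allow r' osd 'allow rw'                 # access to ALL pools
   ceph auth caps client.paul mon 'allow r' osd 'allow rw pool=liverpool'  # access to one pool only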
|
||||
|
||||
.. _modify-user-capabilities:
|
||||
|
||||
Modify User Capabilities
|
||||
------------------------
|
||||
Modifying User Capabilities
|
||||
---------------------------
|
||||
|
||||
The ``ceph auth caps`` command allows you to specify a user and change the
|
||||
The ``ceph auth caps`` command allows you to specify a user and change that
|
||||
user's capabilities. Setting new capabilities will overwrite current capabilities.
|
||||
To view current capabilities run ``ceph auth get USERTYPE.USERID``. To add
|
||||
capabilities, you should also specify the existing capabilities when using the form:
|
||||
To view current capabilities, run ``ceph auth get USERTYPE.USERID``.
|
||||
To add capabilities, run a command of the following form (and be sure to specify the existing capabilities):
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -534,10 +542,10 @@ For example:
|
||||
ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
|
||||
ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
|
||||
|
||||
See `Authorization (Capabilities)`_ for additional details on capabilities.
|
||||
For additional details on capabilities, see `Authorization (Capabilities)`_.
|
||||
|
||||
Delete a User
|
||||
-------------
|
||||
Deleting a User
|
||||
---------------
|
||||
|
||||
To delete a user, use ``ceph auth del``:
|
||||
|
||||
@ -545,34 +553,34 @@ To delete a user, use ``ceph auth del``:
|
||||
|
||||
ceph auth del {TYPE}.{ID}
|
||||
|
||||
Where ``{TYPE}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
|
||||
and ``{ID}`` is the user name or ID of the daemon.
|
||||
Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
|
||||
and ``{ID}`` is the user name or the ID of the daemon.
|
||||
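For example, to delete the ``client.paul`` user from the examples above:

.. prompt:: bash $

   ceph auth del client.paul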
|
||||
|
||||
Print a User's Key
|
||||
------------------
|
||||
Printing a User's Key
|
||||
---------------------
|
||||
|
||||
To print a user's authentication key to standard output, execute the following:
|
||||
To print a user's authentication key to standard output, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth print-key {TYPE}.{ID}
|
||||
|
||||
Where ``{TYPE}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
|
||||
and ``{ID}`` is the user name or ID of the daemon.
|
||||
Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
|
||||
and ``{ID}`` is the user name or the ID of the daemon.
|
||||
|
||||
Printing a user's key is useful when you need to populate client
|
||||
software with a user's key (e.g., libvirt):
|
||||
When it is necessary to populate client software with a user's key (as in the case of libvirt),
|
||||
you can print the user's key by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
mount -t ceph serverhost:/ mountpoint -o name=client.user,secret=`ceph auth print-key client.user`
|
||||
|
||||
Import a User(s)
|
||||
Importing a User
|
||||
----------------
|
||||
|
||||
To import one or more users, use ``ceph auth import`` and
|
||||
specify a keyring:
|
||||
specify a keyring as follows:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -584,47 +592,49 @@ For example:
|
||||
|
||||
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
||||
|
||||
|
||||
.. note:: The Ceph storage cluster will add new users, their keys and their
|
||||
capabilities and will update existing users, their keys and their
|
||||
.. note:: The Ceph storage cluster will add new users, their keys, and their
|
||||
capabilities and will update existing users, their keys, and their
|
||||
capabilities.
|
||||
|
||||
Keyring Management
|
||||
==================
|
||||
|
||||
When you access Ceph via a Ceph client, the Ceph client will look for a local
|
||||
keyring. Ceph presets the ``keyring`` setting with the following four keyring
|
||||
names by default so you don't have to set them in your Ceph configuration file
|
||||
unless you want to override the defaults (not recommended):
|
||||
keyring. Ceph presets the ``keyring`` setting with four keyring
|
||||
names by default. For this reason, you do not have to set the keyring names in your Ceph configuration file
|
||||
unless you want to override these defaults (which is not recommended). The four default keyring names are as follows:
|
||||
|
||||
- ``/etc/ceph/$cluster.$name.keyring``
|
||||
- ``/etc/ceph/$cluster.keyring``
|
||||
- ``/etc/ceph/keyring``
|
||||
- ``/etc/ceph/keyring.bin``
|
||||
|
||||
The ``$cluster`` metavariable is your Ceph cluster name as defined by the
|
||||
name of the Ceph configuration file (i.e., ``ceph.conf`` means the cluster name
|
||||
is ``ceph``; thus, ``ceph.keyring``). The ``$name`` metavariable is the user
|
||||
type and user ID (e.g., ``client.admin``; thus, ``ceph.client.admin.keyring``).
|
||||
The ``$cluster`` metavariable found in the first two default keyring names above
|
||||
is your Ceph cluster name as defined by the name of the Ceph configuration
|
||||
file: for example, if the Ceph configuration file is named ``ceph.conf``,
|
||||
then your Ceph cluster name is ``ceph`` and the second name above would be
|
||||
``ceph.keyring``. The ``$name`` metavariable is the user type and user ID:
|
||||
for example, given the user ``client.admin``, the first name above would be
|
||||
``ceph.client.admin.keyring``.
|
||||
|
||||
.. note:: When executing commands that read or write to ``/etc/ceph``, you may
|
||||
need to use ``sudo`` to execute the command as ``root``.
|
||||
.. note:: When running commands that read or write to ``/etc/ceph``, you might
|
||||
need to use ``sudo`` to run the command as ``root``.
|
||||
|
||||
After you create a user (e.g., ``client.ringo``), you must get the key and add
|
||||
After you create a user (for example, ``client.ringo``), you must get the key and add
|
||||
it to a keyring on a Ceph client so that the user can access the Ceph Storage
|
||||
Cluster.
|
||||
|
||||
The `User Management`_ section details how to list, get, add, modify and delete
|
||||
users directly in the Ceph Storage Cluster. However, Ceph also provides the
|
||||
The `User Management`_ section details how to list, get, add, modify, and delete
|
||||
users directly in the Ceph Storage Cluster. In addition, Ceph provides the
|
||||
``ceph-authtool`` utility to allow you to manage keyrings from a Ceph client.
|
||||
|
||||
Create a Keyring
|
||||
----------------
|
||||
Creating a Keyring
|
||||
------------------
|
||||
|
||||
When you use the procedures in the `Managing Users`_ section to create users,
|
||||
you need to provide user keys to the Ceph client(s) so that the Ceph client
|
||||
can retrieve the key for the specified user and authenticate with the Ceph
|
||||
Storage Cluster. Ceph Clients access keyrings to lookup a user name and
|
||||
you must provide user keys to the Ceph client(s). This is required so that the Ceph client(s)
|
||||
can retrieve the key for the specified user and authenticate that user against the Ceph
|
||||
Storage Cluster. Ceph clients access keyrings in order to look up a user name and
|
||||
retrieve the user's key.
|
||||
|
||||
The ``ceph-authtool`` utility allows you to create a keyring. To create an
|
||||
@ -635,45 +645,44 @@ empty keyring, use ``--create-keyring`` or ``-C``. For example:
|
||||
ceph-authtool --create-keyring /path/to/keyring
|
||||
|
||||
When creating a keyring with multiple users, we recommend using the cluster name
|
||||
(e.g., ``$cluster.keyring``) for the keyring filename and saving it in the
|
||||
``/etc/ceph`` directory so that the ``keyring`` configuration default setting
|
||||
will pick up the filename without requiring you to specify it in the local copy
|
||||
of your Ceph configuration file. For example, create ``ceph.keyring`` by
|
||||
executing the following:
|
||||
(of the form ``$cluster.keyring``) for the keyring filename and saving the keyring in the
|
||||
``/etc/ceph`` directory. By doing this, you ensure that the ``keyring`` configuration default setting
|
||||
will pick up the filename without requiring you to specify the filename in the local copy
|
||||
of your Ceph configuration file. For example, you can create ``ceph.keyring`` by
|
||||
running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph-authtool -C /etc/ceph/ceph.keyring
|
||||
|
||||
When creating a keyring with a single user, we recommend using the cluster name,
|
||||
the user type and the user name and saving it in the ``/etc/ceph`` directory.
|
||||
For example, ``ceph.client.admin.keyring`` for the ``client.admin`` user.
|
||||
the user type, and the user name, and saving the keyring in the ``/etc/ceph`` directory.
|
||||
For example, we recommend that the ``client.admin`` user use ``ceph.client.admin.keyring``.
|
||||
|
||||
To create a keyring in ``/etc/ceph``, you must do so as ``root``. This means
|
||||
the file will have ``rw`` permissions for the ``root`` user only, which is
|
||||
that the file will have ``rw`` permissions for the ``root`` user only, which is
|
||||
appropriate when the keyring contains administrator keys. However, if you
|
||||
intend to use the keyring for a particular user or group of users, ensure
|
||||
that you execute ``chown`` or ``chmod`` to establish appropriate keyring
|
||||
intend to use the keyring for a particular user or group of users, be sure to use ``chown`` or ``chmod`` to establish appropriate keyring
|
||||
ownership and access.
|
||||
|
||||
Add a User to a Keyring
|
||||
-----------------------
|
||||
Adding a User to a Keyring
|
||||
--------------------------
|
||||
|
||||
When you `Add a User`_ to the Ceph Storage Cluster, you can use the `Get a
|
||||
User`_ procedure to retrieve a user, key and capabilities and save the user to a
|
||||
keyring.
|
||||
When you :ref:`Add a user<rados_ops_adding_a_user>` to the Ceph Storage
|
||||
Cluster, you can use the `Getting a User`_ procedure to retrieve a user, key,
|
||||
and capabilities and then save the user to a keyring.
|
||||
|
||||
When you only want to use one user per keyring, the `Get a User`_ procedure with
|
||||
If you want to use only one user per keyring, the `Getting a User`_ procedure with
|
||||
the ``-o`` option will save the output in the keyring file format. For example,
|
||||
to create a keyring for the ``client.admin`` user, execute the following:
|
||||
to create a keyring for the ``client.admin`` user, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph auth get client.admin -o /etc/ceph/ceph.client.admin.keyring
|
||||
|
||||
Notice that we use the recommended file format for an individual user.
|
||||
Notice that the file format in this command is the file format conventionally used when manipulating the keyrings of individual users.
|
||||
|
||||
When you want to import users to a keyring, you can use ``ceph-authtool``
|
||||
If you want to import users to a keyring, you can use ``ceph-authtool``
|
||||
to specify the destination keyring and the source keyring.
|
||||
For example:
|
||||
|
||||
@ -681,19 +690,19 @@ For example:
|
||||
|
||||
sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
|
||||
|
||||
Create a User
|
||||
-------------
|
||||
Creating a User
|
||||
---------------
|
||||
|
||||
Ceph provides the `Add a User`_ function to create a user directly in the Ceph
|
||||
Storage Cluster. However, you can also create a user, keys and capabilities
|
||||
directly on a Ceph client keyring. Then, you can import the user to the Ceph
|
||||
Ceph provides the `Adding a User`_ function to create a user directly in the Ceph
|
||||
Storage Cluster. However, you can also create a user, keys, and capabilities
|
||||
directly on a Ceph client keyring, and then import the user to the Ceph
|
||||
Storage Cluster. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph-authtool -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' /etc/ceph/ceph.keyring
|
||||
|
||||
See `Authorization (Capabilities)`_ for additional details on capabilities.
|
||||
For additional details on capabilities, see `Authorization (Capabilities)`_.
|
||||
|
||||
You can also create a keyring and add a new user to the keyring simultaneously.
|
||||
For example:
|
||||
@ -702,36 +711,37 @@ For example:
|
||||
|
||||
sudo ceph-authtool -C /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' --gen-key
|
||||
|
||||
In the foregoing scenarios, the new user ``client.ringo`` is only in the
|
||||
keyring. To add the new user to the Ceph Storage Cluster, you must still add
|
||||
the new user to the Ceph Storage Cluster:
|
||||
In the above examples, the new user ``client.ringo`` has been added only to the
|
||||
keyring. The new user has not been added to the Ceph Storage Cluster.
|
||||
|
||||
To add the new user ``client.ringo`` to the Ceph Storage Cluster, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph auth add client.ringo -i /etc/ceph/ceph.keyring
|
||||
|
||||
Modify a User
|
||||
-------------
|
||||
Modifying a User
|
||||
----------------
|
||||
|
||||
To modify the capabilities of a user record in a keyring, specify the keyring,
|
||||
and the user followed by the capabilities. For example:
|
||||
To modify the capabilities of a user record in a keyring, specify the keyring
|
||||
and the user, followed by the capabilities. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph-authtool /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx'
|
||||
|
||||
To update the user to the Ceph Storage Cluster, you must update the user
|
||||
in the keyring to the user entry in the Ceph Storage Cluster:
|
||||
To update the user in the Ceph Storage Cluster, you must import the updated user entry
|
||||
from the keyring into the Ceph Storage Cluster. To do so, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
||||
|
||||
See `Import a User(s)`_ for details on updating a Ceph Storage Cluster user
|
||||
from a keyring.
|
||||
For details on updating a Ceph Storage Cluster user from a
|
||||
keyring, see `Importing a User`_.
|
||||
|
||||
You may also `Modify User Capabilities`_ directly in the cluster, store the
|
||||
results to a keyring file; then, import the keyring into your main
|
||||
You may also :ref:`Modify user capabilities<modify-user-capabilities>` directly in the cluster, store the
|
||||
results to a keyring file, and then import the keyring into your main
|
||||
``ceph.keyring`` file.
|
||||
|
||||
Command Line Usage
|
||||
@ -741,12 +751,12 @@ Ceph supports the following usage for user name and secret:
|
||||
|
||||
``--id`` | ``--user``
|
||||
|
||||
:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
|
||||
``client.admin``, ``client.user1``). The ``id``, ``name`` and
|
||||
``-n`` options enable you to specify the ID portion of the user
|
||||
name (e.g., ``admin``, ``user1``, ``foo``, etc.). You can specify
|
||||
:Description: Ceph identifies users with a type and an ID: the form of this user identification is ``TYPE.ID``, and examples of the type and ID are
|
||||
``client.admin`` and ``client.user1``. The ``id``, ``name`` and
|
||||
``-n`` options allow you to specify the ID portion of the user
|
||||
name (for example, ``admin``, ``user1``, ``foo``). You can specify
|
||||
the user with the ``--id`` and omit the type. For example,
|
||||
to specify user ``client.foo`` enter the following:
|
||||
to specify user ``client.foo``, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -756,10 +766,10 @@ Ceph supports the following usage for user name and secret:
|
||||
|
||||
``--name`` | ``-n``
|
||||
|
||||
:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
|
||||
``client.admin``, ``client.user1``). The ``--name`` and ``-n``
|
||||
options enables you to specify the fully qualified user name.
|
||||
You must specify the user type (typically ``client``) with the
|
||||
:Description: Ceph identifies users with a type and an ID: the form of this user identification is ``TYPE.ID``, and examples of the type and ID are
|
||||
``client.admin`` and ``client.user1``. The ``--name`` and ``-n``
|
||||
options allow you to specify the fully qualified user name.
|
||||
You are required to specify the user type (typically ``client``) with the
|
||||
user ID. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -770,8 +780,8 @@ Ceph supports the following usage for user name and secret:
|
||||
|
||||
``--keyring``
|
||||
|
||||
:Description: The path to the keyring containing one or more user name and
|
||||
secret. The ``--secret`` option provides the same functionality,
|
||||
:Description: The path to the keyring that contains one or more user names and
|
||||
secrets. The ``--secret`` option provides the same functionality,
|
||||
but it does not work with Ceph RADOS Gateway, which uses
|
||||
``--secret`` for another purpose. You may retrieve a keyring with
|
||||
``ceph auth get-or-create`` and store it locally. This is a
|
||||
@ -788,43 +798,42 @@ Ceph supports the following usage for user name and secret:
|
||||
Limitations
|
||||
===========
|
||||
|
||||
The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
|
||||
The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
|
||||
is not intended to handle authentication of human users or application programs
|
||||
run on their behalf. If that effect is required to handle your access control
|
||||
needs, you must have another mechanism, which is likely to be specific to the
|
||||
front end used to access the Ceph object store. This other mechanism has the
|
||||
role of ensuring that only acceptable users and programs are able to run on the
|
||||
machine that Ceph will permit to access its object store.
|
||||
that are run on their behalf. If your access control
|
||||
needs require that kind of authentication, you will need to have some other mechanism, which is likely to be specific to the
|
||||
front end that is used to access the Ceph object store. This other mechanism would ensure that only acceptable users and programs are able to run on the
|
||||
machine that Ceph permits to access its object store.
|
||||
|
||||
The keys used to authenticate Ceph clients and servers are typically stored in
|
||||
a plain text file with appropriate permissions in a trusted host.
|
||||
a plain text file on a trusted host. Appropriate permissions must be set on the plain text file.
|
||||
|
||||
.. important:: Storing keys in plaintext files has security shortcomings, but
|
||||
they are difficult to avoid, given the basic authentication methods Ceph
|
||||
uses in the background. Those setting up Ceph systems should be aware of
|
||||
uses in the background. Anyone setting up Ceph systems should be aware of
|
||||
these shortcomings.
|
||||
|
||||
In particular, arbitrary user machines, especially portable machines, should not
|
||||
In particular, user machines, especially portable machines, should not
|
||||
be configured to interact directly with Ceph, since that mode of use would
|
||||
require the storage of a plaintext authentication key on an insecure machine.
|
||||
Anyone who stole that machine or obtained surreptitious access to it could
|
||||
obtain the key that will allow them to authenticate their own machines to Ceph.
|
||||
Anyone who stole that machine or obtained access to it could
|
||||
obtain a key that allows them to authenticate their own machines to Ceph.
|
||||
|
||||
Rather than permitting potentially insecure machines to access a Ceph object
|
||||
store directly, users should be required to sign in to a trusted machine in
|
||||
your environment using a method that provides sufficient security for your
|
||||
purposes. That trusted machine will store the plaintext Ceph keys for the
|
||||
human users. A future version of Ceph may address these particular
|
||||
Instead of permitting potentially insecure machines to access a Ceph object
|
||||
store directly, you should require users to sign in to a trusted machine in
|
||||
your environment, using a method that provides sufficient security for your
|
||||
purposes. That trusted machine will store the plaintext Ceph keys for the
|
||||
human users. A future version of Ceph might address these particular
|
||||
authentication issues more fully.
|
||||
|
||||
At the moment, none of the Ceph authentication protocols provide secrecy for
|
||||
messages in transit. Thus, an eavesdropper on the wire can hear and understand
|
||||
all data sent between clients and servers in Ceph, even if it cannot create or
|
||||
alter them. Further, Ceph does not include options to encrypt user data in the
|
||||
object store. Users can hand-encrypt and store their own data in the Ceph
|
||||
object store, of course, but Ceph provides no features to perform object
|
||||
encryption itself. Those storing sensitive data in Ceph should consider
|
||||
encrypting their data before providing it to the Ceph system.
|
||||
At present, none of the Ceph authentication protocols provide secrecy for
|
||||
messages in transit. As a result, an eavesdropper on the wire can hear and understand
|
||||
all data sent between clients and servers in Ceph, even if the eavesdropper cannot create or
|
||||
alter the data. Similarly, Ceph does not include options to encrypt user data in the
|
||||
object store. Users can, of course, hand-encrypt and store their own data in the Ceph
|
||||
object store, but Ceph itself provides no features to perform object
|
||||
encryption. Anyone storing sensitive data in Ceph should consider
|
||||
encrypting their data before providing it to the Ceph system.
|
||||
|
||||
|
||||
.. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication
|
||||
|
@ -36,8 +36,9 @@ resharding tasks, one at a time.
|
||||
Multisite
|
||||
=========
|
||||
|
||||
Dynamic resharding is not supported in a multisite environment.
|
||||
|
||||
Prior to the Reef release, RGW does not support dynamic resharding in a
|
||||
multisite environment. For information on dynamic resharding, see
|
||||
:ref:`Resharding <feature_resharding>` in the RGW multisite documentation.
|
||||
|
||||
Configuration
|
||||
=============
|
||||
|
@ -1130,7 +1130,7 @@ To view the configuration of a zonegroup, run this command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
dosgw-admin zonegroup get [--rgw-zonegroup=<zonegroup>]
|
||||
radosgw-admin zonegroup get [--rgw-zonegroup=<zonegroup>]
|
||||
|
||||
The zonegroup configuration looks like this:
|
||||
|
||||
@ -1582,14 +1582,23 @@ Supported Features
|
||||
|
||||
.. _feature_resharding:
|
||||
|
||||
resharding
|
||||
Resharding
|
||||
~~~~~~~~~~
|
||||
|
||||
Allows buckets to be resharded in a multisite configuration without interrupting the replication of their objects. When ``rgw_dynamic_resharding`` is enabled, it runs on each zone independently, and zones may choose different shard counts for the same bucket. When buckets are resharded manually with ``radosgw-admin bucket reshard``, only that zone's bucket is modified. A zone feature should only be marked as supported after all of its radosgws and osds have upgraded.
|
||||
This feature allows buckets to be resharded in a multisite configuration
|
||||
without interrupting the replication of their objects. When
|
||||
``rgw_dynamic_resharding`` is enabled, it runs on each zone independently, and
|
||||
zones may choose different shard counts for the same bucket. When buckets are
|
||||
resharded manually with ``radosgw-admin bucket reshard``, only that zone's
|
||||
bucket is modified. A zone feature should only be marked as supported after all
|
||||
of its RGWs and OSDs have upgraded.
|
||||
|
||||
.. note:: Dynamic resharding is not supported in multisite deployments prior to
|
||||
the Reef release.
|
||||
|
||||
|
||||
Commands
|
||||
-----------------
|
||||
--------
|
||||
|
||||
Add support for a zone feature
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -138,9 +138,6 @@ updating, use the name of an existing topic and different endpoint values).
|
||||
.. tip:: Any notification already associated with the topic must be re-created
|
||||
in order for the topic to update.
|
||||
|
||||
.. note:: For rabbitmq, ``push-endpoint`` (with a hyphen in the middle) must be
|
||||
changed to ``push_endpoint`` (with an underscore in the middle).
|
||||
|
||||
::
|
||||
|
||||
POST
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _documenting_ceph:
|
||||
|
||||
==================
|
||||
Documenting Ceph
|
||||
==================
|
||||
|
@ -12,9 +12,10 @@ These are exciting times in the Ceph community! Get involved!
|
||||
| **Blog** | Check the Ceph Blog_ periodically to keep track | http://ceph.com/community/blog/ |
|
||||
| | of Ceph progress and important announcements. | |
|
||||
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
||||
| **Planet Ceph** | Check the blog aggregation on Planet Ceph for | https://ceph.com/category/planet/ |
|
||||
| **Planet Ceph** | Check the blog aggregation on Planet Ceph for | https://old.ceph.com/category/planet/ |
|
||||
| | interesting stories, information and | |
|
||||
| | experiences from the community. | |
|
||||
| | experiences from the community. **NOTE: NO | |
|
||||
| | longer updated as of 2023.** | |
|
||||
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
||||
| **Wiki** | Check the Ceph Wiki is a source for more | http://wiki.ceph.com/ |
|
||||
| | community and development related topics. You | |
|
||||
|
@ -2,14 +2,24 @@
|
||||
Intro to Ceph
|
||||
===============
|
||||
|
||||
Whether you want to provide :term:`Ceph Object Storage` and/or
|
||||
:term:`Ceph Block Device` services to :term:`Cloud Platforms`, deploy
|
||||
a :term:`Ceph File System` or use Ceph for another purpose, all
|
||||
:term:`Ceph Storage Cluster` deployments begin with setting up each
|
||||
:term:`Ceph Node`, your network, and the Ceph Storage Cluster. A Ceph
|
||||
Storage Cluster requires at least one Ceph Monitor, Ceph Manager, and
|
||||
Ceph OSD (Object Storage Daemon). The Ceph Metadata Server is also
|
||||
required when running Ceph File System clients.
|
||||
Ceph can be used to provide :term:`Ceph Object Storage` to :term:`Cloud
|
||||
Platforms` and Ceph can be used to provide :term:`Ceph Block Device` services
|
||||
to :term:`Cloud Platforms`. Ceph can be used to deploy a :term:`Ceph File
|
||||
System`. All :term:`Ceph Storage Cluster` deployments begin with setting up
|
||||
each :term:`Ceph Node` and then setting up the network.
|
||||
|
||||
A Ceph Storage Cluster requires the following: at least one Ceph Monitor and at
|
||||
least one Ceph Manager, and at least as many Ceph OSDs as there are copies of
|
||||
an object stored on the Ceph cluster (for example, if three copies of a given
|
||||
object are stored on the Ceph cluster, then at least three OSDs must exist in
|
||||
that Ceph cluster).
|
||||
|
||||
The Ceph Metadata Server is necessary to run Ceph File System clients.
|
||||
|
||||
.. note::
|
||||
|
||||
It is a best practice to have a Ceph Manager for each Monitor, but it is not
|
||||
necessary.
|
||||
|
||||
.. ditaa::
|
||||
|
||||
|
@ -18,19 +18,18 @@ Linux Kernel
|
||||
maintenance" kernel series provided by either http://kernel.org or
|
||||
your Linux distribution on any client hosts.
|
||||
|
||||
For RBD, if you choose to *track* long-term kernels, we currently recommend
|
||||
4.x-based "longterm maintenance" kernel series or later:
|
||||
|
||||
- 4.19.z
|
||||
- 4.14.z
|
||||
- 5.x
|
||||
For RBD, if you choose to *track* long-term kernels, we recommend
|
||||
*at least* 4.19-based "longterm maintenance" kernel series. If you can
|
||||
use a newer "stable" or "longterm maintenance" kernel series, do it.
|
||||
|
||||
For CephFS, see the section about `Mounting CephFS using Kernel Driver`_
|
||||
for kernel version guidance.
|
||||
|
||||
Older kernel client versions may not support your `CRUSH tunables`_ profile
|
||||
or other newer features of the Ceph cluster, requiring the storage cluster
|
||||
to be configured with those features disabled.
|
||||
or other newer features of the Ceph cluster, requiring the storage cluster to
|
||||
be configured with those features disabled. For RBD, a kernel of version 5.3
|
||||
or CentOS 8.2 is the minimum necessary for reasonable support for RBD image
|
||||
features.
|
||||
|
||||
|
||||
Platforms
|
||||
|
@ -178,45 +178,77 @@ function install_pkg_on_ubuntu {
|
||||
fi
|
||||
}
|
||||
|
||||
boost_ver=1.73
|
||||
|
||||
function clean_boost_on_ubuntu {
|
||||
in_jenkins && echo "CI_DEBUG: Start clean_boost_on_ubuntu() in install-deps.sh"
|
||||
# Find currently installed version. If there are multiple
|
||||
# versions, they end up newline separated
|
||||
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
|
||||
cut -d' ' -f2 |
|
||||
cut -d'.' -f1,2 |
|
||||
sort -u)
|
||||
# If installed_ver contains whitespace, we can't really count on it,
|
||||
# but otherwise, bail out if the version installed is the version
|
||||
# we want.
|
||||
if test -n "$installed_ver" &&
|
||||
echo -n "$installed_ver" | tr '[:space:]' ' ' | grep -v -q ' '; then
|
||||
if echo "$installed_ver" | grep -q "^$boost_ver"; then
|
||||
return
|
||||
fi
|
||||
fi
|
||||
|
||||
# Historical packages
|
||||
$SUDO rm -f /etc/apt/sources.list.d/ceph-libboost*.list
|
||||
# Currently used
|
||||
$SUDO rm -f /etc/apt/sources.list.d/libboost.list
|
||||
# Refresh package list so things aren't in the available list.
|
||||
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get update -y || true
|
||||
# Remove all ceph-libboost packages. We have an early return if
|
||||
# the desired version is already (and the only) version installed,
|
||||
# so no need to spare it.
|
||||
if test -n "$installed_ver"; then
|
||||
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y --fix-missing remove "ceph-libboost*"
|
||||
fi
|
||||
}
|
||||
|
||||
function install_boost_on_ubuntu {
|
||||
local ver=1.73
|
||||
in_jenkins && echo "CI_DEBUG: Running install_boost_on_ubuntu() in install-deps.sh"
|
||||
# Once we get to this point, clean_boost_on_ubuntu() should ensure
|
||||
# that there is no more than one installed version.
|
||||
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
|
||||
grep -e 'libboost[0-9].[0-9]\+-dev' |
|
||||
cut -d' ' -f2 |
|
||||
cut -d'.' -f1,2)
|
||||
if test -n "$installed_ver"; then
|
||||
if echo "$installed_ver" | grep -q "^$ver"; then
|
||||
if echo "$installed_ver" | grep -q "^$boost_ver"; then
|
||||
return
|
||||
else
|
||||
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y remove "ceph-libboost.*${installed_ver}.*"
|
||||
$SUDO rm -f /etc/apt/sources.list.d/ceph-libboost${installed_ver}.list
|
||||
fi
|
||||
fi
|
||||
local codename=$1
|
||||
local project=libboost
|
||||
local sha1=7aba8a1882670522ee1d1ee1bba0ea170b292dec
|
||||
install_pkg_on_ubuntu \
|
||||
$project \
|
||||
$sha1 \
|
||||
$codename \
|
||||
check \
|
||||
ceph-libboost-atomic$ver-dev \
|
||||
ceph-libboost-chrono$ver-dev \
|
||||
ceph-libboost-container$ver-dev \
|
||||
ceph-libboost-context$ver-dev \
|
||||
ceph-libboost-coroutine$ver-dev \
|
||||
ceph-libboost-date-time$ver-dev \
|
||||
ceph-libboost-filesystem$ver-dev \
|
||||
ceph-libboost-iostreams$ver-dev \
|
||||
ceph-libboost-program-options$ver-dev \
|
||||
ceph-libboost-python$ver-dev \
|
||||
ceph-libboost-random$ver-dev \
|
||||
ceph-libboost-regex$ver-dev \
|
||||
ceph-libboost-system$ver-dev \
|
||||
ceph-libboost-test$ver-dev \
|
||||
ceph-libboost-thread$ver-dev \
|
||||
ceph-libboost-timer$ver-dev
|
||||
$project \
|
||||
$sha1 \
|
||||
$codename \
|
||||
check \
|
||||
ceph-libboost-atomic${boost_ver}-dev \
|
||||
ceph-libboost-chrono${boost_ver}-dev \
|
||||
ceph-libboost-container${boost_ver}-dev \
|
||||
ceph-libboost-context${boost_ver}-dev \
|
||||
ceph-libboost-coroutine${boost_ver}-dev \
|
||||
ceph-libboost-date-time${boost_ver}-dev \
|
||||
ceph-libboost-filesystem${boost_ver}-dev \
|
||||
ceph-libboost-iostreams${boost_ver}-dev \
|
||||
ceph-libboost-program-options${boost_ver}-dev \
|
||||
ceph-libboost-python${boost_ver}-dev \
|
||||
ceph-libboost-random${boost_ver}-dev \
|
||||
ceph-libboost-regex${boost_ver}-dev \
|
||||
ceph-libboost-system${boost_ver}-dev \
|
||||
ceph-libboost-test${boost_ver}-dev \
|
||||
ceph-libboost-thread${boost_ver}-dev \
|
||||
ceph-libboost-timer${boost_ver}-dev
|
||||
}
|
||||
|
||||
function install_libzbd_on_ubuntu {
|
||||
@ -310,6 +342,9 @@ else
|
||||
case "$ID" in
|
||||
debian|ubuntu|devuan|elementary)
|
||||
echo "Using apt-get to install dependencies"
|
||||
# Put this before any other invocation of apt so it can clean
|
||||
# up in a broken case.
|
||||
clean_boost_on_ubuntu
|
||||
$SUDO apt-get install -y devscripts equivs
|
||||
$SUDO apt-get install -y dpkg-dev
|
||||
ensure_python3_sphinx_on_ubuntu
|
||||
@ -319,6 +354,27 @@ else
|
||||
[ ! $NO_BOOST_PKGS ] && install_boost_on_ubuntu bionic
|
||||
$with_zbd && install_libzbd_on_ubuntu bionic
|
||||
;;
|
||||
*Jammy*)
|
||||
[ ! $NO_BOOST_PKGS ] && \
|
||||
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
libboost-atomic-dev \
|
||||
libboost-chrono-dev \
|
||||
libboost-container-dev \
|
||||
libboost-context-dev \
|
||||
libboost-coroutine-dev \
|
||||
libboost-date-time-dev \
|
||||
libboost-filesystem-dev \
|
||||
libboost-iostreams-dev \
|
||||
libboost-program-options-dev \
|
||||
libboost-python-dev \
|
||||
libboost-random-dev \
|
||||
libboost-regex-dev \
|
||||
libboost-system-dev \
|
||||
libboost-test-dev \
|
||||
libboost-thread-dev \
|
||||
libboost-timer-dev \
|
||||
gcc
|
||||
;;
|
||||
*)
|
||||
$SUDO apt-get install -y gcc
|
||||
;;
|
||||
|
@ -1,5 +1,7 @@
|
||||
# https://tracker.ceph.com/issues/45802
|
||||
# https://tracker.ceph.com/issues/61168
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(PG_AVAILABILITY\)
|
||||
- \(POOL_APP_NOT_ENABLED\)
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
|
@ -8,6 +8,9 @@ overrides:
|
||||
- slow request
|
||||
- MDS_CLIENT_LATE_RELEASE
|
||||
- t responding to mclientcaps
|
||||
- Degraded data redundancy
|
||||
- MDS_CLIENTS_LAGGY
|
||||
- Reduced data availability
|
||||
tasks:
|
||||
- cephfs_test_runner:
|
||||
fail_on_skip: false
|
||||
|
0
ceph/qa/suites/fs/mirror-ha/cephfs-mirror/+
Normal file
@ -0,0 +1,14 @@
|
||||
meta:
|
||||
- desc: create/rm volumes and set configs
|
||||
|
||||
tasks:
|
||||
- exec:
|
||||
mon.a:
|
||||
- "ceph fs volume create dc"
|
||||
- "ceph fs volume create dc-backup"
|
||||
- full_sequential_finally:
|
||||
- exec:
|
||||
mon.a:
|
||||
- ceph config set mon mon_allow_pool_delete true
|
||||
- ceph fs volume rm dc --yes-i-really-mean-it
|
||||
- ceph fs volume rm dc-backup --yes-i-really-mean-it
|
@ -8,10 +8,6 @@ overrides:
|
||||
debug client: 10
|
||||
|
||||
tasks:
|
||||
- exec:
|
||||
client.1:
|
||||
- "ceph fs volume create dc"
|
||||
- "ceph fs volume create dc-backup"
|
||||
- ceph-fuse:
|
||||
client.1:
|
||||
cephfs_name: dc
|
||||
|
@ -11,3 +11,4 @@ overrides:
|
||||
- has not responded to cap revoke by MDS for over
|
||||
- MDS_CLIENT_LATE_RELEASE
|
||||
- responding to mclientcaps
|
||||
- RECENT_CRASH
|
||||
|
@ -1,10 +1,12 @@
|
||||
meta:
|
||||
- desc: 1 ceph cluster with 1 mon, 1 mgr, 3 osds, 1 mds
|
||||
- desc: 1 ceph cluster with 1 mon, 1 mgr, 3 osds, 2 mds, 2 clients
|
||||
roles:
|
||||
- - mon.a
|
||||
- mgr.x
|
||||
- mds.a
|
||||
- mds.b
|
||||
- osd.0
|
||||
- osd.1
|
||||
- osd.2
|
||||
- client.0
|
||||
- client.1
|
||||
|
@ -5,3 +5,4 @@ tasks:
|
||||
- tasks.cephfs.test_volumes.TestVolumes
|
||||
- tasks.cephfs.test_volumes.TestSubvolumeGroups
|
||||
- tasks.cephfs.test_volumes.TestSubvolumes
|
||||
- tasks.cephfs.test_subvolume.TestSubvolume
|
||||
|
0
ceph/qa/suites/fs/workload/subvolume/$
Normal file
@ -0,0 +1,11 @@
|
||||
overrides:
|
||||
ceph:
|
||||
subvols:
|
||||
create: 2
|
||||
subvol_options: "--namespace-isolated --size 25000000000"
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
mount_subvol_num: 0
|
||||
kclient:
|
||||
client.0:
|
||||
mount_subvol_num: 1
|
@ -0,0 +1,11 @@
|
||||
overrides:
|
||||
ceph:
|
||||
subvols:
|
||||
create: 2
|
||||
subvol_options: "--namespace-isolated"
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
mount_subvol_num: 0
|
||||
kclient:
|
||||
client.0:
|
||||
mount_subvol_num: 1
|
@ -0,0 +1,10 @@
|
||||
overrides:
|
||||
ceph:
|
||||
subvols:
|
||||
create: 2
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
mount_subvol_num: 0
|
||||
kclient:
|
||||
client.0:
|
||||
mount_subvol_num: 1
|
11
ceph/qa/suites/fs/workload/subvolume/with-quota.yaml
Normal file
@ -0,0 +1,11 @@
|
||||
overrides:
|
||||
ceph:
|
||||
subvols:
|
||||
create: 2
|
||||
subvol_options: "--size 25000000000"
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
mount_subvol_num: 0
|
||||
kclient:
|
||||
client.0:
|
||||
mount_subvol_num: 1
|
@ -0,0 +1,12 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- pv
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
all:
|
||||
- rbd/diff_continuous.sh
|
||||
env:
|
||||
RBD_DEVICE_TYPE: "krbd"
|
1
ceph/qa/suites/orch/cephadm/workunits/task/.qa
Symbolic link
@ -0,0 +1 @@
|
||||
../.qa/
|
@ -0,0 +1 @@
|
||||
../.qa/
|
@ -0,0 +1 @@
|
||||
.qa/distros/podman/centos_8.stream_container_tools.yaml
|
@ -18,3 +18,4 @@ tasks:
|
||||
clients:
|
||||
client.0:
|
||||
- cephadm/test_iscsi_pids_limit.sh
|
||||
- cephadm/test_iscsi_etc_hosts.sh
|
@ -1 +0,0 @@
|
||||
../orch/rook
|
@ -16,7 +16,7 @@ override:
|
||||
ceph:
|
||||
conf:
|
||||
mon:
|
||||
osd default pool size: 3
|
||||
osd pool default size: 3
|
||||
osd min pg log entries: 5
|
||||
osd max pg log entries: 10
|
||||
tasks:
|
||||
|
@ -12,11 +12,11 @@ openstack:
|
||||
- volumes: # attached to each instance
|
||||
count: 3
|
||||
size: 10 # GB
|
||||
override:
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
mon:
|
||||
osd default pool size: 3
|
||||
osd pool default size: 3
|
||||
tasks:
|
||||
- install:
|
||||
- ceph:
|
||||
|
@ -20,6 +20,10 @@ overrides:
|
||||
debug monc: 20
|
||||
mon:
|
||||
mon warn on pool no app: false
|
||||
osd:
|
||||
osd class load list: "*"
|
||||
osd class default list: "*"
|
||||
osd client watch timeout: 120
|
||||
tasks:
|
||||
- workunit:
|
||||
timeout: 6h
|
||||
|
@ -0,0 +1,14 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_packages:
|
||||
- rbd-nbd
|
||||
extra_system_packages:
|
||||
- pv
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
client.0:
|
||||
- rbd/diff_continuous.sh
|
||||
env:
|
||||
RBD_DEVICE_TYPE: "nbd"
|
@ -8,6 +8,7 @@ tasks:
|
||||
- qemu-kvm-block-rbd
|
||||
deb:
|
||||
- qemu-block-extra
|
||||
- qemu-utils
|
||||
- ceph:
|
||||
fs: xfs
|
||||
conf:
|
||||
|
@ -8,6 +8,7 @@ tasks:
|
||||
- qemu-kvm-block-rbd
|
||||
deb:
|
||||
- qemu-block-extra
|
||||
- qemu-utils
|
||||
- ceph:
|
||||
fs: xfs
|
||||
conf:
|
||||
|
@ -8,6 +8,7 @@ tasks:
|
||||
- qemu-kvm-block-rbd
|
||||
deb:
|
||||
- qemu-block-extra
|
||||
- qemu-utils
|
||||
- ceph:
|
||||
fs: xfs
|
||||
conf:
|
||||
|
@ -8,6 +8,7 @@ tasks:
|
||||
- qemu-kvm-block-rbd
|
||||
deb:
|
||||
- qemu-block-extra
|
||||
- qemu-utils
|
||||
- ceph:
|
||||
fs: xfs
|
||||
conf:
|
||||
|
@ -18,6 +18,5 @@ overrides:
|
||||
endpoints: [c2.client.0]
|
||||
- name: test-zone3
|
||||
endpoints: [c1.client.1]
|
||||
- name: test-zone4
|
||||
endpoints: [c2.client.1]
|
||||
is_pubsub: true
|
||||
rgw-multisite-tests:
|
||||
args: [tests.py]
|
5
ceph/qa/suites/rgw/verify/tasks/versioning.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
client.0:
|
||||
- rgw/run-versioning.sh
|
@ -9,4 +9,6 @@ workload:
|
||||
clients:
|
||||
client.0:
|
||||
- cls
|
||||
env:
|
||||
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||
- print: "**** done end rados_api.yaml"
|
||||
|
@ -7,4 +7,6 @@ stress-tasks:
|
||||
clients:
|
||||
client.0:
|
||||
- cls/test_cls_rbd.sh
|
||||
env:
|
||||
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||
|
@ -7,4 +7,6 @@ first-half-tasks:
|
||||
clients:
|
||||
client.0:
|
||||
- cls/test_cls_rbd.sh
|
||||
env:
|
||||
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||
|
@ -7,4 +7,6 @@ stress-tasks:
|
||||
clients:
|
||||
client.0:
|
||||
- cls/test_cls_rbd.sh
|
||||
env:
|
||||
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||
|
@ -262,6 +262,7 @@ def ceph_log(ctx, config):
|
||||
run.wait(
|
||||
ctx.cluster.run(
|
||||
args=[
|
||||
'time',
|
||||
'sudo',
|
||||
'find',
|
||||
'/var/log/ceph',
|
||||
@ -271,10 +272,15 @@ def ceph_log(ctx, config):
|
||||
run.Raw('|'),
|
||||
'sudo',
|
||||
'xargs',
|
||||
'--max-args=1',
|
||||
'--max-procs=0',
|
||||
'--verbose',
|
||||
'-0',
|
||||
'--no-run-if-empty',
|
||||
'--',
|
||||
'gzip',
|
||||
'-5',
|
||||
'--verbose',
|
||||
'--',
|
||||
],
|
||||
wait=False,
|
||||
@ -445,6 +451,9 @@ def cephfs_setup(ctx, config):
|
||||
name = fs_config.pop('name')
|
||||
temp = deepcopy(cephfs_config)
|
||||
teuthology.deep_merge(temp, fs_config)
|
||||
subvols = config.get('subvols', None)
|
||||
if subvols:
|
||||
teuthology.deep_merge(temp, {'subvols': subvols})
|
||||
fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
|
||||
if set_allow_multifs:
|
||||
fs.set_allow_multifs()
|
||||
|
@ -524,6 +524,7 @@ def build_ceph_cluster(ctx, config):
|
||||
run.wait(
|
||||
ctx.cluster.run(
|
||||
args=[
|
||||
'time',
|
||||
'sudo',
|
||||
'find',
|
||||
'/var/log/ceph',
|
||||
@ -533,10 +534,15 @@ def build_ceph_cluster(ctx, config):
|
||||
run.Raw('|'),
|
||||
'sudo',
|
||||
'xargs',
|
||||
'--max-args=1',
|
||||
'--max-procs=0',
|
||||
'--verbose',
|
||||
'-0',
|
||||
'--no-run-if-empty',
|
||||
'--',
|
||||
'gzip',
|
||||
'-5',
|
||||
'--verbose',
|
||||
'--',
|
||||
],
|
||||
wait=False,
|
||||
|
@ -72,6 +72,20 @@ def task(ctx, config):
|
||||
mount_timeout: 120 # default is 30, give up if /sys/ is not populated
|
||||
- interactive:
|
||||
|
||||
Example that creates and mounts a subvol:
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
subvols:
|
||||
create: 2
|
||||
subvol_options: "--namespace-isolated --size 25000000000"
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
mount_subvol_num: 0
|
||||
kclient:
|
||||
client.1:
|
||||
mount_subvol_num: 1
|
||||
|
||||
:param ctx: Context
|
||||
:param config: Configuration
|
||||
"""
|
||||
|
@ -3148,11 +3148,14 @@ class CephManager:
|
||||
raise
|
||||
self.log("quorum is size %d" % size)
|
||||
|
||||
def get_mon_health(self, debug=False):
|
||||
def get_mon_health(self, debug=False, detail=False):
|
||||
"""
|
||||
Extract all the monitor health information.
|
||||
"""
|
||||
out = self.raw_cluster_cmd('health', '--format=json')
|
||||
if detail:
|
||||
out = self.raw_cluster_cmd('health', 'detail', '--format=json')
|
||||
else:
|
||||
out = self.raw_cluster_cmd('health', '--format=json')
|
||||
if debug:
|
||||
self.log('health:\n{h}'.format(h=out))
|
||||
return json.loads(out)
|
||||
|
@ -92,7 +92,7 @@ class CephTestCase(unittest.TestCase):
|
||||
|
||||
|
||||
def assert_cluster_log(self, expected_pattern, invert_match=False,
|
||||
timeout=10, watch_channel=None):
|
||||
timeout=10, watch_channel=None, present=True):
|
||||
"""
|
||||
Context manager. Assert that during execution, or up to 5 seconds later,
|
||||
the Ceph cluster log emits a message matching the expected pattern.
|
||||
@ -102,6 +102,8 @@ class CephTestCase(unittest.TestCase):
|
||||
:param watch_channel: Specifies the channel to be watched. This can be
|
||||
'cluster', 'audit', ...
|
||||
:type watch_channel: str
|
||||
:param present: Assert the log entry is present (default: True) or not (False).
|
||||
:type present: bool
|
||||
"""
|
||||
|
||||
ceph_manager = self.ceph_cluster.mon_manager
|
||||
@ -118,10 +120,13 @@ class CephTestCase(unittest.TestCase):
|
||||
self.watcher_process = ceph_manager.run_ceph_w(watch_channel)
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
fail = False
|
||||
if not self.watcher_process.finished:
|
||||
# Check if we got an early match, wait a bit if we didn't
|
||||
if self.match():
|
||||
if present and self.match():
|
||||
return
|
||||
elif not present and self.match():
|
||||
fail = True
|
||||
else:
|
||||
log.debug("No log hits yet, waiting...")
|
||||
# Default monc tick interval is 10s, so wait that long and
|
||||
@ -134,18 +139,23 @@ class CephTestCase(unittest.TestCase):
|
||||
except CommandFailedError:
|
||||
pass
|
||||
|
||||
if not self.match():
|
||||
log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
|
||||
raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
|
||||
if present and not self.match():
|
||||
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||
raise AssertionError(f"Expected log message found: '{expected_pattern}'")
|
||||
elif fail or (not present and self.match()):
|
||||
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||
raise AssertionError(f"Unexpected log message found: '{expected_pattern}'")
|
||||
|
||||
return ContextManager()
|
||||
|
||||
def wait_for_health(self, pattern, timeout):
|
||||
def wait_for_health(self, pattern, timeout, check_in_detail=None):
|
||||
"""
|
||||
Wait until 'ceph health' contains messages matching the pattern
|
||||
Also check if @check_in_detail matches detailed health messages
|
||||
only when @pattern is a code string.
|
||||
"""
|
||||
def seen_health_warning():
|
||||
health = self.ceph_cluster.mon_manager.get_mon_health()
|
||||
health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=bool(check_in_detail))
|
||||
codes = [s for s in health['checks']]
|
||||
summary_strings = [s[1]['summary']['message'] for s in health['checks'].items()]
|
||||
if len(summary_strings) == 0:
|
||||
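
A hedged sketch of how a test might use the new `present` flag; the subclass, pattern, and workload below are examples only, not code from this change.

    # Example only; assumes the import path used inside the teuthology qa tree.
    from tasks.ceph_test_case import CephTestCase

    class ExampleTest(CephTestCase):
        def test_no_unexpected_warning(self):
            # With present=False the context manager raises AssertionError if
            # the pattern *does* appear in the cluster log within the timeout.
            with self.assert_cluster_log("Health check failed", present=False, timeout=30):
                self.ceph_cluster.mon_manager.raw_cluster_cmd('df')  # any benign workload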
@@ -156,7 +166,16 @@ class CephTestCase(unittest.TestCase):
                if pattern in ss:
                    return True
            if pattern in codes:
                return True
                if not check_in_detail:
                    return True
                # check if the string is in detail list if asked
                detail_strings = [ss['message'] for ss in \
                     [s for s in health['checks'][pattern]['detail']]]
                log.debug(f'detail_strings: {detail_strings}')
                for ds in detail_strings:
                    if check_in_detail in ds:
                        return True
                log.debug(f'detail string "{check_in_detail}" not found')

            log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
            return False
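
A corresponding sketch for `wait_for_health`: `check_in_detail` is only consulted when the pattern matches a health-check code, in which case that check's detail messages are searched for the substring. The health code and substring below are example values.

    # Inside a CephTestCase-derived test; example values only.
    def test_waits_for_detailed_damage(self):
        # Blocks until the MDS_DAMAGE check is raised and one of its detail
        # messages contains the substring, or the timeout expires.
        self.wait_for_health("MDS_DAMAGE", timeout=60, check_in_detail="damage")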
@@ -257,6 +257,7 @@ def ceph_log(ctx, config):
        run.wait(
            ctx.cluster.run(
                args=[
                    'time',
                    'sudo',
                    'find',
                    '/var/log/ceph',  # all logs, not just for the cluster
@@ -267,10 +268,15 @@ def ceph_log(ctx, config):
                    run.Raw('|'),
                    'sudo',
                    'xargs',
                    '--max-args=1',
                    '--max-procs=0',
                    '--verbose',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '-5',
                    '--verbose',
                    '--',
                ],
                wait=False,
@@ -818,7 +824,6 @@ def ceph_mdss(ctx, config):

    yield


@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
@@ -163,7 +163,7 @@ class CephFSTestCase(CephTestCase):
        # In case some test messed with auth caps, reset them
        for client_id in client_mount_ids:
            cmd = ['auth', 'caps', f'client.{client_id}', 'mon', 'allow r',
                   'osd', f'allow rw pool={self.fs.get_data_pool_name()}',
                   'osd', f'allow rw tag cephfs data={self.fs.name}',
                   'mds', 'allow']

            if self.run_cluster_cmd_result(cmd) == 0:
@@ -369,6 +369,9 @@ class MDSCluster(CephCluster):
        """
        self.mds_daemons[mds_id].signal(sig, silent);

    def mds_is_running(self, mds_id):
        return self.mds_daemons[mds_id].running()

    def newfs(self, name='cephfs', create=True):
        return Filesystem(self._ctx, name=name, create=create)

@@ -748,6 +751,7 @@ class Filesystem(MDSCluster):
                raise

        if self.fs_config is not None:
            log.debug(f"fs_config: {self.fs_config}")
            max_mds = self.fs_config.get('max_mds', 1)
            if max_mds > 1:
                self.set_max_mds(max_mds)
@@ -760,6 +764,34 @@
            if session_timeout != 60:
                self.set_session_timeout(session_timeout)

            if self.fs_config.get('subvols', None) is not None:
                log.debug(f"Creating {self.fs_config.get('subvols')} subvols "
                          f"for filesystem '{self.name}'")
                if not hasattr(self._ctx, "created_subvols"):
                    self._ctx.created_subvols = dict()

                subvols = self.fs_config.get('subvols')
                assert(isinstance(subvols, dict))
                assert(isinstance(subvols['create'], int))
                assert(subvols['create'] > 0)

                for sv in range(0, subvols['create']):
                    sv_name = f'sv_{sv}'
                    self.mon_manager.raw_cluster_cmd(
                        'fs', 'subvolume', 'create', self.name, sv_name,
                        self.fs_config.get('subvol_options', ''))

                    if self.name not in self._ctx.created_subvols:
                        self._ctx.created_subvols[self.name] = []

                    subvol_path = self.mon_manager.raw_cluster_cmd(
                        'fs', 'subvolume', 'getpath', self.name, sv_name)
                    subvol_path = subvol_path.strip()
                    self._ctx.created_subvols[self.name].append(subvol_path)
            else:
                log.debug(f"Not Creating any subvols for filesystem '{self.name}'")


        self.getinfo(refresh = True)

        # wait pgs to be clean
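
Each loop iteration above issues the same mon commands an operator could run by hand. A rough stand-alone equivalent, with the filesystem name and options taken from the docstring example earlier in this change (treat them as placeholders):

    # Rough CLI equivalent of one iteration of the loop above; names and
    # options are placeholders taken from the docstring example.
    import subprocess

    fs_name, sv_name = 'cephfs', 'sv_0'
    subprocess.check_call(['ceph', 'fs', 'subvolume', 'create', fs_name, sv_name,
                           '--namespace-isolated', '--size', '25000000000'])
    path = subprocess.check_output(['ceph', 'fs', 'subvolume', 'getpath',
                                    fs_name, sv_name]).decode().strip()
    print(path)  # recorded in ctx.created_subvols[fs_name] by the code above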
@@ -1090,6 +1122,10 @@ class Filesystem(MDSCluster):
    def rank_fail(self, rank=0):
        self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))

    def rank_is_running(self, rank=0, status=None):
        name = self.get_rank(rank=rank, status=status)['name']
        return self.mds_is_running(name)

    def get_ranks(self, status=None):
        if status is None:
            status = self.getinfo()
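
A hedged, simplified sketch of how the new `rank_is_running()` helper might be combined with `rank_fail()`, assuming the generic `wait_until_true()` polling helper already used by these tests; the timeout is an example value and failover races are ignored for brevity.

    # Inside a cephfs test: fail rank 0, then poll until the rank reports a
    # running MDS again.
    def test_rank_comes_back(self):
        self.fs.rank_fail(rank=0)
        self.wait_until_true(lambda: self.fs.rank_is_running(rank=0), timeout=120)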
@@ -1537,7 +1573,7 @@ class Filesystem(MDSCluster):
        if quiet:
            base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
        else:
            base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
            base_args = [os.path.join(self._prefix, tool), '--debug-mds=20', '--debug-ms=1', '--debug-objecter=1']

        if rank is not None:
            base_args.extend(["--rank", "%s" % str(rank)])
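
For illustration, the non-quiet branch now enables mds, messenger, and objecter debugging for the offline tool. A possible resulting invocation is sketched below; the tool name, prefix, rank value, and subcommand are placeholders, not values fixed by this change.

    # Placeholder values throughout; this only mirrors how the argument list
    # is assembled by the code above.
    import os

    prefix, tool, rank = '/usr/bin', 'cephfs-journal-tool', 'cephfs:0'
    base_args = [os.path.join(prefix, tool), '--debug-mds=20', '--debug-ms=1',
                 '--debug-objecter=1']
    if rank is not None:
        base_args.extend(["--rank", "%s" % str(rank)])
    print(' '.join(base_args + ['journal', 'inspect']))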
Some files were not shown because too many files have changed in this diff.