mirror of https://git.proxmox.com/git/ceph.git
synced 2025-08-04 08:46:12 +00:00

import ceph pacific 16.2.14 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>

This commit is contained in:
parent b81a1d7f97
commit a2f5a7e755
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
 # remove cmake/modules/FindPython* once 3.12 is required

 project(ceph
-  VERSION 16.2.13
+  VERSION 16.2.14
   LANGUAGES CXX C ASM)

 foreach(policy
@@ -32,6 +32,17 @@
 in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
 the restored file system is expected to have the same ID as before.

+>= 16.2.14
+----------
+
+* CEPHFS: After recovering a Ceph File System post following the disaster recovery
+  procedure, the recovered files under `lost+found` directory can now be deleted.
+
+* `ceph mgr dump` command now displays the name of the mgr module that
+  registered a RADOS client in the `name` field added to elements of the
+  `active_clients` array. Previously, only the address of a module's RADOS
+  client was shown in the `active_clients` array.
+
 >=16.2.12
 ---------

@@ -62,6 +73,65 @@
 namespaces was added to RBD in Nautilus 14.2.0 and it has been possible to
 map and unmap images in namespaces using the `image-spec` syntax since then
 but the corresponding option available in most other commands was missing.
+* RGW: Compression is now supported for objects uploaded with Server-Side Encryption.
+  When both are enabled, compression is applied before encryption.
+* RGW: the "pubsub" functionality for storing bucket notifications inside Ceph
+  is removed. Together with it, the "pubsub" zone should not be used anymore.
+  The REST operations, as well as radosgw-admin commands for manipulating
+  subscriptions, as well as fetching and acking the notifications are removed
+  as well.
+  In case that the endpoint to which the notifications are sent maybe down or
+  disconnected, it is recommended to use persistent notifications to guarantee
+  the delivery of the notifications. In case the system that consumes the
+  notifications needs to pull them (instead of the notifications be pushed
+  to it), an external message bus (e.g. rabbitmq, Kafka) should be used for
+  that purpose.
+* RGW: The serialized format of notification and topics has changed, so that
+  new/updated topics will be unreadable by old RGWs. We recommend completing
+  the RGW upgrades before creating or modifying any notification topics.
+* RBD: Trailing newline in passphrase files (`<passphrase-file>` argument in
+  `rbd encryption format` command and `--encryption-passphrase-file` option
+  in other commands) is no longer stripped.
+* RBD: Support for layered client-side encryption is added. Cloned images
+  can now be encrypted each with its own encryption format and passphrase,
+  potentially different from that of the parent image. The efficient
+  copy-on-write semantics intrinsic to unformatted (regular) cloned images
+  are retained.
+* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to
+  `client_max_retries_on_remount_failure` and move it from mds.yaml.in to
+  mds-client.yaml.in because this option was only used by MDS client from its
+  birth.
+* The `perf dump` and `perf schema` commands are deprecated in favor of new
+  `counter dump` and `counter schema` commands. These new commands add support
+  for labeled perf counters and also emit existing unlabeled perf counters. Some
+  unlabeled perf counters became labeled in this release, with more to follow in
+  future releases; such converted perf counters are no longer emitted by the
+  `perf dump` and `perf schema` commands.
+* `ceph mgr dump` command now outputs `last_failure_osd_epoch` and
+  `active_clients` fields at the top level. Previously, these fields were
+  output under `always_on_modules` field.
+* RBD: All rbd-mirror daemon perf counters became labeled and as such are now
+  emitted only by the new `counter dump` and `counter schema` commands. As part
+  of the conversion, many also got renamed to better disambiguate journal-based
+  and snapshot-based mirroring.
+* RBD: list-watchers C++ API (`Image::list_watchers`) now clears the passed
+  `std::list` before potentially appending to it, aligning with the semantics
+  of the corresponding C API (`rbd_watchers_list`).
+* Telemetry: Users who are opted-in to telemetry can also opt-in to
+  participating in a leaderboard in the telemetry public
+  dashboards (https://telemetry-public.ceph.com/). Users can now also add a
+  description of the cluster to publicly appear in the leaderboard.
+  For more details, see:
+  https://docs.ceph.com/en/latest/mgr/telemetry/#leaderboard
+  See a sample report with `ceph telemetry preview`.
+  Opt-in to telemetry with `ceph telemetry on`.
+  Opt-in to the leaderboard with
+  `ceph config set mgr mgr/telemetry/leaderboard true`.
+  Add leaderboard description with:
+  `ceph config set mgr mgr/telemetry/leaderboard_description ‘Cluster description’`.
+* CEPHFS: After recovering a Ceph File System post following the disaster recovery
+  procedure, the recovered files under `lost+found` directory can now be deleted.
+* core: cache-tiering is now deprecated.

 >=16.2.8
 --------
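The notes above describe the new `counter dump`/`counter schema` commands and the telemetry leaderboard opt-in only in prose. A minimal illustration, assuming the new commands are exposed through the daemon admin socket in the same way as the deprecated `perf dump` (the daemon name ``osd.0`` is only an example)::

    # inspect labeled perf counters on one OSD (daemon name assumed)
    ceph daemon osd.0 counter schema
    ceph daemon osd.0 counter dump

    # opt in to telemetry and the public leaderboard, as described in the notes
    ceph telemetry preview
    ceph telemetry on
    ceph config set mgr mgr/telemetry/leaderboard true
    ceph config set mgr mgr/telemetry/leaderboard_description 'Cluster description'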
@@ -135,7 +135,7 @@
 # main package definition
 #################################################################################
 Name: ceph
-Version: 16.2.13
+Version: 16.2.14
 Release: 0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch: 2

@@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
 Group: System/Filesystems
 %endif
 URL: http://ceph.com/
-Source0: %{?_remote_tarball_prefix}ceph-16.2.13.tar.bz2
+Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
 %if 0%{?suse_version}
 # _insert_obs_source_lines_here
 ExclusiveArch: x86_64 aarch64 ppc64le s390x

@@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-16.2.13
+%autosetup -p1 -n ceph-16.2.14

 %build
 # Disable lto on systems that do not support symver attribute

@@ -1,7 +1,13 @@
-ceph (16.2.13-1focal) focal; urgency=medium
+ceph (16.2.14-1focal) focal; urgency=medium


- -- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com>  Mon, 08 May 2023 20:49:59 +0000
+ -- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com>  Tue, 29 Aug 2023 16:38:35 +0000
+
+ceph (16.2.14-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.io>  Tue, 29 Aug 2023 15:43:56 +0000

 ceph (16.2.13-1) stable; urgency=medium
@@ -1 +1,3 @@
+lib/systemd/system/cephfs-mirror*
 usr/bin/cephfs-mirror
+usr/share/man/man8/cephfs-mirror.8
@@ -43,17 +43,17 @@ monitor hosts as well as to the monitor daemons' stderr.
 Ceph daemon logs
 ================

-Logging to journald
--------------------
+Logging to stdout
+-----------------

-Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph daemons log to
-journald by default and Ceph logs are captured by the container runtime
-environment. They are accessible via ``journalctl``.
+Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph
+daemons log to stderr by default and Ceph logs are captured by the
+container runtime environment. By default, most systems send these
+logs to journald, which means that they are accessible via
+``journalctl``.

-.. note:: Prior to Quincy, ceph daemons logged to stderr.
-
-Example of logging to journald
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Example of logging to stdout
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 For example, to view the logs for the daemon ``mon.foo`` for a cluster
 with ID ``5c5a50ae-272a-455d-99e9-32c6a013e694``, the command would be

@@ -69,11 +69,11 @@ Logging to files
 ----------------

 You can also configure Ceph daemons to log to files instead of to
-journald if you prefer logs to appear in files (as they did in earlier,
+stderr if you prefer logs to appear in files (as they did in earlier,
 pre-cephadm, pre-Octopus versions of Ceph). When Ceph logs to files,
 the logs appear in ``/var/log/ceph/<cluster-fsid>``. If you choose to
-configure Ceph to log to files instead of to journald, remember to
-configure Ceph so that it will not log to journald (the commands for
+configure Ceph to log to files instead of to stderr, remember to
+configure Ceph so that it will not log to stderr (the commands for
 this are covered below).

 Enabling logging to files

@@ -86,10 +86,10 @@ To enable logging to files, run the following commands:
 ceph config set global log_to_file true
 ceph config set global mon_cluster_log_to_file true

-Disabling logging to journald
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Disabling logging to stderr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~

-If you choose to log to files, we recommend disabling logging to journald or else
+If you choose to log to files, we recommend disabling logging to stderr or else
 everything will be logged twice. Run the following commands to disable logging
 to stderr:

@@ -97,11 +97,6 @@ to stderr:

 ceph config set global log_to_stderr false
 ceph config set global mon_cluster_log_to_stderr false
-ceph config set global log_to_journald false
-ceph config set global mon_cluster_log_to_journald false
-
-.. note:: You can change the default by passing --log-to-file during
-          bootstrapping a new cluster.

 Modifying the log retention schedule
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
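The first hunk above cuts off before the example command for viewing the logs of ``mon.foo``. A minimal sketch, assuming cephadm's usual systemd unit naming of ``ceph-<cluster-fsid>@<daemon>`` (the fsid is the example one quoted in the text)::

    # view journald-captured logs for mon.foo of the example cluster
    journalctl -u ceph-5c5a50ae-272a-455d-99e9-32c6a013e694@mon.foo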
@@ -558,6 +558,7 @@ For example:
 Extra Entrypoint Arguments
 ==========================

+
 .. note::

     For arguments intended for the container runtime rather than the process inside

@@ -577,6 +578,57 @@ the node-exporter service , one could apply a service spec like
     extra_entrypoint_args:
       - "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2"

+Custom Config Files
+===================
+
+Cephadm supports specifying miscellaneous config files for daemons.
+To do so, users must provide both the content of the config file and the
+location within the daemon's container at which it should be mounted. After
+applying a YAML spec with custom config files specified and having cephadm
+redeploy the daemons for which the config files are specified, these files will
+be mounted within the daemon's container at the specified location.
+
+Example service spec:
+
+.. code-block:: yaml
+
+    service_type: grafana
+    service_name: grafana
+    custom_configs:
+      - mount_path: /etc/example.conf
+        content: |
+          setting1 = value1
+          setting2 = value2
+      - mount_path: /usr/share/grafana/example.cert
+        content: |
+          -----BEGIN PRIVATE KEY-----
+          V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt
+          ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15
+          IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu
+          YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg
+          ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8=
+          -----END PRIVATE KEY-----
+          -----BEGIN CERTIFICATE-----
+          V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt
+          ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15
+          IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu
+          YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg
+          ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8=
+          -----END CERTIFICATE-----
+
+To make these new config files actually get mounted within the
+containers for the daemons
+
+.. prompt:: bash
+
+   ceph orch redeploy <service-name>
+
+For example:
+
+.. prompt:: bash
+
+   ceph orch redeploy grafana
+
 .. _orch-rm:

 Removing a Service
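As a usage sketch of the custom-config section above (the file name ``grafana-custom.yaml`` is arbitrary), the spec from the hunk could be written out, applied, and the daemon redeployed like this::

    # write a spec like the one shown above to a file, then apply it and redeploy
    cat > grafana-custom.yaml <<'EOF'
    service_type: grafana
    service_name: grafana
    custom_configs:
      - mount_path: /etc/example.conf
        content: |
          setting1 = value1
          setting2 = value2
    EOF
    ceph orch apply -i grafana-custom.yaml
    ceph orch redeploy grafana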
@@ -299,13 +299,16 @@ and the metrics will not be visible in Prometheus.
 Setting up Prometheus
 -----------------------

-Setting Prometheus Retention Time
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Setting Prometheus Retention Size and Time
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Cephadm provides the option to set the Prometheus TDSB retention time using
-a ``retention_time`` field in the Prometheus service spec. The value defaults
-to 15 days (15d). If you would like a different value, such as 1 year (1y) you
-can apply a service spec similar to:
+Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
+and ``retention_size`` values in the Prometheus service spec.
+The retention time value defaults to 15 days (15d). Users can set a different value/unit where
+supported units are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults
+to 0 (disabled). Supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
+
+In the following example spec we set the retention time to 1 year and the size to 1GB.

 .. code-block:: yaml

@@ -314,6 +317,7 @@ can apply a service spec similar to:
      count: 1
    spec:
      retention_time: "1y"
+     retention_size: "1GB"

 .. note::
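For reference, a complete Prometheus spec using both retention fields might look like the following; the ``service_type`` and ``placement`` lines are assumed, since the hunks above only show the tail of the example spec::

    cat > prometheus.yaml <<'EOF'
    service_type: prometheus
    placement:
      count: 1
    spec:
      retention_time: "1y"
      retention_size: "1GB"
    EOF
    ceph orch apply -i prometheus.yaml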
@@ -308,7 +308,7 @@ Replacing an OSD

 .. prompt:: bash #

-  orch osd rm <osd_id(s)> --replace [--force]
+  ceph orch osd rm <osd_id(s)> --replace [--force]

 Example:
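The concrete example that follows this hunk is not shown; a minimal sketch with a hypothetical OSD id::

    # mark OSD 4 for replacement, keeping its ID for the replacement disk (id 4 is illustrative)
    ceph orch osd rm 4 --replace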
|
@ -14,6 +14,8 @@ Requirements
|
|||||||
|
|
||||||
The primary (local) and secondary (remote) Ceph clusters version should be Pacific or later.
|
The primary (local) and secondary (remote) Ceph clusters version should be Pacific or later.
|
||||||
|
|
||||||
|
.. _cephfs_mirroring_creating_users:
|
||||||
|
|
||||||
Creating Users
|
Creating Users
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
@@ -42,80 +44,155 @@ Mirror daemon should be spawned using `systemctl(1)` unit files::

     $ cephfs-mirror --id mirror --cluster site-a -f

-.. note:: User used here is `mirror` created in the `Creating Users` section.
+.. note:: The user specified here is `mirror`, the creation of which is
+          described in the :ref:`Creating Users<cephfs_mirroring_creating_users>`
+          section.
+
+Multiple ``cephfs-mirror`` daemons may be deployed for concurrent
+synchronization and high availability. Mirror daemons share the synchronization
+load using a simple ``M/N`` policy, where ``M`` is the number of directories
+and ``N`` is the number of ``cephfs-mirror`` daemons.
+
+When ``cephadm`` is used to manage a Ceph cluster, ``cephfs-mirror`` daemons can be
+deployed by running the following command:
+
+.. prompt:: bash $
+
+   ceph orch apply cephfs-mirror
+
+To deploy multiple mirror daemons, run a command of the following form:
+
+.. prompt:: bash $
+
+   ceph orch apply cephfs-mirror --placement=<placement-spec>
+
+For example, to deploy 3 `cephfs-mirror` daemons on different hosts, run a command of the following form:
+
+.. prompt:: bash $
+
+   $ ceph orch apply cephfs-mirror --placement="3 host1,host2,host3"

 Interface
 ---------

-`Mirroring` module (manager plugin) provides interfaces for managing directory snapshot
-mirroring. Manager interfaces are (mostly) wrappers around monitor commands for managing
-file system mirroring and is the recommended control interface.
+The `Mirroring` module (manager plugin) provides interfaces for managing
+directory snapshot mirroring. These are (mostly) wrappers around monitor
+commands for managing file system mirroring and is the recommended control
+interface.

 Mirroring Module
 ----------------

-The mirroring module is responsible for assigning directories to mirror daemons for
-synchronization. Multiple mirror daemons can be spawned to achieve concurrency in
-directory snapshot synchronization. When mirror daemons are spawned (or terminated)
-, the mirroring module discovers the modified set of mirror daemons and rebalances
-the directory assignment amongst the new set thus providing high-availability.
+The mirroring module is responsible for assigning directories to mirror daemons
+for synchronization. Multiple mirror daemons can be spawned to achieve
+concurrency in directory snapshot synchronization. When mirror daemons are
+spawned (or terminated), the mirroring module discovers the modified set of
+mirror daemons and rebalances directory assignments across the new set, thus
+providing high-availability.

-.. note:: Multiple mirror daemons is currently untested. Only a single mirror daemon
-          is recommended.
+.. note:: Deploying a single mirror daemon is recommended. Running multiple
+          daemons is untested.

-Mirroring module is disabled by default. To enable mirroring use::
+The mirroring module is disabled by default. To enable the mirroring module,
+run the following command:

-    $ ceph mgr module enable mirroring
+.. prompt:: bash $

-Mirroring module provides a family of commands to control mirroring of directory
-snapshots. To add or remove directories, mirroring needs to be enabled for a given
-file system. To enable mirroring use::
+   ceph mgr module enable mirroring

-    $ ceph fs snapshot mirror enable <fs_name>
+The mirroring module provides a family of commands that can be used to control
+the mirroring of directory snapshots. To add or remove directories, mirroring
+must be enabled for a given file system. To enable mirroring for a given file
+system, run a command of the following form:

-.. note:: Mirroring module commands use `fs snapshot mirror` prefix as compared to
-          the monitor commands which `fs mirror` prefix. Make sure to use module
-          commands.
+.. prompt:: bash $

-To disable mirroring, use::
+   ceph fs snapshot mirror enable <fs_name>

-    $ ceph fs snapshot mirror disable <fs_name>
+.. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``.
+          This distinguishes them from "monitor commands", which are prefixed with ``fs
+          mirror``. Be sure (in this context) to use module commands.

-Once mirroring is enabled, add a peer to which directory snapshots are to be mirrored.
-Peers follow `<client>@<cluster>` specification and get assigned a unique-id (UUID)
-when added. See `Creating Users` section on how to create Ceph users for mirroring.
+To disable mirroring for a given file system, run a command of the following form:

-To add a peer use::
+.. prompt:: bash $

-    $ ceph fs snapshot mirror peer_add <fs_name> <remote_cluster_spec> [<remote_fs_name>] [<remote_mon_host>] [<cephx_key>]
+   ceph fs snapshot mirror disable <fs_name>

-`<remote_fs_name>` is optional, and defaults to `<fs_name>` (on the remote cluster).
+After mirroring is enabled, add a peer to which directory snapshots are to be
+mirrored. Peers are specified by the ``<client>@<cluster>`` format, which is
+referred to elsewhere in this document as the ``remote_cluster_spec``. Peers
+are assigned a unique-id (UUID) when added. See the :ref:`Creating
+Users<cephfs_mirroring_creating_users>` section for instructions that describe
+how to create Ceph users for mirroring.

-This requires the remote cluster ceph configuration and user keyring to be available in
-the primary cluster. See `Bootstrap Peers` section to avoid this. `peer_add` additionally
-supports passing the remote cluster monitor address and the user key. However, bootstrapping
-a peer is the recommended way to add a peer.
+To add a peer, run a command of the following form:
+
+.. prompt:: bash $
+
+   ceph fs snapshot mirror peer_add <fs_name> <remote_cluster_spec> [<remote_fs_name>] [<remote_mon_host>] [<cephx_key>]
+
+``<remote_cluster_spec>`` is of the format ``client.<id>@<cluster_name>``.
+
+``<remote_fs_name>`` is optional, and defaults to `<fs_name>` (on the remote
+cluster).
+
+For this command to succeed, the remote cluster's Ceph configuration and user
+keyring must be available in the primary cluster. For example, if a user named
+``client_mirror`` is created on the remote cluster which has ``rwps``
+permissions for the remote file system named ``remote_fs`` (see `Creating
+Users`) and the remote cluster is named ``remote_ceph`` (that is, the remote
+cluster configuration file is named ``remote_ceph.conf`` on the primary
+cluster), run the following command to add the remote filesystem as a peer to
+the primary filesystem ``primary_fs``:
+
+.. prompt:: bash $
+
+   ceph fs snapshot mirror peer_add primary_fs client.mirror_remote@remote_ceph remote_fs
+
+To avoid having to maintain the remote cluster configuration file and remote
+ceph user keyring in the primary cluster, users can bootstrap a peer (which
+stores the relevant remote cluster details in the monitor config store on the
+primary cluster). See the :ref:`Bootstrap
+Peers<cephfs_mirroring_bootstrap_peers>` section.
+
+The ``peer_add`` command supports passing the remote cluster monitor address
+and the user key. However, bootstrapping a peer is the recommended way to add a
+peer.

 .. note:: Only a single peer is supported right now.

-To remove a peer use::
+To remove a peer, run a command of the following form:

-    $ ceph fs snapshot mirror peer_remove <fs_name> <peer_uuid>
+.. prompt:: bash $

-To list file system mirror peers use::
+   ceph fs snapshot mirror peer_remove <fs_name> <peer_uuid>

-    $ ceph fs snapshot mirror peer_list <fs_name>
+To list file system mirror peers, run a command of the following form:

-To configure a directory for mirroring, use::
+.. prompt:: bash $

-    $ ceph fs snapshot mirror add <fs_name> <path>
+   ceph fs snapshot mirror peer_list <fs_name>

-To stop a mirroring directory snapshots use::
+To configure a directory for mirroring, run a command of the following form:

-    $ ceph fs snapshot mirror remove <fs_name> <path>
+.. prompt:: bash $

-Only absolute directory paths are allowed. Also, paths are normalized by the mirroring
-module, therfore, `/a/b/../b` is equivalent to `/a/b`.
+   ceph fs snapshot mirror add <fs_name> <path>
+
+To stop mirroring directory snapshots, run a command of the following form:
+
+.. prompt:: bash $
+
+   ceph fs snapshot mirror remove <fs_name> <path>
+
+Only absolute directory paths are allowed.
+
+Paths are normalized by the mirroring module. This means that ``/a/b/../b`` is
+equivalent to ``/a/b``. Paths always start from the CephFS file-system root and
+not from the host system mount point.
+
+For example::

     $ mkdir -p /d0/d1/d2
     $ ceph fs snapshot mirror add cephfs /d0/d1/d2
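Read together, the mirroring sections above amount to a short setup sequence. A sketch under assumptions: the primary file system is named ``cephfs``, the remote file system ``backup_fs``, and the ``peer_bootstrap create`` syntax (file system, client entity, site name) is taken from the part of this document that the hunk does not show::

    # primary cluster: enable the mirroring module and mirroring for the file system
    ceph mgr module enable mirroring
    ceph fs snapshot mirror enable cephfs

    # secondary cluster (with the mirroring module enabled there as well): create a bootstrap token
    ceph fs snapshot mirror peer_bootstrap create backup_fs client.mirror_remote site-remote

    # primary cluster: import the token, then pick a directory to mirror
    ceph fs snapshot mirror peer_bootstrap import cephfs <token>
    ceph fs snapshot mirror add cephfs /d0/d1/d2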
@@ -123,16 +200,19 @@ module, therfore, `/a/b/../b` is equivalent to `/a/b`.
     $ ceph fs snapshot mirror add cephfs /d0/d1/../d1/d2
     Error EEXIST: directory /d0/d1/d2 is already tracked

-Once a directory is added for mirroring, its subdirectory or ancestor directories are
-disallowed to be added for mirorring::
+After a directory is added for mirroring, the additional mirroring of
+subdirectories or ancestor directories is disallowed::

     $ ceph fs snapshot mirror add cephfs /d0/d1
     Error EINVAL: /d0/d1 is a ancestor of tracked path /d0/d1/d2
     $ ceph fs snapshot mirror add cephfs /d0/d1/d2/d3
     Error EINVAL: /d0/d1/d2/d3 is a subtree of tracked path /d0/d1/d2

-Commands to check directory mapping (to mirror daemons) and directory distribution are
-detailed in `Mirroring Status` section.
+The :ref:`Mirroring Status<cephfs_mirroring_mirroring_status>` section contains
+information about the commands for checking the directory mapping (to mirror
+daemons) and for checking the directory distribution.
+
+.. _cephfs_mirroring_bootstrap_peers:

 Bootstrap Peers
 ---------------

@@ -160,6 +240,9 @@ e.g.::

     $ ceph fs snapshot mirror peer_bootstrap import cephfs eyJmc2lkIjogIjBkZjE3MjE3LWRmY2QtNDAzMC05MDc5LTM2Nzk4NTVkNDJlZiIsICJmaWxlc3lzdGVtIjogImJhY2t1cF9mcyIsICJ1c2VyIjogImNsaWVudC5taXJyb3JfcGVlcl9ib290c3RyYXAiLCAic2l0ZV9uYW1lIjogInNpdGUtcmVtb3RlIiwgImtleSI6ICJBUUFhcDBCZ0xtRmpOeEFBVnNyZXozai9YYUV0T2UrbUJEZlJDZz09IiwgIm1vbl9ob3N0IjogIlt2MjoxOTIuMTY4LjAuNTo0MDkxOCx2MToxOTIuMTY4LjAuNTo0MDkxOV0ifQ==

+
+.. _cephfs_mirroring_mirroring_status:
+
 Mirroring Status
 ----------------
@@ -78,7 +78,15 @@ By default, `cephfs-top` connects to cluster name `ceph`. To use a non-default c

     $ cephfs-top -d <seconds>

-Interval should be greater or equal to 0.5 second. Fractional seconds are honoured.
+Refresh interval should be a positive integer.
+
+To dump the metrics to stdout without creating a curses display use::
+
+    $ cephfs-top --dump
+
+To dump the metrics of the given filesystem to stdout without creating a curses display use::
+
+    $ cephfs-top --dumpfs <fs_name>

 Interactive Commands
 --------------------

@@ -104,3 +112,5 @@ The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End an
 Sample screenshot running `cephfs-top` with 2 filesystems:

 .. image:: cephfs-top.png
+
+.. note:: Minimum compatible python version for cephfs-top is 3.6.0. cephfs-top is supported on distros RHEL 8, Ubuntu 18.04, CentOS 8 and above.
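An illustration combining the ``cephfs-top`` options discussed above; the cluster name, client id and file system name are hypothetical, and the ``--cluster``/``--id`` options are assumed to behave as in the surrounding documentation::

    # non-default cluster name, dedicated stats client, and a 5-second refresh
    cephfs-top --cluster site-a --id fstop -d 5

    # one-shot dumps without the curses UI
    cephfs-top --dump
    cephfs-top --dumpfs cephfs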
@@ -149,8 +149,8 @@ errors.

 ::

-    cephfs-data-scan scan_extents <data pool>
-    cephfs-data-scan scan_inodes <data pool>
+    cephfs-data-scan scan_extents [<data pool> [<extra data pool> ...]]
+    cephfs-data-scan scan_inodes [<data pool>]
     cephfs-data-scan scan_links

 'scan_extents' and 'scan_inodes' commands may take a *very long* time

@@ -166,22 +166,22 @@ The example below shows how to run 4 workers simultaneously:

 ::

     # Worker 0
-    cephfs-data-scan scan_extents --worker_n 0 --worker_m 4 <data pool>
+    cephfs-data-scan scan_extents --worker_n 0 --worker_m 4
     # Worker 1
-    cephfs-data-scan scan_extents --worker_n 1 --worker_m 4 <data pool>
+    cephfs-data-scan scan_extents --worker_n 1 --worker_m 4
     # Worker 2
-    cephfs-data-scan scan_extents --worker_n 2 --worker_m 4 <data pool>
+    cephfs-data-scan scan_extents --worker_n 2 --worker_m 4
     # Worker 3
-    cephfs-data-scan scan_extents --worker_n 3 --worker_m 4 <data pool>
+    cephfs-data-scan scan_extents --worker_n 3 --worker_m 4

     # Worker 0
-    cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4 <data pool>
+    cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4
     # Worker 1
-    cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4 <data pool>
+    cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4
     # Worker 2
-    cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4 <data pool>
+    cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4
     # Worker 3
-    cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4 <data pool>
+    cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4

 It is **important** to ensure that all workers have completed the
 scan_extents phase before any workers enter the scan_inodes phase.

@@ -191,8 +191,13 @@ operation to delete ancillary data geneated during recovery.

 ::

-    cephfs-data-scan cleanup <data pool>
+    cephfs-data-scan cleanup [<data pool>]
+
+Note, the data pool parameters for 'scan_extents', 'scan_inodes' and
+'cleanup' commands are optional, and usually the tool will be able to
+detect the pools automatically. Still you may override this. The
+'scan_extents' command needs all data pools to be specified, while
+'scan_inodes' and 'cleanup' commands need only the main data pool.


 Using an alternate metadata pool for recovery

@@ -250,8 +255,8 @@ Now perform the recovery of the metadata pool from the data pool:

 ::

     cephfs-data-scan init --force-init --filesystem cephfs_recovery --alternate-pool cephfs_recovery_meta
-    cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem <fs_name> <data_pool>
-    cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem <fs_name> --force-corrupt <data_pool>
+    cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem <fs_name>
+    cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem <fs_name> --force-corrupt
     cephfs-data-scan scan_links --filesystem cephfs_recovery

 .. note::
@@ -3,23 +3,22 @@
 FS volumes and subvolumes
 =========================

-The volumes
-module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a single
-source of truth for CephFS exports. The OpenStack shared
-file system service (manila_) and Ceph Container Storage Interface (CSI_)
-storage administrators among others can use the common CLI provided by the
-ceph-mgr volumes module to manage CephFS exports.
+The volumes module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a
+single source of truth for CephFS exports. The OpenStack shared file system
+service (manila_) and the Ceph Container Storage Interface (CSI_) storage
+administrators use the common CLI provided by the ceph-mgr ``volumes`` module
+to manage CephFS exports.

-The ceph-mgr volumes module implements the following file system export
-abstactions:
+The ceph-mgr ``volumes`` module implements the following file system export
+abstractions:

 * FS volumes, an abstraction for CephFS file systems

 * FS subvolumes, an abstraction for independent CephFS directory trees

 * FS subvolume groups, an abstraction for a directory level higher than FS
-  subvolumes to effect policies (e.g., :doc:`/cephfs/file-layouts`) across a
-  set of subvolumes
+  subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`)
+  across a set of subvolumes

 Some possible use-cases for the export abstractions:
|
|||||||
mon 'allow r'
|
mon 'allow r'
|
||||||
mgr 'allow rw'
|
mgr 'allow rw'
|
||||||
|
|
||||||
|
|
||||||
FS Volumes
|
FS Volumes
|
||||||
----------
|
----------
|
||||||
|
|
||||||
Create a volume using::
|
Create a volume by running the following command:
|
||||||
|
|
||||||
$ ceph fs volume create <vol_name> [<placement>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs volume create <vol_name> [<placement>]
|
||||||
|
|
||||||
This creates a CephFS file system and its data and metadata pools. It can also
|
This creates a CephFS file system and its data and metadata pools. It can also
|
||||||
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator
|
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator module (for
|
||||||
module (see :doc:`/mgr/orchestrator`), for example Rook.
|
example Rook). See :doc:`/mgr/orchestrator`.
|
||||||
|
|
||||||
<vol_name> is the volume name (an arbitrary string), and
|
``<vol_name>`` is the volume name (an arbitrary string). ``<placement>`` is an
|
||||||
<placement> is an optional string that designates the hosts that should have
|
optional string that specifies the hosts that should have an MDS running on
|
||||||
an MDS running on them and, optionally, the total number of MDS daemons the cluster
|
them and, optionally, the total number of MDS daemons that the cluster should
|
||||||
should have. For example, the
|
have. For example, the following placement string means "deploy MDS on nodes
|
||||||
following placement string means "deploy MDS on nodes ``host1`` and ``host2`` (one
|
``host1`` and ``host2`` (one MDS per host)::
|
||||||
MDS per host):
|
|
||||||
|
|
||||||
"host1,host2"
|
"host1,host2"
|
||||||
|
|
||||||
and this placement specification says to deploy two MDS daemons on each of
|
The following placement specification means "deploy two MDS daemons on each of
|
||||||
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the cluster):
|
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the
|
||||||
|
cluster)"::
|
||||||
|
|
||||||
"4 host1,host2"
|
"4 host1,host2"
|
||||||
|
|
||||||
For more details on placement specification refer to the :ref:`orchestrator-cli-service-spec`,
|
See :ref:`orchestrator-cli-service-spec` for more on placement specification.
|
||||||
but keep in mind that specifying placement via a YAML file is not supported.
|
Specifying placement via a YAML file is not supported.
|
||||||
|
|
||||||
To remove a volume, run the following command::
|
To remove a volume, run the following command:
|
||||||
|
|
||||||
$ ceph fs volume rm <vol_name> [--yes-i-really-mean-it]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs volume rm <vol_name> [--yes-i-really-mean-it]
|
||||||
|
|
||||||
This removes a file system and its data and metadata pools. It also tries to
|
This removes a file system and its data and metadata pools. It also tries to
|
||||||
remove MDS daemons using the enabled ceph-mgr orchestrator module.
|
remove MDS daemons using the enabled ceph-mgr orchestrator module.
|
||||||
|
|
||||||
List volumes using::
|
List volumes by running the following command:
|
||||||
|
|
||||||
$ ceph fs volume ls
|
.. prompt:: bash $
|
||||||
|
|
||||||
Rename a volume using::
|
ceph fs volume ls
|
||||||
|
|
||||||
$ ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]
|
Rename a volume by running the following command:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]
|
||||||
|
|
||||||
Renaming a volume can be an expensive operation that requires the following:
|
Renaming a volume can be an expensive operation that requires the following:
|
||||||
|
|
||||||
- Rename the orchestrator-managed MDS service to match the <new_vol_name>.
|
- Renaming the orchestrator-managed MDS service to match the <new_vol_name>.
|
||||||
This involves launching a MDS service with <new_vol_name> and bringing down
|
This involves launching a MDS service with ``<new_vol_name>`` and bringing
|
||||||
the MDS service with <vol_name>.
|
down the MDS service with ``<vol_name>``.
|
||||||
- Rename the file system matching <vol_name> to <new_vol_name>
|
- Renaming the file system matching ``<vol_name>`` to ``<new_vol_name>``.
|
||||||
- Change the application tags on the data and metadata pools of the file system
|
- Changing the application tags on the data and metadata pools of the file system
|
||||||
to <new_vol_name>
|
to ``<new_vol_name>``.
|
||||||
- Rename the metadata and data pools of the file system.
|
- Renaming the metadata and data pools of the file system.
|
||||||
|
|
||||||
The CephX IDs authorized for <vol_name> need to be reauthorized for <new_vol_name>. Any
|
The CephX IDs that are authorized for ``<vol_name>`` must be reauthorized for
|
||||||
on-going operations of the clients using these IDs may be disrupted. Mirroring is
|
``<new_vol_name>``. Any ongoing operations of the clients using these IDs may
|
||||||
expected to be disabled on the volume.
|
be disrupted. Ensure that mirroring is disabled on the volume.
|
||||||
|
|
||||||
To fetch the information of a CephFS volume, run::
|
To fetch the information of a CephFS volume, run the following command:
|
||||||
|
|
||||||
$ ceph fs volume info vol_name [--human_readable]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs volume info vol_name [--human_readable]
|
||||||
|
|
||||||
The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB.
|
The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB.
|
||||||
|
|
||||||
@ -142,9 +150,11 @@ Sample output of the ``volume info`` command::
|
|||||||
FS Subvolume groups
|
FS Subvolume groups
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Create a subvolume group using::
|
Create a subvolume group by running the following command:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup create <vol_name> <group_name> [--size <size_in_bytes>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup create <vol_name> <group_name> [--size <size_in_bytes>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
|
||||||
|
|
||||||
The command succeeds even if the subvolume group already exists.
|
The command succeeds even if the subvolume group already exists.
|
||||||
|
|
||||||
@ -152,32 +162,41 @@ When creating a subvolume group you can specify its data pool layout (see
|
|||||||
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and
|
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and
|
||||||
size in bytes. The size of the subvolume group is specified by setting
|
size in bytes. The size of the subvolume group is specified by setting
|
||||||
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
|
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
|
||||||
is created with octal file mode '755', uid '0', gid '0' and the data pool
|
is created with octal file mode ``755``, uid ``0``, gid ``0`` and the data pool
|
||||||
layout of its parent directory.
|
layout of its parent directory.
|
||||||
|
|
||||||
|
Remove a subvolume group by running a command of the following form:
|
||||||
|
|
||||||
Remove a subvolume group using::
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs subvolumegroup rm <vol_name> <group_name> [--force]
|
ceph fs subvolumegroup rm <vol_name> <group_name> [--force]
|
||||||
|
|
||||||
The removal of a subvolume group fails if it is not empty or non-existent.
|
The removal of a subvolume group fails if the subvolume group is not empty or
|
||||||
'--force' flag allows the non-existent subvolume group remove command to succeed.
|
is non-existent. The ``--force`` flag allows the non-existent "subvolume group
|
||||||
|
remove command" to succeed.
|
||||||
|
|
||||||
|
|
||||||
Fetch the absolute path of a subvolume group using::
|
Fetch the absolute path of a subvolume group by running a command of the
|
||||||
|
following form:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup getpath <vol_name> <group_name>
|
.. prompt:: bash $
|
||||||
|
|
||||||
List subvolume groups using::
|
ceph fs subvolumegroup getpath <vol_name> <group_name>
|
||||||
|
|
||||||
$ ceph fs subvolumegroup ls <vol_name>
|
List subvolume groups by running a command of the following form:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup ls <vol_name>
|
||||||
|
|
||||||
.. note:: Subvolume group snapshot feature is no longer supported in mainline CephFS (existing group
|
.. note:: Subvolume group snapshot feature is no longer supported in mainline CephFS (existing group
|
||||||
snapshots can still be listed and deleted)
|
snapshots can still be listed and deleted)
|
||||||
|
|
||||||
Fetch the metadata of a subvolume group using::
|
Fetch the metadata of a subvolume group by running a command of the following form:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup info <vol_name> <group_name>
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup info <vol_name> <group_name>
|
||||||
|
|
||||||
The output format is JSON and contains fields as follows:
|
The output format is JSON and contains fields as follows:
|
||||||
|
|
||||||
@ -194,62 +213,77 @@ The output format is JSON and contains fields as follows:
|
|||||||
* ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS"
|
* ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS"
|
||||||
* ``data_pool``: data pool to which the subvolume group belongs
|
* ``data_pool``: data pool to which the subvolume group belongs
|
||||||
|
|
||||||
Check the presence of any subvolume group using::
|
Check the presence of any subvolume group by running a command of the following form:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup exist <vol_name>
|
.. prompt:: bash $
|
||||||
|
|
||||||
The 'exist' command outputs:
|
ceph fs subvolumegroup exist <vol_name>
|
||||||
|
|
||||||
|
The ``exist`` command outputs:
|
||||||
|
|
||||||
* "subvolumegroup exists": if any subvolumegroup is present
|
* "subvolumegroup exists": if any subvolumegroup is present
|
||||||
* "no subvolumegroup exists": if no subvolumegroup is present
|
* "no subvolumegroup exists": if no subvolumegroup is present
|
||||||
|
|
||||||
.. note:: This command checks for the presence of custom groups and not presence of the default one. To validate the emptiness of the volume, a subvolumegroup existence check alone is not sufficient. Subvolume existence also needs to be checked as there might be subvolumes in the default group.
|
.. note:: This command checks for the presence of custom groups and not
|
||||||
|
presence of the default one. To validate the emptiness of the volume, a
|
||||||
|
subvolumegroup existence check alone is not sufficient. Subvolume existence
|
||||||
|
also needs to be checked as there might be subvolumes in the default group.
|
||||||
|
|
||||||
Resize a subvolume group using::
|
Resize a subvolume group by running a command of the following form:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
|
.. prompt:: bash $
|
||||||
|
|
||||||
The command resizes the subvolume group quota using the size specified by ``new_size``.
|
ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
|
||||||
The ``--no_shrink`` flag prevents the subvolume group from shrinking below the current used
|
|
||||||
size.
|
|
||||||
|
|
||||||
The subvolume group may be resized to an infinite size by passing ``inf`` or ``infinite``
|
The command resizes the subvolume group quota, using the size specified by
|
||||||
as the ``new_size``.
|
``new_size``. The ``--no_shrink`` flag prevents the subvolume group from
|
||||||
|
shrinking below the current used size.
|
||||||
|
|
||||||
Remove a snapshot of a subvolume group using::
|
The subvolume group may be resized to an infinite size by passing ``inf`` or
|
||||||
|
``infinite`` as the ``new_size``.
|
||||||
|
|
||||||
$ ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
|
Remove a snapshot of a subvolume group by running a command of the following form:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
|
||||||
|
|
||||||
Supplying the ``--force`` flag allows the command to succeed when it would otherwise
|
Supplying the ``--force`` flag allows the command to succeed when it would otherwise
|
||||||
fail due to the snapshot not existing.
|
fail due to the nonexistence of the snapshot.
|
||||||
|
|
||||||
List snapshots of a subvolume group using::
|
List snapshots of a subvolume group by running a command of the following form:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup snapshot ls <vol_name> <group_name>
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup snapshot ls <vol_name> <group_name>
|
||||||
|
|
||||||
|
|
||||||
FS Subvolumes
|
FS Subvolumes
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
Create a subvolume using::
|
Create a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
|
||||||
|
|
||||||
|
|
||||||
The command succeeds even if the subvolume already exists.
|
The command succeeds even if the subvolume already exists.
|
||||||
|
|
||||||
When creating a subvolume you can specify its subvolume group, data pool layout,
|
When creating a subvolume you can specify its subvolume group, data pool
|
||||||
uid, gid, file mode in octal numerals, and size in bytes. The size of the subvolume is
|
layout, uid, gid, file mode in octal numerals, and size in bytes. The size of
|
||||||
specified by setting a quota on it (see :doc:`/cephfs/quota`). The subvolume can be
|
the subvolume is specified by setting a quota on it (see :doc:`/cephfs/quota`).
|
||||||
created in a separate RADOS namespace by specifying --namespace-isolated option. By
|
The subvolume can be created in a separate RADOS namespace by specifying
|
||||||
default a subvolume is created within the default subvolume group, and with an octal file
|
--namespace-isolated option. By default a subvolume is created within the
|
||||||
mode '755', uid of its subvolume group, gid of its subvolume group, data pool layout of
|
default subvolume group, and with an octal file mode '755', uid of its
|
||||||
its parent directory and no size limit.
|
subvolume group, gid of its subvolume group, data pool layout of its parent
|
||||||
|
directory and no size limit.
|
||||||
|
|
||||||
Remove a subvolume using::
|
Remove a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume rm <vol_name> <subvol_name> [--group_name <subvol_group_name>] [--force] [--retain-snapshots]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume rm <vol_name> <subvol_name> [--group_name <subvol_group_name>] [--force] [--retain-snapshots]
|
||||||
|
|
||||||
The command removes the subvolume and its contents. It does this in two steps.
|
The command removes the subvolume and its contents. It does this in two steps.
|
||||||
First, it moves the subvolume to a trash folder, and then asynchronously purges
|
First, it moves the subvolume to a trash folder, and then asynchronously purges
|
||||||
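To tie the subvolume-group commands above together, a short sketch with hypothetical names and a quota of 10 GiB expressed in bytes::

    # create a group, check its path, grow it, and list its snapshots
    ceph fs subvolumegroup create vol_a group_1 --mode 755
    ceph fs subvolumegroup getpath vol_a group_1
    ceph fs subvolumegroup resize vol_a group_1 10737418240 --no_shrink
    ceph fs subvolumegroup snapshot ls vol_a group_1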
@ -262,44 +296,62 @@ A subvolume can be removed retaining existing snapshots of the subvolume using t
|
|||||||
'--retain-snapshots' option. If snapshots are retained, the subvolume is considered
|
'--retain-snapshots' option. If snapshots are retained, the subvolume is considered
|
||||||
empty for all operations not involving the retained snapshots.
|
empty for all operations not involving the retained snapshots.
|
||||||
|
|
||||||
.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs subvolume create'
|
.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs
|
||||||
|
subvolume create'
|
||||||
|
|
||||||
.. note:: Retained snapshots can be used as a clone source to recreate the subvolume, or clone to a newer subvolume.
|
.. note:: Retained snapshots can be used as a clone source to recreate the
|
||||||
|
subvolume, or clone to a newer subvolume.
|
||||||
|
|
||||||
Resize a subvolume using::
|
Resize a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
|
.. prompt:: bash $
|
||||||
|
|
||||||
The command resizes the subvolume quota using the size specified by ``new_size``.
|
ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
|
||||||
The `--no_shrink`` flag prevents the subvolume from shrinking below the current used size of the subvolume.
|
|
||||||
|
|
||||||
The subvolume can be resized to an unlimited (but sparse) logical size by passing ``inf`` or ``infinite`` as `` new_size``.
|
The command resizes the subvolume quota using the size specified by
|
||||||
|
``new_size``. The `--no_shrink`` flag prevents the subvolume from shrinking
|
||||||
|
below the current used size of the subvolume.
|
||||||
|
|
||||||
Authorize cephx auth IDs, the read/read-write access to fs subvolumes::
|
The subvolume can be resized to an unlimited (but sparse) logical size by
|
||||||
|
passing ``inf`` or ``infinite`` as `` new_size``.
|
||||||
|
|
||||||
$ ceph fs subvolume authorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>] [--access_level=<access_level>]
|
Authorize cephx auth IDs, the read/read-write access to fs subvolumes:
|
||||||
|
|
||||||
The 'access_level' takes 'r' or 'rw' as value.
|
.. prompt:: bash $
|
||||||
|
|
||||||
Deauthorize cephx auth IDs, the read/read-write access to fs subvolumes::
|
ceph fs subvolume authorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>] [--access_level=<access_level>]
|
||||||
|
|
||||||
$ ceph fs subvolume deauthorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
The ``access_level`` option takes ``r`` or ``rw`` as its value.
|
||||||
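For instance, to grant a hypothetical auth ID ``client_a`` read-write access
to subvolume ``subvol1`` in volume ``cephfs``:

.. prompt:: bash $

   ceph fs subvolume authorize cephfs subvol1 client_a --access_level=rw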
|
|
||||||
List cephx auth IDs authorized to access fs subvolume::
|
Deauthorize cephx auth IDs, revoking their read/read-write access to fs subvolumes:
|
||||||
|
|
||||||
$ ceph fs subvolume authorized_list <vol_name> <sub_name> [--group_name=<group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
Evict fs clients based on auth ID and subvolume mounted::
|
ceph fs subvolume deauthorize <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||||
|
|
||||||
$ ceph fs subvolume evict <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
List the cephx auth IDs authorized to access an fs subvolume:
|
||||||
|
|
||||||
Fetch the absolute path of a subvolume using::
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs subvolume getpath <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
ceph fs subvolume authorized_list <vol_name> <sub_name> [--group_name=<group_name>]
|
||||||
|
|
||||||
Fetch the information of a subvolume using::
|
Evict fs clients based on auth ID and subvolume mounted:
|
||||||
|
|
||||||
$ ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume evict <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
|
||||||
|
|
||||||
|
Fetch the absolute path of a subvolume using:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume getpath <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
|
Fetch the information of a subvolume using:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
The output format is JSON and contains fields as follows.
|
The output format is JSON and contains fields as follows.
|
||||||
|
|
||||||
@ -339,67 +391,93 @@ A subvolume's ``state`` is based on the current state of the subvolume and conta
|
|||||||
* ``complete``: subvolume is ready for all operations
|
* ``complete``: subvolume is ready for all operations
|
||||||
* ``snapshot-retained``: subvolume is removed but its snapshots are retained
|
* ``snapshot-retained``: subvolume is removed but its snapshots are retained
|
||||||
|
|
||||||
List subvolumes using::
|
List subvolumes using:
|
||||||
|
|
||||||
$ ceph fs subvolume ls <vol_name> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
.. note:: subvolumes that are removed but have snapshots retained, are also listed.
|
ceph fs subvolume ls <vol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
Check the presence of any subvolume using::
|
.. note:: Subvolumes that have been removed but have retained snapshots are also
|
||||||
|
listed.
|
||||||
|
|
||||||
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
Check the presence of any subvolume using:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
These are the possible results of the ``exist`` command:
|
These are the possible results of the ``exist`` command:
|
||||||
|
|
||||||
* ``subvolume exists``: if any subvolume of given group_name is present
|
* ``subvolume exists``: if any subvolume of the given group_name is present
|
||||||
* ``no subvolume exists``: if no subvolume of given group_name is present
|
* ``no subvolume exists``: if no subvolume of the given group_name is present
|
||||||
|
|
||||||
Set custom metadata on the subvolume as a key-value pair using::
|
Set custom metadata on the subvolume as a key-value pair using:
|
||||||
|
|
||||||
$ ceph fs subvolume metadata set <vol_name> <subvol_name> <key_name> <value> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
.. note:: If the key_name already exists then the old value will get replaced by the new value.
|
ceph fs subvolume metadata set <vol_name> <subvol_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
.. note:: key_name and value should be a string of ASCII characters (as specified in python's string.printable). key_name is case-insensitive and always stored in lower case.
|
.. note:: If the key_name already exists, the old value will be replaced
|
||||||
|
by the new value.
|
||||||
|
|
||||||
.. note:: Custom metadata on a subvolume is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot.
|
.. note:: The key_name and value should be strings of ASCII characters (as
|
||||||
|
specified in Python's ``string.printable``). The key_name is case-insensitive
|
||||||
|
and is always stored in lowercase.
|
||||||
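As an illustration, the following sets a key on a hypothetical subvolume
``subvol1`` in volume ``cephfs`` and then reads it back with the ``metadata
get`` command described below:

.. prompt:: bash $

   ceph fs subvolume metadata set cephfs subvol1 owner_team storage
   ceph fs subvolume metadata get cephfs subvol1 owner_team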
|
|
||||||
Get custom metadata set on the subvolume using the metadata key::
|
.. note:: Custom metadata on a subvolume is not preserved when snapshotting the
|
||||||
|
subvolume, and hence, is also not preserved when cloning the subvolume
|
||||||
|
snapshot.
|
||||||
|
|
||||||
$ ceph fs subvolume metadata get <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>]
|
Get custom metadata set on the subvolume using the metadata key:
|
||||||
|
|
||||||
List custom metadata (key-value pairs) set on the subvolume using::
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs subvolume metadata ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
ceph fs subvolume metadata get <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
Remove custom metadata set on the subvolume using the metadata key::
|
List custom metadata (key-value pairs) set on the subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume metadata ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
|
Remove custom metadata set on the subvolume using the metadata key:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||||
|
|
||||||
Using the ``--force`` flag allows the command to succeed that would otherwise
|
Using the ``--force`` flag allows the command to succeed even when it would otherwise
|
||||||
fail if the metadata key did not exist.
|
fail because the metadata key does not exist.
|
||||||
|
|
||||||
Create a snapshot of a subvolume using::
|
Create a snapshot of a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot create <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot create <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
Remove a snapshot of a subvolume using::
|
Remove a snapshot of a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
|
||||||
|
|
||||||
Using the ``--force`` flag allows the command to succeed that would otherwise
|
Using the ``--force`` flag allows the command to succeed even when it would otherwise
|
||||||
fail if the snapshot did not exist.
|
fail because the snapshot does not exist.
|
||||||
|
|
||||||
.. note:: if the last snapshot within a snapshot retained subvolume is removed, the subvolume is also removed
|
.. note:: If the last snapshot within a snapshot-retained subvolume is removed,
|
||||||
|
the subvolume is also removed.
|
||||||
|
|
||||||
List snapshots of a subvolume using::
|
List snapshots of a subvolume using:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
Fetch the information of a snapshot using::
|
ceph fs subvolume snapshot ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot info <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
Fetch the information of a snapshot using:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot info <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
The output format is JSON and contains fields as follows.
|
The output format is JSON and contains fields as follows.
|
||||||
|
|
||||||
@ -440,27 +518,40 @@ Sample output when no snapshot clone is in progress or pending::
|
|||||||
"has_pending_clones": "no"
|
"has_pending_clones": "no"
|
||||||
}
|
}
|
||||||
|
|
||||||
Set custom key-value metadata on the snapshot by running::
|
Set custom key-value metadata on the snapshot by running:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
.. note:: If the key_name already exists then the old value will get replaced by the new value.
|
ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
.. note:: The key_name and value should be a strings of ASCII characters (as specified in Python's ``string.printable``). The key_name is case-insensitive and always stored in lowercase.
|
.. note:: If the key_name already exists, the old value will be replaced
|
||||||
|
by the new value.
|
||||||
|
|
||||||
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot.
|
.. note:: The key_name and value should be strings of ASCII characters (as
|
||||||
|
specified in Python's ``string.printable``). The key_name is
|
||||||
|
case-insensitive and always stored in lowercase.
|
||||||
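For example, to tag a hypothetical snapshot ``snap1`` of subvolume ``subvol1``
in volume ``cephfs``:

.. prompt:: bash $

   ceph fs subvolume snapshot metadata set cephfs subvol1 snap1 created_by nightly_backup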
|
|
||||||
Get custom metadata set on the snapshot using the metadata key::
|
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the
|
||||||
|
subvolume, and hence is also not preserved when cloning the subvolume
|
||||||
|
snapshot.
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot metadata get <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>]
|
Get custom metadata set on the snapshot using the metadata key:
|
||||||
|
|
||||||
List custom metadata (key-value pairs) set on the snapshot using::
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot metadata ls <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
ceph fs subvolume snapshot metadata get <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
Remove custom metadata set on the snapshot using the metadata key::
|
List custom metadata (key-value pairs) set on the snapshot using:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot metadata ls <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
|
Remove custom metadata set on the snapshot using the metadata key:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||||
|
|
||||||
Using the ``--force`` flag allows the command to succeed that would otherwise
|
Using the ``--force`` flag allows the command to succeed even when it would otherwise
|
||||||
fail if the metadata key did not exist.
|
fail because the metadata key does not exist.
|
||||||
@ -468,47 +559,73 @@ fail if the metadata key did not exist.
|
|||||||
Cloning Snapshots
|
Cloning Snapshots
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation that copies
|
Subvolumes can be created by cloning subvolume snapshots. Cloning is an
|
||||||
data from a snapshot to a subvolume. Due to this bulk copying, cloning is inefficient for very large
|
asynchronous operation that copies data from a snapshot to a subvolume. Due to
|
||||||
data sets.
|
this bulk copying, cloning is inefficient for very large data sets.
|
||||||
|
|
||||||
.. note:: Removing a snapshot (source subvolume) would fail if there are pending or in progress clone operations.
|
.. note:: Removing a snapshot (source subvolume) would fail if there are
|
||||||
|
pending or in progress clone operations.
|
||||||
|
|
||||||
Protecting snapshots prior to cloning was a prerequisite in the Nautilus release, and the commands to protect/unprotect
|
Protecting snapshots prior to cloning was a prerequisite in the Nautilus
|
||||||
snapshots were introduced for this purpose. This prerequisite, and hence the commands to protect/unprotect, is being
|
release, and the commands to protect/unprotect snapshots were introduced for
|
||||||
deprecated and may be removed from a future release.
|
this purpose. This prerequisite, and hence the commands to protect/unprotect,
|
||||||
|
is being deprecated and may be removed from a future release.
|
||||||
|
|
||||||
The commands being deprecated are::
|
The commands being deprecated are:
|
||||||
$ ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
|
||||||
$ ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
|
||||||
|
|
||||||
.. note:: Using the above commands will not result in an error, but they have no useful purpose.
|
.. prompt:: bash #
|
||||||
|
|
||||||
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata regarding supported ``features`` to help decide if protect/unprotect of snapshots is required, based on the availability of the ``snapshot-autoprotect`` feature.
|
ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||||
|
ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||||
|
|
||||||
To initiate a clone operation use::
|
.. note:: Using the above commands will not result in an error, but they have
|
||||||
|
no useful purpose.
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
|
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata
|
||||||
|
regarding supported ``features`` to help decide if protect/unprotect of
|
||||||
|
snapshots is required, based on the availability of the
|
||||||
|
``snapshot-autoprotect`` feature.
|
||||||
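For example, the features supported by a hypothetical subvolume ``subvol1``
in volume ``cephfs`` can be checked with:

.. prompt:: bash $

   ceph fs subvolume info cephfs subvol1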
|
|
||||||
If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified::
|
To initiate a clone operation use:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
|
.. prompt:: bash $
|
||||||
|
|
||||||
Cloned subvolumes can be a part of a different group than the source snapshot (by default, cloned subvolumes are created in default group). To clone to a particular group use::
|
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
|
||||||
|
|
||||||
|
If a snapshot (source subvolume) is part of a non-default group, the group name
|
||||||
|
needs to be specified:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
|
||||||
|
|
||||||
|
Cloned subvolumes can be a part of a different group than the source snapshot
|
||||||
|
(by default, cloned subvolumes are created in the default group). To clone to a
|
||||||
|
particular group use:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --target_group_name <subvol_group_name>
|
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --target_group_name <subvol_group_name>
|
||||||
|
|
||||||
Similar to specifying a pool layout when creating a subvolume, pool layout can be specified when creating a cloned subvolume. To create a cloned subvolume with a specific pool layout use::
|
Similar to specifying a pool layout when creating a subvolume, pool layout can
|
||||||
|
be specified when creating a cloned subvolume. To create a cloned subvolume
|
||||||
|
with a specific pool layout use:
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
|
.. prompt:: bash $
|
||||||
|
|
||||||
Configure the maximum number of concurrent clones. The default is 4::
|
ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
|
||||||
|
|
||||||
$ ceph config set mgr mgr/volumes/max_concurrent_clones <value>
|
Configure the maximum number of concurrent clones. The default is 4:
|
||||||
|
|
||||||
To check the status of a clone operation use::
|
.. prompt:: bash $
|
||||||
|
|
||||||
$ ceph fs clone status <vol_name> <clone_name> [--group_name <group_name>]
|
ceph config set mgr mgr/volumes/max_concurrent_clones <value>
|
||||||
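For example, to allow up to 8 concurrent clones (an illustrative value):

.. prompt:: bash $

   ceph config set mgr mgr/volumes/max_concurrent_clones 8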
|
|
||||||
|
To check the status of a clone operation use:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs clone status <vol_name> <clone_name> [--group_name <group_name>]
|
||||||
|
|
||||||
A clone can be in one of the following states:
|
A clone can be in one of the following states:
|
||||||
|
|
||||||
@ -538,7 +655,8 @@ Here is an example of an ``in-progress`` clone::
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
|
.. note:: The ``failure`` section will be shown only if the clone's state is
|
||||||
|
``failed`` or ``cancelled``
|
||||||
|
|
||||||
Here is an example of a ``failed`` clone::
|
Here is an example of a ``failed`` clone::
|
||||||
|
|
||||||
@ -560,9 +678,11 @@ Here is an example of a ``failed`` clone::
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
(NOTE: since ``subvol1`` is in the default group, the ``source`` object's ``clone status`` does not include the group name)
|
(NOTE: since ``subvol1`` is in the default group, the ``source`` object's
|
||||||
|
``clone status`` does not include the group name)
|
||||||
|
|
||||||
.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed.
|
.. note:: Cloned subvolumes are accessible only after the clone operation has
|
||||||
|
successfully completed.
|
||||||
|
|
||||||
After a successful clone operation, ``clone status`` will look like the below::
|
After a successful clone operation, ``clone status`` will look like the below::
|
||||||
|
|
||||||
@ -576,37 +696,47 @@ After a successful clone operation, ``clone status`` will look like the below::
|
|||||||
If a clone operation is unsuccessful, the ``state`` value will be ``failed``.
|
If a clone operation is unsuccessful, the ``state`` value will be ``failed``.
|
||||||
|
|
||||||
To retry a failed clone operation, the incomplete clone must be deleted and the
|
To retry a failed clone operation, the incomplete clone must be deleted and the
|
||||||
clone operation must be issued again. To delete a partial clone use::
|
clone operation must be issued again. To delete a partial clone use:
|
||||||
|
|
||||||
$ ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
|
||||||
|
|
||||||
.. note:: Cloning synchronizes only directories, regular files and symbolic
|
.. note:: Cloning synchronizes only directories, regular files and symbolic
|
||||||
links. Inode timestamps (access and modification times) are synchronized up
|
links. Inode timestamps (access and modification times) are synchronized up
|
||||||
to seconds granularity.
|
to seconds granularity.
|
||||||
|
|
||||||
An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel
|
An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel
|
||||||
a clone operation use the ``clone cancel`` command::
|
a clone operation use the ``clone cancel`` command:
|
||||||
|
|
||||||
$ ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
|
.. prompt:: bash $
|
||||||
|
|
||||||
On successful cancellation, the cloned subvolume is moved to the ``canceled``
|
ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
|
||||||
state::
|
|
||||||
|
|
||||||
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
On successful cancellation, the cloned subvolume is moved to the ``canceled`` state:
|
||||||
$ ceph fs clone cancel cephfs clone1
|
|
||||||
$ ceph fs clone status cephfs clone1
|
.. prompt:: bash #
|
||||||
{
|
|
||||||
"status": {
|
ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||||
"state": "canceled",
|
ceph fs clone cancel cephfs clone1
|
||||||
"source": {
|
ceph fs clone status cephfs clone1
|
||||||
"volume": "cephfs",
|
|
||||||
"subvolume": "subvol1",
|
::
|
||||||
"snapshot": "snap1"
|
|
||||||
}
|
{
|
||||||
|
"status": {
|
||||||
|
"state": "canceled",
|
||||||
|
"source": {
|
||||||
|
"volume": "cephfs",
|
||||||
|
"subvolume": "subvol1",
|
||||||
|
"snapshot": "snap1"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
.. note:: The canceled cloned may be deleted by supplying the ``--force`` option to the `fs subvolume rm` command.
|
.. note:: The canceled clone may be deleted by supplying the ``--force``
|
||||||
|
option to the ``fs subvolume rm`` command.
|
||||||
|
|
||||||
|
|
||||||
.. _subvol-pinning:
|
.. _subvol-pinning:
|
||||||
@ -614,28 +744,33 @@ state::
|
|||||||
Pinning Subvolumes and Subvolume Groups
|
Pinning Subvolumes and Subvolume Groups
|
||||||
---------------------------------------
|
---------------------------------------
|
||||||
|
|
||||||
|
|
||||||
Subvolumes and subvolume groups may be automatically pinned to ranks according
|
Subvolumes and subvolume groups may be automatically pinned to ranks according
|
||||||
to policies. This can distribute load across MDS ranks in predictable and
|
to policies. This can distribute load across MDS ranks in predictable and
|
||||||
stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning`
|
stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning`
|
||||||
for details on how pinning works.
|
for details on how pinning works.
|
||||||
|
|
||||||
Pinning is configured by::
|
Pinning is configured by:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup pin <vol_name> <group_name> <pin_type> <pin_setting>
|
.. prompt:: bash $
|
||||||
|
|
||||||
or for subvolumes::
|
ceph fs subvolumegroup pin <vol_name> <group_name> <pin_type> <pin_setting>
|
||||||
|
|
||||||
$ ceph fs subvolume pin <vol_name> <group_name> <pin_type> <pin_setting>
|
or for subvolumes:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolume pin <vol_name> <subvol_name> <pin_type> <pin_setting>
|
||||||
|
|
||||||
Typically you will want to set subvolume group pins. The ``pin_type`` may be
|
Typically you will want to set subvolume group pins. The ``pin_type`` may be
|
||||||
one of ``export``, ``distributed``, or ``random``. The ``pin_setting``
|
one of ``export``, ``distributed``, or ``random``. The ``pin_setting``
|
||||||
corresponds to the extended attributed "value" as in the pinning documentation
|
corresponds to the extended attribute "value" as in the pinning documentation
|
||||||
referenced above.
|
referenced above.
|
||||||
|
|
||||||
So, for example, setting a distributed pinning strategy on a subvolume group::
|
So, for example, setting a distributed pinning strategy on a subvolume group:
|
||||||
|
|
||||||
$ ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1
|
||||||
|
|
||||||
Will enable distributed subtree partitioning policy for the "csi" subvolume
|
This enables the distributed subtree partitioning policy for the "csi" subvolume
|
||||||
group. This will cause every subvolume within the group to be automatically
|
group. This will cause every subvolume within the group to be automatically
|
||||||
|
@ -123,7 +123,9 @@ other daemons, please see :ref:`health-checks`.
|
|||||||
from properly cleaning up resources used by client requests. This message
|
from properly cleaning up resources used by client requests. This message
|
||||||
appears if a client appears to have more than ``max_completed_requests``
|
appears if a client appears to have more than ``max_completed_requests``
|
||||||
(default 100000) requests that are complete on the MDS side but haven't
|
(default 100000) requests that are complete on the MDS side but haven't
|
||||||
yet been accounted for in the client's *oldest tid* value.
|
yet been accounted for in the client's *oldest tid* value. The last tid
|
||||||
|
used by the MDS to trim completed client requests (or flush) is included
|
||||||
|
in the output of the ``session ls`` (or ``client ls``) command as a debug aid.
|
||||||
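As a debugging aid, the session list (which includes the last tid mentioned
above) can be inspected on a given MDS rank; for example, for rank 0 of a
hypothetical file system named ``cephfs``::

    ceph tell mds.cephfs:0 session ls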
* ``MDS_DAMAGE``
|
* ``MDS_DAMAGE``
|
||||||
|
|
||||||
Message
|
Message
|
||||||
@ -168,3 +170,15 @@ other daemons, please see :ref:`health-checks`.
|
|||||||
the actual cache size (in memory) is at least 50% greater than
|
the actual cache size (in memory) is at least 50% greater than
|
||||||
``mds_cache_memory_limit`` (default 1GB). Modify ``mds_health_cache_threshold``
|
``mds_cache_memory_limit`` (default 1GB). Modify ``mds_health_cache_threshold``
|
||||||
to set the warning ratio.
|
to set the warning ratio.
|
||||||
|
|
||||||
|
* ``MDS_CLIENTS_LAGGY``
|
||||||
|
|
||||||
|
Message
|
||||||
|
"Client *ID* is laggy; not evicted because some OSD(s) is/are laggy"
|
||||||
|
|
||||||
|
Description
|
||||||
|
If an OSD is laggy (due to conditions such as a network cut-off),
|
||||||
|
it might make clients laggy (the session might become idle or be unable to flush
|
||||||
|
dirty data for cap revokes). If ``defer_client_eviction_on_laggy_osds`` is
|
||||||
|
set to true (default true), client eviction will not take place and thus
|
||||||
|
this health warning will be generated.
|
||||||
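If an operator prefers eviction over this warning, the behaviour can be
changed by disabling the option named above (shown here as an illustrative
``ceph config`` invocation)::

    ceph config set mds defer_client_eviction_on_laggy_osds false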
|
@ -501,6 +501,25 @@
|
|||||||
:Type: 32-bit Integer
|
:Type: 32-bit Integer
|
||||||
:Default: ``0``
|
:Default: ``0``
|
||||||
|
|
||||||
|
``mds_inject_skip_replaying_inotable``
|
||||||
|
|
||||||
|
:Description: Ceph will skip replaying the inotable when replaying the journal,
|
||||||
|
and the primary MDS will crash, while the replacing MDS won't.
|
||||||
|
(for developers only).
|
||||||
|
|
||||||
|
:Type: Boolean
|
||||||
|
:Default: ``false``
|
||||||
|
|
||||||
|
|
||||||
|
``mds_kill_skip_replaying_inotable``
|
||||||
|
|
||||||
|
:Description: Ceph will skip replaying the inotable when replaying the journal,
|
||||||
|
and the primary MDS will crash, while the replacing MDS won't.
|
||||||
|
(for developers only).
|
||||||
|
|
||||||
|
:Type: Boolean
|
||||||
|
:Default: ``false``
|
||||||
|
|
||||||
|
|
||||||
``mds_wipe_sessions``
|
``mds_wipe_sessions``
|
||||||
|
|
||||||
|
@ -53,7 +53,8 @@ If you have more than one FS on your Ceph cluster, use the option
|
|||||||
|
|
||||||
ceph-fuse --id foo --client_fs mycephfs2 /mnt/mycephfs2
|
ceph-fuse --id foo --client_fs mycephfs2 /mnt/mycephfs2
|
||||||
|
|
||||||
You may also add a ``client_fs`` setting to your ``ceph.conf``
|
You may also add a ``client_fs`` setting to your ``ceph.conf``. Alternatively, the option
|
||||||
|
``--client_mds_namespace`` is supported for backward compatibility.
|
||||||
|
|
||||||
Unmounting CephFS
|
Unmounting CephFS
|
||||||
=================
|
=================
|
||||||
|
@ -96,6 +96,28 @@ non-default FS as follows::
|
|||||||
|
|
||||||
mount -t ceph :/ /mnt/mycephfs2 -o name=fs,fs=mycephfs2
|
mount -t ceph :/ /mnt/mycephfs2 -o name=fs,fs=mycephfs2
|
||||||
|
|
||||||
|
Backward Compatibility
|
||||||
|
======================
|
||||||
|
The old syntax is supported for backward compatibility.
|
||||||
|
|
||||||
|
To mount CephFS with the kernel driver::
|
||||||
|
|
||||||
|
mkdir /mnt/mycephfs
|
||||||
|
mount -t ceph :/ /mnt/mycephfs -o name=admin
|
||||||
|
|
||||||
|
The key-value argument right after the ``-o`` option is the CephX credential;
|
||||||
|
``name`` is the username of the CephX user we are using to mount CephFS.
|
||||||
|
|
||||||
|
To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs::
|
||||||
|
|
||||||
|
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
|
||||||
|
|
||||||
|
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting.
|
||||||
|
|
||||||
Unmounting CephFS
|
Unmounting CephFS
|
||||||
=================
|
=================
|
||||||
To unmount the Ceph file system, use the ``umount`` command as usual::
|
To unmount the Ceph file system, use the ``umount`` command as usual::
|
||||||
|
@ -60,6 +60,18 @@ added as comments in the sample conf. There are options to do the following:
|
|||||||
- enable read delegations (need at least v13.0.1 'libcephfs2' package
|
- enable read delegations (need at least v13.0.1 'libcephfs2' package
|
||||||
and v2.6.0 stable 'nfs-ganesha' and 'nfs-ganesha-ceph' packages)
|
and v2.6.0 stable 'nfs-ganesha' and 'nfs-ganesha-ceph' packages)
|
||||||
|
|
||||||
|
.. important::
|
||||||
|
|
||||||
|
Under certain conditions, NFS access using the CephFS FSAL fails. This
|
||||||
|
causes an error to be thrown that reads "Input/output error". Under these
|
||||||
|
circumstances, the application metadata must be set for the CephFS metadata
|
||||||
|
and CephFS data pools. Do this by running the following command:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph osd pool application set <cephfs_metadata_pool> cephfs <cephfs_data_pool> cephfs
|
||||||
|
|
||||||
|
|
||||||
Configuration for libcephfs clients
|
Configuration for libcephfs clients
|
||||||
-----------------------------------
|
-----------------------------------
|
||||||
|
|
||||||
|
@ -143,3 +143,14 @@ The types of damage that can be reported and repaired by File System Scrub are:
|
|||||||
|
|
||||||
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
||||||
|
|
||||||
|
Evaluate strays using recursive scrub
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
- In order to evaluate strays, i.e., purge stray directories in ``~mdsdir``, use the following command::
|
||||||
|
|
||||||
|
ceph tell mds.<fsname>:0 scrub start ~mdsdir recursive
|
||||||
|
|
||||||
|
- ``~mdsdir`` is not enqueued by default when scrubbing at the CephFS root. In order to perform stray evaluation
|
||||||
|
at root, run scrub with flags ``scrub_mdsdir`` and ``recursive``::
|
||||||
|
|
||||||
|
ceph tell mds.<fsname>:0 scrub start / recursive,scrub_mdsdir
|
||||||
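- For example, with a hypothetical file system named ``cephfs``, stray evaluation at the root would be run as::

    ceph tell mds.cephfs:0 scrub start / recursive,scrub_mdsdir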
|
@ -142,6 +142,19 @@ Examples::
|
|||||||
ceph fs snap-schedule retention add / 24h4w # add 24 hourly and 4 weekly to retention
|
ceph fs snap-schedule retention add / 24h4w # add 24 hourly and 4 weekly to retention
|
||||||
ceph fs snap-schedule retention remove / 7d4w # remove 7 daily and 4 weekly, leaves 24 hourly
|
ceph fs snap-schedule retention remove / 7d4w # remove 7 daily and 4 weekly, leaves 24 hourly
|
||||||
|
|
||||||
|
.. note:: When adding a path to snap-schedule, remember to strip off the mount
|
||||||
|
point path prefix. Paths to snap-schedule should start at the appropriate
|
||||||
|
CephFS file system root and not at the host file system root.
|
||||||
|
e.g. if the Ceph File System is mounted at ``/mnt`` and the path under which
|
||||||
|
snapshots need to be taken is ``/mnt/some/path``, then the actual path required
|
||||||
|
by snap-schedule is only ``/some/path``.
|
||||||
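For instance, assuming the file system is mounted at ``/mnt`` and hourly
snapshots are wanted under ``/mnt/some/path`` (hypothetical paths), the
schedule would be added with::

    ceph fs snap-schedule add /some/path 1h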
|
|
||||||
|
.. note:: The "created" field in the snap-schedule status
|
||||||
|
command output is the timestamp at which the schedule was created. The "created"
|
||||||
|
timestamp has nothing to do with the creation of actual snapshots. The actual
|
||||||
|
snapshot creation is accounted for in the "created_count" field, which is a
|
||||||
|
cumulative count of the total number of snapshots created so far.
|
||||||
|
|
||||||
Active and inactive schedules
|
Active and inactive schedules
|
||||||
-----------------------------
|
-----------------------------
|
||||||
Snapshot schedules can be added for a path that doesn't exist yet in the
|
Snapshot schedules can be added for a path that doesn't exist yet in the
|
||||||
|
@ -188,6 +188,98 @@ You can enable dynamic debug against the CephFS module.
|
|||||||
|
|
||||||
Please see: https://github.com/ceph/ceph/blob/master/src/script/kcon_all.sh
|
Please see: https://github.com/ceph/ceph/blob/master/src/script/kcon_all.sh
|
||||||
|
|
||||||
|
In-memory Log Dump
|
||||||
|
==================
|
||||||
|
|
||||||
|
In-memory logs can be dumped by setting ``mds_extraordinary_events_dump_interval``
|
||||||
|
when debugging at a low log level (< 10). ``mds_extraordinary_events_dump_interval``
|
||||||
|
is the interval in seconds for dumping the recent in-memory logs when there is an Extra-Ordinary event.
|
||||||
|
|
||||||
|
The Extra-Ordinary events are classified as:
|
||||||
|
|
||||||
|
* Client Eviction
|
||||||
|
* Missed Beacon ACK from the monitors
|
||||||
|
* Missed Internal Heartbeats
|
||||||
|
|
||||||
|
In-memory Log Dump is disabled by default to prevent log file bloat in a production environment.
|
||||||
|
The following commands, run in order, enable it::
|
||||||
|
|
||||||
|
$ ceph config set mds debug_mds <log_level>/<gather_level>
|
||||||
|
$ ceph config set mds mds_extraordinary_events_dump_interval <seconds>
|
||||||
|
|
||||||
|
The ``log_level`` should be < 10 and ``gather_level`` should be >= 10 to enable in-memory log dump.
|
||||||
|
When it is enabled, the MDS checks for the extra-ordinary events every
|
||||||
|
``mds_extraordinary_events_dump_interval`` seconds and, if any of them occurs, the MDS dumps the
|
||||||
|
in-memory logs containing the relevant event details in ceph-mds log.
|
||||||
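For example, with illustrative values that keep the log level low while
gathering enough detail, and a 60-second check interval::

    $ ceph config set mds debug_mds 5/20
    $ ceph config set mds mds_extraordinary_events_dump_interval 60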
|
|
||||||
|
.. note:: For higher log levels (log_level >= 10) there is no reason to dump the In-memory Logs and a
|
||||||
|
lower gather level (gather_level < 10) is insufficient to gather In-memory Logs. Thus a
|
||||||
|
log level >=10 or a gather level < 10 in debug_mds would prevent enabling the In-memory Log Dump.
|
||||||
|
In such cases, if enabling fails, it is necessary to reset the value of
|
||||||
|
``mds_extraordinary_events_dump_interval`` to 0 before enabling it again with the above commands.
|
||||||
|
|
||||||
|
The In-memory Log Dump can be disabled using::
|
||||||
|
|
||||||
|
$ ceph config set mds mds_extraordinary_events_dump_interval 0
|
||||||
|
|
||||||
|
Filesystems Become Inaccessible After an Upgrade
|
||||||
|
================================================
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
You can avoid ``operation not permitted`` errors by running this procedure
|
||||||
|
before an upgrade. As of May 2023, it seems that ``operation not permitted``
|
||||||
|
errors of the kind discussed here occur after upgrades after Nautilus
|
||||||
|
(inclusive).
|
||||||
|
|
||||||
|
IF
|
||||||
|
|
||||||
|
you have CephFS file systems that have data and metadata pools that were
|
||||||
|
created by a ``ceph fs new`` command (meaning that they were not created
|
||||||
|
with the defaults)
|
||||||
|
|
||||||
|
OR
|
||||||
|
|
||||||
|
you have an existing CephFS file system and are upgrading to a new post-Nautilus
|
||||||
|
major version of Ceph
|
||||||
|
|
||||||
|
THEN
|
||||||
|
|
||||||
|
in order for the documented ``ceph fs authorize...`` commands to function as
|
||||||
|
documented (and to avoid 'operation not permitted' errors when doing file I/O
|
||||||
|
or similar security-related problems for all users except the ``client.admin``
|
||||||
|
user), you must first run:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph osd pool application set <your metadata pool name> cephfs metadata <your ceph fs filesystem name>
|
||||||
|
|
||||||
|
and
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph osd pool application set <your data pool name> cephfs data <your ceph fs filesystem name>
|
||||||
|
|
||||||
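For example, for a hypothetical file system named ``myfs`` whose pools are
``myfs_metadata`` and ``myfs_data``, the two commands would be:

.. prompt:: bash $

   ceph osd pool application set myfs_metadata cephfs metadata myfs
   ceph osd pool application set myfs_data cephfs data myfs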
|
Otherwise, when the OSDs receive a request to read or write data (not the
|
||||||
|
directory info, but file data) they will not know which Ceph file system name
|
||||||
|
to look up. This is true also of pool names, because the 'defaults' themselves
|
||||||
|
changed in the major releases, from::
|
||||||
|
|
||||||
|
data pool=fsname
|
||||||
|
metadata pool=fsname_metadata
|
||||||
|
|
||||||
|
to::
|
||||||
|
|
||||||
|
data pool=fsname.data and
|
||||||
|
metadata pool=fsname.meta
|
||||||
|
|
||||||
|
Any setup that used ``client.admin`` for all mounts did not run into this
|
||||||
|
problem, because the admin key gave blanket permissions.
|
||||||
|
|
||||||
|
A temporary fix involves changing mount requests to the 'client.admin' user and
|
||||||
|
its associated key. A less drastic, but only partial, fix is to change the OSD cap for
|
||||||
|
your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs
|
||||||
|
data=....``
|
||||||
|
|
||||||
Reporting Issues
|
Reporting Issues
|
||||||
================
|
================
|
||||||
|
|
||||||
|
@ -87,7 +87,8 @@ Optionals are represented as a presence byte, followed by the item if it exists.
|
|||||||
T element[present? 1 : 0]; // Only if present is non-zero.
|
T element[present? 1 : 0]; // Only if present is non-zero.
|
||||||
}
|
}
|
||||||
|
|
||||||
Optionals are used to encode ``boost::optional``.
|
Optionals are used to encode ``boost::optional`` and, since the introduction of
|
||||||
|
C++17 in Ceph, ``std::optional``.
|
||||||
|
|
||||||
Pair
|
Pair
|
||||||
----
|
----
|
||||||
|
@ -5,7 +5,7 @@ jerasure plugin
|
|||||||
Introduction
|
Introduction
|
||||||
------------
|
------------
|
||||||
|
|
||||||
The parameters interpreted by the jerasure plugin are:
|
The parameters interpreted by the ``jerasure`` plugin are:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
@ -31,3 +31,5 @@ upstream repositories `http://jerasure.org/jerasure/jerasure
|
|||||||
`http://jerasure.org/jerasure/gf-complete
|
`http://jerasure.org/jerasure/gf-complete
|
||||||
<http://jerasure.org/jerasure/gf-complete>`_ . The difference
|
<http://jerasure.org/jerasure/gf-complete>`_ . The difference
|
||||||
between the two, if any, should match pull requests against upstream.
|
between the two, if any, should match pull requests against upstream.
|
||||||
|
Note that as of 2023, the ``jerasure.org`` web site may no longer be
|
||||||
|
legitimate and/or associated with the original project.
|
||||||
|
93
ceph/doc/dev/osd_internals/past_intervals.rst
Normal file
93
ceph/doc/dev/osd_internals/past_intervals.rst
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
=============
|
||||||
|
PastIntervals
|
||||||
|
=============
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
|
||||||
|
There are two situations where we need to consider the set of all acting-set
|
||||||
|
OSDs for a PG back to some epoch ``e``:
|
||||||
|
|
||||||
|
* During peering, we need to consider the acting set for every epoch back to
|
||||||
|
``last_epoch_started``, the last epoch in which the PG completed peering and
|
||||||
|
became active.
|
||||||
|
(see :doc:`/dev/osd_internals/last_epoch_started` for a detailed explanation)
|
||||||
|
* During recovery, we need to consider the acting set for every epoch back to
|
||||||
|
``last_epoch_clean``, the last epoch at which all of the OSDs in the acting
|
||||||
|
set were fully recovered, and the acting set was full.
|
||||||
|
|
||||||
|
For either of these purposes, we could build such a set by iterating backwards
|
||||||
|
from the current OSDMap to the relevant epoch. Instead, we maintain a structure
|
||||||
|
PastIntervals for each PG.
|
||||||
|
|
||||||
|
An ``interval`` is a contiguous sequence of OSDMap epochs where the PG mapping
|
||||||
|
didn't change. This includes changes to the acting set, the up set, the
|
||||||
|
primary, and several other parameters fully spelled out in
|
||||||
|
PastIntervals::check_new_interval.
|
||||||
|
|
||||||
|
Maintenance and Trimming
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
The PastIntervals structure stores a record for each ``interval`` back to
|
||||||
|
last_epoch_clean. On each new ``interval`` (See AdvMap reactions,
|
||||||
|
PeeringState::should_restart_peering, and PeeringState::start_peering_interval)
|
||||||
|
each OSD with the PG will add the new ``interval`` to its local PastIntervals.
|
||||||
|
Activation messages to OSDs which do not already have the PG contain the
|
||||||
|
sender's PastIntervals so that the recipient needn't rebuild it. (See
|
||||||
|
PeeringState::activate needs_past_intervals).
|
||||||
|
|
||||||
|
PastIntervals are trimmed in two places. First, when the primary marks the
|
||||||
|
PG clean, it clears its past_intervals instance
|
||||||
|
(PeeringState::try_mark_clean()). The replicas will do the same thing when
|
||||||
|
they receive the info (See PeeringState::update_history).
|
||||||
|
|
||||||
|
The second, more complex, case is in PeeringState::start_peering_interval. In
|
||||||
|
the event of a "map gap", we assume that the PG actually has gone clean, but we
|
||||||
|
haven't received a pg_info_t with the updated ``last_epoch_clean`` value yet.
|
||||||
|
To explain this behavior, we need to discuss OSDMap trimming.
|
||||||
|
|
||||||
|
OSDMap Trimming
|
||||||
|
---------------
|
||||||
|
|
||||||
|
OSDMaps are created by the Monitor quorum and gossiped out to the OSDs. The
|
||||||
|
Monitor cluster also determines when OSDs (and the Monitors) are allowed to
|
||||||
|
trim old OSDMap epochs. For the reasons explained above in this document, the
|
||||||
|
primary constraint is that we must retain all OSDMaps back to some epoch such
|
||||||
|
that all PGs have been clean at that or a later epoch (min_last_epoch_clean).
|
||||||
|
(See OSDMonitor::get_trim_to).
|
||||||
|
|
||||||
|
The Monitor quorum determines min_last_epoch_clean through MOSDBeacon messages
|
||||||
|
sent periodically by each OSD. Each message contains a set of PGs for which
|
||||||
|
the OSD is primary at that moment as well as the min_last_epoch_clean across
|
||||||
|
that set. The Monitors track these values in OSDMonitor::last_epoch_clean.
|
||||||
|
|
||||||
|
There is a subtlety in the min_last_epoch_clean value used by the OSD to
|
||||||
|
populate the MOSDBeacon. OSD::collect_pg_stats invokes PG::with_pg_stats to
|
||||||
|
obtain the lec value, which actually uses
|
||||||
|
pg_stat_t::get_effective_last_epoch_clean() rather than
|
||||||
|
info.history.last_epoch_clean. If the PG is currently clean,
|
||||||
|
pg_stat_t::get_effective_last_epoch_clean() is the current epoch rather than
|
||||||
|
last_epoch_clean -- this works because the PG is clean at that epoch and it
|
||||||
|
allows OSDMaps to be trimmed during periods where OSDMaps are being created
|
||||||
|
(due to snapshot activity, perhaps), but no PGs are undergoing ``interval``
|
||||||
|
changes.
|
||||||
|
|
||||||
|
Back to PastIntervals
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
We can now understand our second trimming case above. If OSDMaps have been
|
||||||
|
trimmed up to epoch ``e``, we know that the PG must have been clean at some epoch
|
||||||
|
>= ``e`` (indeed, **all** PGs must have been), so we can drop our PastIntervals.
|
||||||
|
|
||||||
|
This dependency also pops up in PeeringState::check_past_interval_bounds().
|
||||||
|
PeeringState::get_required_past_interval_bounds takes as a parameter
|
||||||
|
oldest_epoch, which comes from OSDSuperblock::cluster_osdmap_trim_lower_bound.
|
||||||
|
We use cluster_osdmap_trim_lower_bound rather than a specific osd's oldest_map
|
||||||
|
because an OSD doesn't necessarily trim its maps all the way up to MOSDMap::cluster_osdmap_trim_lower_bound at once.
|
||||||
|
In order to avoid doing too much work at once we limit the amount of osdmaps
|
||||||
|
trimmed using ``osd_target_transaction_size`` in OSD::trim_maps().
|
||||||
|
For this reason, a specific OSD's oldest_map can lag behind
|
||||||
|
OSDSuperblock::cluster_osdmap_trim_lower_bound
|
||||||
|
for a while.
|
||||||
|
|
||||||
|
See https://tracker.ceph.com/issues/49689 for an example.
|
@ -12,12 +12,13 @@
|
|||||||
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
|
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
|
||||||
OSD BlueStore is a storage back end used by OSD daemons, and
|
OSD BlueStore is a storage back end used by OSD daemons, and
|
||||||
was designed specifically for use with Ceph. BlueStore was
|
was designed specifically for use with Ceph. BlueStore was
|
||||||
introduced in the Ceph Kraken release. In the Ceph Luminous
|
introduced in the Ceph Kraken release. The Luminous release of
|
||||||
release, BlueStore became Ceph's default storage back end,
|
Ceph promoted BlueStore to the default OSD back end,
|
||||||
supplanting FileStore. Unlike :term:`filestore`, BlueStore
|
supplanting FileStore. As of the Reef release, FileStore is no
|
||||||
stores objects directly on Ceph block devices without any file
|
longer available as a storage backend.
|
||||||
system interface. Since Luminous (12.2), BlueStore has been
|
|
||||||
Ceph's default and recommended storage back end.
|
BlueStore stores objects directly on Ceph block devices without
|
||||||
|
a mounted file system.
|
||||||
|
|
||||||
Bucket
|
Bucket
|
||||||
In the context of :term:`RGW`, a bucket is a group of objects.
|
In the context of :term:`RGW`, a bucket is a group of objects.
|
||||||
|
@ -11,6 +11,12 @@ Ceph delivers **object, block, and file storage in one unified system**.
|
|||||||
Ceph project. (Click anywhere in this paragraph to read the "Basic
|
Ceph project. (Click anywhere in this paragraph to read the "Basic
|
||||||
Workflow" page of the Ceph Developer Guide.) <basic workflow dev guide>`.
|
Workflow" page of the Ceph Developer Guide.) <basic workflow dev guide>`.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
:ref:`If you want to make a commit to the documentation but you don't
|
||||||
|
know how to get started, read the "Documenting Ceph" page. (Click anywhere
|
||||||
|
in this paragraph to read the "Documenting Ceph" page.) <documenting_ceph>`.
|
||||||
|
|
||||||
.. raw:: html
|
.. raw:: html
|
||||||
|
|
||||||
<style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
|
<style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
|
||||||
|
@ -36,6 +36,22 @@ Options
|
|||||||
|
|
||||||
Perform a selftest. This mode performs a sanity check of ``stats`` module.
|
Perform a selftest. This mode performs a sanity check of ``stats`` module.
|
||||||
|
|
||||||
|
.. option:: --conffile [CONFFILE]
|
||||||
|
|
||||||
|
Path to cluster configuration file
|
||||||
|
|
||||||
|
.. option:: -d [DELAY], --delay [DELAY]
|
||||||
|
|
||||||
|
Refresh interval in seconds (default: 1)
|
||||||
|
|
||||||
|
.. option:: --dump
|
||||||
|
|
||||||
|
Dump the metrics to stdout
|
||||||
|
|
||||||
|
.. option:: --dumpfs <fs_name>
|
||||||
|
|
||||||
|
Dump the metrics of the given filesystem to stdout
|
||||||
|
|
||||||
Descriptions of fields
|
Descriptions of fields
|
||||||
======================
|
======================
|
||||||
|
|
||||||
|
@ -110,6 +110,12 @@ Basic
|
|||||||
them. If an inode contains any stale file locks, read/write on the inode
|
them. If an inode contains any stale file locks, read/write on the inode
|
||||||
is not allowed until applications release all stale file locks.
|
is not allowed until applications release all stale file locks.
|
||||||
|
|
||||||
|
:command:`fs=<fs-name>`
|
||||||
|
Specify the non-default file system to be mounted, when using the old syntax.
|
||||||
|
|
||||||
|
:command:`mds_namespace=<fs-name>`
|
||||||
|
A synonym of "fs=" (Deprecated).
|
||||||
|
|
||||||
Advanced
|
Advanced
|
||||||
--------
|
--------
|
||||||
:command:`cap_release_safety`
|
:command:`cap_release_safety`
|
||||||
@ -236,6 +242,10 @@ history::
|
|||||||
mount.ceph :/ /mnt/mycephfs -o name=fs_username,secretfile=/etc/ceph/fs_username.secret
|
mount.ceph :/ /mnt/mycephfs -o name=fs_username,secretfile=/etc/ceph/fs_username.secret
|
||||||
|
|
||||||
|
|
||||||
|
To mount using the old syntax::
|
||||||
|
|
||||||
|
mount -t ceph 192.168.0.1:/ /mnt/mycephfs
|
||||||
|
|
||||||
Availability
|
Availability
|
||||||
============
|
============
|
||||||
|
|
||||||
|
@ -18,9 +18,11 @@ for all reporting entities are returned in text exposition format.
|
|||||||
Enabling prometheus output
|
Enabling prometheus output
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
The *prometheus* module is enabled with::
|
The *prometheus* module is enabled with:
|
||||||
|
|
||||||
ceph mgr module enable prometheus
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph mgr module enable prometheus
|
||||||
|
|
||||||
Configuration
|
Configuration
|
||||||
-------------
|
-------------
|
||||||
@ -36,10 +38,10 @@ configurable with ``ceph config set``, with keys
|
|||||||
is registered with Prometheus's `registry
|
is registered with Prometheus's `registry
|
||||||
<https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
|
<https://github.com/prometheus/prometheus/wiki/Default-port-allocations>`_.
|
||||||
|
|
||||||
::
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
|
ceph config set mgr mgr/prometheus/server_addr 0.0.0.0
|
||||||
ceph config set mgr mgr/prometheus/server_port 9283
|
ceph config set mgr mgr/prometheus/server_port 9283
|
||||||
|
|
||||||
.. warning::
|
.. warning::
|
||||||
|
|
||||||
@ -54,9 +56,11 @@ recommended to use 15 seconds as scrape interval, though, in some cases it
|
|||||||
might be useful to increase the scrape interval.
|
might be useful to increase the scrape interval.
|
||||||
|
|
||||||
To set a different scrape interval in the Prometheus module, set
|
To set a different scrape interval in the Prometheus module, set
|
||||||
``scrape_interval`` to the desired value::
|
``scrape_interval`` to the desired value:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/scrape_interval 20
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/scrape_interval 20
|
||||||
|
|
||||||
On large clusters (>1000 OSDs), the time to fetch the metrics may become
|
On large clusters (>1000 OSDs), the time to fetch the metrics may become
|
||||||
significant. Without the cache, the Prometheus manager module could, especially
|
significant. Without the cache, the Prometheus manager module could, especially
|
||||||
@ -75,35 +79,47 @@ This behavior can be configured. By default, it will return a 503 HTTP status
|
|||||||
code (service unavailable). You can set other options using the ``ceph config
|
code (service unavailable). You can set other options using the ``ceph config
|
||||||
set`` commands.
|
set`` commands.
|
||||||
|
|
||||||
To tell the module to respond with possibly stale data, set it to ``return``::
|
To tell the module to respond with possibly stale data, set it to ``return``:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/stale_cache_strategy return
|
ceph config set mgr mgr/prometheus/stale_cache_strategy return
|
||||||
|
|
||||||
To tell the module to respond with "service unavailable", set it to ``fail``::
|
To tell the module to respond with "service unavailable", set it to ``fail``:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/stale_cache_strategy fail
|
.. prompt:: bash $
|
||||||
|
|
||||||
If you are confident that you don't require the cache, you can disable it::
|
ceph config set mgr mgr/prometheus/stale_cache_strategy fail
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/cache false
|
If you are confident that you don't require the cache, you can disable it:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/cache false
|
||||||
|
|
||||||
If you are using the prometheus module behind some kind of reverse proxy or
|
If you are using the prometheus module behind some kind of reverse proxy or
|
||||||
loadbalancer, you can simplify discovering the active instance by switching
|
loadbalancer, you can simplify discovering the active instance by switching
|
||||||
to ``error``-mode::
|
to ``error``-mode:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/standby_behaviour error
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/standby_behaviour error
|
||||||
|
|
||||||
If set, the prometheus module will repond with a HTTP error when requesting ``/``
|
If set, the prometheus module will respond with an HTTP error when requesting ``/``
|
||||||
from the standby instance. The default error code is 500, but you can configure
|
from the standby instance. The default error code is 500, but you can configure
|
||||||
the HTTP response code with::
|
the HTTP response code with:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/standby_error_status_code 503
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/standby_error_status_code 503
|
||||||
|
|
||||||
Valid error codes are between 400-599.
|
Valid error codes are between 400-599.
|
||||||
|
|
||||||
To switch back to the default behaviour, simply set the config key to ``default``::
|
To switch back to the default behaviour, simply set the config key to ``default``:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/standby_behaviour default
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/standby_behaviour default
|
||||||
|
|
||||||
.. _prometheus-rbd-io-statistics:
|
.. _prometheus-rbd-io-statistics:
|
||||||
|
|
||||||
@ -154,9 +170,17 @@ configuration parameter. The parameter is a comma or space separated list
|
|||||||
of ``pool[/namespace]`` entries. If the namespace is not specified the
|
of ``pool[/namespace]`` entries. If the namespace is not specified the
|
||||||
statistics are collected for all namespaces in the pool.
|
statistics are collected for all namespaces in the pool.
|
||||||
|
|
||||||
Example to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``::
|
For example, to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN"
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN"
|
||||||
|
|
||||||
|
The wildcard can be used to indicate all pools or namespaces:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/rbd_stats_pools "*"
|
||||||
|
|
||||||
The module makes the list of all available images scanning the specified
|
The module makes the list of all available images scanning the specified
|
||||||
pools and namespaces and refreshes it periodically. The period is
|
pools and namespaces and refreshes it periodically. The period is
|
||||||
@ -165,9 +189,22 @@ parameter (in sec) and is 300 sec (5 minutes) by default. The module will
|
|||||||
force refresh earlier if it detects statistics from a previously unknown
|
force refresh earlier if it detects statistics from a previously unknown
|
||||||
RBD image.
|
RBD image.
|
||||||
|
|
||||||
Example to turn up the sync interval to 10 minutes::
|
For example, to increase the sync interval to 10 minutes:
|
||||||
|
|
||||||
ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600
|
||||||
|
|
||||||
|
Ceph daemon performance counters metrics
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
With the introduction of the ``ceph-exporter`` daemon, the prometheus module will no longer export Ceph daemon
|
||||||
|
perf counters as prometheus metrics by default. However, one may re-enable exporting these metrics by setting
|
||||||
|
the module option ``exclude_perf_counters`` to ``false``:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/prometheus/exclude_perf_counters false
|
||||||
|
|
||||||
Statistic names and labels
|
Statistic names and labels
|
||||||
==========================
|
==========================
|
||||||
|
@ -153,3 +153,24 @@ completely optional, and disabled by default.::
|
|||||||
ceph config set mgr mgr/telemetry/description 'My first Ceph cluster'
|
ceph config set mgr mgr/telemetry/description 'My first Ceph cluster'
|
||||||
ceph config set mgr mgr/telemetry/channel_ident true
|
ceph config set mgr mgr/telemetry/channel_ident true
|
||||||
|
|
||||||
|
Leaderboard
|
||||||
|
-----------
|
||||||
|
|
||||||
|
To participate in a leaderboard in the `public dashboards
|
||||||
|
<https://telemetry-public.ceph.com/>`_, run the following command:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/telemetry/leaderboard true
|
||||||
|
|
||||||
|
The leaderboard displays basic information about the cluster. This includes the
|
||||||
|
total storage capacity and the number of OSDs. To add a description of the
|
||||||
|
cluster, run a command of the following form:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph config set mgr mgr/telemetry/leaderboard_description 'Ceph cluster for Computational Biology at the University of XYZ'
|
||||||
|
|
||||||
|
If the ``ident`` channel is enabled, its details will not be displayed in the
|
||||||
|
leaderboard.
|
||||||
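
To confirm what has been configured before the next telemetry report is sent,
the keys can simply be read back (a minimal sketch using the option names shown
above):

.. prompt:: bash $

   ceph config get mgr mgr/telemetry/leaderboard
   ceph config get mgr mgr/telemetry/leaderboard_description
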
|
|
||||||
|
@ -1,84 +1,95 @@
|
|||||||
==========================
|
==================================
|
||||||
BlueStore Config Reference
|
BlueStore Configuration Reference
|
||||||
==========================
|
==================================
|
||||||
|
|
||||||
Devices
|
Devices
|
||||||
=======
|
=======
|
||||||
|
|
||||||
BlueStore manages either one, two, or (in certain cases) three storage
|
BlueStore manages either one, two, or in certain cases three storage devices.
|
||||||
devices.
|
These *devices* are "devices" in the Linux/Unix sense. This means that they are
|
||||||
|
assets listed under ``/dev`` or ``/devices``. Each of these devices may be an
|
||||||
|
entire storage drive, or a partition of a storage drive, or a logical volume.
|
||||||
|
BlueStore does not create or mount a conventional file system on devices that
|
||||||
|
it uses; BlueStore reads and writes to the devices directly in a "raw" fashion.
|
||||||
|
|
||||||
In the simplest case, BlueStore consumes a single (primary) storage device.
|
In the simplest case, BlueStore consumes all of a single storage device. This
|
||||||
The storage device is normally used as a whole, occupying the full device that
|
device is known as the *primary device*. The primary device is identified by
|
||||||
is managed directly by BlueStore. This *primary device* is normally identified
|
the ``block`` symlink in the data directory.
|
||||||
by a ``block`` symlink in the data directory.
|
|
||||||
|
|
||||||
The data directory is a ``tmpfs`` mount which gets populated (at boot time, or
|
The data directory is a ``tmpfs`` mount. When this data directory is booted or
|
||||||
when ``ceph-volume`` activates it) with all the common OSD files that hold
|
activated by ``ceph-volume``, it is populated with metadata files and links
|
||||||
information about the OSD, like: its identifier, which cluster it belongs to,
|
that hold information about the OSD: for example, the OSD's identifier, the
|
||||||
and its private keyring.
|
name of the cluster that the OSD belongs to, and the OSD's private keyring.
|
||||||
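
As a quick sanity check, the symlinks described here can be listed from an
OSD's data directory. The path below assumes the traditional
(non-containerized) layout and an illustrative OSD id of ``0``:

.. prompt:: bash $

   ls -l /var/lib/ceph/osd/ceph-0/block*
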
|
|
||||||
It is also possible to deploy BlueStore across one or two additional devices:
|
In more complicated cases, BlueStore is deployed across one or two additional
|
||||||
|
devices:
|
||||||
|
|
||||||
* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data directory) can be
|
* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data
|
||||||
used for BlueStore's internal journal or write-ahead log. It is only useful
|
directory) can be used to separate out BlueStore's internal journal or
|
||||||
to use a WAL device if the device is faster than the primary device (e.g.,
|
write-ahead log. Using a WAL device is advantageous only if the WAL device
|
||||||
when it is on an SSD and the primary device is an HDD).
|
is faster than the primary device (for example, if the WAL device is an SSD
|
||||||
|
and the primary device is an HDD).
|
||||||
* A *DB device* (identified as ``block.db`` in the data directory) can be used
|
* A *DB device* (identified as ``block.db`` in the data directory) can be used
|
||||||
for storing BlueStore's internal metadata. BlueStore (or rather, the
|
to store BlueStore's internal metadata. BlueStore (or more precisely, the
|
||||||
embedded RocksDB) will put as much metadata as it can on the DB device to
|
embedded RocksDB) will put as much metadata as it can on the DB device in
|
||||||
improve performance. If the DB device fills up, metadata will spill back
|
order to improve performance. If the DB device becomes full, metadata will
|
||||||
onto the primary device (where it would have been otherwise). Again, it is
|
spill back onto the primary device (where it would have been located in the
|
||||||
only helpful to provision a DB device if it is faster than the primary
|
absence of the DB device). Again, it is advantageous to provision a DB device
|
||||||
device.
|
only if it is faster than the primary device.
|
||||||
|
|
||||||
If there is only a small amount of fast storage available (e.g., less
|
If there is only a small amount of fast storage available (for example, less
|
||||||
than a gigabyte), we recommend using it as a WAL device. If there is
|
than a gigabyte), we recommend using the available space as a WAL device. But
|
||||||
more, provisioning a DB device makes more sense. The BlueStore
|
if more fast storage is available, it makes more sense to provision a DB
|
||||||
journal will always be placed on the fastest device available, so
|
device. Because the BlueStore journal is always placed on the fastest device
|
||||||
using a DB device will provide the same benefit that the WAL device
|
available, using a DB device provides the same benefit that using a WAL device
|
||||||
would while *also* allowing additional metadata to be stored there (if
|
would, while *also* allowing additional metadata to be stored off the primary
|
||||||
it will fit). This means that if a DB device is specified but an explicit
|
device (provided that it fits). DB devices make this possible because whenever
|
||||||
WAL device is not, the WAL will be implicitly colocated with the DB on the faster
|
a DB device is specified but an explicit WAL device is not, the WAL will be
|
||||||
device.
|
implicitly colocated with the DB on the faster device.
|
||||||
|
|
||||||
A single-device (colocated) BlueStore OSD can be provisioned with:
|
To provision a single-device (colocated) BlueStore OSD, run the following
|
||||||
|
command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph-volume lvm prepare --bluestore --data <device>
|
ceph-volume lvm prepare --bluestore --data <device>
|
||||||
|
|
||||||
To specify a WAL device and/or DB device:
|
To specify a WAL device or DB device, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
|
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
|
||||||
|
|
||||||
.. note:: ``--data`` can be a Logical Volume using *vg/lv* notation. Other
|
.. note:: The option ``--data`` can take as its argument any of the
|
||||||
devices can be existing logical volumes or GPT partitions.
|
following devices: logical volumes specified using *vg/lv* notation,
|
||||||
|
existing logical volumes, and GPT partitions.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Provisioning strategies
|
Provisioning strategies
|
||||||
-----------------------
|
-----------------------
|
||||||
Although there are multiple ways to deploy a BlueStore OSD (unlike Filestore
|
|
||||||
which had just one), there are two common arrangements that should help clarify
|
BlueStore differs from Filestore in that there are several ways to deploy a
|
||||||
the deployment strategy:
|
BlueStore OSD. However, the overall deployment strategy for BlueStore can be
|
||||||
|
clarified by examining just these two common arrangements:
|
||||||
|
|
||||||
.. _bluestore-single-type-device-config:
|
.. _bluestore-single-type-device-config:
|
||||||
|
|
||||||
**block (data) only**
|
**block (data) only**
|
||||||
^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
If all devices are the same type, for example all rotational drives, and
|
If all devices are of the same type (for example, they are all HDDs), and if
|
||||||
there are no fast devices to use for metadata, it makes sense to specify the
|
there are no fast devices available for the storage of metadata, then it makes
|
||||||
block device only and to not separate ``block.db`` or ``block.wal``. The
|
sense to specify the block device only and to leave ``block.db`` and
|
||||||
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like:
|
``block.wal`` unseparated. The :ref:`ceph-volume-lvm` command for a single
|
||||||
|
``/dev/sda`` device is as follows:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph-volume lvm create --bluestore --data /dev/sda
|
ceph-volume lvm create --bluestore --data /dev/sda
|
||||||
|
|
||||||
If logical volumes have already been created for each device, (a single LV
|
If the devices to be used for a BlueStore OSD are pre-created logical volumes,
|
||||||
using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
|
then the :ref:`ceph-volume-lvm` call for a logical volume named
|
||||||
``ceph-vg/block-lv`` would look like:
|
``ceph-vg/block-lv`` is as follows:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -88,15 +99,18 @@ using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
|
|||||||
|
|
||||||
**block and block.db**
|
**block and block.db**
|
||||||
^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^
|
||||||
If you have a mix of fast and slow devices (SSD / NVMe and rotational),
|
|
||||||
it is recommended to place ``block.db`` on the faster device while ``block``
|
|
||||||
(data) lives on the slower (spinning drive).
|
|
||||||
|
|
||||||
You must create these volume groups and logical volumes manually as
|
If you have a mix of fast and slow devices (for example, SSD or HDD), then we
|
||||||
the ``ceph-volume`` tool is currently not able to do so automatically.
|
recommend placing ``block.db`` on the faster device while ``block`` (that is,
|
||||||
|
the data) is stored on the slower device (that is, the rotational drive).
|
||||||
|
|
||||||
For the below example, let us assume four rotational (``sda``, ``sdb``, ``sdc``, and ``sdd``)
|
You must create these volume groups and these logical volumes manually, because the
|
||||||
and one (fast) solid state drive (``sdx``). First create the volume groups:
|
``ceph-volume`` tool is currently unable to do so automatically.
|
||||||
|
|
||||||
|
The following procedure illustrates the manual creation of volume groups and
|
||||||
|
logical volumes. For this example, we shall assume four rotational drives
|
||||||
|
(``sda``, ``sdb``, ``sdc``, and ``sdd``) and one (fast) SSD (``sdx``). First,
|
||||||
|
to create the volume groups, run the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -105,7 +119,7 @@ and one (fast) solid state drive (``sdx``). First create the volume groups:
|
|||||||
vgcreate ceph-block-2 /dev/sdc
|
vgcreate ceph-block-2 /dev/sdc
|
||||||
vgcreate ceph-block-3 /dev/sdd
|
vgcreate ceph-block-3 /dev/sdd
|
||||||
|
|
||||||
Now create the logical volumes for ``block``:
|
Next, to create the logical volumes for ``block``, run the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -114,8 +128,9 @@ Now create the logical volumes for ``block``:
|
|||||||
lvcreate -l 100%FREE -n block-2 ceph-block-2
|
lvcreate -l 100%FREE -n block-2 ceph-block-2
|
||||||
lvcreate -l 100%FREE -n block-3 ceph-block-3
|
lvcreate -l 100%FREE -n block-3 ceph-block-3
|
||||||
|
|
||||||
We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
|
Because there are four HDDs, there will be four OSDs. Supposing that there is a
|
||||||
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
|
200GB SSD in ``/dev/sdx``, we can create four 50GB logical volumes by running
|
||||||
|
the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -125,7 +140,7 @@ SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
|
|||||||
lvcreate -L 50GB -n db-2 ceph-db-0
|
lvcreate -L 50GB -n db-2 ceph-db-0
|
||||||
lvcreate -L 50GB -n db-3 ceph-db-0
|
lvcreate -L 50GB -n db-3 ceph-db-0
|
||||||
|
|
||||||
Finally, create the 4 OSDs with ``ceph-volume``:
|
Finally, to create the four OSDs, run the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -134,149 +149,153 @@ Finally, create the 4 OSDs with ``ceph-volume``:
|
|||||||
ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
|
ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
|
||||||
ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
|
ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
|
||||||
|
|
||||||
These operations should end up creating four OSDs, with ``block`` on the slower
|
After this procedure is finished, there should be four OSDs, ``block`` should
|
||||||
rotational drives with a 50 GB logical volume (DB) for each on the solid state
|
be on the four HDDs, and each HDD should have a 50GB logical volume
|
||||||
drive.
|
(specifically, a DB device) on the shared SSD.
|
||||||
|
|
||||||
Sizing
|
Sizing
|
||||||
======
|
======
|
||||||
When using a :ref:`mixed spinning and solid drive setup
|
When using a :ref:`mixed spinning-and-solid-drive setup
|
||||||
<bluestore-mixed-device-config>` it is important to make a large enough
|
<bluestore-mixed-device-config>`, it is important to make a large enough
|
||||||
``block.db`` logical volume for BlueStore. Generally, ``block.db`` should have
|
``block.db`` logical volume for BlueStore. The logical volumes associated with
|
||||||
*as large as possible* logical volumes.
|
``block.db`` should have logical volumes that are *as large as possible*.
|
||||||
|
|
||||||
The general recommendation is to have ``block.db`` size in between 1% to 4%
|
It is generally recommended that the size of ``block.db`` be somewhere between
|
||||||
of ``block`` size. For RGW workloads, it is recommended that the ``block.db``
|
1% and 4% of the size of ``block``. For RGW workloads, it is recommended that
|
||||||
size isn't smaller than 4% of ``block``, because RGW heavily uses it to store
|
the ``block.db`` be at least 4% of the ``block`` size, because RGW makes heavy
|
||||||
metadata (omap keys). For example, if the ``block`` size is 1TB, then ``block.db`` shouldn't
|
use of ``block.db`` to store metadata (in particular, omap keys). For example,
|
||||||
be less than 40GB. For RBD workloads, 1% to 2% of ``block`` size is usually enough.
|
if the ``block`` size is 1TB, then ``block.db`` should have a size of at least
|
||||||
|
40GB. For RBD workloads, however, ``block.db`` usually needs no more than 1% to
|
||||||
|
2% of the ``block`` size.
|
||||||
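
As a worked example of the RGW guideline above: a 1TB ``block`` device calls
for a ``block.db`` logical volume of at least 40GB. Carving such an LV out of a
shared SSD volume group might look like the following sketch (the VG/LV names
mirror the earlier example and are illustrative):

.. prompt:: bash $

   lvcreate -L 40GB -n db-0 ceph-db-0
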
|
|
||||||
In older releases, internal level sizes mean that the DB can fully utilize only
|
In older releases, internal level sizes are such that the DB can fully utilize
|
||||||
specific partition / LV sizes that correspond to sums of L0, L0+L1, L1+L2,
|
only those specific partition / logical volume sizes that correspond to sums of
|
||||||
etc. sizes, which with default settings means roughly 3 GB, 30 GB, 300 GB, and
|
L0, L0+L1, L1+L2, and so on--that is, given default settings, sizes of roughly
|
||||||
so forth. Most deployments will not substantially benefit from sizing to
|
3GB, 30GB, 300GB, and so on. Most deployments do not substantially benefit from
|
||||||
accommodate L3 and higher, though DB compaction can be facilitated by doubling
|
sizing that accommodates L3 and higher, though DB compaction can be facilitated
|
||||||
these figures to 6GB, 60GB, and 600GB.
|
by doubling these figures to 6GB, 60GB, and 600GB.
|
||||||
|
|
||||||
Improvements in releases beginning with Nautilus 14.2.12 and Octopus 15.2.6
|
Improvements in Nautilus 14.2.12, Octopus 15.2.6, and subsequent releases allow
|
||||||
enable better utilization of arbitrary DB device sizes, and the Pacific
|
for better utilization of arbitrarily-sized DB devices. Moreover, the Pacific
|
||||||
release brings experimental dynamic level support. Users of older releases may
|
release brings experimental dynamic-level support. Because of these advances,
|
||||||
thus wish to plan ahead by provisioning larger DB devices today so that their
|
users of older releases might want to plan ahead by provisioning larger DB
|
||||||
benefits may be realized with future upgrades.
|
devices today so that the benefits of scale can be realized when upgrades are
|
||||||
|
made in the future.
|
||||||
When *not* using a mix of fast and slow devices, it isn't required to create
|
|
||||||
separate logical volumes for ``block.db`` (or ``block.wal``). BlueStore will
|
|
||||||
automatically colocate these within the space of ``block``.
|
|
||||||
|
|
||||||
|
When *not* using a mix of fast and slow devices, there is no requirement to
|
||||||
|
create separate logical volumes for ``block.db`` or ``block.wal``. BlueStore
|
||||||
|
will automatically colocate these devices within the space of ``block``.
|
||||||
|
|
||||||
Automatic Cache Sizing
|
Automatic Cache Sizing
|
||||||
======================
|
======================
|
||||||
|
|
||||||
BlueStore can be configured to automatically resize its caches when TCMalloc
|
BlueStore can be configured to automatically resize its caches, provided that
|
||||||
is configured as the memory allocator and the ``bluestore_cache_autotune``
|
certain conditions are met: TCMalloc must be configured as the memory allocator
|
||||||
setting is enabled. This option is currently enabled by default. BlueStore
|
and the ``bluestore_cache_autotune`` configuration option must be enabled (note
|
||||||
will attempt to keep OSD heap memory usage under a designated target size via
|
that it is currently enabled by default). When automatic cache sizing is in
|
||||||
the ``osd_memory_target`` configuration option. This is a best effort
|
effect, BlueStore attempts to keep OSD heap-memory usage under a certain target
|
||||||
algorithm and caches will not shrink smaller than the amount specified by
|
size (as determined by ``osd_memory_target``). This approach makes use of a
|
||||||
``osd_memory_cache_min``. Cache ratios will be chosen based on a hierarchy
|
best-effort algorithm and caches do not shrink smaller than the size defined by
|
||||||
of priorities. If priority information is not available, the
|
the value of ``osd_memory_cache_min``. Cache ratios are selected in accordance
|
||||||
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are
|
with a hierarchy of priorities. But if priority information is not available,
|
||||||
used as fallbacks.
|
the values specified in the ``bluestore_cache_meta_ratio`` and
|
||||||
|
``bluestore_cache_kv_ratio`` options are used as fallback cache ratios.
|
||||||
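
A minimal sketch of raising the per-OSD memory target cluster-wide (the value
is illustrative and is given in bytes; 6442450944 bytes is 6 GiB):

.. prompt:: bash $

   ceph config set osd osd_memory_target 6442450944
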
|
|
||||||
|
|
||||||
Manual Cache Sizing
|
Manual Cache Sizing
|
||||||
===================
|
===================
|
||||||
|
|
||||||
The amount of memory consumed by each OSD for BlueStore caches is
|
The amount of memory consumed by each OSD to be used for its BlueStore cache is
|
||||||
determined by the ``bluestore_cache_size`` configuration option. If
|
determined by the ``bluestore_cache_size`` configuration option. If that option
|
||||||
that config option is not set (i.e., remains at 0), there is a
|
has not been specified (that is, if it remains at 0), then Ceph uses a
|
||||||
different default value that is used depending on whether an HDD or
|
different configuration option to determine the default memory budget:
|
||||||
SSD is used for the primary device (set by the
|
``bluestore_cache_size_hdd`` if the primary device is an HDD, or
|
||||||
``bluestore_cache_size_ssd`` and ``bluestore_cache_size_hdd`` config
|
``bluestore_cache_size_ssd`` if the primary device is an SSD.
|
||||||
options).
|
|
||||||
|
|
||||||
BlueStore and the rest of the Ceph OSD daemon do the best they can
|
BlueStore and the rest of the Ceph OSD daemon make every effort to work within
|
||||||
to work within this memory budget. Note that on top of the configured
|
this memory budget. Note that in addition to the configured cache size, there
|
||||||
cache size, there is also memory consumed by the OSD itself, and
|
is also memory consumed by the OSD itself. There is additional utilization due
|
||||||
some additional utilization due to memory fragmentation and other
|
to memory fragmentation and other allocator overhead.
|
||||||
allocator overhead.
|
|
||||||
|
|
||||||
The configured cache memory budget can be used in a few different ways:
|
The configured cache-memory budget can be used to store the following types of
|
||||||
|
things:
|
||||||
|
|
||||||
* Key/Value metadata (i.e., RocksDB's internal cache)
|
* Key/Value metadata (that is, RocksDB's internal cache)
|
||||||
* BlueStore metadata
|
* BlueStore metadata
|
||||||
* BlueStore data (i.e., recently read or written object data)
|
* BlueStore data (that is, recently read or recently written object data)
|
||||||
|
|
||||||
Cache memory usage is governed by the following options:
|
Cache memory usage is governed by the configuration options
|
||||||
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``.
|
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``. The fraction
|
||||||
The fraction of the cache devoted to data
|
of the cache that is reserved for data is governed by both the effective
|
||||||
is governed by the effective bluestore cache size (depending on
|
BlueStore cache size (which depends on the relevant
|
||||||
``bluestore_cache_size[_ssd|_hdd]`` settings and the device class of the primary
|
``bluestore_cache_size[_ssd|_hdd]`` option and the device class of the primary
|
||||||
device) as well as the meta and kv ratios.
|
device) and the "meta" and "kv" ratios. This data fraction can be calculated
|
||||||
The data fraction can be calculated by
|
with the following formula: ``<effective_cache_size> * (1 -
|
||||||
``<effective_cache_size> * (1 - bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``
|
bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``.
|
||||||
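
As a worked example, assume (purely for illustration) an effective cache size
of 3 GiB, a meta ratio of 0.4, and a kv ratio of 0.4. The data fraction is
then::

   3 GiB * (1 - 0.4 - 0.4) = 0.6 GiB
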
|
|
||||||
Checksums
|
Checksums
|
||||||
=========
|
=========
|
||||||
|
|
||||||
BlueStore checksums all metadata and data written to disk. Metadata
|
BlueStore checksums all metadata and all data written to disk. Metadata
|
||||||
checksumming is handled by RocksDB and uses `crc32c`. Data
|
checksumming is handled by RocksDB and uses the `crc32c` algorithm. By
|
||||||
checksumming is done by BlueStore and can make use of `crc32c`,
|
contrast, data checksumming is handled by BlueStore and can use either
|
||||||
`xxhash32`, or `xxhash64`. The default is `crc32c` and should be
|
`crc32c`, `xxhash32`, or `xxhash64`. Nonetheless, `crc32c` is the default
|
||||||
suitable for most purposes.
|
checksum algorithm and it is suitable for most purposes.
|
||||||
|
|
||||||
Full data checksumming does increase the amount of metadata that
|
Full data checksumming increases the amount of metadata that BlueStore must
|
||||||
BlueStore must store and manage. When possible, e.g., when clients
|
store and manage. Whenever possible (for example, when clients hint that data
|
||||||
hint that data is written and read sequentially, BlueStore will
|
is written and read sequentially), BlueStore will checksum larger blocks. In
|
||||||
checksum larger blocks, but in many cases it must store a checksum
|
many cases, however, it must store a checksum value (usually 4 bytes) for every
|
||||||
value (usually 4 bytes) for every 4 kilobyte block of data.
|
4 KB block of data.
|
||||||
|
|
||||||
It is possible to use a smaller checksum value by truncating the
|
It is possible to obtain a smaller checksum value by truncating the checksum to
|
||||||
checksum to two or one byte, reducing the metadata overhead. The
|
one or two bytes and reducing the metadata overhead. A drawback of this
|
||||||
trade-off is that the probability that a random error will not be
|
approach is that it increases the probability of a random error going
|
||||||
detected is higher with a smaller checksum, going from about one in
|
undetected: about one in four billion given a 32-bit (4 byte) checksum, 1 in
|
||||||
four billion with a 32-bit (4 byte) checksum to one in 65,536 for a
|
65,536 given a 16-bit (2 byte) checksum, and 1 in 256 given an 8-bit (1 byte)
|
||||||
16-bit (2 byte) checksum or one in 256 for an 8-bit (1 byte) checksum.
|
checksum. To use the smaller checksum values, select `crc32c_16` or `crc32c_8`
|
||||||
The smaller checksum values can be used by selecting `crc32c_16` or
|
as the checksum algorithm.
|
||||||
`crc32c_8` as the checksum algorithm.
|
|
||||||
|
|
||||||
The *checksum algorithm* can be set either via a per-pool
|
The *checksum algorithm* can be specified either via a per-pool ``csum_type``
|
||||||
``csum_type`` property or the global config option. For example:
|
configuration option or via the global configuration option. For example:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph osd pool set <pool-name> csum_type <algorithm>
|
ceph osd pool set <pool-name> csum_type <algorithm>
|
||||||
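
For example, to switch a hypothetical pool named ``testpool`` to the smaller
``crc32c_16`` checksum discussed above:

.. prompt:: bash $

   ceph osd pool set testpool csum_type crc32c_16
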
|
|
||||||
|
|
||||||
Inline Compression
|
Inline Compression
|
||||||
==================
|
==================
|
||||||
|
|
||||||
BlueStore supports inline compression using `snappy`, `zlib`, or
|
BlueStore supports inline compression using `snappy`, `zlib`, `lz4`, or `zstd`.
|
||||||
`lz4`. Please note that the `lz4` compression plugin is not
|
|
||||||
distributed in the official release.
|
|
||||||
|
|
||||||
Whether data in BlueStore is compressed is determined by a combination
|
Whether data in BlueStore is compressed is determined by two factors: (1) the
|
||||||
of the *compression mode* and any hints associated with a write
|
*compression mode* and (2) any client hints associated with a write operation.
|
||||||
operation. The modes are:
|
The compression modes are as follows:
|
||||||
|
|
||||||
* **none**: Never compress data.
|
* **none**: Never compress data.
|
||||||
* **passive**: Do not compress data unless the write operation has a
|
* **passive**: Do not compress data unless the write operation has a
|
||||||
*compressible* hint set.
|
*compressible* hint set.
|
||||||
* **aggressive**: Compress data unless the write operation has an
|
* **aggressive**: Do compress data unless the write operation has an
|
||||||
*incompressible* hint set.
|
*incompressible* hint set.
|
||||||
* **force**: Try to compress data no matter what.
|
* **force**: Try to compress data no matter what.
|
||||||
|
|
||||||
For more information about the *compressible* and *incompressible* IO
|
For more information about the *compressible* and *incompressible* I/O hints,
|
||||||
hints, see :c:func:`rados_set_alloc_hint`.
|
see :c:func:`rados_set_alloc_hint`.
|
||||||
|
|
||||||
Note that regardless of the mode, if the size of the data chunk is not
|
Note that data in Bluestore will be compressed only if the data chunk will be
|
||||||
reduced sufficiently it will not be used and the original
|
sufficiently reduced in size (as determined by the ``bluestore compression
|
||||||
(uncompressed) data will be stored. For example, if the ``bluestore
|
required ratio`` setting). No matter which compression modes have been used, if
|
||||||
compression required ratio`` is set to ``.7`` then the compressed data
|
the data chunk is too big, then it will be discarded and the original
|
||||||
must be 70% of the size of the original (or smaller).
|
(uncompressed) data will be stored instead. For example, if ``bluestore
|
||||||
|
compression required ratio`` is set to ``.7``, then data compression will take
|
||||||
|
place only if the size of the compressed data is no more than 70% of the size
|
||||||
|
of the original data.
|
||||||
|
|
||||||
The *compression mode*, *compression algorithm*, *compression required
|
The *compression mode*, *compression algorithm*, *compression required ratio*,
|
||||||
ratio*, *min blob size*, and *max blob size* can be set either via a
|
*min blob size*, and *max blob size* settings can be specified either via a
|
||||||
per-pool property or a global config option. Pool properties can be
|
per-pool property or via a global config option. To specify pool properties,
|
||||||
set with:
|
run the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
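
As a hedged sketch, the per-pool commands take the following general shape (the
pool name and the values are illustrative; the property names correspond to the
settings listed above):

.. prompt:: bash $

   ceph osd pool set testpool compression_mode aggressive
   ceph osd pool set testpool compression_algorithm zstd
   ceph osd pool set testpool compression_required_ratio .875
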
@ -291,192 +310,202 @@ set with:
|
|||||||
RocksDB Sharding
|
RocksDB Sharding
|
||||||
================
|
================
|
||||||
|
|
||||||
Internally BlueStore uses multiple types of key-value data,
|
BlueStore maintains several types of internal key-value data, all of which are
|
||||||
stored in RocksDB. Each data type in BlueStore is assigned a
|
stored in RocksDB. Each data type in BlueStore is assigned a unique prefix.
|
||||||
unique prefix. Until Pacific all key-value data was stored in
|
Prior to the Pacific release, all key-value data was stored in a single RocksDB
|
||||||
single RocksDB column family: 'default'. Since Pacific,
|
column family: 'default'. In Pacific and later releases, however, BlueStore can
|
||||||
BlueStore can divide this data into multiple RocksDB column
|
divide key-value data into several RocksDB column families. BlueStore achieves
|
||||||
families. When keys have similar access frequency, modification
|
better caching and more precise compaction when keys are similar: specifically,
|
||||||
frequency and lifetime, BlueStore benefits from better caching
|
when keys have similar access frequency, similar modification frequency, and a
|
||||||
and more precise compaction. This improves performance, and also
|
similar lifetime. Under such conditions, performance is improved and less disk
|
||||||
requires less disk space during compaction, since each column
|
space is required during compaction (because each column family is smaller and
|
||||||
family is smaller and can compact independent of others.
|
is able to compact independently of the others).
|
||||||
|
|
||||||
OSDs deployed in Pacific or later use RocksDB sharding by default.
|
OSDs deployed in Pacific or later releases use RocksDB sharding by default.
|
||||||
If Ceph is upgraded to Pacific from a previous version, sharding is off.
|
However, if Ceph has been upgraded to Pacific or a later version from a
|
||||||
|
previous version, sharding is disabled on any OSDs that were created before
|
||||||
|
Pacific.
|
||||||
|
|
||||||
To enable sharding and apply the Pacific defaults, stop an OSD and run
|
To enable sharding and apply the Pacific defaults to a specific OSD, stop the
|
||||||
|
OSD and run the following command:
|
||||||
|
|
||||||
.. prompt:: bash #
|
.. prompt:: bash #
|
||||||
|
|
||||||
ceph-bluestore-tool \
|
ceph-bluestore-tool \
|
||||||
--path <data path> \
|
--path <data path> \
|
||||||
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
|
--sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
|
||||||
reshard
|
reshard
|
||||||
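
To verify which sharding definition an OSD is actually using, the same tool
offers a read-only subcommand (``show-sharding``, assumed to be available in
Pacific and later; check your release):

.. prompt:: bash #

   ceph-bluestore-tool --path <data path> show-sharding
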
|
|
||||||
|
|
||||||
Throttling
|
SPDK Usage
|
||||||
==========
|
==========
|
||||||
|
|
||||||
SPDK Usage
|
To use the SPDK driver for NVMe devices, you must first prepare your system.
|
||||||
==================
|
See `SPDK document`__.
|
||||||
|
|
||||||
If you want to use the SPDK driver for NVMe devices, you must prepare your system.
|
|
||||||
Refer to `SPDK document`__ for more details.
|
|
||||||
|
|
||||||
.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
|
.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
|
||||||
|
|
||||||
SPDK offers a script to configure the device automatically. Users can run the
|
SPDK offers a script that will configure the device automatically. Run this
|
||||||
script as root:
|
script with root permissions:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo src/spdk/scripts/setup.sh
|
sudo src/spdk/scripts/setup.sh
|
||||||
|
|
||||||
You will need to specify the subject NVMe device's device selector with
|
You will need to specify the subject NVMe device's device selector with the
|
||||||
the "spdk:" prefix for ``bluestore_block_path``.
|
"spdk:" prefix for ``bluestore_block_path``.
|
||||||
|
|
||||||
For example, you can find the device selector of an Intel PCIe SSD with:
|
In the following example, you first find the device selector of an Intel NVMe
|
||||||
|
SSD by running the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
lspci -mm -n -D -d 8086:0953
|
   lspci -mm -n -D -d 8086:0953
|
||||||
|
|
||||||
The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF``.
|
The form of the device selector is either ``DDDD:BB:DD.FF`` or
|
||||||
|
``DDDD.BB.DD.FF``.
|
||||||
|
|
||||||
and then set::
|
Next, supposing that ``0000:01:00.0`` is the device selector found in the
|
||||||
|
output of the ``lspci`` command, you can specify the device selector by running
|
||||||
|
the following command::
|
||||||
|
|
||||||
bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0"
|
bluestore_block_path = "spdk:trtype:pcie traddr:0000:01:00.0"
|
||||||
|
|
||||||
Where ``0000:01:00.0`` is the device selector found in the output of ``lspci``
|
You may also specify a remote NVMeoF target over the TCP transport, as in the
|
||||||
command above.
|
|
||||||
|
|
||||||
You may also specify a remote NVMeoF target over the TCP transport as in the
|
|
||||||
following example::
|
following example::
|
||||||
|
|
||||||
bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
|
bluestore_block_path = "spdk:trtype:tcp traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
|
||||||
|
|
||||||
To run multiple SPDK instances per node, you must specify the
|
To run multiple SPDK instances per node, you must make sure each instance uses
|
||||||
amount of dpdk memory in MB that each instance will use, to make sure each
|
its own DPDK memory by specifying for each instance the amount of DPDK memory
|
||||||
instance uses its own DPDK memory.
|
(in MB) that the instance will use.
|
||||||
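
In recent releases this is typically done with the ``bluestore_spdk_mem``
option, whose value is the per-instance DPDK memory in MB. Treat the snippet
below as a sketch and verify the option name against your release::

    bluestore_spdk_mem = 2048
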
|
|
||||||
In most cases, a single device can be used for data, DB, and WAL. We describe
|
In most cases, a single device can be used for data, DB, and WAL. We describe
|
||||||
this strategy as *colocating* these components. Be sure to enter the below
|
this strategy as *colocating* these components. Be sure to enter the below
|
||||||
settings to ensure that all IOs are issued through SPDK.::
|
settings to ensure that all I/Os are issued through SPDK::
|
||||||
|
|
||||||
bluestore_block_db_path = ""
|
bluestore_block_db_path = ""
|
||||||
bluestore_block_db_size = 0
|
bluestore_block_db_size = 0
|
||||||
bluestore_block_wal_path = ""
|
bluestore_block_wal_path = ""
|
||||||
bluestore_block_wal_size = 0
|
bluestore_block_wal_size = 0
|
||||||
|
|
||||||
Otherwise, the current implementation will populate the SPDK map files with
|
If these settings are not entered, then the current implementation will
|
||||||
kernel file system symbols and will use the kernel driver to issue DB/WAL IO.
|
populate the SPDK map files with kernel file system symbols and will use the
|
||||||
|
kernel driver to issue DB/WAL I/Os.
|
||||||
|
|
||||||
Minimum Allocation Size
|
Minimum Allocation Size
|
||||||
========================
|
=======================
|
||||||
|
|
||||||
There is a configured minimum amount of storage that BlueStore will allocate on
|
There is a configured minimum amount of storage that BlueStore allocates on an
|
||||||
an OSD. In practice, this is the least amount of capacity that a RADOS object
|
underlying storage device. In practice, this is the least amount of capacity
|
||||||
can consume. The value of `bluestore_min_alloc_size` is derived from the
|
that even a tiny RADOS object can consume on each OSD's primary device. The
|
||||||
value of `bluestore_min_alloc_size_hdd` or `bluestore_min_alloc_size_ssd`
|
configuration option in question-- ``bluestore_min_alloc_size`` --derives
|
||||||
depending on the OSD's ``rotational`` attribute. This means that when an OSD
|
its value from the value of either ``bluestore_min_alloc_size_hdd`` or
|
||||||
is created on an HDD, BlueStore will be initialized with the current value
|
``bluestore_min_alloc_size_ssd``, depending on the OSD's ``rotational``
|
||||||
of `bluestore_min_alloc_size_hdd`, and SSD OSDs (including NVMe devices)
|
attribute. Thus if an OSD is created on an HDD, BlueStore is initialized with
|
||||||
with the value of `bluestore_min_alloc_size_ssd`.
|
the current value of ``bluestore_min_alloc_size_hdd``; but with SSD OSDs
|
||||||
|
(including NVMe devices), Bluestore is initialized with the current value of
|
||||||
|
``bluestore_min_alloc_size_ssd``.
|
||||||
|
|
||||||
Through the Mimic release, the default values were 64KB and 16KB for rotational
|
In Mimic and earlier releases, the default values were 64KB for rotational
|
||||||
(HDD) and non-rotational (SSD) media respectively. Octopus changed the default
|
media (HDD) and 16KB for non-rotational media (SSD). The Octopus release
|
||||||
for SSD (non-rotational) media to 4KB, and Pacific changed the default for HDD
|
changed the default value for non-rotational media (SSD) to 4KB, and the
|
||||||
(rotational) media to 4KB as well.
|
Pacific release changed the default value for rotational media (HDD) to 4KB.
|
||||||
|
|
||||||
These changes were driven by space amplification experienced by Ceph RADOS
|
These changes were driven by space amplification that was experienced by Ceph
|
||||||
GateWay (RGW) deployments that host large numbers of small files
|
RADOS GateWay (RGW) deployments that hosted large numbers of small files
|
||||||
(S3/Swift objects).
|
(S3/Swift objects).
|
||||||
|
|
||||||
For example, when an RGW client stores a 1KB S3 object, it is written to a
|
For example, when an RGW client stores a 1 KB S3 object, that object is written
|
||||||
single RADOS object. With the default `min_alloc_size` value, 4KB of
|
to a single RADOS object. In accordance with the default
|
||||||
underlying drive space is allocated. This means that roughly
|
``min_alloc_size`` value, 4 KB of underlying drive space is allocated.
|
||||||
(4KB - 1KB) == 3KB is allocated but never used, which corresponds to 300%
|
This means that roughly 3 KB (that is, 4 KB minus 1 KB) is allocated but never
|
||||||
overhead or 25% efficiency. Similarly, a 5KB user object will be stored
|
used: this corresponds to 300% overhead or 25% efficiency. Similarly, a 5 KB
|
||||||
as one 4KB and one 1KB RADOS object, again stranding 4KB of device capcity,
|
user object will be stored as two RADOS objects, a 4 KB RADOS object and a 1 KB
|
||||||
though in this case the overhead is a much smaller percentage. Think of this
|
RADOS object, with the result that 4KB of device capacity is stranded. In this
|
||||||
in terms of the remainder from a modulus operation. The overhead *percentage*
|
case, however, the overhead percentage is much smaller. Think of this in terms
|
||||||
thus decreases rapidly as user object size increases.
|
of the remainder from a modulus operation. The overhead *percentage* thus
|
||||||
|
decreases rapidly as object size increases.
|
||||||
|
|
||||||
An easily missed additional subtlety is that this
|
There is an additional subtlety that is easily missed: the amplification
|
||||||
takes place for *each* replica. So when using the default three copies of
|
phenomenon just described takes place for *each* replica. For example, when
|
||||||
data (3R), a 1KB S3 object actually consumes roughly 9KB of storage device
|
using the default of three copies of data (3R), a 1 KB S3 object actually
|
||||||
capacity. If erasure coding (EC) is used instead of replication, the
|
strands roughly 9 KB of storage device capacity. If erasure coding (EC) is used
|
||||||
amplification may be even higher: for a ``k=4,m=2`` pool, our 1KB S3 object
|
instead of replication, the amplification might be even higher: for a ``k=4,
|
||||||
will allocate (6 * 4KB) = 24KB of device capacity.
|
m=2`` pool, our 1 KB S3 object allocates 24 KB (that is, 4 KB multiplied by 6)
|
||||||
|
of device capacity.
|
||||||
|
|
||||||
When an RGW bucket pool contains many relatively large user objects, the effect
|
When an RGW bucket pool contains many relatively large user objects, the effect
|
||||||
of this phenomenon is often negligible, but should be considered for deployments
|
of this phenomenon is often negligible. However, with deployments that can
|
||||||
that expect a signficiant fraction of relatively small objects.
|
expect a significant fraction of relatively small user objects, the effect
|
||||||
|
should be taken into consideration.
|
||||||
|
|
||||||
The 4KB default value aligns well with conventional HDD and SSD devices. Some
|
The 4KB default value aligns well with conventional HDD and SSD devices.
|
||||||
new coarse-IU (Indirection Unit) QLC SSDs however perform and wear best
|
However, certain novel coarse-IU (Indirection Unit) QLC SSDs perform and wear
|
||||||
when `bluestore_min_alloc_size_ssd`
|
best when ``bluestore_min_alloc_size_ssd`` is specified at OSD creation
|
||||||
is set at OSD creation to match the device's IU:. 8KB, 16KB, or even 64KB.
|
to match the device's IU: this might be 8KB, 16KB, or even 64KB. These novel
|
||||||
These novel storage drives allow one to achieve read performance competitive
|
storage drives can achieve read performance that is competitive with that of
|
||||||
with conventional TLC SSDs and write performance faster than HDDs, with
|
conventional TLC SSDs and write performance that is faster than that of HDDs,
|
||||||
high density and lower cost than TLC SSDs.
|
with higher density and lower cost than TLC SSDs.
|
||||||
|
|
||||||
Note that when creating OSDs on these devices, one must carefully apply the
|
Note that when creating OSDs on these novel devices, one must be careful to
|
||||||
non-default value only to appropriate devices, and not to conventional SSD and
|
apply the non-default value only to appropriate devices, and not to
|
||||||
HDD devices. This may be done through careful ordering of OSD creation, custom
|
conventional HDD and SSD devices. Error can be avoided through careful ordering
|
||||||
OSD device classes, and especially by the use of central configuration _masks_.
|
of OSD creation, with custom OSD device classes, and especially by the use of
|
||||||
|
central configuration *masks*.
|
||||||
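
A sketch of using such a mask to scope the non-default value to a single
hypothetical host that carries only coarse-IU QLC drives, applied before that
host's OSDs are created:

.. prompt:: bash $

   ceph config set osd/host:qlc-node-01 bluestore_min_alloc_size_ssd 16384
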
|
|
||||||
Quincy and later releases add
|
In Quincy and later releases, you can use the
|
||||||
the `bluestore_use_optimal_io_size_for_min_alloc_size`
|
``bluestore_use_optimal_io_size_for_min_alloc_size`` option to allow
|
||||||
option that enables automatic discovery of the appropriate value as each OSD is
|
automatic discovery of the correct value as each OSD is created. Note that the
|
||||||
created. Note that the use of ``bcache``, ``OpenCAS``, ``dmcrypt``,
|
use of ``bcache``, ``OpenCAS``, ``dmcrypt``, ``ATA over Ethernet``, `iSCSI`, or
|
||||||
``ATA over Ethernet``, `iSCSI`, or other device layering / abstraction
|
other device-layering and abstraction technologies might confound the
|
||||||
technologies may confound the determination of appropriate values. OSDs
|
determination of correct values. Moreover, OSDs deployed on top of VMware
|
||||||
deployed on top of VMware storage have been reported to also
|
storage have sometimes been found to report a ``rotational`` attribute that
|
||||||
sometimes report a ``rotational`` attribute that does not match the underlying
|
does not match the underlying hardware.
|
||||||
hardware.
|
|
||||||
|
|
||||||
We suggest inspecting such OSDs at startup via logs and admin sockets to ensure that
|
We suggest inspecting such OSDs at startup via logs and admin sockets in order
|
||||||
behavior is appropriate. Note that this also may not work as desired with
|
to ensure that their behavior is correct. Be aware that this kind of inspection
|
||||||
older kernels. You can check for this by examining the presence and value
|
might not work as expected with older kernels. To check for this issue,
|
||||||
of ``/sys/block/<drive>/queue/optimal_io_size``.
|
examine the presence and value of ``/sys/block/<drive>/queue/optimal_io_size``.
|
||||||
|
|
||||||
You may also inspect a given OSD:
|
.. note:: When running Reef or a later Ceph release, the ``min_alloc_size``
|
||||||
|
baked into each OSD is conveniently reported by ``ceph osd metadata``.
|
||||||
|
|
||||||
|
To inspect a specific OSD, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash #
|
.. prompt:: bash #
|
||||||
|
|
||||||
ceph osd metadata osd.1701 | grep rotational
|
ceph osd metadata osd.1701 | egrep rotational\|alloc
|
||||||
|
|
||||||
This space amplification may manifest as an unusually high ratio of raw to
|
This space amplification might manifest as an unusually high ratio of raw to
|
||||||
stored data reported by ``ceph df``. ``ceph osd df`` may also report
|
stored data as reported by ``ceph df``. There might also be ``%USE`` / ``VAR``
|
||||||
anomalously high ``%USE`` / ``VAR`` values when
|
values reported by ``ceph osd df`` that are unusually high in comparison to
|
||||||
compared to other, ostensibly identical OSDs. A pool using OSDs with
|
other, ostensibly identical, OSDs. Finally, there might be unexpected balancer
|
||||||
mismatched ``min_alloc_size`` values may experience unexpected balancer
|
behavior in pools that use OSDs that have mismatched ``min_alloc_size`` values.
|
||||||
behavior as well.
|
|
||||||
|
|
||||||
Note that this BlueStore attribute takes effect *only* at OSD creation; if
|
This BlueStore attribute takes effect *only* at OSD creation; if the attribute
|
||||||
changed later, a given OSD's behavior will not change unless / until it is
|
is changed later, a specific OSD's behavior will not change unless and until
|
||||||
destroyed and redeployed with the appropriate option value(s). Upgrading
|
the OSD is destroyed and redeployed with the appropriate option value(s).
|
||||||
to a later Ceph release will *not* change the value used by OSDs deployed
|
Upgrading to a later Ceph release will *not* change the value used by OSDs that
|
||||||
under older releases or with other settings.
|
were deployed under older releases or with other settings.
|
||||||
|
|
||||||
DSA (Data Streaming Accelerator Usage)
|
DSA (Data Streaming Accelerator) Usage
|
||||||
======================================
|
======================================
|
||||||
|
|
||||||
If you want to use the DML library to drive DSA device for offloading
|
If you want to use the DML library to drive the DSA device for offloading
|
||||||
read/write operations on Persist memory in Bluestore. You need to install
|
read/write operations on persistent memory (PMEM) in BlueStore, you need to
|
||||||
`DML`_ and `idxd-config`_ library in your machine with SPR (Sapphire Rapids) CPU.
|
install `DML`_ and the `idxd-config`_ library. This will work only on machines
|
||||||
|
that have a SPR (Sapphire Rapids) CPU.
|
||||||
|
|
||||||
.. _DML: https://github.com/intel/DML
|
.. _DML: https://github.com/intel/DML
|
||||||
.. _idxd-config: https://github.com/intel/idxd-config
|
.. _idxd-config: https://github.com/intel/idxd-config
|
||||||
|
|
||||||
After installing the DML software, you need to configure the shared
|
After installing the DML software, configure the shared work queues (WQs) with
|
||||||
work queues (WQs) with the following WQ configuration example via accel-config tool:
|
reference to the following WQ configuration example:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
|
   accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
|
||||||
accel-config config-engine dsa0/engine0.1 --group-id=1
|
accel-config config-engine dsa0/engine0.1 --group-id=1
|
||||||
accel-config enable-device dsa0
|
accel-config enable-device dsa0
|
||||||
accel-config enable-wq dsa0/wq0.1
|
accel-config enable-wq dsa0/wq0.1
|
||||||
|
@ -218,4 +218,4 @@ If you need to allow multiple clusters to exist on the same host, use
|
|||||||
.. _Hardware Recommendations: ../../../start/hardware-recommendations
|
.. _Hardware Recommendations: ../../../start/hardware-recommendations
|
||||||
.. _Network Configuration Reference: ../network-config-ref
|
.. _Network Configuration Reference: ../network-config-ref
|
||||||
.. _OSD Config Reference: ../osd-config-ref
|
.. _OSD Config Reference: ../osd-config-ref
|
||||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interactio
|
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
|
||||||
|
@ -2,8 +2,14 @@
|
|||||||
Filestore Config Reference
|
Filestore Config Reference
|
||||||
============================
|
============================
|
||||||
|
|
||||||
The Filestore back end is no longer the default when creating new OSDs,
|
.. note:: Since the Luminous release of Ceph, Filestore has not been Ceph's
|
||||||
though Filestore OSDs are still supported.
|
default storage back end. Since the Luminous release of Ceph, BlueStore has
|
||||||
|
been Ceph's default storage back end. However, Filestore OSDs are still
|
||||||
|
supported. See :ref:`OSD Back Ends
|
||||||
|
<rados_config_storage_devices_osd_backends>`. See :ref:`BlueStore Migration
|
||||||
|
<rados_operations_bluestore_migration>` for instructions explaining how to
|
||||||
|
replace an existing Filestore back end with a BlueStore back end.
|
||||||
|
|
||||||
|
|
||||||
``filestore debug omap check``
|
``filestore debug omap check``
|
||||||
|
|
||||||
@ -18,26 +24,31 @@ though Filestore OSDs are still supported.
|
|||||||
Extended Attributes
|
Extended Attributes
|
||||||
===================
|
===================
|
||||||
|
|
||||||
Extended Attributes (XATTRs) are important for Filestore OSDs.
|
Extended Attributes (XATTRs) are important for Filestore OSDs. However, certain
|
||||||
Some file systems have limits on the number of bytes that can be stored in XATTRs.
|
disadvantages can occur when the underlying file system is used for the storage
|
||||||
Additionally, in some cases, the file system may not be as fast as an alternative
|
of XATTRs: some file systems have limits on the number of bytes that can be
|
||||||
method of storing XATTRs. The following settings may help improve performance
|
stored in XATTRs, and your file system might in some cases therefore run slower
|
||||||
by using a method of storing XATTRs that is extrinsic to the underlying file system.
|
than would an alternative method of storing XATTRs. For this reason, a method
|
||||||
|
of storing XATTRs extrinsic to the underlying file system might improve
|
||||||
|
performance. To implement such an extrinsic method, refer to the following
|
||||||
|
settings.
|
||||||
|
|
||||||
Ceph XATTRs are stored as ``inline xattr``, using the XATTRs provided
|
If the underlying file system has no size limit, then Ceph XATTRs are stored as
|
||||||
by the underlying file system, if it does not impose a size limit. If
|
``inline xattr``, using the XATTRs provided by the file system. But if there is
|
||||||
there is a size limit (4KB total on ext4, for instance), some Ceph
|
a size limit (for example, ext4 imposes a limit of 4 KB total), then some Ceph
|
||||||
XATTRs will be stored in a key/value database when either the
|
XATTRs will be stored in a key/value database when the limit is reached. More
|
||||||
|
precisely, this begins to occur when either the
|
||||||
``filestore_max_inline_xattr_size`` or ``filestore_max_inline_xattrs``
|
``filestore_max_inline_xattr_size`` or ``filestore_max_inline_xattrs``
|
||||||
threshold is reached.
|
threshold is reached.
|
||||||
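
For example, the thresholds can be overridden explicitly in ``ceph.conf``
rather than relying on the per-file-system defaults (the values below are
illustrative)::

    [osd]
        filestore_max_inline_xattr_size = 2048
        filestore_max_inline_xattrs = 10
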
|
|
||||||
|
|
||||||
``filestore_max_inline_xattr_size``
|
``filestore_max_inline_xattr_size``
|
||||||
|
|
||||||
:Description: The maximum size of an XATTR stored in the file system (i.e., XFS,
|
:Description: Defines the maximum size per object of an XATTR that can be
|
||||||
Btrfs, EXT4, etc.) per object. Should not be larger than the
|
stored in the file system (for example, XFS, Btrfs, ext4). The
|
||||||
file system can handle. Default value of 0 means to use the value
|
specified size should not be larger than the file system can
|
||||||
specific to the underlying file system.
|
handle. Using the default value of 0 instructs Filestore to use
|
||||||
|
the value specific to the file system.
|
||||||
:Type: Unsigned 32-bit Integer
|
:Type: Unsigned 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``0``
|
:Default: ``0``
|
||||||
@ -45,8 +56,9 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattr_size_xfs``
|
``filestore_max_inline_xattr_size_xfs``
|
||||||
|
|
||||||
:Description: The maximum size of an XATTR stored in the XFS file system.
|
:Description: Defines the maximum size of an XATTR that can be stored in the
|
||||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
XFS file system. This setting is used only if
|
||||||
|
``filestore_max_inline_xattr_size`` == 0.
|
||||||
:Type: Unsigned 32-bit Integer
|
:Type: Unsigned 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``65536``
|
:Default: ``65536``
|
||||||
@ -54,8 +66,9 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattr_size_btrfs``
|
``filestore_max_inline_xattr_size_btrfs``
|
||||||
|
|
||||||
:Description: The maximum size of an XATTR stored in the Btrfs file system.
|
:Description: Defines the maximum size of an XATTR that can be stored in the
|
||||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
Btrfs file system. This setting is used only if
|
||||||
|
``filestore_max_inline_xattr_size`` == 0.
|
||||||
:Type: Unsigned 32-bit Integer
|
:Type: Unsigned 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``2048``
|
:Default: ``2048``
|
||||||
@ -63,8 +76,8 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattr_size_other``
|
``filestore_max_inline_xattr_size_other``
|
||||||
|
|
||||||
:Description: The maximum size of an XATTR stored in other file systems.
|
:Description: Defines the maximum size of an XATTR that can be stored in other file systems.
|
||||||
Only used if ``filestore_max_inline_xattr_size`` == 0.
|
This setting is used only if ``filestore_max_inline_xattr_size`` == 0.
|
||||||
:Type: Unsigned 32-bit Integer
|
:Type: Unsigned 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``512``
|
:Default: ``512``
|
||||||
@ -72,9 +85,8 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattrs``
|
``filestore_max_inline_xattrs``
|
||||||
|
|
||||||
:Description: The maximum number of XATTRs stored in the file system per object.
|
:Description: Defines the maximum number of XATTRs per object that can be stored in the file system.
|
||||||
Default value of 0 means to use the value specific to the
|
Using the default value of 0 instructs Filestore to use the value specific to the file system.
|
||||||
underlying file system.
|
|
||||||
:Type: 32-bit Integer
|
:Type: 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``0``
|
:Default: ``0``
|
||||||
@ -82,8 +94,8 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattrs_xfs``
|
``filestore_max_inline_xattrs_xfs``
|
||||||
|
|
||||||
:Description: The maximum number of XATTRs stored in the XFS file system per object.
|
:Description: Defines the maximum number of XATTRs per object that can be stored in the XFS file system.
|
||||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||||
:Type: 32-bit Integer
|
:Type: 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``10``
|
:Default: ``10``
|
||||||
@ -91,8 +103,8 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattrs_btrfs``
|
``filestore_max_inline_xattrs_btrfs``
|
||||||
|
|
||||||
:Description: The maximum number of XATTRs stored in the Btrfs file system per object.
|
:Description: Defines the maximum number of XATTRs per object that can be stored in the Btrfs file system.
|
||||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||||
:Type: 32-bit Integer
|
:Type: 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``10``
|
:Default: ``10``
|
||||||
@ -100,8 +112,8 @@ threshold is reached.
|
|||||||
|
|
||||||
``filestore_max_inline_xattrs_other``
|
``filestore_max_inline_xattrs_other``
|
||||||
|
|
||||||
:Description: The maximum number of XATTRs stored in other file systems per object.
|
:Description: Defines the maximum number of XATTRs per object that can be stored in other file systems.
|
||||||
Only used if ``filestore_max_inline_xattrs`` == 0.
|
This setting is used only if ``filestore_max_inline_xattrs`` == 0.
|
||||||
:Type: 32-bit Integer
|
:Type: 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``2``
|
:Default: ``2``
|
||||||
@ -111,18 +123,19 @@ threshold is reached.
|
|||||||
Synchronization Intervals
|
Synchronization Intervals
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
Filestore needs to periodically quiesce writes and synchronize the
|
Filestore must periodically quiesce writes and synchronize the file system.
|
||||||
file system, which creates a consistent commit point. It can then free journal
|
Each synchronization creates a consistent commit point. When the commit point
|
||||||
entries up to the commit point. Synchronizing more frequently tends to reduce
|
is created, Filestore is able to free all journal entries up to that point.
|
||||||
the time required to perform synchronization, and reduces the amount of data
|
More-frequent synchronization tends to reduce both synchronization time and
|
||||||
that needs to remain in the journal. Less frequent synchronization allows the
|
the amount of data that needs to remain in the journal. Less-frequent
|
||||||
backing file system to coalesce small writes and metadata updates more
|
synchronization allows the backing file system to coalesce small writes and
|
||||||
optimally, potentially resulting in more efficient synchronization at the
|
metadata updates, potentially increasing synchronization
|
||||||
expense of potentially increasing tail latency.
|
efficiency but also potentially increasing tail latency.
|
||||||
|
|
||||||
|
|
||||||
``filestore_max_sync_interval``
|
``filestore_max_sync_interval``
|
||||||
|
|
||||||
:Description: The maximum interval in seconds for synchronizing Filestore.
|
:Description: Defines the maximum interval (in seconds) for synchronizing Filestore.
|
||||||
:Type: Double
|
:Type: Double
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``5``
|
:Default: ``5``
|
||||||
@ -130,7 +143,7 @@ expense of potentially increasing tail latency.
|
|||||||
|
|
||||||
``filestore_min_sync_interval``
|
``filestore_min_sync_interval``
|
||||||
|
|
||||||
:Description: The minimum interval in seconds for synchronizing Filestore.
|
:Description: Defines the minimum interval (in seconds) for synchronizing Filestore.
|
||||||
:Type: Double
|
:Type: Double
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``.01``
|
:Default: ``.01``
|
||||||
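As a hedged example (the values shown are illustrative only, not tuning advice), both intervals can be adjusted at runtime for all OSDs:

.. prompt:: bash $

   ceph config set osd filestore_min_sync_interval 0.05
   ceph config set osd filestore_max_sync_interval 10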
@ -142,14 +155,14 @@ Flusher
|
|||||||
=======
|
=======
|
||||||
|
|
||||||
The Filestore flusher forces data from large writes to be written out using
|
The Filestore flusher forces data from large writes to be written out using
|
||||||
``sync_file_range`` before the sync in order to (hopefully) reduce the cost of
|
``sync_file_range`` prior to the synchronization.
|
||||||
the eventual sync. In practice, disabling 'filestore_flusher' seems to improve
|
Ideally, this action reduces the cost of the eventual synchronization. In practice, however, disabling
|
||||||
performance in some cases.
|
``filestore_flusher`` seems in some cases to improve performance.
|
||||||
|
|
||||||
|
|
||||||
``filestore_flusher``
|
``filestore_flusher``
|
||||||
|
|
||||||
:Description: Enables the filestore flusher.
|
:Description: Enables the Filestore flusher.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -158,7 +171,7 @@ performance in some cases.
|
|||||||
|
|
||||||
``filestore_flusher_max_fds``
|
``filestore_flusher_max_fds``
|
||||||
|
|
||||||
:Description: Sets the maximum number of file descriptors for the flusher.
|
:Description: Defines the maximum number of file descriptors for the flusher.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``512``
|
:Default: ``512``
|
||||||
@ -176,7 +189,7 @@ performance in some cases.
|
|||||||
|
|
||||||
``filestore_fsync_flushes_journal_data``
|
``filestore_fsync_flushes_journal_data``
|
||||||
|
|
||||||
:Description: Flush journal data during file system synchronization.
|
:Description: Flushes journal data during file-system synchronization.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -187,11 +200,11 @@ performance in some cases.
|
|||||||
Queue
|
Queue
|
||||||
=====
|
=====
|
||||||
|
|
||||||
The following settings provide limits on the size of the Filestore queue.
|
The following settings define limits on the size of the Filestore queue:
|
||||||
|
|
||||||
``filestore_queue_max_ops``
|
``filestore_queue_max_ops``
|
||||||
|
|
||||||
:Description: Defines the maximum number of in progress operations the file store accepts before blocking on queuing new operations.
|
:Description: Defines the maximum number of in-progress operations that Filestore accepts before it blocks the queueing of any new operations.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No. Minimal impact on performance.
|
:Required: No. Minimal impact on performance.
|
||||||
:Default: ``50``
|
:Default: ``50``
|
||||||
@ -199,23 +212,20 @@ The following settings provide limits on the size of the Filestore queue.
|
|||||||
|
|
||||||
``filestore_queue_max_bytes``
|
``filestore_queue_max_bytes``
|
||||||
|
|
||||||
:Description: The maximum number of bytes for an operation.
|
:Description: Defines the maximum number of bytes permitted per operation.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``100 << 20``
|
:Default: ``100 << 20``
|
||||||
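Note that ``100 << 20`` is a bit-shift expression for 100 MiB::

    100 << 20 = 100 * 2^20 = 104857600 bytes

As an illustrative sketch only (not a tuning recommendation), both queue limits can be raised at runtime:

.. prompt:: bash $

   ceph config set osd filestore_queue_max_ops 500
   ceph config set osd filestore_queue_max_bytes 209715200   # 200 MiB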
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. index:: filestore; timeouts
|
.. index:: filestore; timeouts
|
||||||
|
|
||||||
Timeouts
|
Timeouts
|
||||||
========
|
========
|
||||||
|
|
||||||
|
|
||||||
``filestore_op_threads``
|
``filestore_op_threads``
|
||||||
|
|
||||||
:Description: The number of file system operation threads that execute in parallel.
|
:Description: Defines the number of file-system operation threads that execute in parallel.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``2``
|
:Default: ``2``
|
||||||
@ -223,7 +233,7 @@ Timeouts
|
|||||||
|
|
||||||
``filestore_op_thread_timeout``
|
``filestore_op_thread_timeout``
|
||||||
|
|
||||||
:Description: The timeout for a file system operation thread (in seconds).
|
:Description: Defines the timeout (in seconds) for a file-system operation thread.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``60``
|
:Default: ``60``
|
||||||
@ -231,7 +241,7 @@ Timeouts
|
|||||||
|
|
||||||
``filestore_op_thread_suicide_timeout``
|
``filestore_op_thread_suicide_timeout``
|
||||||
|
|
||||||
:Description: The timeout for a commit operation before cancelling the commit (in seconds).
|
:Description: Defines the timeout (in seconds) for a commit operation before the commit is cancelled.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``180``
|
:Default: ``180``
|
||||||
@ -245,17 +255,17 @@ B-Tree Filesystem
|
|||||||
|
|
||||||
``filestore_btrfs_snap``
|
``filestore_btrfs_snap``
|
||||||
|
|
||||||
:Description: Enable snapshots for a ``btrfs`` filestore.
|
:Description: Enables snapshots for a ``btrfs`` Filestore.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No. Only used for ``btrfs``.
|
:Required: No. Used only for ``btrfs``.
|
||||||
:Default: ``true``
|
:Default: ``true``
|
||||||
|
|
||||||
|
|
||||||
``filestore_btrfs_clone_range``
|
``filestore_btrfs_clone_range``
|
||||||
|
|
||||||
:Description: Enable cloning ranges for a ``btrfs`` filestore.
|
:Description: Enables cloning ranges for a ``btrfs`` Filestore.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No. Only used for ``btrfs``.
|
:Required: No. Used only for ``btrfs``.
|
||||||
:Default: ``true``
|
:Default: ``true``
|
||||||
|
|
||||||
|
|
||||||
@ -267,7 +277,7 @@ Journal
|
|||||||
|
|
||||||
``filestore_journal_parallel``
|
``filestore_journal_parallel``
|
||||||
|
|
||||||
:Description: Enables parallel journaling, default for Btrfs.
|
:Description: Enables parallel journaling, default for ``btrfs``.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -275,7 +285,7 @@ Journal
|
|||||||
|
|
||||||
``filestore_journal_writeahead``
|
``filestore_journal_writeahead``
|
||||||
|
|
||||||
:Description: Enables writeahead journaling, default for XFS.
|
:Description: Enables write-ahead journaling, default for XFS.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -283,7 +293,7 @@ Journal
|
|||||||
|
|
||||||
``filestore_journal_trailing``
|
``filestore_journal_trailing``
|
||||||
|
|
||||||
:Description: Deprecated, never use.
|
:Description: Deprecated. **Never use.**
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -295,8 +305,8 @@ Misc
|
|||||||
|
|
||||||
``filestore_merge_threshold``
|
``filestore_merge_threshold``
|
||||||
|
|
||||||
:Description: Min number of files in a subdir before merging into parent
|
:Description: Defines the minimum number of files permitted in a subdirectory before the subdirectory is merged into its parent directory.
|
||||||
NOTE: A negative value means to disable subdir merging
|
NOTE: A negative value means that subdirectory merging is disabled.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``-10``
|
:Default: ``-10``
|
||||||
@ -305,8 +315,8 @@ Misc
|
|||||||
``filestore_split_multiple``
|
``filestore_split_multiple``
|
||||||
|
|
||||||
:Description: ``(filestore_split_multiple * abs(filestore_merge_threshold) + (rand() % filestore_split_rand_factor)) * 16``
|
:Description: ``(filestore_split_multiple * abs(filestore_merge_threshold) + (rand() % filestore_split_rand_factor)) * 16``
|
||||||
is the maximum number of files in a subdirectory before
|
is the maximum number of files permitted in a subdirectory
|
||||||
splitting into child directories.
|
before the subdirectory is split into child directories.
|
||||||
|
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
@ -316,10 +326,10 @@ Misc
|
|||||||
``filestore_split_rand_factor``
|
``filestore_split_rand_factor``
|
||||||
|
|
||||||
:Description: A random factor added to the split threshold to avoid
|
:Description: A random factor added to the split threshold to avoid
|
||||||
too many (expensive) Filestore splits occurring at once. See
|
too many (expensive) Filestore splits occurring at the same time.
|
||||||
``filestore_split_multiple`` for details.
|
For details, see ``filestore_split_multiple``.
|
||||||
This can only be changed offline for an existing OSD,
|
To change this setting for an existing OSD, it is necessary to take the OSD
|
||||||
via the ``ceph-objectstore-tool apply-layout-settings`` command.
|
offline before running the ``ceph-objectstore-tool apply-layout-settings`` command.
|
||||||
|
|
||||||
:Type: Unsigned 32-bit Integer
|
:Type: Unsigned 32-bit Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
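As a purely illustrative calculation, assume ``filestore_merge_threshold = -10``, ``filestore_split_multiple = 2``, and ``filestore_split_rand_factor = 20`` (the latter two values are assumptions made for this example; check your cluster's actual settings). The split point for a subdirectory then falls in the range::

    (2 * abs(-10) + (rand() % 20)) * 16  =  320 to 624 files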
@ -328,7 +338,7 @@ Misc
|
|||||||
|
|
||||||
``filestore_update_to``
|
``filestore_update_to``
|
||||||
|
|
||||||
:Description: Limits Filestore auto upgrade to specified version.
|
:Description: Limits automatic upgrades to a specified version of Filestore. Useful in cases in which you want to avoid upgrading to a specific version.
|
||||||
:Type: Integer
|
:Type: Integer
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``1000``
|
:Default: ``1000``
|
||||||
@ -336,7 +346,7 @@ Misc
|
|||||||
|
|
||||||
``filestore_blackhole``
|
``filestore_blackhole``
|
||||||
|
|
||||||
:Description: Drop any new transactions on the floor.
|
:Description: Drops any new transactions on the floor, as if they were redirected to ``/dev/null``.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -344,7 +354,7 @@ Misc
|
|||||||
|
|
||||||
``filestore_dump_file``
|
``filestore_dump_file``
|
||||||
|
|
||||||
:Description: File onto which store transaction dumps.
|
:Description: Defines the file that transaction dumps are stored on.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -352,7 +362,7 @@ Misc
|
|||||||
|
|
||||||
``filestore_kill_at``
|
``filestore_kill_at``
|
||||||
|
|
||||||
:Description: inject a failure at the n'th opportunity
|
:Description: Injects a failure at the *n*\th opportunity.
|
||||||
:Type: String
|
:Type: String
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``false``
|
:Default: ``false``
|
||||||
@ -360,8 +370,7 @@ Misc
|
|||||||
|
|
||||||
``filestore_fail_eio``
|
``filestore_fail_eio``
|
||||||
|
|
||||||
:Description: Fail/Crash on eio.
|
:Description: Fail/Crash on EIO.
|
||||||
:Type: Boolean
|
:Type: Boolean
|
||||||
:Required: No
|
:Required: No
|
||||||
:Default: ``true``
|
:Default: ``true``
|
||||||
|
|
||||||
|
@ -16,24 +16,29 @@ consistent, but you can add, remove or replace a monitor in a cluster. See
|
|||||||
Background
|
Background
|
||||||
==========
|
==========
|
||||||
|
|
||||||
Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`, which means a
|
Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`.
|
||||||
:term:`Ceph Client` can determine the location of all Ceph Monitors, Ceph OSD
|
|
||||||
Daemons, and Ceph Metadata Servers just by connecting to one Ceph Monitor and
|
|
||||||
retrieving a current cluster map. Before Ceph Clients can read from or write to
|
|
||||||
Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor
|
|
||||||
first. With a current copy of the cluster map and the CRUSH algorithm, a Ceph
|
|
||||||
Client can compute the location for any object. The ability to compute object
|
|
||||||
locations allows a Ceph Client to talk directly to Ceph OSD Daemons, which is a
|
|
||||||
very important aspect of Ceph's high scalability and performance. See
|
|
||||||
`Scalability and High Availability`_ for additional details.
|
|
||||||
|
|
||||||
The primary role of the Ceph Monitor is to maintain a master copy of the cluster
|
The maintenance by Ceph Monitors of a :term:`Cluster Map` makes it possible for
|
||||||
map. Ceph Monitors also provide authentication and logging services. Ceph
|
a :term:`Ceph Client` to determine the location of all Ceph Monitors, Ceph OSD
|
||||||
Monitors write all changes in the monitor services to a single Paxos instance,
|
Daemons, and Ceph Metadata Servers by connecting to one Ceph Monitor and
|
||||||
and Paxos writes the changes to a key/value store for strong consistency. Ceph
|
retrieving a current cluster map. Before Ceph Clients can read from or write to
|
||||||
Monitors can query the most recent version of the cluster map during sync
|
Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor.
|
||||||
operations. Ceph Monitors leverage the key/value store's snapshots and iterators
|
When a Ceph client has a current copy of the cluster map and the CRUSH
|
||||||
(using leveldb) to perform store-wide synchronization.
|
algorithm, it can compute the location for any RADOS object within the
|
||||||
|
cluster. This ability to compute the locations of objects makes it possible for
|
||||||
|
Ceph Clients to talk directly to Ceph OSD Daemons. This direct communication
|
||||||
|
with Ceph OSD Daemons represents an improvement upon traditional storage
|
||||||
|
architectures in which clients were required to communicate with a central
|
||||||
|
component, and that improvement contributes to Ceph's high scalability and
|
||||||
|
performance. See `Scalability and High Availability`_ for additional details.
|
||||||
|
|
||||||
|
The Ceph Monitor's primary function is to maintain a master copy of the cluster
|
||||||
|
map. Monitors also provide authentication and logging services. All changes in
|
||||||
|
the monitor services are written by the Ceph Monitor to a single Paxos
|
||||||
|
instance, and Paxos writes the changes to a key/value store for strong
|
||||||
|
consistency. Ceph Monitors are able to query the most recent version of the
|
||||||
|
cluster map during sync operations, and they use the key/value store's
|
||||||
|
snapshots and iterators (using leveldb) to perform store-wide synchronization.
|
||||||
|
|
||||||
.. ditaa::
|
.. ditaa::
|
||||||
/-------------\ /-------------\
|
/-------------\ /-------------\
|
||||||
@ -56,12 +61,6 @@ operations. Ceph Monitors leverage the key/value store's snapshots and iterators
|
|||||||
| cCCC |*---------------------+
|
| cCCC |*---------------------+
|
||||||
\-------------/
|
\-------------/
|
||||||
|
|
||||||
|
|
||||||
.. deprecated:: version 0.58
|
|
||||||
|
|
||||||
In Ceph versions 0.58 and earlier, Ceph Monitors use a Paxos instance for
|
|
||||||
each service and store the map as a file.
|
|
||||||
|
|
||||||
.. index:: Ceph Monitor; cluster map
|
.. index:: Ceph Monitor; cluster map
|
||||||
|
|
||||||
Cluster Maps
|
Cluster Maps
|
||||||
|
@ -25,6 +25,7 @@ There are two Ceph daemons that store data on devices:
|
|||||||
additional monitoring and providing interfaces to external
|
additional monitoring and providing interfaces to external
|
||||||
monitoring and management systems.
|
monitoring and management systems.
|
||||||
|
|
||||||
|
.. _rados_config_storage_devices_osd_backends:
|
||||||
|
|
||||||
OSD Back Ends
|
OSD Back Ends
|
||||||
=============
|
=============
|
||||||
|
@ -3,14 +3,15 @@
|
|||||||
Balancer
|
Balancer
|
||||||
========
|
========
|
||||||
|
|
||||||
The *balancer* can optimize the placement of PGs across OSDs in
|
The *balancer* can optimize the allocation of placement groups (PGs) across
|
||||||
order to achieve a balanced distribution, either automatically or in a
|
OSDs in order to achieve a balanced distribution. The balancer can operate
|
||||||
supervised fashion.
|
either automatically or in a supervised fashion.
|
||||||
|
|
||||||
|
|
||||||
Status
|
Status
|
||||||
------
|
------
|
||||||
|
|
||||||
The current status of the balancer can be checked at any time with:
|
To check the current status of the balancer, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -20,70 +21,78 @@ The current status of the balancer can be checked at any time with:
|
|||||||
Automatic balancing
|
Automatic balancing
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
The automatic balancing feature is enabled by default in ``upmap``
|
When the balancer is in ``upmap`` mode, the automatic balancing feature is
|
||||||
mode. Please refer to :ref:`upmap` for more details. The balancer can be
|
enabled by default. For more details, see :ref:`upmap`. To disable the
|
||||||
turned off with:
|
balancer, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer off
|
ceph balancer off
|
||||||
|
|
||||||
The balancer mode can be changed to ``crush-compat`` mode, which is
|
The balancer mode can be changed from ``upmap`` mode to ``crush-compat`` mode.
|
||||||
backward compatible with older clients, and will make small changes to
|
``crush-compat`` mode is backward compatible with older clients. In
|
||||||
the data distribution over time to ensure that OSDs are equally utilized.
|
``crush-compat`` mode, the balancer automatically makes small changes to the
|
||||||
|
data distribution in order to ensure that OSDs are utilized equally.
|
||||||
|
|
||||||
|
|
||||||
Throttling
|
Throttling
|
||||||
----------
|
----------
|
||||||
|
|
||||||
No adjustments will be made to the PG distribution if the cluster is
|
If the cluster is degraded (that is, if an OSD has failed and the system hasn't
|
||||||
degraded (e.g., because an OSD has failed and the system has not yet
|
healed itself yet), then the balancer will not make any adjustments to the PG
|
||||||
healed itself).
|
distribution.
|
||||||
|
|
||||||
When the cluster is healthy, the balancer will throttle its changes
|
When the cluster is healthy, the balancer will incrementally move a small
|
||||||
such that the percentage of PGs that are misplaced (i.e., that need to
|
fraction of unbalanced PGs in order to improve distribution. This fraction
|
||||||
be moved) is below a threshold of (by default) 5%. The
|
will not exceed a certain threshold that defaults to 5%. To adjust this
|
||||||
``target_max_misplaced_ratio`` threshold can be adjusted with:
|
``target_max_misplaced_ratio`` threshold setting, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr target_max_misplaced_ratio .07 # 7%
|
ceph config set mgr target_max_misplaced_ratio .07 # 7%
|
||||||
|
|
||||||
Set the number of seconds to sleep in between runs of the automatic balancer:
|
The balancer sleeps between runs. To set the number of seconds for this
|
||||||
|
interval of sleep, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/balancer/sleep_interval 60
|
ceph config set mgr mgr/balancer/sleep_interval 60
|
||||||
|
|
||||||
Set the time of day to begin automatic balancing in HHMM format:
|
To set the time of day (in HHMM format) at which automatic balancing begins,
|
||||||
|
run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/balancer/begin_time 0000
|
ceph config set mgr mgr/balancer/begin_time 0000
|
||||||
|
|
||||||
Set the time of day to finish automatic balancing in HHMM format:
|
To set the time of day (in HHMM format) at which automatic balancing ends, run
|
||||||
|
the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/balancer/end_time 2359
|
ceph config set mgr mgr/balancer/end_time 2359
|
||||||
|
|
||||||
Restrict automatic balancing to this day of the week or later.
|
Automatic balancing can be restricted to certain days of the week. To restrict
|
||||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
it to a specific day of the week or later (as with crontab, ``0`` is Sunday,
|
||||||
|
``1`` is Monday, and so on), run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/balancer/begin_weekday 0
|
ceph config set mgr mgr/balancer/begin_weekday 0
|
||||||
|
|
||||||
Restrict automatic balancing to this day of the week or earlier.
|
To restrict automatic balancing to a specific day of the week or earlier
|
||||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
(again, ``0`` is Sunday, ``1`` is Monday, and so on), run the following
|
||||||
|
command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/balancer/end_weekday 6
|
ceph config set mgr mgr/balancer/end_weekday 6
|
||||||
|
|
||||||
Pool IDs to which the automatic balancing will be limited.
|
Automatic balancing can be restricted to certain pools. By default, the value
|
||||||
The default for this is an empty string, meaning all pools will be balanced.
|
of this setting is an empty string, so that all pools are automatically
|
||||||
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command:
|
balanced. To restrict automatic balancing to specific pools, retrieve their
|
||||||
|
numeric pool IDs (by running the :command:`ceph osd pool ls detail` command),
|
||||||
|
and then run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -93,43 +102,41 @@ The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` c
|
|||||||
Modes
|
Modes
|
||||||
-----
|
-----
|
||||||
|
|
||||||
There are currently two supported balancer modes:
|
There are two supported balancer modes:
|
||||||
|
|
||||||
#. **crush-compat**. The CRUSH compat mode uses the compat weight-set
|
#. **crush-compat**. This mode uses the compat weight-set feature (introduced
|
||||||
feature (introduced in Luminous) to manage an alternative set of
|
in Luminous) to manage an alternative set of weights for devices in the
|
||||||
weights for devices in the CRUSH hierarchy. The normal weights
|
CRUSH hierarchy. When the balancer is operating in this mode, the normal
|
||||||
should remain set to the size of the device to reflect the target
|
weights should remain set to the size of the device in order to reflect the
|
||||||
amount of data that we want to store on the device. The balancer
|
target amount of data intended to be stored on the device. The balancer will
|
||||||
then optimizes the weight-set values, adjusting them up or down in
|
then optimize the weight-set values, adjusting them up or down in small
|
||||||
small increments, in order to achieve a distribution that matches
|
increments, in order to achieve a distribution that matches the target
|
||||||
the target distribution as closely as possible. (Because PG
|
distribution as closely as possible. (Because PG placement is a pseudorandom
|
||||||
placement is a pseudorandom process, there is a natural amount of
|
process, it is subject to a natural amount of variation; optimizing the
|
||||||
variation in the placement; by optimizing the weights we
|
weights serves to counteract that natural variation.)
|
||||||
counter-act that natural variation.)
|
|
||||||
|
|
||||||
Notably, this mode is *fully backwards compatible* with older
|
Note that this mode is *fully backward compatible* with older clients: when
|
||||||
clients: when an OSDMap and CRUSH map is shared with older clients,
|
an OSD Map and CRUSH map are shared with older clients, Ceph presents the
|
||||||
we present the optimized weights as the "real" weights.
|
optimized weights as the "real" weights.
|
||||||
|
|
||||||
The primary restriction of this mode is that the balancer cannot
|
The primary limitation of this mode is that the balancer cannot handle
|
||||||
handle multiple CRUSH hierarchies with different placement rules if
|
multiple CRUSH hierarchies with different placement rules if the subtrees of
|
||||||
the subtrees of the hierarchy share any OSDs. (This is normally
|
the hierarchy share any OSDs. (Such sharing of OSDs is not typical and,
|
||||||
not the case, and is generally not a recommended configuration
|
because of the difficulty of managing the space utilization on the shared
|
||||||
because it is hard to manage the space utilization on the shared
|
OSDs, is generally not recommended.)
|
||||||
OSDs.)
|
|
||||||
|
|
||||||
#. **upmap**. Starting with Luminous, the OSDMap can store explicit
|
#. **upmap**. In Luminous and later releases, the OSDMap can store explicit
|
||||||
mappings for individual OSDs as exceptions to the normal CRUSH
|
mappings for individual OSDs as exceptions to the normal CRUSH placement
|
||||||
placement calculation. These `upmap` entries provide fine-grained
|
calculation. These ``upmap`` entries provide fine-grained control over the
|
||||||
control over the PG mapping. This CRUSH mode will optimize the
|
PG mapping. This balancer mode optimizes the placement of individual PGs in
|
||||||
placement of individual PGs in order to achieve a balanced
|
order to achieve a balanced distribution. In most cases, the resulting
|
||||||
distribution. In most cases, this distribution is "perfect," which
|
distribution is nearly perfect: that is, there is an equal number of PGs on
|
||||||
an equal number of PGs on each OSD (+/-1 PG, since they might not
|
each OSD (±1 PG, since the total number might not divide evenly).
|
||||||
divide evenly).
|
|
||||||
|
|
||||||
Note that using upmap requires that all clients be Luminous or newer.
|
To use ``upmap``, all clients must be Luminous or newer.
|
||||||
|
|
||||||
The default mode is ``upmap``. The mode can be adjusted with:
|
The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
|
||||||
|
running the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -138,69 +145,77 @@ The default mode is ``upmap``. The mode can be adjusted with:
|
|||||||
Supervised optimization
|
Supervised optimization
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
The balancer operation is broken into a few distinct phases:
|
Supervised use of the balancer can be understood in terms of three distinct
|
||||||
|
phases:
|
||||||
|
|
||||||
#. building a *plan*
|
#. building a plan
|
||||||
#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
|
#. evaluating the quality of the data distribution, either for the current PG
|
||||||
#. executing the *plan*
|
distribution or for the PG distribution that would result after executing a
|
||||||
|
plan
|
||||||
|
#. executing the plan
|
||||||
|
|
||||||
To evaluate and score the current distribution:
|
To evaluate the current distribution, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer eval
|
ceph balancer eval
|
||||||
|
|
||||||
You can also evaluate the distribution for a single pool with:
|
To evaluate the distribution for a single pool, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer eval <pool-name>
|
ceph balancer eval <pool-name>
|
||||||
|
|
||||||
Greater detail for the evaluation can be seen with:
|
To see the evaluation in greater detail, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer eval-verbose ...
|
ceph balancer eval-verbose ...
|
||||||
|
|
||||||
The balancer can generate a plan, using the currently configured mode, with:
|
To instruct the balancer to generate a plan (using the currently configured
|
||||||
|
mode), make up a name (any useful identifying string) for the plan, and run the
|
||||||
|
following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer optimize <plan-name>
|
ceph balancer optimize <plan-name>
|
||||||
|
|
||||||
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with:
|
To see the contents of a plan, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer show <plan-name>
|
ceph balancer show <plan-name>
|
||||||
|
|
||||||
All plans can be shown with:
|
To display all plans, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer ls
|
ceph balancer ls
|
||||||
|
|
||||||
Old plans can be discarded with:
|
To discard an old plan, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer rm <plan-name>
|
ceph balancer rm <plan-name>
|
||||||
|
|
||||||
Currently recorded plans are shown as part of the status command:
|
To see currently recorded plans, examine the output of the following status
|
||||||
|
command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer status
|
ceph balancer status
|
||||||
|
|
||||||
The quality of the distribution that would result after executing a plan can be calculated with:
|
To evaluate the distribution that would result from executing a specific plan,
|
||||||
|
run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer eval <plan-name>
|
ceph balancer eval <plan-name>
|
||||||
|
|
||||||
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with:
|
If a plan is expected to improve the distribution (that is, the plan's score is
|
||||||
|
lower than the current cluster state's score), you can execute that plan by
|
||||||
|
running the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph balancer execute <plan-name>
|
ceph balancer execute <plan-name>
|
||||||
|
|
||||||
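A minimal end-to-end supervised session might look like the following; the plan name ``myplan`` is arbitrary:

.. prompt:: bash $

   ceph balancer eval                # score the current distribution
   ceph balancer optimize myplan     # build a plan using the current mode
   ceph balancer eval myplan         # score the distribution the plan would produce
   ceph balancer execute myplan      # apply the plan if its score is lower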
|
@ -1,3 +1,5 @@
|
|||||||
|
.. _rados_operations_bluestore_migration:
|
||||||
|
|
||||||
=====================
|
=====================
|
||||||
BlueStore Migration
|
BlueStore Migration
|
||||||
=====================
|
=====================
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
===============
|
===============
|
||||||
Cache Tiering
|
Cache Tiering
|
||||||
===============
|
===============
|
||||||
|
.. warning:: Cache tiering has been deprecated in the Reef release as it
|
||||||
|
has lacked a maintainer for a very long time. This does not mean
|
||||||
|
it will be certainly removed, but we may choose to remove it
|
||||||
|
without much further notice.
|
||||||
|
|
||||||
A cache tier provides Ceph Clients with better I/O performance for a subset of
|
A cache tier provides Ceph Clients with better I/O performance for a subset of
|
||||||
the data stored in a backing storage tier. Cache tiering involves creating a
|
the data stored in a backing storage tier. Cache tiering involves creating a
|
||||||
|
@ -315,7 +315,7 @@ the hierarchy is visible as a separate column (labeled either
|
|||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph osd tree
|
ceph osd crush tree
|
||||||
|
|
||||||
When both *compat* and *per-pool* weight sets are in use, data
|
When both *compat* and *per-pool* weight sets are in use, data
|
||||||
placement for a particular pool will use its own per-pool weight set
|
placement for a particular pool will use its own per-pool weight set
|
||||||
|
@ -2,40 +2,44 @@
|
|||||||
Data Placement Overview
|
Data Placement Overview
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
Ceph stores, replicates and rebalances data objects across a RADOS cluster
|
Ceph stores, replicates, and rebalances data objects across a RADOS cluster
|
||||||
dynamically. With many different users storing objects in different pools for
|
dynamically. Because different users store objects in different pools for
|
||||||
different purposes on countless OSDs, Ceph operations require some data
|
different purposes on many OSDs, Ceph operations require a certain amount of
|
||||||
placement planning. The main data placement planning concepts in Ceph include:
|
data-placement planning. The main data-placement planning concepts in Ceph
|
||||||
|
include:
|
||||||
|
|
||||||
- **Pools:** Ceph stores data within pools, which are logical groups for storing
|
- **Pools:** Ceph stores data within pools, which are logical groups used for
|
||||||
objects. Pools manage the number of placement groups, the number of replicas,
|
storing objects. Pools manage the number of placement groups, the number of
|
||||||
and the CRUSH rule for the pool. To store data in a pool, you must have
|
replicas, and the CRUSH rule for the pool. To store data in a pool, it is
|
||||||
an authenticated user with permissions for the pool. Ceph can snapshot pools.
|
necessary to be an authenticated user with permissions for the pool. Ceph is
|
||||||
See `Pools`_ for additional details.
|
able to make snapshots of pools. For additional details, see `Pools`_.
|
||||||
|
|
||||||
- **Placement Groups:** Ceph maps objects to placement groups (PGs).
|
- **Placement Groups:** Ceph maps objects to placement groups. Placement
|
||||||
Placement groups (PGs) are shards or fragments of a logical object pool
|
groups (PGs) are shards or fragments of a logical object pool that place
|
||||||
that place objects as a group into OSDs. Placement groups reduce the amount
|
objects as a group into OSDs. Placement groups reduce the amount of
|
||||||
of per-object metadata when Ceph stores the data in OSDs. A larger number of
|
per-object metadata that is necessary for Ceph to store the data in OSDs. A
|
||||||
placement groups (e.g., 100 per OSD) leads to better balancing. See
|
greater number of placement groups (for example, 100 PGs per OSD as compared
|
||||||
`Placement Groups`_ for additional details.
|
with 50 PGs per OSD) leads to better balancing.
|
||||||
|
|
||||||
- **CRUSH Maps:** CRUSH is a big part of what allows Ceph to scale without
|
- **CRUSH Maps:** CRUSH plays a major role in allowing Ceph to scale while
|
||||||
performance bottlenecks, without limitations to scalability, and without a
|
avoiding certain pitfalls, such as performance bottlenecks, limitations to
|
||||||
single point of failure. CRUSH maps provide the physical topology of the
|
scalability, and single points of failure. CRUSH maps provide the physical
|
||||||
cluster to the CRUSH algorithm to determine where the data for an object
|
topology of the cluster to the CRUSH algorithm, so that it can determine both
|
||||||
and its replicas should be stored, and how to do so across failure domains
|
(1) where the data for an object and its replicas should be stored and (2)
|
||||||
for added data safety among other things. See `CRUSH Maps`_ for additional
|
how to store that data across failure domains so as to improve data safety.
|
||||||
details.
|
For additional details, see `CRUSH Maps`_.
|
||||||
|
|
||||||
- **Balancer:** The balancer is a feature that will automatically optimize the
|
- **Balancer:** The balancer is a feature that automatically optimizes the
|
||||||
distribution of PGs across devices to achieve a balanced data distribution,
|
distribution of placement groups across devices in order to achieve a
|
||||||
maximizing the amount of data that can be stored in the cluster and evenly
|
balanced data distribution, in order to maximize the amount of data that can
|
||||||
distributing the workload across OSDs.
|
be stored in the cluster, and in order to evenly distribute the workload
|
||||||
|
across OSDs.
|
||||||
|
|
||||||
When you initially set up a test cluster, you can use the default values. Once
|
It is possible to use the default values for each of the above components.
|
||||||
you begin planning for a large Ceph cluster, refer to pools, placement groups
|
Default values are recommended for a test cluster's initial setup. However,
|
||||||
and CRUSH for data placement operations.
|
when planning a large Ceph cluster, values should be customized for
|
||||||
|
data-placement operations with reference to the different roles played by
|
||||||
|
pools, placement groups, and CRUSH.
|
||||||
|
|
||||||
.. _Pools: ../pools
|
.. _Pools: ../pools
|
||||||
.. _Placement Groups: ../placement-groups
|
.. _Placement Groups: ../placement-groups
|
||||||
|
@ -3,28 +3,32 @@
|
|||||||
Device Management
|
Device Management
|
||||||
=================
|
=================
|
||||||
|
|
||||||
Ceph tracks which hardware storage devices (e.g., HDDs, SSDs) are consumed by
|
Device management allows Ceph to address hardware failure. Ceph tracks hardware
|
||||||
which daemons, and collects health metrics about those devices in order to
|
storage devices (HDDs, SSDs) to see which devices are managed by which daemons.
|
||||||
provide tools to predict and/or automatically respond to hardware failure.
|
Ceph also collects health metrics about these devices. By doing so, Ceph can
|
||||||
|
provide tools that predict hardware failure and can automatically respond to
|
||||||
|
hardware failure.
|
||||||
|
|
||||||
Device tracking
|
Device tracking
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
You can query which storage devices are in use with:
|
To see a list of the storage devices that are in use, run the following
|
||||||
|
command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device ls
|
ceph device ls
|
||||||
|
|
||||||
You can also list devices by daemon or by host:
|
Alternatively, to list devices by daemon or by host, run a command of one of
|
||||||
|
the following forms:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device ls-by-daemon <daemon>
|
ceph device ls-by-daemon <daemon>
|
||||||
ceph device ls-by-host <host>
|
ceph device ls-by-host <host>
|
||||||
|
|
||||||
For any individual device, you can query information about its
|
To see information about the location of a specific device and about how the
|
||||||
location and how it is being consumed with:
|
device is being consumed, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -33,103 +37,107 @@ location and how it is being consumed with:
|
|||||||
Identifying physical devices
|
Identifying physical devices
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
You can blink the drive LEDs on hardware enclosures to make the replacement of
|
To make the replacement of failed disks easier and less error-prone, you can
|
||||||
failed disks easy and less error-prone. Use the following command::
|
(in some cases) "blink" the drive's LEDs on hardware enclosures by running a
|
||||||
|
command of the following form::
|
||||||
|
|
||||||
device light on|off <devid> [ident|fault] [--force]
|
device light on|off <devid> [ident|fault] [--force]
|
||||||
|
|
||||||
The ``<devid>`` parameter is the device identification. You can obtain this
|
.. note:: Using this command to blink the lights might not work. Whether it
|
||||||
information using the following command:
|
works will depend upon such factors as your kernel revision, your SES
|
||||||
|
firmware, or the setup of your HBA.
|
||||||
|
|
||||||
|
The ``<devid>`` parameter is the device identification. To retrieve this
|
||||||
|
information, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device ls
|
ceph device ls
|
||||||
|
|
||||||
The ``[ident|fault]`` parameter is used to set the kind of light to blink.
|
The ``[ident|fault]`` parameter determines which kind of light will blink. By
|
||||||
By default, the `identification` light is used.
|
default, the `identification` light is used.
|
||||||
|
|
||||||
.. note::
|
.. note:: This command works only if the Cephadm or the Rook `orchestrator
|
||||||
This command needs the Cephadm or the Rook `orchestrator <https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_ module enabled.
|
<https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_
|
||||||
The orchestrator module enabled is shown by executing the following command:
|
module is enabled. To see which orchestrator module is enabled, run the
|
||||||
|
following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph orch status
|
ceph orch status
|
||||||
|
|
||||||
The command behind the scene to blink the drive LEDs is `lsmcli`. If you need
|
The command that makes the drive's LEDs blink is `lsmcli`. To customize this
|
||||||
to customize this command you can configure this via a Jinja2 template::
|
command, configure it via a Jinja2 template by running commands of the
|
||||||
|
following forms::
|
||||||
|
|
||||||
ceph config-key set mgr/cephadm/blink_device_light_cmd "<template>"
|
ceph config-key set mgr/cephadm/blink_device_light_cmd "<template>"
|
||||||
ceph config-key set mgr/cephadm/<host>/blink_device_light_cmd "lsmcli local-disk-{{ ident_fault }}-led-{{'on' if on else 'off'}} --path '{{ path or dev }}'"
|
ceph config-key set mgr/cephadm/<host>/blink_device_light_cmd "lsmcli local-disk-{{ ident_fault }}-led-{{'on' if on else 'off'}} --path '{{ path or dev }}'"
|
||||||
|
|
||||||
The Jinja2 template is rendered using the following arguments:
|
The following arguments can be used to customize the Jinja2 template:
|
||||||
|
|
||||||
* ``on``
|
* ``on``
|
||||||
A boolean value.
|
A boolean value.
|
||||||
* ``ident_fault``
|
* ``ident_fault``
|
||||||
A string containing `ident` or `fault`.
|
A string that contains `ident` or `fault`.
|
||||||
* ``dev``
|
* ``dev``
|
||||||
A string containing the device ID, e.g. `SanDisk_X400_M.2_2280_512GB_162924424784`.
|
A string that contains the device ID: for example, `SanDisk_X400_M.2_2280_512GB_162924424784`.
|
||||||
* ``path``
|
* ``path``
|
||||||
A string containing the device path, e.g. `/dev/sda`.
|
A string that contains the device path: for example, `/dev/sda`.
|
||||||
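For example, assuming hypothetical values of ``on=True``, ``ident_fault='ident'``, and ``path='/dev/sda'``, the per-host template shown above would render to::

    lsmcli local-disk-ident-led-on --path '/dev/sda'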
|
|
||||||
.. _enabling-monitoring:
|
.. _enabling-monitoring:
|
||||||
|
|
||||||
Enabling monitoring
|
Enabling monitoring
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
Ceph can also monitor health metrics associated with your device. For
|
Ceph can also monitor the health metrics associated with your device. For
|
||||||
example, SATA hard disks implement a standard called SMART that
|
example, SATA drives implement a standard called SMART that provides a wide
|
||||||
provides a wide range of internal metrics about the device's usage and
|
range of internal metrics about the device's usage and health (for example: the
|
||||||
health, like the number of hours powered on, number of power cycles,
|
number of hours powered on, the number of power cycles, the number of
|
||||||
or unrecoverable read errors. Other device types like SAS and NVMe
|
unrecoverable read errors). Other device types such as SAS and NVMe present a
|
||||||
implement a similar set of metrics (via slightly different standards).
|
similar set of metrics (via slightly different standards). All of these
|
||||||
All of these can be collected by Ceph via the ``smartctl`` tool.
|
metrics can be collected by Ceph via the ``smartctl`` tool.
|
||||||
|
|
||||||
You can enable or disable health monitoring with:
|
You can enable or disable health monitoring by running one of the following
|
||||||
|
commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device monitoring on
|
ceph device monitoring on
|
||||||
|
|
||||||
or:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
|
||||||
|
|
||||||
ceph device monitoring off
|
ceph device monitoring off
|
||||||
|
|
||||||
|
|
||||||
Scraping
|
Scraping
|
||||||
--------
|
--------
|
||||||
|
|
||||||
If monitoring is enabled, metrics will automatically be scraped at regular intervals. That interval can be configured with:
|
If monitoring is enabled, device metrics will be scraped automatically at
|
||||||
|
regular intervals. To configure that interval, run a command of the following
|
||||||
|
form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
|
ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
|
||||||
|
|
||||||
The default is to scrape once every 24 hours.
|
By default, device metrics are scraped once every 24 hours.
|
||||||
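For example, to scrape every 12 hours instead (43200 seconds; an illustrative value):

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/scrape_frequency 43200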
|
|
||||||
You can manually trigger a scrape of all devices with:
|
To manually scrape all devices, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device scrape-health-metrics
|
ceph device scrape-health-metrics
|
||||||
|
|
||||||
A single device can be scraped with:
|
To scrape a single device, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device scrape-health-metrics <device-id>
|
ceph device scrape-health-metrics <device-id>
|
||||||
|
|
||||||
Or a single daemon's devices can be scraped with:
|
To scrape a single daemon's devices, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device scrape-daemon-health-metrics <who>
|
ceph device scrape-daemon-health-metrics <who>
|
||||||
|
|
||||||
The stored health metrics for a device can be retrieved (optionally
|
To retrieve the stored health metrics for a device (optionally for a specific
|
||||||
for a specific timestamp) with:
|
timestamp), run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -138,71 +146,82 @@ for a specific timestamp) with:
|
|||||||
Failure prediction
|
Failure prediction
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
Ceph can predict life expectancy and device failures based on the
|
Ceph can predict drive life expectancy and device failures by analyzing the
|
||||||
health metrics it collects. There are three modes:
|
health metrics that it collects. The prediction modes are as follows:
|
||||||
|
|
||||||
* *none*: disable device failure prediction.
|
* *none*: disable device failure prediction.
|
||||||
* *local*: use a pre-trained prediction model from the ceph-mgr daemon
|
* *local*: use a pre-trained prediction model from the ``ceph-mgr`` daemon.
|
||||||
|
|
||||||
The prediction mode can be configured with:
|
To configure the prediction mode, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph config set global device_failure_prediction_mode <mode>
|
ceph config set global device_failure_prediction_mode <mode>
|
||||||
|
|
||||||
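For example, to select the pre-trained local model:

.. prompt:: bash $

   ceph config set global device_failure_prediction_mode local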
Prediction normally runs in the background on a periodic basis, so it
|
Under normal conditions, failure prediction runs periodically in the
|
||||||
may take some time before life expectancy values are populated. You
|
background. For this reason, life expectancy values might be populated only
|
||||||
can see the life expectancy of all devices in output from:
|
after a significant amount of time has passed. The life expectancy of all
|
||||||
|
devices is displayed in the output of the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device ls
|
ceph device ls
|
||||||
|
|
||||||
You can also query the metadata for a specific device with:
|
To see the metadata of a specific device, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device info <devid>
|
ceph device info <devid>
|
||||||
|
|
||||||
You can explicitly force prediction of a device's life expectancy with:
|
To explicitly force prediction of a specific device's life expectancy, run a
|
||||||
|
command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device predict-life-expectancy <devid>
|
ceph device predict-life-expectancy <devid>
|
||||||
|
|
||||||
If you are not using Ceph's internal device failure prediction but
|
In addition to Ceph's internal device failure prediction, you might have an
|
||||||
have some external source of information about device failures, you
|
external source of information about device failures. To inform Ceph of a
|
||||||
can inform Ceph of a device's life expectancy with:
|
specific device's life expectancy, run a command of the following form:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device set-life-expectancy <devid> <from> [<to>]
|
ceph device set-life-expectancy <devid> <from> [<to>]
|
||||||
|
|
||||||
Life expectancies are expressed as a time interval so that
|
Life expectancies are expressed as a time interval. This means that the
|
||||||
uncertainty can be expressed in the form of a wide interval. The
|
uncertainty of the life expectancy can be expressed in the form of a range of
|
||||||
interval end can also be left unspecified.
|
time (possibly a wide one). The interval's end can be left
|
||||||
|
unspecified.
|
||||||
|
|
||||||
Health alerts
|
Health alerts
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
The ``mgr/devicehealth/warn_threshold`` controls how soon an expected
|
The ``mgr/devicehealth/warn_threshold`` configuration option controls the
|
||||||
device failure must be before we generate a health warning.
|
health check for an expected device failure. If the device is expected to fail
|
||||||
|
within the specified time interval, an alert is raised.
|
||||||
|
|
||||||
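Assuming that this option, like other ``mgr`` interval options, accepts a value in seconds, it can be adjusted with a command of the following form (the value here is illustrative only):

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/warn_threshold 4838400   # 8 weeks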
The stored life expectancy of all devices can be checked, and any
|
To check the stored life expectancy of all devices and generate any appropriate
|
||||||
appropriate health alerts generated, with:
|
health alert, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph device check-health
|
ceph device check-health
|
||||||
|
|
||||||
Automatic Mitigation
|
Automatic Migration
|
||||||
--------------------
|
-------------------
|
||||||
|
|
||||||
If the ``mgr/devicehealth/self_heal`` option is enabled (it is by
|
The ``mgr/devicehealth/self_heal`` option (enabled by default) automatically
|
||||||
default), then for devices that are expected to fail soon the module
|
migrates data away from devices that are expected to fail soon. If this option
|
||||||
will automatically migrate data away from them by marking the devices
|
is enabled, the module marks such devices ``out`` so that automatic migration
|
||||||
"out".
|
will occur.
|
||||||
|
|
||||||
The ``mgr/devicehealth/mark_out_threshold`` controls how soon an
|
.. note:: The ``mon_osd_min_up_ratio`` configuration option can help prevent
|
||||||
expected device failure must be before we automatically mark an osd
|
this process from cascading to total failure. If the "self heal" module
|
||||||
"out".
|
marks ``out`` so many OSDs that the ratio value of ``mon_osd_min_up_ratio``
|
||||||
|
is exceeded, then the cluster raises the ``DEVICE_HEALTH_TOOMANY`` health
|
||||||
|
check. For instructions on what to do in this situation, see
|
||||||
|
:ref:`DEVICE_HEALTH_TOOMANY<rados_health_checks_device_health_toomany>`.
|
||||||
|
|
||||||
|
The ``mgr/devicehealth/mark_out_threshold`` configuration option specifies the
|
||||||
|
time interval for automatic migration. If a device is expected to fail within
|
||||||
|
the specified time interval, it will be automatically marked ``out``.
|
||||||
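As a sketch (the interval shown is an arbitrary example and is assumed to be
given in seconds), the threshold can be adjusted, or the self-heal behavior
disabled entirely, with ``ceph config``:

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/mark_out_threshold 2419200
   ceph config set mgr mgr/devicehealth/self_heal false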
|
@ -6,9 +6,11 @@ The *jerasure* plugin is the most generic and flexible plugin, it is
|
|||||||
also the default for Ceph erasure coded pools.
|
also the default for Ceph erasure coded pools.
|
||||||
|
|
||||||
The *jerasure* plugin encapsulates the `Jerasure
|
The *jerasure* plugin encapsulates the `Jerasure
|
||||||
<http://jerasure.org>`_ library. It is
|
<https://github.com/ceph/jerasure>`_ library. It is
|
||||||
recommended to read the *jerasure* documentation to get a better
|
recommended to read the ``jerasure`` documentation to
|
||||||
understanding of the parameters.
|
understand the parameters. Note that the ``jerasure.org``
|
||||||
|
web site as of 2023 may no longer be connected to the original
|
||||||
|
project or legitimate.
|
||||||
|
|
||||||
Create a jerasure profile
|
Create a jerasure profile
|
||||||
=========================
|
=========================
|
||||||
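As a brief illustration (the profile name and parameter values here are
arbitrary examples, not recommendations), a profile that uses this plugin can
be created with ``ceph osd erasure-code-profile set``:

.. prompt:: bash $

   ceph osd erasure-code-profile set jerasure42 \
        plugin=jerasure k=4 m=2 technique=reed_sol_van \
        crush-failure-domain=host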
|
@ -843,6 +843,8 @@ This message can be silenced by disabling self-heal behavior (that is, setting
|
|||||||
``mgr/devicehealth/mark_out_threshold``, or by addressing whichever condition
|
``mgr/devicehealth/mark_out_threshold``, or by addressing whichever condition
|
||||||
is preventing data from being migrated off of the ailing OSD(s).
|
is preventing data from being migrated off of the ailing OSD(s).
|
||||||
|
|
||||||
|
.. _rados_health_checks_device_health_toomany:
|
||||||
|
|
||||||
DEVICE_HEALTH_TOOMANY
|
DEVICE_HEALTH_TOOMANY
|
||||||
_____________________
|
_____________________
|
||||||
|
|
||||||
|
@ -117,11 +117,12 @@ pseudo-random placement that takes into account the failure domains that you
|
|||||||
have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
|
have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
|
||||||
immediately adjacent OSDs in a large cluster.
|
immediately adjacent OSDs in a large cluster.
|
||||||
|
|
||||||
Ceph processes a client request using the **Acting Set**, which is the set of
|
Ceph processes client requests with the **Acting Set** of OSDs: this is the set
|
||||||
OSDs that will actually handle the requests since they have a full and working
|
of OSDs that currently have a full and working version of a PG shard and that
|
||||||
version of a placement group shard. The set of OSDs that should contain a shard
|
are therefore responsible for handling requests. By contrast, the **Up Set** is
|
||||||
of a particular placement group as the **Up Set**, i.e. where data is
|
the set of OSDs that contain a shard of a specific PG. Data is moved or copied
|
||||||
moved/copied to (or planned to be).
|
to the **Up Set**, or planned to be moved or copied, to the **Up Set**. See
|
||||||
|
:ref:`Placement Group Concepts <rados_operations_pg_concepts>`.
|
||||||
|
|
||||||
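To see both sets for a particular placement group (the PG ID below is only an
example), query the PG mapping; the output lists the Up Set and the Acting Set
for that PG, along with the osdmap epoch:

.. prompt:: bash $

   ceph pg map 1.7f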
Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
|
Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
|
||||||
service requests for objects in the PG. When this kind of situation
|
service requests for objects in the PG. When this kind of situation
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
.. _rados_operations_pg_concepts:
|
||||||
|
|
||||||
==========================
|
==========================
|
||||||
Placement Group Concepts
|
Placement Group Concepts
|
||||||
==========================
|
==========================
|
||||||
|
@ -7,209 +7,256 @@ Stretch Clusters
|
|||||||
|
|
||||||
Stretch Clusters
|
Stretch Clusters
|
||||||
================
|
================
|
||||||
Ceph generally expects all parts of its network and overall cluster to be
|
|
||||||
equally reliable, with failures randomly distributed across the CRUSH map.
|
|
||||||
So you may lose a switch that knocks out a number of OSDs, but we expect
|
|
||||||
the remaining OSDs and monitors to route around that.
|
|
||||||
|
|
||||||
This is usually a good choice, but may not work well in some
|
A stretch cluster is a cluster that has servers in geographically separated
|
||||||
stretched cluster configurations where a significant part of your cluster
|
data centers, distributed over a WAN. Stretch clusters have LAN-like high-speed
|
||||||
is stuck behind a single network component. For instance, a single
|
and low-latency connections, but limited links. Stretch clusters have a higher
|
||||||
cluster which is located in multiple data centers, and you want to
|
likelihood of (possibly asymmetric) network splits, and a higher likelihood of
|
||||||
sustain the loss of a full DC.
|
temporary or complete loss of an entire data center (which can represent
|
||||||
|
one-third to one-half of the total cluster).
|
||||||
|
|
||||||
There are two standard configurations we've seen deployed, with either
|
Ceph is designed with the expectation that all parts of its network and cluster
|
||||||
two or three data centers (or, in clouds, availability zones). With two
|
will be reliable and that failures will be distributed randomly across the
|
||||||
zones, we expect each site to hold a copy of the data, and for a third
|
CRUSH map. Even if a switch goes down and causes the loss of many OSDs, Ceph is
|
||||||
site to have a tiebreaker monitor (this can be a VM or high-latency compared
|
designed so that the remaining OSDs and monitors will route around such a loss.
|
||||||
to the main sites) to pick a winner if the network connection fails and both
|
|
||||||
DCs remain alive. For three sites, we expect a copy of the data and an equal
|
|
||||||
number of monitors in each site.
|
|
||||||
|
|
||||||
Note that the standard Ceph configuration will survive MANY failures of the
|
Sometimes this cannot be relied upon. If you have a "stretched-cluster"
|
||||||
network or data centers and it will never compromise data consistency. If you
|
deployment in which much of your cluster is behind a single network component,
|
||||||
bring back enough Ceph servers following a failure, it will recover. If you
|
you might need to use **stretch mode** to ensure data integrity.
|
||||||
lose a data center, but can still form a quorum of monitors and have all the data
|
|
||||||
available (with enough copies to satisfy pools' ``min_size``, or CRUSH rules
|
|
||||||
that will re-replicate to meet it), Ceph will maintain availability.
|
|
||||||
|
|
||||||
What can't it handle?
|
We will here consider two standard configurations: a configuration with two
|
||||||
|
data centers (or, in clouds, two availability zones), and a configuration with
|
||||||
|
three data centers (or, in clouds, three availability zones).
|
||||||
|
|
||||||
|
In the two-site configuration, Ceph expects each of the sites to hold a copy of
|
||||||
|
the data, and Ceph also expects there to be a third site that has a tiebreaker
|
||||||
|
monitor. This tiebreaker monitor picks a winner if the network connection fails
|
||||||
|
and both data centers remain alive.
|
||||||
|
|
||||||
|
The tiebreaker monitor can be a VM. It can also have high latency relative to
|
||||||
|
the two main sites.
|
||||||
|
|
||||||
|
The standard Ceph configuration is able to survive MANY network failures or
|
||||||
|
data-center failures without ever compromising data availability. If enough
|
||||||
|
Ceph servers are brought back following a failure, the cluster *will* recover.
|
||||||
|
If you lose a data center but are still able to form a quorum of monitors and
|
||||||
|
still have all the data available, Ceph will maintain availability. (This
|
||||||
|
assumes that the cluster has enough copies to satisfy the pools' ``min_size``
|
||||||
|
configuration option, or (failing that) that the cluster has CRUSH rules in
|
||||||
|
place that will cause the cluster to re-replicate the data until the
|
||||||
|
``min_size`` configuration option has been met.)
|
||||||
|
|
||||||
Stretch Cluster Issues
|
Stretch Cluster Issues
|
||||||
======================
|
======================
|
||||||
No matter what happens, Ceph will not compromise on data integrity
|
|
||||||
and consistency. If there's a failure in your network or a loss of nodes and
|
|
||||||
you can restore service, Ceph will return to normal functionality on its own.
|
|
||||||
|
|
||||||
But there are scenarios where you lose data availibility despite having
|
Ceph does not permit the compromise of data integrity and data consistency
|
||||||
enough servers available to satisfy Ceph's consistency and sizing constraints, or
|
under any circumstances. When service is restored after a network failure or a
|
||||||
where you may be surprised to not satisfy Ceph's constraints.
|
loss of Ceph nodes, Ceph will restore itself to a state of normal functioning
|
||||||
The first important category of these failures resolve around inconsistent
|
without operator intervention.
|
||||||
networks -- if there's a netsplit, Ceph may be unable to mark OSDs down and kick
|
|
||||||
them out of the acting PG sets despite the primary being unable to replicate data.
|
Ceph does not permit the compromise of data integrity or data consistency, but
|
||||||
If this happens, IO will not be permitted, because Ceph can't satisfy its durability
|
there are situations in which *data availability* is compromised. These
|
||||||
guarantees.
|
situations can occur even though there are enough servers available to satisfy
|
||||||
|
Ceph's consistency and sizing constraints. In some situations, you might
|
||||||
|
discover that your cluster does not satisfy those constraints.
|
||||||
|
|
||||||
|
The first category of these failures that we will discuss involves inconsistent
|
||||||
|
networks -- if there is a netsplit (a disconnection between two servers that
|
||||||
|
splits the network into two pieces), Ceph might be unable to mark OSDs ``down``
|
||||||
|
and remove them from the acting PG sets. This failure to mark OSDs ``down``
|
||||||
|
will occur, despite the fact that the primary PG is unable to replicate data (a
|
||||||
|
situation that, under normal non-netsplit circumstances, would result in the
|
||||||
|
marking of affected OSDs as ``down`` and their removal from the PG). If this
|
||||||
|
happens, Ceph will be unable to satisfy its durability guarantees and
|
||||||
|
consequently IO will not be permitted.
|
||||||
|
|
||||||
|
The second category of failures that we will discuss involves the situation in
|
||||||
|
which the constraints are not sufficient to guarantee the replication of data
|
||||||
|
across data centers, though it might seem that the data is correctly replicated
|
||||||
|
across data centers. For example, in a scenario in which there are two data
|
||||||
|
centers named Data Center A and Data Center B, and the CRUSH rule targets three
|
||||||
|
replicas and places a replica in each data center with a ``min_size`` of ``2``,
|
||||||
|
the PG might go active with two replicas in Data Center A and zero replicas in
|
||||||
|
Data Center B. In a situation of this kind, the loss of Data Center A means
|
||||||
|
that the data is lost and Ceph will not be able to operate on it. This
|
||||||
|
situation is surprisingly difficult to avoid using only standard CRUSH rules.
|
||||||
|
|
||||||
The second important category of failures is when you think you have data replicated
|
|
||||||
across data centers, but the constraints aren't sufficient to guarantee this.
|
|
||||||
For instance, you might have data centers A and B, and your CRUSH rule targets 3 copies
|
|
||||||
and places a copy in each data center with a ``min_size`` of 2. The PG may go active with
|
|
||||||
2 copies in site A and no copies in site B, which means that if you then lose site A you
|
|
||||||
have lost data and Ceph can't operate on it. This situation is surprisingly difficult
|
|
||||||
to avoid with standard CRUSH rules.
|
|
||||||
|
|
||||||
Stretch Mode
|
Stretch Mode
|
||||||
============
|
============
|
||||||
The new stretch mode is designed to handle the 2-site case. Three sites are
|
Stretch mode is designed to handle deployments in which you cannot guarantee the
|
||||||
just as susceptible to netsplit issues, but are much more tolerant of
|
replication of data across two data centers. This kind of situation can arise
|
||||||
component availability outages than 2-site clusters are.
|
when the cluster's CRUSH rule specifies that three copies are to be made, but
|
||||||
|
then a copy is placed in each data center with a ``min_size`` of 2. Under such
|
||||||
|
conditions, a placement group can become active with two copies in the first
|
||||||
|
data center and no copies in the second data center.
|
||||||
|
|
||||||
To enter stretch mode, you must set the location of each monitor, matching
|
|
||||||
your CRUSH map. For instance, to place ``mon.a`` in your first data center:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
Entering Stretch Mode
|
||||||
|
---------------------
|
||||||
|
|
||||||
ceph mon set_location a datacenter=site1
|
To enable stretch mode, you must set the location of each monitor, matching
|
||||||
|
your CRUSH map. This procedure shows how to do this.
|
||||||
|
|
||||||
Next, generate a CRUSH rule which will place 2 copies in each data center. This
|
|
||||||
will require editing the CRUSH map directly:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
#. Place ``mon.a`` in your first data center:
|
||||||
|
|
||||||
ceph osd getcrushmap > crush.map.bin
|
.. prompt:: bash $
|
||||||
crushtool -d crush.map.bin -o crush.map.txt
|
|
||||||
|
|
||||||
Now edit the ``crush.map.txt`` file to add a new rule. Here
|
ceph mon set_location a datacenter=site1
|
||||||
there is only one other rule, so this is ID 1, but you may need
|
|
||||||
to use a different rule ID. We also have two datacenter buckets
|
|
||||||
named ``site1`` and ``site2``::
|
|
||||||
|
|
||||||
rule stretch_rule {
|
#. Generate a CRUSH rule that places two copies in each data center.
|
||||||
id 1
|
This requires editing the CRUSH map directly:
|
||||||
type replicated
|
|
||||||
min_size 1
|
|
||||||
max_size 10
|
|
||||||
step take site1
|
|
||||||
step chooseleaf firstn 2 type host
|
|
||||||
step emit
|
|
||||||
step take site2
|
|
||||||
step chooseleaf firstn 2 type host
|
|
||||||
step emit
|
|
||||||
}
|
|
||||||
|
|
||||||
Finally, inject the CRUSH map to make the rule available to the cluster:
|
.. prompt:: bash $
|
||||||
|
|
||||||
.. prompt:: bash $
|
ceph osd getcrushmap > crush.map.bin
|
||||||
|
crushtool -d crush.map.bin -o crush.map.txt
|
||||||
|
|
||||||
crushtool -c crush.map.txt -o crush2.map.bin
|
#. Edit the ``crush.map.txt`` file to add a new rule. Here there is only one
|
||||||
ceph osd setcrushmap -i crush2.map.bin
|
other rule (``id 1``), but you might need to use a different rule ID. We
|
||||||
|
have two data-center buckets named ``site1`` and ``site2``:
|
||||||
|
|
||||||
If you aren't already running your monitors in connectivity mode, do so with
|
::
|
||||||
the instructions in `Changing Monitor Elections`_.
|
|
||||||
|
rule stretch_rule {
|
||||||
|
id 1
|
||||||
|
min_size 1
|
||||||
|
max_size 10
|
||||||
|
type replicated
|
||||||
|
step take site1
|
||||||
|
step chooseleaf firstn 2 type host
|
||||||
|
step emit
|
||||||
|
step take site2
|
||||||
|
step chooseleaf firstn 2 type host
|
||||||
|
step emit
|
||||||
|
}
|
||||||
|
|
||||||
|
#. Inject the CRUSH map to make the rule available to the cluster:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
crushtool -c crush.map.txt -o crush2.map.bin
|
||||||
|
ceph osd setcrushmap -i crush2.map.bin
|
||||||
|
|
||||||
|
#. Run the monitors in connectivity mode, as sketched after this procedure. See `Changing Monitor Elections`_.
|
||||||
|
|
||||||
|
#. Command the cluster to enter stretch mode. In this example, ``mon.e`` is the
|
||||||
|
tiebreaker monitor and we are splitting across data centers. The tiebreaker
|
||||||
|
monitor must be assigned a data center that is neither ``site1`` nor
|
||||||
|
``site2``. For this purpose you can create another data-center bucket named
|
||||||
|
``site3`` in your CRUSH and place ``mon.e`` there:
|
||||||
|
|
||||||
|
.. prompt:: bash $
|
||||||
|
|
||||||
|
ceph mon set_location e datacenter=site3
|
||||||
|
ceph mon enable_stretch_mode e stretch_rule datacenter
|
||||||
|
|
||||||
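The connectivity-mode step of the procedure above can be carried out with the
standard monitor election command; ``ceph mon dump`` is shown only as one
convenient way to verify the assigned monitor locations afterwards:

.. prompt:: bash $

   ceph mon set election_strategy connectivity
   ceph mon dump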
|
When stretch mode is enabled, PGs will become active only when they peer
|
||||||
|
across data centers (or across whichever CRUSH bucket type was specified),
|
||||||
|
assuming both are alive. Pools will increase in size from the default ``3`` to
|
||||||
|
``4``, and two copies will be expected in each site. OSDs will be allowed to
|
||||||
|
connect to monitors only if they are in the same data center as the monitors.
|
||||||
|
New monitors will not be allowed to join the cluster if they do not specify a
|
||||||
|
location.
|
||||||
|
|
||||||
|
If all OSDs and monitors in one of the data centers become inaccessible at once,
|
||||||
|
the surviving data center enters a "degraded stretch mode". A warning will be
|
||||||
|
issued, the ``min_size`` will be reduced to ``1``, and the cluster will be
|
||||||
|
allowed to go active with the data in the single remaining site. The pool size
|
||||||
|
does not change, so warnings will be generated that report that the pools are
|
||||||
|
too small -- but a special stretch mode flag will prevent the OSDs from
|
||||||
|
creating extra copies in the remaining data center. This means that the data
|
||||||
|
center will keep only two copies, just as before.
|
||||||
|
|
||||||
|
When the missing data center comes back, the cluster will enter a "recovery
|
||||||
|
stretch mode". This changes the warning and allows peering, but requires OSDs
|
||||||
|
only from the data center that was ``up`` throughout the duration of the
|
||||||
|
downtime. When all PGs are in a known state, and are neither degraded nor
|
||||||
|
incomplete, the cluster transitions back to regular stretch mode, ends the
|
||||||
|
warning, restores ``min_size`` to its original value (``2``), requires both
|
||||||
|
sites to peer, and no longer requires the site that was up throughout the
|
||||||
|
duration of the downtime when peering (which makes failover to the other site
|
||||||
|
possible, if needed).
|
||||||
|
|
||||||
.. _Changing Monitor elections: ../change-mon-elections
|
.. _Changing Monitor elections: ../change-mon-elections
|
||||||
|
|
||||||
And lastly, tell the cluster to enter stretch mode. Here, ``mon.e`` is the
|
Limitations of Stretch Mode
|
||||||
tiebreaker and we are splitting across data centers. ``mon.e`` should be also
|
===========================
|
||||||
set a datacenter, that will differ from ``site1`` and ``site2``. For this
|
When using stretch mode, OSDs must be located at exactly two sites.
|
||||||
purpose you can create another datacenter bucket named ```site3`` in your
|
|
||||||
CRUSH and place ``mon.e`` there:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
Two monitors should be run in each data center, plus a tiebreaker in a third
|
||||||
|
(or in the cloud) for a total of five monitors. While in stretch mode, OSDs
|
||||||
|
will connect only to monitors within the data center in which they are located.
|
||||||
|
OSDs *DO NOT* connect to the tiebreaker monitor.
|
||||||
|
|
||||||
ceph mon set_location e datacenter=site3
|
Erasure-coded pools cannot be used with stretch mode. Attempts to use erasure
|
||||||
ceph mon enable_stretch_mode e stretch_rule datacenter
|
coded pools with stretch mode will fail. Erasure coded pools cannot be created
|
||||||
|
while in stretch mode.
|
||||||
|
|
||||||
When stretch mode is enabled, the OSDs wlll only take PGs active when
|
To use stretch mode, you will need to create a CRUSH rule that provides two
|
||||||
they peer across data centers (or whatever other CRUSH bucket type
|
replicas in each data center. Ensure that there are four total replicas: two in
|
||||||
you specified), assuming both are alive. Pools will increase in size
|
each data center. If pools exist in the cluster that do not have the default
|
||||||
from the default 3 to 4, expecting 2 copies in each site. OSDs will only
|
``size`` or ``min_size``, Ceph will not enter stretch mode. An example of such
|
||||||
be allowed to connect to monitors in the same data center. New monitors
|
a CRUSH rule is given above.
|
||||||
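As a quick sanity check before enabling stretch mode (this is only one way to
inspect the values), the ``size`` and ``min_size`` of every pool can be listed
as follows:

.. prompt:: bash $

   ceph osd pool ls detail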
will not be allowed to join the cluster if they do not specify a location.
|
|
||||||
|
|
||||||
If all the OSDs and monitors from a data center become inaccessible
|
Because stretch mode runs with ``min_size`` set to ``1`` (or, more directly,
|
||||||
at once, the surviving data center will enter a degraded stretch mode. This
|
``min_size 1``), we recommend enabling stretch mode only when using OSDs on
|
||||||
will issue a warning, reduce the min_size to 1, and allow
|
SSDs (including NVMe OSDs). Hybrid HDD+SSD or HDD-only OSDs are not recommended
|
||||||
the cluster to go active with data in the single remaining site. Note that
|
due to the long time it takes for them to recover after connectivity between
|
||||||
we do not change the pool size, so you will also get warnings that the
|
data centers has been restored. This reduces the potential for data loss.
|
||||||
pools are too small -- but a special stretch mode flag will prevent the OSDs
|
|
||||||
from creating extra copies in the remaining data center (so it will only keep
|
|
||||||
2 copies, as before).
|
|
||||||
|
|
||||||
When the missing data center comes back, the cluster will enter
|
In the future, stretch mode might support erasure-coded pools and might support
|
||||||
recovery stretch mode. This changes the warning and allows peering, but
|
deployments that have more than two data centers.
|
||||||
still only requires OSDs from the data center which was up the whole time.
|
|
||||||
When all PGs are in a known state, and are neither degraded nor incomplete,
|
|
||||||
the cluster transitions back to regular stretch mode, ends the warning,
|
|
||||||
restores min_size to its starting value (2) and requires both sites to peer,
|
|
||||||
and stops requiring the always-alive site when peering (so that you can fail
|
|
||||||
over to the other site, if necessary).
|
|
||||||
|
|
||||||
|
|
||||||
Stretch Mode Limitations
|
|
||||||
========================
|
|
||||||
As implied by the setup, stretch mode only handles 2 sites with OSDs.
|
|
||||||
|
|
||||||
While it is not enforced, you should run 2 monitors in each site plus
|
|
||||||
a tiebreaker, for a total of 5. This is because OSDs can only connect
|
|
||||||
to monitors in their own site when in stretch mode.
|
|
||||||
|
|
||||||
You cannot use erasure coded pools with stretch mode. If you try, it will
|
|
||||||
refuse, and it will not allow you to create EC pools once in stretch mode.
|
|
||||||
|
|
||||||
You must create your own CRUSH rule which provides 2 copies in each site, and
|
|
||||||
you must use 4 total copies with 2 in each site. If you have existing pools
|
|
||||||
with non-default size/min_size, Ceph will object when you attempt to
|
|
||||||
enable stretch mode.
|
|
||||||
|
|
||||||
Because it runs with ``min_size 1`` when degraded, you should only use stretch
|
|
||||||
mode with all-flash OSDs. This minimizes the time needed to recover once
|
|
||||||
connectivity is restored, and thus minimizes the potential for data loss.
|
|
||||||
|
|
||||||
Hopefully, future development will extend this feature to support EC pools and
|
|
||||||
running with more than 2 full sites.
|
|
||||||
|
|
||||||
Other commands
|
Other commands
|
||||||
==============
|
==============
|
||||||
If your tiebreaker monitor fails for some reason, you can replace it. Turn on
|
|
||||||
a new monitor and run:
|
Replacing a failed tiebreaker monitor
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
Turn on a new monitor and run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph mon set_new_tiebreaker mon.<new_mon_name>
|
ceph mon set_new_tiebreaker mon.<new_mon_name>
|
||||||
|
|
||||||
This command will protest if the new monitor is in the same location as existing
|
This command protests if the new monitor is in the same location as the
|
||||||
non-tiebreaker monitors. This command WILL NOT remove the previous tiebreaker
|
existing non-tiebreaker monitors. **This command WILL NOT remove the previous
|
||||||
monitor; you should do so yourself.
|
tiebreaker monitor.** Remove the previous tiebreaker monitor yourself.
|
||||||
|
|
||||||
Also in 16.2.7, if you are writing your own tooling for deploying Ceph, you can use a new
|
Using "--set-crush-location" and not "ceph mon set_location"
|
||||||
``--set-crush-location`` option when booting monitors, instead of running
|
------------------------------------------------------------
|
||||||
``ceph mon set_location``. This option accepts only a single "bucket=loc" pair, eg
|
|
||||||
``ceph-mon --set-crush-location 'datacenter=a'``, which must match the
|
|
||||||
bucket type you specified when running ``enable_stretch_mode``.
|
|
||||||
|
|
||||||
|
If you write your own tooling for deploying Ceph, use the
|
||||||
|
``--set-crush-location`` option when booting monitors instead of running ``ceph
|
||||||
|
mon set_location``. This option accepts only a single ``bucket=loc`` pair (for
|
||||||
|
example, ``ceph-mon --set-crush-location 'datacenter=a'``), and that pair must
|
||||||
|
match the bucket type that was specified when running ``enable_stretch_mode``.
|
||||||
|
|
||||||
When in stretch degraded mode, the cluster will go into "recovery" mode automatically
|
Forcing recovery stretch mode
|
||||||
when the disconnected data center comes back. If that doesn't work, or you want to
|
-----------------------------
|
||||||
enable recovery mode early, you can invoke:
|
|
||||||
|
When in stretch degraded mode, the cluster will go into "recovery" mode
|
||||||
|
automatically when the disconnected data center comes back. If that does not
|
||||||
|
happen or you want to enable recovery mode early, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph osd force_recovery_stretch_mode --yes-i-really-mean-it
|
ceph osd force_recovery_stretch_mode --yes-i-really-mean-it
|
||||||
|
|
||||||
But this command should not be necessary; it is included to deal with
|
Forcing normal stretch mode
|
||||||
unanticipated situations.
|
---------------------------
|
||||||
|
|
||||||
When in recovery mode, the cluster should go back into normal stretch mode
|
When in recovery mode, the cluster should go back into normal stretch mode when
|
||||||
when the PGs are healthy. If this doesn't happen, or you want to force the
|
the PGs are healthy. If this fails to happen or if you want to force the
|
||||||
cross-data-center peering early and are willing to risk data downtime (or have
|
cross-data-center peering early and are willing to risk data downtime (or have
|
||||||
verified separately that all the PGs can peer, even if they aren't fully
|
verified separately that all the PGs can peer, even if they aren't fully
|
||||||
recovered), you can invoke:
|
recovered), run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph osd force_healthy_stretch_mode --yes-i-really-mean-it
|
ceph osd force_healthy_stretch_mode --yes-i-really-mean-it
|
||||||
|
|
||||||
This command should not be necessary; it is included to deal with
|
This command can be used to remove the ``HEALTH_WARN`` state, which recovery
|
||||||
unanticipated situations. But you might wish to invoke it to remove
|
mode generates.
|
||||||
the ``HEALTH_WARN`` state which recovery mode generates.
|
|
||||||
|
@ -337,45 +337,53 @@ Pool
|
|||||||
|
|
||||||
A pool is a logical partition where users store data.
|
A pool is a logical partition where users store data.
|
||||||
In Ceph deployments, it is common to create a pool as a logical partition for
|
In Ceph deployments, it is common to create a pool as a logical partition for
|
||||||
similar types of data. For example, when deploying Ceph as a backend for
|
similar types of data. For example, when deploying Ceph as a back end for
|
||||||
OpenStack, a typical deployment would have pools for volumes, images, backups
|
OpenStack, a typical deployment would have pools for volumes, images, backups
|
||||||
and virtual machines, and users such as ``client.glance``, ``client.cinder``,
|
and virtual machines, and such users as ``client.glance`` and ``client.cinder``.
|
||||||
etc.
|
|
||||||
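As a sketch of one such step (the pool and application names are only examples,
and additional arguments such as a PG count may be needed depending on the
release and autoscaler settings), a pool can be created and tagged with its
application as follows:

.. prompt:: bash $

   ceph osd pool create volumes
   ceph osd pool application enable volumes rbd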
|
|
||||||
Application Tags
|
Application Tags
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
Access may be restricted to specific pools as defined by their application
|
Access may be restricted to specific pools as defined by their application
|
||||||
metadata. The ``*`` wildcard may be used for the ``key`` argument, the
|
metadata. The ``*`` wildcard may be used for the ``key`` argument, the
|
||||||
``value`` argument, or both. ``all`` is a synony for ``*``.
|
``value`` argument, or both. The ``all`` tag is a synonym for ``*``.
|
||||||
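For example (the client name, application, and tag key/value below are
illustrative), a capability restricted by application tag might be granted as
follows:

.. prompt:: bash $

   ceph auth get-or-create client.fsclient mon 'allow r' osd 'allow rw tag cephfs data=cephfs_a'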
|
|
||||||
Namespace
|
Namespace
|
||||||
---------
|
---------
|
||||||
|
|
||||||
Objects within a pool can be associated to a namespace--a logical group of
|
Objects within a pool can be associated to a namespace: that is, to a logical group of
|
||||||
objects within the pool. A user's access to a pool can be associated with a
|
objects within the pool. A user's access to a pool can be associated with a
|
||||||
namespace such that reads and writes by the user take place only within the
|
namespace so that reads and writes by the user can take place only within the
|
||||||
namespace. Objects written to a namespace within the pool can only be accessed
|
namespace. Objects written to a namespace within the pool can be accessed only
|
||||||
by users who have access to the namespace.
|
by users who have access to the namespace.
|
||||||
|
|
||||||
.. note:: Namespaces are primarily useful for applications written on top of
|
.. note:: Namespaces are primarily useful for applications written on top of
|
||||||
``librados`` where the logical grouping can alleviate the need to create
|
``librados``. In such situations, the logical grouping provided by
|
||||||
different pools. Ceph Object Gateway (in releases beginning with
|
namespaces can obviate the need to create different pools. In Luminous and
|
||||||
Luminous) uses namespaces for various
|
later releases, Ceph Object Gateway uses namespaces for various metadata
|
||||||
metadata objects.
|
objects.
|
||||||
|
|
||||||
The rationale for namespaces is that pools can be a computationally expensive
|
The rationale for namespaces is this: namespaces are relatively less
|
||||||
method of segregating data sets for the purposes of authorizing separate sets
|
computationally expensive than pools, which can be a computationally
|
||||||
of users. For example, a pool should have ~100 placement groups per OSD. So an
|
expensive method of segregating data sets between different authorized users.
|
||||||
exemplary cluster with 1000 OSDs would have 100,000 placement groups for one
|
|
||||||
pool. Each pool would create another 100,000 placement groups in the exemplary
|
|
||||||
cluster. By contrast, writing an object to a namespace simply associates the
|
|
||||||
namespace to the object name with out the computational overhead of a separate
|
|
||||||
pool. Rather than creating a separate pool for a user or set of users, you may
|
|
||||||
use a namespace. **Note:** Only available using ``librados`` at this time.
|
|
||||||
|
|
||||||
Access may be restricted to specific RADOS namespaces using the ``namespace``
|
For example, a pool ought to host approximately 100 placement-group replicas
|
||||||
capability. Limited globbing of namespaces is supported; if the last character
|
per OSD. This means that a cluster with 1000 OSDs and three 3R replicated pools
|
||||||
|
would have (in a single pool) 100,000 placement-group replicas, and that means
|
||||||
|
that it has 33,333 Placement Groups.
|
||||||
|
|
||||||
|
By contrast, writing an object to a namespace simply associates the namespace
|
||||||
|
to the object name without incurring the computational overhead of a separate
|
||||||
|
pool. Instead of creating a separate pool for a user or set of users, you can
|
||||||
|
use a namespace.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Namespaces are available only when using ``librados``.
|
||||||
|
|
||||||
|
|
||||||
|
Access may be restricted to specific RADOS namespaces by use of the ``namespace``
|
||||||
|
capability. Limited globbing of namespaces (that is, use of wildcards (``*``)) is supported: if the last character
|
||||||
of the specified namespace is ``*``, then access is granted to any namespace
|
of the specified namespace is ``*``, then access is granted to any namespace
|
||||||
starting with the provided argument.
|
starting with the provided argument.
|
||||||
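For example (the client, pool, and namespace names here are illustrative), a
client can be confined to a single namespace within a pool as follows:

.. prompt:: bash $

   ceph auth get-or-create client.myapp mon 'allow r' osd 'allow rw pool=mypool namespace=myns'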
|
|
||||||
@ -383,64 +391,60 @@ Managing Users
|
|||||||
==============
|
==============
|
||||||
|
|
||||||
User management functionality provides Ceph Storage Cluster administrators with
|
User management functionality provides Ceph Storage Cluster administrators with
|
||||||
the ability to create, update and delete users directly in the Ceph Storage
|
the ability to create, update, and delete users directly in the Ceph Storage
|
||||||
Cluster.
|
Cluster.
|
||||||
|
|
||||||
When you create or delete users in the Ceph Storage Cluster, you may need to
|
When you create or delete users in the Ceph Storage Cluster, you might need to
|
||||||
distribute keys to clients so that they can be added to keyrings. See `Keyring
|
distribute keys to clients so that they can be added to keyrings. For details, see `Keyring
|
||||||
Management`_ for details.
|
Management`_.
|
||||||
|
|
||||||
List Users
|
Listing Users
|
||||||
----------
|
-------------
|
||||||
|
|
||||||
To list the users in your cluster, execute the following:
|
To list the users in your cluster, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph auth ls
|
ceph auth ls
|
||||||
|
|
||||||
Ceph will list out all users in your cluster. For example, in a two-node
|
Ceph will list all users in your cluster. For example, in a two-node
|
||||||
exemplary cluster, ``ceph auth ls`` will output something that looks like
|
cluster, ``ceph auth ls`` will provide an output that resembles the following::
|
||||||
this::
|
|
||||||
|
|
||||||
installed auth entries:
|
installed auth entries:
|
||||||
|
|
||||||
osd.0
|
osd.0
|
||||||
key: AQCvCbtToC6MDhAATtuT70Sl+DymPCfDSsyV4w==
|
key: AQCvCbtToC6MDhAATtuT70Sl+DymPCfDSsyV4w==
|
||||||
caps: [mon] allow profile osd
|
caps: [mon] allow profile osd
|
||||||
caps: [osd] allow *
|
caps: [osd] allow *
|
||||||
osd.1
|
osd.1
|
||||||
key: AQC4CbtTCFJBChAAVq5spj0ff4eHZICxIOVZeA==
|
key: AQC4CbtTCFJBChAAVq5spj0ff4eHZICxIOVZeA==
|
||||||
caps: [mon] allow profile osd
|
caps: [mon] allow profile osd
|
||||||
caps: [osd] allow *
|
caps: [osd] allow *
|
||||||
client.admin
|
client.admin
|
||||||
key: AQBHCbtT6APDHhAA5W00cBchwkQjh3dkKsyPjw==
|
key: AQBHCbtT6APDHhAA5W00cBchwkQjh3dkKsyPjw==
|
||||||
caps: [mds] allow
|
caps: [mds] allow
|
||||||
caps: [mon] allow *
|
caps: [mon] allow *
|
||||||
caps: [osd] allow *
|
caps: [osd] allow *
|
||||||
client.bootstrap-mds
|
client.bootstrap-mds
|
||||||
key: AQBICbtTOK9uGBAAdbe5zcIGHZL3T/u2g6EBww==
|
key: AQBICbtTOK9uGBAAdbe5zcIGHZL3T/u2g6EBww==
|
||||||
caps: [mon] allow profile bootstrap-mds
|
caps: [mon] allow profile bootstrap-mds
|
||||||
client.bootstrap-osd
|
client.bootstrap-osd
|
||||||
key: AQBHCbtT4GxqORAADE5u7RkpCN/oo4e5W0uBtw==
|
key: AQBHCbtT4GxqORAADE5u7RkpCN/oo4e5W0uBtw==
|
||||||
caps: [mon] allow profile bootstrap-osd
|
caps: [mon] allow profile bootstrap-osd
|
||||||
|
|
||||||
|
Note that, according to the ``TYPE.ID`` notation for users, ``osd.0`` is a
|
||||||
Note that the ``TYPE.ID`` notation for users applies such that ``osd.0`` is a
|
user of type ``osd`` and an ID of ``0``, and ``client.admin`` is a user of type
|
||||||
user of type ``osd`` and its ID is ``0``, ``client.admin`` is a user of type
|
``client`` and an ID of ``admin`` (that is, the default ``client.admin`` user).
|
||||||
``client`` and its ID is ``admin`` (i.e., the default ``client.admin`` user).
|
Note too that each entry has a ``key: <value>`` entry, and also has one or more
|
||||||
Note also that each entry has a ``key: <value>`` entry, and one or more
|
|
||||||
``caps:`` entries.
|
``caps:`` entries.
|
||||||
|
|
||||||
You may use the ``-o {filename}`` option with ``ceph auth ls`` to
|
To save the output of ``ceph auth ls`` to a file, use the ``-o {filename}`` option.
|
||||||
save the output to a file.
|
|
||||||
|
|
||||||
|
|
||||||
Get a User
|
Getting a User
|
||||||
----------
|
--------------
|
||||||
|
|
||||||
To retrieve a specific user, key and capabilities, execute the
|
To retrieve a specific user, key, and capabilities, run the following command:
|
||||||
following:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -452,8 +456,7 @@ For example:
|
|||||||
|
|
||||||
ceph auth get client.admin
|
ceph auth get client.admin
|
||||||
|
|
||||||
You may also use the ``-o {filename}`` option with ``ceph auth get`` to
|
To save the output of ``ceph auth get`` to a file, use the ``-o {filename}`` option. Developers may also run the following command:
|
||||||
save the output to a file. Developers may also execute the following:
|
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -461,42 +464,49 @@ save the output to a file. Developers may also execute the following:
|
|||||||
|
|
||||||
The ``auth export`` command is identical to ``auth get``.
|
The ``auth export`` command is identical to ``auth get``.
|
||||||
|
|
||||||
Add a User
|
.. _rados_ops_adding_a_user:
|
||||||
----------
|
|
||||||
|
|
||||||
Adding a user creates a username (i.e., ``TYPE.ID``), a secret key and
|
Adding a User
|
||||||
any capabilities included in the command you use to create the user.
|
-------------
|
||||||
|
|
||||||
A user's key enables the user to authenticate with the Ceph Storage Cluster.
|
Adding a user creates a user name (that is, ``TYPE.ID``), a secret key, and
|
||||||
|
any capabilities specified in the command that creates the user.
|
||||||
|
|
||||||
|
A user's key allows the user to authenticate with the Ceph Storage Cluster.
|
||||||
The user's capabilities authorize the user to read, write, or execute on Ceph
|
The user's capabilities authorize the user to read, write, or execute on Ceph
|
||||||
monitors (``mon``), Ceph OSDs (``osd``) or Ceph Metadata Servers (``mds``).
|
monitors (``mon``), Ceph OSDs (``osd``) or Ceph Metadata Servers (``mds``).
|
||||||
|
|
||||||
There are a few ways to add a user:
|
There are a few ways to add a user:
|
||||||
|
|
||||||
- ``ceph auth add``: This command is the canonical way to add a user. It
|
- ``ceph auth add``: This command is the canonical way to add a user. It
|
||||||
will create the user, generate a key and add any specified capabilities.
|
will create the user, generate a key, and add any specified capabilities.
|
||||||
|
|
||||||
- ``ceph auth get-or-create``: This command is often the most convenient way
|
- ``ceph auth get-or-create``: This command is often the most convenient way
|
||||||
to create a user, because it returns a keyfile format with the user name
|
to create a user, because it returns a keyfile format with the user name
|
||||||
(in brackets) and the key. If the user already exists, this command
|
(in brackets) and the key. If the user already exists, this command
|
||||||
simply returns the user name and key in the keyfile format. You may use the
|
simply returns the user name and key in the keyfile format. To save the output to
|
||||||
``-o {filename}`` option to save the output to a file.
|
a file, use the ``-o {filename}`` option.
|
||||||
|
|
||||||
- ``ceph auth get-or-create-key``: This command is a convenient way to create
|
- ``ceph auth get-or-create-key``: This command is a convenient way to create
|
||||||
a user and return the user's key (only). This is useful for clients that
|
a user and return the user's key and nothing else. This is useful for clients that
|
||||||
need the key only (e.g., libvirt). If the user already exists, this command
|
need only the key (for example, libvirt). If the user already exists, this command
|
||||||
simply returns the key. You may use the ``-o {filename}`` option to save the
|
simply returns the key. To save the output to
|
||||||
output to a file.
|
a file, use the ``-o {filename}`` option.
|
||||||
|
|
||||||
When creating client users, you may create a user with no capabilities. A user
|
It is possible, when creating client users, to create a user with no capabilities. A user
|
||||||
with no capabilities is useless beyond mere authentication, because the client
|
with no capabilities is useless beyond mere authentication, because the client
|
||||||
cannot retrieve the cluster map from the monitor. However, you can create a
|
cannot retrieve the cluster map from the monitor. However, you might want to create a user
|
||||||
user with no capabilities if you wish to defer adding capabilities later using
|
with no capabilities and wait until later to add capabilities to the user by using the ``ceph auth caps`` command.
|
||||||
the ``ceph auth caps`` command.
|
|
||||||
|
|
||||||
A typical user has at least read capabilities on the Ceph monitor and
|
A typical user has at least read capabilities on the Ceph monitor and
|
||||||
read and write capability on Ceph OSDs. Additionally, a user's OSD permissions
|
read and write capabilities on Ceph OSDs. A user's OSD permissions
|
||||||
are often restricted to accessing a particular pool:
|
are often restricted so that the user can access only one particular pool.
|
||||||
|
In the following example, the commands (1) add a client named ``john`` that has read capabilities on the Ceph monitor
|
||||||
|
and read and write capabilities on the pool named ``liverpool``, (2) authorize a client named ``paul`` to have read capabilities on the Ceph monitor and
|
||||||
|
read and write capabilities on the pool named ``liverpool``, (3) authorize a client named ``george`` to have read capabilities on the Ceph monitor and
|
||||||
|
read and write capabilities on the pool named ``liverpool`` and use the keyring named ``george.keyring`` to make this authorization, and (4) authorize
|
||||||
|
a client named ``ringo`` to have read capabilities on the Ceph monitor and read and write capabilities on the pool named ``liverpool`` and use the key
|
||||||
|
named ``ringo.key`` to make this authorization:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -505,21 +515,19 @@ are often restricted to accessing a particular pool:
|
|||||||
ceph auth get-or-create client.george mon 'allow r' osd 'allow rw pool=liverpool' -o george.keyring
|
ceph auth get-or-create client.george mon 'allow r' osd 'allow rw pool=liverpool' -o george.keyring
|
||||||
ceph auth get-or-create-key client.ringo mon 'allow r' osd 'allow rw pool=liverpool' -o ringo.key
|
ceph auth get-or-create-key client.ringo mon 'allow r' osd 'allow rw pool=liverpool' -o ringo.key
|
||||||
|
|
||||||
|
.. important:: Any user that has capabilities on OSDs will have access to ALL pools in the cluster
|
||||||
.. important:: If you provide a user with capabilities to OSDs, but you DO NOT
|
unless that user's access has been restricted to a proper subset of the pools in the cluster.
|
||||||
restrict access to particular pools, the user will have access to ALL
|
|
||||||
pools in the cluster!
|
|
||||||
|
|
||||||
|
|
||||||
.. _modify-user-capabilities:
|
.. _modify-user-capabilities:
|
||||||
|
|
||||||
Modify User Capabilities
|
Modifying User Capabilities
|
||||||
------------------------
|
---------------------------
|
||||||
|
|
||||||
The ``ceph auth caps`` command allows you to specify a user and change the
|
The ``ceph auth caps`` command allows you to specify a user and change that
|
||||||
user's capabilities. Setting new capabilities will overwrite current capabilities.
|
user's capabilities. Setting new capabilities will overwrite current capabilities.
|
||||||
To view current capabilities run ``ceph auth get USERTYPE.USERID``. To add
|
To view current capabilities, run ``ceph auth get USERTYPE.USERID``.
|
||||||
capabilities, you should also specify the existing capabilities when using the form:
|
To add capabilities, run a command of the following form (and be sure to specify the existing capabilities):
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -534,10 +542,10 @@ For example:
|
|||||||
ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
|
ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
|
||||||
ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
|
ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
|
||||||
|
|
||||||
See `Authorization (Capabilities)`_ for additional details on capabilities.
|
For additional details on capabilities, see `Authorization (Capabilities)`_.
|
||||||
|
|
||||||
Delete a User
|
Deleting a User
|
||||||
-------------
|
---------------
|
||||||
|
|
||||||
To delete a user, use ``ceph auth del``:
|
To delete a user, use ``ceph auth del``:
|
||||||
|
|
||||||
@ -545,34 +553,34 @@ To delete a user, use ``ceph auth del``:
|
|||||||
|
|
||||||
ceph auth del {TYPE}.{ID}
|
ceph auth del {TYPE}.{ID}
|
||||||
|
|
||||||
Where ``{TYPE}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
|
Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
|
||||||
and ``{ID}`` is the user name or ID of the daemon.
|
and ``{ID}`` is the user name or the ID of the daemon.
|
||||||
|
|
||||||
|
|
||||||
Print a User's Key
|
Printing a User's Key
|
||||||
------------------
|
---------------------
|
||||||
|
|
||||||
To print a user's authentication key to standard output, execute the following:
|
To print a user's authentication key to standard output, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
ceph auth print-key {TYPE}.{ID}
|
ceph auth print-key {TYPE}.{ID}
|
||||||
|
|
||||||
Where ``{TYPE}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
|
Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
|
||||||
and ``{ID}`` is the user name or ID of the daemon.
|
and ``{ID}`` is the user name or the ID of the daemon.
|
||||||
|
|
||||||
Printing a user's key is useful when you need to populate client
|
When it is necessary to populate client software with a user's key (as in the case of libvirt),
|
||||||
software with a user's key (e.g., libvirt):
|
you can print the user's key by running the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
mount -t ceph serverhost:/ mountpoint -o name=client.user,secret=`ceph auth print-key client.user`
|
mount -t ceph serverhost:/ mountpoint -o name=client.user,secret=`ceph auth print-key client.user`
|
||||||
|
|
||||||
Import a User(s)
|
Importing a User
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
To import one or more users, use ``ceph auth import`` and
|
To import one or more users, use ``ceph auth import`` and
|
||||||
specify a keyring:
|
specify a keyring as follows:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -584,47 +592,49 @@ For example:
|
|||||||
|
|
||||||
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
||||||
|
|
||||||
|
.. note:: The Ceph storage cluster will add new users, their keys, and their
|
||||||
.. note:: The Ceph storage cluster will add new users, their keys and their
|
capabilities and will update existing users, their keys, and their
|
||||||
capabilities and will update existing users, their keys and their
|
|
||||||
capabilities.
|
capabilities.
|
||||||
|
|
||||||
Keyring Management
|
Keyring Management
|
||||||
==================
|
==================
|
||||||
|
|
||||||
When you access Ceph via a Ceph client, the Ceph client will look for a local
|
When you access Ceph via a Ceph client, the Ceph client will look for a local
|
||||||
keyring. Ceph presets the ``keyring`` setting with the following four keyring
|
keyring. Ceph presets the ``keyring`` setting with four keyring
|
||||||
names by default so you don't have to set them in your Ceph configuration file
|
names by default. For this reason, you do not have to set the keyring names in your Ceph configuration file
|
||||||
unless you want to override the defaults (not recommended):
|
unless you want to override these defaults (which is not recommended). The four default keyring names are as follows:
|
||||||
|
|
||||||
- ``/etc/ceph/$cluster.$name.keyring``
|
- ``/etc/ceph/$cluster.$name.keyring``
|
||||||
- ``/etc/ceph/$cluster.keyring``
|
- ``/etc/ceph/$cluster.keyring``
|
||||||
- ``/etc/ceph/keyring``
|
- ``/etc/ceph/keyring``
|
||||||
- ``/etc/ceph/keyring.bin``
|
- ``/etc/ceph/keyring.bin``
|
||||||
|
|
||||||
The ``$cluster`` metavariable is your Ceph cluster name as defined by the
|
The ``$cluster`` metavariable found in the first two default keyring names above
|
||||||
name of the Ceph configuration file (i.e., ``ceph.conf`` means the cluster name
|
is your Ceph cluster name as defined by the name of the Ceph configuration
|
||||||
is ``ceph``; thus, ``ceph.keyring``). The ``$name`` metavariable is the user
|
file: for example, if the Ceph configuration file is named ``ceph.conf``,
|
||||||
type and user ID (e.g., ``client.admin``; thus, ``ceph.client.admin.keyring``).
|
then your Ceph cluster name is ``ceph`` and the second name above would be
|
||||||
|
``ceph.keyring``. The ``$name`` metavariable is the user type and user ID:
|
||||||
|
for example, given the user ``client.admin``, the first name above would be
|
||||||
|
``ceph.client.admin.keyring``.
|
||||||
|
|
||||||
.. note:: When executing commands that read or write to ``/etc/ceph``, you may
|
.. note:: When running commands that read or write to ``/etc/ceph``, you might
|
||||||
need to use ``sudo`` to execute the command as ``root``.
|
need to use ``sudo`` to run the command as ``root``.
|
||||||
|
|
||||||
After you create a user (e.g., ``client.ringo``), you must get the key and add
|
After you create a user (for example, ``client.ringo``), you must get the key and add
|
||||||
it to a keyring on a Ceph client so that the user can access the Ceph Storage
|
it to a keyring on a Ceph client so that the user can access the Ceph Storage
|
||||||
Cluster.
|
Cluster.
|
||||||
|
|
||||||
The `User Management`_ section details how to list, get, add, modify and delete
|
The `User Management`_ section details how to list, get, add, modify, and delete
|
||||||
users directly in the Ceph Storage Cluster. However, Ceph also provides the
|
users directly in the Ceph Storage Cluster. In addition, Ceph provides the
|
||||||
``ceph-authtool`` utility to allow you to manage keyrings from a Ceph client.
|
``ceph-authtool`` utility to allow you to manage keyrings from a Ceph client.
|
||||||
|
|
||||||
Create a Keyring
|
Creating a Keyring
|
||||||
----------------
|
------------------
|
||||||
|
|
||||||
When you use the procedures in the `Managing Users`_ section to create users,
|
When you use the procedures in the `Managing Users`_ section to create users,
|
||||||
you need to provide user keys to the Ceph client(s) so that the Ceph client
|
you must provide user keys to the Ceph client(s). This is required so that the Ceph client(s)
|
||||||
can retrieve the key for the specified user and authenticate with the Ceph
|
can retrieve the key for the specified user and authenticate that user against the Ceph
|
||||||
Storage Cluster. Ceph Clients access keyrings to lookup a user name and
|
Storage Cluster. Ceph clients access keyrings in order to look up a user name and
|
||||||
retrieve the user's key.
|
retrieve the user's key.
|
||||||
|
|
||||||
The ``ceph-authtool`` utility allows you to create a keyring. To create an
|
The ``ceph-authtool`` utility allows you to create a keyring. To create an
|
||||||
@ -635,45 +645,44 @@ empty keyring, use ``--create-keyring`` or ``-C``. For example:
|
|||||||
ceph-authtool --create-keyring /path/to/keyring
|
ceph-authtool --create-keyring /path/to/keyring
|
||||||
|
|
||||||
When creating a keyring with multiple users, we recommend using the cluster name
|
When creating a keyring with multiple users, we recommend using the cluster name
|
||||||
(e.g., ``$cluster.keyring``) for the keyring filename and saving it in the
|
(of the form ``$cluster.keyring``) for the keyring filename and saving the keyring in the
|
||||||
``/etc/ceph`` directory so that the ``keyring`` configuration default setting
|
``/etc/ceph`` directory. By doing this, you ensure that the ``keyring`` configuration default setting
|
||||||
will pick up the filename without requiring you to specify it in the local copy
|
will pick up the filename without requiring you to specify the filename in the local copy
|
||||||
of your Ceph configuration file. For example, create ``ceph.keyring`` by
|
of your Ceph configuration file. For example, you can create ``ceph.keyring`` by
|
||||||
executing the following:
|
running the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph-authtool -C /etc/ceph/ceph.keyring
|
sudo ceph-authtool -C /etc/ceph/ceph.keyring
|
||||||
|
|
||||||
When creating a keyring with a single user, we recommend using the cluster name,
|
When creating a keyring with a single user, we recommend using the cluster name,
|
||||||
the user type and the user name and saving it in the ``/etc/ceph`` directory.
|
the user type, and the user name, and saving the keyring in the ``/etc/ceph`` directory.
|
||||||
For example, ``ceph.client.admin.keyring`` for the ``client.admin`` user.
|
For example, we recommend that the ``client.admin`` user use ``ceph.client.admin.keyring``.
|
||||||
|
|
||||||
To create a keyring in ``/etc/ceph``, you must do so as ``root``. This means
|
To create a keyring in ``/etc/ceph``, you must do so as ``root``. This means
|
||||||
the file will have ``rw`` permissions for the ``root`` user only, which is
|
that the file will have ``rw`` permissions for the ``root`` user only, which is
|
||||||
appropriate when the keyring contains administrator keys. However, if you
|
appropriate when the keyring contains administrator keys. However, if you
|
||||||
intend to use the keyring for a particular user or group of users, ensure
|
intend to use the keyring for a particular user or group of users, be sure to use ``chown`` or ``chmod`` to establish appropriate keyring
|
||||||
that you execute ``chown`` or ``chmod`` to establish appropriate keyring
|
|
||||||
ownership and access.
|
ownership and access.
|
||||||
|
|
||||||
Add a User to a Keyring
|
Adding a User to a Keyring
|
||||||
-----------------------
|
--------------------------
|
||||||
|
|
||||||
When you `Add a User`_ to the Ceph Storage Cluster, you can use the `Get a
|
When you :ref:`Add a user<rados_ops_adding_a_user>` to the Ceph Storage
|
||||||
User`_ procedure to retrieve a user, key and capabilities and save the user to a
|
Cluster, you can use the `Getting a User`_ procedure to retrieve a user, key,
|
||||||
keyring.
|
and capabilities and then save the user to a keyring.
|
||||||
|
|
||||||
When you only want to use one user per keyring, the `Get a User`_ procedure with
|
If you want to use only one user per keyring, the `Getting a User`_ procedure with
|
||||||
the ``-o`` option will save the output in the keyring file format. For example,
|
the ``-o`` option will save the output in the keyring file format. For example,
|
||||||
to create a keyring for the ``client.admin`` user, execute the following:
|
to create a keyring for the ``client.admin`` user, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph auth get client.admin -o /etc/ceph/ceph.client.admin.keyring
|
sudo ceph auth get client.admin -o /etc/ceph/ceph.client.admin.keyring
|
||||||
|
|
||||||
Notice that we use the recommended file format for an individual user.
|
Notice that the file format in this command is the file format conventionally used when manipulating the keyrings of individual users.
|
||||||
|
|
||||||
When you want to import users to a keyring, you can use ``ceph-authtool``
|
If you want to import users to a keyring, you can use ``ceph-authtool``
|
||||||
to specify the destination keyring and the source keyring.
|
to specify the destination keyring and the source keyring.
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
@ -681,19 +690,19 @@ For example:
|
|||||||
|
|
||||||
sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
|
sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
|
||||||
|
|
||||||
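To confirm what the keyring now contains, you can list its entries with
``ceph-authtool``. This is just a sanity check, assuming the keyring path used
in the examples above:

.. prompt:: bash $

   sudo ceph-authtool -l /etc/ceph/ceph.keyring
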
Create a User
|
Creating a User
|
||||||
-------------
|
---------------
|
||||||
|
|
||||||
Ceph provides the `Add a User`_ function to create a user directly in the Ceph
|
Ceph provides the `Adding a User`_ function to create a user directly in the Ceph
|
||||||
Storage Cluster. However, you can also create a user, keys and capabilities
|
Storage Cluster. However, you can also create a user, keys, and capabilities
|
||||||
directly on a Ceph client keyring. Then, you can import the user to the Ceph
|
directly on a Ceph client keyring, and then import the user to the Ceph
|
||||||
Storage Cluster. For example:
|
Storage Cluster. For example:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph-authtool -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' /etc/ceph/ceph.keyring
|
sudo ceph-authtool -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' /etc/ceph/ceph.keyring
|
||||||
|
|
||||||
See `Authorization (Capabilities)`_ for additional details on capabilities.
|
For additional details on capabilities, see `Authorization (Capabilities)`_.
|
||||||
|
|
||||||
You can also create a keyring and add a new user to the keyring simultaneously.
|
You can also create a keyring and add a new user to the keyring simultaneously.
|
||||||
For example:
|
For example:
|
||||||
@ -702,36 +711,37 @@ For example:
|
|||||||
|
|
||||||
sudo ceph-authtool -C /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' --gen-key
|
sudo ceph-authtool -C /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' --gen-key
|
||||||
|
|
||||||
In the foregoing scenarios, the new user ``client.ringo`` is only in the
|
In the above examples, the new user ``client.ringo`` has been added only to the
|
||||||
keyring. To add the new user to the Ceph Storage Cluster, you must still add
|
keyring. The new user has not been added to the Ceph Storage Cluster.
|
||||||
the new user to the Ceph Storage Cluster:
|
|
||||||
|
To add the new user ``client.ringo`` to the Ceph Storage Cluster, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph auth add client.ringo -i /etc/ceph/ceph.keyring
|
sudo ceph auth add client.ringo -i /etc/ceph/ceph.keyring
|
||||||
|
|
||||||
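As a quick check (using the ``client.ringo`` example above), you can confirm
that the user, key, and capabilities are now registered in the cluster:

.. prompt:: bash $

   sudo ceph auth get client.ringo
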
Modify a User
|
Modifying a User
|
||||||
-------------
|
----------------
|
||||||
|
|
||||||
To modify the capabilities of a user record in a keyring, specify the keyring,
|
To modify the capabilities of a user record in a keyring, specify the keyring
|
||||||
and the user followed by the capabilities. For example:
|
and the user, followed by the capabilities. For example:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph-authtool /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx'
|
sudo ceph-authtool /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx'
|
||||||
|
|
||||||
To update the user to the Ceph Storage Cluster, you must update the user
|
To update the user in the Ceph Storage Cluster, you must update the user
|
||||||
in the keyring to the user entry in the Ceph Storage Cluster:
|
in the keyring to the user entry in the Ceph Storage Cluster. To do so, run the following command:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
sudo ceph auth import -i /etc/ceph/ceph.keyring
|
||||||
|
|
||||||
See `Import a User(s)`_ for details on updating a Ceph Storage Cluster user
|
For details on updating a Ceph Storage Cluster user from a
|
||||||
from a keyring.
|
keyring, see `Importing a User`_.
|
||||||
|
|
||||||
You may also `Modify User Capabilities`_ directly in the cluster, store the
|
You may also :ref:`Modify user capabilities<modify-user-capabilities>` directly in the cluster, store the
|
||||||
results to a keyring file; then, import the keyring into your main
|
results to a keyring file, and then import the keyring into your main
|
||||||
``ceph.keyring`` file.
|
``ceph.keyring`` file.
|
||||||
|
|
||||||
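A minimal sketch of that workflow, assuming the ``client.ringo`` user from the
earlier examples and an illustrative pool name (``liverpool`` is only a
placeholder), looks like this:

.. prompt:: bash $

   sudo ceph auth caps client.ringo mon 'allow r' osd 'allow rw pool=liverpool'
   sudo ceph auth get client.ringo -o /etc/ceph/ceph.client.ringo.keyring
   sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /etc/ceph/ceph.client.ringo.keyring
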
Command Line Usage
|
Command Line Usage
|
||||||
@ -741,12 +751,12 @@ Ceph supports the following usage for user name and secret:
|
|||||||
|
|
||||||
``--id`` | ``--user``
|
``--id`` | ``--user``
|
||||||
|
|
||||||
:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
|
:Description: Ceph identifies users with a type and an ID: the form of this user identification is ``TYPE.ID``, and examples of the type and ID are
|
||||||
``client.admin``, ``client.user1``). The ``id``, ``name`` and
|
``client.admin`` and ``client.user1``. The ``id``, ``name`` and
|
||||||
``-n`` options enable you to specify the ID portion of the user
|
``-n`` options allow you to specify the ID portion of the user
|
||||||
name (e.g., ``admin``, ``user1``, ``foo``, etc.). You can specify
|
name (for example, ``admin``, ``user1``, ``foo``). You can specify
|
||||||
the user with the ``--id`` and omit the type. For example,
|
the user with the ``--id`` and omit the type. For example,
|
||||||
to specify user ``client.foo`` enter the following:
|
to specify user ``client.foo``, run the following commands:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
|
|
||||||
@ -756,10 +766,10 @@ Ceph supports the following usage for user name and secret:
|
|||||||
|
|
||||||
``--name`` | ``-n``
|
``--name`` | ``-n``
|
||||||
|
|
||||||
:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
|
:Description: Ceph identifies users with a type and an ID: the form of this user identification is ``TYPE.ID``, and examples of the type and ID are
|
||||||
``client.admin``, ``client.user1``). The ``--name`` and ``-n``
|
``client.admin`` and ``client.user1``. The ``--name`` and ``-n``
|
||||||
options enables you to specify the fully qualified user name.
|
options allow you to specify the fully qualified user name.
|
||||||
You must specify the user type (typically ``client``) with the
|
You are required to specify the user type (typically ``client``) with the
|
||||||
user ID. For example:
|
user ID. For example:
|
||||||
|
|
||||||
.. prompt:: bash $
|
.. prompt:: bash $
|
||||||
@ -770,8 +780,8 @@ Ceph supports the following usage for user name and secret:
|
|||||||
|
|
||||||
``--keyring``
|
``--keyring``
|
||||||
|
|
||||||
:Description: The path to the keyring containing one or more user name and
|
:Description: The path to the keyring that contains one or more user names and
|
||||||
secret. The ``--secret`` option provides the same functionality,
|
secrets. The ``--secret`` option provides the same functionality,
|
||||||
but it does not work with Ceph RADOS Gateway, which uses
|
but it does not work with Ceph RADOS Gateway, which uses
|
||||||
``--secret`` for another purpose. You may retrieve a keyring with
|
``--secret`` for another purpose. You may retrieve a keyring with
|
||||||
``ceph auth get-or-create`` and store it locally. This is a
|
``ceph auth get-or-create`` and store it locally. This is a
|
||||||
@ -788,43 +798,42 @@ Ceph supports the following usage for user name and secret:
|
|||||||
Limitations
|
Limitations
|
||||||
===========
|
===========
|
||||||
|
|
||||||
The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
|
The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
|
||||||
is not intended to handle authentication of human users or application programs
|
is not intended to handle authentication of human users or application programs
|
||||||
run on their behalf. If that effect is required to handle your access control
|
that are run on their behalf. If your access control
|
||||||
needs, you must have another mechanism, which is likely to be specific to the
|
needs require that kind of authentication, you will need to have some other mechanism, which is likely to be specific to the
|
||||||
front end used to access the Ceph object store. This other mechanism has the
|
front end that is used to access the Ceph object store. This other mechanism would ensure that only acceptable users and programs are able to run on the
|
||||||
role of ensuring that only acceptable users and programs are able to run on the
|
machine that Ceph permits to access its object store.
|
||||||
machine that Ceph will permit to access its object store.
|
|
||||||
|
|
||||||
The keys used to authenticate Ceph clients and servers are typically stored in
|
The keys used to authenticate Ceph clients and servers are typically stored in
|
||||||
a plain text file with appropriate permissions in a trusted host.
|
a plain text file on a trusted host. Appropriate permissions must be set on the plain text file.
|
||||||
|
|
||||||
.. important:: Storing keys in plaintext files has security shortcomings, but
|
.. important:: Storing keys in plaintext files has security shortcomings, but
|
||||||
they are difficult to avoid, given the basic authentication methods Ceph
|
they are difficult to avoid, given the basic authentication methods Ceph
|
||||||
uses in the background. Those setting up Ceph systems should be aware of
|
uses in the background. Anyone setting up Ceph systems should be aware of
|
||||||
these shortcomings.
|
these shortcomings.
|
||||||
|
|
||||||
In particular, arbitrary user machines, especially portable machines, should not
|
In particular, user machines, especially portable machines, should not
|
||||||
be configured to interact directly with Ceph, since that mode of use would
|
be configured to interact directly with Ceph, since that mode of use would
|
||||||
require the storage of a plaintext authentication key on an insecure machine.
|
require the storage of a plaintext authentication key on an insecure machine.
|
||||||
Anyone who stole that machine or obtained surreptitious access to it could
|
Anyone who stole that machine or obtained access to it could
|
||||||
obtain the key that will allow them to authenticate their own machines to Ceph.
|
obtain a key that allows them to authenticate their own machines to Ceph.
|
||||||
|
|
||||||
Rather than permitting potentially insecure machines to access a Ceph object
|
Instead of permitting potentially insecure machines to access a Ceph object
|
||||||
store directly, users should be required to sign in to a trusted machine in
|
store directly, you should require users to sign in to a trusted machine in
|
||||||
your environment using a method that provides sufficient security for your
|
your environment, using a method that provides sufficient security for your
|
||||||
purposes. That trusted machine will store the plaintext Ceph keys for the
|
purposes. That trusted machine will store the plaintext Ceph keys for the
|
||||||
human users. A future version of Ceph may address these particular
|
human users. A future version of Ceph might address these particular
|
||||||
authentication issues more fully.
|
authentication issues more fully.
|
||||||
|
|
||||||
At the moment, none of the Ceph authentication protocols provide secrecy for
|
At present, none of the Ceph authentication protocols provide secrecy for
|
||||||
messages in transit. Thus, an eavesdropper on the wire can hear and understand
|
messages in transit. As a result, an eavesdropper on the wire can hear and understand
|
||||||
all data sent between clients and servers in Ceph, even if it cannot create or
|
all data sent between clients and servers in Ceph, even if the eavesdropper cannot create or
|
||||||
alter them. Further, Ceph does not include options to encrypt user data in the
|
alter the data. Similarly, Ceph does not include options to encrypt user data in the
|
||||||
object store. Users can hand-encrypt and store their own data in the Ceph
|
object store. Users can, of course, hand-encrypt and store their own data in the Ceph
|
||||||
object store, of course, but Ceph provides no features to perform object
|
object store, but Ceph itself provides no features to perform object
|
||||||
encryption itself. Those storing sensitive data in Ceph should consider
|
encryption. Anyone storing sensitive data in Ceph should consider
|
||||||
encrypting their data before providing it to the Ceph system.
|
encrypting their data before providing it to the Ceph system.
|
||||||
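As a minimal sketch of hand-encrypting data before it reaches Ceph (the pool
name, object name, and file names here are only placeholders), the data could
be encrypted locally and only the ciphertext handed to ``rados``:

.. prompt:: bash $

   gpg --symmetric --cipher-algo AES256 -o backup.img.gpg backup.img
   rados -p mypool put backup.img.gpg backup.img.gpg
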
|
|
||||||
|
|
||||||
.. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication
|
.. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication
|
||||||
|
@ -36,8 +36,9 @@ resharding tasks, one at a time.
|
|||||||
Multisite
|
Multisite
|
||||||
=========
|
=========
|
||||||
|
|
||||||
Dynamic resharding is not supported in a multisite environment.
|
Prior to the Reef release, RGW does not support dynamic resharding in a
|
||||||
|
multisite environment. For information on dynamic resharding, see
|
||||||
|
:ref:`Resharding <feature_resharding>` in the RGW multisite documentation.
|
||||||
|
|
||||||
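In a pre-Reef multisite deployment, one way to make sure dynamic resharding
stays off (assuming the RGW daemons read their configuration from the
monitors) is to disable it explicitly:

.. prompt:: bash #

   ceph config set client.rgw rgw_dynamic_resharding false
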
Configuration
|
Configuration
|
||||||
=============
|
=============
|
||||||
|
@ -1130,7 +1130,7 @@ To view the configuration of a zonegroup, run this command:
|
|||||||
|
|
||||||
.. prompt:: bash #
|
.. prompt:: bash #
|
||||||
|
|
||||||
dosgw-admin zonegroup get [--rgw-zonegroup=<zonegroup>]
|
radosgw-admin zonegroup get [--rgw-zonegroup=<zonegroup>]
|
||||||
|
|
||||||
The zonegroup configuration looks like this:
|
The zonegroup configuration looks like this:
|
||||||
|
|
||||||
@ -1582,14 +1582,23 @@ Supported Features
|
|||||||
|
|
||||||
.. _feature_resharding:
|
.. _feature_resharding:
|
||||||
|
|
||||||
resharding
|
Resharding
|
||||||
~~~~~~~~~~
|
~~~~~~~~~~
|
||||||
|
|
||||||
Allows buckets to be resharded in a multisite configuration without interrupting the replication of their objects. When ``rgw_dynamic_resharding`` is enabled, it runs on each zone independently, and zones may choose different shard counts for the same bucket. When buckets are resharded manually with ``radosgw-admin bucket reshard``, only that zone's bucket is modified. A zone feature should only be marked as supported after all of its radosgws and osds have upgraded.
|
This feature allows buckets to be resharded in a multisite configuration
|
||||||
|
without interrupting the replication of their objects. When
|
||||||
|
``rgw_dynamic_resharding`` is enabled, it runs on each zone independently, and
|
||||||
|
zones may choose different shard counts for the same bucket. When buckets are
|
||||||
|
resharded manually with ``radosgw-admin bucket reshard``, only that zone's
|
||||||
|
bucket is modified. A zone feature should only be marked as supported after all
|
||||||
|
of its RGWs and OSDs have upgraded.
|
||||||
|
|
||||||
|
.. note:: Dynamic resharding is not supported in multisite deployments prior to
|
||||||
|
the Reef release.
|
||||||
|
|
||||||
|
|
||||||
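For reference, a bucket can still be resharded manually in each zone; a
typical invocation looks like this (the bucket name and shard count are
placeholders):

.. prompt:: bash #

   radosgw-admin bucket reshard --bucket=<bucket-name> --num-shards=<new-shard-count>
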
Commands
|
Commands
|
||||||
-----------------
|
--------
|
||||||
|
|
||||||
Add support for a zone feature
|
Add support for a zone feature
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
@ -138,9 +138,6 @@ updating, use the name of an existing topic and different endpoint values).
|
|||||||
.. tip:: Any notification already associated with the topic must be re-created
|
.. tip:: Any notification already associated with the topic must be re-created
|
||||||
in order for the topic to update.
|
in order for the topic to update.
|
||||||
|
|
||||||
.. note:: For rabbitmq, ``push-endpoint`` (with a hyphen in the middle) must be
|
|
||||||
changed to ``push_endpoint`` (with an underscore in the middle).
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
POST
|
POST
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
.. _documenting_ceph:
|
||||||
|
|
||||||
==================
|
==================
|
||||||
Documenting Ceph
|
Documenting Ceph
|
||||||
==================
|
==================
|
||||||
|
@ -12,9 +12,10 @@ These are exciting times in the Ceph community! Get involved!
|
|||||||
| **Blog** | Check the Ceph Blog_ periodically to keep track | http://ceph.com/community/blog/ |
|
| **Blog** | Check the Ceph Blog_ periodically to keep track | http://ceph.com/community/blog/ |
|
||||||
| | of Ceph progress and important announcements. | |
|
| | of Ceph progress and important announcements. | |
|
||||||
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
||||||
| **Planet Ceph** | Check the blog aggregation on Planet Ceph for | https://ceph.com/category/planet/ |
|
| **Planet Ceph** | Check the blog aggregation on Planet Ceph for | https://old.ceph.com/category/planet/ |
|
||||||
| | interesting stories, information and | |
|
| | interesting stories, information and | |
|
||||||
| | experiences from the community. | |
|
| | experiences from the community. **NOTE: NO | |
|
||||||
|
| | longer updated as of 2023.** | |
|
||||||
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
+----------------------+-------------------------------------------------+-----------------------------------------------+
|
||||||
| **Wiki** | Check the Ceph Wiki is a source for more | http://wiki.ceph.com/ |
|
| **Wiki** | Check the Ceph Wiki is a source for more | http://wiki.ceph.com/ |
|
||||||
| | community and development related topics. You | |
|
| | community and development related topics. You | |
|
||||||
|
@ -2,14 +2,24 @@
|
|||||||
Intro to Ceph
|
Intro to Ceph
|
||||||
===============
|
===============
|
||||||
|
|
||||||
Whether you want to provide :term:`Ceph Object Storage` and/or
|
Ceph can be used to provide :term:`Ceph Object Storage` to :term:`Cloud
|
||||||
:term:`Ceph Block Device` services to :term:`Cloud Platforms`, deploy
|
Platforms` and Ceph can be used to provide :term:`Ceph Block Device` services
|
||||||
a :term:`Ceph File System` or use Ceph for another purpose, all
|
to :term:`Cloud Platforms`. Ceph can be used to deploy a :term:`Ceph File
|
||||||
:term:`Ceph Storage Cluster` deployments begin with setting up each
|
System`. All :term:`Ceph Storage Cluster` deployments begin with setting up
|
||||||
:term:`Ceph Node`, your network, and the Ceph Storage Cluster. A Ceph
|
each :term:`Ceph Node` and then setting up the network.
|
||||||
Storage Cluster requires at least one Ceph Monitor, Ceph Manager, and
|
|
||||||
Ceph OSD (Object Storage Daemon). The Ceph Metadata Server is also
|
A Ceph Storage Cluster requires the following: at least one Ceph Monitor and at
|
||||||
required when running Ceph File System clients.
|
least one Ceph Manager, and at least as many Ceph OSDs as there are copies of
|
||||||
|
an object stored on the Ceph cluster (for example, if three copies of a given
|
||||||
|
object are stored on the Ceph cluster, then at least three OSDs must exist in
|
||||||
|
that Ceph cluster).
|
||||||
|
|
||||||
|
The Ceph Metadata Server is necessary to run Ceph File System clients.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
It is a best practice to have a Ceph Manager for each Monitor, but it is not
|
||||||
|
necessary.
|
||||||
|
|
||||||
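For example, the number of copies kept for a given pool (and therefore the
minimum number of OSDs that pool needs) can be checked on a running cluster;
the pool name below is a placeholder:

.. prompt:: bash $

   ceph osd pool get <pool-name> size
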
.. ditaa::
|
.. ditaa::
|
||||||
|
|
||||||
|
@ -18,19 +18,18 @@ Linux Kernel
|
|||||||
maintenance" kernel series provided by either http://kernel.org or
|
maintenance" kernel series provided by either http://kernel.org or
|
||||||
your Linux distribution on any client hosts.
|
your Linux distribution on any client hosts.
|
||||||
|
|
||||||
For RBD, if you choose to *track* long-term kernels, we currently recommend
|
For RBD, if you choose to *track* long-term kernels, we recommend
|
||||||
4.x-based "longterm maintenance" kernel series or later:
|
*at least* 4.19-based "longterm maintenance" kernel series. If you can
|
||||||
|
use a newer "stable" or "longterm maintenance" kernel series, do it.
|
||||||
- 4.19.z
|
|
||||||
- 4.14.z
|
|
||||||
- 5.x
|
|
||||||
|
|
||||||
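A quick way to see which kernel series a client host is running before
applying this guidance:

.. prompt:: bash $

   uname -r
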
For CephFS, see the section about `Mounting CephFS using Kernel Driver`_
|
For CephFS, see the section about `Mounting CephFS using Kernel Driver`_
|
||||||
for kernel version guidance.
|
for kernel version guidance.
|
||||||
|
|
||||||
Older kernel client versions may not support your `CRUSH tunables`_ profile
|
Older kernel client versions may not support your `CRUSH tunables`_ profile
|
||||||
or other newer features of the Ceph cluster, requiring the storage cluster
|
or other newer features of the Ceph cluster, requiring the storage cluster to
|
||||||
to be configured with those features disabled.
|
be configured with those features disabled. For RBD, a kernel of version 5.3
|
||||||
|
or CentOS 8.2 is the minimum necessary for reasonable support for RBD image
|
||||||
|
features.
|
||||||
|
|
||||||
|
|
||||||
Platforms
|
Platforms
|
||||||
|
@ -178,45 +178,77 @@ function install_pkg_on_ubuntu {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boost_ver=1.73
|
||||||
|
|
||||||
|
function clean_boost_on_ubuntu {
|
||||||
|
in_jenkins && echo "CI_DEBUG: Start clean_boost_on_ubuntu() in install-deps.sh"
|
||||||
|
# Find currently installed version. If there are multiple
|
||||||
|
# versions, they end up newline separated
|
||||||
|
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
|
||||||
|
cut -d' ' -f2 |
|
||||||
|
cut -d'.' -f1,2 |
|
||||||
|
sort -u)
|
||||||
|
# If installed_ver contains whitespace, we can't really count on it,
|
||||||
|
# but otherwise, bail out if the version installed is the version
|
||||||
|
# we want.
|
||||||
|
if test -n "$installed_ver" &&
|
||||||
|
echo -n "$installed_ver" | tr '[:space:]' ' ' | grep -v -q ' '; then
|
||||||
|
if echo "$installed_ver" | grep -q "^$boost_ver"; then
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Historical packages
|
||||||
|
$SUDO rm -f /etc/apt/sources.list.d/ceph-libboost*.list
|
||||||
|
# Currently used
|
||||||
|
$SUDO rm -f /etc/apt/sources.list.d/libboost.list
|
||||||
|
# Refresh package list so things aren't in the available list.
|
||||||
|
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get update -y || true
|
||||||
|
# Remove all ceph-libboost packages. We have an early return if
|
||||||
|
# the desired version is already (and the only) version installed,
|
||||||
|
# so no need to spare it.
|
||||||
|
if test -n "$installed_ver"; then
|
||||||
|
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y --fix-missing remove "ceph-libboost*"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
function install_boost_on_ubuntu {
|
function install_boost_on_ubuntu {
|
||||||
local ver=1.73
|
|
||||||
in_jenkins && echo "CI_DEBUG: Running install_boost_on_ubuntu() in install-deps.sh"
|
in_jenkins && echo "CI_DEBUG: Running install_boost_on_ubuntu() in install-deps.sh"
|
||||||
|
# Once we get to this point, clean_boost_on_ubuntu() should ensure
|
||||||
|
# that there is no more than one installed version.
|
||||||
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
|
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
|
||||||
grep -e 'libboost[0-9].[0-9]\+-dev' |
|
grep -e 'libboost[0-9].[0-9]\+-dev' |
|
||||||
cut -d' ' -f2 |
|
cut -d' ' -f2 |
|
||||||
cut -d'.' -f1,2)
|
cut -d'.' -f1,2)
|
||||||
if test -n "$installed_ver"; then
|
if test -n "$installed_ver"; then
|
||||||
if echo "$installed_ver" | grep -q "^$ver"; then
|
if echo "$installed_ver" | grep -q "^$boost_ver"; then
|
||||||
return
|
return
|
||||||
else
|
|
||||||
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y remove "ceph-libboost.*${installed_ver}.*"
|
|
||||||
$SUDO rm -f /etc/apt/sources.list.d/ceph-libboost${installed_ver}.list
|
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
local codename=$1
|
local codename=$1
|
||||||
local project=libboost
|
local project=libboost
|
||||||
local sha1=7aba8a1882670522ee1d1ee1bba0ea170b292dec
|
local sha1=7aba8a1882670522ee1d1ee1bba0ea170b292dec
|
||||||
install_pkg_on_ubuntu \
|
install_pkg_on_ubuntu \
|
||||||
$project \
|
$project \
|
||||||
$sha1 \
|
$sha1 \
|
||||||
$codename \
|
$codename \
|
||||||
check \
|
check \
|
||||||
ceph-libboost-atomic$ver-dev \
|
ceph-libboost-atomic${boost_ver}-dev \
|
||||||
ceph-libboost-chrono$ver-dev \
|
ceph-libboost-chrono${boost_ver}-dev \
|
||||||
ceph-libboost-container$ver-dev \
|
ceph-libboost-container${boost_ver}-dev \
|
||||||
ceph-libboost-context$ver-dev \
|
ceph-libboost-context${boost_ver}-dev \
|
||||||
ceph-libboost-coroutine$ver-dev \
|
ceph-libboost-coroutine${boost_ver}-dev \
|
||||||
ceph-libboost-date-time$ver-dev \
|
ceph-libboost-date-time${boost_ver}-dev \
|
||||||
ceph-libboost-filesystem$ver-dev \
|
ceph-libboost-filesystem${boost_ver}-dev \
|
||||||
ceph-libboost-iostreams$ver-dev \
|
ceph-libboost-iostreams${boost_ver}-dev \
|
||||||
ceph-libboost-program-options$ver-dev \
|
ceph-libboost-program-options${boost_ver}-dev \
|
||||||
ceph-libboost-python$ver-dev \
|
ceph-libboost-python${boost_ver}-dev \
|
||||||
ceph-libboost-random$ver-dev \
|
ceph-libboost-random${boost_ver}-dev \
|
||||||
ceph-libboost-regex$ver-dev \
|
ceph-libboost-regex${boost_ver}-dev \
|
||||||
ceph-libboost-system$ver-dev \
|
ceph-libboost-system${boost_ver}-dev \
|
||||||
ceph-libboost-test$ver-dev \
|
ceph-libboost-test${boost_ver}-dev \
|
||||||
ceph-libboost-thread$ver-dev \
|
ceph-libboost-thread${boost_ver}-dev \
|
||||||
ceph-libboost-timer$ver-dev
|
ceph-libboost-timer${boost_ver}-dev
|
||||||
}
|
}
|
||||||
|
|
||||||
function install_libzbd_on_ubuntu {
|
function install_libzbd_on_ubuntu {
|
||||||
@ -310,6 +342,9 @@ else
|
|||||||
case "$ID" in
|
case "$ID" in
|
||||||
debian|ubuntu|devuan|elementary)
|
debian|ubuntu|devuan|elementary)
|
||||||
echo "Using apt-get to install dependencies"
|
echo "Using apt-get to install dependencies"
|
||||||
|
# Put this before any other invocation of apt so it can clean
|
||||||
|
# up in a broken case.
|
||||||
|
clean_boost_on_ubuntu
|
||||||
$SUDO apt-get install -y devscripts equivs
|
$SUDO apt-get install -y devscripts equivs
|
||||||
$SUDO apt-get install -y dpkg-dev
|
$SUDO apt-get install -y dpkg-dev
|
||||||
ensure_python3_sphinx_on_ubuntu
|
ensure_python3_sphinx_on_ubuntu
|
||||||
@ -319,6 +354,27 @@ else
|
|||||||
[ ! $NO_BOOST_PKGS ] && install_boost_on_ubuntu bionic
|
[ ! $NO_BOOST_PKGS ] && install_boost_on_ubuntu bionic
|
||||||
$with_zbd && install_libzbd_on_ubuntu bionic
|
$with_zbd && install_libzbd_on_ubuntu bionic
|
||||||
;;
|
;;
|
||||||
|
*Jammy*)
|
||||||
|
[ ! $NO_BOOST_PKGS ] && \
|
||||||
|
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
libboost-atomic-dev \
|
||||||
|
libboost-chrono-dev \
|
||||||
|
libboost-container-dev \
|
||||||
|
libboost-context-dev \
|
||||||
|
libboost-coroutine-dev \
|
||||||
|
libboost-date-time-dev \
|
||||||
|
libboost-filesystem-dev \
|
||||||
|
libboost-iostreams-dev \
|
||||||
|
libboost-program-options-dev \
|
||||||
|
libboost-python-dev \
|
||||||
|
libboost-random-dev \
|
||||||
|
libboost-regex-dev \
|
||||||
|
libboost-system-dev \
|
||||||
|
libboost-test-dev \
|
||||||
|
libboost-thread-dev \
|
||||||
|
libboost-timer-dev \
|
||||||
|
gcc
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
$SUDO apt-get install -y gcc
|
$SUDO apt-get install -y gcc
|
||||||
;;
|
;;
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
# https://tracker.ceph.com/issues/45802
|
# https://tracker.ceph.com/issues/45802
|
||||||
|
# https://tracker.ceph.com/issues/61168
|
||||||
overrides:
|
overrides:
|
||||||
ceph:
|
ceph:
|
||||||
log-ignorelist:
|
log-ignorelist:
|
||||||
- \(PG_AVAILABILITY\)
|
- \(PG_AVAILABILITY\)
|
||||||
|
- \(POOL_APP_NOT_ENABLED\)
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
overrides:
|
overrides:
|
||||||
ceph:
|
ceph:
|
||||||
log-ignorelist:
|
log-ignorelist:
|
||||||
|
@ -8,6 +8,9 @@ overrides:
|
|||||||
- slow request
|
- slow request
|
||||||
- MDS_CLIENT_LATE_RELEASE
|
- MDS_CLIENT_LATE_RELEASE
|
||||||
- t responding to mclientcaps
|
- t responding to mclientcaps
|
||||||
|
- Degraded data redundancy
|
||||||
|
- MDS_CLIENTS_LAGGY
|
||||||
|
- Reduced data availability
|
||||||
tasks:
|
tasks:
|
||||||
- cephfs_test_runner:
|
- cephfs_test_runner:
|
||||||
fail_on_skip: false
|
fail_on_skip: false
|
||||||
|
ceph/qa/suites/fs/mirror-ha/cephfs-mirror/+ (new file)
@ -0,0 +1,14 @@
|
|||||||
|
meta:
|
||||||
|
- desc: create/rm volumes and set configs
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- exec:
|
||||||
|
mon.a:
|
||||||
|
- "ceph fs volume create dc"
|
||||||
|
- "ceph fs volume create dc-backup"
|
||||||
|
- full_sequential_finally:
|
||||||
|
- exec:
|
||||||
|
mon.a:
|
||||||
|
- ceph config set mon mon_allow_pool_delete true
|
||||||
|
- ceph fs volume rm dc --yes-i-really-mean-it
|
||||||
|
- ceph fs volume rm dc-backup --yes-i-really-mean-it
|
@ -8,10 +8,6 @@ overrides:
|
|||||||
debug client: 10
|
debug client: 10
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- exec:
|
|
||||||
client.1:
|
|
||||||
- "ceph fs volume create dc"
|
|
||||||
- "ceph fs volume create dc-backup"
|
|
||||||
- ceph-fuse:
|
- ceph-fuse:
|
||||||
client.1:
|
client.1:
|
||||||
cephfs_name: dc
|
cephfs_name: dc
|
||||||
|
@ -11,3 +11,4 @@ overrides:
|
|||||||
- has not responded to cap revoke by MDS for over
|
- has not responded to cap revoke by MDS for over
|
||||||
- MDS_CLIENT_LATE_RELEASE
|
- MDS_CLIENT_LATE_RELEASE
|
||||||
- responding to mclientcaps
|
- responding to mclientcaps
|
||||||
|
- RECENT_CRASH
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
meta:
|
meta:
|
||||||
- desc: 1 ceph cluster with 1 mon, 1 mgr, 3 osds, 1 mds
|
- desc: 1 ceph cluster with 1 mon, 1 mgr, 3 osds, 2 mds, 2 clients
|
||||||
roles:
|
roles:
|
||||||
- - mon.a
|
- - mon.a
|
||||||
- mgr.x
|
- mgr.x
|
||||||
- mds.a
|
- mds.a
|
||||||
|
- mds.b
|
||||||
- osd.0
|
- osd.0
|
||||||
- osd.1
|
- osd.1
|
||||||
- osd.2
|
- osd.2
|
||||||
- client.0
|
- client.0
|
||||||
|
- client.1
|
||||||
|
@ -5,3 +5,4 @@ tasks:
|
|||||||
- tasks.cephfs.test_volumes.TestVolumes
|
- tasks.cephfs.test_volumes.TestVolumes
|
||||||
- tasks.cephfs.test_volumes.TestSubvolumeGroups
|
- tasks.cephfs.test_volumes.TestSubvolumeGroups
|
||||||
- tasks.cephfs.test_volumes.TestSubvolumes
|
- tasks.cephfs.test_volumes.TestSubvolumes
|
||||||
|
- tasks.cephfs.test_subvolume.TestSubvolume
|
||||||
|
ceph/qa/suites/fs/workload/subvolume/$ (new file)
@ -0,0 +1,11 @@
|
|||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
subvols:
|
||||||
|
create: 2
|
||||||
|
subvol_options: "--namespace-isolated --size 25000000000"
|
||||||
|
ceph-fuse:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 0
|
||||||
|
kclient:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 1
|
@ -0,0 +1,11 @@
|
|||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
subvols:
|
||||||
|
create: 2
|
||||||
|
subvol_options: "--namespace-isolated"
|
||||||
|
ceph-fuse:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 0
|
||||||
|
kclient:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 1
|
@ -0,0 +1,10 @@
|
|||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
subvols:
|
||||||
|
create: 2
|
||||||
|
ceph-fuse:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 0
|
||||||
|
kclient:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 1
|
ceph/qa/suites/fs/workload/subvolume/with-quota.yaml (new file, 11 lines)
@ -0,0 +1,11 @@
|
|||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
subvols:
|
||||||
|
create: 2
|
||||||
|
subvol_options: "--size 25000000000"
|
||||||
|
ceph-fuse:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 0
|
||||||
|
kclient:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 1
|
@ -0,0 +1,12 @@
|
|||||||
|
overrides:
|
||||||
|
install:
|
||||||
|
ceph:
|
||||||
|
extra_system_packages:
|
||||||
|
- pv
|
||||||
|
tasks:
|
||||||
|
- workunit:
|
||||||
|
clients:
|
||||||
|
all:
|
||||||
|
- rbd/diff_continuous.sh
|
||||||
|
env:
|
||||||
|
RBD_DEVICE_TYPE: "krbd"
|
ceph/qa/suites/orch/cephadm/workunits/task/.qa (new symbolic link)
@ -0,0 +1 @@
|
|||||||
|
../.qa/
|
@ -0,0 +1 @@
|
|||||||
|
../.qa/
|
@ -0,0 +1 @@
|
|||||||
|
.qa/distros/podman/centos_8.stream_container_tools.yaml
|
@ -18,3 +18,4 @@ tasks:
|
|||||||
clients:
|
clients:
|
||||||
client.0:
|
client.0:
|
||||||
- cephadm/test_iscsi_pids_limit.sh
|
- cephadm/test_iscsi_pids_limit.sh
|
||||||
|
- cephadm/test_iscsi_etc_hosts.sh
|
@ -1 +0,0 @@
|
|||||||
../orch/rook
|
|
@ -16,7 +16,7 @@ override:
|
|||||||
ceph:
|
ceph:
|
||||||
conf:
|
conf:
|
||||||
mon:
|
mon:
|
||||||
osd default pool size: 3
|
osd pool default size: 3
|
||||||
osd min pg log entries: 5
|
osd min pg log entries: 5
|
||||||
osd max pg log entries: 10
|
osd max pg log entries: 10
|
||||||
tasks:
|
tasks:
|
||||||
|
@ -12,11 +12,11 @@ openstack:
|
|||||||
- volumes: # attached to each instance
|
- volumes: # attached to each instance
|
||||||
count: 3
|
count: 3
|
||||||
size: 10 # GB
|
size: 10 # GB
|
||||||
override:
|
overrides:
|
||||||
ceph:
|
ceph:
|
||||||
conf:
|
conf:
|
||||||
mon:
|
mon:
|
||||||
osd default pool size: 3
|
osd pool default size: 3
|
||||||
tasks:
|
tasks:
|
||||||
- install:
|
- install:
|
||||||
- ceph:
|
- ceph:
|
||||||
|
@ -20,6 +20,10 @@ overrides:
|
|||||||
debug monc: 20
|
debug monc: 20
|
||||||
mon:
|
mon:
|
||||||
mon warn on pool no app: false
|
mon warn on pool no app: false
|
||||||
|
osd:
|
||||||
|
osd class load list: "*"
|
||||||
|
osd class default list: "*"
|
||||||
|
osd client watch timeout: 120
|
||||||
tasks:
|
tasks:
|
||||||
- workunit:
|
- workunit:
|
||||||
timeout: 6h
|
timeout: 6h
|
||||||
|
@ -0,0 +1,14 @@
|
|||||||
|
overrides:
|
||||||
|
install:
|
||||||
|
ceph:
|
||||||
|
extra_packages:
|
||||||
|
- rbd-nbd
|
||||||
|
extra_system_packages:
|
||||||
|
- pv
|
||||||
|
tasks:
|
||||||
|
- workunit:
|
||||||
|
clients:
|
||||||
|
client.0:
|
||||||
|
- rbd/diff_continuous.sh
|
||||||
|
env:
|
||||||
|
RBD_DEVICE_TYPE: "nbd"
|
@ -8,6 +8,7 @@ tasks:
|
|||||||
- qemu-kvm-block-rbd
|
- qemu-kvm-block-rbd
|
||||||
deb:
|
deb:
|
||||||
- qemu-block-extra
|
- qemu-block-extra
|
||||||
|
- qemu-utils
|
||||||
- ceph:
|
- ceph:
|
||||||
fs: xfs
|
fs: xfs
|
||||||
conf:
|
conf:
|
||||||
|
@ -8,6 +8,7 @@ tasks:
|
|||||||
- qemu-kvm-block-rbd
|
- qemu-kvm-block-rbd
|
||||||
deb:
|
deb:
|
||||||
- qemu-block-extra
|
- qemu-block-extra
|
||||||
|
- qemu-utils
|
||||||
- ceph:
|
- ceph:
|
||||||
fs: xfs
|
fs: xfs
|
||||||
conf:
|
conf:
|
||||||
|
@ -8,6 +8,7 @@ tasks:
|
|||||||
- qemu-kvm-block-rbd
|
- qemu-kvm-block-rbd
|
||||||
deb:
|
deb:
|
||||||
- qemu-block-extra
|
- qemu-block-extra
|
||||||
|
- qemu-utils
|
||||||
- ceph:
|
- ceph:
|
||||||
fs: xfs
|
fs: xfs
|
||||||
conf:
|
conf:
|
||||||
|
@ -8,6 +8,7 @@ tasks:
|
|||||||
- qemu-kvm-block-rbd
|
- qemu-kvm-block-rbd
|
||||||
deb:
|
deb:
|
||||||
- qemu-block-extra
|
- qemu-block-extra
|
||||||
|
- qemu-utils
|
||||||
- ceph:
|
- ceph:
|
||||||
fs: xfs
|
fs: xfs
|
||||||
conf:
|
conf:
|
||||||
|
@ -18,6 +18,5 @@ overrides:
|
|||||||
endpoints: [c2.client.0]
|
endpoints: [c2.client.0]
|
||||||
- name: test-zone3
|
- name: test-zone3
|
||||||
endpoints: [c1.client.1]
|
endpoints: [c1.client.1]
|
||||||
- name: test-zone4
|
rgw-multisite-tests:
|
||||||
endpoints: [c2.client.1]
|
args: [tests.py]
|
||||||
is_pubsub: true
|
|
ceph/qa/suites/rgw/verify/tasks/versioning.yaml (new file, 5 lines)
@ -0,0 +1,5 @@
|
|||||||
|
tasks:
|
||||||
|
- workunit:
|
||||||
|
clients:
|
||||||
|
client.0:
|
||||||
|
- rgw/run-versioning.sh
|
@ -9,4 +9,6 @@ workload:
|
|||||||
clients:
|
clients:
|
||||||
client.0:
|
client.0:
|
||||||
- cls
|
- cls
|
||||||
|
env:
|
||||||
|
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||||
- print: "**** done end rados_api.yaml"
|
- print: "**** done end rados_api.yaml"
|
||||||
|
@ -7,4 +7,6 @@ stress-tasks:
|
|||||||
clients:
|
clients:
|
||||||
client.0:
|
client.0:
|
||||||
- cls/test_cls_rbd.sh
|
- cls/test_cls_rbd.sh
|
||||||
|
env:
|
||||||
|
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||||
|
@ -7,4 +7,6 @@ first-half-tasks:
|
|||||||
clients:
|
clients:
|
||||||
client.0:
|
client.0:
|
||||||
- cls/test_cls_rbd.sh
|
- cls/test_cls_rbd.sh
|
||||||
|
env:
|
||||||
|
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||||
|
@ -7,4 +7,6 @@ stress-tasks:
|
|||||||
clients:
|
clients:
|
||||||
client.0:
|
client.0:
|
||||||
- cls/test_cls_rbd.sh
|
- cls/test_cls_rbd.sh
|
||||||
|
env:
|
||||||
|
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
|
||||||
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
- print: "**** done cls/test_cls_rbd.sh 5-workload"
|
||||||
|
@ -262,6 +262,7 @@ def ceph_log(ctx, config):
|
|||||||
run.wait(
|
run.wait(
|
||||||
ctx.cluster.run(
|
ctx.cluster.run(
|
||||||
args=[
|
args=[
|
||||||
|
'time',
|
||||||
'sudo',
|
'sudo',
|
||||||
'find',
|
'find',
|
||||||
'/var/log/ceph',
|
'/var/log/ceph',
|
||||||
@ -271,10 +272,15 @@ def ceph_log(ctx, config):
|
|||||||
run.Raw('|'),
|
run.Raw('|'),
|
||||||
'sudo',
|
'sudo',
|
||||||
'xargs',
|
'xargs',
|
||||||
|
'--max-args=1',
|
||||||
|
'--max-procs=0',
|
||||||
|
'--verbose',
|
||||||
'-0',
|
'-0',
|
||||||
'--no-run-if-empty',
|
'--no-run-if-empty',
|
||||||
'--',
|
'--',
|
||||||
'gzip',
|
'gzip',
|
||||||
|
'-5',
|
||||||
|
'--verbose',
|
||||||
'--',
|
'--',
|
||||||
],
|
],
|
||||||
wait=False,
|
wait=False,
|
||||||
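For readers skimming the argument list above, the assembled command is roughly
the following shell pipeline (the ``find`` predicates that select the log
files are abbreviated here and are an assumption; the xargs and gzip flags are
the ones added above):

   sudo find /var/log/ceph -name '*.log' -print0 | sudo xargs --max-args=1 --max-procs=0 --verbose -0 --no-run-if-empty -- gzip -5 --verbose --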
@ -445,6 +451,9 @@ def cephfs_setup(ctx, config):
|
|||||||
name = fs_config.pop('name')
|
name = fs_config.pop('name')
|
||||||
temp = deepcopy(cephfs_config)
|
temp = deepcopy(cephfs_config)
|
||||||
teuthology.deep_merge(temp, fs_config)
|
teuthology.deep_merge(temp, fs_config)
|
||||||
|
subvols = config.get('subvols', None)
|
||||||
|
if subvols:
|
||||||
|
teuthology.deep_merge(temp, {'subvols': subvols})
|
||||||
fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
|
fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
|
||||||
if set_allow_multifs:
|
if set_allow_multifs:
|
||||||
fs.set_allow_multifs()
|
fs.set_allow_multifs()
|
||||||
|
@ -524,6 +524,7 @@ def build_ceph_cluster(ctx, config):
|
|||||||
run.wait(
|
run.wait(
|
||||||
ctx.cluster.run(
|
ctx.cluster.run(
|
||||||
args=[
|
args=[
|
||||||
|
'time',
|
||||||
'sudo',
|
'sudo',
|
||||||
'find',
|
'find',
|
||||||
'/var/log/ceph',
|
'/var/log/ceph',
|
||||||
@ -533,10 +534,15 @@ def build_ceph_cluster(ctx, config):
|
|||||||
run.Raw('|'),
|
run.Raw('|'),
|
||||||
'sudo',
|
'sudo',
|
||||||
'xargs',
|
'xargs',
|
||||||
|
'--max-args=1',
|
||||||
|
'--max-procs=0',
|
||||||
|
'--verbose',
|
||||||
'-0',
|
'-0',
|
||||||
'--no-run-if-empty',
|
'--no-run-if-empty',
|
||||||
'--',
|
'--',
|
||||||
'gzip',
|
'gzip',
|
||||||
|
'-5',
|
||||||
|
'--verbose',
|
||||||
'--',
|
'--',
|
||||||
],
|
],
|
||||||
wait=False,
|
wait=False,
|
||||||
|
@ -72,6 +72,20 @@ def task(ctx, config):
|
|||||||
mount_timeout: 120 # default is 30, give up if /sys/ is not populated
|
mount_timeout: 120 # default is 30, give up if /sys/ is not populated
|
||||||
- interactive:
|
- interactive:
|
||||||
|
|
||||||
|
Example that creates and mounts a subvol:
|
||||||
|
|
||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
subvols:
|
||||||
|
create: 2
|
||||||
|
subvol_options: "--namespace-isolated --size 25000000000"
|
||||||
|
ceph-fuse:
|
||||||
|
client.0:
|
||||||
|
mount_subvol_num: 0
|
||||||
|
kclient:
|
||||||
|
client.1:
|
||||||
|
mount_subvol_num: 1
|
||||||
|
|
||||||
:param ctx: Context
|
:param ctx: Context
|
||||||
:param config: Configuration
|
:param config: Configuration
|
||||||
"""
|
"""
|
||||||
|
@ -3148,11 +3148,14 @@ class CephManager:
|
|||||||
raise
|
raise
|
||||||
self.log("quorum is size %d" % size)
|
self.log("quorum is size %d" % size)
|
||||||
|
|
||||||
def get_mon_health(self, debug=False):
|
def get_mon_health(self, debug=False, detail=False):
|
||||||
"""
|
"""
|
||||||
Extract all the monitor health information.
|
Extract all the monitor health information.
|
||||||
"""
|
"""
|
||||||
out = self.raw_cluster_cmd('health', '--format=json')
|
if detail:
|
||||||
|
out = self.raw_cluster_cmd('health', 'detail', '--format=json')
|
||||||
|
else:
|
||||||
|
out = self.raw_cluster_cmd('health', '--format=json')
|
||||||
if debug:
|
if debug:
|
||||||
self.log('health:\n{h}'.format(h=out))
|
self.log('health:\n{h}'.format(h=out))
|
||||||
return json.loads(out)
|
return json.loads(out)
|
||||||
|
@ -92,7 +92,7 @@ class CephTestCase(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
def assert_cluster_log(self, expected_pattern, invert_match=False,
|
def assert_cluster_log(self, expected_pattern, invert_match=False,
|
||||||
timeout=10, watch_channel=None):
|
timeout=10, watch_channel=None, present=True):
|
||||||
"""
|
"""
|
||||||
Context manager. Assert that during execution, or up to 5 seconds later,
|
Context manager. Assert that during execution, or up to 5 seconds later,
|
||||||
the Ceph cluster log emits a message matching the expected pattern.
|
the Ceph cluster log emits a message matching the expected pattern.
|
||||||
@ -102,6 +102,8 @@ class CephTestCase(unittest.TestCase):
|
|||||||
:param watch_channel: Specifies the channel to be watched. This can be
|
:param watch_channel: Specifies the channel to be watched. This can be
|
||||||
'cluster', 'audit', ...
|
'cluster', 'audit', ...
|
||||||
:type watch_channel: str
|
:type watch_channel: str
|
||||||
|
:param present: Assert the log entry is present (default: True) or not (False).
|
||||||
|
:type present: bool
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ceph_manager = self.ceph_cluster.mon_manager
|
ceph_manager = self.ceph_cluster.mon_manager
|
||||||
@ -118,10 +120,13 @@ class CephTestCase(unittest.TestCase):
|
|||||||
self.watcher_process = ceph_manager.run_ceph_w(watch_channel)
|
self.watcher_process = ceph_manager.run_ceph_w(watch_channel)
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
fail = False
|
||||||
if not self.watcher_process.finished:
|
if not self.watcher_process.finished:
|
||||||
# Check if we got an early match, wait a bit if we didn't
|
# Check if we got an early match, wait a bit if we didn't
|
||||||
if self.match():
|
if present and self.match():
|
||||||
return
|
return
|
||||||
|
elif not present and self.match():
|
||||||
|
fail = True
|
||||||
else:
|
else:
|
||||||
log.debug("No log hits yet, waiting...")
|
log.debug("No log hits yet, waiting...")
|
||||||
# Default monc tick interval is 10s, so wait that long and
|
# Default monc tick interval is 10s, so wait that long and
|
||||||
@ -134,18 +139,23 @@ class CephTestCase(unittest.TestCase):
|
|||||||
except CommandFailedError:
|
except CommandFailedError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if not self.match():
|
if present and not self.match():
|
||||||
log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
|
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||||
raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
|
raise AssertionError(f"Expected log message found: '{expected_pattern}'")
|
||||||
|
elif fail or (not present and self.match()):
|
||||||
|
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||||
|
raise AssertionError(f"Unexpected log message found: '{expected_pattern}'")
|
||||||
|
|
||||||
return ContextManager()
|
return ContextManager()
|
||||||
|
|
||||||
def wait_for_health(self, pattern, timeout):
|
def wait_for_health(self, pattern, timeout, check_in_detail=None):
|
||||||
"""
|
"""
|
||||||
Wait until 'ceph health' contains messages matching the pattern
|
Wait until 'ceph health' contains messages matching the pattern
|
||||||
|
Also check if @check_in_detail matches detailed health messages
|
||||||
|
only when @pattern is a code string.
|
||||||
"""
|
"""
|
||||||
def seen_health_warning():
|
def seen_health_warning():
|
||||||
health = self.ceph_cluster.mon_manager.get_mon_health()
|
health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=bool(check_in_detail))
|
||||||
codes = [s for s in health['checks']]
|
codes = [s for s in health['checks']]
|
||||||
summary_strings = [s[1]['summary']['message'] for s in health['checks'].items()]
|
summary_strings = [s[1]['summary']['message'] for s in health['checks'].items()]
|
||||||
if len(summary_strings) == 0:
|
if len(summary_strings) == 0:
|
||||||
@ -156,7 +166,16 @@ class CephTestCase(unittest.TestCase):
|
|||||||
if pattern in ss:
|
if pattern in ss:
|
||||||
return True
|
return True
|
||||||
if pattern in codes:
|
if pattern in codes:
|
||||||
return True
|
if not check_in_detail:
|
||||||
|
return True
|
||||||
|
# check if the string is in detail list if asked
|
||||||
|
detail_strings = [ss['message'] for ss in \
|
||||||
|
[s for s in health['checks'][pattern]['detail']]]
|
||||||
|
log.debug(f'detail_strings: {detail_strings}')
|
||||||
|
for ds in detail_strings:
|
||||||
|
if check_in_detail in ds:
|
||||||
|
return True
|
||||||
|
log.debug(f'detail string "{check_in_detail}" not found')
|
||||||
|
|
||||||
log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
|
log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
|
||||||
return False
|
return False
|
||||||
|
@ -257,6 +257,7 @@ def ceph_log(ctx, config):
|
|||||||
run.wait(
|
run.wait(
|
||||||
ctx.cluster.run(
|
ctx.cluster.run(
|
||||||
args=[
|
args=[
|
||||||
|
'time',
|
||||||
'sudo',
|
'sudo',
|
||||||
'find',
|
'find',
|
||||||
'/var/log/ceph', # all logs, not just for the cluster
|
'/var/log/ceph', # all logs, not just for the cluster
|
||||||
@ -267,10 +268,15 @@ def ceph_log(ctx, config):
|
|||||||
run.Raw('|'),
|
run.Raw('|'),
|
||||||
'sudo',
|
'sudo',
|
||||||
'xargs',
|
'xargs',
|
||||||
|
'--max-args=1',
|
||||||
|
'--max-procs=0',
|
||||||
|
'--verbose',
|
||||||
'-0',
|
'-0',
|
||||||
'--no-run-if-empty',
|
'--no-run-if-empty',
|
||||||
'--',
|
'--',
|
||||||
'gzip',
|
'gzip',
|
||||||
|
'-5',
|
||||||
|
'--verbose',
|
||||||
'--',
|
'--',
|
||||||
],
|
],
|
||||||
wait=False,
|
wait=False,
|
||||||
@ -818,7 +824,6 @@ def ceph_mdss(ctx, config):
|
|||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def ceph_monitoring(daemon_type, ctx, config):
|
def ceph_monitoring(daemon_type, ctx, config):
|
||||||
"""
|
"""
|
||||||
|
@ -163,7 +163,7 @@ class CephFSTestCase(CephTestCase):
|
|||||||
# In case some test messed with auth caps, reset them
|
# In case some test messed with auth caps, reset them
|
||||||
for client_id in client_mount_ids:
|
for client_id in client_mount_ids:
|
||||||
cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r',
|
cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r',
|
||||||
'osd', f'allow rw pool={self.fs.get_data_pool_name()}',
|
'osd', f'allow rw tag cephfs data={self.fs.name}',
|
||||||
'mds', 'allow']
|
'mds', 'allow']
|
||||||
|
|
||||||
if self.run_cluster_cmd_result(cmd) == 0:
|
if self.run_cluster_cmd_result(cmd) == 0:
|
||||||
|
@ -369,6 +369,9 @@ class MDSCluster(CephCluster):
|
|||||||
"""
|
"""
|
||||||
self.mds_daemons[mds_id].signal(sig, silent);
|
self.mds_daemons[mds_id].signal(sig, silent);
|
||||||
|
|
||||||
|
def mds_is_running(self, mds_id):
|
||||||
|
return self.mds_daemons[mds_id].running()
|
||||||
|
|
||||||
def newfs(self, name='cephfs', create=True):
|
def newfs(self, name='cephfs', create=True):
|
||||||
return Filesystem(self._ctx, name=name, create=create)
|
return Filesystem(self._ctx, name=name, create=create)
|
||||||
|
|
||||||
@ -748,6 +751,7 @@ class Filesystem(MDSCluster):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
if self.fs_config is not None:
|
if self.fs_config is not None:
|
||||||
|
log.debug(f"fs_config: {self.fs_config}")
|
||||||
max_mds = self.fs_config.get('max_mds', 1)
|
max_mds = self.fs_config.get('max_mds', 1)
|
||||||
if max_mds > 1:
|
if max_mds > 1:
|
||||||
self.set_max_mds(max_mds)
|
self.set_max_mds(max_mds)
|
||||||
@ -760,6 +764,34 @@ class Filesystem(MDSCluster):
|
|||||||
if session_timeout != 60:
|
if session_timeout != 60:
|
||||||
self.set_session_timeout(session_timeout)
|
self.set_session_timeout(session_timeout)
|
||||||
|
|
||||||
|
if self.fs_config.get('subvols', None) is not None:
|
||||||
|
log.debug(f"Creating {self.fs_config.get('subvols')} subvols "
|
||||||
|
f"for filesystem '{self.name}'")
|
||||||
|
if not hasattr(self._ctx, "created_subvols"):
|
||||||
|
self._ctx.created_subvols = dict()
|
||||||
|
|
||||||
|
subvols = self.fs_config.get('subvols')
|
||||||
|
assert(isinstance(subvols, dict))
|
||||||
|
assert(isinstance(subvols['create'], int))
|
||||||
|
assert(subvols['create'] > 0)
|
||||||
|
|
||||||
|
for sv in range(0, subvols['create']):
|
||||||
|
sv_name = f'sv_{sv}'
|
||||||
|
self.mon_manager.raw_cluster_cmd(
|
||||||
|
'fs', 'subvolume', 'create', self.name, sv_name,
|
||||||
|
self.fs_config.get('subvol_options', ''))
|
||||||
|
|
||||||
|
if self.name not in self._ctx.created_subvols:
|
||||||
|
self._ctx.created_subvols[self.name] = []
|
||||||
|
|
||||||
|
subvol_path = self.mon_manager.raw_cluster_cmd(
|
||||||
|
'fs', 'subvolume', 'getpath', self.name, sv_name)
|
||||||
|
subvol_path = subvol_path.strip()
|
||||||
|
self._ctx.created_subvols[self.name].append(subvol_path)
|
||||||
|
else:
|
||||||
|
log.debug(f"Not Creating any subvols for filesystem '{self.name}'")
|
||||||
|
|
||||||
|
|
||||||
self.getinfo(refresh = True)
|
self.getinfo(refresh = True)
|
||||||
|
|
||||||
# wait pgs to be clean
|
# wait pgs to be clean
|
||||||
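The subvolume bootstrap above corresponds to the following CLI calls, shown
here with a placeholder filesystem name for reference:

   ceph fs subvolume create <fs-name> sv_0 --namespace-isolated --size 25000000000
   ceph fs subvolume getpath <fs-name> sv_0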
@ -1090,6 +1122,10 @@ class Filesystem(MDSCluster):
|
|||||||
def rank_fail(self, rank=0):
|
def rank_fail(self, rank=0):
|
||||||
self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
|
self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
|
||||||
|
|
||||||
|
def rank_is_running(self, rank=0, status=None):
|
||||||
|
name = self.get_rank(rank=rank, status=status)['name']
|
||||||
|
return self.mds_is_running(name)
|
||||||
|
|
||||||
def get_ranks(self, status=None):
|
def get_ranks(self, status=None):
|
||||||
if status is None:
|
if status is None:
|
||||||
status = self.getinfo()
|
status = self.getinfo()
|
||||||
@ -1537,7 +1573,7 @@ class Filesystem(MDSCluster):
|
|||||||
if quiet:
|
if quiet:
|
||||||
base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
|
base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
|
||||||
else:
|
else:
|
||||||
base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
|
base_args = [os.path.join(self._prefix, tool), '--debug-mds=20', '--debug-ms=1', '--debug-objecter=1']
|
||||||
|
|
||||||
if rank is not None:
|
if rank is not None:
|
||||||
base_args.extend(["--rank", "%s" % str(rank)])
|
base_args.extend(["--rank", "%s" % str(rank)])
|
||||||
|
Some files were not shown because too many files have changed in this diff.