mirror of https://git.proxmox.com/git/ceph.git (synced 2025-04-28 15:01:36 +00:00)

import ceph pacific 16.2.12 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>

parent 578f8e68e4
commit f7c0226f20
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
# remove cmake/modules/FindPython* once 3.12 is required
project(ceph
  VERSION 16.2.11
  VERSION 16.2.12
  LANGUAGES CXX C ASM)

foreach(policy
@@ -32,6 +32,11 @@
  in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
  the restored file system is expected to have the same ID as before.

* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to
  `client_max_retries_on_remount_failure` and move it from mds.yaml.in to
  mds-client.yaml.in, because this option was only ever used by the MDS client.

>=16.2.11
---------
@@ -135,7 +135,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 16.2.11
Version: 16.2.12
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-16.2.11.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-16.2.12.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-16.2.11
%autosetup -p1 -n ceph-16.2.12

%build
# Disable lto on systems that do not support symver attribute
@@ -1398,7 +1398,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys

# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
mkdir -p %{buildroot}/sbin
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
%endif
@@ -1577,7 +1577,7 @@ exit 0
%{_bindir}/rbd-replay-many
%{_bindir}/rbdmap
%{_sbindir}/mount.ceph
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
/sbin/mount.ceph
%endif
%if %{with lttng}
@@ -1398,7 +1398,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys

# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
mkdir -p %{buildroot}/sbin
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
%endif
@@ -1577,7 +1577,7 @@ exit 0
%{_bindir}/rbd-replay-many
%{_bindir}/rbdmap
%{_sbindir}/mount.ceph
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
/sbin/mount.ceph
%endif
%if %{with lttng}
@@ -1,7 +1,19 @@
ceph (16.2.11-1focal) focal; urgency=medium
ceph (16.2.12-1focal) focal; urgency=medium


 -- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com>  Tue, 24 Jan 2023 21:28:06 +0000
 -- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com>  Thu, 13 Apr 2023 22:05:57 +0000

ceph (16.2.12-1) stable; urgency=medium

  * New upstream release

 -- Ceph Release Team <ceph-maintainers@ceph.io>  Thu, 13 Apr 2023 21:54:05 +0000

ceph (16.2.12-1) stable; urgency=medium

  * New upstream release

 -- Ceph Release Team <ceph-maintainers@ceph.io>  Thu, 13 Apr 2023 14:09:23 +0000

ceph (16.2.11-1) stable; urgency=medium

@@ -2,7 +2,7 @@

``activate``
============

Once :ref:`ceph-volume-lvm-prepare` is completed, and all the various steps
that entails are done, the volume is ready to get "activated".

@@ -13,7 +13,7 @@ understand what OSD is enabled and needs to be mounted.
.. note:: The execution of this call is fully idempotent, and there are no
   side-effects when running multiple times

For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
instead.

New OSDs
@@ -29,7 +29,7 @@ need to be supplied. For example::
Activating all OSDs
-------------------

.. note:: For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
.. note:: For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
   instead.

It is possible to activate all existing OSDs at once by using the ``--all``
@@ -4,45 +4,41 @@ Encryption
==========

Logical volumes can be encrypted using ``dmcrypt`` by specifying the
``--dmcrypt`` flag when creating OSDs. Encryption can be done in different ways,
specially with LVM. ``ceph-volume`` is somewhat opinionated with the way it
sets up encryption with logical volumes so that the process is consistent and
``--dmcrypt`` flag when creating OSDs. When using LVM, logical volumes can be
encrypted in different ways. ``ceph-volume`` does not offer as many options as
LVM does, but it encrypts logical volumes in a way that is consistent and
robust.

In this case, ``ceph-volume lvm`` follows these constraints:
In this case, ``ceph-volume lvm`` follows this constraint:

* only LUKS (version 1) is used
* Logical Volumes are encrypted, while their underlying PVs (physical volumes)
  aren't
* Non-LVM devices like partitions are also encrypted with the same OSD key
* Non-LVM devices (such as partitions) are encrypted with the same OSD key.
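For reference, a minimal sketch of requesting encryption at preparation time;
the device path below is a placeholder, not part of this change::

    ceph-volume lvm prepare --bluestore --data /dev/sdb --dmcrypt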

LUKS
----
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier
to implement but not widely available in all distros Ceph supports. LUKS 1 is
not going to be deprecated in favor of LUKS 2, so in order to have as wide
support as possible, ``ceph-volume`` uses LUKS version 1.
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier to
implement but not widely available in all Linux distributions supported by
Ceph.

.. note:: Version 1 of LUKS is just referenced as "LUKS" whereas version 2 is
   referred to as LUKS2
.. note:: Version 1 of LUKS is referred to in this documentation as "LUKS".
   Version 2 of LUKS is referred to in this documentation as "LUKS2".


LUKS on LVM
-----------
Encryption is done on top of existing logical volumes (unlike encrypting the
physical device). Any single logical volume can be encrypted while other
volumes can remain unencrypted. This method also allows for flexible logical
Encryption is done on top of existing logical volumes (this is not the same as
encrypting the physical device). Any single logical volume can be encrypted,
leaving other volumes unencrypted. This method also allows for flexible logical
volume setups, since encryption will happen once the LV is created.


Workflow
--------
When setting up the OSD, a secret key will be created, that will be passed
along to the monitor in JSON format as ``stdin`` to prevent the key from being
When setting up the OSD, a secret key is created. That secret key is passed
to the monitor in JSON format as ``stdin`` to prevent the key from being
captured in the logs.

The JSON payload looks something like::
The JSON payload looks something like this::

    {
        "cephx_secret": CEPHX_SECRET,
@@ -51,36 +47,38 @@ The JSON payload looks something like::
    }

The naming convention for the keys is **strict**, and they are named like that
for the hardcoded (legacy) names ceph-disk used.
for the hardcoded (legacy) names used by ceph-disk.

* ``cephx_secret`` : The cephx key used to authenticate
* ``dmcrypt_key`` : The secret (or private) key to unlock encrypted devices
* ``cephx_lockbox_secret`` : The authentication key used to retrieve the
  ``dmcrypt_key``. It is named *lockbox* because ceph-disk used to have an
  unencrypted partition named after it, used to store public keys and other
  OSD metadata.
  unencrypted partition named after it, which was used to store public keys and
  other OSD metadata.

The naming convention is strict because Monitors supported the naming
convention by ceph-disk, which used these key names. In order to keep
compatibility and prevent ceph-disk from breaking, ceph-volume will use the same
naming convention *although they don't make sense for the new encryption
convention of ceph-disk, which used these key names. In order to maintain
compatibility and prevent ceph-disk from breaking, ceph-volume uses the same
naming convention *although it does not make sense for the new encryption
workflow*.

After the common steps of setting up the OSD during the prepare stage, either
with :term:`filestore` or :term:`bluestore`, the logical volume is left ready
to be activated, regardless of the state of the device (encrypted or decrypted).
After the common steps of setting up the OSD during the "prepare stage" (either
with :term:`filestore` or :term:`bluestore`), the logical volume is left ready
to be activated, regardless of the state of the device (encrypted or
decrypted).

At activation time, the logical volume will get decrypted and the OSD started
once the process completes correctly.
At the time of its activation, the logical volume is decrypted. The OSD starts
after the process completes correctly.

Summary of the encryption workflow for creating a new OSD:
Summary of the encryption workflow for creating a new OSD
----------------------------------------------------------

#. OSD is created. Both lockbox and dmcrypt keys are created, and sent along
   with JSON to the monitors, indicating an encrypted OSD.
#. OSD is created. Both lockbox and dmcrypt keys are created and sent to the
   monitors in JSON format, indicating an encrypted OSD.

#. All complementary devices (like journal, db, or wal) get created and
   encrypted with the same OSD key. Key is stored in the LVM metadata of the
   OSD
   OSD.

#. Activation continues by ensuring devices are mounted, retrieving the dmcrypt
   secret key from the monitors and decrypting before the OSD gets started.
   secret key from the monitors, and decrypting before the OSD gets started.
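As an illustrative sketch, activation of an encrypted OSD uses the same command
as for an unencrypted one; the OSD id and fsid below are placeholders::

    ceph-volume lvm activate 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8

The dmcrypt key is fetched from the monitors during this step, so no extra flag
is needed at activation time.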

@@ -11,20 +11,28 @@ Compatibility with Podman Versions
Podman and Ceph have different end-of-life strategies. This means that care
must be taken in finding a version of Podman that is compatible with Ceph.

These versions are expected to work:
This table shows which version pairs are expected to work or not work together:


+-----------+---------------------------------------+
|   Ceph    |                 Podman                |
+-----------+-------+-------+-------+-------+-------+
|           |  1.9  |  2.0  |  2.1  |  2.2  |  3.0  |
+===========+=======+=======+=======+=======+=======+
| <= 15.2.5 | True  | False | False | False | False |
+-----------+-------+-------+-------+-------+-------+
| >= 15.2.6 | True  | True  | True  | False | False |
+-----------+-------+-------+-------+-------+-------+
| >= 16.2.1 | False | True  | True  | False | True  |
+-----------+-------+-------+-------+-------+-------+
+-----------+-----------------------------------------------+
|   Ceph    |                     Podman                    |
+-----------+-------+-------+-------+-------+-------+-------+
|           |  1.9  |  2.0  |  2.1  |  2.2  |  3.0  | > 3.0 |
+===========+=======+=======+=======+=======+=======+=======+
| <= 15.2.5 | True  | False | False | False | False | False |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 15.2.6 | True  | True  | True  | False | False | False |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 16.2.1 | False | True  | True  | False | True  | True  |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 17.2.0 | False | True  | True  | False | True  | True  |
+-----------+-------+-------+-------+-------+-------+-------+

.. note::

   While not all podman versions have been actively tested against
   all Ceph versions, there are no known issues with using podman
   version 3.0 or greater with Ceph Quincy and later releases.

.. warning::

@@ -41,17 +49,17 @@ These versions are expected to work:
Stability
---------

Cephadm is under development. Some functionality is incomplete. Be aware
that some of the components of Ceph may not work perfectly with cephadm.
These include:

- RGW
Cephadm is relatively stable but new functionality is still being
added and bugs are occasionally discovered. If issues are found, please
open a tracker issue under the Orchestrator component (https://tracker.ceph.com/projects/orchestrator/issues)

Cephadm support remains under development for the following features:

- Ingress
- Cephadm exporter daemon
- cephfs-mirror
- ceph-exporter deployment
- stretch mode integration
- monitoring stack (moving towards Prometheus service discovery and providing TLS)
- RGW multisite deployment support (requires lots of manual steps currently)
- cephadm agent

If a cephadm command fails or a service stops running properly, see
:ref:`cephadm-pause` for instructions on how to pause the Ceph cluster's
@@ -245,9 +245,10 @@ Many hosts can be added at once using
      hostname: node-02
      addr: 192.168.0.12

This can be combined with service specifications (below) to create a cluster spec
file to deploy a whole cluster in one command. see ``cephadm bootstrap --apply-spec``
also to do this during bootstrap. Cluster SSH Keys must be copied to hosts prior to adding them.
This can be combined with :ref:`service specifications<orchestrator-cli-service-spec>`
to create a cluster spec file to deploy a whole cluster in one command; see
``cephadm bootstrap --apply-spec`` to do this during bootstrap, as sketched below. Cluster
SSH keys must be copied to hosts prior to adding them.
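For example, assuming the host entries above and the service specifications
have been combined into one file (the file name is only illustrative), the
whole cluster can be brought up in a single bootstrap call::

    cephadm bootstrap --mon-ip <mon-ip> --apply-spec cluster-spec.yaml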

Setting the initial CRUSH location of host
==========================================

@@ -48,8 +48,8 @@ curl-based installation
-----------------------

* Use ``curl`` to fetch the most recent version of the
  standalone script.

  standalone script.

  .. prompt:: bash #
     :substitutions:

@@ -148,7 +148,7 @@ Running the bootstrap command

Run the ``cephadm bootstrap`` command:

.. prompt:: bash #
.. prompt:: bash #

   cephadm bootstrap --mon-ip *<mon-ip>*

@@ -167,11 +167,11 @@ This command will:
  with this label will (also) get a copy of ``/etc/ceph/ceph.conf`` and
  ``/etc/ceph/ceph.client.admin.keyring``.

Further information about cephadm bootstrap
Further information about cephadm bootstrap
-------------------------------------------

The default bootstrap behavior will work for most users. But if you'd like
immediately to know more about ``cephadm bootstrap``, read the list below.
immediately to know more about ``cephadm bootstrap``, read the list below.

Also, you can run ``cephadm bootstrap -h`` to see all of ``cephadm``'s
available options.
@@ -210,20 +210,20 @@ available options.
     EOF
     $ ./cephadm bootstrap --config initial-ceph.conf ...

* The ``--ssh-user *<user>*`` option makes it possible to choose which SSH
* The ``--ssh-user *<user>*`` option makes it possible to choose which SSH
  user cephadm will use to connect to hosts. The associated SSH key will be
  added to ``/home/*<user>*/.ssh/authorized_keys``. The user that you
  added to ``/home/*<user>*/.ssh/authorized_keys``. The user that you
  designate with this option must have passwordless sudo access.

* If you are using a container on an authenticated registry that requires
  login, you may add the argument:

  * ``--registry-json <path to json file>``
  * ``--registry-json <path to json file>``

  example contents of JSON file with login info::

      {"url":"REGISTRY_URL", "username":"REGISTRY_USERNAME", "password":"REGISTRY_PASSWORD"}

  Cephadm will attempt to log in to this registry so it can pull your container
  and then store the login info in its config database. Other hosts added to
  the cluster will then also be able to make use of the authenticated registry.
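  As a hedged sketch, a JSON file with the contents shown above would be passed
  at bootstrap time like this (the path is a placeholder)::

      cephadm bootstrap --mon-ip <mon-ip> --registry-json /path/to/registry.json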

@@ -272,7 +272,7 @@ command. There are several ways to do this:
Confirm that the ``ceph`` command is accessible with:

.. prompt:: bash #

   ceph -v


@@ -292,7 +292,7 @@ By default, a ``ceph.conf`` file and a copy of the ``client.admin`` keyring
are maintained in ``/etc/ceph`` on all hosts with the ``_admin`` label, which is initially
applied only to the bootstrap host. We usually recommend that one or more other hosts be
given the ``_admin`` label so that the Ceph CLI (e.g., via ``cephadm shell``) is easily
accessible on multiple hosts. To add the ``_admin`` label to additional host(s),
accessible on multiple hosts. To add the ``_admin`` label to additional host(s):

.. prompt:: bash #

@@ -310,8 +310,8 @@ Please follow :ref:`deploy_additional_monitors` to deploy additional MONs.
Adding Storage
==============

To add storage to the cluster, either tell Ceph to consume any
available and unused device:
To add storage to the cluster, you can tell Ceph to consume any
available and unused device(s):

.. prompt:: bash #

@@ -406,7 +406,7 @@ have access to all hosts that you plan to add to the cluster.
   insecure registry.

#. Push your container image to your local registry. Here are some acceptable
   kinds of container images:
   kinds of container images:

   * Ceph container image. See :ref:`containers`.
   * Prometheus container image

@@ -496,11 +496,20 @@ candidate hosts.
If there are fewer hosts selected by the placement specification than
demanded by ``count``, cephadm will deploy only on the selected hosts.

.. _cephadm-extra-container-args:

Extra Container Arguments
=========================

.. warning::
    The arguments provided for extra container args are limited to whatever arguments are available for a `run` command from whichever container engine you are using. Providing any arguments the `run` command does not support (or invalid values for arguments) will cause the daemon to fail to start.
    The arguments provided for extra container args are limited to whatever arguments are available for
    a `run` command from whichever container engine you are using. Providing any arguments the `run`
    command does not support (or invalid values for arguments) will cause the daemon to fail to start.

.. note::

    For arguments passed to the process running inside the container rather than for
    the container runtime itself, see :ref:`cephadm-extra-entrypoint-args`


Cephadm supports providing extra miscellaneous container arguments for
@@ -544,6 +553,30 @@ For example:
    - "-v"
    - "/opt/ceph_cert/host.cert:/etc/grafana/certs/cert_file:ro"

.. _cephadm-extra-entrypoint-args:

Extra Entrypoint Arguments
==========================

.. note::

    For arguments intended for the container runtime rather than the process inside
    it, see :ref:`cephadm-extra-container-args`

Similar to extra container args for the container runtime, Cephadm supports
appending to args passed to the entrypoint process running
within a container. For example, to set the collector textfile directory for
the node-exporter service, one could apply a service spec like

.. code-block:: yaml

    service_type: node-exporter
    service_name: node-exporter
    placement:
      host_pattern: '*'
    extra_entrypoint_args:
      - "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2"

.. _orch-rm:

Removing a Service

@@ -164,8 +164,10 @@ for RGW with a minimum set of configuration options. The orchestrator will
deploy and manage a combination of haproxy and keepalived to provide load
balancing on a floating virtual IP.

If SSL is used, then SSL must be configured and terminated by the ingress service
and not RGW itself.
If the RGW service is configured with SSL enabled, then the ingress service
will use the `ssl` and `verify none` options in the backend configuration.
Trust verification is disabled because the backends are accessed by IP
address instead of FQDN.

.. image:: ../../images/HAProxy_for_RGW.svg

@@ -186,8 +188,7 @@ between all the RGW daemons available.
Prerequisites
-------------

* An existing RGW service, without SSL. (If you want SSL service, the certificate
  should be configured on the ingress service, not the RGW service.)
* An existing RGW service.

Deploying
---------
@@ -1,22 +1,19 @@
Troubleshooting
===============

You might need to investigate why a cephadm command failed
You may wish to investigate why a cephadm command failed
or why a certain service no longer runs properly.

Cephadm deploys daemons as containers. This means that
troubleshooting those containerized daemons might work
differently than you expect (and that is certainly true if
you expect this troubleshooting to work the way that
troubleshooting does when the daemons involved aren't
containerized).
Cephadm deploys daemons within containers. This means that
troubleshooting those containerized daemons will require
a different process than the one used for traditional package-installed daemons.

Here are some tools and commands to help you troubleshoot
your Ceph environment.

.. _cephadm-pause:

Pausing or disabling cephadm
Pausing or Disabling cephadm
----------------------------

If something goes wrong and cephadm is behaving badly, you can
@@ -45,16 +42,15 @@ See :ref:`cephadm-spec-unmanaged` for information on disabling
individual services.


Per-service and per-daemon events
Per-service and Per-daemon Events
---------------------------------

In order to help with the process of debugging failed daemon
deployments, cephadm stores events per service and per daemon.
In order to facilitate debugging failed daemons,
cephadm stores events per service and per daemon.
These events often contain information relevant to
troubleshooting
your Ceph cluster.
troubleshooting your Ceph cluster.

Listing service events
Listing Service Events
~~~~~~~~~~~~~~~~~~~~~~

To see the events associated with a certain service, run a
@@ -82,7 +78,7 @@ This will return something in the following form:
    - '2021-02-01T12:09:25.264584 service:alertmanager [ERROR] "Failed to apply: Cannot
      place <AlertManagerSpec for service_name=alertmanager> on unknown_host: Unknown hosts"'

Listing daemon events
Listing Daemon Events
~~~~~~~~~~~~~~~~~~~~~

To see the events associated with a certain daemon, run a
@@ -106,16 +102,16 @@ This will return something in the following form:
      mds.cephfs.hostname.ppdhsz on host 'hostname'"


Checking cephadm logs
Checking Cephadm Logs
---------------------

To learn how to monitor the cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.
To learn how to monitor cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.

If your Ceph cluster has been configured to log events to files, there will exist a
cephadm log file called ``ceph.cephadm.log`` on all monitor hosts (see
:ref:`cephadm-logs` for a more complete explanation of this).
If your Ceph cluster has been configured to log events to files, there will be a
``ceph.cephadm.log`` file on all monitor hosts (see
:ref:`cephadm-logs` for a more complete explanation).

Gathering log files
Gathering Log Files
-------------------

Use journalctl to gather the log files of all daemons:
@@ -140,7 +136,7 @@ To fetch all log files of all daemons on a given host, run::
        cephadm logs --fsid <fsid> --name "$name" > $name;
    done

Collecting systemd status
Collecting Systemd Status
-------------------------

To print the state of a systemd unit, run::
@@ -156,7 +152,7 @@ To fetch all state of all daemons of a given host, run::
    done


List all downloaded container images
List all Downloaded Container Images
------------------------------------

To list all container images that are downloaded on a host:
@@ -170,16 +166,16 @@ To list all container images that are downloaded on a host:
    "registry.opensuse.org/opensuse/leap:15.2"


Manually running containers
Manually Running Containers
---------------------------

Cephadm writes small wrappers that run a containers. Refer to
Cephadm uses small wrappers when running containers. Refer to
``/var/lib/ceph/<cluster-fsid>/<service-name>/unit.run`` for the
container execution command.

.. _cephadm-ssh-errors:

SSH errors
SSH Errors
----------

Error message::
@@ -191,7 +187,7 @@ Error message::
    Please make sure that the host is reachable and accepts connections using the cephadm SSH key
    ...

Things users can do:
Things Ceph administrators can do:

1. Ensure cephadm has an SSH identity key::

@@ -224,7 +220,7 @@ To verify that the public key is in the authorized_keys file, run the following
    [root@mon1 ~]# cephadm shell -- ceph cephadm get-pub-key > ~/ceph.pub
    [root@mon1 ~]# grep "`cat ~/ceph.pub`" /root/.ssh/authorized_keys

Failed to infer CIDR network error
Failed to Infer CIDR network error
----------------------------------

If you see this error::
@@ -241,7 +237,7 @@ This means that you must run a command of this form::

For more detail on operations of this kind, see :ref:`deploy_additional_monitors`

Accessing the admin socket
Accessing the Admin Socket
--------------------------

Each Ceph daemon provides an admin socket that bypasses the
@@ -252,12 +248,12 @@ To access the admin socket, first enter the daemon container on the host::
    [root@mon1 ~]# cephadm enter --name <daemon-name>
    [ceph: root@mon1 /]# ceph --admin-daemon /var/run/ceph/ceph-<daemon-name>.asok config show

Calling miscellaneous ceph tools
Running Various Ceph Tools
--------------------------------

To call miscellaneous like ``ceph-objectstore-tool`` or
``ceph-monstore-tool``, you can run them by calling
``cephadm shell --name <daemon-name>`` like so::
To run Ceph tools like ``ceph-objectstore-tool`` or
``ceph-monstore-tool``, invoke the cephadm CLI with
``cephadm shell --name <daemon-name>``. For example::

    root@myhostname # cephadm unit --name mon.myhostname stop
    root@myhostname # cephadm shell --name mon.myhostname
@@ -272,21 +268,21 @@ To call miscellaneous like ``ceph-objectstore-tool`` or
      election_strategy: 1
      0: [v2:127.0.0.1:3300/0,v1:127.0.0.1:6789/0] mon.myhostname

This command sets up the environment in a way that is suitable
for extended daemon maintenance and running the deamon interactively.
The cephadm shell sets up the environment in a way that is suitable
for extended daemon maintenance and running daemons interactively.

.. _cephadm-restore-quorum:

Restoring the MON quorum
------------------------
Restoring the Monitor Quorum
----------------------------

In case the Ceph MONs cannot form a quorum, cephadm is not able
to manage the cluster, until the quorum is restored.
If the Ceph monitor daemons (mons) cannot form a quorum, cephadm will not be
able to manage the cluster until quorum is restored.

In order to restore the MON quorum, remove unhealthy MONs
In order to restore the quorum, remove unhealthy monitors
from the monmap by following these steps:

1. Stop all MONs. For each MON host::
1. Stop all mons. For each mon host::

    ssh {mon-host}
    cephadm unit --name mon.`hostname` stop
@@ -301,18 +297,19 @@ from the monmap by following these steps:

.. _cephadm-manually-deploy-mgr:

Manually deploying a MGR daemon
-------------------------------
cephadm requires a MGR daemon in order to manage the cluster. In case the cluster
the last MGR of a cluster was removed, follow these steps in order to deploy
a MGR ``mgr.hostname.smfvfd`` on a random host of your cluster manually.
Manually Deploying a Manager Daemon
-----------------------------------
At least one manager (mgr) daemon is required by cephadm in order to manage the
cluster. If the last mgr in a cluster has been removed, follow these steps in
order to deploy a manager called (for example)
``mgr.hostname.smfvfd`` on a random host of your cluster manually.

Disable the cephadm scheduler, in order to prevent cephadm from removing the new
MGR. See :ref:`cephadm-enable-cli`::
manager. See :ref:`cephadm-enable-cli`::

    ceph config-key set mgr/cephadm/pause true

Then get or create the auth entry for the new MGR::
Then get or create the auth entry for the new manager::

    ceph auth get-or-create mgr.hostname.smfvfd mon "profile mgr" osd "allow *" mds "allow *"

@@ -338,26 +335,26 @@ Deploy the daemon::

    cephadm --image <container-image> deploy --fsid <fsid> --name mgr.hostname.smfvfd --config-json config-json.json

Analyzing core dumps
Analyzing Core Dumps
---------------------

In case a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run
When a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run

.. prompt:: bash #

   ulimit -c unlimited

core dumps will now be written to ``/var/lib/systemd/coredump``.
Core dumps will now be written to ``/var/lib/systemd/coredump``.

.. note::

   core dumps are not namespaced by the kernel, which means
   Core dumps are not namespaced by the kernel, which means
   they will be written to ``/var/lib/systemd/coredump`` on
   the container host.

Now, wait for the crash to happen again. (To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``)
Now, wait for the crash to happen again. To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``.

Install debug packages by entering the cephadm shell and install ``ceph-debuginfo``::
Install debug packages including ``ceph-debuginfo`` by entering the cephadm shell::

    # cephadm shell --mount /var/lib/systemd/coredump
    [ceph: root@host1 /]# dnf install ceph-debuginfo gdb zstd
(Binary image file changed, not shown. Size before: 27 KiB; after: 40 KiB.)
@@ -86,8 +86,17 @@ Interactive Commands
1. m : Filesystem selection
   Displays a menu of filesystems for selection.

2. q : Quit
   Exit the utility if you are at the home screen (All Filesystem Info),
2. s : Sort field selection
   Designates the sort field. 'cap_hit' is the default.

3. l : Client limit
   Sets the limit on the number of clients to be displayed.

4. r : Reset
   Resets the sort field and limit value to the default.

5. q : Quit
   Exit the utility if you are at the home screen (all filesystem info),
   otherwise escape back to the home screen.

The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End and mouse.
@@ -229,27 +229,21 @@ backed by the original data pool.

::

    ceph fs flag set enable_multiple true --yes-i-really-mean-it
    ceph osd pool create cephfs_recovery_meta
    ceph fs new cephfs_recovery recovery <data_pool> --allow-dangerous-metadata-overlay
    ceph fs new cephfs_recovery recovery <data_pool> --recover --allow-dangerous-metadata-overlay

.. note::

   The recovery file system starts with an MDS rank that will initialize the new
   metadata pool with some metadata. This is necessary to bootstrap recovery.
   However, now we will take the MDS down as we do not want it interacting with
   the metadata pool further.
   The ``--recover`` flag prevents any MDS from joining the new file system.

Next, we will create the initial metadata for the fs:

::

    ceph fs fail cephfs_recovery

Next, we will reset the initial metadata the MDS created:

::

    cephfs-table-tool cephfs_recovery:all reset session
    cephfs-table-tool cephfs_recovery:all reset snap
    cephfs-table-tool cephfs_recovery:all reset inode
    cephfs-table-tool cephfs_recovery:0 reset session
    cephfs-table-tool cephfs_recovery:0 reset snap
    cephfs-table-tool cephfs_recovery:0 reset inode
    cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force

Now perform the recovery of the metadata pool from the data pool:

@@ -272,7 +266,6 @@ with:
::

    cephfs-journal-tool --rank=<fs_name>:0 event recover_dentries list --alternate-pool cephfs_recovery_meta
    cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force

After recovery, some recovered directories will have incorrect statistics.
Ensure the parameters ``mds_verify_scatter`` and ``mds_debug_scatterstat`` are
set to false (the default) to prevent the MDS from checking the statistics:
@@ -283,20 +276,22 @@ set to false (the default) to prevent the MDS from checking the statistics:
    ceph config rm mds mds_verify_scatter
    ceph config rm mds mds_debug_scatterstat

(Note, the config may also have been set globally or via a ceph.conf file.)
.. note::

   Also verify the config has not been set globally or with a local ceph.conf file.

Now, allow an MDS to join the recovery file system:

::

    ceph fs set cephfs_recovery joinable true

Finally, run a forward :doc:`scrub </cephfs/scrub>` to repair the statistics.
Finally, run a forward :doc:`scrub </cephfs/scrub>` to repair recursive statistics.
Ensure you have an MDS running and issue:

::

    ceph fs status # get active MDS
    ceph tell mds.<id> scrub start / recursive repair
    ceph tell mds.recovery_fs:0 scrub start / recursive,repair,force

.. note::

@@ -3,11 +3,12 @@

FS volumes and subvolumes
=========================

A single source of truth for CephFS exports is implemented in the volumes
module of the :term:`Ceph Manager` daemon (ceph-mgr). The OpenStack shared
file system service (manila_), Ceph Container Storage Interface (CSI_),
The volumes
module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a single
source of truth for CephFS exports. The OpenStack shared
file system service (manila_) and Ceph Container Storage Interface (CSI_)
storage administrators among others can use the common CLI provided by the
ceph-mgr volumes module to manage the CephFS exports.
ceph-mgr volumes module to manage CephFS exports.

The ceph-mgr volumes module implements the following file system export
abstractions:
@@ -22,17 +23,17 @@ abstractions:

Some possible use-cases for the export abstractions:

* FS subvolumes used as manila shares or CSI volumes
* FS subvolumes used as Manila shares or CSI volumes

* FS subvolume groups used as manila share groups
* FS subvolume groups used as Manila share groups

Requirements
------------

* Nautilus (14.2.x) or a later version of Ceph
* Nautilus (14.2.x) or a later Ceph release

* Cephx client user (see :doc:`/rados/operations/user-management`) with
  the following minimum capabilities::
  at least the following capabilities::

    mon 'allow r'
    mgr 'allow rw'
@@ -46,41 +47,56 @@ Create a volume using::

    $ ceph fs volume create <vol_name> [<placement>]

This creates a CephFS file system and its data and metadata pools. It can also
try to create MDSes for the filesystem using the enabled ceph-mgr orchestrator
module (see :doc:`/mgr/orchestrator`), e.g. rook.
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator
module (see :doc:`/mgr/orchestrator`), for example Rook.

<vol_name> is the volume name (an arbitrary string), and

<placement> is an optional string signifying which hosts should have NFS Ganesha
daemon containers running on them and, optionally, the total number of NFS
Ganesha daemons the cluster (should you want to have more than one NFS Ganesha
daemon running per node). For example, the following placement string means
"deploy NFS Ganesha daemons on nodes host1 and host2 (one daemon per host):
<placement> is an optional string that designates the hosts that should have
an MDS running on them and, optionally, the total number of MDS daemons the cluster
should have. For example, the
following placement string means "deploy MDS on nodes ``host1`` and ``host2`` (one
MDS per host):

    "host1,host2"

and this placement specification says to deploy two NFS Ganesha daemons each
on nodes host1 and host2 (for a total of four NFS Ganesha daemons in the
cluster):
and this placement specification says to deploy two MDS daemons on each of
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the cluster):

    "4 host1,host2"

For more details on placement specification refer to the `orchestrator doc
<https://docs.ceph.com/docs/master/mgr/orchestrator/#placement-specification>`_
but keep in mind that specifying the placement via a YAML file is not supported.
For more details on placement specification refer to the :ref:`orchestrator-cli-service-spec`,
but keep in mind that specifying placement via a YAML file is not supported.
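Putting this together, a sketch of a create command that uses the placement
string from the example above (the volume name is a placeholder)::

    $ ceph fs volume create cephfs_vol "4 host1,host2"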

Remove a volume using::
To remove a volume, run the following command::

    $ ceph fs volume rm <vol_name> [--yes-i-really-mean-it]

This removes a file system and its data and metadata pools. It also tries to
remove MDSes using the enabled ceph-mgr orchestrator module.
remove MDS daemons using the enabled ceph-mgr orchestrator module.

List volumes using::

    $ ceph fs volume ls

Fetch the information of a CephFS volume using::
Rename a volume using::

    $ ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]

Renaming a volume can be an expensive operation that requires the following:

- Rename the orchestrator-managed MDS service to match the <new_vol_name>.
  This involves launching an MDS service with <new_vol_name> and bringing down
  the MDS service with <vol_name>.
- Rename the file system matching <vol_name> to <new_vol_name>
- Change the application tags on the data and metadata pools of the file system
  to <new_vol_name>
- Rename the metadata and data pools of the file system.

The CephX IDs authorized for <vol_name> need to be reauthorized for <new_vol_name>. Any
on-going operations of the clients using these IDs may be disrupted. Mirroring is
expected to be disabled on the volume.

To fetch the information of a CephFS volume, run::

    $ ceph fs volume info vol_name [--human_readable]

@@ -88,15 +104,15 @@ The ``--human_readable`` flag shows used and available pool capacities in KB/MB/

The output format is JSON and contains fields as follows:

* pools: Attributes of data and metadata pools
* avail: The amount of free space available in bytes
* used: The amount of storage consumed in bytes
* name: Name of the pool
* mon_addrs: List of monitor addresses
* used_size: Current used size of the CephFS volume in bytes
* pending_subvolume_deletions: Number of subvolumes pending deletion
* ``pools``: Attributes of data and metadata pools
* ``avail``: The amount of free space available in bytes
* ``used``: The amount of storage consumed in bytes
* ``name``: Name of the pool
* ``mon_addrs``: List of Ceph monitor addresses
* ``used_size``: Current used size of the CephFS volume in bytes
* ``pending_subvolume_deletions``: Number of subvolumes pending deletion

Sample output of volume info command::
Sample output of the ``volume info`` command::

    $ ceph fs volume info vol_name
    {
@@ -133,10 +149,10 @@ Create a subvolume group using::
The command succeeds even if the subvolume group already exists.

When creating a subvolume group you can specify its data pool layout (see
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals and
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and
size in bytes. The size of the subvolume group is specified by setting
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
is created with an octal file mode '755', uid '0', gid '0' and the data pool
is created with octal file mode '755', uid '0', gid '0' and the data pool
layout of its parent directory.
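A hedged sketch of a creation command that overrides these defaults; the flag
names follow the usual ceph-mgr volumes conventions and the values are
placeholders::

    $ ceph fs subvolumegroup create <vol_name> <group_name> --pool_layout <data_pool_name> --mode 755 --uid 0 --gid 0 --size 10737418240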


@@ -163,49 +179,49 @@ Fetch the metadata of a subvolume group using::

    $ ceph fs subvolumegroup info <vol_name> <group_name>

The output format is JSON and contains fields as follows.
The output format is JSON and contains fields as follows:

* atime: access time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* mtime: modification time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ctime: change time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* uid: uid of subvolume group path
* gid: gid of subvolume group path
* mode: mode of subvolume group path
* mon_addrs: list of monitor addresses
* bytes_pcent: quota used in percentage if quota is set, else displays "undefined"
* bytes_quota: quota size in bytes if quota is set, else displays "infinite"
* bytes_used: current used size of the subvolume group in bytes
* created_at: time of creation of subvolume group in the format "YYYY-MM-DD HH:MM:SS"
* data_pool: data pool the subvolume group belongs to
* ``atime``: access time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``mtime``: modification time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``ctime``: change time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``uid``: uid of the subvolume group path
* ``gid``: gid of the subvolume group path
* ``mode``: mode of the subvolume group path
* ``mon_addrs``: list of monitor addresses
* ``bytes_pcent``: quota used in percentage if quota is set, else displays "undefined"
* ``bytes_quota``: quota size in bytes if quota is set, else displays "infinite"
* ``bytes_used``: current used size of the subvolume group in bytes
* ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS"
* ``data_pool``: data pool to which the subvolume group belongs

Check the presence of any subvolume group using::

    $ ceph fs subvolumegroup exist <vol_name>

The strings returned by the 'exist' command:
The 'exist' command outputs:

* "subvolumegroup exists": if any subvolumegroup is present
* "no subvolumegroup exists": if no subvolumegroup is present

.. note:: It checks for the presence of custom groups and not the default one. To validate the emptiness of the volume, subvolumegroup existence check alone is not sufficient. The subvolume existence also needs to be checked as there might be subvolumes in the default group.
.. note:: This command checks for the presence of custom groups and not presence of the default one. To validate the emptiness of the volume, a subvolumegroup existence check alone is not sufficient. Subvolume existence also needs to be checked as there might be subvolumes in the default group.

Resize a subvolume group using::

    $ ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]

The command resizes the subvolume group quota using the size specified by 'new_size'.
The '--no_shrink' flag prevents the subvolume group to shrink below the current used
size of the subvolume group.
The command resizes the subvolume group quota using the size specified by ``new_size``.
The ``--no_shrink`` flag prevents the subvolume group from shrinking below the current used
size.

The subvolume group can be resized to an unlimited size by passing 'inf' or 'infinite'
as the new_size.
The subvolume group may be resized to an infinite size by passing ``inf`` or ``infinite``
as the ``new_size``.
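For example, to lift the quota on a subvolume group entirely::

    $ ceph fs subvolumegroup resize <vol_name> <group_name> infinite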
|
||||
|
||||
Remove a snapshot of a subvolume group using::
|
||||
|
||||
$ ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
|
||||
|
||||
Using the '--force' flag allows the command to succeed that would otherwise
|
||||
fail if the snapshot did not exist.
|
||||
Supplying the ``--force`` flag allows the command to succeed when it would otherwise
|
||||
fail due to the snapshot not existing.
|
||||
|
||||
List snapshots of a subvolume group using::
|
||||
|
||||
@ -254,10 +270,10 @@ Resize a subvolume using::
|
||||
|
||||
$ ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
|
||||
|
||||
The command resizes the subvolume quota using the size specified by 'new_size'.
|
||||
'--no_shrink' flag prevents the subvolume to shrink below the current used size of the subvolume.
|
||||
The command resizes the subvolume quota using the size specified by ``new_size``.
|
||||
The `--no_shrink`` flag prevents the subvolume from shrinking below the current used size of the subvolume.
|
||||
|
||||
The subvolume can be resized to an infinite size by passing 'inf' or 'infinite' as the new_size.
|
||||
The subvolume can be resized to an unlimited (but sparse) logical size by passing ``inf`` or ``infinite`` as `` new_size``.
|
||||
|
||||
Authorize cephx auth IDs, the read/read-write access to fs subvolumes::
|
||||
|
||||
@ -285,43 +301,43 @@ Fetch the information of a subvolume using::
|
||||
|
||||
$ ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
The output format is json and contains fields as follows.
|
||||
The output format is JSON and contains fields as follows.
|
||||
|
||||
* atime: access time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* mtime: modification time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ctime: change time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* uid: uid of subvolume path
|
||||
* gid: gid of subvolume path
|
||||
* mode: mode of subvolume path
|
||||
* mon_addrs: list of monitor addresses
|
||||
* bytes_pcent: quota used in percentage if quota is set, else displays "undefined"
|
||||
* bytes_quota: quota size in bytes if quota is set, else displays "infinite"
|
||||
* bytes_used: current used size of the subvolume in bytes
|
||||
* created_at: time of creation of subvolume in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* data_pool: data pool the subvolume belongs to
|
||||
* path: absolute path of a subvolume
|
||||
* type: subvolume type indicating whether it's clone or subvolume
|
||||
* pool_namespace: RADOS namespace of the subvolume
|
||||
* features: features supported by the subvolume
|
||||
* state: current state of the subvolume
|
||||
* ``atime``: access time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ``mtime``: modification time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ``ctime``: change time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ``uid``: uid of the subvolume path
|
||||
* ``gid``: gid of the subvolume path
|
||||
* ``mode``: mode of the subvolume path
|
||||
* ``mon_addrs``: list of monitor addresses
|
||||
* ``bytes_pcent``: quota used in percentage if quota is set, else displays ``undefined``
|
||||
* ``bytes_quota``: quota size in bytes if quota is set, else displays ``infinite``
|
||||
* ``bytes_used``: current used size of the subvolume in bytes
|
||||
* ``created_at``: creation time of the subvolume in the format "YYYY-MM-DD HH:MM:SS"
|
||||
* ``data_pool``: data pool to which the subvolume belongs
|
||||
* ``path``: absolute path of a subvolume
|
||||
* ``type``: subvolume type indicating whether it's clone or subvolume
|
||||
* ``pool_namespace``: RADOS namespace of the subvolume
|
||||
* ``features``: features supported by the subvolume
|
||||
* ``state``: current state of the subvolume
|
||||
|
||||
If a subvolume has been removed retaining its snapshots, the output only contains fields as follows.
|
||||
If a subvolume has been removed retaining its snapshots, the output contains only fields as follows.
|
||||
|
||||
* type: subvolume type indicating whether it's clone or subvolume
|
||||
* features: features supported by the subvolume
|
||||
* state: current state of the subvolume
|
||||
* ``type``: subvolume type indicating whether it's clone or subvolume
|
||||
* ``features``: features supported by the subvolume
|
||||
* ``state``: current state of the subvolume
|
||||
|
||||
The subvolume "features" are based on the internal version of the subvolume and is a list containing
|
||||
a subset of the following features,
|
||||
A subvolume's ``features`` are based on the internal version of the subvolume and are
|
||||
a subset of the following:
|
||||
|
||||
* "snapshot-clone": supports cloning using a subvolumes snapshot as the source
|
||||
* "snapshot-autoprotect": supports automatically protecting snapshots, that are active clone sources, from deletion
|
||||
* "snapshot-retention": supports removing subvolume contents, retaining any existing snapshots
|
||||
* ``snapshot-clone``: supports cloning using a subvolumes snapshot as the source
|
||||
* ``snapshot-autoprotect``: supports automatically protecting snapshots, that are active clone sources, from deletion
|
||||
* ``snapshot-retention``: supports removing subvolume contents, retaining any existing snapshots
|
||||
|
||||
The subvolume "state" is based on the current state of the subvolume and contains one of the following values.
|
||||
A subvolume's ``state`` is based on the current state of the subvolume and contains one of the following values.
|
||||
|
||||
* "complete": subvolume is ready for all operations
|
||||
* "snapshot-retained": subvolume is removed but its snapshots are retained
|
||||
* ``complete``: subvolume is ready for all operations
|
||||
* ``snapshot-retained``: subvolume is removed but its snapshots are retained
|
||||
|
||||
List subvolumes using::
|
||||
|
||||
@ -333,10 +349,10 @@ Check the presence of any subvolume using::
|
||||
|
||||
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
The strings returned by the 'exist' command:
|
||||
These are the possible results of the ``exist`` command:
|
||||
|
||||
* "subvolume exists": if any subvolume of given group_name is present
|
||||
* "no subvolume exists": if no subvolume of given group_name is present
|
||||
* ``subvolume exists``: if any subvolume of given group_name is present
|
||||
* ``no subvolume exists``: if no subvolume of given group_name is present
|
||||
|
||||
Set custom metadata on the subvolume as a key-value pair using::
|
||||
|
||||
@ -360,7 +376,7 @@ Remove custom metadata set on the subvolume using the metadata key::
|
||||
|
||||
$ ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the '--force' flag allows the command to succeed that would otherwise
|
||||
Using the ``--force`` flag allows the command to succeed that would otherwise
|
||||
fail if the metadata key did not exist.
|
||||
|
||||
Create a snapshot of a subvolume using::
|
||||
@ -372,7 +388,7 @@ Remove a snapshot of a subvolume using::
|
||||
|
||||
$ ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the '--force' flag allows the command to succeed that would otherwise
|
||||
Using the ``--force`` flag allows the command to succeed that would otherwise
|
||||
fail if the snapshot did not exist.
|
||||
|
||||
.. note:: if the last snapshot within a snapshot retained subvolume is removed, the subvolume is also removed
|
||||
@ -387,13 +403,13 @@ Fetch the information of a snapshot using::
|
||||
|
||||
The output format is JSON and contains fields as follows.
|
||||
|
||||
* created_at: time of creation of snapshot in the format "YYYY-MM-DD HH:MM:SS:ffffff"
|
||||
* data_pool: data pool the snapshot belongs to
|
||||
* has_pending_clones: "yes" if snapshot clone is in progress otherwise "no"
|
||||
* pending_clones: list of in progress or pending clones and their target group if exist otherwise this field is not shown
|
||||
* orphan_clones_count: count of orphan clones if snapshot has orphan clones otherwise this field is not shown
|
||||
* ``created_at``: creation time of the snapshot in the format "YYYY-MM-DD HH:MM:SS:ffffff"
|
||||
* ``data_pool``: data pool to which the snapshot belongs
|
||||
* ``has_pending_clones``: ``yes`` if snapshot clone is in progress, otherwise ``no``
|
||||
* ``pending_clones``: list of in-progress or pending clones and their target group if any exist, otherwise this field is not shown
|
||||
* ``orphan_clones_count``: count of orphan clones if the snapshot has orphan clones, otherwise this field is not shown
|
||||
|
||||
Sample output when snapshot clones are in progress or pending state::
|
||||
Sample output when snapshot clones are in progress or pending::
|
||||
|
||||
$ ceph fs subvolume snapshot info cephfs subvol snap
|
||||
{
|
||||
@ -415,7 +431,7 @@ Sample output when snapshot clones are in progress or pending state::
|
||||
]
|
||||
}
|
||||
|
||||
Sample output when no snapshot clone is in progress or pending state::
|
||||
Sample output when no snapshot clone is in progress or pending::
|
||||
|
||||
$ ceph fs subvolume snapshot info cephfs subvol snap
|
||||
{
|
||||
@ -424,15 +440,15 @@ Sample output when no snapshot clone is in progress or pending state::
|
||||
"has_pending_clones": "no"
|
||||
}
|
||||
|
||||
Set custom metadata on the snapshot as a key-value pair using::
|
||||
Set custom key-value metadata on the snapshot by running::
|
||||
|
||||
$ ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
|
||||
|
||||
.. note:: If the key_name already exists, then the old value will be replaced by the new value.
|
||||
|
||||
.. note:: The key_name and value should be a string of ASCII characters (as specified in python's string.printable). The key_name is case-insensitive and always stored in lower case.
|
||||
.. note:: The key_name and value should be strings of ASCII characters (as specified in Python's ``string.printable``). The key_name is case-insensitive and always stored in lowercase.
|
||||
|
||||
.. note:: Custom metadata on a snapshots is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot.
|
||||
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot.
|
||||
|
||||
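As a hedged illustration of the notes above (the volume, subvolume, snapshot, key, and value are placeholders), a mixed-case key is stored and retrieved in lowercase::

    $ ceph fs subvolume snapshot metadata set cephfs subvol snap MyKey somevalue
    $ ceph fs subvolume snapshot metadata get cephfs subvol snap mykey
    somevalue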
Get custom metadata set on the snapshot using the metadata key::
|
||||
|
||||
@ -446,35 +462,35 @@ Remove custom metadata set on the snapshot using the metadata key::
|
||||
|
||||
$ ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
|
||||
|
||||
Using the '--force' flag allows the command to succeed that would otherwise
|
||||
Using the ``--force`` flag allows the command to succeed when it would otherwise
|
||||
fail if the metadata key does not exist.
|
||||
|
||||
Cloning Snapshots
|
||||
-----------------
|
||||
|
||||
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation involving copying
|
||||
data from a snapshot to a subvolume. Due to this bulk copy nature, cloning is currently inefficient for very huge
|
||||
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation that copies
|
||||
data from a snapshot to a subvolume. Due to this bulk copying, cloning is inefficient for very large
|
||||
data sets.
|
||||
|
||||
.. note:: Removing a snapshot (source subvolume) fails if there are pending or in-progress clone operations.
|
||||
|
||||
Protecting snapshots prior to cloning was a pre-requisite in the Nautilus release, and the commands to protect/unprotect
|
||||
snapshots were introduced for this purpose. This pre-requisite, and hence the commands to protect/unprotect, is being
|
||||
deprecated in mainline CephFS, and may be removed from a future release.
|
||||
Protecting snapshots prior to cloning was a prerequisite in the Nautilus release, and the commands to protect/unprotect
|
||||
snapshots were introduced for this purpose. This prerequisite, and hence the commands to protect/unprotect, is being
|
||||
deprecated and may be removed from a future release.
|
||||
|
||||
The commands being deprecated are:
|
||||
The commands being deprecated are::
|
||||
$ ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
$ ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
|
||||
|
||||
.. note:: Using the above commands would not result in an error, but they serve no useful function.
|
||||
.. note:: Using the above commands will not result in an error, but they have no useful purpose.
|
||||
|
||||
.. note:: Use subvolume info command to fetch subvolume metadata regarding supported "features" to help decide if protect/unprotect of snapshots is required, based on the "snapshot-autoprotect" feature availability.
|
||||
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata regarding supported ``features`` to help decide if protect/unprotect of snapshots is required, based on the availability of the ``snapshot-autoprotect`` feature.
|
||||
|
||||
To initiate a clone operation use::
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
|
||||
|
||||
If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified as per::
|
||||
If a snapshot (source subvolume) is part of a non-default group, the group name must be specified::
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
|
||||
|
||||
@ -486,7 +502,7 @@ Similar to specifying a pool layout when creating a subvolume, pool layout can b
|
||||
|
||||
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
|
||||
|
||||
Configure maximum number of concurrent clones. The default is set to 4::
|
||||
Configure the maximum number of concurrent clones. The default is 4::
|
||||
|
||||
$ ceph config set mgr mgr/volumes/max_concurrent_clones <value>
|
||||
|
||||
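For example, to allow 8 concurrent clones (the value here is illustrative; higher values increase load on the cluster)::

    $ ceph config set mgr mgr/volumes/max_concurrent_clones 8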
@ -496,18 +512,18 @@ To check the status of a clone operation use::
|
||||
|
||||
A clone can be in one of the following states:
|
||||
|
||||
#. `pending` : Clone operation has not started
|
||||
#. `in-progress` : Clone operation is in progress
|
||||
#. `complete` : Clone operation has successfully finished
|
||||
#. `failed` : Clone operation has failed
|
||||
#. `canceled` : Clone operation is cancelled by user
|
||||
#. ``pending`` : Clone operation has not started
|
||||
#. ``in-progress`` : Clone operation is in progress
|
||||
#. ``complete`` : Clone operation has successfully finished
|
||||
#. ``failed`` : Clone operation has failed
|
||||
#. ``canceled`` : Clone operation is cancelled by user
|
||||
|
||||
The reason for a clone failure is shown below:
|
||||
|
||||
#. `errno` : error number
|
||||
#. `error_msg` : failure error string
|
||||
#. ``errno`` : error number
|
||||
#. ``error_msg`` : failure error string
|
||||
|
||||
Sample output of an `in-progress` clone operation::
|
||||
Here is an example of an ``in-progress`` clone::
|
||||
|
||||
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||
$ ceph fs clone status cephfs clone1
|
||||
@ -522,9 +538,9 @@ Sample output of an `in-progress` clone operation::
|
||||
}
|
||||
}
|
||||
|
||||
.. note:: The `failure` section will be shown only if the clone is in failed or cancelled state
|
||||
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
|
||||
|
||||
Sample output of a `failed` clone operation::
|
||||
Here is an example of a ``failed`` clone::
|
||||
|
||||
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||
$ ceph fs clone status cephfs clone1
|
||||
@ -544,11 +560,11 @@ Sample output of a `failed` clone operation::
|
||||
}
|
||||
}
|
||||
|
||||
(NOTE: since `subvol1` is in default group, `source` section in `clone status` does not include group name)
|
||||
(NOTE: since ``subvol1`` is in the default group, the ``source`` section of the ``clone status`` output does not include the group name)
|
||||
|
||||
.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed.
|
||||
|
||||
For a successful clone operation, `clone status` would look like so::
|
||||
After a successful clone operation, ``clone status`` looks like this::
|
||||
|
||||
$ ceph fs clone status cephfs clone1
|
||||
{
|
||||
@ -557,21 +573,24 @@ For a successful clone operation, `clone status` would look like so::
|
||||
}
|
||||
}
|
||||
|
||||
or `failed` state when clone is unsuccessful.
|
||||
If a clone operation is unsuccessful, the ``state`` value will be ``failed``.
|
||||
|
||||
On failure of a clone operation, the partial clone needs to be deleted and the clone operation needs to be retriggered.
|
||||
To delete a partial clone use::
|
||||
To retry a failed clone operation, the incomplete clone must be deleted and the
|
||||
clone operation must be issued again. To delete a partial clone use::
|
||||
|
||||
$ ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
|
||||
|
||||
.. note:: Cloning only synchronizes directories, regular files and symbolic links. Also, inode timestamps (access and
|
||||
modification times) are synchronized upto seconds granularity.
|
||||
.. note:: Cloning synchronizes only directories, regular files and symbolic
|
||||
links. Inode timestamps (access and modification times) are synchronized up
|
||||
to seconds granularity.
|
||||
|
||||
An `in-progress` or a `pending` clone operation can be canceled. To cancel a clone operation use the `clone cancel` command::
|
||||
An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel
|
||||
a clone operation use the ``clone cancel`` command::
|
||||
|
||||
$ ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
|
||||
|
||||
On successful cancelation, the cloned subvolume is moved to `canceled` state::
|
||||
On successful cancellation, the cloned subvolume is moved to the ``canceled``
|
||||
state::
|
||||
|
||||
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
|
||||
$ ceph fs clone cancel cephfs clone1
|
||||
@ -587,7 +606,7 @@ On successful cancelation, the cloned subvolume is moved to `canceled` state::
|
||||
}
|
||||
}
|
||||
|
||||
.. note:: The canceled cloned can be deleted by using --force option in `fs subvolume rm` command.
|
||||
.. note:: The canceled clone may be deleted by supplying the ``--force`` option to the ``fs subvolume rm`` command.
|
||||
|
||||
|
||||
.. _subvol-pinning:
|
||||
@ -596,8 +615,8 @@ Pinning Subvolumes and Subvolume Groups
|
||||
---------------------------------------
|
||||
|
||||
|
||||
Subvolumes and subvolume groups can be automatically pinned to ranks according
|
||||
to policies. This can help distribute load across MDS ranks in predictable and
|
||||
Subvolumes and subvolume groups may be automatically pinned to ranks according
|
||||
to policies. This can distribute load across MDS ranks in predictable and
|
||||
stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning`
|
||||
for details on how pinning works.
|
||||
|
||||
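As a sketch only (the volume name, group name, and the ``distributed`` pin setting below are placeholder assumptions; see the pinning references above for the exact command semantics), a subvolume group might be pinned like this::

    $ ceph fs subvolumegroup pin cephfs csi distributed 1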
|
@ -5,6 +5,60 @@ CephFS allows quotas to be set on any directory in the system. The
|
||||
quota can restrict the number of *bytes* or the number of *files*
|
||||
stored beneath that point in the directory hierarchy.
|
||||
|
||||
Like most other things in CephFS, quotas are configured using virtual
|
||||
extended attributes:
|
||||
|
||||
* ``ceph.quota.max_files`` -- file limit
|
||||
* ``ceph.quota.max_bytes`` -- byte limit
|
||||
|
||||
If the extended attributes appear on a directory that means a quota is
|
||||
configured there. If they are not present then no quota is set on that
|
||||
directory (although one may still be configured on a parent directory).
|
||||
|
||||
To set a quota, set the extended attribute on a CephFS directory with a
|
||||
value::
|
||||
|
||||
setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir # 100 MB
|
||||
setfattr -n ceph.quota.max_files -v 10000 /some/dir # 10,000 files
|
||||
|
||||
To view the quota limits::
|
||||
|
||||
$ getfattr -n ceph.quota.max_bytes /some/dir
|
||||
# file: some/dir
|
||||
ceph.quota.max_bytes="100000000"
|
||||
$
|
||||
$ getfattr -n ceph.quota.max_files /some/dir
|
||||
# file: some/dir
|
||||
ceph.quota.max_files="10000"
|
||||
|
||||
.. note:: Running ``getfattr /some/dir -d -m -`` for a CephFS directory will
|
||||
print none of the CephFS extended attributes. This is because the CephFS
|
||||
kernel and FUSE clients hide this information from the ``listxattr(2)``
|
||||
system call. Instead, a specific CephFS extended attribute can be viewed by
|
||||
running ``getfattr /some/dir -n ceph.<some-xattr>``.
|
||||
|
||||
To remove a quota, set the value of extended attribute to ``0``::
|
||||
|
||||
$ setfattr -n ceph.quota.max_bytes -v 0 /some/dir
|
||||
$ getfattr /some/dir -n ceph.quota.max_bytes
|
||||
some/dir: ceph.quota.max_bytes: No such attribute
|
||||
$
|
||||
$ setfattr -n ceph.quota.max_files -v 0 /some/dir
|
||||
$ getfattr /some/dir -n ceph.quota.max_files
|
||||
some/dir: ceph.quota.max_files: No such attribute
|
||||
|
||||
Space Usage Reporting and CephFS Quotas
|
||||
---------------------------------------
|
||||
When the root directory of the CephFS mount has a quota set on it, the available
|
||||
space on the CephFS reported by space usage tools (like ``df``) is
|
||||
based on the quota limit. That is, ``available space = quota limit - used space``
|
||||
instead of ``available space = total space - used space``.
|
||||
|
||||
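For example, if the quota on the mount's root directory is 100 GB and 40 GB are already used, ``df`` reports roughly 60 GB available rather than the free space of the whole cluster (the mount point and numbers below are purely illustrative)::

    $ df -h /mnt/cephfs
    Filesystem      Size  Used Avail Use% Mounted on
    ceph-fuse       100G   40G   60G  40% /mnt/cephfs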
This behaviour can be disabled by setting the following option in the client section
|
||||
of ``ceph.conf``::
|
||||
|
||||
client quota df = false
|
||||
|
||||
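If the cluster uses the centralized configuration database instead of per-host ``ceph.conf`` files, the same behaviour can likely be controlled there as well (the option name ``client_quota_df`` is an assumption based on the setting above)::

    $ ceph config set client client_quota_df false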
Limitations
|
||||
-----------
|
||||
|
||||
@ -85,3 +139,11 @@ To remove a quota::
|
||||
|
||||
setfattr -n ceph.quota.max_bytes -v 0 /some/dir
|
||||
setfattr -n ceph.quota.max_files -v 0 /some/dir
|
||||
|
||||
|
||||
.. note:: In cases where CephFS extended attributes are set on a CephFS
|
||||
directory (for example, ``/some/dir``), running ``getfattr /some/dir -d -m
|
||||
-`` will not print those CephFS extended attributes. This is because CephFS
|
||||
kernel and FUSE clients hide this information from the ``listxattr(2)``
|
||||
system call. You can access a specific CephFS extended attribute by running
|
||||
``getfattr /some/dir -n ceph.<some-xattr>`` instead.
|
||||
|
@ -131,3 +131,15 @@ Control (ongoing) File System Scrubs
|
||||
{
|
||||
"return_code": 0
|
||||
}
|
||||
|
||||
Damages
|
||||
=======
|
||||
|
||||
The types of damage that can be reported and repaired by File System Scrub are:
|
||||
|
||||
* DENTRY : Inode's dentry is missing.
|
||||
|
||||
* DIR_FRAG : Inode's directory fragment(s) is missing.
|
||||
|
||||
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
||||
|
||||
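As a hedged example (the file system name ``cephfs`` and rank ``0`` are placeholders), damage recorded by a scrub can usually be listed with the MDS ``damage ls`` admin command::

    $ ceph tell mds.cephfs:0 damage ls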
|
@ -554,7 +554,7 @@ In order to configure connections (from Ceph nodes) to the KDC:
|
||||
...
|
||||
|
||||
|
||||
6. A new *set parameter* was added in Ceph, ``gss ktab client file`` which
|
||||
6. A new *set parameter* was added in Ceph, ``gss_ktab_client_file``, which
|
||||
points to the keytab file related to the Ceph node *(or principal)* in
|
||||
question.
|
||||
|
||||
@ -614,10 +614,10 @@ In order to configure connections (from Ceph nodes) to the KDC:
|
||||
/etc/ceph/ceph.conf
|
||||
[global]
|
||||
...
|
||||
auth cluster required = gss
|
||||
auth service required = gss
|
||||
auth client required = gss
|
||||
gss ktab client file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
|
||||
auth_cluster_required = gss
|
||||
auth_service_required = gss
|
||||
auth_client_required = gss
|
||||
gss_ktab_client_file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
|
||||
...
|
||||
|
||||
|
||||
|
@ -32,7 +32,7 @@ cephadm/cephadm script into memory.)
|
||||
for mon or mgr.
|
||||
- You'll see health warnings from cephadm about stray daemons--that's because
|
||||
the vstart-launched daemons aren't controlled by cephadm.
|
||||
- The default image is ``quay.io/ceph-ci/ceph:master``, but you can change
|
||||
- The default image is ``quay.io/ceph-ci/ceph:main``, but you can change
|
||||
this by passing ``-o container_image=...`` or ``ceph config set global container_image ...``.
|
||||
|
||||
|
||||
|
@ -131,3 +131,8 @@ sharing a single pool (via namespaces), their snapshots *will* collide and
|
||||
deleting one will result in missing file data for others. (This may even be
|
||||
invisible, not throwing errors to the user.) If each FS gets its own
|
||||
pool things probably work, but this isn't tested and may not be true.
|
||||
|
||||
.. Note:: To avoid snap id collision between mon-managed snapshots and file system
|
||||
snapshots, pools with mon-managed snapshots are not allowed to be attached
|
||||
to a file system. Also, mon-managed snapshots can't be created in pools
|
||||
already attached to a file system either.
|
||||
|
@ -87,7 +87,7 @@ The procedure for making changes to the Ceph repository is as follows:
|
||||
|
||||
#. :ref:`Push the changes in your local working copy to your fork<push_changes>`.
|
||||
|
||||
#. Create a Pull Request to push the change upstream
|
||||
#. Create a Pull Request to push the change upstream.
|
||||
|
||||
#. Create a Pull Request that asks for your changes to be added into the
|
||||
"upstream Ceph" repository.
|
||||
@ -513,3 +513,57 @@ the **ptl-tool** have the following form::
|
||||
client: add timer_lock support
|
||||
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
|
||||
|
||||
Miscellaneous
|
||||
-------------
|
||||
|
||||
--set-upstream
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
If you forget to include the ``--set-upstream origin x`` option in your ``git
|
||||
push`` command, you will see the following error message:
|
||||
|
||||
::
|
||||
|
||||
fatal: The current branch {x} has no upstream branch.
|
||||
To push the current branch and set the remote as upstream, use
|
||||
git push --set-upstream origin {x}
|
||||
|
||||
To set up git to automatically create the upstream branch that corresponds to
|
||||
the branch in your local working copy, run this command from within the
|
||||
``ceph/`` directory:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git config --global push.autoSetupRemote true
|
||||
|
||||
Deleting a Branch Locally
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To delete the branch named ``localBranchName`` from the local working copy, run
|
||||
a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git branch -d localBranchName
|
||||
|
||||
Deleting a Branch Remotely
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To delete the branch named ``remoteBranchName`` from the remote upstream branch
|
||||
(which is also your fork of ``ceph/ceph``, as described in :ref:`forking`), run
|
||||
a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git push origin --delete remoteBranchName
|
||||
|
||||
Searching a File Longitudinally for a String
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To search for the commit that introduced a given string (in this example, that
|
||||
string is ``foo``) into a given file (in this example, that file is
|
||||
``file.rst``), run a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git log -S 'foo' file.rst
|
||||
|
@ -89,6 +89,11 @@ click on `New issue`_.
|
||||
.. _`jump to the Ceph project`: http://tracker.ceph.com/projects/ceph
|
||||
.. _`New issue`: http://tracker.ceph.com/projects/ceph/issues/new
|
||||
|
||||
Slack
|
||||
-----
|
||||
|
||||
Ceph's Slack is https://ceph-storage.slack.com/.
|
||||
|
||||
.. _mailing-list:
|
||||
|
||||
Mailing lists
|
||||
|
@ -129,8 +129,8 @@ all the integration tests, for all the Ceph components.
|
||||
verify that teuthology can run integration tests, with and without OpenStack
|
||||
|
||||
`upgrade <https://github.com/ceph/ceph/tree/master/qa/suites/upgrade>`_
|
||||
for various versions of Ceph, verify that upgrades can happen
|
||||
without disrupting an ongoing workload
|
||||
for various versions of Ceph, verify that upgrades can happen without
|
||||
disrupting an ongoing workload (`Upgrade Testing`_)
|
||||
|
||||
.. _`ceph-deploy man page`: ../../man/8/ceph-deploy
|
||||
|
||||
@ -452,6 +452,82 @@ A single test from the rbd/thrash suite can be run by adding the
|
||||
--suite rbd/thrash \
|
||||
--filter 'rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}'
|
||||
|
||||
.. _upgrade-testing:
|
||||
|
||||
Upgrade Testing
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
Using the upgrade suite, we are able to verify that upgrades from earlier releases can complete
|
||||
successfully without disrupting any ongoing workload.
|
||||
Each release branch's upgrade directory includes 2-x upgrade testing.
|
||||
This means that we can test the upgrade from the two preceding releases to the current one.
|
||||
The upgrade sequence is done in `parallel <https://github.com/ceph/teuthology/blob/main/teuthology/task/parallel.py>`_
|
||||
with other given workloads.
|
||||
|
||||
For instance, the upgrade test directory from the Quincy release branch is as follows:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
├── octopus-x
|
||||
└── pacific-x
|
||||
|
||||
It is possible to test upgrades from Octopus (2-x) or from Pacific (1-x) to Quincy (x).
|
||||
A simple upgrade test consists of the following stages:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
├── 0-start.yaml
|
||||
├── 1-tasks.yaml
|
||||
├── upgrade-sequence.yaml
|
||||
└── workload
|
||||
|
||||
After starting the cluster with the older release, we begin running the given ``workload``
|
||||
and the ``upgrade-sequence`` in parallel.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
- print: "**** done start parallel"
|
||||
- parallel:
|
||||
- workload
|
||||
- upgrade-sequence
|
||||
- print: "**** done end parallel"
|
||||
|
||||
While the ``workload`` directory consists of regular yaml files, just as in any other suite,
|
||||
the ``upgrade-sequence`` is responsible for running the upgrade and awaiting its completion:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
- print: "**** done start upgrade, wait"
|
||||
...
|
||||
mon.a:
|
||||
- ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
|
||||
- while ceph orch upgrade status | jq '.in_progress' | grep true ; do ceph orch ps ; ceph versions ; sleep 30 ; done\
|
||||
...
|
||||
- print: "**** done end upgrade, wait..."
|
||||
|
||||
It is also possible to upgrade in stages while running workloads in between the stages:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
├── %
|
||||
├── 0-cluster
|
||||
├── 1-ceph-install
|
||||
├── 2-partial-upgrade
|
||||
├── 3-thrash
|
||||
├── 4-workload
|
||||
├── 5-finish-upgrade.yaml
|
||||
├── 6-quincy.yaml
|
||||
└── 8-final-workload
|
||||
|
||||
After starting a cluster, we upgrade only two-thirds of the cluster
|
||||
(``2-partial-upgrade``). The next stage runs thrash tests and the given
|
||||
workload tests. Later on, we continue upgrading the rest of the cluster
|
||||
(``5-finish-upgrade.yaml``).
|
||||
|
||||
The last stage requires the updated release (``ceph require-osd-release
|
||||
quincy``, ``ceph osd set-require-min-compat-client quincy``) and runs the
|
||||
``final-workload``.
|
||||
|
||||
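As a rough sketch (the machine type, suite path, and target branch below are assumptions rather than a recommended invocation), an upgrade suite is scheduled like any other teuthology suite::

    $ teuthology-suite --machine-type smithi --suite upgrade/pacific-x --ceph quincy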
Filtering tests by their description
|
||||
------------------------------------
|
||||
|
||||
|
@ -4,6 +4,11 @@
|
||||
|
||||
.. glossary::
|
||||
|
||||
Application
|
||||
More properly called a :term:`client`, an application is any program
|
||||
external to Ceph that uses a Ceph Cluster to store and
|
||||
replicate data.
|
||||
|
||||
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
|
||||
OSD BlueStore is a storage back end used by OSD daemons, and
|
||||
was designed specifically for use with Ceph. BlueStore was
|
||||
@ -14,6 +19,22 @@
|
||||
system interface. Since Luminous (12.2), BlueStore has been
|
||||
Ceph's default and recommended storage back end.
|
||||
|
||||
Bucket
|
||||
In the context of :term:`RGW`, a bucket is a group of objects.
|
||||
In a filesystem-based analogy in which objects are the
|
||||
counterpart of files, buckets are the counterpart of
|
||||
directories. :ref:`Multisite sync
|
||||
policies<radosgw-multisite-sync-policy>` can be set on buckets,
|
||||
to provide fine-grained control of data movement from one zone
|
||||
to another zone.
|
||||
|
||||
The concept of the bucket has been taken from AWS S3. See also
|
||||
`the AWS S3 page on creating buckets <https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-buckets-s3.html>`_
|
||||
and `the AWS S3 'Buckets Overview' page <https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html>`_.
|
||||
|
||||
OpenStack Swift uses the term "containers" for what RGW and AWS call "buckets".
|
||||
See `the OpenStack Storage API overview page <https://docs.openstack.org/swift/latest/api/object_api_v1_overview.html>`_.
|
||||
|
||||
Ceph
|
||||
Ceph is a distributed network storage and file system with
|
||||
distributed metadata management and POSIX semantics.
|
||||
@ -166,9 +187,14 @@
|
||||
applications, Ceph Users, and :term:`Ceph Client`\s. Ceph
|
||||
Storage Clusters receive data from :term:`Ceph Client`\s.
|
||||
|
||||
cephx
|
||||
The Ceph authentication protocol. Cephx operates like Kerberos,
|
||||
but it has no single point of failure.
|
||||
CephX
|
||||
The Ceph authentication protocol. CephX operates like Kerberos,
|
||||
but it has no single point of failure. See the :ref:`CephX
|
||||
Configuration Reference<rados-cephx-config-ref>`.
|
||||
|
||||
Client
|
||||
A client is any program external to Ceph that uses a Ceph
|
||||
Cluster to store and replicate data.
|
||||
|
||||
Cloud Platforms
|
||||
Cloud Stacks
|
||||
@ -271,6 +297,12 @@
|
||||
This is the unique identifier of an OSD. This term is used
|
||||
interchangeably with ``fsid``
|
||||
|
||||
Period
|
||||
In the context of :term:`RGW`, a period is the configuration
|
||||
state of the :term:`Realm`. The period stores the configuration
|
||||
state of a multi-site configuration. When the period is updated,
|
||||
the "epoch" is said thereby to have been changed.
|
||||
|
||||
:ref:`Pool<rados_pools>`
|
||||
A pool is a logical partition used to store objects.
|
||||
|
||||
@ -301,6 +333,10 @@
|
||||
The block storage component of Ceph. Also called "RADOS Block
|
||||
Device" or :term:`Ceph Block Device`.
|
||||
|
||||
:ref:`Realm<rgw-realms>`
|
||||
In the context of RADOS Gateway (RGW), a realm is a globally
|
||||
unique namespace that consists of one or more zonegroups.
|
||||
|
||||
Releases
|
||||
|
||||
Ceph Interim Release
|
||||
@ -335,6 +371,28 @@
|
||||
Amazon S3 RESTful API and the OpenStack Swift API. Also called
|
||||
"RADOS Gateway" and "Ceph Object Gateway".
|
||||
|
||||
scrubs
|
||||
|
||||
The processes by which Ceph ensures data integrity. During the
|
||||
process of scrubbing, Ceph generates a catalog of all objects
|
||||
in a placement group, then ensures that none of the objects are
|
||||
missing or mismatched by comparing each primary object against
|
||||
its replicas, which are stored across other OSDs. Any PG that
|
||||
is determined to have a copy of an object that is different
|
||||
from the other copies, or that is missing an object entirely, is
|
||||
"inconsistent" (that is, the PG is marked "inconsistent").
|
||||
|
||||
There are two kinds of scrubbing: light scrubbing and deep
|
||||
scrubbing (also called "normal scrubbing" and "deep scrubbing",
|
||||
respectively). Light scrubbing is performed daily and does
|
||||
nothing more than confirm that a given object exists and that
|
||||
its metadata is correct. Deep scrubbing is performed weekly and
|
||||
reads the data and uses checksums to ensure data integrity.
|
||||
|
||||
See :ref:`Scrubbing <rados_config_scrubbing>` in the RADOS OSD
|
||||
Configuration Reference Guide and page 141 of *Mastering Ceph,
|
||||
second edition* (Fisk, Nick. 2019).
|
||||
|
||||
secrets
|
||||
Secrets are credentials used to perform digital authentication
|
||||
whenever privileged users must access systems that require
|
||||
@ -352,5 +410,17 @@
|
||||
Teuthology
|
||||
The collection of software that performs scripted tests on Ceph.
|
||||
|
||||
User
|
||||
An individual or a system actor (for example, an application)
|
||||
that uses Ceph clients to interact with the :term:`Ceph Storage
|
||||
Cluster`. See :ref:`User<rados-ops-user>` and :ref:`User
|
||||
Management<user-management>`.
|
||||
|
||||
Zone
|
||||
In the context of :term:`RGW`, a zone is a logical group that
|
||||
consists of one or more :term:`RGW` instances. A zone's
|
||||
configuration state is stored in the :term:`period`. See
|
||||
:ref:`Zones<radosgw-zones>`.
|
||||
|
||||
.. _https://github.com/ceph: https://github.com/ceph
|
||||
.. _Cluster Map: ../architecture#cluster-map
|
||||
|
File diff suppressed because it is too large
Load Diff
Before Width: | Height: | Size: 568 KiB After Width: | Height: | Size: 730 KiB |
@ -2,8 +2,7 @@
|
||||
Welcome to Ceph
|
||||
=================
|
||||
|
||||
Ceph uniquely delivers **object, block, and file storage in one unified
|
||||
system**.
|
||||
Ceph delivers **object, block, and file storage in one unified system**.
|
||||
|
||||
.. warning::
|
||||
|
||||
|
@ -4,33 +4,32 @@
|
||||
Installing Ceph
|
||||
===============
|
||||
|
||||
There are several different ways to install Ceph. Choose the
|
||||
method that best suits your needs.
|
||||
There are multiple ways to install Ceph.
|
||||
|
||||
Recommended methods
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
:ref:`Cephadm <cephadm>` installs and manages a Ceph cluster using containers and
|
||||
systemd, with tight integration with the CLI and dashboard GUI.
|
||||
:ref:`Cephadm <cephadm_deploying_new_cluster>` installs and manages a Ceph
|
||||
cluster that uses containers and systemd and is tightly integrated with the CLI
|
||||
and dashboard GUI.
|
||||
|
||||
* cephadm only supports Octopus and newer releases.
|
||||
* cephadm is fully integrated with the new orchestration API and
|
||||
fully supports the new CLI and dashboard features to manage
|
||||
cluster deployment.
|
||||
* cephadm requires container support (podman or docker) and
|
||||
* cephadm supports only Octopus and newer releases.
|
||||
* cephadm is fully integrated with the orchestration API and fully supports the
|
||||
CLI and dashboard features that are used to manage cluster deployment.
|
||||
* cephadm requires container support (in the form of Podman or Docker) and
|
||||
Python 3.
|
||||
|
||||
`Rook <https://rook.io/>`_ deploys and manages Ceph clusters running
|
||||
in Kubernetes, while also enabling management of storage resources and
|
||||
provisioning via Kubernetes APIs. We recommend Rook as the way to run Ceph in
|
||||
provisioning via Kubernetes APIs. We recommend Rook as the way to run Ceph in
|
||||
Kubernetes or to connect an existing Ceph storage cluster to Kubernetes.
|
||||
|
||||
* Rook only supports Nautilus and newer releases of Ceph.
|
||||
* Rook supports only Nautilus and newer releases of Ceph.
|
||||
* Rook is the preferred method for running Ceph on Kubernetes, or for
|
||||
connecting a Kubernetes cluster to an existing (external) Ceph
|
||||
cluster.
|
||||
* Rook supports the new orchestrator API. New management features
|
||||
in the CLI and dashboard are fully supported.
|
||||
* Rook supports the orchestrator API. Management features in the CLI and
|
||||
dashboard are fully supported.
|
||||
|
||||
Other methods
|
||||
~~~~~~~~~~~~~
|
||||
@ -39,16 +38,20 @@ Other methods
|
||||
Ceph clusters using Ansible.
|
||||
|
||||
* ceph-ansible is widely deployed.
|
||||
* ceph-ansible is not integrated with the new orchestrator APIs,
|
||||
introduced in Nautlius and Octopus, which means that newer
|
||||
management features and dashboard integration are not available.
|
||||
* ceph-ansible is not integrated with the orchestrator APIs that were
|
||||
introduced in Nautilus and Octopus, which means that the management features
|
||||
and dashboard integration introduced in Nautilus and Octopus are not
|
||||
available in Ceph clusters deployed by means of ceph-ansible.
|
||||
|
||||
|
||||
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a tool for quickly deploying clusters.
|
||||
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a
|
||||
tool that can be used to quickly deploy clusters. It is deprecated.
|
||||
|
||||
.. IMPORTANT::
|
||||
|
||||
ceph-deploy is no longer actively maintained. It is not tested on versions of Ceph newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating systems.
|
||||
ceph-deploy is not actively maintained. It is not tested on versions of Ceph
|
||||
newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating
|
||||
systems.
|
||||
|
||||
`DeepSea <https://github.com/SUSE/DeepSea>`_ installs Ceph using Salt.
|
||||
|
||||
@ -67,7 +70,7 @@ Ceph can also be :ref:`installed manually <install-manual>`.
|
||||
Windows
|
||||
~~~~~~~
|
||||
|
||||
For Windows installations, please consult this document:
|
||||
For Windows installations, consult this document:
|
||||
`Windows installation guide`_.
|
||||
|
||||
.. _Windows installation guide: ./windows-install
|
||||
|
@ -312,6 +312,7 @@ function. This will result in a circular locking exception.
|
||||
.. automethod:: MgrModule.get_perf_schema
|
||||
.. automethod:: MgrModule.get_counter
|
||||
.. automethod:: MgrModule.get_mgr_id
|
||||
.. automethod:: MgrModule.get_daemon_health_metrics
|
||||
|
||||
Exposing health checks
|
||||
----------------------
|
||||
|
@ -239,7 +239,7 @@ Create CephFS Export
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>]
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
|
||||
|
||||
This creates export RADOS objects containing the export block, where
|
||||
|
||||
@ -266,6 +266,18 @@ for permissible values.
|
||||
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
|
||||
permissible values.
|
||||
|
||||
``<sectype>`` specifies which authentication methods will be used when
|
||||
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
|
||||
and "none". More than one value can be supplied. The flag may be specified
|
||||
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
|
||||
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
|
||||
server will negotiate a supported security type with the client, preferring
|
||||
the supplied methods left-to-right.
|
||||
|
||||
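For instance (the cluster ID, pseudo-path, and file system name are placeholders), a CephFS export limited to Kerberos flavors could be created like this::

    $ ceph nfs export create cephfs --cluster-id mynfs --pseudo-path /cephfs --fsname myfs --sectype krb5p,krb5i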
.. note:: Specifying values for sectype that require Kerberos will only function on servers
|
||||
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
|
||||
is outside the scope of this document.
|
||||
|
||||
.. note:: Export creation is supported only for NFS Ganesha clusters deployed using the nfs interface.
|
||||
|
||||
Create RGW Export
|
||||
@ -285,7 +297,7 @@ To export a *bucket*:
|
||||
|
||||
.. code::
|
||||
|
||||
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>]
|
||||
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
|
||||
|
||||
For example, to export *mybucket* via NFS cluster *mynfs* at the pseudo-path */bucketdata* to any host in the ``192.168.10.0/24`` network
|
||||
|
||||
@ -316,6 +328,18 @@ for permissible values.
|
||||
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
|
||||
permissible values.
|
||||
|
||||
``<sectype>`` specifies which authentication methods will be used when
|
||||
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
|
||||
and "none". More than one value can be supplied. The flag may be specified
|
||||
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
|
||||
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
|
||||
server will negotiate a supported security type with the client, preferring
|
||||
the supplied methods left-to-right.
|
||||
|
||||
.. note:: Specifying values for sectype that require Kerberos will only function on servers
|
||||
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
|
||||
is outside the scope of this document.
|
||||
|
||||
RGW user export
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
|
@ -426,6 +426,22 @@ the asynchronous writes as well as an asynchronous update to the size of the
|
||||
striped file.
|
||||
|
||||
|
||||
Debugging
|
||||
^^^^^^^^^
|
||||
|
||||
Debugging libcephsqlite can be turned on via::
|
||||
|
||||
debug_cephsqlite
|
||||
|
||||
If running the ``sqlite3`` command-line tool, use:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
env CEPH_ARGS='--log_to_file true --log-file sqlite3.log --debug_cephsqlite 20 --debug_ms 1' sqlite3 ...
|
||||
|
||||
This will save all the usual Ceph debugging to a file ``sqlite3.log`` for inspection.
|
||||
|
||||
|
||||
.. _SQLite: https://sqlite.org/index.html
|
||||
.. _SQLite VFS: https://www.sqlite.org/vfs.html
|
||||
.. _SQLite Backup: https://www.sqlite.org/backup.html
|
||||
|
@ -1,107 +1,110 @@
|
||||
.. _rados-cephx-config-ref:
|
||||
|
||||
========================
|
||||
Cephx Config Reference
|
||||
CephX Config Reference
|
||||
========================
|
||||
|
||||
The ``cephx`` protocol is enabled by default. Cryptographic authentication has
|
||||
some computational costs, though they should generally be quite low. If the
|
||||
network environment connecting your client and server hosts is very safe and
|
||||
you cannot afford authentication, you can turn it off. **This is not generally
|
||||
recommended**.
|
||||
The CephX protocol is enabled by default. The cryptographic authentication that
|
||||
CephX provides has some computational costs, though they should generally be
|
||||
quite low. If the network environment connecting your client and server hosts
|
||||
is very safe and you cannot afford authentication, you can disable it.
|
||||
**Disabling authentication is not generally recommended**.
|
||||
|
||||
.. note:: If you disable authentication, you are at risk of a man-in-the-middle
|
||||
attack altering your client/server messages, which could lead to disastrous
|
||||
security effects.
|
||||
.. note:: If you disable authentication, you will be at risk of a
|
||||
man-in-the-middle attack that alters your client/server messages, which
|
||||
could have disastrous security effects.
|
||||
|
||||
For creating users, see `User Management`_. For details on the architecture
|
||||
of Cephx, see `Architecture - High Availability Authentication`_.
|
||||
For information about creating users, see `User Management`_. For details on
|
||||
the architecture of CephX, see `Architecture - High Availability
|
||||
Authentication`_.
|
||||
|
||||
|
||||
Deployment Scenarios
|
||||
====================
|
||||
|
||||
There are two main scenarios for deploying a Ceph cluster, which impact
|
||||
how you initially configure Cephx. Most first time Ceph users use
|
||||
``cephadm`` to create a cluster (easiest). For clusters using
|
||||
other deployment tools (e.g., Chef, Juju, Puppet, etc.), you will need
|
||||
to use the manual procedures or configure your deployment tool to
|
||||
How you initially configure CephX depends on your scenario. There are two
|
||||
common strategies for deploying a Ceph cluster. If you are a first-time Ceph
|
||||
user, you should probably take the easiest approach: using ``cephadm`` to
|
||||
deploy a cluster. But if your cluster uses other deployment tools (for example,
|
||||
Ansible, Chef, Juju, or Puppet), you will need either to use the manual
|
||||
deployment procedures or to configure your deployment tool so that it will
|
||||
bootstrap your monitor(s).
|
||||
|
||||
Manual Deployment
|
||||
-----------------
|
||||
|
||||
When you deploy a cluster manually, you have to bootstrap the monitor manually
|
||||
and create the ``client.admin`` user and keyring. To bootstrap monitors, follow
|
||||
the steps in `Monitor Bootstrapping`_. The steps for monitor bootstrapping are
|
||||
the logical steps you must perform when using third party deployment tools like
|
||||
Chef, Puppet, Juju, etc.
|
||||
When you deploy a cluster manually, it is necessary to bootstrap the monitors
|
||||
manually and to create the ``client.admin`` user and keyring. To bootstrap
|
||||
monitors, follow the steps in `Monitor Bootstrapping`_. Follow these steps when
|
||||
using third-party deployment tools (for example, Chef, Puppet, and Juju).
|
||||
|
||||
|
||||
Enabling/Disabling Cephx
|
||||
Enabling/Disabling CephX
|
||||
========================
|
||||
|
||||
Enabling Cephx requires that you have deployed keys for your monitors,
|
||||
OSDs and metadata servers. If you are simply toggling Cephx on / off,
|
||||
you do not have to repeat the bootstrapping procedures.
|
||||
Enabling CephX is possible only if the keys for your monitors, OSDs, and
|
||||
metadata servers have already been deployed. If you are simply toggling CephX
|
||||
on or off, it is not necessary to repeat the bootstrapping procedures.
|
||||
|
||||
|
||||
Enabling Cephx
|
||||
Enabling CephX
|
||||
--------------
|
||||
|
||||
When ``cephx`` is enabled, Ceph will look for the keyring in the default search
|
||||
path, which includes ``/etc/ceph/$cluster.$name.keyring``. You can override
|
||||
this location by adding a ``keyring`` option in the ``[global]`` section of
|
||||
your `Ceph configuration`_ file, but this is not recommended.
|
||||
When CephX is enabled, Ceph will look for the keyring in the default search
|
||||
path: this path includes ``/etc/ceph/$cluster.$name.keyring``. It is possible
|
||||
to override this search-path location by adding a ``keyring`` option in the
|
||||
``[global]`` section of your `Ceph configuration`_ file, but this is not
|
||||
recommended.
|
||||
|
||||
Execute the following procedures to enable ``cephx`` on a cluster with
|
||||
authentication disabled. If you (or your deployment utility) have already
|
||||
To enable CephX on a cluster for which authentication has been disabled, carry
|
||||
out the following procedure. If you (or your deployment utility) have already
|
||||
generated the keys, you may skip the steps related to generating keys.
|
||||
|
||||
#. Create a ``client.admin`` key, and save a copy of the key for your client
|
||||
host
|
||||
host:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth get-or-create client.admin mon 'allow *' mds 'allow *' mgr 'allow *' osd 'allow *' -o /etc/ceph/ceph.client.admin.keyring
|
||||
|
||||
**Warning:** This will clobber any existing
|
||||
**Warning:** This step will clobber any existing
|
||||
``/etc/ceph/client.admin.keyring`` file. Do not perform this step if a
|
||||
deployment tool has already done it for you. Be careful!
|
||||
deployment tool has already generated a keyring file for you. Be careful!
|
||||
|
||||
#. Create a keyring for your monitor cluster and generate a monitor
|
||||
secret key.
|
||||
#. Create a monitor keyring and generate a monitor secret key:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
|
||||
|
||||
#. Copy the monitor keyring into a ``ceph.mon.keyring`` file in every monitor's
|
||||
``mon data`` directory. For example, to copy it to ``mon.a`` in cluster ``ceph``,
|
||||
use the following
|
||||
#. For each monitor, copy the monitor keyring into a ``ceph.mon.keyring`` file
|
||||
in the monitor's ``mon data`` directory. For example, to copy the monitor
|
||||
keyring to ``mon.a`` in a cluster called ``ceph``, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cp /tmp/ceph.mon.keyring /var/lib/ceph/mon/ceph-a/keyring
|
||||
|
||||
#. Generate a secret key for every MGR, where ``{$id}`` is the MGR letter
|
||||
#. Generate a secret key for every MGR, where ``{$id}`` is the MGR letter:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth get-or-create mgr.{$id} mon 'allow profile mgr' mds 'allow *' osd 'allow *' -o /var/lib/ceph/mgr/ceph-{$id}/keyring
|
||||
|
||||
#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number
|
||||
#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth get-or-create osd.{$id} mon 'allow rwx' osd 'allow *' -o /var/lib/ceph/osd/ceph-{$id}/keyring
|
||||
|
||||
#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter
|
||||
#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth get-or-create mds.{$id} mon 'allow rwx' osd 'allow *' mds 'allow *' mgr 'allow profile mds' -o /var/lib/ceph/mds/ceph-{$id}/keyring
|
||||
|
||||
#. Enable ``cephx`` authentication by setting the following options in the
|
||||
``[global]`` section of your `Ceph configuration`_ file
|
||||
#. Enable CephX authentication by setting the following options in the
|
||||
``[global]`` section of your `Ceph configuration`_ file:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
@ -109,23 +112,23 @@ generated the keys, you may skip the steps related to generating keys.
|
||||
auth_service_required = cephx
|
||||
auth_client_required = cephx
|
||||
|
||||
|
||||
#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
|
||||
#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
|
||||
|
||||
For details on bootstrapping a monitor manually, see `Manual Deployment`_.
|
||||
|
||||
|
||||
|
||||
Disabling Cephx
|
||||
Disabling CephX
|
||||
---------------
|
||||
|
||||
The following procedure describes how to disable Cephx. If your cluster
|
||||
environment is relatively safe, you can offset the computation expense of
|
||||
running authentication. **We do not recommend it.** However, it may be easier
|
||||
during setup and/or troubleshooting to temporarily disable authentication.
|
||||
The following procedure describes how to disable CephX. If your cluster
|
||||
environment is safe, you might want to disable CephX in order to offset the
|
||||
computational expense of running authentication. **We do not recommend doing
|
||||
so.** However, setup and troubleshooting might be easier if authentication is
|
||||
temporarily disabled and subsequently re-enabled.
|
||||
|
||||
#. Disable ``cephx`` authentication by setting the following options in the
|
||||
``[global]`` section of your `Ceph configuration`_ file
|
||||
#. Disable CephX authentication by setting the following options in the
|
||||
``[global]`` section of your `Ceph configuration`_ file:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
@ -133,8 +136,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
|
||||
auth_service_required = none
|
||||
auth_client_required = none
|
||||
|
||||
|
||||
#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
|
||||
#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
|
||||
|
||||
|
||||
Configuration Settings
|
||||
@ -146,8 +148,9 @@ Enablement
|
||||
|
||||
``auth_cluster_required``
|
||||
|
||||
:Description: If enabled, the Ceph Storage Cluster daemons (i.e., ``ceph-mon``,
|
||||
``ceph-osd``, ``ceph-mds`` and ``ceph-mgr``) must authenticate with
|
||||
:Description: If this configuration setting is enabled, the Ceph Storage
|
||||
Cluster daemons (that is, ``ceph-mon``, ``ceph-osd``,
|
||||
``ceph-mds``, and ``ceph-mgr``) are required to authenticate with
|
||||
each other. Valid settings are ``cephx`` or ``none``.
|
||||
|
||||
:Type: String
|
||||
@ -157,9 +160,9 @@ Enablement
|
||||
|
||||
``auth_service_required``
|
||||
|
||||
:Description: If enabled, the Ceph Storage Cluster daemons require Ceph Clients
|
||||
to authenticate with the Ceph Storage Cluster in order to access
|
||||
Ceph services. Valid settings are ``cephx`` or ``none``.
|
||||
:Description: If this configuration setting is enabled, then Ceph clients can
|
||||
access Ceph services only if those clients authenticate with the
|
||||
Ceph Storage Cluster. Valid settings are ``cephx`` or ``none``.
|
||||
|
||||
:Type: String
|
||||
:Required: No
|
||||
@ -168,9 +171,11 @@ Enablement
|
||||
|
||||
``auth_client_required``
|
||||
|
||||
:Description: If enabled, the Ceph Client requires the Ceph Storage Cluster to
|
||||
authenticate with the Ceph Client. Valid settings are ``cephx``
|
||||
or ``none``.
|
||||
:Description: If this configuration setting is enabled, then communication
|
||||
between the Ceph client and Ceph Storage Cluster can be
|
||||
established only if the Ceph Storage Cluster authenticates
|
||||
against the Ceph client. Valid settings are ``cephx`` or
|
||||
``none``.
|
||||
|
||||
:Type: String
|
||||
:Required: No
|
||||
@ -182,30 +187,108 @@ Enablement
|
||||
Keys
|
||||
----
|
||||
|
||||
When you run Ceph with authentication enabled, ``ceph`` administrative commands
|
||||
and Ceph Clients require authentication keys to access the Ceph Storage Cluster.
|
||||
When Ceph is run with authentication enabled, ``ceph`` administrative commands
|
||||
and Ceph clients can access the Ceph Storage Cluster only if they use
|
||||
authentication keys.
|
||||
|
||||
The most common way to provide these keys to the ``ceph`` administrative
|
||||
commands and clients is to include a Ceph keyring under the ``/etc/ceph``
|
||||
directory. For Octopus and later releases using ``cephadm``, the filename
|
||||
is usually ``ceph.client.admin.keyring`` (or ``$cluster.client.admin.keyring``).
|
||||
If you include the keyring under the ``/etc/ceph`` directory, you don't need to
|
||||
specify a ``keyring`` entry in your Ceph configuration file.
|
||||
The most common way to make these keys available to ``ceph`` administrative
|
||||
commands and Ceph clients is to include a Ceph keyring under the ``/etc/ceph``
|
||||
directory. For Octopus and later releases that use ``cephadm``, the filename is
|
||||
usually ``ceph.client.admin.keyring``. If the keyring is included in the
|
||||
``/etc/ceph`` directory, then it is unnecessary to specify a ``keyring`` entry
|
||||
in the Ceph configuration file.
|
||||
|
||||
We recommend copying the Ceph Storage Cluster's keyring file to nodes where you
|
||||
will run administrative commands, because it contains the ``client.admin`` key.
|
||||
Because the Ceph Storage Cluster's keyring file contains the ``client.admin``
|
||||
key, we recommend copying the keyring file to nodes from which you run
|
||||
administrative commands.
|
||||
|
||||
To perform this step manually, execute the following:
|
||||
To perform this step manually, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
|
||||
|
||||
.. tip:: Ensure the ``ceph.keyring`` file has appropriate permissions set
|
||||
(e.g., ``chmod 644``) on your client machine.
|
||||
.. tip:: Make sure that the ``ceph.keyring`` file has appropriate permissions
|
||||
(for example, ``chmod 644``) set on your client machine.
|
||||
|
||||
You may specify the key itself in the Ceph configuration file using the ``key``
|
||||
setting (not recommended), or a path to a keyfile using the ``keyfile`` setting.
|
||||
You can specify the key itself by using the ``key`` setting in the Ceph
|
||||
configuration file (this approach is not recommended), or instead specify a
|
||||
path to a keyfile by using the ``keyfile`` setting in the Ceph configuration
|
||||
file.
|
||||
|
||||
``keyring``
|
||||
|
||||
:Description: The path to the keyring file.
|
||||
:Type: String
|
||||
:Required: No
|
||||
:Default: ``/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin``
|
||||
|
||||
|
||||
``keyfile``
|
||||
|
||||
:Description: The path to a keyfile (that is, a file containing only the key).
|
||||
:Type: String
|
||||
:Required: No
|
||||
:Default: None
|
||||
|
||||
|
||||
``key``
|
||||
|
||||
:Description: The key (that is, the text string of the key itself). We do not
|
||||
recommend that you use this setting unless you know what you're
|
||||
doing.
|
||||
:Type: String
|
||||
:Required: No
|
||||
:Default: None
|
||||
|
||||
|
||||
Daemon Keyrings
|
||||
---------------
|
||||
|
||||
Administrative users or deployment tools (for example, ``cephadm``) generate
|
||||
daemon keyrings in the same way that they generate user keyrings. By default,
|
||||
Ceph stores the keyring of a daemon inside that daemon's data directory. The
|
||||
default keyring locations and the capabilities that are necessary for the
|
||||
daemon to function are shown below.
|
||||
|
||||
``ceph-mon``
|
||||
|
||||
:Location: ``$mon_data/keyring``
|
||||
:Capabilities: ``mon 'allow *'``
|
||||
|
||||
``ceph-osd``
|
||||
|
||||
:Location: ``$osd_data/keyring``
|
||||
:Capabilities: ``mgr 'allow profile osd' mon 'allow profile osd' osd 'allow *'``
|
||||
|
||||
``ceph-mds``
|
||||
|
||||
:Location: ``$mds_data/keyring``
|
||||
:Capabilities: ``mds 'allow' mgr 'allow profile mds' mon 'allow profile mds' osd 'allow rwx'``
|
||||
|
||||
``ceph-mgr``
|
||||
|
||||
:Location: ``$mgr_data/keyring``
|
||||
:Capabilities: ``mon 'allow profile mgr' mds 'allow *' osd 'allow *'``
|
||||
|
||||
``radosgw``
|
||||
|
||||
:Location: ``$rgw_data/keyring``
|
||||
:Capabilities: ``mon 'allow rwx' osd 'allow rwx'``
|
||||
|
||||
|
||||
.. note:: The monitor keyring (that is, ``mon.``) contains a key but no
|
||||
capabilities, and this keyring is not part of the cluster ``auth`` database.
|
||||
|
||||
The daemon's data-directory locations default to directories of the form::
|
||||
|
||||
/var/lib/ceph/$type/$cluster-$id
|
||||
|
||||
For example, ``osd.12`` would have the following data directory::
|
||||
|
||||
/var/lib/ceph/osd/ceph-12
|
||||
|
||||
It is possible to override these locations, but it is not recommended.
|
||||
|
||||
|
||||
``keyring``
|
||||
@ -286,16 +369,66 @@ You can override these locations, but it is not recommended.
|
||||
Signatures
|
||||
----------
|
||||
|
||||
Ceph performs a signature check that provides some limited protection
|
||||
against messages being tampered with in flight (e.g., by a "man in the
|
||||
middle" attack).
|
||||
Ceph performs a signature check that provides some limited protection against
|
||||
messages being tampered with in flight (for example, by a "man in the middle"
|
||||
attack).
|
||||
|
||||
Like other parts of Ceph authentication, Ceph provides fine-grained control so
|
||||
you can enable/disable signatures for service messages between clients and
|
||||
Ceph, and so you can enable/disable signatures for messages between Ceph daemons.
|
||||
As with other parts of Ceph authentication, signatures admit of fine-grained
|
||||
control. You can enable or disable signatures for service messages between
|
||||
clients and Ceph, and for messages between Ceph daemons.
|
||||
|
||||
Note that even with signatures enabled data is not encrypted in
|
||||
flight.
|
||||
Note that even when signatures are enabled data is not encrypted in flight.
|
||||
|
||||
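A minimal sketch of turning signature checks on in the ``[global]`` section of ``ceph.conf`` (whether to require signatures depends on your clients; see the compatibility note below)::

    [global]
            cephx_require_signatures = true
            cephx_cluster_require_signatures = true
            cephx_service_require_signatures = true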
``cephx_require_signatures``
|
||||
|
||||
:Description: If this configuration setting is set to ``true``, Ceph requires
|
||||
signatures on all message traffic between the Ceph client and the
|
||||
Ceph Storage Cluster, and between daemons within the Ceph Storage
|
||||
Cluster.
|
||||
|
||||
.. note::
|
||||
**Note for antiquated clients:**
|
||||
|
||||
Neither Ceph Argonaut nor Linux kernel versions prior to 3.19
|
||||
support signatures; if one of these clients is in use, ``cephx_require_signatures``
|
||||
can be disabled in order to allow the client to connect.
|
||||
|
||||
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
``cephx_cluster_require_signatures``
|
||||
|
||||
:Description: If this configuration setting is set to ``true``, Ceph requires
|
||||
signatures on all message traffic between Ceph daemons within the
|
||||
Ceph Storage Cluster.
|
||||
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
``cephx_service_require_signatures``
|
||||
|
||||
:Description: If this configuration setting is set to ``true``, Ceph requires
|
||||
signatures on all message traffic between Ceph clients and the
|
||||
Ceph Storage Cluster.
|
||||
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
``cephx_sign_messages``
|
||||
|
||||
:Description: If this configuration setting is set to ``true``, and if the Ceph
|
||||
version supports message signing, then Ceph will sign all
|
||||
messages so that they are more difficult to spoof.
|
||||
|
||||
:Type: Boolean
|
||||
:Default: ``true``
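
The following is a minimal sketch of what requiring signatures cluster-wide
might look like in ``ceph.conf``; before enabling these options, confirm that
no clients that lack signature support (see the note above) still need to
connect:

.. code-block:: ini

   [global]
           cephx_require_signatures = true
           cephx_cluster_require_signatures = true
           cephx_service_require_signatures = true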
|
||||
|
||||
``cephx_require_signatures``
|
||||
|
||||
@ -346,9 +479,9 @@ Time to Live
|
||||
|
||||
``auth_service_ticket_ttl``
|
||||
|
||||
:Description: When the Ceph Storage Cluster sends a Ceph Client a ticket for
|
||||
authentication, the Ceph Storage Cluster assigns the ticket a
|
||||
time to live.
|
||||
:Description: When the Ceph Storage Cluster sends a ticket for authentication
|
||||
to a Ceph client, the Ceph Storage Cluster assigns that ticket a
|
||||
Time To Live (TTL).
|
||||
|
||||
:Type: Double
|
||||
:Default: ``60*60``
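
For example, the TTL could be raised from the default of one hour to two
hours with a command of this form (the value ``7200`` seconds is only
illustrative):

.. prompt:: bash $

   ceph config set global auth_service_ticket_ttl 7200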
|
||||
|
@ -549,33 +549,35 @@ testing purposes, and are not recommended for use by operators.
|
||||
Runtime Changes
|
||||
===============
|
||||
|
||||
In most cases, Ceph allows you to make changes to the configuration of
|
||||
a daemon at runtime. This capability is quite useful for
|
||||
increasing/decreasing logging output, enabling/disabling debug
|
||||
settings, and even for runtime optimization.
|
||||
In most cases, Ceph permits changes to the configuration of a daemon at
|
||||
runtime. This can be used for increasing or decreasing the amount of logging
|
||||
output, for enabling or disabling debug settings, and for runtime optimization.
|
||||
|
||||
Generally speaking, configuration options can be updated in the usual
|
||||
way via the ``ceph config set`` command. For example, do enable the debug log level on a specific OSD:
|
||||
Configuration options can be updated via the ``ceph config set`` command. For
|
||||
example, to enable the debug log level on a specific OSD, run a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set osd.123 debug_ms 20
|
||||
|
||||
Note that if the same option is also customized in a local
|
||||
configuration file, the monitor setting will be ignored (it has a
|
||||
lower priority than the local config file).
|
||||
.. note:: If an option has been customized in a local configuration file, the
|
||||
`central config
|
||||
<https://ceph.io/en/news/blog/2018/new-mimic-centralized-configuration-management/>`_
|
||||
setting will be ignored (it has a lower priority than the local
|
||||
configuration file).
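
To see which value a running daemon is actually using, and from which source
that value came, dump the daemon's effective configuration (the OSD ID and
option shown are illustrative):

.. prompt:: bash $

   ceph config show osd.123 | grep debug_ms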
|
||||
|
||||
Override values
|
||||
---------------
|
||||
|
||||
You can also temporarily set an option using the `tell` or `daemon`
|
||||
interfaces on the Ceph CLI. These *override* values are ephemeral in
|
||||
that they only affect the running process and are discarded/lost if
|
||||
the daemon or process restarts.
|
||||
Options can be set temporarily by using the `tell` or `daemon` interfaces on
|
||||
the Ceph CLI. These *override* values are ephemeral, which means that they
|
||||
affect only the current instance of the daemon and revert to persistently
|
||||
configured values when the daemon restarts.
|
||||
|
||||
Override values can be set in two ways:
|
||||
|
||||
#. From any host, we can send a message to a daemon over the network with:
|
||||
#. From any host, send a message to a daemon with a command of the following
|
||||
form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -587,16 +589,16 @@ Override values can be set in two ways:
|
||||
|
||||
ceph tell osd.123 config set debug_osd 20
|
||||
|
||||
The `tell` command can also accept a wildcard for the daemon
|
||||
identifier. For example, to adjust the debug level on all OSD
|
||||
daemons:
|
||||
The ``tell`` command can also accept a wildcard as the daemon identifier.
|
||||
For example, to adjust the debug level on all OSD daemons, run a command of
|
||||
this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.* config set debug_osd 20
|
||||
|
||||
#. From the host the process is running on, we can connect directly to
|
||||
the process via a socket in ``/var/run/ceph`` with:
|
||||
#. On the host where the daemon is running, connect to the daemon via a socket
|
||||
in ``/var/run/ceph`` by running a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -608,8 +610,8 @@ Override values can be set in two ways:
|
||||
|
||||
ceph daemon osd.4 config set debug_osd 20
|
||||
|
||||
Note that in the ``ceph config show`` command output these temporary
|
||||
values will be shown with a source of ``override``.
|
||||
.. note:: In the output of the ``ceph config show`` command, these temporary
|
||||
values are shown with a source of ``override``.
|
||||
|
||||
|
||||
Viewing runtime settings
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
.. _ceph-conf-common-settings:
|
||||
|
||||
Common Settings
|
||||
@ -7,30 +6,33 @@ Common Settings
|
||||
The `Hardware Recommendations`_ section provides some hardware guidelines for
|
||||
configuring a Ceph Storage Cluster. It is possible for a single :term:`Ceph
|
||||
Node` to run multiple daemons. For example, a single node with multiple drives
|
||||
may run one ``ceph-osd`` for each drive. Ideally, you will have a node for a
|
||||
particular type of process. For example, some nodes may run ``ceph-osd``
|
||||
daemons, other nodes may run ``ceph-mds`` daemons, and still other nodes may
|
||||
run ``ceph-mon`` daemons.
|
||||
usually runs one ``ceph-osd`` for each drive. Ideally, each node will be
|
||||
assigned to a particular type of process. For example, some nodes might run
|
||||
``ceph-osd`` daemons, other nodes might run ``ceph-mds`` daemons, and still
|
||||
other nodes might run ``ceph-mon`` daemons.
|
||||
|
||||
Each node has a name. The name of a node can be found in its ``host`` setting.
|
||||
Monitors also specify a network address and port (that is, a domain name or IP
|
||||
address) that can be found in the ``addr`` setting. A basic configuration file
|
||||
typically specifies only minimal settings for each instance of monitor daemons.
|
||||
For example:
|
||||
|
||||
|
||||
|
||||
Each node has a name identified by the ``host`` setting. Monitors also specify
|
||||
a network address and port (i.e., domain name or IP address) identified by the
|
||||
``addr`` setting. A basic configuration file will typically specify only
|
||||
minimal settings for each instance of monitor daemons. For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[global]
|
||||
mon_initial_members = ceph1
|
||||
mon_host = 10.0.0.1
|
||||
[global]
|
||||
mon_initial_members = ceph1
|
||||
mon_host = 10.0.0.1
|
||||
|
||||
|
||||
.. important:: The ``host`` setting is the short name of the node (i.e., not
|
||||
an fqdn). It is **NOT** an IP address either. Enter ``hostname -s`` on
|
||||
the command line to retrieve the name of the node. Do not use ``host``
|
||||
settings for anything other than initial monitors unless you are deploying
|
||||
Ceph manually. You **MUST NOT** specify ``host`` under individual daemons
|
||||
when using deployment tools like ``chef`` or ``cephadm``, as those tools
|
||||
will enter the appropriate values for you in the cluster map.
|
||||
.. important:: The ``host`` setting's value is the short name of the node. It
|
||||
is not an FQDN. It is **NOT** an IP address. To retrieve the name of the
|
||||
node, enter ``hostname -s`` on the command line. Unless you are deploying
|
||||
Ceph manually, do not use ``host`` settings for anything other than initial
|
||||
monitor setup. **DO NOT** specify the ``host`` setting under individual
|
||||
daemons when using deployment tools like ``chef`` or ``cephadm``. Such tools
|
||||
are designed to enter the appropriate values for you in the cluster map.
|
||||
|
||||
|
||||
.. _ceph-network-config:
|
||||
@ -38,34 +40,35 @@ minimal settings for each instance of monitor daemons. For example:
|
||||
Networks
|
||||
========
|
||||
|
||||
See the `Network Configuration Reference`_ for a detailed discussion about
|
||||
configuring a network for use with Ceph.
|
||||
For more about configuring a network for use with Ceph, see the `Network
|
||||
Configuration Reference`_ .
|
||||
|
||||
|
||||
Monitors
|
||||
========
|
||||
|
||||
Production Ceph clusters typically provision a minimum of three :term:`Ceph Monitor`
|
||||
daemons to ensure availability should a monitor instance crash. A minimum of
|
||||
three ensures that the Paxos algorithm can determine which version
|
||||
of the :term:`Ceph Cluster Map` is the most recent from a majority of Ceph
|
||||
Ceph production clusters typically provision at least three :term:`Ceph
|
||||
Monitor` daemons to ensure availability in the event of a monitor instance
|
||||
crash. A minimum of three :term:`Ceph Monitor` daemons ensures that the Paxos
|
||||
algorithm is able to determine which version of the :term:`Ceph Cluster Map` is
|
||||
the most recent. It makes this determination by consulting a majority of Ceph
|
||||
Monitors in the quorum.
|
||||
|
||||
.. note:: You may deploy Ceph with a single monitor, but if the instance fails,
|
||||
the lack of other monitors may interrupt data service availability.
|
||||
the lack of other monitors might interrupt data-service availability.
|
||||
|
||||
Ceph Monitors normally listen on port ``3300`` for the new v2 protocol, and ``6789`` for the old v1 protocol.
|
||||
Ceph Monitors normally listen on port ``3300`` for the new v2 protocol, and on
|
||||
port ``6789`` for the old v1 protocol.
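
As a hedged example (the IP address is a placeholder), a ``mon_host`` entry
that lists both protocols explicitly might look like this:

.. code-block:: ini

   [global]
           mon_host = [v2:10.0.0.1:3300,v1:10.0.0.1:6789]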
|
||||
|
||||
By default, Ceph expects to store monitor data under the
|
||||
following path::
|
||||
By default, Ceph expects to store monitor data on the following path::
|
||||
|
||||
/var/lib/ceph/mon/$cluster-$id
|
||||
/var/lib/ceph/mon/$cluster-$id
|
||||
|
||||
You or a deployment tool (e.g., ``cephadm``) must create the corresponding
|
||||
directory. With metavariables fully expressed and a cluster named "ceph", the
|
||||
foregoing directory would evaluate to::
|
||||
You or a deployment tool (for example, ``cephadm``) must create the
|
||||
corresponding directory. With metavariables fully expressed and a cluster named
|
||||
"ceph", the path specified in the above example evaluates to::
|
||||
|
||||
/var/lib/ceph/mon/ceph-a
|
||||
/var/lib/ceph/mon/ceph-a
|
||||
|
||||
For additional details, see the `Monitor Config Reference`_.
|
||||
|
||||
@ -74,22 +77,22 @@ For additional details, see the `Monitor Config Reference`_.
|
||||
|
||||
.. _ceph-osd-config:
|
||||
|
||||
|
||||
Authentication
|
||||
==============
|
||||
|
||||
.. versionadded:: Bobtail 0.56
|
||||
|
||||
For Bobtail (v 0.56) and beyond, you should expressly enable or disable
|
||||
authentication in the ``[global]`` section of your Ceph configuration file.
|
||||
Authentication is explicitly enabled or disabled in the ``[global]`` section of
|
||||
the Ceph configuration file, as shown here:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
auth_cluster_required = cephx
|
||||
auth_service_required = cephx
|
||||
auth_client_required = cephx
|
||||
auth_cluster_required = cephx
|
||||
auth_service_required = cephx
|
||||
auth_client_required = cephx
|
||||
|
||||
Additionally, you should enable message signing. See `Cephx Config Reference`_ for details.
|
||||
In addition, you should enable message signing. For details, see `Cephx Config
|
||||
Reference`_.
|
||||
|
||||
.. _Cephx Config Reference: ../auth-config-ref
|
||||
|
||||
@ -100,65 +103,68 @@ Additionally, you should enable message signing. See `Cephx Config Reference`_ f
|
||||
OSDs
|
||||
====
|
||||
|
||||
Ceph production clusters typically deploy :term:`Ceph OSD Daemons` where one node
|
||||
has one OSD daemon running a Filestore on one storage device. The BlueStore back
|
||||
end is now default, but when using Filestore you specify a journal size. For example:
|
||||
When Ceph production clusters deploy :term:`Ceph OSD Daemons`, the typical
|
||||
arrangement is that one node has one OSD daemon running Filestore on one
|
||||
storage device. BlueStore is now the default back end, but when using Filestore
|
||||
you must specify a journal size. For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[osd]
|
||||
osd_journal_size = 10000
|
||||
[osd]
|
||||
osd_journal_size = 10000
|
||||
|
||||
[osd.0]
|
||||
host = {hostname} #manual deployments only.
|
||||
[osd.0]
|
||||
host = {hostname} #manual deployments only.
|
||||
|
||||
|
||||
By default, Ceph expects to store a Ceph OSD Daemon's data at the
|
||||
following path::
|
||||
By default, Ceph expects to store a Ceph OSD Daemon's data on the following
|
||||
path::
|
||||
|
||||
/var/lib/ceph/osd/$cluster-$id
|
||||
/var/lib/ceph/osd/$cluster-$id
|
||||
|
||||
You or a deployment tool (e.g., ``cephadm``) must create the corresponding
|
||||
directory. With metavariables fully expressed and a cluster named "ceph", this
|
||||
example would evaluate to::
|
||||
You or a deployment tool (for example, ``cephadm``) must create the
|
||||
corresponding directory. With metavariables fully expressed and a cluster named
|
||||
"ceph", the path specified in the above example evaluates to::
|
||||
|
||||
/var/lib/ceph/osd/ceph-0
|
||||
/var/lib/ceph/osd/ceph-0
|
||||
|
||||
You may override this path using the ``osd_data`` setting. We recommend not
|
||||
changing the default location. Create the default directory on your OSD host.
|
||||
You can override this path using the ``osd_data`` setting. We recommend that
|
||||
you do not change the default location. To create the default directory on your
|
||||
OSD host, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {osd-host}
|
||||
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
|
||||
ssh {osd-host}
|
||||
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
|
||||
|
||||
The ``osd_data`` path ideally leads to a mount point with a device that is
|
||||
separate from the device that contains the operating system and
|
||||
daemons. If an OSD is to use a device other than the OS device, prepare it for
|
||||
use with Ceph, and mount it to the directory you just created
|
||||
The ``osd_data`` path ought to lead to a mount point that has mounted on it a
|
||||
device that is distinct from the device that contains the operating system and
|
||||
the daemons. To use a device distinct from the device that contains the
|
||||
operating system and the daemons, prepare it for use with Ceph and mount it on
|
||||
the directory you just created by running the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {new-osd-host}
|
||||
sudo mkfs -t {fstype} /dev/{disk}
|
||||
sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
|
||||
ssh {new-osd-host}
|
||||
sudo mkfs -t {fstype} /dev/{disk}
|
||||
sudo mount -o user_xattr /dev/{disk} /var/lib/ceph/osd/ceph-{osd-number}
|
||||
|
||||
We recommend using the ``xfs`` file system when running
|
||||
:command:`mkfs`. (``btrfs`` and ``ext4`` are not recommended and are no
|
||||
longer tested.)
|
||||
We recommend using the ``xfs`` file system when running :command:`mkfs`. (The
|
||||
``btrfs`` and ``ext4`` file systems are not recommended and are no longer
|
||||
tested.)
|
||||
|
||||
See the `OSD Config Reference`_ for additional configuration details.
|
||||
For additional configuration details, see `OSD Config Reference`_.
|
||||
|
||||
|
||||
Heartbeats
|
||||
==========
|
||||
|
||||
During runtime operations, Ceph OSD Daemons check up on other Ceph OSD Daemons
|
||||
and report their findings to the Ceph Monitor. You do not have to provide any
|
||||
settings. However, if you have network latency issues, you may wish to modify
|
||||
the settings.
|
||||
and report their findings to the Ceph Monitor. This process does not require
|
||||
you to provide any settings. However, if you have network latency issues, you
|
||||
might want to modify the default settings.
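
For example, the grace period after which an unresponsive OSD is reported to
the monitors can be relaxed at runtime. The value shown here is arbitrary and
should be chosen for your own network rather than copied:

.. prompt:: bash $

   ceph config set osd osd_heartbeat_grace 30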
|
||||
|
||||
See `Configuring Monitor/OSD Interaction`_ for additional details.
|
||||
For additional details, see `Configuring Monitor/OSD Interaction`_.
|
||||
|
||||
|
||||
.. _ceph-logging-and-debugging:
|
||||
@ -166,9 +172,9 @@ See `Configuring Monitor/OSD Interaction`_ for additional details.
|
||||
Logs / Debugging
|
||||
================
|
||||
|
||||
Sometimes you may encounter issues with Ceph that require
|
||||
modifying logging output and using Ceph's debugging. See `Debugging and
|
||||
Logging`_ for details on log rotation.
|
||||
You might sometimes encounter issues with Ceph that require you to use Ceph's
|
||||
logging and debugging features. For details on log rotation, see `Debugging and
|
||||
Logging`_.
|
||||
|
||||
.. _Debugging and Logging: ../../troubleshooting/log-and-debug
|
||||
|
||||
@ -186,33 +192,30 @@ Example ceph.conf
|
||||
Running Multiple Clusters (DEPRECATED)
|
||||
======================================
|
||||
|
||||
Each Ceph cluster has an internal name that is used as part of configuration
|
||||
and log file names as well as directory and mountpoint names. This name
|
||||
defaults to "ceph". Previous releases of Ceph allowed one to specify a custom
|
||||
name instead, for example "ceph2". This was intended to faciliate running
|
||||
multiple logical clusters on the same physical hardware, but in practice this
|
||||
was rarely exploited and should no longer be attempted. Prior documentation
|
||||
could also be misinterpreted as requiring unique cluster names in order to
|
||||
use ``rbd-mirror``.
|
||||
Each Ceph cluster has an internal name. This internal name is used as part of
|
||||
configuration, and as part of "log file" names as well as part of directory
|
||||
names and as part of mountpoint names. This name defaults to "ceph". Previous
|
||||
releases of Ceph allowed one to specify a custom name instead, for example
|
||||
"ceph2". This option was intended to facilitate the running of multiple logical
|
||||
clusters on the same physical hardware, but in practice it was rarely
|
||||
exploited. Custom cluster names should no longer be attempted. Old
|
||||
documentation might lead readers to wrongly think that unique cluster names are
|
||||
required to use ``rbd-mirror``. They are not required.
|
||||
|
||||
Custom cluster names are now considered deprecated and the ability to deploy
|
||||
them has already been removed from some tools, though existing custom name
|
||||
deployments continue to operate. The ability to run and manage clusters with
|
||||
custom names may be progressively removed by future Ceph releases, so it is
|
||||
strongly recommended to deploy all new clusters with the default name "ceph".
|
||||
them has already been removed from some tools, although existing custom-name
|
||||
deployments continue to operate. The ability to run and manage clusters with
|
||||
custom names might be progressively removed by future Ceph releases, so **it is
|
||||
strongly recommended to deploy all new clusters with the default name "ceph"**.
|
||||
|
||||
Some Ceph CLI commands accept an optional ``--cluster`` (cluster name) option. This
|
||||
option is present purely for backward compatibility and need not be accomodated
|
||||
by new tools and deployments.
|
||||
Some Ceph CLI commands accept a ``--cluster`` (cluster name) option. This
|
||||
option is present only for the sake of backward compatibility. New tools and
|
||||
deployments cannot be relied upon to accommodate this option.
|
||||
|
||||
If you do need to allow multiple clusters to exist on the same host, please use
|
||||
If you need to allow multiple clusters to exist on the same host, use
|
||||
:ref:`cephadm`, which uses containers to fully isolate each cluster.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. _Hardware Recommendations: ../../../start/hardware-recommendations
|
||||
.. _Network Configuration Reference: ../network-config-ref
|
||||
.. _OSD Config Reference: ../osd-config-ref
|
||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
|
||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
|
||||
|
@ -2,15 +2,19 @@
|
||||
Looking up Monitors through DNS
|
||||
===============================
|
||||
|
||||
Since version 11.0.0 RADOS supports looking up Monitors through DNS.
|
||||
Since Ceph version 11.0.0 (Kraken), RADOS has supported looking up monitors
|
||||
through DNS.
|
||||
|
||||
This way daemons and clients do not require a *mon host* configuration directive in their ceph.conf configuration file.
|
||||
The addition of the ability to look up monitors through DNS means that daemons
|
||||
and clients do not require a *mon host* configuration directive in their
|
||||
``ceph.conf`` configuration file.
|
||||
|
||||
Using DNS SRV TCP records clients are able to look up the monitors.
|
||||
With a DNS update, clients and daemons can be made aware of changes
|
||||
in the monitor topology. To be more precise and technical, clients look up the
|
||||
monitors by using ``DNS SRV TCP`` records.
|
||||
|
||||
This allows for less configuration on clients and monitors. Using a DNS update clients and daemons can be made aware of changes in the monitor topology.
|
||||
|
||||
By default clients and daemons will look for the TCP service called *ceph-mon* which is configured by the *mon_dns_srv_name* configuration directive.
|
||||
By default, clients and daemons look for the TCP service called *ceph-mon*,
|
||||
which is configured by the *mon_dns_srv_name* configuration directive.
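
To confirm from a client that the records resolve as expected, a query of the
following form can be used (``example.com`` is a placeholder for your search
domain):

.. prompt:: bash $

   dig +short SRV _ceph-mon._tcp.example.com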
|
||||
|
||||
|
||||
``mon dns srv name``
|
||||
|
@ -92,8 +92,7 @@ Similarly, two options control whether IPv4 and IPv6 addresses are used:
|
||||
|
||||
.. note:: The ability to bind to multiple ports has paved the way for
|
||||
dual-stack IPv4 and IPv6 support. That said, dual-stack support is
|
||||
not yet tested as of Nautilus v14.2.0 and likely needs some
|
||||
additional code changes to work correctly.
|
||||
not yet supported as of Quincy v17.2.0.
|
||||
|
||||
Connection modes
|
||||
----------------
|
||||
|
@ -196,6 +196,8 @@ See `Pool & PG Config Reference`_ for details.
|
||||
|
||||
.. index:: OSD; scrubbing
|
||||
|
||||
.. _rados_config_scrubbing:
|
||||
|
||||
Scrubbing
|
||||
=========
|
||||
|
||||
|
@ -4,13 +4,12 @@
|
||||
|
||||
.. index:: pools; configuration
|
||||
|
||||
When you create pools and set the number of placement groups (PGs) for each, Ceph
|
||||
uses default values when you don't specifically override the defaults. **We
|
||||
recommend** overriding some of the defaults. Specifically, we recommend setting
|
||||
a pool's replica size and overriding the default number of placement groups. You
|
||||
can specifically set these values when running `pool`_ commands. You can also
|
||||
override the defaults by adding new ones in the ``[global]`` section of your
|
||||
Ceph configuration file.
|
||||
Ceph uses default values to determine how many placement groups (PGs) will be
|
||||
assigned to each pool. We recommend overriding some of the defaults.
|
||||
Specifically, we recommend setting a pool's replica size and overriding the
|
||||
default number of placement groups. You can set these values when running
|
||||
`pool`_ commands. You can also override the defaults by adding new ones in the
|
||||
``[global]`` section of your Ceph configuration file.
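
As a sketch (the numbers are illustrative and should be sized for your own
cluster), such overrides might look like this:

.. code-block:: ini

   [global]
           osd_pool_default_size = 3
           osd_pool_default_min_size = 2
           osd_pool_default_pg_num = 128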
|
||||
|
||||
|
||||
.. literalinclude:: pool-pg.conf
|
||||
|
@ -2,64 +2,65 @@
|
||||
BlueStore Migration
|
||||
=====================
|
||||
|
||||
Each OSD can run either BlueStore or FileStore, and a single Ceph
|
||||
cluster can contain a mix of both. Users who have previously deployed
|
||||
FileStore are likely to want to transition to BlueStore in order to
|
||||
take advantage of the improved performance and robustness. There are
|
||||
several strategies for making such a transition.
|
||||
Each OSD must be formatted as either Filestore or BlueStore. However, a Ceph
|
||||
cluster can operate with a mixture of both Filestore OSDs and BlueStore OSDs.
|
||||
Because BlueStore is superior to Filestore in performance and robustness, and
|
||||
because Filestore is not supported by Ceph releases beginning with Reef, users
|
||||
deploying Filestore OSDs should transition to BlueStore. There are several
|
||||
strategies for making the transition to BlueStore.
|
||||
|
||||
An individual OSD cannot be converted in place in isolation, however:
|
||||
BlueStore and FileStore are simply too different for that to be
|
||||
practical. "Conversion" will rely either on the cluster's normal
|
||||
replication and healing support or tools and strategies that copy OSD
|
||||
content from an old (FileStore) device to a new (BlueStore) one.
|
||||
BlueStore is so different from Filestore that an individual OSD cannot be
|
||||
converted in place. Instead, the conversion process must use either (1) the
|
||||
cluster's normal replication and healing support, or (2) tools and strategies
|
||||
that copy OSD content from an old (Filestore) device to a new (BlueStore) one.
|
||||
|
||||
Deploying new OSDs with BlueStore
|
||||
=================================
|
||||
|
||||
Deploy new OSDs with BlueStore
|
||||
==============================
|
||||
Use BlueStore when deploying new OSDs (for example, when the cluster is
|
||||
expanded). Because this is the default behavior, no specific change is
|
||||
needed.
|
||||
|
||||
Any new OSDs (e.g., when the cluster is expanded) can be deployed
|
||||
using BlueStore. This is the default behavior so no specific change
|
||||
is needed.
|
||||
Similarly, use BlueStore for any OSDs that have been reprovisioned after
|
||||
a failed drive was replaced.
|
||||
|
||||
Similarly, any OSDs that are reprovisioned after replacing a failed drive
|
||||
can use BlueStore.
|
||||
Converting existing OSDs
|
||||
========================
|
||||
|
||||
Convert existing OSDs
|
||||
=====================
|
||||
"Mark-``out``" replacement
|
||||
--------------------------
|
||||
|
||||
Mark out and replace
|
||||
--------------------
|
||||
The simplest approach is to verify that the cluster is healthy and
|
||||
then follow these steps for each Filestore OSD in succession: mark the OSD
|
||||
``out``, wait for the data to replicate across the cluster, reprovision the OSD,
|
||||
mark the OSD back ``in``, and wait for recovery to complete before proceeding
|
||||
to the next OSD. This approach is easy to automate, but it entails unnecessary
|
||||
data migration that carries costs in time and SSD wear.
|
||||
|
||||
The simplest approach is to mark out each device in turn, wait for the
|
||||
data to replicate across the cluster, reprovision the OSD, and mark
|
||||
it back in again. It is simple and easy to automate. However, it requires
|
||||
more data migration than should be necessary, so it is not optimal.
|
||||
|
||||
#. Identify a FileStore OSD to replace::
|
||||
#. Identify a Filestore OSD to replace::
|
||||
|
||||
ID=<osd-id-number>
|
||||
DEVICE=<disk-device>
|
||||
|
||||
You can tell whether a given OSD is FileStore or BlueStore with:
|
||||
#. Determine whether a given OSD is Filestore or BlueStore:
|
||||
|
||||
.. prompt:: bash $
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd metadata $ID | grep osd_objectstore
|
||||
ceph osd metadata $ID | grep osd_objectstore
|
||||
|
||||
You can get a current count of filestore vs bluestore with:
|
||||
#. Get a current count of Filestore and BlueStore OSDs:
|
||||
|
||||
.. prompt:: bash $
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd count-metadata osd_objectstore
|
||||
ceph osd count-metadata osd_objectstore
|
||||
|
||||
#. Mark the filestore OSD out:
|
||||
#. Mark a Filestore OSD ``out``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd out $ID
|
||||
|
||||
#. Wait for the data to migrate off the OSD in question:
|
||||
#. Wait for the data to migrate off this OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -71,7 +72,9 @@ more data migration than should be necessary, so it is not optimal.
|
||||
|
||||
systemctl kill ceph-osd@$ID
|
||||
|
||||
#. Make note of which device this OSD is using:
|
||||
.. _osd_id_retrieval:
|
||||
|
||||
#. Note which device the OSD is using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -83,24 +86,27 @@ more data migration than should be necessary, so it is not optimal.
|
||||
|
||||
umount /var/lib/ceph/osd/ceph-$ID
|
||||
|
||||
#. Destroy the OSD data. Be *EXTREMELY CAREFUL* as this will destroy
|
||||
the contents of the device; be certain the data on the device is
|
||||
not needed (i.e., that the cluster is healthy) before proceeding:
|
||||
#. Destroy the OSD's data. Be *EXTREMELY CAREFUL*! These commands will destroy
|
||||
the contents of the device; you must be certain that the data on the device is
|
||||
not needed (in other words, that the cluster is healthy) before proceeding:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm zap $DEVICE
|
||||
|
||||
#. Tell the cluster the OSD has been destroyed (and a new OSD can be
|
||||
reprovisioned with the same ID):
|
||||
#. Tell the cluster that the OSD has been destroyed (and that a new OSD can be
|
||||
reprovisioned with the same OSD ID):
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd destroy $ID --yes-i-really-mean-it
|
||||
|
||||
#. Reprovision a BlueStore OSD in its place with the same OSD ID.
|
||||
This requires you do identify which device to wipe based on what you saw
|
||||
mounted above. BE CAREFUL! :
|
||||
#. Provision a BlueStore OSD in place by using the same OSD ID. This requires
|
||||
you to identify which device to wipe, and to make certain that you target
|
||||
the correct and intended device, using the information that was retrieved in
|
||||
the :ref:`"Note which device the OSD is using" <osd_id_retrieval>` step. BE
|
||||
CAREFUL! Note that you may need to modify these commands when dealing with
|
||||
hybrid OSDs:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -108,12 +114,15 @@ more data migration than should be necessary, so it is not optimal.
|
||||
|
||||
#. Repeat.
|
||||
|
||||
You can allow the refilling of the replacement OSD to happen
|
||||
concurrently with the draining of the next OSD, or follow the same
|
||||
procedure for multiple OSDs in parallel, as long as you ensure the
|
||||
cluster is fully clean (all data has all replicas) before destroying
|
||||
any OSDs. Failure to do so will reduce the redundancy of your data
|
||||
and increase the risk of (or potentially even cause) data loss.
|
||||
You may opt to (1) have the balancing of the replacement BlueStore OSD take
|
||||
place concurrently with the draining of the next Filestore OSD, or instead
|
||||
(2) follow the same procedure for multiple OSDs in parallel. In either case,
|
||||
however, you must ensure that the cluster is fully clean (in other words, that
|
||||
all data has all replicas) before destroying any OSDs. If you opt to reprovision
|
||||
multiple OSDs in parallel, be **very** careful to destroy OSDs only within a
|
||||
single CRUSH failure domain (for example, ``host`` or ``rack``). Failure to
|
||||
satisfy this requirement will reduce the redundancy and availability of your
|
||||
data and increase the risk of data loss (or even guarantee data loss).
|
||||
|
||||
Advantages:
|
||||
|
||||
@ -123,55 +132,53 @@ Advantages:
|
||||
|
||||
Disadvantages:
|
||||
|
||||
* Data is copied over the network twice: once to some other OSD in the
|
||||
cluster (to maintain the desired number of replicas), and then again
|
||||
back to the reprovisioned BlueStore OSD.
|
||||
* Data is copied over the network twice: once to another OSD in the cluster (to
|
||||
maintain the specified number of replicas), and again back to the
|
||||
reprovisioned BlueStore OSD.
|
||||
|
||||
"Whole host" replacement
|
||||
------------------------
|
||||
|
||||
Whole host replacement
|
||||
----------------------
|
||||
If you have a spare host in the cluster, or sufficient free space to evacuate
|
||||
an entire host for use as a spare, then the conversion can be done on a
|
||||
host-by-host basis so that each stored copy of the data is migrated only once.
|
||||
|
||||
If you have a spare host in the cluster, or have sufficient free space
|
||||
to evacuate an entire host in order to use it as a spare, then the
|
||||
conversion can be done on a host-by-host basis with each stored copy of
|
||||
the data migrating only once.
|
||||
To use this approach, you need an empty host that has no OSDs provisioned.
|
||||
There are two ways to do this: either by using a new, empty host that is not
|
||||
yet part of the cluster, or by offloading data from an existing host that is
|
||||
already part of the cluster.
|
||||
|
||||
First, you need have empty host that has no data. There are two ways to do this: either by starting with a new, empty host that isn't yet part of the cluster, or by offloading data from an existing host that in the cluster.
|
||||
Using a new, empty host
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Use a new, empty host
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
Ideally the host will have roughly the same capacity as each of the other hosts
|
||||
you will be converting. Add the host to the CRUSH hierarchy, but do not attach
|
||||
it to the root:
|
||||
|
||||
Ideally the host should have roughly the
|
||||
same capacity as other hosts you will be converting (although it
|
||||
doesn't strictly matter). ::
|
||||
|
||||
NEWHOST=<empty-host-name>
|
||||
|
||||
Add the host to the CRUSH hierarchy, but do not attach it to the root:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
NEWHOST=<empty-host-name>
|
||||
ceph osd crush add-bucket $NEWHOST host
|
||||
|
||||
Make sure the ceph packages are installed.
|
||||
Make sure that Ceph packages are installed on the new host.
|
||||
|
||||
Use an existing host
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
Using an existing host
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you would like to use an existing host
|
||||
that is already part of the cluster, and there is sufficient free
|
||||
space on that host so that all of its data can be migrated off,
|
||||
then you can instead do::
|
||||
If you would like to use an existing host that is already part of the cluster,
|
||||
and if there is sufficient free space on that host so that all of its data can
|
||||
be migrated off to other cluster hosts, you can do the following (instead of
|
||||
using a new, empty host):
|
||||
|
||||
OLDHOST=<existing-cluster-host-to-offload>
|
||||
.. prompt:: bash $
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
OLDHOST=<existing-cluster-host-to-offload>
|
||||
ceph osd crush unlink $OLDHOST default
|
||||
|
||||
where "default" is the immediate ancestor in the CRUSH map. (For
|
||||
smaller clusters with unmodified configurations this will normally
|
||||
be "default", but it might also be a rack name.) You should now
|
||||
be "default", but it might instead be a rack name.) You should now
|
||||
see the host at the top of the OSD tree output with no parent:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -192,15 +199,18 @@ see the host at the top of the OSD tree output with no parent:
|
||||
2 ssd 1.00000 osd.2 up 1.00000 1.00000
|
||||
...
|
||||
|
||||
If everything looks good, jump directly to the "Wait for data
|
||||
migration to complete" step below and proceed from there to clean up
|
||||
the old OSDs.
|
||||
If everything looks good, jump directly to the :ref:`"Wait for the data
|
||||
migration to complete" <bluestore_data_migration_step>` step below and proceed
|
||||
from there to clean up the old OSDs.
|
||||
|
||||
Migration process
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you're using a new host, start at step #1. For an existing host,
|
||||
jump to step #5 below.
|
||||
If you're using a new host, start at :ref:`the first step
|
||||
<bluestore_migration_process_first_step>`. If you're using an existing host,
|
||||
jump to :ref:`this step <bluestore_data_migration_step>`.
|
||||
|
||||
.. _bluestore_migration_process_first_step:
|
||||
|
||||
#. Provision new BlueStore OSDs for all devices:
|
||||
|
||||
@ -208,14 +218,14 @@ jump to step #5 below.
|
||||
|
||||
ceph-volume lvm create --bluestore --data /dev/$DEVICE
|
||||
|
||||
#. Verify OSDs join the cluster with:
|
||||
#. Verify that the new OSDs have joined the cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
|
||||
You should see the new host ``$NEWHOST`` with all of the OSDs beneath
|
||||
it, but the host should *not* be nested beneath any other node in
|
||||
it, but the host should *not* be nested beneath any other node in the
|
||||
hierarchy (like ``root default``). For example, if ``newhost`` is
|
||||
the empty host, you might see something like::
|
||||
|
||||
@ -244,13 +254,16 @@ jump to step #5 below.
|
||||
|
||||
ceph osd crush swap-bucket $NEWHOST $OLDHOST
|
||||
|
||||
At this point all data on ``$OLDHOST`` will start migrating to OSDs
|
||||
on ``$NEWHOST``. If there is a difference in the total capacity of
|
||||
the old and new hosts you may also see some data migrate to or from
|
||||
other nodes in the cluster, but as long as the hosts are similarly
|
||||
sized this will be a relatively small amount of data.
|
||||
At this point all data on ``$OLDHOST`` will begin migrating to the OSDs on
|
||||
``$NEWHOST``. If there is a difference between the total capacity of the
|
||||
old hosts and the total capacity of the new hosts, you may also see some
|
||||
data migrate to or from other nodes in the cluster. Provided that the hosts
|
||||
are similarly sized, however, this will be a relatively small amount of
|
||||
data.
|
||||
|
||||
#. Wait for data migration to complete:
|
||||
.. _bluestore_data_migration_step:
|
||||
|
||||
#. Wait for the data migration to complete:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -261,8 +274,8 @@ jump to step #5 below.
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh $OLDHOST
|
||||
systemctl kill ceph-osd.target
|
||||
umount /var/lib/ceph/osd/ceph-*
|
||||
systemctl kill ceph-osd.target
|
||||
umount /var/lib/ceph/osd/ceph-*
|
||||
|
||||
#. Destroy and purge the old OSDs:
|
||||
|
||||
@ -270,69 +283,71 @@ jump to step #5 below.
|
||||
|
||||
for osd in `ceph osd ls-tree $OLDHOST`; do
|
||||
ceph osd purge $osd --yes-i-really-mean-it
|
||||
done
|
||||
done
|
||||
|
||||
#. Wipe the old OSD devices. This requires you do identify which
|
||||
devices are to be wiped manually (BE CAREFUL!). For each device:
|
||||
#. Wipe the old OSDs. This requires you to identify which devices are to be
|
||||
wiped manually. BE CAREFUL! For each device:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm zap $DEVICE
|
||||
|
||||
#. Use the now-empty host as the new host, and repeat::
|
||||
#. Use the now-empty host as the new host, and repeat:
|
||||
|
||||
NEWHOST=$OLDHOST
|
||||
.. prompt:: bash $
|
||||
|
||||
NEWHOST=$OLDHOST
|
||||
|
||||
Advantages:
|
||||
|
||||
* Data is copied over the network only once.
|
||||
* Converts an entire host's OSDs at once.
|
||||
* Can parallelize to converting multiple hosts at a time.
|
||||
* No spare devices are required on each host.
|
||||
* An entire host's OSDs are converted at once.
|
||||
* Can be parallelized, to make possible the conversion of multiple hosts at the same time.
|
||||
* No host involved in this process needs to have a spare device.
|
||||
|
||||
Disadvantages:
|
||||
|
||||
* A spare host is required.
|
||||
* An entire host's worth of OSDs will be migrating data at a time. This
|
||||
is like likely to impact overall cluster performance.
|
||||
* An entire host's worth of OSDs will be migrating data at a time. This
|
||||
is likely to impact overall cluster performance.
|
||||
* All migrated data still makes one full hop over the network.
|
||||
|
||||
|
||||
Per-OSD device copy
|
||||
-------------------
|
||||
|
||||
A single logical OSD can be converted by using the ``copy`` function
|
||||
of ``ceph-objectstore-tool``. This requires that the host have a free
|
||||
device (or devices) to provision a new, empty BlueStore OSD. For
|
||||
example, if each host in your cluster has 12 OSDs, then you'd need a
|
||||
13th available device so that each OSD can be converted in turn before the
|
||||
old device is reclaimed to convert the next OSD.
|
||||
included in ``ceph-objectstore-tool``. This requires that the host have one or more free
|
||||
devices to provision a new, empty BlueStore OSD. For
|
||||
example, if each host in your cluster has twelve OSDs, then you need a
|
||||
thirteenth unused device so that each OSD can be converted before the
|
||||
previous device is reclaimed to convert the next OSD.
|
||||
|
||||
Caveats:
|
||||
|
||||
* This strategy requires that a blank BlueStore OSD be prepared
|
||||
without allocating a new OSD ID, something that the ``ceph-volume``
|
||||
tool doesn't support. More importantly, the setup of *dmcrypt* is
|
||||
closely tied to the OSD identity, which means that this approach
|
||||
does not work with encrypted OSDs.
|
||||
* This approach requires that we prepare an empty BlueStore OSD but that we do not allocate
|
||||
a new OSD ID to it. The ``ceph-volume`` tool does not support such an operation. **IMPORTANT:**
|
||||
because the setup of *dmcrypt* is closely tied to the identity of the OSD, this approach does not
|
||||
work with encrypted OSDs.
|
||||
|
||||
* The device must be manually partitioned.
|
||||
|
||||
* Tooling not implemented!
|
||||
|
||||
* Not documented!
|
||||
* An unsupported user-contributed script that demonstrates this process may be found here:
|
||||
https://github.com/ceph/ceph/blob/master/src/script/contrib/ceph-migrate-bluestore.bash
|
||||
|
||||
Advantages:
|
||||
|
||||
* Little or no data migrates over the network during the conversion.
|
||||
* Provided that the 'noout' or the 'norecover'/'norebalance' flags are set on the OSD or the
|
||||
cluster while the conversion process is underway, little or no data migrates over the
|
||||
network during the conversion.
|
||||
|
||||
Disadvantages:
|
||||
|
||||
* Tooling not fully implemented.
|
||||
* Process not documented.
|
||||
* Each host must have a spare or empty device.
|
||||
* The OSD is offline during the conversion, which means new writes will
|
||||
be written to only a subset of the OSDs. This increases the risk of data
|
||||
loss due to a subsequent failure. (However, if there is a failure before
|
||||
conversion is complete, the original FileStore OSD can be started to provide
|
||||
access to its original data.)
|
||||
* Tooling is not fully implemented, supported, or documented.
|
||||
|
||||
* Each host must have an appropriate spare or empty device for staging.
|
||||
|
||||
* The OSD is offline during the conversion, which means new writes to PGs
|
||||
with the OSD in their acting set may not be ideally redundant until the
|
||||
subject OSD comes up and recovers. This increases the risk of data
|
||||
loss due to an overlapping failure. However, if another OSD fails before
|
||||
conversion and startup have completed, the original Filestore OSD can be
|
||||
started to provide access to its original data.
|
||||
|
@ -584,11 +584,11 @@ output::
|
||||
|
||||
A dump of the monitor state:
|
||||
|
||||
.. prompt:: bash $
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon dump
|
||||
ceph mon dump
|
||||
|
||||
::
|
||||
::
|
||||
|
||||
dumped monmap epoch 2
|
||||
epoch 2
|
||||
|
@ -1,14 +1,14 @@
|
||||
.. _ecpool:
|
||||
|
||||
=============
|
||||
==============
|
||||
Erasure code
|
||||
=============
|
||||
==============
|
||||
|
||||
By default, Ceph `pools <../pools>`_ are created with the type "replicated". In
|
||||
replicated-type pools, every object is copied to multiple disks (this
|
||||
multiple copying is the "replication").
|
||||
replicated-type pools, every object is copied to multiple disks. This
|
||||
multiple copying is the method of data protection known as "replication".
|
||||
|
||||
In contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
|
||||
By contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
|
||||
pools use a method of data protection that is different from replication. In
|
||||
erasure coding, data is broken into fragments of two kinds: data blocks and
|
||||
parity blocks. If a drive fails or becomes corrupted, the parity blocks are
|
||||
@ -16,17 +16,17 @@ used to rebuild the data. At scale, erasure coding saves space relative to
|
||||
replication.
|
||||
|
||||
In this documentation, data blocks are referred to as "data chunks"
|
||||
and parity blocks are referred to as "encoding chunks".
|
||||
and parity blocks are referred to as "coding chunks".
|
||||
|
||||
Erasure codes are also called "forward error correction codes". The
|
||||
first forward error correction code was developed in 1950 by Richard
|
||||
Hamming at Bell Laboratories.
|
||||
|
||||
|
||||
Creating a sample erasure coded pool
|
||||
Creating a sample erasure-coded pool
|
||||
------------------------------------
|
||||
|
||||
The simplest erasure coded pool is equivalent to `RAID5
|
||||
The simplest erasure-coded pool is similar to `RAID5
|
||||
<https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5>`_ and
|
||||
requires at least three hosts:
|
||||
|
||||
@ -47,12 +47,13 @@ requires at least three hosts:
|
||||
|
||||
ABCDEFGHI
|
||||
|
||||
Erasure code profiles
|
||||
Erasure-code profiles
|
||||
---------------------
|
||||
|
||||
The default erasure code profile can sustain the loss of two OSDs. This erasure
|
||||
code profile is equivalent to a replicated pool of size three, but requires
|
||||
2TB to store 1TB of data instead of 3TB to store 1TB of data. The default
|
||||
The default erasure-code profile can sustain the overlapping loss of two OSDs
|
||||
without losing data. This erasure-code profile is equivalent to a replicated
|
||||
pool of size three, but with different storage requirements: instead of
|
||||
requiring 3TB to store 1TB, it requires only 2TB to store 1TB. The default
|
||||
profile can be displayed with this command:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -68,26 +69,27 @@ profile can be displayed with this command:
|
||||
technique=reed_sol_van
|
||||
|
||||
.. note::
|
||||
The default erasure-coded pool, the profile of which is displayed here, is
|
||||
not the same as the simplest erasure-coded pool.
|
||||
|
||||
The default erasure-coded pool has two data chunks (k) and two coding chunks
|
||||
(m). The profile of the default erasure-coded pool is "k=2 m=2".
|
||||
The profile just displayed is for the *default* erasure-coded pool, not the
|
||||
*simplest* erasure-coded pool. These two pools are not the same:
|
||||
|
||||
The simplest erasure-coded pool has two data chunks (k) and one coding chunk
|
||||
(m). The profile of the simplest erasure-coded pool is "k=2 m=1".
|
||||
The default erasure-coded pool has two data chunks (K) and two coding chunks
|
||||
(M). The profile of the default erasure-coded pool is "k=2 m=2".
|
||||
|
||||
The simplest erasure-coded pool has two data chunks (K) and one coding chunk
|
||||
(M). The profile of the simplest erasure-coded pool is "k=2 m=1".
|
||||
|
||||
Choosing the right profile is important because the profile cannot be modified
|
||||
after the pool is created. If you find that you need an erasure-coded pool with
|
||||
a profile different than the one you have created, you must create a new pool
|
||||
with a different (and presumably more carefully-considered) profile. When the
|
||||
new pool is created, all objects from the wrongly-configured pool must be moved
|
||||
to the newly-created pool. There is no way to alter the profile of a pool after its creation.
|
||||
with a different (and presumably more carefully considered) profile. When the
|
||||
new pool is created, all objects from the wrongly configured pool must be moved
|
||||
to the newly created pool. There is no way to alter the profile of a pool after
|
||||
the pool has been created.
|
||||
|
||||
The most important parameters of the profile are *K*, *M* and
|
||||
The most important parameters of the profile are *K*, *M*, and
|
||||
*crush-failure-domain* because they define the storage overhead and
|
||||
the data durability. For example, if the desired architecture must
|
||||
sustain the loss of two racks with a storage overhead of 67% overhead,
|
||||
sustain the loss of two racks with a storage overhead of 67%,
|
||||
the following profile can be defined:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -106,7 +108,7 @@ the following profile can be defined:
|
||||
|
||||
The *NYAN* object will be divided in three (*K=3*) and two additional
|
||||
*chunks* will be created (*M=2*). The value of *M* defines how many
|
||||
OSD can be lost simultaneously without losing any data. The
|
||||
OSDs can be lost simultaneously without losing any data. The
|
||||
*crush-failure-domain=rack* will create a CRUSH rule that ensures
|
||||
no two *chunks* are stored in the same rack.
|
||||
|
||||
@ -155,51 +157,53 @@ no two *chunks* are stored in the same rack.
|
||||
+------+
|
||||
|
||||
|
||||
More information can be found in the `erasure code profiles
|
||||
More information can be found in the `erasure-code profiles
|
||||
<../erasure-code-profile>`_ documentation.
|
||||
|
||||
|
||||
Erasure Coding with Overwrites
|
||||
------------------------------
|
||||
|
||||
By default, erasure coded pools only work with uses like RGW that
|
||||
perform full object writes and appends.
|
||||
By default, erasure-coded pools work only with operations that
|
||||
perform full object writes and appends (for example, RGW).
|
||||
|
||||
Since Luminous, partial writes for an erasure coded pool may be
|
||||
Since Luminous, partial writes for an erasure-coded pool may be
|
||||
enabled with a per-pool setting. This lets RBD and CephFS store their
|
||||
data in an erasure coded pool:
|
||||
data in an erasure-coded pool:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set ec_pool allow_ec_overwrites true
|
||||
|
||||
This can only be enabled on a pool residing on bluestore OSDs, since
|
||||
bluestore's checksumming is used to detect bitrot or other corruption
|
||||
during deep-scrub. In addition to being unsafe, using filestore with
|
||||
ec overwrites yields low performance compared to bluestore.
|
||||
This can be enabled only on a pool residing on BlueStore OSDs, since
|
||||
BlueStore's checksumming is used during deep scrubs to detect bitrot
|
||||
or other corruption. Using Filestore with EC overwrites is not only
|
||||
unsafe, but it also results in lower performance compared to BlueStore.
|
||||
|
||||
Erasure coded pools do not support omap, so to use them with RBD and
|
||||
CephFS you must instruct them to store their data in an ec pool, and
|
||||
Erasure-coded pools do not support omap, so to use them with RBD and
|
||||
CephFS you must instruct them to store their data in an EC pool and
|
||||
their metadata in a replicated pool. For RBD, this means using the
|
||||
erasure coded pool as the ``--data-pool`` during image creation:
|
||||
erasure-coded pool as the ``--data-pool`` during image creation:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
|
||||
|
||||
For CephFS, an erasure coded pool can be set as the default data pool during
|
||||
For CephFS, an erasure-coded pool can be set as the default data pool during
|
||||
file system creation or via `file layouts <../../../cephfs/file-layouts>`_.
|
||||
|
||||
|
||||
Erasure coded pool and cache tiering
|
||||
------------------------------------
|
||||
Erasure-coded pools and cache tiering
|
||||
-------------------------------------
|
||||
|
||||
Erasure coded pools require more resources than replicated pools and
|
||||
lack some functionalities such as omap. To overcome these
|
||||
limitations, one can set up a `cache tier <../cache-tiering>`_
|
||||
before the erasure coded pool.
|
||||
Erasure-coded pools require more resources than replicated pools and
|
||||
lack some of the functionality supported by replicated pools (for example, omap).
|
||||
To overcome these limitations, one can set up a `cache tier <../cache-tiering>`_
|
||||
before setting up the erasure-coded pool.
|
||||
|
||||
For instance, if the pool *hot-storage* is made of fast storage:
|
||||
For example, if the pool *hot-storage* is made of fast storage, the following commands
|
||||
will place the *hot-storage* pool as a tier of *ecpool* in *writeback*
|
||||
mode:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -207,56 +211,60 @@ For instance, if the pool *hot-storage* is made of fast storage:
|
||||
ceph osd tier cache-mode hot-storage writeback
|
||||
ceph osd tier set-overlay ecpool hot-storage
|
||||
|
||||
will place the *hot-storage* pool as tier of *ecpool* in *writeback*
|
||||
mode so that every write and read to the *ecpool* are actually using
|
||||
the *hot-storage* and benefit from its flexibility and speed.
|
||||
The result is that every write and read to the *ecpool* actually uses
|
||||
the *hot-storage* pool and benefits from its flexibility and speed.
|
||||
|
||||
More information can be found in the `cache tiering
|
||||
<../cache-tiering>`_ documentation.
|
||||
<../cache-tiering>`_ documentation. Note, however, that cache tiering
|
||||
is deprecated and may be removed completely in a future release.
|
||||
|
||||
Erasure coded pool recovery
|
||||
Erasure-coded pool recovery
|
||||
---------------------------
|
||||
If an erasure coded pool loses some shards, it must recover them from the others.
|
||||
This generally involves reading from the remaining shards, reconstructing the data, and
|
||||
writing it to the new peer.
|
||||
In Octopus, erasure coded pools can recover as long as there are at least *K* shards
|
||||
If an erasure-coded pool loses any data shards, it must recover them from others.
|
||||
This recovery involves reading from the remaining shards, reconstructing the data, and
|
||||
writing new shards.
|
||||
|
||||
In Octopus and later releases, erasure-coded pools can recover as long as there are at least *K* shards
|
||||
available. (With fewer than *K* shards, you have actually lost data!)
|
||||
|
||||
Prior to Octopus, erasure coded pools required at least *min_size* shards to be
|
||||
available, even if *min_size* is greater than *K*. (We generally recommend min_size
|
||||
be *K+2* or more to prevent loss of writes and data.)
|
||||
This conservative decision was made out of an abundance of caution when designing the new pool
|
||||
mode but also meant pools with lost OSDs but no data loss were unable to recover and go active
|
||||
without manual intervention to change the *min_size*.
|
||||
Prior to Octopus, erasure-coded pools required that at least ``min_size`` shards be
|
||||
available, even if ``min_size`` was greater than ``K``. This was a conservative
|
||||
decision made out of an abundance of caution when designing the new pool
|
||||
mode. As a result, however, pools with lost OSDs but without complete data loss were
|
||||
unable to recover and go active without manual intervention to temporarily change
|
||||
the ``min_size`` setting.
|
||||
|
||||
We recommend that ``min_size`` be ``K+2`` or greater to prevent loss of writes and
|
||||
loss of data.
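
The current value can be inspected, and adjusted if necessary, with commands
of this form (``ecpool`` is a placeholder pool name):

.. prompt:: bash $

   ceph osd pool get ecpool min_size
   ceph osd pool set ecpool min_size {value}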
|
||||
|
||||
|
||||
|
||||
Glossary
|
||||
--------
|
||||
|
||||
*chunk*
|
||||
when the encoding function is called, it returns chunks of the same
|
||||
size. Data chunks which can be concatenated to reconstruct the original
|
||||
object and coding chunks which can be used to rebuild a lost chunk.
|
||||
When the encoding function is called, it returns chunks of the same size as each other. There are two
|
||||
kinds of chunks: (1) *data chunks*, which can be concatenated to reconstruct the original object, and
|
||||
(2) *coding chunks*, which can be used to rebuild a lost chunk.
|
||||
|
||||
*K*
|
||||
the number of data *chunks*, i.e. the number of *chunks* in which the
|
||||
original object is divided. For instance if *K* = 2 a 10KB object
|
||||
will be divided into *K* objects of 5KB each.
|
||||
The number of data chunks into which an object is divided. For example, if *K* = 2, then a 10KB object
|
||||
is divided into two objects of 5KB each.
|
||||
|
||||
*M*
|
||||
the number of coding *chunks*, i.e. the number of additional *chunks*
|
||||
computed by the encoding functions. If there are 2 coding *chunks*,
|
||||
it means 2 OSDs can be out without losing data.
|
||||
The number of coding chunks computed by the encoding function. *M* is equal to the number of OSDs that can
|
||||
be missing from the cluster without the cluster suffering data loss. For example, if there are two coding
|
||||
chunks, then two OSDs can be missing without data loss.
|
||||
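To make *K* and *M* concrete, the following sketch creates a profile with
*K* = 2 and *M* = 2 and a pool that uses it (the profile name, pool name, and
PG count are illustrative):

.. prompt:: bash $

   ceph osd erasure-code-profile set example-profile k=2 m=2 crush-failure-domain=host
   ceph osd pool create ecpool 32 32 erasure example-profile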
|
||||
|
||||
Table of content
|
||||
----------------
|
||||
Table of contents
|
||||
-----------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:maxdepth: 1
|
||||
|
||||
erasure-code-profile
|
||||
erasure-code-jerasure
|
||||
erasure-code-isa
|
||||
erasure-code-lrc
|
||||
erasure-code-shec
|
||||
erasure-code-clay
|
||||
erasure-code-profile
|
||||
erasure-code-jerasure
|
||||
erasure-code-isa
|
||||
erasure-code-lrc
|
||||
erasure-code-shec
|
||||
erasure-code-clay
|
||||
|
File diff suppressed because it is too large
@ -3,35 +3,36 @@
|
||||
=========================
|
||||
|
||||
High availability and high reliability require a fault-tolerant approach to
|
||||
managing hardware and software issues. Ceph has no single point-of-failure, and
|
||||
can service requests for data in a "degraded" mode. Ceph's `data placement`_
|
||||
introduces a layer of indirection to ensure that data doesn't bind directly to
|
||||
particular OSD addresses. This means that tracking down system faults requires
|
||||
finding the `placement group`_ and the underlying OSDs at root of the problem.
|
||||
managing hardware and software issues. Ceph has no single point of failure and
|
||||
it can service requests for data even when in a "degraded" mode. Ceph's `data
|
||||
placement`_ introduces a layer of indirection to ensure that data doesn't bind
|
||||
directly to specific OSDs. For this reason, tracking system faults
|
||||
requires finding the `placement group`_ (PG) and the underlying OSDs at the
|
||||
root of the problem.
|
||||
|
||||
.. tip:: A fault in one part of the cluster may prevent you from accessing a
|
||||
particular object, but that doesn't mean that you cannot access other objects.
|
||||
.. tip:: A fault in one part of the cluster might prevent you from accessing a
|
||||
particular object, but that doesn't mean that you are prevented from accessing other objects.
|
||||
When you run into a fault, don't panic. Just follow the steps for monitoring
|
||||
your OSDs and placement groups. Then, begin troubleshooting.
|
||||
your OSDs and placement groups, and then begin troubleshooting.
|
||||
|
||||
Ceph is generally self-repairing. However, when problems persist, monitoring
|
||||
OSDs and placement groups will help you identify the problem.
|
||||
Ceph is self-repairing. However, when problems persist, monitoring OSDs and
|
||||
placement groups will help you identify the problem.
|
||||
|
||||
|
||||
Monitoring OSDs
|
||||
===============
|
||||
|
||||
An OSD's status is either in the cluster (``in``) or out of the cluster
|
||||
(``out``); and, it is either up and running (``up``), or it is down and not
|
||||
running (``down``). If an OSD is ``up``, it may be either ``in`` the cluster
|
||||
(you can read and write data) or it is ``out`` of the cluster. If it was
|
||||
``in`` the cluster and recently moved ``out`` of the cluster, Ceph will migrate
|
||||
placement groups to other OSDs. If an OSD is ``out`` of the cluster, CRUSH will
|
||||
not assign placement groups to the OSD. If an OSD is ``down``, it should also be
|
||||
An OSD's status is as follows: it is either in the cluster (``in``) or out of the cluster
|
||||
(``out``); likewise, it is either up and running (``up``) or down and not
|
||||
running (``down``). If an OSD is ``up``, it can be either ``in`` the cluster
|
||||
(if so, you can read and write data) or ``out`` of the cluster. If the OSD was previously
|
||||
``in`` the cluster but was recently moved ``out`` of the cluster, Ceph will migrate its
|
||||
PGs to other OSDs. If an OSD is ``out`` of the cluster, CRUSH will
|
||||
not assign any PGs to that OSD. If an OSD is ``down``, it should also be
|
||||
``out``.
|
||||
|
||||
.. note:: If an OSD is ``down`` and ``in``, there is a problem and the cluster
|
||||
will not be in a healthy state.
|
||||
.. note:: If an OSD is ``down`` and ``in``, then there is a problem and the cluster
|
||||
is not in a healthy state.
|
||||
|
||||
.. ditaa::
|
||||
|
||||
@ -50,72 +51,71 @@ not assign placement groups to the OSD. If an OSD is ``down``, it should also be
|
||||
| | | |
|
||||
+----------------+ +----------------+
|
||||
|
||||
If you execute a command such as ``ceph health``, ``ceph -s`` or ``ceph -w``,
|
||||
you may notice that the cluster does not always echo back ``HEALTH OK``. Don't
|
||||
panic. With respect to OSDs, you should expect that the cluster will **NOT**
|
||||
echo ``HEALTH OK`` in a few expected circumstances:
|
||||
If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
|
||||
you might notice that the cluster does not always show ``HEALTH OK``. Don't
|
||||
panic. There are certain circumstances in which it is expected and normal that
|
||||
the cluster will **NOT** show ``HEALTH OK``:
|
||||
|
||||
#. You haven't started the cluster yet (it won't respond).
|
||||
#. You have just started or restarted the cluster and it's not ready yet,
|
||||
because the placement groups are getting created and the OSDs are in
|
||||
the process of peering.
|
||||
#. You just added or removed an OSD.
|
||||
#. You just have modified your cluster map.
|
||||
#. You haven't started the cluster yet.
|
||||
#. You have just started or restarted the cluster and it's not ready to show
|
||||
health statuses yet, because the PGs are in the process of being created and
|
||||
the OSDs are in the process of peering.
|
||||
#. You have just added or removed an OSD.
|
||||
#. You have just modified your cluster map.
|
||||
|
||||
An important aspect of monitoring OSDs is to ensure that when the cluster
|
||||
is up and running that all OSDs that are ``in`` the cluster are ``up`` and
|
||||
running, too. To see if all OSDs are running, execute:
|
||||
Checking to see if OSDs are ``up`` and running is an important aspect of monitoring them:
|
||||
whenever the cluster is up and running, every OSD that is ``in`` the cluster should also
|
||||
be ``up`` and running. To see if all of the cluster's OSDs are running, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd stat
|
||||
ceph osd stat
|
||||
|
||||
The result should tell you the total number of OSDs (x),
|
||||
how many are ``up`` (y), how many are ``in`` (z) and the map epoch (eNNNN). ::
|
||||
The output provides the following information: the total number of OSDs (x),
|
||||
how many OSDs are ``up`` (y), how many OSDs are ``in`` (z), and the map epoch (eNNNN). ::
|
||||
|
||||
x osds: y up, z in; epoch: eNNNN
|
||||
x osds: y up, z in; epoch: eNNNN
|
||||
|
||||
If the number of OSDs that are ``in`` the cluster is more than the number of
|
||||
OSDs that are ``up``, execute the following command to identify the ``ceph-osd``
|
||||
If the number of OSDs that are ``in`` the cluster is greater than the number of
|
||||
OSDs that are ``up``, run the following command to identify the ``ceph-osd``
|
||||
daemons that are not running:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
ceph osd tree
|
||||
|
||||
::
|
||||
|
||||
#ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
|
||||
-1 2.00000 pool openstack
|
||||
-3 2.00000 rack dell-2950-rack-A
|
||||
-2 2.00000 host dell-2950-A1
|
||||
0 ssd 1.00000 osd.0 up 1.00000 1.00000
|
||||
1 ssd 1.00000 osd.1 down 1.00000 1.00000
|
||||
#ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
|
||||
-1 2.00000 pool openstack
|
||||
-3 2.00000 rack dell-2950-rack-A
|
||||
-2 2.00000 host dell-2950-A1
|
||||
0 ssd 1.00000 osd.0 up 1.00000 1.00000
|
||||
1 ssd 1.00000 osd.1 down 1.00000 1.00000
|
||||
|
||||
.. tip:: The ability to search through a well-designed CRUSH hierarchy may help
|
||||
you troubleshoot your cluster by identifying the physical locations faster.
|
||||
.. tip:: Searching through a well-designed CRUSH hierarchy to identify the physical
|
||||
locations of particular OSDs might help you troubleshoot your cluster.
|
||||
|
||||
If an OSD is ``down``, start it:
|
||||
If an OSD is ``down``, start it by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl start ceph-osd@1
|
||||
sudo systemctl start ceph-osd@1
|
||||
|
||||
For problems associated with OSDs that have stopped or won't restart, see `OSD Not Running`_.
|
||||
|
||||
See `OSD Not Running`_ for problems associated with OSDs that stopped, or won't
|
||||
restart.
|
||||
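If the daemon does not stay up, it can help to inspect its unit status and
recent log output with the standard systemd tools (the OSD id ``1`` is used
here only as an example):

.. prompt:: bash $

   sudo systemctl status ceph-osd@1
   sudo journalctl -u ceph-osd@1 --since "1 hour ago"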
|
||||
|
||||
PG Sets
|
||||
=======
|
||||
|
||||
When CRUSH assigns placement groups to OSDs, it looks at the number of replicas
|
||||
for the pool and assigns the placement group to OSDs such that each replica of
|
||||
the placement group gets assigned to a different OSD. For example, if the pool
|
||||
requires three replicas of a placement group, CRUSH may assign them to
|
||||
``osd.1``, ``osd.2`` and ``osd.3`` respectively. CRUSH actually seeks a
|
||||
pseudo-random placement that will take into account failure domains you set in
|
||||
your `CRUSH map`_, so you will rarely see placement groups assigned to nearest
|
||||
neighbor OSDs in a large cluster.
|
||||
When CRUSH assigns a PG to OSDs, it takes note of how many replicas of the PG
|
||||
are required by the pool and then assigns each replica to a different OSD.
|
||||
For example, if the pool requires three replicas of a PG, CRUSH might assign
|
||||
them individually to ``osd.1``, ``osd.2`` and ``osd.3``. CRUSH seeks a
|
||||
pseudo-random placement that takes into account the failure domains that you
|
||||
have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
|
||||
immediately adjacent OSDs in a large cluster.
|
||||
|
||||
Ceph processes a client request using the **Acting Set**, which is the set of
|
||||
OSDs that will actually handle the requests since they have a full and working
|
||||
@ -123,56 +123,55 @@ version of a placement group shard. The set of OSDs that should contain a shard
|
||||
of a particular placement group as the **Up Set**, i.e. where data is
|
||||
moved/copied to (or planned to be).
|
||||
|
||||
In some cases, an OSD in the Acting Set is ``down`` or otherwise not able to
|
||||
service requests for objects in the placement group. When these situations
|
||||
arise, don't panic. Common examples include:
|
||||
Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
|
||||
service requests for objects in the PG. When this kind of situation
|
||||
arises, don't panic. Common examples of such a situation include:
|
||||
|
||||
- You added or removed an OSD. Then, CRUSH reassigned the placement group to
|
||||
other OSDs--thereby changing the composition of the Acting Set and spawning
|
||||
the migration of data with a "backfill" process.
|
||||
- You added or removed an OSD, CRUSH reassigned the PG to
|
||||
other OSDs, and this reassignment changed the composition of the Acting Set and triggered
|
||||
the migration of data by means of a "backfill" process.
|
||||
- An OSD was ``down``, was restarted, and is now ``recovering``.
|
||||
- An OSD in the Acting Set is ``down`` or unable to service requests,
|
||||
- An OSD in the Acting Set is ``down`` or unable to service requests,
|
||||
and another OSD has temporarily assumed its duties.
|
||||
|
||||
In most cases, the Up Set and the Acting Set are identical. When they are not,
|
||||
it may indicate that Ceph is migrating the PG (it's remapped), an OSD is
|
||||
recovering, or that there is a problem (i.e., Ceph usually echoes a "HEALTH
|
||||
WARN" state with a "stuck stale" message in such scenarios).
|
||||
Typically, the Up Set and the Acting Set are identical. When they are not, it
|
||||
might indicate that Ceph is migrating the PG (in other words, that the PG has
|
||||
been remapped), that an OSD is recovering, or that there is a problem with the
|
||||
cluster (in such scenarios, Ceph usually shows a "HEALTH WARN" state with a
|
||||
"stuck stale" message).
|
||||
|
||||
To retrieve a list of placement groups, execute:
|
||||
To retrieve a list of PGs, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump
|
||||
|
||||
To view which OSDs are within the Acting Set or the Up Set for a given placement
|
||||
group, execute:
|
||||
ceph pg dump
|
||||
|
||||
To see which OSDs are within the Acting Set and the Up Set for a specific PG, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg map {pg-num}
|
||||
ceph pg map {pg-num}
|
||||
|
||||
The result should tell you the osdmap epoch (eNNN), the placement group number
|
||||
({pg-num}), the OSDs in the Up Set (up[]), and the OSDs in the acting set
|
||||
The output provides the following information: the osdmap epoch (eNNN), the PG number
|
||||
({pg-num}), the OSDs in the Up Set (up[]), and the OSDs in the Acting Set
|
||||
(acting[])::
|
||||
|
||||
osdmap eNNN pg {raw-pg-num} ({pg-num}) -> up [0,1,2] acting [0,1,2]
|
||||
osdmap eNNN pg {raw-pg-num} ({pg-num}) -> up [0,1,2] acting [0,1,2]
|
||||
|
||||
.. note:: If the Up Set and Acting Set do not match, this may be an indicator
|
||||
that the cluster rebalancing itself or of a potential problem with
|
||||
.. note:: If the Up Set and the Acting Set do not match, this might indicate
|
||||
that the cluster is rebalancing itself or that there is a problem with
|
||||
the cluster.
|
||||
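To review the Up Set and the Acting Set of many PGs at once, a brief per-PG
dump can be used (this is only a sketch; the output lists each PG together
with its state, its Up Set, and its Acting Set):

.. prompt:: bash $

   ceph pg dump pgs_brief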
|
||||
|
||||
|
||||
Peering
|
||||
=======
|
||||
|
||||
Before you can write data to a placement group, it must be in an ``active``
|
||||
state, and it **should** be in a ``clean`` state. For Ceph to determine the
|
||||
current state of a placement group, the primary OSD of the placement group
|
||||
(i.e., the first OSD in the acting set), peers with the secondary and tertiary
|
||||
OSDs to establish agreement on the current state of the placement group
|
||||
(assuming a pool with 3 replicas of the PG).
|
||||
|
||||
Before you can write data to a PG, it must be in an ``active`` state and it
|
||||
will preferably be in a ``clean`` state. For Ceph to determine the current
|
||||
state of a PG, peering must take place. That is, the primary OSD of the PG
|
||||
(that is, the first OSD in the Acting Set) must peer with the secondary and
|
||||
OSDs so that consensus on the current state of the PG can be established. In
|
||||
the following diagram, we assume a pool with three replicas of the PG:
|
||||
|
||||
.. ditaa::
|
||||
|
||||
@ -187,109 +186,110 @@ OSDs to establish agreement on the current state of the placement group
|
||||
| Peering |
|
||||
| |
|
||||
| Request To |
|
||||
| Peer |
|
||||
|----------------------------->|
|
||||
| Peer |
|
||||
|----------------------------->|
|
||||
|<-----------------------------|
|
||||
| Peering |
|
||||
|
||||
The OSDs also report their status to the monitor. See `Configuring Monitor/OSD
|
||||
Interaction`_ for details. To troubleshoot peering issues, see `Peering
|
||||
The OSDs also report their status to the monitor. For details, see `Configuring
|
||||
Monitor/OSD Interaction`_. To troubleshoot peering issues, see `Peering
|
||||
Failure`_.
|
||||
|
||||
|
||||
Monitoring Placement Group States
|
||||
=================================
|
||||
Monitoring PG States
|
||||
====================
|
||||
|
||||
If you execute a command such as ``ceph health``, ``ceph -s`` or ``ceph -w``,
|
||||
you may notice that the cluster does not always echo back ``HEALTH OK``. After
|
||||
you check to see if the OSDs are running, you should also check placement group
|
||||
states. You should expect that the cluster will **NOT** echo ``HEALTH OK`` in a
|
||||
number of placement group peering-related circumstances:
|
||||
If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
|
||||
you might notice that the cluster does not always show ``HEALTH OK``. After
|
||||
first checking to see if the OSDs are running, you should also check PG
|
||||
states. There are certain PG-peering-related circumstances in which it is expected
|
||||
and normal that the cluster will **NOT** show ``HEALTH OK``:
|
||||
|
||||
#. You have just created a pool and placement groups haven't peered yet.
|
||||
#. The placement groups are recovering.
|
||||
#. You have just created a pool and the PGs haven't peered yet.
|
||||
#. The PGs are recovering.
|
||||
#. You have just added an OSD to or removed an OSD from the cluster.
|
||||
#. You have just modified your CRUSH map and your placement groups are migrating.
|
||||
#. There is inconsistent data in different replicas of a placement group.
|
||||
#. Ceph is scrubbing a placement group's replicas.
|
||||
#. You have just modified your CRUSH map and your PGs are migrating.
|
||||
#. There is inconsistent data in different replicas of a PG.
|
||||
#. Ceph is scrubbing a PG's replicas.
|
||||
#. Ceph doesn't have enough storage capacity to complete backfilling operations.
|
||||
|
||||
If one of the foregoing circumstances causes Ceph to echo ``HEALTH WARN``, don't
|
||||
panic. In many cases, the cluster will recover on its own. In some cases, you
|
||||
may need to take action. An important aspect of monitoring placement groups is
|
||||
to ensure that when the cluster is up and running that all placement groups are
|
||||
``active``, and preferably in the ``clean`` state. To see the status of all
|
||||
placement groups, execute:
|
||||
If one of these circumstances causes Ceph to show ``HEALTH WARN``, don't
|
||||
panic. In many cases, the cluster will recover on its own. In some cases, however, you
|
||||
might need to take action. An important aspect of monitoring PGs is to check their
|
||||
status as ``active`` and ``clean``: that is, it is important to ensure that, when the
|
||||
cluster is up and running, all PGs are ``active`` and (preferably) ``clean``.
|
||||
To see the status of every PG, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg stat
|
||||
ceph pg stat
|
||||
|
||||
The result should tell you the total number of placement groups (x), how many
|
||||
placement groups are in a particular state such as ``active+clean`` (y) and the
|
||||
The output provides the following information: the total number of PGs (x), how many
|
||||
PGs are in a particular state such as ``active+clean`` (y), and the
|
||||
amount of data stored (z). ::
|
||||
|
||||
x pgs: y active+clean; z bytes data, aa MB used, bb GB / cc GB avail
|
||||
x pgs: y active+clean; z bytes data, aa MB used, bb GB / cc GB avail
|
||||
|
||||
.. note:: It is common for Ceph to report multiple states for placement groups.
|
||||
.. note:: It is common for Ceph to report multiple states for PGs (for example,
|
||||
``active+clean``, ``active+clean+remapped``, and ``active+clean+scrubbing``).
|
||||
|
||||
In addition to the placement group states, Ceph will also echo back the amount of
|
||||
storage capacity used (aa), the amount of storage capacity remaining (bb), and the total
|
||||
storage capacity for the placement group. These numbers can be important in a
|
||||
few cases:
|
||||
Here Ceph shows not only the PG states, but also storage capacity used (aa),
|
||||
the amount of storage capacity remaining (bb), and the total storage capacity
|
||||
of the PG. These values can be important in a few cases:
|
||||
|
||||
- You are reaching your ``near full ratio`` or ``full ratio``.
|
||||
- Your data is not getting distributed across the cluster due to an
|
||||
error in your CRUSH configuration.
|
||||
- The cluster is reaching its ``near full ratio`` or ``full ratio``.
|
||||
- Data is not being distributed across the cluster due to an error in the
|
||||
CRUSH configuration.
|
||||
|
||||
|
||||
.. topic:: Placement Group IDs
|
||||
|
||||
Placement group IDs consist of the pool number (not pool name) followed
|
||||
by a period (.) and the placement group ID--a hexadecimal number. You
|
||||
can view pool numbers and their names from the output of ``ceph osd
|
||||
lspools``. For example, the first pool created corresponds to
|
||||
pool number ``1``. A fully qualified placement group ID has the
|
||||
PG IDs consist of the pool number (not the pool name) followed by a period
|
||||
(.) and a hexadecimal number. You can view pool numbers and their names
|
||||
in the output of ``ceph osd lspools``. For example, the first pool that was
|
||||
created corresponds to pool number ``1``. A fully qualified PG ID has the
|
||||
following form::
|
||||
|
||||
{pool-num}.{pg-id}
|
||||
|
||||
And it typically looks like this::
|
||||
|
||||
1.1f
|
||||
|
||||
|
||||
To retrieve a list of placement groups, execute the following:
|
||||
{pool-num}.{pg-id}
|
||||
|
||||
It typically resembles the following::
|
||||
|
||||
1.1701b
|
||||
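For reference, ``ceph osd lspools`` prints one line per pool in the form
``<pool-num> <pool-name>``; a sketch of typical output (the pool names are
illustrative) looks like this::

    1 device_health_metrics
    2 ecpool
    3 hot-storage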
|
||||
|
||||
To retrieve a list of PGs, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump
|
||||
|
||||
You can also format the output in JSON format and save it to a file:
|
||||
ceph pg dump
|
||||
|
||||
To format the output in JSON format and save it to a file, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump -o {filename} --format=json
|
||||
ceph pg dump -o {filename} --format=json
|
||||
|
||||
To query a particular placement group, execute the following:
|
||||
To query a specific PG, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg {poolnum}.{pg-id} query
|
||||
|
||||
ceph pg {poolnum}.{pg-id} query
|
||||
|
||||
Ceph will output the query in JSON format.
|
||||
|
||||
The following subsections describe the common pg states in detail.
|
||||
The following subsections describe the most common PG states in detail.
|
||||
|
||||
|
||||
Creating
|
||||
--------
|
||||
|
||||
When you create a pool, it will create the number of placement groups you
|
||||
specified. Ceph will echo ``creating`` when it is creating one or more
|
||||
placement groups. Once they are created, the OSDs that are part of a placement
|
||||
group's Acting Set will peer. Once peering is complete, the placement group
|
||||
status should be ``active+clean``, which means a Ceph client can begin writing
|
||||
to the placement group.
|
||||
PGs are created when you create a pool: the command that creates a pool
|
||||
specifies the total number of PGs for that pool, and when the pool is created
|
||||
all of those PGs are created as well. Ceph will echo ``creating`` while it is
|
||||
creating PGs. After the PG(s) are created, the OSDs that are part of a PG's
|
||||
Acting Set will peer. Once peering is complete, the PG status should be
|
||||
``active+clean``. This status means that Ceph clients can begin writing to the
|
||||
PG.
|
||||
|
||||
.. ditaa::
|
||||
|
||||
@ -300,43 +300,38 @@ to the placement group.
|
||||
Peering
|
||||
-------
|
||||
|
||||
When Ceph is Peering a placement group, Ceph is bringing the OSDs that
|
||||
store the replicas of the placement group into **agreement about the state**
|
||||
of the objects and metadata in the placement group. When Ceph completes peering,
|
||||
this means that the OSDs that store the placement group agree about the current
|
||||
state of the placement group. However, completion of the peering process does
|
||||
**NOT** mean that each replica has the latest contents.
|
||||
When a PG peers, the OSDs that store the replicas of its data converge on an
|
||||
agreed state of the data and metadata within that PG. When peering is complete,
|
||||
those OSDs agree about the state of that PG. However, completion of the peering
|
||||
process does **NOT** mean that each replica has the latest contents.
|
||||
|
||||
.. topic:: Authoritative History
|
||||
|
||||
Ceph will **NOT** acknowledge a write operation to a client, until
|
||||
all OSDs of the acting set persist the write operation. This practice
|
||||
ensures that at least one member of the acting set will have a record
|
||||
of every acknowledged write operation since the last successful
|
||||
peering operation.
|
||||
Ceph will **NOT** acknowledge a write operation to a client until that write
|
||||
operation is persisted by every OSD in the Acting Set. This practice ensures
|
||||
that at least one member of the Acting Set will have a record of every
|
||||
acknowledged write operation since the last successful peering operation.
|
||||
|
||||
With an accurate record of each acknowledged write operation, Ceph can
|
||||
construct and disseminate a new authoritative history of the placement
|
||||
group--a complete, and fully ordered set of operations that, if performed,
|
||||
would bring an OSD’s copy of a placement group up to date.
|
||||
Given an accurate record of each acknowledged write operation, Ceph can
|
||||
construct a new authoritative history of the PG--that is, a complete and
|
||||
fully ordered set of operations that, if performed, would bring an OSD’s
|
||||
copy of the PG up to date.
|
||||
|
||||
|
||||
Active
|
||||
------
|
||||
|
||||
Once Ceph completes the peering process, a placement group may become
|
||||
``active``. The ``active`` state means that the data in the placement group is
|
||||
generally available in the primary placement group and the replicas for read
|
||||
and write operations.
|
||||
After Ceph has completed the peering process, a PG should become ``active``.
|
||||
The ``active`` state means that the data in the PG is generally available for
|
||||
read and write operations in the primary and replica OSDs.
|
||||
|
||||
|
||||
Clean
|
||||
-----
|
||||
|
||||
When a placement group is in the ``clean`` state, the primary OSD and the
|
||||
replica OSDs have successfully peered and there are no stray replicas for the
|
||||
placement group. Ceph replicated all objects in the placement group the correct
|
||||
number of times.
|
||||
When a PG is in the ``clean`` state, all OSDs holding its data and metadata
|
||||
have successfully peered and there are no stray replicas. Ceph has replicated
|
||||
all objects in the PG the correct number of times.
|
||||
|
||||
|
||||
Degraded
|
||||
@ -344,143 +339,147 @@ Degraded
|
||||
|
||||
When a client writes an object to the primary OSD, the primary OSD is
|
||||
responsible for writing the replicas to the replica OSDs. After the primary OSD
|
||||
writes the object to storage, the placement group will remain in a ``degraded``
|
||||
writes the object to storage, the PG will remain in a ``degraded``
|
||||
state until the primary OSD has received an acknowledgement from the replica
|
||||
OSDs that Ceph created the replica objects successfully.
|
||||
|
||||
The reason a placement group can be ``active+degraded`` is that an OSD may be
|
||||
``active`` even though it doesn't hold all of the objects yet. If an OSD goes
|
||||
``down``, Ceph marks each placement group assigned to the OSD as ``degraded``.
|
||||
The OSDs must peer again when the OSD comes back online. However, a client can
|
||||
still write a new object to a ``degraded`` placement group if it is ``active``.
|
||||
The reason that a PG can be ``active+degraded`` is that an OSD can be
|
||||
``active`` even if it doesn't yet hold all of the PG's objects. If an OSD goes
|
||||
``down``, Ceph marks each PG assigned to the OSD as ``degraded``. The PGs must
|
||||
peer again when the OSD comes back online. However, a client can still write a
|
||||
new object to a ``degraded`` PG if it is ``active``.
|
||||
|
||||
If an OSD is ``down`` and the ``degraded`` condition persists, Ceph may mark the
|
||||
If an OSD is ``down`` and the ``degraded`` condition persists, Ceph might mark the
|
||||
``down`` OSD as ``out`` of the cluster and remap the data from the ``down`` OSD
|
||||
to another OSD. The time between being marked ``down`` and being marked ``out``
|
||||
is controlled by ``mon osd down out interval``, which is set to ``600`` seconds
|
||||
is determined by ``mon_osd_down_out_interval``, which is set to ``600`` seconds
|
||||
by default.
|
||||
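For example, this interval can be checked or adjusted at runtime with the
``ceph config`` commands below (the value ``900`` is purely illustrative):

.. prompt:: bash $

   ceph config get mon mon_osd_down_out_interval
   ceph config set mon mon_osd_down_out_interval 900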
|
||||
A placement group can also be ``degraded``, because Ceph cannot find one or more
|
||||
objects that Ceph thinks should be in the placement group. While you cannot
|
||||
read or write to unfound objects, you can still access all of the other objects
|
||||
in the ``degraded`` placement group.
|
||||
A PG can also be in the ``degraded`` state because there are one or more
|
||||
objects that Ceph expects to find in the PG but that Ceph cannot find. Although
|
||||
you cannot read or write to unfound objects, you can still access all of the other
|
||||
objects in the ``degraded`` PG.
|
||||
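To see which objects a given PG considers unfound, a query of the following
form can be used (the PG id ``2.4`` is illustrative):

.. prompt:: bash $

   ceph pg 2.4 list_unfound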
|
||||
|
||||
Recovering
|
||||
----------
|
||||
|
||||
Ceph was designed for fault-tolerance at a scale where hardware and software
|
||||
problems are ongoing. When an OSD goes ``down``, its contents may fall behind
|
||||
the current state of other replicas in the placement groups. When the OSD is
|
||||
back ``up``, the contents of the placement groups must be updated to reflect the
|
||||
current state. During that time period, the OSD may reflect a ``recovering``
|
||||
state.
|
||||
Ceph was designed for fault-tolerance, because hardware and other server
|
||||
problems are expected or even routine. When an OSD goes ``down``, its contents
|
||||
might fall behind the current state of other replicas in the PGs. When the OSD
|
||||
has returned to the ``up`` state, the contents of the PGs must be updated to
|
||||
reflect that current state. During that time period, the OSD might be in a
|
||||
``recovering`` state.
|
||||
|
||||
Recovery is not always trivial, because a hardware failure might cause a
|
||||
cascading failure of multiple OSDs. For example, a network switch for a rack or
|
||||
cabinet may fail, which can cause the OSDs of a number of host machines to fall
|
||||
behind the current state of the cluster. Each one of the OSDs must recover once
|
||||
the fault is resolved.
|
||||
cabinet might fail, which can cause the OSDs of a number of host machines to
|
||||
fall behind the current state of the cluster. In such a scenario, general
|
||||
recovery is possible only if each of the OSDs recovers after the fault has been
|
||||
resolved.
|
||||
|
||||
Ceph provides a number of settings to balance the resource contention between
|
||||
new service requests and the need to recover data objects and restore the
|
||||
placement groups to the current state. The ``osd recovery delay start`` setting
|
||||
allows an OSD to restart, re-peer and even process some replay requests before
|
||||
starting the recovery process. The ``osd
|
||||
recovery thread timeout`` sets a thread timeout, because multiple OSDs may fail,
|
||||
restart and re-peer at staggered rates. The ``osd recovery max active`` setting
|
||||
limits the number of recovery requests an OSD will entertain simultaneously to
|
||||
prevent the OSD from failing to serve . The ``osd recovery max chunk`` setting
|
||||
limits the size of the recovered data chunks to prevent network congestion.
|
||||
Ceph provides a number of settings that determine how the cluster balances the
|
||||
resource contention between the need to process new service requests and the
|
||||
need to recover data objects and restore the PGs to the current state. The
|
||||
``osd_recovery_delay_start`` setting allows an OSD to restart, re-peer, and
|
||||
even process some replay requests before starting the recovery process. The
|
||||
``osd_recovery_thread_timeout`` setting determines the duration of a thread
|
||||
timeout, because multiple OSDs might fail, restart, and re-peer at staggered
|
||||
rates. The ``osd_recovery_max_active`` setting limits the number of recovery
|
||||
requests an OSD can entertain simultaneously, in order to prevent the OSD from
|
||||
failing to serve. The ``osd_recovery_max_chunk`` setting limits the size of
|
||||
the recovered data chunks, in order to prevent network congestion.
|
||||
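As a sketch, any of these options can be inspected or changed at runtime with
``ceph config``; the value ``3`` below is illustrative rather than a
recommendation:

.. prompt:: bash $

   ceph config get osd osd_recovery_max_active
   ceph config set osd osd_recovery_max_active 3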
|
||||
|
||||
Back Filling
|
||||
------------
|
||||
|
||||
When a new OSD joins the cluster, CRUSH will reassign placement groups from OSDs
|
||||
in the cluster to the newly added OSD. Forcing the new OSD to accept the
|
||||
reassigned placement groups immediately can put excessive load on the new OSD.
|
||||
Back filling the OSD with the placement groups allows this process to begin in
|
||||
the background. Once backfilling is complete, the new OSD will begin serving
|
||||
requests when it is ready.
|
||||
When a new OSD joins the cluster, CRUSH will reassign PGs from OSDs that are
|
||||
already in the cluster to the newly added OSD. It can put excessive load on the
|
||||
new OSD to force it to immediately accept the reassigned PGs. Back filling the
|
||||
OSD with the PGs allows this process to begin in the background. After the
|
||||
backfill operations have completed, the new OSD will begin serving requests as
|
||||
soon as it is ready.
|
||||
|
||||
During the backfill operations, you may see one of several states:
|
||||
During the backfill operations, you might see one of several states:
|
||||
``backfill_wait`` indicates that a backfill operation is pending, but is not
|
||||
underway yet; ``backfilling`` indicates that a backfill operation is underway;
|
||||
and, ``backfill_toofull`` indicates that a backfill operation was requested,
|
||||
but couldn't be completed due to insufficient storage capacity. When a
|
||||
placement group cannot be backfilled, it may be considered ``incomplete``.
|
||||
yet underway; ``backfilling`` indicates that a backfill operation is currently
|
||||
underway; and ``backfill_toofull`` indicates that a backfill operation was
|
||||
requested but couldn't be completed due to insufficient storage capacity. When
|
||||
a PG cannot be backfilled, it might be considered ``incomplete``.
|
||||
|
||||
The ``backfill_toofull`` state may be transient. It is possible that as PGs
|
||||
are moved around, space may become available. The ``backfill_toofull`` is
|
||||
similar to ``backfill_wait`` in that as soon as conditions change
|
||||
backfill can proceed.
|
||||
The ``backfill_toofull`` state might be transient. It might happen that, as PGs
|
||||
are moved around, space becomes available. The ``backfill_toofull`` state is
|
||||
similar to ``backfill_wait`` in that backfill operations can proceed as soon as
|
||||
conditions change.
|
||||
|
||||
Ceph provides a number of settings to manage the load spike associated with
|
||||
reassigning placement groups to an OSD (especially a new OSD). By default,
|
||||
``osd_max_backfills`` sets the maximum number of concurrent backfills to and from
|
||||
an OSD to 1. The ``backfill full ratio`` enables an OSD to refuse a
|
||||
backfill request if the OSD is approaching its full ratio (90%, by default) and
|
||||
change with ``ceph osd set-backfillfull-ratio`` command.
|
||||
If an OSD refuses a backfill request, the ``osd backfill retry interval``
|
||||
enables an OSD to retry the request (after 30 seconds, by default). OSDs can
|
||||
also set ``osd backfill scan min`` and ``osd backfill scan max`` to manage scan
|
||||
intervals (64 and 512, by default).
|
||||
Ceph provides a number of settings to manage the load spike associated with the
|
||||
reassignment of PGs to an OSD (especially a new OSD). The ``osd_max_backfills``
|
||||
setting specifies the maximum number of concurrent backfills to and from an OSD
|
||||
(default: 1). The ``backfill_full_ratio`` setting allows an OSD to refuse a
|
||||
backfill request if the OSD is approaching its full ratio (default: 90%). This
|
||||
setting can be changed with the ``ceph osd set-backfillfull-ratio`` command. If
|
||||
an OSD refuses a backfill request, the ``osd_backfill_retry_interval`` setting
|
||||
allows an OSD to retry the request after a certain interval (default: 30
|
||||
seconds). OSDs can also set ``osd_backfill_scan_min`` and
|
||||
``osd_backfill_scan_max`` in order to manage scan intervals (default: 64 and
|
||||
512, respectively).
|
||||
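For example, the backfill limits can be adjusted at runtime as sketched below
(the values shown are illustrative, not recommendations):

.. prompt:: bash $

   ceph config set osd osd_max_backfills 2
   ceph osd set-backfillfull-ratio 0.92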
|
||||
|
||||
Remapped
|
||||
--------
|
||||
|
||||
When the Acting Set that services a placement group changes, the data migrates
|
||||
from the old acting set to the new acting set. It may take some time for a new
|
||||
primary OSD to service requests. So it may ask the old primary to continue to
|
||||
service requests until the placement group migration is complete. Once data
|
||||
migration completes, the mapping uses the primary OSD of the new acting set.
|
||||
When the Acting Set that services a PG changes, the data migrates from the old
|
||||
Acting Set to the new Acting Set. Because it might take time for the new
|
||||
primary OSD to begin servicing requests, the old primary OSD might be required
|
||||
to continue servicing requests until the PG data migration is complete. After
|
||||
data migration has completed, the mapping uses the primary OSD of the new
|
||||
Acting Set.
|
||||
|
||||
|
||||
Stale
|
||||
-----
|
||||
|
||||
While Ceph uses heartbeats to ensure that hosts and daemons are running, the
|
||||
``ceph-osd`` daemons may also get into a ``stuck`` state where they are not
|
||||
reporting statistics in a timely manner (e.g., a temporary network fault). By
|
||||
default, OSD daemons report their placement group, up through, boot and failure
|
||||
statistics every half second (i.e., ``0.5``), which is more frequent than the
|
||||
heartbeat thresholds. If the **Primary OSD** of a placement group's acting set
|
||||
fails to report to the monitor or if other OSDs have reported the primary OSD
|
||||
``down``, the monitors will mark the placement group ``stale``.
|
||||
Although Ceph uses heartbeats in order to ensure that hosts and daemons are
|
||||
running, the ``ceph-osd`` daemons might enter a ``stuck`` state where they are
|
||||
not reporting statistics in a timely manner (for example, there might be a
|
||||
temporary network fault). By default, OSD daemons report their PG, up through,
|
||||
boot, and failure statistics every half second (that is, in accordance with a
|
||||
value of ``0.5``), which is more frequent than the reports defined by the
|
||||
heartbeat thresholds. If the primary OSD of a PG's Acting Set fails to report
|
||||
to the monitor or if other OSDs have reported the primary OSD ``down``, the
|
||||
monitors will mark the PG ``stale``.
|
||||
|
||||
When you start your cluster, it is common to see the ``stale`` state until
|
||||
the peering process completes. After your cluster has been running for awhile,
|
||||
seeing placement groups in the ``stale`` state indicates that the primary OSD
|
||||
for those placement groups is ``down`` or not reporting placement group statistics
|
||||
to the monitor.
|
||||
When you start your cluster, it is common to see the ``stale`` state until the
|
||||
peering process completes. After your cluster has been running for a while,
|
||||
however, seeing PGs in the ``stale`` state indicates that the primary OSD for
|
||||
those PGs is ``down`` or not reporting PG statistics to the monitor.
|
||||
|
||||
|
||||
Identifying Troubled PGs
|
||||
========================
|
||||
|
||||
As previously noted, a placement group is not necessarily problematic just
|
||||
because its state is not ``active+clean``. Generally, Ceph's ability to self
|
||||
repair may not be working when placement groups get stuck. The stuck states
|
||||
include:
|
||||
As previously noted, a PG is not necessarily having problems just because its
|
||||
state is not ``active+clean``. When PGs are stuck, this might indicate that
|
||||
Ceph cannot perform self-repairs. The stuck states include:
|
||||
|
||||
- **Unclean**: Placement groups contain objects that are not replicated the
|
||||
desired number of times. They should be recovering.
|
||||
- **Inactive**: Placement groups cannot process reads or writes because they
|
||||
are waiting for an OSD with the most up-to-date data to come back ``up``.
|
||||
- **Stale**: Placement groups are in an unknown state, because the OSDs that
|
||||
host them have not reported to the monitor cluster in a while (configured
|
||||
by ``mon osd report timeout``).
|
||||
- **Unclean**: PGs contain objects that have not been replicated the desired
|
||||
number of times. Under normal conditions, it can be assumed that these PGs
|
||||
are recovering.
|
||||
- **Inactive**: PGs cannot process reads or writes because they are waiting for
|
||||
an OSD that has the most up-to-date data to come back ``up``.
|
||||
- **Stale**: PGs are in an unknown state, because the OSDs that host them have
|
||||
not reported to the monitor cluster for a certain period of time (determined
|
||||
by ``mon_osd_report_timeout``).
|
||||
|
||||
To identify stuck placement groups, execute the following:
|
||||
To identify stuck PGs, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded]
|
||||
ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded]
|
||||
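For example, to list only the PGs that are stuck in the ``inactive`` state,
run:

.. prompt:: bash $

   ceph pg dump_stuck inactive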
|
||||
See `Placement Group Subsystem`_ for additional details. To troubleshoot
|
||||
stuck placement groups, see `Troubleshooting PG Errors`_.
|
||||
For more detail, see `Placement Group Subsystem`_. To troubleshoot stuck PGs,
|
||||
see `Troubleshooting PG Errors`_.
|
||||
|
||||
|
||||
Finding an Object Location
|
||||
@ -491,55 +490,54 @@ To store object data in the Ceph Object Store, a Ceph client must:
|
||||
#. Set an object name
|
||||
#. Specify a `pool`_
|
||||
|
||||
The Ceph client retrieves the latest cluster map and the CRUSH algorithm
|
||||
calculates how to map the object to a `placement group`_, and then calculates
|
||||
how to assign the placement group to an OSD dynamically. To find the object
|
||||
location, all you need is the object name and the pool name. For example:
|
||||
The Ceph client retrieves the latest cluster map, the CRUSH algorithm
|
||||
calculates how to map the object to a PG, and then the algorithm calculates how
|
||||
to dynamically assign the PG to an OSD. To find the object location given only
|
||||
the object name and the pool name, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd map {poolname} {object-name} [namespace]
|
||||
ceph osd map {poolname} {object-name} [namespace]
|
||||
|
||||
.. topic:: Exercise: Locate an Object
|
||||
|
||||
As an exercise, let's create an object. Specify an object name, a path
|
||||
to a test file containing some object data and a pool name using the
|
||||
As an exercise, let's create an object. We can specify an object name, a path
|
||||
to a test file that contains some object data, and a pool name by using the
|
||||
``rados put`` command on the command line. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
rados put {object-name} {file-path} --pool=data
|
||||
rados put test-object-1 testfile.txt --pool=data
|
||||
rados put {object-name} {file-path} --pool=data
|
||||
rados put test-object-1 testfile.txt --pool=data
|
||||
|
||||
To verify that the Ceph Object Store stored the object, execute the
|
||||
following:
|
||||
To verify that the Ceph Object Store stored the object, run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
rados -p data ls
|
||||
|
||||
Now, identify the object location:
|
||||
To identify the object location, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd map {pool-name} {object-name}
|
||||
ceph osd map data test-object-1
|
||||
|
||||
Ceph should output the object's location. For example::
|
||||
|
||||
osdmap e537 pool 'data' (1) object 'test-object-1' -> pg 1.d1743484 (1.4) -> up ([0,1], p0) acting ([0,1], p0)
|
||||
|
||||
To remove the test object, simply delete it using the ``rados rm``
|
||||
command. For example:
|
||||
|
||||
Ceph should output the object's location. For example::
|
||||
|
||||
osdmap e537 pool 'data' (1) object 'test-object-1' -> pg 1.d1743484 (1.4) -> up ([0,1], p0) acting ([0,1], p0)
|
||||
|
||||
To remove the test object, simply delete it by running the ``rados rm``
|
||||
command. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
|
||||
rados rm test-object-1 --pool=data
|
||||
|
||||
|
||||
As the cluster evolves, the object location may change dynamically. One benefit
|
||||
of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform
|
||||
the migration manually. See the `Architecture`_ section for details.
|
||||
of Ceph's dynamic rebalancing is that Ceph spares you the burden of manually
|
||||
performing the migration. For details, see the `Architecture`_ section.
|
||||
|
||||
.. _data placement: ../data-placement
|
||||
.. _pool: ../pools
|
||||
|
@ -2,9 +2,9 @@
|
||||
Monitoring a Cluster
|
||||
======================
|
||||
|
||||
Once you have a running cluster, you may use the ``ceph`` tool to monitor your
|
||||
cluster. Monitoring a cluster typically involves checking OSD status, monitor
|
||||
status, placement group status and metadata server status.
|
||||
After you have a running cluster, you can use the ``ceph`` tool to monitor your
|
||||
cluster. Monitoring a cluster typically involves checking OSD status, monitor
|
||||
status, placement group status, and metadata server status.
|
||||
|
||||
Using the command line
|
||||
======================
|
||||
@ -13,11 +13,11 @@ Interactive mode
|
||||
----------------
|
||||
|
||||
To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
|
||||
with no arguments. For example:
|
||||
with no arguments. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph
|
||||
ceph
|
||||
|
||||
.. prompt:: ceph>
|
||||
:prompts: ceph>
|
||||
@ -30,8 +30,9 @@ with no arguments. For example:
|
||||
Non-default paths
|
||||
-----------------
|
||||
|
||||
If you specified non-default locations for your configuration or keyring,
|
||||
you may specify their locations:
|
||||
If you specified non-default locations for your configuration or keyring when
|
||||
you installed the cluster, you may specify their locations to the ``ceph`` tool
|
||||
by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -40,30 +41,32 @@ you may specify their locations:
|
||||
Checking a Cluster's Status
|
||||
===========================
|
||||
|
||||
After you start your cluster, and before you start reading and/or
|
||||
writing data, check your cluster's status first.
|
||||
After you start your cluster, and before you start reading and/or writing data,
|
||||
you should check your cluster's status.
|
||||
|
||||
To check a cluster's status, execute the following:
|
||||
To check a cluster's status, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph status
|
||||
|
||||
Or:
|
||||
|
||||
Alternatively, you can run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -s
|
||||
|
||||
In interactive mode, type ``status`` and press **Enter**:
|
||||
In interactive mode, this operation is performed by typing ``status`` and
|
||||
pressing **Enter**:
|
||||
|
||||
.. prompt:: ceph>
|
||||
:prompts: ceph>
|
||||
|
||||
ceph> status
|
||||
|
||||
Ceph will print the cluster status. For example, a tiny Ceph demonstration
|
||||
cluster with one of each service may print the following:
|
||||
status
|
||||
|
||||
Ceph will print the cluster status. For example, a tiny Ceph "demonstration
|
||||
cluster" that is running one instance of each service (monitor, manager, and
|
||||
OSD) might print the following:
|
||||
|
||||
::
|
||||
|
||||
@ -84,33 +87,35 @@ cluster with one of each service may print the following:
|
||||
pgs: 16 active+clean
|
||||
|
||||
|
||||
.. topic:: How Ceph Calculates Data Usage
|
||||
How Ceph Calculates Data Usage
|
||||
------------------------------
|
||||
|
||||
The ``usage`` value reflects the *actual* amount of raw storage used. The
|
||||
``xxx GB / xxx GB`` value means the amount available (the lesser number)
|
||||
of the overall storage capacity of the cluster. The notional number reflects
|
||||
the size of the stored data before it is replicated, cloned or snapshotted.
|
||||
Therefore, the amount of data actually stored typically exceeds the notional
|
||||
amount stored, because Ceph creates replicas of the data and may also use
|
||||
storage capacity for cloning and snapshotting.
|
||||
The ``usage`` value reflects the *actual* amount of raw storage used. The ``xxx
|
||||
GB / xxx GB`` value means the amount available (the lesser number) of the
|
||||
overall storage capacity of the cluster. The notional number reflects the size
|
||||
of the stored data before it is replicated, cloned or snapshotted. Therefore,
|
||||
the amount of data actually stored typically exceeds the notional amount
|
||||
stored, because Ceph creates replicas of the data and may also use storage
|
||||
capacity for cloning and snapshotting.
|
||||
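A more detailed breakdown of raw and per-pool usage is available from the
``ceph df`` command; for example:

.. prompt:: bash $

   ceph df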
|
||||
|
||||
Watching a Cluster
|
||||
==================
|
||||
|
||||
In addition to local logging by each daemon, Ceph clusters maintain
|
||||
a *cluster log* that records high level events about the whole system.
|
||||
This is logged to disk on monitor servers (as ``/var/log/ceph/ceph.log`` by
|
||||
default), but can also be monitored via the command line.
|
||||
Each daemon in the Ceph cluster maintains a log of events, and the Ceph cluster
|
||||
itself maintains a *cluster log* that records high-level events about the
|
||||
entire Ceph cluster. These events are logged to disk on monitor servers (in
|
||||
the default location ``/var/log/ceph/ceph.log``), and they can be monitored via
|
||||
the command line.
|
||||
|
||||
To follow the cluster log, use the following command:
|
||||
To follow the cluster log, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -w
|
||||
|
||||
Ceph will print the status of the system, followed by each log message as it
|
||||
is emitted. For example:
|
||||
Ceph will print the status of the system, followed by each log message as it is
|
||||
added. For example:
|
||||
|
||||
::
|
||||
|
||||
@ -135,21 +140,20 @@ is emitted. For example:
|
||||
2017-07-24 08:15:14.258143 mon.a mon.0 172.21.9.34:6789/0 39 : cluster [INF] Activating manager daemon x
|
||||
2017-07-24 08:15:15.446025 mon.a mon.0 172.21.9.34:6789/0 47 : cluster [INF] Manager daemon x is now available
|
||||
|
||||
|
||||
In addition to using ``ceph -w`` to print log lines as they are emitted,
|
||||
use ``ceph log last [n]`` to see the most recent ``n`` lines from the cluster
|
||||
log.
|
||||
Instead of printing log lines as they are added, you might want to print only
|
||||
the most recent lines. Run ``ceph log last [n]`` to see the most recent ``n``
|
||||
lines from the cluster log.
|
||||
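For example, to print the twenty most recent cluster log entries:

.. prompt:: bash $

   ceph log last 20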
|
||||
Monitoring Health Checks
|
||||
========================
|
||||
|
||||
Ceph continuously runs various *health checks* against its own status. When
|
||||
a health check fails, this is reflected in the output of ``ceph status`` (or
|
||||
``ceph health``). In addition, messages are sent to the cluster log to
|
||||
indicate when a check fails, and when the cluster recovers.
|
||||
Ceph continuously runs various *health checks*. When
|
||||
a health check fails, this failure is reflected in the output of ``ceph status`` and
|
||||
``ceph health``. The cluster log receives messages that
|
||||
indicate when a check has failed and when the cluster has recovered.
|
||||
|
||||
For example, when an OSD goes down, the ``health`` section of the status
|
||||
output may be updated as follows:
|
||||
output is updated as follows:
|
||||
|
||||
::
|
||||
|
||||
@ -157,7 +161,7 @@ output may be updated as follows:
|
||||
1 osds down
|
||||
Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded
|
||||
|
||||
At this time, cluster log messages are also emitted to record the failure of the
|
||||
At the same time, cluster log messages are emitted to record the failure of the
|
||||
health checks:
|
||||
|
||||
::
|
||||
@ -166,7 +170,7 @@ health checks:
|
||||
2017-07-25 10:09:01.302624 mon.a mon.0 172.21.9.34:6789/0 94 : cluster [WRN] Health check failed: Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded (PG_DEGRADED)
|
||||
|
||||
When the OSD comes back online, the cluster log records the cluster's return
|
||||
to a health state:
|
||||
to a healthy state:
|
||||
|
||||
::
|
||||
|
||||
@ -177,21 +181,23 @@ to a health state:
|
||||
Network Performance Checks
|
||||
--------------------------
|
||||
|
||||
Ceph OSDs send heartbeat ping messages amongst themselves to monitor daemon availability. We
|
||||
also use the response times to monitor network performance.
|
||||
While it is possible that a busy OSD could delay a ping response, we can assume
|
||||
that if a network switch fails multiple delays will be detected between distinct pairs of OSDs.
|
||||
Ceph OSDs send heartbeat ping messages to each other in order to monitor daemon
|
||||
availability and network performance. If a single delayed response is detected,
|
||||
this might indicate nothing more than a busy OSD. But if multiple delays
|
||||
between distinct pairs of OSDs are detected, this might indicate a failed
|
||||
network switch, a NIC failure, or a layer 1 failure.
|
||||
|
||||
By default we will warn about ping times which exceed 1 second (1000 milliseconds).
|
||||
By default, a heartbeat time that exceeds 1 second (1000 milliseconds) raises a
|
||||
health check (a ``HEALTH_WARN``). For example:
|
||||
|
||||
::
|
||||
|
||||
HEALTH_WARN Slow OSD heartbeats on back (longest 1118.001ms)
|
||||
|
||||
The health detail will add the combination of OSDs are seeing the delays and by how much. There is a limit of 10
|
||||
detail line items.
|
||||
|
||||
::
|
||||
In the output of the ``ceph health detail`` command, you can see which OSDs are
|
||||
experiencing delays and how long the delays are. The output of ``ceph health
|
||||
detail`` is limited to ten lines. Here is an example of the output you can
|
||||
expect from the ``ceph health detail`` command::
|
||||
|
||||
[WRN] OSD_SLOW_PING_TIME_BACK: Slow OSD heartbeats on back (longest 1118.001ms)
|
||||
Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.1 [dc1,rack1] 1118.001 msec possibly improving
|
||||
@ -199,11 +205,15 @@ detail line items.
|
||||
Slow OSD heartbeats on back from osd.2 [dc1,rack2] to osd.1 [dc1,rack1] 1015.321 msec
|
||||
Slow OSD heartbeats on back from osd.1 [dc1,rack1] to osd.0 [dc1,rack1] 1010.456 msec
|
||||
|
||||
To see even more detail and a complete dump of network performance information the ``dump_osd_network`` command can be used. Typically, this would be
|
||||
sent to a mgr, but it can be limited to a particular OSD's interactions by issuing it to any OSD. The current threshold which defaults to 1 second
|
||||
(1000 milliseconds) can be overridden as an argument in milliseconds.
|
||||
To see more detail and to collect a complete dump of network performance
|
||||
information, use the ``dump_osd_network`` command. This command is usually sent
|
||||
to a Ceph Manager Daemon, but it can be used to collect information about a
|
||||
specific OSD's interactions by sending it to that OSD. The default threshold
|
||||
for a slow heartbeat is 1 second (1000 milliseconds), but this can be
|
||||
overridden by providing a number of milliseconds as an argument.
|
||||
|
||||
The following command will show all gathered network performance data by specifying a threshold of 0 and sending to the mgr.
|
||||
To show all network performance data with a specified threshold of 0, send the
|
||||
following command to the mgr:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -287,26 +297,26 @@ The following command will show all gathered network performance data by specify
|
||||
|
||||
|
||||
|
||||
Muting health checks
|
||||
Muting Health Checks
|
||||
--------------------
|
||||
|
||||
Health checks can be muted so that they do not affect the overall
|
||||
reported status of the cluster. Alerts are specified using the health
|
||||
check code (see :ref:`health-checks`):
|
||||
Health checks can be muted so that they have no effect on the overall
|
||||
reported status of the cluster. For example, if the cluster has raised a
|
||||
single health check and you then mute it, the cluster will report a status of ``HEALTH_OK``.
|
||||
To mute a specific health check, use the health check code that corresponds to that health check (see :ref:`health-checks`), and
|
||||
run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph health mute <code>
|
||||
|
||||
For example, if there is a health warning, muting it will make the
|
||||
cluster report an overall status of ``HEALTH_OK``. For example, to
|
||||
mute an ``OSD_DOWN`` alert,:
|
||||
For example, to mute an ``OSD_DOWN`` health check, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph health mute OSD_DOWN
|
||||
|
||||
Mutes are reported as part of the short and long form of the ``ceph health`` command.
|
||||
Mutes are reported as part of the short and long form of the ``ceph health`` command's output.
|
||||
For example, in the above scenario, the cluster would report:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -327,7 +337,7 @@ For example, in the above scenario, the cluster would report:
|
||||
(MUTED) OSD_DOWN 1 osds down
|
||||
osd.1 is down
|
||||
|
||||
A mute can be explicitly removed with:
|
||||
A mute can be removed by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -339,56 +349,50 @@ For example:
|
||||
|
||||
ceph health unmute OSD_DOWN
|
||||
|
||||
A health check mute may optionally have a TTL (time to live)
|
||||
associated with it, such that the mute will automatically expire
|
||||
after the specified period of time has elapsed. The TTL is specified as an optional
|
||||
duration argument, e.g.:
|
||||
A "health mute" can have a TTL (**T**\ime **T**\o **L**\ive)
|
||||
associated with it: this means that the mute will automatically expire
|
||||
after a specified period of time. The TTL is specified as an optional
|
||||
duration argument, as seen in the following examples:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph health mute OSD_DOWN 4h # mute for 4 hours
|
||||
ceph health mute MON_DOWN 15m # mute for 15 minutes
|
||||
ceph health mute MON_DOWN 15m # mute for 15 minutes
|
||||
|
||||
Normally, if a muted health alert is resolved (e.g., in the example
|
||||
above, the OSD comes back up), the mute goes away. If the alert comes
|
||||
Normally, if a muted health check is resolved (for example, if the OSD that raised the ``OSD_DOWN`` health check
|
||||
in the example above has come back up), the mute goes away. If the health check comes
|
||||
back later, it will be reported in the usual way.
|
||||
|
||||
It is possible to make a mute "sticky" such that the mute will remain even if the
|
||||
alert clears. For example:
|
||||
It is possible to make a health mute "sticky": this means that the mute will remain even if the
|
||||
health check clears. For example, to make a health mute "sticky", you might run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph health mute OSD_DOWN 1h --sticky # ignore any/all down OSDs for next hour
|
||||
|
||||
Most health mutes also disappear if the extent of an alert gets worse. For example,
|
||||
if there is one OSD down, and the alert is muted, the mute will disappear if one
|
||||
or more additional OSDs go down. This is true for any health alert that involves
|
||||
a count indicating how much or how many of something is triggering the warning or
|
||||
error.
|
||||
Most health mutes disappear if the unhealthy condition that triggered the health check gets worse.
|
||||
For example, suppose that there is one OSD down and the health check is muted. In that case, if
|
||||
one or more additional OSDs go down, then the health mute disappears. This behavior occurs in any health check with a threshold value.
|
||||
|
||||
|
||||
Detecting configuration issues
|
||||
Detecting Configuration Issues
|
||||
==============================
|
||||
|
||||
In addition to the health checks that Ceph continuously runs on its
|
||||
own status, there are some configuration issues that may only be detected
|
||||
by an external tool.
|
||||
|
||||
Use the `ceph-medic`_ tool to run these additional checks on your Ceph
|
||||
cluster's configuration.
|
||||
Although Ceph continuously monitors itself, some configuration issues can be
|
||||
detected only with an external tool called ``ceph-medic``.
|
||||
|
||||
Checking a Cluster's Usage Stats
|
||||
================================
|
||||
|
||||
To check a cluster's data usage and data distribution among pools, you can
|
||||
use the ``df`` option. It is similar to Linux ``df``. Execute
|
||||
the following:
|
||||
To check a cluster's data usage and data distribution among pools, use the
|
||||
``df`` command. This option is similar to Linux's ``df`` command. Run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph df
|
||||
|
||||
The output of ``ceph df`` looks like this::
|
||||
The output of ``ceph df`` resembles the following::
|
||||
|
||||
CLASS SIZE AVAIL USED RAW USED %RAW USED
|
||||
ssd 202 GiB 200 GiB 2.0 GiB 2.0 GiB 1.00
|
||||
@ -401,10 +405,6 @@ The output of ``ceph df`` looks like this::
|
||||
cephfs.a.data 3 32 0 B 0 B 0 B 0 0 B 0 B 0 B 0 99 GiB N/A N/A 0 0 B 0 B
|
||||
test 4 32 22 MiB 22 MiB 50 KiB 248 19 MiB 19 MiB 50 KiB 0 297 GiB N/A N/A 248 0 B 0 B
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
- **CLASS:** for example, "ssd" or "hdd"
|
||||
- **SIZE:** The amount of storage capacity managed by the cluster.
|
||||
- **AVAIL:** The amount of free space available in the cluster.
|
||||
@ -644,4 +644,3 @@ directly to the host in question ).
|
||||
|
||||
.. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#viewing-a-configuration-at-runtime
|
||||
.. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity
|
||||
.. _ceph-medic: http://docs.ceph.com/ceph-medic/master/
|
||||
|
@ -6,50 +6,52 @@
|
||||
|
||||
|
||||
Running Ceph with systemd
|
||||
==========================
|
||||
=========================
|
||||
|
||||
For all distributions that support systemd (CentOS 7, Fedora, Debian
|
||||
Jessie 8 and later, SUSE), ceph daemons are now managed using native
|
||||
systemd files instead of the legacy sysvinit scripts. For example:
|
||||
In all distributions that support systemd (CentOS 7, Fedora, Debian
|
||||
Jessie 8 and later, and SUSE), systemd files (and NOT legacy SysVinit scripts)
|
||||
are used to manage Ceph daemons. Ceph daemons therefore behave like any other daemons
|
||||
that can be controlled by the ``systemctl`` command, as in the following examples:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl start ceph.target # start all daemons
|
||||
sudo systemctl status ceph-osd@12 # check status of osd.12
|
||||
|
||||
To list the Ceph systemd units on a node, execute:
|
||||
To list all of the Ceph systemd units on a node, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl status ceph\*.service ceph\*.target
|
||||
|
||||
Starting all Daemons
|
||||
|
||||
Starting all daemons
|
||||
--------------------
|
||||
|
||||
To start all daemons on a Ceph Node (irrespective of type), execute the
|
||||
following:
|
||||
To start all of the daemons on a Ceph node (regardless of their type), run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl start ceph.target
|
||||
|
||||
|
||||
Stopping all Daemons
|
||||
Stopping all daemons
|
||||
--------------------
|
||||
|
||||
To stop all daemons on a Ceph Node (irrespective of type), execute the
|
||||
following:
|
||||
To stop all of the daemons on a Ceph node (regardless of their type), run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl stop ceph\*.service ceph\*.target
|
||||
|
||||
|
||||
Starting all Daemons by Type
|
||||
Starting all daemons by type
|
||||
----------------------------
|
||||
|
||||
To start all daemons of a particular type on a Ceph Node, execute one of the
|
||||
following:
|
||||
To start all of the daemons of a particular type on a Ceph node, run one of the
|
||||
following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -58,24 +60,24 @@ following:
|
||||
sudo systemctl start ceph-mds.target
|
||||
|
||||
|
||||
Stopping all Daemons by Type
|
||||
Stopping all daemons by type
|
||||
----------------------------
|
||||
|
||||
To stop all daemons of a particular type on a Ceph Node, execute one of the
|
||||
following:
|
||||
To stop all of the daemons of a particular type on a Ceph node, run one of the
|
||||
following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl stop ceph-mon\*.service ceph-mon.target
|
||||
sudo systemctl stop ceph-osd\*.service ceph-osd.target
|
||||
sudo systemctl stop ceph-mon\*.service ceph-mon.target
|
||||
sudo systemctl stop ceph-mds\*.service ceph-mds.target
|
||||
|
||||
|
||||
Starting a Daemon
|
||||
Starting a daemon
|
||||
-----------------
|
||||
|
||||
To start a specific daemon instance on a Ceph Node, execute one of the
|
||||
following:
|
||||
To start a specific daemon instance on a Ceph node, run one of the
|
||||
following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -92,11 +94,11 @@ For example:
|
||||
sudo systemctl start ceph-mds@ceph-server
|
||||
|
||||
|
||||
Stopping a Daemon
|
||||
Stopping a daemon
|
||||
-----------------
|
||||
|
||||
To stop a specific daemon instance on a Ceph Node, execute one of the
|
||||
following:
|
||||
To stop a specific daemon instance on a Ceph node, run one of the
|
||||
following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -194,15 +196,14 @@ For example::
|
||||
|
||||
.. index:: sysvinit; operating a cluster
|
||||
|
||||
Running Ceph with sysvinit
|
||||
Running Ceph with SysVinit
|
||||
==========================
|
||||
|
||||
Each time you to **start**, **restart**, and **stop** Ceph daemons (or your
|
||||
entire cluster) you must specify at least one option and one command. You may
|
||||
also specify a daemon type or a daemon instance. ::
|
||||
|
||||
{commandline} [options] [commands] [daemons]
|
||||
Each time you start, restart, or stop Ceph daemons, you must specify at least one option and one command.
|
||||
Likewise, each time you start, restart, or stop your entire cluster, you must specify at least one option and one command.
|
||||
In both cases, you can also specify a daemon type or a daemon instance. ::
|
||||
|
||||
{commandline} [options] [commands] [daemons]
|
||||
|
||||
The ``ceph`` options include:
|
||||
|
||||
@ -213,12 +214,12 @@ The ``ceph`` options include:
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
| ``--valgrind`` | ``N/A`` | (Dev and QA only) Use `Valgrind`_ debugging. |
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
| ``--allhosts`` | ``-a`` | Execute on all nodes in ``ceph.conf.`` |
|
||||
| ``--allhosts`` | ``-a`` | Execute on all nodes listed in ``ceph.conf``. |
|
||||
| | | Otherwise, it only executes on ``localhost``. |
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
| ``--restart`` | ``N/A`` | Automatically restart daemon if it core dumps. |
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
| ``--norestart`` | ``N/A`` | Don't restart a daemon if it core dumps. |
|
||||
| ``--norestart`` | ``N/A`` | Do not restart a daemon if it core dumps. |
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
| ``--conf`` | ``-c`` | Use an alternate configuration file. |
|
||||
+-----------------+----------+-------------------------------------------------+
|
||||
@ -232,24 +233,21 @@ The ``ceph`` commands include:
|
||||
+------------------+------------------------------------------------------------+
|
||||
| ``stop`` | Stop the daemon(s). |
|
||||
+------------------+------------------------------------------------------------+
|
||||
| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9`` |
|
||||
| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9``. |
|
||||
+------------------+------------------------------------------------------------+
|
||||
| ``killall`` | Kill all daemons of a particular type. |
|
||||
| ``killall`` | Kill all daemons of a particular type. |
|
||||
+------------------+------------------------------------------------------------+
|
||||
| ``cleanlogs`` | Cleans out the log directory. |
|
||||
+------------------+------------------------------------------------------------+
|
||||
| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
|
||||
+------------------+------------------------------------------------------------+
|
||||
|
||||
For subsystem operations, the ``ceph`` service can target specific daemon types
|
||||
by adding a particular daemon type for the ``[daemons]`` option. Daemon types
|
||||
include:
|
||||
The ``[daemons]`` option allows the ``ceph`` service to target specific daemon types
|
||||
in order to perform subsystem operations. Daemon types include:
|
||||
|
||||
- ``mon``
|
||||
- ``osd``
|
||||
- ``mds``
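
For example, a hypothetical SysVinit invocation that starts all OSD daemons on
every node listed in ``ceph.conf`` might look like the following (the exact
path of the init script can vary by distribution):

.. prompt:: bash $

   sudo /etc/init.d/ceph -a start osd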
|
||||
|
||||
|
||||
|
||||
.. _Valgrind: http://www.valgrind.org/
|
||||
.. _initctl: http://manpages.ubuntu.com/manpages/raring/en/man8/initctl.8.html
|
||||
|
@ -1,59 +1,60 @@
|
||||
============================
|
||||
Repairing PG inconsistencies
|
||||
Repairing PG Inconsistencies
|
||||
============================
|
||||
Sometimes a placement group might become "inconsistent". To return the
|
||||
placement group to an active+clean state, you must first determine which
|
||||
of the placement groups has become inconsistent and then run the "pg
|
||||
repair" command on it. This page contains commands for diagnosing placement
|
||||
groups and the command for repairing placement groups that have become
|
||||
Sometimes a Placement Group (PG) might become ``inconsistent``. To return the PG
|
||||
to an ``active+clean`` state, you must first determine which of the PGs has become
|
||||
inconsistent and then run the ``pg repair`` command on it. This page contains
|
||||
commands for diagnosing PGs and the command for repairing PGs that have become
|
||||
inconsistent.
|
||||
|
||||
.. highlight:: console
|
||||
|
||||
Commands for Diagnosing Placement-group Problems
|
||||
================================================
|
||||
The commands in this section provide various ways of diagnosing broken placement groups.
|
||||
Commands for Diagnosing PG Problems
|
||||
===================================
|
||||
The commands in this section provide various ways of diagnosing broken PGs.
|
||||
|
||||
The following command provides a high-level (low detail) overview of the health of the ceph cluster:
|
||||
To see a high-level (low-detail) overview of Ceph cluster health, run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph health detail
|
||||
|
||||
The following command provides more detail on the status of the placement groups:
|
||||
To see more detail on the status of the PGs, run the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph pg dump --format=json-pretty
|
||||
|
||||
The following command lists inconsistent placement groups:
|
||||
To see a list of inconsistent PGs, run the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
rados list-inconsistent-pg {pool}
|
||||
|
||||
The following command lists inconsistent rados objects:
|
||||
To see a list of inconsistent RADOS objects, run the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
rados list-inconsistent-obj {pgid}
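
For example, to list the inconsistent objects in PG ``1.4`` with pretty-printed
JSON output (``1.4`` is a placeholder PG ID), one might run:

.. prompt:: bash #

   rados list-inconsistent-obj 1.4 --format=json-pretty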
|
||||
|
||||
The following command lists inconsistent snapsets in the given placement group:
|
||||
To see a list of inconsistent snapsets in a specific PG, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
rados list-inconsistent-snapset {pgid}
|
||||
|
||||
|
||||
Commands for Repairing Placement Groups
|
||||
=======================================
|
||||
The form of the command to repair a broken placement group is:
|
||||
Commands for Repairing PGs
|
||||
==========================
|
||||
The form of the command to repair a broken PG is as follows:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph pg repair {pgid}
|
||||
|
||||
Where ``{pgid}`` is the id of the affected placement group.
|
||||
Here ``{pgid}`` represents the id of the affected PG.
|
||||
|
||||
For example:
|
||||
|
||||
@ -61,21 +62,57 @@ For example:
|
||||
|
||||
ceph pg repair 1.4
|
||||
|
||||
More Information on Placement Group Repair
|
||||
==========================================
|
||||
Ceph stores and updates the checksums of objects stored in the cluster. When a scrub is performed on a placement group, the OSD attempts to choose an authoritative copy from among its replicas. Among all of the possible cases, only one case is consistent. After a deep scrub, Ceph calculates the checksum of an object read from the disk and compares it to the checksum previously recorded. If the current checksum and the previously recorded checksums do not match, that is an inconsistency. In the case of replicated pools, any mismatch between the checksum of any replica of an object and the checksum of the authoritative copy means that there is an inconsistency.
|
||||
.. note:: PG IDs have the form ``N.xxxxx``, where ``N`` is the number of the
|
||||
pool that contains the PG. The command ``ceph osd lspools`` and the
|
||||
command ``ceph osd dump | grep pool`` return a list of pool numbers.
|
||||
|
||||
The "pg repair" command attempts to fix inconsistencies of various kinds. If "pg repair" finds an inconsistent placement group, it attempts to overwrite the digest of the inconsistent copy with the digest of the authoritative copy. If "pg repair" finds an inconsistent replicated pool, it marks the inconsistent copy as missing. Recovery, in the case of replicated pools, is beyond the scope of "pg repair".
|
||||
More Information on PG Repair
|
||||
=============================
|
||||
Ceph stores and updates the checksums of objects stored in the cluster. When a
|
||||
scrub is performed on a PG, the OSD attempts to choose an authoritative copy
|
||||
from among its replicas. Only one of the possible cases is consistent. After
|
||||
performing a deep scrub, Ceph calculates the checksum of an object that is read
|
||||
from disk and compares it to the checksum that was previously recorded. If the
|
||||
current checksum and the previously recorded checksum do not match, that
|
||||
mismatch is considered to be an inconsistency. In the case of replicated pools,
|
||||
any mismatch between the checksum of any replica of an object and the checksum
|
||||
of the authoritative copy means that there is an inconsistency. The discovery
|
||||
of these inconsistencies causes a PG's state to be set to ``inconsistent``.
|
||||
|
||||
For erasure coded and bluestore pools, Ceph will automatically repair if osd_scrub_auto_repair (configuration default "false") is set to true and at most osd_scrub_auto_repair_num_errors (configuration default 5) errors are found.
|
||||
The ``pg repair`` command attempts to fix inconsistencies of various kinds. If
|
||||
``pg repair`` finds an inconsistent PG, it attempts to overwrite the digest of
|
||||
the inconsistent copy with the digest of the authoritative copy. If ``pg
|
||||
repair`` finds an inconsistent replicated pool, it marks the inconsistent copy
|
||||
as missing. In the case of replicated pools, recovery is beyond the scope of
|
||||
``pg repair``.
|
||||
|
||||
"pg repair" will not solve every problem. Ceph does not automatically repair placement groups when inconsistencies are found in them.
|
||||
In the case of erasure-coded and BlueStore pools, Ceph will automatically
|
||||
perform repairs if ``osd_scrub_auto_repair`` (default ``false``) is set to
|
||||
``true`` and if no more than ``osd_scrub_auto_repair_num_errors`` (default
|
||||
``5``) errors are found.
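
As a sketch of how these options might be adjusted at runtime (the values shown
here are examples, not recommendations), the ``ceph config`` interface can be
used:

.. prompt:: bash #

   ceph config set osd osd_scrub_auto_repair true
   ceph config set osd osd_scrub_auto_repair_num_errors 5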
|
||||
|
||||
The checksum of an object or an omap is not always available. Checksums are calculated incrementally. If a replicated object is updated non-sequentially, the write operation involved in the update changes the object and invalidates its checksum. The whole object is not read while recalculating the checksum. "ceph pg repair" is able to repair things even when checksums are not available to it, as in the case of filestore. When replicated filestore pools are in question, users might prefer manual repair to "ceph pg repair".
|
||||
The ``pg repair`` command will not solve every problem. Ceph does not
|
||||
automatically repair PGs when they are found to contain inconsistencies.
|
||||
|
||||
The material in this paragraph is relevant for filestore, and bluestore has its own internal checksums. The matched-record checksum and the calculated checksum cannot prove that the authoritative copy is in fact authoritative. In the case that there is no checksum available, "pg repair" favors the data on the primary. this might or might not be the uncorrupted replica. This is why human intervention is necessary when an inconsistency is discovered. Human intervention sometimes means using the "ceph-objectstore-tool".
|
||||
The checksum of a RADOS object or an omap is not always available. Checksums
|
||||
are calculated incrementally. If a replicated object is updated
|
||||
non-sequentially, the write operation involved in the update changes the object
|
||||
and invalidates its checksum. The whole object is not read while the checksum
|
||||
is recalculated. The ``pg repair`` command is able to make repairs even when
|
||||
checksums are not available to it, as in the case of Filestore. Users working
|
||||
with replicated Filestore pools might prefer manual repair to ``ceph pg
|
||||
repair``.
|
||||
|
||||
This material is relevant for Filestore, but not for BlueStore, which has its
|
||||
own internal checksums. The matched-record checksum and the calculated checksum
|
||||
cannot prove that any specific copy is in fact authoritative. If there is no
|
||||
checksum available, ``pg repair`` favors the data on the primary, but this
|
||||
might not be the uncorrupted replica. Because of this uncertainty, human
|
||||
intervention is necessary when an inconsistency is discovered. This
|
||||
intervention sometimes involves use of ``ceph-objectstore-tool``.
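
As a minimal sketch of such intervention, ``ceph-objectstore-tool`` can be used
to inspect the objects stored on a single OSD. The data path below is an
example; adjust it to the OSD in question, and stop that OSD before running the
tool:

.. prompt:: bash #

   ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 --op list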
|
||||
|
||||
External Links
|
||||
==============
|
||||
https://ceph.io/geen-categorie/ceph-manually-repair-object/ - This page contains a walkthrough of the repair of a placement group, and is recommended reading if you want to repair a placement
|
||||
group but have never done so.
|
||||
https://ceph.io/geen-categorie/ceph-manually-repair-object/ - This page
|
||||
contains a walkthrough of the repair of a PG. It is recommended reading if you
|
||||
want to repair a PG but have never done so.
|
||||
|
@ -1,52 +1,56 @@
|
||||
.. _upmap:
|
||||
|
||||
Using the pg-upmap
|
||||
==================
|
||||
Using pg-upmap
|
||||
==============
|
||||
|
||||
Starting in Luminous v12.2.z there is a new *pg-upmap* exception table
|
||||
In Luminous v12.2.z and later releases, there is a *pg-upmap* exception table
|
||||
in the OSDMap that allows the cluster to explicitly map specific PGs to
|
||||
specific OSDs. This allows the cluster to fine-tune the data
|
||||
distribution to, in most cases, perfectly distributed PGs across OSDs.
|
||||
specific OSDs. This allows the cluster to fine-tune the data distribution to,
|
||||
in most cases, uniformly distribute PGs across OSDs.
|
||||
|
||||
The key caveat to this new mechanism is that it requires that all
|
||||
clients understand the new *pg-upmap* structure in the OSDMap.
|
||||
However, there is an important caveat when it comes to this new feature: it
|
||||
requires all clients to understand the new *pg-upmap* structure in the OSDMap.
|
||||
|
||||
Enabling
|
||||
--------
|
||||
|
||||
New clusters will have this module on by default. The cluster must only
|
||||
have luminous (and newer) clients. You can the turn the balancer off with:
|
||||
In order to use ``pg-upmap``, the cluster cannot have any pre-Luminous clients.
|
||||
By default, new clusters enable the *balancer module*, which makes use of
|
||||
``pg-upmap``. If you want to use a different balancer or you want to make your
|
||||
own custom ``pg-upmap`` entries, you might want to turn off the balancer in
|
||||
order to avoid conflict:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer off
|
||||
|
||||
To allow use of the feature on existing clusters, you must tell the
|
||||
cluster that it only needs to support luminous (and newer) clients with:
|
||||
To allow use of the new feature on an existing cluster, you must restrict the
|
||||
cluster to supporting only Luminous (and newer) clients. To do so, run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd set-require-min-compat-client luminous
|
||||
|
||||
This command will fail if any pre-luminous clients or daemons are
|
||||
connected to the monitors. You can see what client versions are in
|
||||
use with:
|
||||
This command will fail if any pre-Luminous clients or daemons are connected to
|
||||
the monitors. To see which client versions are in use, run the following
|
||||
command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph features
|
||||
|
||||
Balancer module
|
||||
-----------------
|
||||
|
||||
The `balancer` module for ceph-mgr will automatically balance
|
||||
the number of PGs per OSD. See :ref:`balancer`
|
||||
---------------
|
||||
|
||||
The `balancer` module for ``ceph-mgr`` will automatically balance the number of
|
||||
PGs per OSD. See :ref:`balancer`.
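
As a brief illustration (assuming that the cluster already requires Luminous or
newer clients), the balancer can be switched to upmap mode and enabled with the
following commands:

.. prompt:: bash $

   ceph balancer mode upmap
   ceph balancer on
   ceph balancer status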
|
||||
|
||||
Offline optimization
|
||||
--------------------
|
||||
|
||||
Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
|
||||
Upmap entries are updated with an offline optimizer that is built into
|
||||
``osdmaptool``.
|
||||
|
||||
#. Grab the latest copy of your osdmap:
|
||||
|
||||
@ -64,27 +68,28 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
|
||||
[--upmap-active]
|
||||
|
||||
It is highly recommended that optimization be done for each pool
|
||||
individually, or for sets of similarly-utilized pools. You can
|
||||
specify the ``--upmap-pool`` option multiple times. "Similar pools"
|
||||
means pools that are mapped to the same devices and store the same
|
||||
kind of data (e.g., RBD image pools, yes; RGW index pool and RGW
|
||||
data pool, no).
|
||||
individually, or for sets of similarly utilized pools. You can specify the
|
||||
``--upmap-pool`` option multiple times. "Similarly utilized pools" means
|
||||
pools that are mapped to the same devices and that store the same kind of
|
||||
data (for example, RBD image pools are considered to be similarly utilized;
|
||||
an RGW index pool and an RGW data pool are not considered to be similarly
|
||||
utilized).
|
||||
|
||||
The ``max-optimizations`` value is the maximum number of upmap entries to
|
||||
identify in the run. The default is `10` like the ceph-mgr balancer module,
|
||||
but you should use a larger number if you are doing offline optimization.
|
||||
If it cannot find any additional changes to make it will stop early
|
||||
(i.e., when the pool distribution is perfect).
|
||||
The ``max-optimizations`` value determines the maximum number of upmap
|
||||
entries to identify. The default is `10` (as is the case with the
|
||||
``ceph-mgr`` balancer module), but you should use a larger number if you are
|
||||
doing offline optimization. If it cannot find any additional changes to
|
||||
make (that is, if the pool distribution is perfect), it will stop early.
|
||||
|
||||
The ``max-deviation`` value defaults to `5`. If an OSD PG count
|
||||
varies from the computed target number by less than or equal
|
||||
to this amount it will be considered perfect.
|
||||
The ``max-deviation`` value defaults to `5`. If an OSD's PG count varies
|
||||
from the computed target number by no more than this amount, it will be
|
||||
considered perfect.
|
||||
|
||||
The ``--upmap-active`` option simulates the behavior of the active
|
||||
balancer in upmap mode. It keeps cycling until the OSDs are balanced
|
||||
and reports how many rounds and how long each round is taking. The
|
||||
elapsed time for rounds indicates the CPU load ceph-mgr will be
|
||||
consuming when it tries to compute the next optimization plan.
|
||||
The ``--upmap-active`` option simulates the behavior of the active balancer
|
||||
in upmap mode. It keeps cycling until the OSDs are balanced and reports how
|
||||
many rounds have occurred and how long each round takes. The elapsed time
|
||||
for rounds indicates the CPU load that ``ceph-mgr`` consumes when it computes
|
||||
the next optimization plan.
|
||||
|
||||
#. Apply the changes:
|
||||
|
||||
@ -92,14 +97,13 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
|
||||
|
||||
source out.txt
|
||||
|
||||
The proposed changes are written to the output file ``out.txt`` in
|
||||
the example above. These are normal ceph CLI commands that can be
|
||||
run to apply the changes to the cluster.
|
||||
In the above example, the proposed changes are written to the output file
|
||||
``out.txt``. The commands in this procedure are normal Ceph CLI commands
|
||||
that can be run in order to apply the changes to the cluster.
|
||||
|
||||
The above steps can be repeated as many times as necessary to achieve a perfect
|
||||
distribution of PGs for each set of pools.
|
||||
|
||||
The above steps can be repeated as many times as necessary to achieve
|
||||
a perfect distribution of PGs for each set of pools.
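
Putting these steps together, a hypothetical offline optimization run for a
single pool named ``rbd`` might look like the following (the pool name and the
option values are examples only):

.. prompt:: bash $

   ceph osd getmap -o om
   osdmaptool om --upmap out.txt --upmap-pool rbd --upmap-max 100 --upmap-deviation 2
   source out.txt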
|
||||
|
||||
You can see some (gory) details about what the tool is doing by
|
||||
passing ``--debug-osd 10`` and even more with ``--debug-crush 10``
|
||||
to ``osdmaptool``.
|
||||
To see some (gory) details about what the tool is doing, you can pass
|
||||
``--debug-osd 10`` to ``osdmaptool``. To see even more details, pass
|
||||
``--debug-crush 10`` to ``osdmaptool``.
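
For example, a hypothetical debugging run that combines both options might look
like this:

.. prompt:: bash $

   osdmaptool om --upmap out.txt --debug-osd 10 --debug-crush 10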
|
||||
|
@ -4,10 +4,11 @@
|
||||
User Management
|
||||
=================
|
||||
|
||||
This document describes :term:`Ceph Client` users, and their authentication and
|
||||
authorization with the :term:`Ceph Storage Cluster`. Users are either
|
||||
individuals or system actors such as applications, which use Ceph clients to
|
||||
interact with the Ceph Storage Cluster daemons.
|
||||
This document describes :term:`Ceph Client` users, and describes the process by
|
||||
which they perform authentication and authorization so that they can access the
|
||||
:term:`Ceph Storage Cluster`. Users are either individuals or system actors
|
||||
(for example, applications) that use Ceph clients to interact with the Ceph
|
||||
Storage Cluster daemons.
|
||||
|
||||
.. ditaa::
|
||||
+-----+
|
||||
@ -24,19 +25,21 @@ interact with the Ceph Storage Cluster daemons.
|
||||
actor
|
||||
|
||||
|
||||
When Ceph runs with authentication and authorization enabled (enabled by
|
||||
default), you must specify a user name and a keyring containing the secret key
|
||||
of the specified user (usually via the command line). If you do not specify a
|
||||
user name, Ceph will use ``client.admin`` as the default user name. If you do
|
||||
not specify a keyring, Ceph will look for a keyring via the ``keyring`` setting
|
||||
in the Ceph configuration. For example, if you execute the ``ceph health``
|
||||
command without specifying a user or keyring:
|
||||
When Ceph runs with authentication and authorization enabled (both are enabled
|
||||
by default), you must specify a user name and a keyring that contains the
|
||||
secret key of the specified user (usually these are specified via the command
|
||||
line). If you do not specify a user name, Ceph will use ``client.admin`` as the
|
||||
default user name. If you do not specify a keyring, Ceph will look for a
|
||||
keyring via the ``keyring`` setting in the Ceph configuration. For example, if
|
||||
you execute the ``ceph health`` command without specifying a user or a keyring,
|
||||
Ceph will assume that the keyring is in ``/etc/ceph/ceph.client.admin.keyring``
|
||||
and will attempt to use that keyring. The following illustrates this behavior:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph health
|
||||
|
||||
Ceph interprets the command like this:
|
||||
Ceph will interpret the command like this:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -45,118 +48,122 @@ Ceph interprets the command like this:
|
||||
Alternatively, you may use the ``CEPH_ARGS`` environment variable to avoid
|
||||
re-entry of the user name and secret.
|
||||
|
||||
For details on configuring the Ceph Storage Cluster to use authentication,
|
||||
see `Cephx Config Reference`_. For details on the architecture of Cephx, see
|
||||
For details on configuring the Ceph Storage Cluster to use authentication, see
|
||||
`Cephx Config Reference`_. For details on the architecture of Cephx, see
|
||||
`Architecture - High Availability Authentication`_.
|
||||
|
||||
Background
|
||||
==========
|
||||
|
||||
Irrespective of the type of Ceph client (e.g., Block Device, Object Storage,
|
||||
Filesystem, native API, etc.), Ceph stores all data as objects within `pools`_.
|
||||
Ceph users must have access to pools in order to read and write data.
|
||||
Additionally, Ceph users must have execute permissions to use Ceph's
|
||||
administrative commands. The following concepts will help you understand Ceph
|
||||
user management.
|
||||
No matter what type of Ceph client is used (for example: Block Device, Object
|
||||
Storage, Filesystem, native API), Ceph stores all data as RADOS objects within
|
||||
`pools`_. Ceph users must have access to a given pool in order to read and
|
||||
write data, and Ceph users must have execute permissions in order to use Ceph's
|
||||
administrative commands. The following concepts will help you understand
|
||||
Ceph's user management.
|
||||
|
||||
.. _rados-ops-user:
|
||||
|
||||
User
|
||||
----
|
||||
|
||||
A user is either an individual or a system actor such as an application.
|
||||
A user is either an individual or a system actor (for example, an application).
|
||||
Creating users allows you to control who (or what) can access your Ceph Storage
|
||||
Cluster, its pools, and the data within pools.
|
||||
Cluster, its pools, and the data within those pools.
|
||||
|
||||
Ceph has the notion of a ``type`` of user. For the purposes of user management,
|
||||
the type will always be ``client``. Ceph identifies users in period (.)
|
||||
delimited form consisting of the user type and the user ID: for example,
|
||||
Ceph has the concept of a ``type`` of user. For purposes of user management,
|
||||
the type will always be ``client``. Ceph identifies users in a "period-
|
||||
delimited form" that consists of the user type and the user ID: for example,
|
||||
``TYPE.ID``, ``client.admin``, or ``client.user1``. The reason for user typing
|
||||
is that Ceph Monitors, OSDs, and Metadata Servers also use the Cephx protocol,
|
||||
but they are not clients. Distinguishing the user type helps to distinguish
|
||||
between client users and other users--streamlining access control, user
|
||||
monitoring and traceability.
|
||||
is that the Cephx protocol is used not only by clients but also non-clients,
|
||||
such as Ceph Monitors, OSDs, and Metadata Servers. Distinguishing the user type
|
||||
helps to distinguish between client users and other users. This distinction
|
||||
streamlines access control, user monitoring, and traceability.
|
||||
|
||||
Sometimes Ceph's user type may seem confusing, because the Ceph command line
|
||||
Sometimes Ceph's user type might seem confusing, because the Ceph command line
|
||||
allows you to specify a user with or without the type, depending upon your
|
||||
command line usage. If you specify ``--user`` or ``--id``, you can omit the
|
||||
type. So ``client.user1`` can be entered simply as ``user1``. If you specify
|
||||
``--name`` or ``-n``, you must specify the type and name, such as
|
||||
``client.user1``. We recommend using the type and name as a best practice
|
||||
wherever possible.
|
||||
type. For example, ``client.user1`` can be entered simply as ``user1``. On the
|
||||
other hand, if you specify ``--name`` or ``-n``, you must supply the type and
|
||||
name: for example, ``client.user1``. We recommend using the type and name as a
|
||||
best practice wherever possible.
|
||||
|
||||
.. note:: A Ceph Storage Cluster user is not the same as a Ceph Object Storage
|
||||
user or a Ceph File System user. The Ceph Object Gateway uses a Ceph Storage
|
||||
Cluster user to communicate between the gateway daemon and the storage
|
||||
cluster, but the gateway has its own user management functionality for end
|
||||
users. The Ceph File System uses POSIX semantics. The user space associated
|
||||
with the Ceph File System is not the same as a Ceph Storage Cluster user.
|
||||
|
||||
|
||||
cluster, but the Ceph Object Gateway has its own user-management
|
||||
functionality for end users. The Ceph File System uses POSIX semantics, and
|
||||
the user space associated with the Ceph File System is not the same as the
|
||||
user space associated with a Ceph Storage Cluster user.
|
||||
|
||||
Authorization (Capabilities)
|
||||
----------------------------
|
||||
|
||||
Ceph uses the term "capabilities" (caps) to describe authorizing an
|
||||
authenticated user to exercise the functionality of the monitors, OSDs and
|
||||
Ceph uses the term "capabilities" (caps) to describe the permissions granted to
|
||||
an authenticated user to exercise the functionality of the monitors, OSDs, and
|
||||
metadata servers. Capabilities can also restrict access to data within a pool,
|
||||
a namespace within a pool, or a set of pools based on their application tags.
|
||||
A Ceph administrative user sets a user's capabilities when creating or updating
|
||||
a user.
|
||||
A Ceph administrative user specifies the capabilities of a user when creating
|
||||
or updating that user.
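
For example, capabilities are supplied on the command line when a user is
created. The following hypothetical command (the user name and pool name are
examples only) creates a user that can read from the monitors and read and
write data in a single pool:

.. prompt:: bash $

   ceph auth get-or-create client.user1 mon 'allow r' osd 'allow rw pool=mypool'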
|
||||
|
||||
Capability syntax follows the form::
|
||||
Capability syntax follows this form::
|
||||
|
||||
{daemon-type} '{cap-spec}[, {cap-spec} ...]'
|
||||
{daemon-type} '{cap-spec}[, {cap-spec} ...]'
|
||||
|
||||
- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` access
|
||||
settings or ``profile {name}``. For example::
|
||||
settings, and can be applied in aggregate from pre-defined profiles with
|
||||
``profile {name}``. For example::
|
||||
|
||||
mon 'allow {access-spec} [network {network/prefix}]'
|
||||
mon 'allow {access-spec} [network {network/prefix}]'
|
||||
|
||||
mon 'profile {name}'
|
||||
mon 'profile {name}'
|
||||
|
||||
The ``{access-spec}`` syntax is as follows: ::
|
||||
|
||||
* | all | [r][w][x]
|
||||
|
||||
The optional ``{network/prefix}`` is a standard network name and
|
||||
prefix length in CIDR notation (e.g., ``10.3.0.0/16``). If present,
|
||||
the use of this capability is restricted to clients connecting from
|
||||
this network.
|
||||
The optional ``{network/prefix}`` is a standard network name and prefix
|
||||
length in CIDR notation (for example, ``10.3.0.0/16``). If
|
||||
``{network/prefix}`` is present, the monitor capability can be used only by
|
||||
clients that connect from the specified network.
|
||||
|
||||
- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``,
|
||||
``class-write`` access settings or ``profile {name}``. Additionally, OSD
|
||||
capabilities also allow for pool and namespace settings. ::
|
||||
- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, and
|
||||
``class-read`` and ``class-write`` access settings. OSD capabilities can be
|
||||
applied in aggregate from pre-defined profiles with ``profile {name}``. In
|
||||
addition, OSD capabilities allow for pool and namespace settings. ::
|
||||
|
||||
osd 'allow {access-spec} [{match-spec}] [network {network/prefix}]'
|
||||
osd 'allow {access-spec} [{match-spec}] [network {network/prefix}]'
|
||||
|
||||
osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]] [network {network/prefix}]'
|
||||
osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]] [network {network/prefix}]'
|
||||
|
||||
The ``{access-spec}`` syntax is either of the following: ::
|
||||
There are two alternative forms of the ``{access-spec}`` syntax: ::
|
||||
|
||||
* | all | [r][w][x] [class-read] [class-write]
|
||||
|
||||
class {class name} [{method name}]
|
||||
|
||||
The optional ``{match-spec}`` syntax is either of the following: ::
|
||||
There are two alternative forms of the optional ``{match-spec}`` syntax::
|
||||
|
||||
pool={pool-name} [namespace={namespace-name}] [object_prefix {prefix}]
|
||||
|
||||
[namespace={namespace-name}] tag {application} {key}={value}
|
||||
|
||||
The optional ``{network/prefix}`` is a standard network name and
|
||||
prefix length in CIDR notation (e.g., ``10.3.0.0/16``). If present,
|
||||
the use of this capability is restricted to clients connecting from
|
||||
this network.
|
||||
The optional ``{network/prefix}`` is a standard network name and prefix
|
||||
length in CIDR notation (for example, ``10.3.0.0/16``). If
|
||||
``{network/prefix}`` is present, the OSD capability can be used only by
|
||||
clients that connect from the specified network.
|
||||
|
||||
- **Manager Caps:** Manager (``ceph-mgr``) capabilities include
|
||||
``r``, ``w``, ``x`` access settings or ``profile {name}``. For example: ::
|
||||
- **Manager Caps:** Manager (``ceph-mgr``) capabilities include ``r``, ``w``,
|
||||
``x`` access settings, and can be applied in aggregate from pre-defined
|
||||
profiles with ``profile {name}``. For example::
|
||||
|
||||
mgr 'allow {access-spec} [network {network/prefix}]'
|
||||
mgr 'allow {access-spec} [network {network/prefix}]'
|
||||
|
||||
mgr 'profile {name} [{key1} {match-type} {value1} ...] [network {network/prefix}]'
|
||||
mgr 'profile {name} [{key1} {match-type} {value1} ...] [network {network/prefix}]'
|
||||
|
||||
Manager capabilities can also be specified for specific commands,
|
||||
all commands exported by a built-in manager service, or all commands
|
||||
exported by a specific add-on module. For example: ::
|
||||
Manager capabilities can also be specified for specific commands, for all
|
||||
commands exported by a built-in manager service, or for all commands exported
|
||||
by a specific add-on module. For example::
|
||||
|
||||
mgr 'allow command "{command-prefix}" [with {key1} {match-type} {value1} ...] [network {network/prefix}]'
|
||||
|
||||
@ -176,15 +183,14 @@ Capability syntax follows the form::
|
||||
|
||||
= | prefix | regex
|
||||
|
||||
- **Metadata Server Caps:** For administrators, use ``allow *``. For all
|
||||
other users, such as CephFS clients, consult :doc:`/cephfs/client-auth`
|
||||
|
||||
- **Metadata Server Caps:** For administrators, use ``allow *``. For all other
|
||||
users (for example, CephFS clients), consult :doc:`/cephfs/client-auth`
|
||||
|
||||
.. note:: The Ceph Object Gateway daemon (``radosgw``) is a client of the
|
||||
Ceph Storage Cluster, so it is not represented as a Ceph Storage
|
||||
Cluster daemon type.
|
||||
Ceph Storage Cluster. For this reason, it is not represented as
|
||||
a Ceph Storage Cluster daemon type.
|
||||
|
||||
The following entries describe each access capability.
|
||||
The following entries describe access capabilities.
|
||||
|
||||
``allow``
|
||||
|
||||
@ -206,7 +212,7 @@ The following entries describe each access capability.
|
||||
``x``
|
||||
|
||||
:Description: Gives the user the capability to call class methods
|
||||
(i.e., both read and write) and to conduct ``auth``
|
||||
(that is, both read and write) and to conduct ``auth``
|
||||
operations on monitors.
|
||||
|
||||
|
||||
@ -224,75 +230,76 @@ The following entries describe each access capability.
|
||||
|
||||
``*``, ``all``
|
||||
|
||||
:Description: Gives the user read, write and execute permissions for a
|
||||
particular daemon/pool, and the ability to execute
|
||||
:Description: Gives the user read, write, and execute permissions for a
|
||||
particular daemon/pool, as well as the ability to execute
|
||||
admin commands.
|
||||
|
||||
|
||||
The following entries describe valid capability profiles:
|
||||
|
||||
``profile osd`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to connect as an OSD to other OSDs or
|
||||
monitors. Conferred on OSDs to enable OSDs to handle replication
|
||||
monitors. Conferred on OSDs in order to enable OSDs to handle replication
|
||||
heartbeat traffic and status reporting.
|
||||
|
||||
|
||||
``profile mds`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to connect as a MDS to other MDSs or
|
||||
:Description: Gives a user permissions to connect as an MDS to other MDSs or
|
||||
monitors.
|
||||
|
||||
|
||||
``profile bootstrap-osd`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to bootstrap an OSD. Conferred on
|
||||
deployment tools such as ``ceph-volume``, ``cephadm``, etc.
|
||||
so that they have permissions to add keys, etc. when
|
||||
deployment tools such as ``ceph-volume`` and ``cephadm``
|
||||
so that they have permissions to add keys when
|
||||
bootstrapping an OSD.
|
||||
|
||||
|
||||
``profile bootstrap-mds`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to bootstrap a metadata server.
|
||||
Conferred on deployment tools such as ``cephadm``, etc.
|
||||
so they have permissions to add keys, etc. when bootstrapping
|
||||
Conferred on deployment tools such as ``cephadm``
|
||||
so that they have permissions to add keys when bootstrapping
|
||||
a metadata server.
|
||||
|
||||
``profile bootstrap-rbd`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to bootstrap an RBD user.
|
||||
Conferred on deployment tools such as ``cephadm``, etc.
|
||||
so they have permissions to add keys, etc. when bootstrapping
|
||||
Conferred on deployment tools such as ``cephadm``
|
||||
so that they have permissions to add keys when bootstrapping
|
||||
an RBD user.
|
||||
|
||||
``profile bootstrap-rbd-mirror`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to bootstrap an ``rbd-mirror`` daemon
|
||||
user. Conferred on deployment tools such as ``cephadm``, etc.
|
||||
so they have permissions to add keys, etc. when bootstrapping
|
||||
an ``rbd-mirror`` daemon.
|
||||
user. Conferred on deployment tools such as ``cephadm`` so that
|
||||
they have permissions to add keys when bootstrapping an
|
||||
``rbd-mirror`` daemon.
|
||||
|
||||
``profile rbd`` (Manager, Monitor, and OSD)
|
||||
|
||||
:Description: Gives a user permissions to manipulate RBD images. When used
|
||||
as a Monitor cap, it provides the minimal privileges required
|
||||
by an RBD client application; this includes the ability
|
||||
to blocklist other client users. When used as an OSD cap, it
|
||||
provides read-write access to the specified pool to an
|
||||
RBD client application. The Manager cap supports optional
|
||||
``pool`` and ``namespace`` keyword arguments.
|
||||
:Description: Gives a user permissions to manipulate RBD images. When used as a
|
||||
Monitor cap, it provides the user with the minimal privileges
|
||||
required by an RBD client application; such privileges include
|
||||
the ability to blocklist other client users. When used as an OSD
|
||||
cap, it provides an RBD client application with read-write access
|
||||
to the specified pool. The Manager cap supports optional ``pool``
|
||||
and ``namespace`` keyword arguments.
|
||||
|
||||
``profile rbd-mirror`` (Monitor only)
|
||||
|
||||
:Description: Gives a user permissions to manipulate RBD images and retrieve
|
||||
RBD mirroring config-key secrets. It provides the minimal
|
||||
privileges required for the ``rbd-mirror`` daemon.
|
||||
privileges required for the user to manipulate the ``rbd-mirror``
|
||||
daemon.
|
||||
|
||||
``profile rbd-read-only`` (Manager and OSD)
|
||||
|
||||
:Description: Gives a user read-only permissions to RBD images. The Manager
|
||||
cap supports optional ``pool`` and ``namespace`` keyword
|
||||
arguments.
|
||||
:Description: Gives a user read-only permissions to RBD images. The Manager cap
|
||||
supports optional ``pool`` and ``namespace`` keyword arguments.
|
||||
|
||||
``profile simple-rados-client`` (Monitor only)
|
||||
|
||||
@ -303,27 +310,27 @@ The following entries describe valid capability profiles:
|
||||
|
||||
:Description: Gives a user read-only permissions for monitor, OSD, and PG data.
|
||||
Intended for use by direct librados client applications. Also
|
||||
includes permission to add blocklist entries to build HA
|
||||
applications.
|
||||
includes permissions to add blocklist entries to build
|
||||
high-availability (HA) applications.
|
||||
|
||||
``profile fs-client`` (Monitor only)
|
||||
|
||||
:Description: Gives a user read-only permissions for monitor, OSD, PG, and MDS
|
||||
data. Intended for CephFS clients.
|
||||
data. Intended for CephFS clients.
|
||||
|
||||
``profile role-definer`` (Monitor and Auth)
|
||||
|
||||
:Description: Gives a user **all** permissions for the auth subsystem, read-only
|
||||
access to monitors, and nothing else. Useful for automation
|
||||
tools. Do not assign this unless you really, **really** know what
|
||||
you're doing as the security ramifications are substantial and
|
||||
access to monitors, and nothing else. Useful for automation
|
||||
tools. Do not assign this unless you really, **really** know what
|
||||
you're doing, as the security ramifications are substantial and
|
||||
pervasive.
|
||||
|
||||
``profile crash`` (Monitor only)
|
||||
|
||||
:Description: Gives a user read-only access to monitors, used in conjunction
|
||||
with the manager ``crash`` module when collecting daemon crash
|
||||
dumps for later analysis.
|
||||
:Description: Gives a user read-only access to monitors. Used in conjunction
|
||||
with the manager ``crash`` module to upload daemon crash
|
||||
dumps into monitor storage for later analysis.
|
||||
|
||||
Pool
|
||||
----
|
||||
@ -353,7 +360,8 @@ by users who have access to the namespace.
|
||||
|
||||
.. note:: Namespaces are primarily useful for applications written on top of
|
||||
``librados`` where the logical grouping can alleviate the need to create
|
||||
different pools. Ceph Object Gateway (from ``luminous``) uses namespaces for various
|
||||
different pools. Ceph Object Gateway (in releases beginning with
|
||||
Luminous) uses namespaces for various
|
||||
metadata objects.
|
||||
|
||||
The rationale for namespaces is that pools can be a computationally expensive
|
||||
|
@ -1,32 +1,40 @@
|
||||
.. _radosgw_keycloak:
|
||||
|
||||
=================================
|
||||
Keycloak integration with RadosGW
|
||||
Integrating Keycloak with RadosGW
|
||||
=================================
|
||||
|
||||
Keycloak can be setup as an OpenID Connect Identity Provider, which can be used by mobile/ web apps
|
||||
to authenticate their users. The Web token returned as a result of authentication can be used by the
|
||||
mobile/ web app to call AssumeRoleWithWebIdentity to get back a set of temporary S3 credentials,
|
||||
which can be used by the app to make S3 calls.
|
||||
If Keycloak is set up as an OpenID Connect Identity Provider, it can be used by
|
||||
mobile apps and web apps to authenticate their users. By using the web token
|
||||
returned by the authentication process, a mobile app or web app can call
|
||||
AssumeRoleWithWebIdentity, receive a set of temporary S3 credentials, and use
|
||||
those credentials to make S3 calls.
|
||||
|
||||
Setting up Keycloak
|
||||
====================
|
||||
===================
|
||||
|
||||
Installing and bringing up Keycloak can be found here: https://www.keycloak.org/docs/latest/server_installation/.
|
||||
Documentation for installing and operating Keycloak can be found here:
|
||||
https://www.keycloak.org/guides.
|
||||
|
||||
Configuring Keycloak to talk to RGW
|
||||
===================================
|
||||
|
||||
The following configurables have to be added for RGW to talk to Keycloak::
|
||||
To configure Keycloak to talk to RGW, add the following configurables::
|
||||
|
||||
[client.radosgw.gateway]
|
||||
rgw sts key = {sts key for encrypting/ decrypting the session token}
|
||||
rgw s3 auth use sts = true
|
||||
|
||||
Example showing how to fetch a web token from Keycloak
|
||||
======================================================
|
||||
Fetching a web token with Keycloak
|
||||
==================================
|
||||
|
||||
Several examples of apps authenticating with Keycloak are given here: https://github.com/keycloak/keycloak-quickstarts/blob/latest/docs/getting-started.md
|
||||
Taking the example of app-profile-jee-jsp app given in the link above, its client id and client secret, can be used to fetch the
|
||||
access token (web token) for an application using grant type 'client_credentials' as given below::
|
||||
Several examples of apps authenticating with Keycloak can be found here:
|
||||
https://github.com/keycloak/keycloak-quickstarts/blob/latest/docs/getting-started.md.
|
||||
|
||||
Here you might consider the example of the app-profile-jee-jsp app (in the link
|
||||
above). To fetch the access token (web token) for such an application using the
|
||||
grant type 'client_credentials', one can use client id and client secret as
|
||||
follows::
|
||||
|
||||
KC_REALM=demo
|
||||
KC_CLIENT=<client id>
|
||||
@ -48,8 +56,9 @@ access token (web token) for an application using grant type 'client_credentials
|
||||
|
||||
KC_ACCESS_TOKEN=$(echo $KC_RESPONSE| jq -r .access_token)
|
||||
|
||||
An access token can also be fetched for a particular user with grant type 'password', using client id, client secret, username and its password
|
||||
as given below::
|
||||
It is also possible to fetch an access token for a particular user with the
|
||||
grant type 'password'. To fetch such an access token, use client id, client
|
||||
secret, username, and password as follows::
|
||||
|
||||
KC_REALM=demo
|
||||
KC_USERNAME=<username>
|
||||
@ -75,43 +84,45 @@ as given below::
|
||||
|
||||
KC_ACCESS_TOKEN=$(echo $KC_RESPONSE| jq -r .access_token)
|
||||
|
||||
|
||||
KC_ACCESS_TOKEN can be used to invoke AssumeRoleWithWebIdentity as given in
|
||||
``KC_ACCESS_TOKEN`` can be used to invoke ``AssumeRoleWithWebIdentity``: see
|
||||
:doc:`STS`.
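
As a sketch of that call, the AWS CLI can be pointed at the RGW endpoint and
given the token fetched above. The role ARN, session name, and endpoint URL
below are placeholder values that must be replaced with values from your own
deployment::

    aws sts assume-role-with-web-identity \
    --role-arn "arn:aws:iam:::role/demo-role" \
    --role-session-name testsession \
    --web-identity-token "$KC_ACCESS_TOKEN" \
    --endpoint-url http://localhost:8000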
|
||||
|
||||
Attaching tags to a user in Keycloak
|
||||
====================================
|
||||
Adding tags to a user in Keycloak
|
||||
=================================
|
||||
|
||||
We need to create a user in keycloak, and add tags to it as its attributes.
|
||||
To create a user in Keycloak and add tags to it as its attributes, follow these
|
||||
steps:
|
||||
|
||||
Add a user as shown below:
|
||||
#. Add a user:
|
||||
|
||||
.. image:: ../images/keycloak-adduser.png
|
||||
:align: center
|
||||
.. image:: ../images/keycloak-adduser.png
|
||||
:align: center
|
||||
|
||||
Add user details as shown below:
|
||||
#. Add user details:
|
||||
|
||||
.. image:: ../images/keycloak-userdetails.png
|
||||
:align: center
|
||||
.. image:: ../images/keycloak-userdetails.png
|
||||
:align: center
|
||||
|
||||
Add user credentials as shown below:
|
||||
#. Add user credentials:
|
||||
|
||||
.. image:: ../images/keycloak-usercredentials.png
|
||||
:align: center
|
||||
.. image:: ../images/keycloak-usercredentials.png
|
||||
:align: center
|
||||
|
||||
Add tags to the 'attributes' tab of the user as shown below:
|
||||
#. Add tags to the 'attributes' tab of the user:
|
||||
|
||||
.. image:: ../images/keycloak-usertags.png
|
||||
:align: center
|
||||
.. image:: ../images/keycloak-usertags.png
|
||||
:align: center
|
||||
|
||||
Add a protocol mapper for the user attribute to a client as shown below:
|
||||
#. Add a protocol mapper that maps the user attribute to a client:
|
||||
|
||||
.. image:: ../images/keycloak-userclientmapper.png
|
||||
:align: center
|
||||
.. image:: ../images/keycloak-userclientmapper.png
|
||||
:align: center
|
||||
|
||||
After these steps have been completed, the tag 'Department' will appear in the
|
||||
JWT (web token), under the 'https://aws.amazon.com/tags' namespace.
|
||||
|
||||
After following the steps shown above, the tag 'Department' will appear in the JWT (web token), under 'https://aws.amazon.com/tags' namespace.
|
||||
The tags can be verified using token introspection of the JWT. The command to introspect a token using client id and client secret is shown below::
|
||||
Tags can be verified by performing token introspection on a JWT. To introspect
|
||||
a token, use ``client id`` and ``client secret`` as follows::
|
||||
|
||||
KC_REALM=demo
|
||||
KC_CLIENT=<client id>
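A minimal sketch of such an introspection request, assuming the same endpoint
layout as in the earlier token examples (host, port, and ``auth`` prefix are
placeholders), is::

    # introspect the token; the response includes the session tags as claims
    curl -s -X POST \
      -u "$KC_CLIENT:$KC_CLIENT_SECRET" \
      -d "token=$KC_ACCESS_TOKEN" \
      "http://$KC_SERVER/$KC_CONTEXT/realms/$KC_REALM/protocol/openid-connect/token/introspect" | jq .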
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _radosgw-multisite-sync-policy:
|
||||
|
||||
=====================
|
||||
Multisite Sync Policy
|
||||
=====================
|
||||
|
File diff suppressed because it is too large

@ -123,6 +123,18 @@ Then provide the zone placement info for that target:
|
||||
--index-pool default.rgw.temporary.index \
|
||||
--data-extra-pool default.rgw.temporary.non-ec
|
||||
|
||||
.. note:: With default placement target settings, RGW stores an object's first data chunk in the RADOS "head" object along
|
||||
with xattr metadata. The `--placement-inline-data=false` flag may be passed with the `zone placement add` or
|
||||
`zone placement modify` commands to change this behavior for new objects stored on the target.
|
||||
When data is stored inline (default), it may provide an advantage for read/write workloads since the first chunk of
|
||||
an object's data can be retrieved/stored in a single librados call along with object metadata. On the other hand, a
|
||||
target that does not store data inline can provide a performance benefit for RGW client delete requests when
|
||||
the BlueStore DB is located on faster storage than bucket data since it eliminates the need to access
|
||||
slower devices synchronously while processing the client request. In that case, data associated with the deleted
|
||||
objects is removed asynchronously in the background by garbage collection.
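For example, inline data could be disabled for new objects on a placement
target with a command along the following lines; the zone name and placement
id shown here are illustrative defaults, not values from this document::

    radosgw-admin zone placement modify \
      --rgw-zone default \
      --placement-id default-placement \
      --placement-inline-data=false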
|
||||
|
||||
.. _adding_a_storage_class:
|
||||
|
||||
Adding a Storage Class
|
||||
----------------------
|
||||
|
||||
|
@ -40,7 +40,7 @@ The following table describes the support status for current Amazon S3 functiona
|
||||
+---------------------------------+-----------------+----------------------------------------+
|
||||
| **Bucket Lifecycle** | Supported | |
|
||||
+---------------------------------+-----------------+----------------------------------------+
|
||||
| **Bucket Replication** | Partial | Only permitted across zones |
|
||||
| **Bucket Replication** | Partial | Permitted only across zones |
|
||||
+---------------------------------+-----------------+----------------------------------------+
|
||||
| **Policy (Buckets, Objects)** | Supported | ACLs & bucket policies are supported |
|
||||
+---------------------------------+-----------------+----------------------------------------+
|
||||
|
@ -43,10 +43,13 @@ An example of the session tags that are passed in by the IDP in the web token is
|
||||
"active": true
|
||||
}
|
||||
|
||||
Steps to configure Keycloak to pass tags in the web token are described here:doc:`keycloak`.
|
||||
Steps to configure Keycloak to pass tags in the web token are described here:
|
||||
:ref:`radosgw_keycloak`.
|
||||
|
||||
The trust policy must have 'sts:TagSession' permission if the web token passed in by the federated user contains session tags, otherwise
|
||||
the AssumeRoleWithWebIdentity action will fail. An example of the trust policy with sts:TagSession is as follows:
|
||||
The trust policy must have 'sts:TagSession' permission if the web token passed
|
||||
in by the federated user contains session tags, otherwise the
|
||||
AssumeRoleWithWebIdentity action will fail. An example of the trust policy with
|
||||
sts:TagSession is as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@ -421,4 +424,4 @@ JWT (access token) by the IDP
|
||||
eq(s3_put_obj['ResponseMetadata']['HTTPStatusCode'],200)
|
||||
|
||||
s3_get_obj = s3client2.get_object(Bucket=bucket_name, Key=key)
|
||||
eq(s3_get_obj['ResponseMetadata']['HTTPStatusCode'],200)
|
||||
eq(s3_get_obj['ResponseMetadata']['HTTPStatusCode'],200)
|
||||
|
@ -12,21 +12,21 @@ iSCSI Initiator for Linux
|
||||
|
||||
Install the iSCSI initiator and multipath tools:
|
||||
|
||||
::
|
||||
.. prompt:: bash #
|
||||
|
||||
# yum install iscsi-initiator-utils
|
||||
# yum install device-mapper-multipath
|
||||
yum install iscsi-initiator-utils
|
||||
yum install device-mapper-multipath
|
||||
|
||||
**Configuring:**
|
||||
|
||||
#. Create the default ``/etc/multipath.conf`` file and enable the
|
||||
``multipathd`` service:
|
||||
|
||||
::
|
||||
.. prompt:: bash #
|
||||
|
||||
# mpathconf --enable --with_multipathd y
|
||||
mpathconf --enable --with_multipathd y
|
||||
|
||||
#. Add the following to ``/etc/multipath.conf`` file:
|
||||
#. Add the following to the ``/etc/multipath.conf`` file:
|
||||
|
||||
::
|
||||
|
||||
@ -47,45 +47,72 @@ Install the iSCSI initiator and multipath tools:
|
||||
|
||||
#. Restart the ``multipathd`` service:
|
||||
|
||||
::
|
||||
.. prompt:: bash #
|
||||
|
||||
# systemctl reload multipathd
|
||||
systemctl reload multipathd
|
||||
|
||||
**iSCSI Discovery and Setup:**
|
||||
|
||||
#. If CHAP was setup on the iSCSI gateway, provide a CHAP username and
|
||||
password by updating the ``/etc/iscsi/iscsid.conf`` file accordingly.
|
||||
#. Enable CHAP authentication and provide the initiator CHAP username
|
||||
and password by uncommenting and setting the following options in
|
||||
the ``/etc/iscsi/iscsid.conf`` file:
|
||||
|
||||
::
|
||||
|
||||
node.session.auth.authmethod = CHAP
|
||||
node.session.auth.username = myusername
|
||||
node.session.auth.password = mypassword
|
||||
|
||||
If you intend to use mutual (bidirectional) authentication, provide the
|
||||
target CHAP username and password:
|
||||
|
||||
::
|
||||
|
||||
node.session.auth.username_in = mytgtusername
|
||||
node.session.auth.password_in = mytgtpassword
|
||||
|
||||
#. Discover the target portals:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
iscsiadm -m discovery -t st -p 192.168.56.101
|
||||
|
||||
::
|
||||
|
||||
# iscsiadm -m discovery -t st -p 192.168.56.101
|
||||
192.168.56.101:3260,1 iqn.2003-01.org.linux-iscsi.rheln1
|
||||
192.168.56.102:3260,2 iqn.2003-01.org.linux-iscsi.rheln1
|
||||
|
||||
#. Login to target:
|
||||
#. Log in to the target:
|
||||
|
||||
::
|
||||
.. prompt:: bash #
|
||||
|
||||
# iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
|
||||
iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
|
||||
|
||||
**Multipath IO Setup:**
|
||||
|
||||
The multipath daemon (``multipathd``), will set up devices automatically
|
||||
based on the ``multipath.conf`` settings. Running the ``multipath``
|
||||
command show devices setup in a failover configuration with a priority
|
||||
group for each path.
|
||||
#. The multipath daemon (``multipathd``) uses the ``multipath.conf`` settings
|
||||
to set up devices automatically. Running the ``multipath`` command shows
|
||||
that the devices have been set up in a failover configuration. Notice that
|
||||
each path has been placed into its own priority group:
|
||||
|
||||
::
|
||||
.. prompt:: bash #
|
||||
|
||||
# multipath -ll
|
||||
mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
|
||||
size=1.0G features='0' hwhandler='1 alua' wp=rw
|
||||
|-+- policy='queue-length 0' prio=50 status=active
|
||||
| `- 28:0:0:1 sde 8:64 active ready running
|
||||
`-+- policy='queue-length 0' prio=10 status=enabled
|
||||
`- 29:0:0:1 sdc 8:32 active ready running
|
||||
multipath -ll
|
||||
|
||||
You should now be able to use the RBD image like you would a normal
|
||||
multipath’d iSCSI disk.
|
||||
::
|
||||
|
||||
mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
|
||||
size=1.0G features='0' hwhandler='1 alua' wp=rw
|
||||
|-+- policy='queue-length 0' prio=50 status=active
|
||||
| `- 28:0:0:1 sde 8:64 active ready running
|
||||
`-+- policy='queue-length 0' prio=10 status=enabled
|
||||
`- 29:0:0:1 sdc 8:32 active ready running
|
||||
|
||||
You should now be able to use the RBD image in the same way that you would
|
||||
use a normal multipath iSCSI disk.
|
||||
|
||||
#. Log out of the target:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -u
|
||||
|
@ -9,53 +9,68 @@
|
||||
Exclusive locks are mechanisms designed to prevent multiple processes from
|
||||
accessing the same Rados Block Device (RBD) in an uncoordinated fashion.
|
||||
Exclusive locks are used heavily in virtualization (where they prevent VMs from
|
||||
clobbering each other's writes) and in RBD mirroring (where they are a
|
||||
prerequisite for journaling).
|
||||
clobbering each other's writes) and in `RBD mirroring`_ (where they are a
|
||||
prerequisite for journaling in journal-based mirroring and fast generation of
|
||||
incremental diffs in snapshot-based mirroring).
|
||||
|
||||
By default, exclusive locks are enabled on newly created images. This default
|
||||
The ``exclusive-lock`` feature is enabled on newly created images. This default
|
||||
can be overridden via the ``rbd_default_features`` configuration option or the
|
||||
``--image-feature`` option for ``rbd create``.
|
||||
``--image-feature`` and ``--image-shared`` options of the ``rbd create`` command.
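For instance, a sketch of creating an image with a reduced feature set (and
therefore without ``exclusive-lock``) might look like this; the pool and image
names are placeholders::

    rbd create --size 1G --image-feature layering mypool/myimage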
|
||||
|
||||
.. note::
|
||||
Many image features, including ``object-map`` and ``fast-diff``, depend upon
|
||||
exclusive locking. Disabling the ``exclusive-lock`` feature will negatively
|
||||
affect the performance of some operations.
|
||||
|
||||
In order to ensure proper exclusive locking operations, any client using an RBD
|
||||
image whose ``exclusive-lock`` feature is enabled must have a CephX identity
|
||||
whose capabilities include ``profile rbd``.
|
||||
To maintain multi-client access, the ``exclusive-lock`` feature implements
|
||||
automatic cooperative lock transitions between clients. It ensures that only
|
||||
a single client can write to an RBD image at any given time and thus protects
|
||||
internal image structures such as the object map, the journal or the `PWL
|
||||
cache`_ from concurrent modification.
|
||||
|
||||
Exclusive locking is mostly transparent to the user.
|
||||
Exclusive locking is mostly transparent to the user:
|
||||
|
||||
#. Whenever any ``librbd`` client process or kernel RBD client
|
||||
starts using an RBD image on which exclusive locking has been
|
||||
enabled, it obtains an exclusive lock on the image before the first
|
||||
write.
|
||||
* Whenever a client (a ``librbd`` process or, in the case of a ``krbd`` client,
|
||||
a client node's kernel) needs to handle a write to an RBD image on which
|
||||
exclusive locking has been enabled, it first acquires an exclusive lock on
|
||||
the image. If the lock is already held by some other client, that client is
|
||||
requested to release it.
|
||||
|
||||
#. Whenever any such client process terminates gracefully, the process
|
||||
relinquishes the lock automatically.
|
||||
* Whenever a client that holds an exclusive lock on an RBD image gets
|
||||
a request to release the lock, it stops handling writes, flushes its caches
|
||||
and releases the lock.
|
||||
|
||||
#. This graceful termination enables another, subsequent, process to acquire
|
||||
the lock and to write to the image.
|
||||
* Whenever a client that holds an exclusive lock on an RBD image terminates
|
||||
gracefully, the lock is also released gracefully.
|
||||
|
||||
* A graceful release of an exclusive lock on an RBD image (whether by request
|
||||
or due to client termination) enables another, subsequent, client to acquire
|
||||
the lock and start handling writes.
|
||||
|
||||
.. warning::
|
||||
By default, the ``exclusive-lock`` feature does not prevent two or more
|
||||
concurrently running clients from opening the same RBD image and writing to
|
||||
it in turns (whether on the same node or not). In effect, their writes just
|
||||
get linearized as the lock is automatically transitioned back and forth in
|
||||
a cooperative fashion.
|
||||
|
||||
.. note::
|
||||
It is possible for two or more concurrently running processes to open the
|
||||
image and to read from it. The client acquires the exclusive lock only when
|
||||
attempting to write to the image. To disable transparent lock transitions
|
||||
between multiple clients, the client must acquire the lock by using the
|
||||
``RBD_LOCK_MODE_EXCLUSIVE`` flag.
|
||||
To disable automatic lock transitions between clients, the
|
||||
``RBD_LOCK_MODE_EXCLUSIVE`` flag may be specified when acquiring the
|
||||
exclusive lock. This is exposed by the ``--exclusive`` option of the ``rbd
|
||||
device map`` command.
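A hedged example of mapping an image with automatic lock transitions disabled
(the pool and image names are placeholders)::

    rbd device map --exclusive mypool/myimage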
|
||||
|
||||
|
||||
Blacklisting
|
||||
============
|
||||
|
||||
Sometimes a client process (or, in case of a krbd client, a client node's
|
||||
kernel) that previously held an exclusive lock on an image does not terminate
|
||||
gracefully, but dies abruptly. This may be because the client process received
|
||||
a ``KILL`` or ``ABRT`` signal, or because the client node underwent a hard
|
||||
reboot or suffered a power failure. In cases like this, the exclusive lock is
|
||||
never gracefully released. This means that any new process that starts and
|
||||
attempts to use the device must break the previously held exclusive lock.
|
||||
Sometimes a client that previously held an exclusive lock on an RBD image does
|
||||
not terminate gracefully, but dies abruptly. This may be because the client
|
||||
process received a ``KILL`` or ``ABRT`` signal, or because the client node
|
||||
underwent a hard reboot or suffered a power failure. In cases like this, the
|
||||
lock is never gracefully released. This means that any new client that comes up
|
||||
and attempts to write to the image must break the previously held exclusive
|
||||
lock.
|
||||
|
||||
However, a process (or kernel thread) may hang or merely lose network
|
||||
connectivity to the Ceph cluster for some amount of time. In that case,
|
||||
@ -78,9 +93,12 @@ Ceph Monitor.
|
||||
|
||||
Blocklisting is thus a form of storage-level resource `fencing`_.
|
||||
|
||||
In order for blocklisting to work, the client must have the ``osd
|
||||
blocklist`` capability. This capability is included in the ``profile
|
||||
rbd`` capability profile, which should be set generally on all Ceph
|
||||
:ref:`client identities <user-management>` using RBD.
|
||||
.. note::
|
||||
In order for blocklisting to work, the client must have the ``osd
|
||||
blocklist`` capability. This capability is included in the ``profile
|
||||
rbd`` capability profile, which should be set generally on all Ceph
|
||||
:ref:`client identities <user-management>` using RBD.
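One common way to grant this is to create the client identity with the
``profile rbd`` caps, for example (the client name and pool are placeholders)::

    ceph auth get-or-create client.myrbduser \
        mon 'profile rbd' \
        osd 'profile rbd pool=mypool' \
        mgr 'profile rbd pool=mypool'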
|
||||
|
||||
.. _RBD mirroring: ../rbd-mirroring
|
||||
.. _PWL cache: ../rbd-persistent-write-log-cache
|
||||
.. _fencing: https://en.wikipedia.org/wiki/Fencing_(computing)
|
||||
|
@ -535,13 +535,273 @@ As noted earlier, you can make documentation contributions using the `Fork and
|
||||
Pull`_ approach.
|
||||
|
||||
|
||||
Squash Extraneous Commits
|
||||
-------------------------
|
||||
Each pull request ought to be associated with only a single commit. If you have
|
||||
made more than one commit to the feature branch that you are working in, you
|
||||
will need to "squash" the multiple commits. "Squashing" is the colloquial term
|
||||
for a particular kind of "interactive rebase". Squashing can be done in a great
|
||||
number of ways, but the example here will deal with a situation in which there
|
||||
are three commits and the changes in all three of the commits are kept. The three
|
||||
commits will be squashed into a single commit.
|
||||
|
||||
#. Make the commits that you will later squash.
|
||||
|
||||
#. Make the first commit.
|
||||
|
||||
::
|
||||
|
||||
doc/glossary: improve "CephX" entry
|
||||
|
||||
Improve the glossary entry for "CephX".
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# Please enter the commit message for your changes. Lines starting
|
||||
# with '#' will be ignored, and an empty message aborts the commit.
|
||||
#
|
||||
# On branch wip-doc-2023-03-28-glossary-cephx
|
||||
# Changes to be committed:
|
||||
# modified: glossary.rst
|
||||
#
|
||||
|
||||
#. Make the second commit.
|
||||
|
||||
::
|
||||
|
||||
doc/glossary: add link to architecture doc
|
||||
|
||||
Add a link to a section in the architecture document, which link
|
||||
will be used in the process of improving the "CephX" glossary entry.
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# Please enter the commit message for your changes. Lines starting
|
||||
# with '#' will be ignored, and an empty message aborts the commit.
|
||||
#
|
||||
# On branch wip-doc-2023-03-28-glossary-cephx
|
||||
# Your branch is up to date with 'origin/wip-doc-2023-03-28-glossary-cephx'.
|
||||
#
|
||||
# Changes to be committed:
|
||||
# modified: architecture.rst
|
||||
|
||||
#. Make the third commit.
|
||||
|
||||
::
|
||||
|
||||
doc/glossary: link to Arch doc in "CephX" glossary
|
||||
|
||||
Link to the Architecture document from the "CephX" entry in the
|
||||
Glossary.
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# Please enter the commit message for your changes. Lines starting
|
||||
# with '#' will be ignored, and an empty message aborts the commit.
|
||||
#
|
||||
# On branch wip-doc-2023-03-28-glossary-cephx
|
||||
# Your branch is up to date with 'origin/wip-doc-2023-03-28-glossary-cephx'.
|
||||
#
|
||||
# Changes to be committed:
|
||||
# modified: glossary.rst
|
||||
|
||||
#. There are now three commits in the feature branch. We will now begin the
|
||||
process of squashing them into a single commit.
|
||||
|
||||
#. Run the command ``git rebase -i main``, which rebases the current branch
|
||||
(the feature branch) against the ``main`` branch:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
git rebase -i main
|
||||
|
||||
#. A list of the commits that have been made to the feature branch now
|
||||
appears, and it looks like this:
|
||||
|
||||
::
|
||||
|
||||
pick d395e500883 doc/glossary: improve "CephX" entry
|
||||
pick b34986e2922 doc/glossary: add link to architecture doc
|
||||
pick 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
|
||||
|
||||
# Rebase 0793495b9d1..74d0719735c onto 0793495b9d1 (3 commands)
|
||||
#
|
||||
# Commands:
|
||||
# p, pick <commit> = use commit
|
||||
# r, reword <commit> = use commit, but edit the commit message
|
||||
# e, edit <commit> = use commit, but stop for amending
|
||||
# s, squash <commit> = use commit, but meld into previous commit
|
||||
# f, fixup [-C | -c] <commit> = like "squash" but keep only the previous
|
||||
# commit's log message, unless -C is used, in which case
|
||||
# keep only this commit's message; -c is same as -C but
|
||||
# opens the editor
|
||||
# x, exec <command> = run command (the rest of the line) using shell
|
||||
# b, break = stop here (continue rebase later with 'git rebase --continue')
|
||||
# d, drop <commit> = remove commit
|
||||
# l, label <label> = label current HEAD with a name
|
||||
# t, reset <label> = reset HEAD to a label
|
||||
# m, merge [-C <commit> | -c <commit>] <label> [# <oneline>]
|
||||
# create a merge commit using the original merge commit's
|
||||
# message (or the oneline, if no original merge commit was
|
||||
# specified); use -c <commit> to reword the commit message
|
||||
# u, update-ref <ref> = track a placeholder for the <ref> to be updated
|
||||
# to this position in the new commits. The <ref> is
|
||||
# updated at the end of the rebase
|
||||
#
|
||||
# These lines can be re-ordered; they are executed from top to bottom.
|
||||
#
|
||||
# If you remove a line here THAT COMMIT WILL BE LOST.
|
||||
|
||||
Find the part of the screen that says "pick". This is the part that you will
|
||||
alter. There are three commits that are currently labeled "pick". We will
|
||||
choose one of them to remain labeled "pick", and we will label the other two
|
||||
commits "squash".
|
||||
|
||||
#. Label two of the three commits ``squash``:
|
||||
|
||||
::
|
||||
|
||||
pick d395e500883 doc/glossary: improve "CephX" entry
|
||||
squash b34986e2922 doc/glossary: add link to architecture doc
|
||||
squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
|
||||
|
||||
# Rebase 0793495b9d1..74d0719735c onto 0793495b9d1 (3 commands)
|
||||
#
|
||||
# Commands:
|
||||
# p, pick <commit> = use commit
|
||||
# r, reword <commit> = use commit, but edit the commit message
|
||||
# e, edit <commit> = use commit, but stop for amending
|
||||
# s, squash <commit> = use commit, but meld into previous commit
|
||||
# f, fixup [-C | -c] <commit> = like "squash" but keep only the previous
|
||||
# commit's log message, unless -C is used, in which case
|
||||
# keep only this commit's message; -c is same as -C but
|
||||
# opens the editor
|
||||
# x, exec <command> = run command (the rest of the line) using shell
|
||||
# b, break = stop here (continue rebase later with 'git rebase --continue')
|
||||
# d, drop <commit> = remove commit
|
||||
# l, label <label> = label current HEAD with a name
|
||||
# t, reset <label> = reset HEAD to a label
|
||||
# m, merge [-C <commit> | -c <commit>] <label> [# <oneline>]
|
||||
# create a merge commit using the original merge commit's
|
||||
# message (or the oneline, if no original merge commit was
|
||||
# specified); use -c <commit> to reword the commit message
|
||||
# u, update-ref <ref> = track a placeholder for the <ref> to be updated
|
||||
# to this position in the new commits. The <ref> is
|
||||
# updated at the end of the rebase
|
||||
#
|
||||
# These lines can be re-ordered; they are executed from top to bottom.
|
||||
#
|
||||
# If you remove a line here THAT COMMIT WILL BE LOST.
|
||||
|
||||
#. Now we create a commit message that applies to all the commits that have
|
||||
been squashed together:
|
||||
|
||||
#. When you save and close the list of commits that you have designated for
|
||||
squashing, a list of all three commit messages appears, and it looks
|
||||
like this:
|
||||
|
||||
::
|
||||
|
||||
# This is a combination of 3 commits.
|
||||
# This is the 1st commit message:
|
||||
|
||||
doc/glossary: improve "CephX" entry
|
||||
|
||||
Improve the glossary entry for "CephX".
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# This is the commit message #2:
|
||||
|
||||
doc/glossary: add link to architecture doc
|
||||
|
||||
Add a link to a section in the architecture document, which link
|
||||
will be used in the process of improving the "CephX" glossary entry.
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# This is the commit message #3:
|
||||
|
||||
doc/glossary: link to Arch doc in "CephX" glossary
|
||||
|
||||
Link to the Architecture document from the "CephX" entry in the
|
||||
Glossary.
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# Please enter the commit message for your changes. Lines starting
|
||||
# with '#' will be ignored, and an empty message aborts the commit.
|
||||
#
|
||||
# Date: Tue Mar 28 18:42:11 2023 +1000
|
||||
#
|
||||
# interactive rebase in progress; onto 0793495b9d1
|
||||
# Last commands done (3 commands done):
|
||||
# squash b34986e2922 doc/glossary: add link to architecture doc
|
||||
# squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
|
||||
# No commands remaining.
|
||||
# You are currently rebasing branch 'wip-doc-2023-03-28-glossary-cephx' on '0793495b9d1'.
|
||||
#
|
||||
# Changes to be committed:
|
||||
# modified: doc/architecture.rst
|
||||
# modified: doc/glossary.rst
|
||||
|
||||
#. The commit messages have been revised into the simpler form presented here:
|
||||
|
||||
::
|
||||
|
||||
doc/glossary: improve "CephX" entry
|
||||
|
||||
Improve the glossary entry for "CephX".
|
||||
|
||||
Signed-off-by: Zac Dover <zac.dover@proton.me>
|
||||
|
||||
# Please enter the commit message for your changes. Lines starting
|
||||
# with '#' will be ignored, and an empty message aborts the commit.
|
||||
#
|
||||
# Date: Tue Mar 28 18:42:11 2023 +1000
|
||||
#
|
||||
# interactive rebase in progress; onto 0793495b9d1
|
||||
# Last commands done (3 commands done):
|
||||
# squash b34986e2922 doc/glossary: add link to architecture doc
|
||||
# squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
|
||||
# No commands remaining.
|
||||
# You are currently rebasing branch 'wip-doc-2023-03-28-glossary-cephx' on '0793495b9d1'.
|
||||
#
|
||||
# Changes to be committed:
|
||||
# modified: doc/architecture.rst
|
||||
# modified: doc/glossary.rst
|
||||
|
||||
#. Force push the squashed commit from your local working copy to the remote
|
||||
upstream branch. The force push is necessary because the newly squashed commit
|
||||
is not a descendant of the remote branch's current tip. If that confuses you, just run this
|
||||
command and don't think too much about it:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git push -f
|
||||
|
||||
::
|
||||
|
||||
Enumerating objects: 9, done.
|
||||
Counting objects: 100% (9/9), done.
|
||||
Delta compression using up to 8 threads
|
||||
Compressing objects: 100% (5/5), done.
|
||||
Writing objects: 100% (5/5), 722 bytes | 722.00 KiB/s, done.
|
||||
Total 5 (delta 4), reused 0 (delta 0), pack-reused 0
|
||||
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.
|
||||
To github.com:zdover23/ceph.git
|
||||
+ b34986e2922...02e3a5cb763 wip-doc-2023-03-28-glossary-cephx -> wip-doc-2023-03-28-glossary-cephx (forced update)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Notify Us
|
||||
---------
|
||||
|
||||
After you make a pull request, please email ceph-docs@redhat.com.
|
||||
|
||||
|
||||
If some time has passed and the pull request that you raised has not been
|
||||
reviewed, contact the component lead and ask what's taking so long. See
|
||||
:ref:`clt` for a list of component leads.
|
||||
|
||||
Documentation Style Guide
|
||||
=========================
|
||||
@ -778,6 +1038,27 @@ Link to target with inline text::
|
||||
documentation<external_link_with_inline_text>`. If this seems inconsistent
|
||||
and confusing to you, then you're right. It is inconsistent and confusing.
|
||||
|
||||
Escaping Bold Characters within Words
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This section explains how to make certain letters within a word bold while
|
||||
leaving the other letters in the word regular (non-bold).
|
||||
|
||||
The following single-line paragraph provides an example of this:
|
||||
|
||||
**C**\eph **F**\ile **S**\ystem.
|
||||
|
||||
In reStructuredText, the following markup will not work:
|
||||
|
||||
::
|
||||
|
||||
**C**eph **F**ile **S**ystem
|
||||
|
||||
The bolded notation must be turned off by means of the escape character (\\), as shown here:
|
||||
|
||||
::
|
||||
|
||||
**C**\eph **F**\ile **S**\ystem
|
||||
|
||||
.. _Python Sphinx: https://www.sphinx-doc.org
|
||||
.. _restructuredText: http://docutils.sourceforge.net/rst.html
|
||||
|
@ -318,7 +318,7 @@ local g = import 'grafonnet/grafana.libsonnet';
|
||||
.addTemplate(
|
||||
$.addTemplateSchema('ceph_hosts',
|
||||
'$datasource',
|
||||
'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
|
||||
if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)',
|
||||
1,
|
||||
false,
|
||||
3,
|
||||
@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
|
||||
11,
|
||||
9
|
||||
),
|
||||
$.addTableSchema(
|
||||
'$datasource',
|
||||
'This table shows the 10 hosts with the highest number of slow ops',
|
||||
{ col: 2, desc: true },
|
||||
[
|
||||
$.overviewStyle('Instance', 'instance', 'string', 'short'),
|
||||
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
|
||||
$.overviewStyle('', '/.*/', 'hidden', 'short'),
|
||||
],
|
||||
'Top Slow Ops per Host',
|
||||
'table'
|
||||
)
|
||||
.addTarget(
|
||||
$.addTargetSchema(
|
||||
|||
|
||||
topk(10,
|
||||
(sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
|
||||
)
|
||||
||| % $.matchers(),
|
||||
'',
|
||||
'table',
|
||||
1,
|
||||
true
|
||||
)
|
||||
) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
|
||||
]),
|
||||
}
|
||||
|
@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
|
||||
.addTargets([$.addTargetSchema(
|
||||
'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
|
||||
)]),
|
||||
$.addTableSchema(
|
||||
'$datasource',
|
||||
'This table shows the 10 OSDs with the highest number of slow ops',
|
||||
{ col: 2, desc: true },
|
||||
[
|
||||
$.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
|
||||
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
|
||||
$.overviewStyle('', '/.*/', 'hidden', 'short'),
|
||||
],
|
||||
'Top Slow Ops',
|
||||
'table'
|
||||
)
|
||||
.addTarget(
|
||||
$.addTargetSchema(
|
||||
|||
|
||||
topk(10,
|
||||
(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
|
||||
)
|
||||
||| % $.matchers(),
|
||||
'',
|
||||
'table',
|
||||
1,
|
||||
true
|
||||
)
|
||||
) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
|
||||
]),
|
||||
'osd-device-details.json':
|
||||
local OsdDeviceDetailsPanel(title,
|
||||
|
@ -23,12 +23,6 @@
|
||||
"id": "singlestat",
|
||||
"name": "Singlestat",
|
||||
"version": "5.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "vonage-status-panel",
|
||||
"name": "Status Panel",
|
||||
"version": "1.0.8"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
@ -64,7 +58,7 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 2,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
@ -157,8 +151,8 @@
|
||||
"fontFormat": "Regular",
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 2,
|
||||
"x": 2,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 43,
|
||||
@ -167,6 +161,19 @@
|
||||
"isHideAlertsOnDisable": false,
|
||||
"isIgnoreOKColors": false,
|
||||
"links": [],
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"aggregation": "Last",
|
||||
@ -249,7 +256,178 @@
|
||||
}
|
||||
],
|
||||
"title": "OSDs",
|
||||
"type": "vonage-status-panel"
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"clusterName": "",
|
||||
"colorMode": "Panel",
|
||||
"colors": {
|
||||
"crit": "rgba(245, 54, 54, 0.9)",
|
||||
"disable": "rgba(128, 128, 128, 0.9)",
|
||||
"ok": "rgba(50, 128, 45, 0.9)",
|
||||
"warn": "rgba(237, 129, 40, 0.9)"
|
||||
},
|
||||
"cornerRadius": 1,
|
||||
"datasource": "$datasource",
|
||||
"displayName": "",
|
||||
"flipCard": false,
|
||||
"flipTime": 5,
|
||||
"fontFormat": "Regular",
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 41,
|
||||
"isAutoScrollOnOverflow": false,
|
||||
"isGrayOnNoData": false,
|
||||
"isHideAlertsOnDisable": false,
|
||||
"isIgnoreOKColors": false,
|
||||
"links": [],
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "In Quorum",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "sum(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "In Quorum",
|
||||
"refId": "A",
|
||||
"units": "none",
|
||||
"valueHandler": "Text Only"
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Total",
|
||||
"crit": 1,
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Total",
|
||||
"refId": "B",
|
||||
"units": "none",
|
||||
"valueHandler": "Text Only",
|
||||
"warn": 2
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "MONs out of Quorum",
|
||||
"crit": 1.6,
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Warning / Critical",
|
||||
"displayType": "Annotation",
|
||||
"displayValueWithAlias": "Never",
|
||||
"expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "MONs out of Quorum",
|
||||
"refId": "C",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold",
|
||||
"warn": 1.1
|
||||
}
|
||||
],
|
||||
"title": "Monitors",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"colorMode": "Panel",
|
||||
"colors": {
|
||||
"crit": "rgba(245, 54, 54, 0.9)",
|
||||
"disable": "rgba(128, 128, 128, 0.9)",
|
||||
"ok": "rgba(50, 128, 45, 0.9)",
|
||||
"warn": "rgba(237, 129, 40, 0.9)"
|
||||
},
|
||||
"cornerRadius": 1,
|
||||
"datasource": "$datasource",
|
||||
"displayName": "",
|
||||
"flipCard": false,
|
||||
"flipTime": 5,
|
||||
"fontFormat": "Regular",
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 68,
|
||||
"isAutoScrollOnOverflow": false,
|
||||
"isGrayOnNoData": false,
|
||||
"isHideAlertsOnDisable": false,
|
||||
"isIgnoreOKColors": false,
|
||||
"links": [],
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Active",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mgr_status == 1) or vector(0)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"instant": true,
|
||||
"legendFormat": "Active",
|
||||
"refId": "A",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold"
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Standby",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mgr_status == 0) or vector(0)",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Standby",
|
||||
"refId": "B",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold"
|
||||
}
|
||||
],
|
||||
"title": "MGRs",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
@ -272,9 +450,9 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 4,
|
||||
"x": 4,
|
||||
"y": 0
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"id": 47,
|
||||
"interval": null,
|
||||
@ -342,9 +520,9 @@
|
||||
"fill": 0,
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 0
|
||||
"w": 9,
|
||||
"x": 6,
|
||||
"y": 6
|
||||
},
|
||||
"id": 53,
|
||||
"legend": {
|
||||
@ -498,9 +676,9 @@
|
||||
"fill": 0,
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 0
|
||||
"w": 9,
|
||||
"x": 15,
|
||||
"y": 6
|
||||
},
|
||||
"id": 66,
|
||||
"legend": {
|
||||
@ -595,149 +773,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"clusterName": "",
|
||||
"colorMode": "Panel",
|
||||
"colors": {
|
||||
"crit": "rgba(245, 54, 54, 0.9)",
|
||||
"disable": "rgba(128, 128, 128, 0.9)",
|
||||
"ok": "rgba(50, 128, 45, 0.9)",
|
||||
"warn": "rgba(237, 129, 40, 0.9)"
|
||||
},
|
||||
"cornerRadius": 1,
|
||||
"datasource": "$datasource",
|
||||
"displayName": "",
|
||||
"flipCard": false,
|
||||
"flipTime": 5,
|
||||
"fontFormat": "Regular",
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 2,
|
||||
"x": 0,
|
||||
"y": 3
|
||||
},
|
||||
"id": 41,
|
||||
"isAutoScrollOnOverflow": false,
|
||||
"isGrayOnNoData": false,
|
||||
"isHideAlertsOnDisable": false,
|
||||
"isIgnoreOKColors": false,
|
||||
"links": [],
|
||||
"targets": [
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "In Quorum",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "sum(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "In Quorum",
|
||||
"refId": "A",
|
||||
"units": "none",
|
||||
"valueHandler": "Text Only"
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Total",
|
||||
"crit": 1,
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Total",
|
||||
"refId": "B",
|
||||
"units": "none",
|
||||
"valueHandler": "Text Only",
|
||||
"warn": 2
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "MONs out of Quorum",
|
||||
"crit": 1.6,
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Warning / Critical",
|
||||
"displayType": "Annotation",
|
||||
"displayValueWithAlias": "Never",
|
||||
"expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "MONs out of Quorum",
|
||||
"refId": "C",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold",
|
||||
"warn": 1.1
|
||||
}
|
||||
],
|
||||
"title": "Monitors",
|
||||
"type": "vonage-status-panel"
|
||||
},
|
||||
{
|
||||
"colorMode": "Panel",
|
||||
"colors": {
|
||||
"crit": "rgba(245, 54, 54, 0.9)",
|
||||
"disable": "rgba(128, 128, 128, 0.9)",
|
||||
"ok": "rgba(50, 128, 45, 0.9)",
|
||||
"warn": "rgba(237, 129, 40, 0.9)"
|
||||
},
|
||||
"cornerRadius": 0,
|
||||
"datasource": "$datasource",
|
||||
"displayName": "",
|
||||
"flipCard": false,
|
||||
"flipTime": 5,
|
||||
"fontFormat": "Regular",
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 2,
|
||||
"x": 2,
|
||||
"y": 3
|
||||
},
|
||||
"id": 68,
|
||||
"isAutoScrollOnOverflow": false,
|
||||
"isGrayOnNoData": false,
|
||||
"isHideAlertsOnDisable": false,
|
||||
"isIgnoreOKColors": false,
|
||||
"links": [],
|
||||
"targets": [
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Active",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mgr_status == 1) or vector(0)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Active",
|
||||
"refId": "A",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold"
|
||||
},
|
||||
{
|
||||
"aggregation": "Last",
|
||||
"alias": "Standby",
|
||||
"decimals": 2,
|
||||
"displayAliasType": "Always",
|
||||
"displayType": "Regular",
|
||||
"displayValueWithAlias": "When Alias Displayed",
|
||||
"expr": "count(ceph_mgr_status == 0) or vector(0)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "Standby",
|
||||
"refId": "B",
|
||||
"units": "none",
|
||||
"valueHandler": "Number Threshold"
|
||||
}
|
||||
],
|
||||
"title": "MGRs",
|
||||
"type": "vonage-status-panel"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
@ -749,7 +784,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
"y": 9
|
||||
},
|
||||
"id": 45,
|
||||
"legend": {
|
||||
@ -841,7 +876,7 @@
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
"y": 9
|
||||
},
|
||||
"id": 62,
|
||||
"legend": {
|
||||
|
@ -1119,6 +1119,91 @@
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"columns": [ ],
|
||||
"datasource": "$datasource",
|
||||
"description": "This table shows the 10 hosts with the highest number of slow ops",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 40
|
||||
},
|
||||
"id": 15,
|
||||
"links": [ ],
|
||||
"sort": {
|
||||
"col": 2,
|
||||
"desc": true
|
||||
},
|
||||
"styles": [
|
||||
{
|
||||
"alias": "Instance",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "instance",
|
||||
"thresholds": [ ],
|
||||
"type": "string",
|
||||
"unit": "short",
|
||||
"valueMaps": [ ]
|
||||
},
|
||||
{
|
||||
"alias": "Slow Ops",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "Value",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "none",
|
||||
"valueMaps": [ ]
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [ ],
|
||||
"type": "hidden",
|
||||
"unit": "short",
|
||||
"valueMaps": [ ]
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Top Slow Ops per Host",
|
||||
"transform": "table",
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
@ -1195,7 +1280,7 @@
|
||||
"multi": false,
|
||||
"name": "ceph_hosts",
|
||||
"options": [ ],
|
||||
"query": "label_values({}, instance)",
|
||||
"query": "label_values(instance)",
|
||||
"refresh": 1,
|
||||
"regex": "([^.:]*).*",
|
||||
"sort": 3,
|
||||
|
@ -860,6 +860,91 @@
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"columns": [ ],
|
||||
"datasource": "$datasource",
|
||||
"description": "This table shows the 10 OSDs with the highest number of slow ops",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 13,
|
||||
"links": [ ],
|
||||
"sort": {
|
||||
"col": 2,
|
||||
"desc": true
|
||||
},
|
||||
"styles": [
|
||||
{
|
||||
"alias": "OSD ID",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "ceph_daemon",
|
||||
"thresholds": [ ],
|
||||
"type": "string",
|
||||
"unit": "short",
|
||||
"valueMaps": [ ]
|
||||
},
|
||||
{
|
||||
"alias": "Slow Ops",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "Value",
|
||||
"thresholds": [ ],
|
||||
"type": "number",
|
||||
"unit": "none",
|
||||
"valueMaps": [ ]
|
||||
},
|
||||
{
|
||||
"alias": "",
|
||||
"colorMode": null,
|
||||
"colors": [
|
||||
"rgba(245, 54, 54, 0.9)",
|
||||
"rgba(237, 129, 40, 0.89)",
|
||||
"rgba(50, 172, 45, 0.97)"
|
||||
],
|
||||
"dateFormat": "YYYY-MM-DD HH:mm:ss",
|
||||
"decimals": 2,
|
||||
"mappingType": 1,
|
||||
"pattern": "/.*/",
|
||||
"thresholds": [ ],
|
||||
"type": "hidden",
|
||||
"unit": "short",
|
||||
"valueMaps": [ ]
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Top Slow Ops",
|
||||
"transform": "table",
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
|
@ -629,6 +629,17 @@
|
||||
description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'CephDaemonSlowOps',
|
||||
'for': '30s',
|
||||
expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0',
|
||||
labels: { severity: 'warning', type: 'ceph_default' },
|
||||
annotations: {
|
||||
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
|
||||
summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
|
||||
description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
|
@ -563,6 +563,16 @@ groups:
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- alert: "CephDaemonSlowOps"
|
||||
for: "30s"
|
||||
expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
annotations:
|
||||
summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
|
||||
description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
|
||||
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
|
||||
- name: "cephadm"
|
||||
rules:
|
||||
- alert: "CephadmUpgradeFailed"
|
||||
|
@ -679,6 +679,33 @@ tests:
|
||||
summary: OSD operations are slow to complete
|
||||
description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
|
||||
|
||||
# slow daemon ops
|
||||
- interval : 1m
|
||||
input_series:
|
||||
- series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
|
||||
values: '1+0x120'
|
||||
promql_expr_test:
|
||||
- expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
|
||||
eval_time: 1m
|
||||
exp_samples:
|
||||
- labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
|
||||
job="ceph", type="SLOW_OPS"}'
|
||||
value: 1
|
||||
alert_rule_test:
|
||||
- eval_time: 20m
|
||||
alertname: CephDaemonSlowOps
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
instance: ceph:9283
|
||||
ceph_daemon: "osd.1"
|
||||
job: ceph
|
||||
severity: warning
|
||||
type: ceph_default
|
||||
exp_annotations:
|
||||
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
|
||||
summary: osd.1 operations are slow to complete
|
||||
description: "osd.1 operations are taking too long to process (complaint time exceeded)"
|
||||
|
||||
# CEPHADM orchestrator alert triggers
|
||||
- interval: 30s
|
||||
input_series:
|
||||
|
@ -10,5 +10,5 @@ tasks:
|
||||
all:
|
||||
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
|
||||
- sudo dnf -y module reset container-tools
|
||||
- sudo dnf -y module install container-tools
|
||||
- sudo dnf -y module install container-tools --allowerasing --nobest
|
||||
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf
|
||||
|
@ -9,5 +9,5 @@ tasks:
|
||||
all:
|
||||
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
|
||||
- sudo dnf -y module reset container-tools
|
||||
- sudo dnf -y module install container-tools:3.0
|
||||
- sudo dnf -y module install container-tools:3.0 --allowerasing --nobest
|
||||
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf
|
||||
|
@ -9,5 +9,5 @@ tasks:
|
||||
all:
|
||||
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
|
||||
- sudo dnf -y module reset container-tools
|
||||
- sudo dnf -y module install container-tools:rhel8
|
||||
- sudo dnf -y module install container-tools:rhel8 --allowerasing --nobest
|
||||
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf
|
||||
|
@ -18,6 +18,7 @@ overrides:
|
||||
- Metadata damage detected
|
||||
- MDS_READ_ONLY
|
||||
- force file system read-only
|
||||
- with standby daemon mds
|
||||
tasks:
|
||||
- cephfs_test_runner:
|
||||
modules:
|
||||
|
@ -5,6 +5,7 @@ overrides:
|
||||
- bad backtrace on inode
|
||||
- inode table repaired for inode
|
||||
- Scrub error on inode
|
||||
- Scrub error on dir
|
||||
- Metadata damage detected
|
||||
tasks:
|
||||
- cephfs_test_runner:
|
||||
|
@ -4,5 +4,6 @@ overrides:
|
||||
- but it is still running
|
||||
- objects unfound and apparently lost
|
||||
- MDS_SLOW_METADATA_IO
|
||||
- MDS_TRIM
|
||||
tasks:
|
||||
- thrashosds:
|
||||
|
@ -27,8 +27,7 @@ tasks:
|
||||
- tox: [ client.0 ]
|
||||
- keystone:
|
||||
client.0:
|
||||
sha1: 17.0.0.0rc2
|
||||
force-branch: master
|
||||
force-branch: stable/xena
|
||||
projects:
|
||||
- name: rgwcrypt
|
||||
description: Encryption Tenant
|
||||
@ -69,8 +68,7 @@ tasks:
|
||||
description: Swift Service
|
||||
- barbican:
|
||||
client.0:
|
||||
sha1: 5.0.1
|
||||
force-branch: master
|
||||
force-branch: stable/xena
|
||||
use-keystone-role: client.0
|
||||
keystone_authtoken:
|
||||
auth_plugin: password
|
||||
|
@ -8,8 +8,7 @@ tasks:
|
||||
- tox: [ client.0 ]
|
||||
- keystone:
|
||||
client.0:
|
||||
sha1: 17.0.0.0rc2
|
||||
force-branch: master
|
||||
force-branch: stable/xena
|
||||
services:
|
||||
- name: swift
|
||||
type: object-store
|
||||
@ -20,7 +19,7 @@ tasks:
|
||||
use-keystone-role: client.0
|
||||
- tempest:
|
||||
client.0:
|
||||
sha1: train-last
|
||||
sha1: 30.0.0
|
||||
force-branch: master
|
||||
use-keystone-role: client.0
|
||||
auth:
|
||||
@ -49,6 +48,10 @@ tasks:
|
||||
- .*test_container_synchronization.*
|
||||
- .*test_object_services.PublicObjectTest.test_access_public_container_object_without_using_creds
|
||||
- .*test_object_services.ObjectTest.test_create_object_with_transfer_encoding
|
||||
- .*test_container_services.ContainerTest.test_create_container_with_remove_metadata_key
|
||||
- .*test_container_services.ContainerTest.test_create_container_with_remove_metadata_value
|
||||
- .*test_object_expiry.ObjectExpiryTest.test_get_object_after_expiry_time
|
||||
- .*test_object_expiry.ObjectExpiryTest.test_get_object_at_expiry_time
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
|
@ -157,12 +157,6 @@ def fix_barbican_api(ctx, cclient):
|
||||
'/prop_dir =/ s#etc/barbican#{}/etc/barbican#'.format(get_barbican_dir(ctx)),
|
||||
'bin/barbican-api'])
|
||||
|
||||
def copy_policy_json(ctx, cclient):
|
||||
run_in_barbican_dir(ctx, cclient,
|
||||
['cp',
|
||||
get_barbican_dir(ctx)+'/etc/barbican/policy.json',
|
||||
get_barbican_dir(ctx)])
|
||||
|
||||
def create_barbican_conf(ctx, cclient):
|
||||
barbican_host, barbican_port = ctx.barbican.endpoints[cclient]
|
||||
barbican_url = 'http://{host}:{port}'.format(host=barbican_host,
|
||||
@ -174,6 +168,14 @@ def create_barbican_conf(ctx, cclient):
|
||||
'echo -n -e "[DEFAULT]\nhost_href=' + barbican_url + '\n" ' + \
|
||||
'>barbican.conf'])
|
||||
|
||||
log.info("run barbican db upgrade")
|
||||
config_path = get_barbican_dir(ctx) + '/barbican.conf'
|
||||
run_in_barbican_venv(ctx, cclient, ['barbican-manage', '--config-file', config_path,
|
||||
'db', 'upgrade'])
|
||||
log.info("run barbican db sync_secret_stores")
|
||||
run_in_barbican_venv(ctx, cclient, ['barbican-manage', '--config-file', config_path,
|
||||
'db', 'sync_secret_stores'])
|
||||
|
||||
@contextlib.contextmanager
|
||||
def configure_barbican(ctx, config):
|
||||
"""
|
||||
@ -189,7 +191,6 @@ def configure_barbican(ctx, config):
|
||||
set_authtoken_params(ctx, cclient, cconfig)
|
||||
fix_barbican_api(ctx, cclient)
|
||||
fix_barbican_api_paste(ctx, cclient)
|
||||
copy_policy_json(ctx, cclient)
|
||||
create_barbican_conf(ctx, cclient)
|
||||
try:
|
||||
yield
|
||||
|
@ -1564,23 +1564,33 @@ class CephManager:
|
||||
|
||||
Accepts arguments same as that of teuthology.orchestra.run.run()
|
||||
"""
|
||||
|
||||
prefixcmd = []
|
||||
timeoutcmd = kwargs.pop('timeoutcmd', None)
|
||||
if timeoutcmd is not None:
|
||||
prefixcmd += ['timeout', str(timeoutcmd)]
|
||||
|
||||
if self.cephadm:
|
||||
prefixcmd += ['ceph']
|
||||
cmd = prefixcmd + list(kwargs['args'])
|
||||
return shell(self.ctx, self.cluster, self.controller,
|
||||
args=['ceph'] + list(kwargs['args']),
|
||||
args=cmd,
|
||||
stdout=StringIO(),
|
||||
check_status=kwargs.get('check_status', True))
|
||||
if self.rook:
|
||||
elif self.rook:
|
||||
prefixcmd += ['ceph']
|
||||
cmd = prefixcmd + list(kwargs['args'])
|
||||
return toolbox(self.ctx, self.cluster,
|
||||
args=['ceph'] + list(kwargs['args']),
|
||||
args=cmd,
|
||||
stdout=StringIO(),
|
||||
check_status=kwargs.get('check_status', True))
|
||||
|
||||
testdir = teuthology.get_testdir(self.ctx)
|
||||
prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage',
|
||||
f'{testdir}/archive/coverage', 'timeout', '120', 'ceph',
|
||||
'--cluster', self.cluster]
|
||||
kwargs['args'] = prefix + list(kwargs['args'])
|
||||
return self.controller.run(**kwargs)
|
||||
else:
|
||||
testdir = teuthology.get_testdir(self.ctx)
|
||||
prefix = prefixcmd + ['sudo', 'adjust-ulimits', 'ceph-coverage',
|
||||
f'{testdir}/archive/coverage', 'timeout', '120', 'ceph',
|
||||
'--cluster', self.cluster]
|
||||
kwargs['args'] = prefix + list(kwargs['args'])
|
||||
return self.controller.run(**kwargs)
|
||||
|
||||
def raw_cluster_cmd(self, *args, **kwargs) -> str:
|
||||
"""
|
||||
|
@ -161,6 +161,7 @@ class CephTestCase(unittest.TestCase):
|
||||
log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
|
||||
return False
|
||||
|
||||
log.info(f"waiting {timeout}s for health warning matching {pattern}")
|
||||
self.wait_until_true(seen_health_warning, timeout)
|
||||
|
||||
def wait_for_health_clear(self, timeout):
|
||||
|
@ -141,14 +141,15 @@ def download_cephadm(ctx, config, ref):
|
||||
else:
|
||||
ctx.cluster.run(
|
||||
args=[
|
||||
'git', 'archive',
|
||||
'--remote=' + git_url,
|
||||
ref,
|
||||
'src/cephadm/cephadm',
|
||||
run.Raw('|'),
|
||||
'tar', '-xO', 'src/cephadm/cephadm',
|
||||
'git', 'clone', git_url, 'testrepo',
|
||||
run.Raw('&&'),
|
||||
'cd', 'testrepo',
|
||||
run.Raw('&&'),
|
||||
'git', 'show', f'{ref}:src/cephadm/cephadm',
|
||||
run.Raw('>'),
|
||||
ctx.cephadm,
|
||||
run.Raw('&&'),
|
||||
'ls', '-l', ctx.cephadm,
|
||||
],
|
||||
)
|
||||
# sanity-check the resulting file and set executable bit
|
||||
|
@@ -72,9 +72,6 @@ class CephFSTestCase(CephTestCase):
# Whether to create the default filesystem during setUp
REQUIRE_FILESYSTEM = True

# requires REQUIRE_FILESYSTEM = True
REQUIRE_RECOVERY_FILESYSTEM = False

# create a backup filesystem if required.
# required REQUIRE_FILESYSTEM enabled
REQUIRE_BACKUP_FILESYSTEM = False
@@ -192,20 +189,6 @@ class CephFSTestCase(CephTestCase):
self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
self.backup_fs.wait_for_daemons()

if self.REQUIRE_RECOVERY_FILESYSTEM:
if not self.REQUIRE_FILESYSTEM:
self.skipTest("Recovery filesystem requires a primary filesystem as well")
# After Octopus is EOL, we can remove this setting:
self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
'enable_multiple', 'true',
'--yes-i-really-mean-it')
self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
self.recovery_fs.set_metadata_overlay(True)
self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
self.recovery_fs.create()
self.recovery_fs.getinfo(refresh=True)
self.recovery_fs.wait_for_daemons()

# Load an config settings of interest
for setting in self.LOAD_SETTINGS:
setattr(self, setting, float(self.fs.mds_asok(
@@ -473,6 +473,17 @@ class MDSCluster(CephCluster):
for fs in self.status().get_filesystems():
Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()

@property
def beacon_timeout(self):
"""
Generate an acceptable timeout for the mons to drive some MDSMap change
because of missed beacons from some MDS. This involves looking up the
grace period in use by the mons and adding an acceptable buffer.
"""

grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
return grace*2+15


class Filesystem(MDSCluster):
"""
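
The beacon_timeout property added above just scales the mon-side grace period. A small worked example, assuming the default mds_beacon_grace of 15 seconds (hypothetical numbers, matching the grace*2+15 formula above):

def expected_beacon_timeout(grace=15.0):
    # grace*2 + 15 -> 45 seconds with the default grace; the failover and
    # snapshot tests below pass this value as timeout= to wait_until_true().
    return grace * 2 + 15

assert expected_beacon_timeout(15.0) == 45.0
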
@ -485,7 +496,6 @@ class Filesystem(MDSCluster):
|
||||
self.name = name
|
||||
self.id = None
|
||||
self.metadata_pool_name = None
|
||||
self.metadata_overlay = False
|
||||
self.data_pool_name = None
|
||||
self.data_pools = None
|
||||
self.fs_config = fs_config
|
||||
@ -539,11 +549,6 @@ class Filesystem(MDSCluster):
|
||||
self.get_pool_names(status = status, refresh = refresh)
|
||||
return status
|
||||
|
||||
def set_metadata_overlay(self, overlay):
|
||||
if self.id is not None:
|
||||
raise RuntimeError("cannot specify fscid when configuring overlay")
|
||||
self.metadata_overlay = overlay
|
||||
|
||||
def deactivate(self, rank):
|
||||
if rank < 0:
|
||||
raise RuntimeError("invalid rank")
|
||||
@ -644,7 +649,7 @@ class Filesystem(MDSCluster):
|
||||
target_size_ratio = 0.9
|
||||
target_size_ratio_ec = 0.9
|
||||
|
||||
def create(self):
|
||||
def create(self, recover=False, metadata_overlay=False):
|
||||
if self.name is None:
|
||||
self.name = "cephfs"
|
||||
if self.metadata_pool_name is None:
|
||||
@ -656,7 +661,7 @@ class Filesystem(MDSCluster):
|
||||
|
||||
# will use the ec pool to store the data and a small amount of
|
||||
# metadata still goes to the primary data pool for all files.
|
||||
if not self.metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile:
|
||||
if not metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile:
|
||||
self.target_size_ratio = 0.05
|
||||
|
||||
log.debug("Creating filesystem '{0}'".format(self.name))
|
||||
@ -683,16 +688,14 @@ class Filesystem(MDSCluster):
|
||||
else:
|
||||
raise
|
||||
|
||||
if self.metadata_overlay:
|
||||
self.mon_manager.raw_cluster_cmd('fs', 'new',
|
||||
self.name, self.metadata_pool_name, data_pool_name,
|
||||
'--allow-dangerous-metadata-overlay')
|
||||
else:
|
||||
self.mon_manager.raw_cluster_cmd('fs', 'new',
|
||||
self.name,
|
||||
self.metadata_pool_name,
|
||||
data_pool_name)
|
||||
args = ["fs", "new", self.name, self.metadata_pool_name, data_pool_name]
|
||||
if recover:
|
||||
args.append('--recover')
|
||||
if metadata_overlay:
|
||||
args.append('--allow-dangerous-metadata-overlay')
|
||||
self.mon_manager.raw_cluster_cmd(*args)
|
||||
|
||||
if not recover:
|
||||
if self.ec_profile and 'disabled' not in self.ec_profile:
|
||||
ec_data_pool_name = data_pool_name + "_ec"
|
||||
log.debug("EC profile is %s", self.ec_profile)
|
||||
@ -1070,6 +1073,9 @@ class Filesystem(MDSCluster):
|
||||
def rank_freeze(self, yes, rank=0):
|
||||
self.mon_manager.raw_cluster_cmd("mds", "freeze", "{}:{}".format(self.id, rank), str(yes).lower())
|
||||
|
||||
def rank_repaired(self, rank):
|
||||
self.mon_manager.raw_cluster_cmd("mds", "repaired", "{}:{}".format(self.id, rank))
|
||||
|
||||
def rank_fail(self, rank=0):
|
||||
self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
|
||||
|
||||
@ -1119,6 +1125,9 @@ class Filesystem(MDSCluster):
|
||||
if timeout is None:
|
||||
timeout = DAEMON_WAIT_TIMEOUT
|
||||
|
||||
if self.id is None:
|
||||
status = self.getinfo(refresh=True)
|
||||
|
||||
if status is None:
|
||||
status = self.status()
|
||||
|
||||
@ -1233,12 +1242,12 @@ class Filesystem(MDSCluster):
|
||||
out.append((rank, f(perf)))
|
||||
return out
|
||||
|
||||
def read_cache(self, path, depth=None):
|
||||
def read_cache(self, path, depth=None, rank=None):
|
||||
cmd = ["dump", "tree", path]
|
||||
if depth is not None:
|
||||
cmd.append(depth.__str__())
|
||||
result = self.mds_asok(cmd)
|
||||
if len(result) == 0:
|
||||
result = self.rank_asok(cmd, rank=rank)
|
||||
if result is None or len(result) == 0:
|
||||
raise RuntimeError("Path not found in cache: {0}".format(path))
|
||||
|
||||
return result
|
||||
@ -1623,6 +1632,9 @@ class Filesystem(MDSCluster):
|
||||
def get_scrub_status(self, rank=0):
|
||||
return self.run_scrub(["status"], rank)
|
||||
|
||||
def flush(self, rank=0):
|
||||
return self.rank_tell(["flush", "journal"], rank=rank)
|
||||
|
||||
def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
|
||||
timeout=300, reverse=False):
|
||||
# time out after "timeout" seconds and assume as done
|
||||
|
@ -28,7 +28,9 @@ class KernelMount(CephFSMount):
|
||||
client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt,
|
||||
cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
|
||||
|
||||
self.client_config = config
|
||||
self.rbytes = config.get('rbytes', False)
|
||||
self.snapdirname = config.get('snapdirname', '.snap')
|
||||
self.inst = None
|
||||
self.addr = None
|
||||
|
||||
@ -86,6 +88,8 @@ class KernelMount(CephFSMount):
|
||||
opts += ",rbytes"
|
||||
else:
|
||||
opts += ",norbytes"
|
||||
if self.snapdirname != '.snap':
|
||||
opts += f',snapdirname={self.snapdirname}'
|
||||
if mntopts:
|
||||
opts += ',' + ','.join(mntopts)
|
||||
|
||||
|
@ -3,6 +3,7 @@ import json
|
||||
import logging
|
||||
import errno
|
||||
import re
|
||||
import time
|
||||
from teuthology.contextutil import MaxWhileTries
|
||||
from teuthology.exceptions import CommandFailedError
|
||||
from teuthology.orchestra.run import wait
|
||||
@ -562,3 +563,99 @@ class TestDamage(CephFSTestCase):
|
||||
self.fs.mon_manager.raw_cluster_cmd(
|
||||
'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
|
||||
"damage", "rm", str(entry['id']))
|
||||
|
||||
def test_dentry_first_existing(self):
|
||||
"""
|
||||
That the MDS won't abort when the dentry is already known to be damaged.
|
||||
"""
|
||||
|
||||
def verify_corrupt():
|
||||
info = self.fs.read_cache("/a", 0)
|
||||
log.debug('%s', info)
|
||||
self.assertEqual(len(info), 1)
|
||||
dirfrags = info[0]['dirfrags']
|
||||
self.assertEqual(len(dirfrags), 1)
|
||||
dentries = dirfrags[0]['dentries']
|
||||
self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
|
||||
self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
|
||||
|
||||
self.mount_a.run_shell_payload("mkdir -p a/b")
|
||||
self.fs.flush()
|
||||
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
|
||||
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
|
||||
time.sleep(5) # for conf to percolate
|
||||
self.mount_a.run_shell_payload("mv a/b a/c; sync .")
|
||||
self.mount_a.umount()
|
||||
verify_corrupt()
|
||||
self.fs.fail()
|
||||
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
|
||||
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
|
||||
self.fs.set_joinable()
|
||||
status = self.fs.status()
|
||||
self.fs.flush()
|
||||
self.assertFalse(self.fs.status().hadfailover(status))
|
||||
verify_corrupt()
|
||||
|
||||
def test_dentry_first_preflush(self):
|
||||
"""
|
||||
That the MDS won't write a dentry with new damage to CDentry::first
|
||||
to the journal.
|
||||
"""
|
||||
|
||||
rank0 = self.fs.get_rank()
|
||||
self.fs.rank_freeze(True, rank=0)
|
||||
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
|
||||
self.fs.flush()
|
||||
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
|
||||
time.sleep(5) # for conf to percolate
|
||||
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
|
||||
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
|
||||
self.fs.rank_freeze(False, rank=0)
|
||||
self.delete_mds_coredump(rank0['name'])
|
||||
self.fs.mds_restart(rank0['name'])
|
||||
self.fs.wait_for_daemons()
|
||||
p.wait()
|
||||
self.mount_a.run_shell_payload("stat a/ && find a/")
|
||||
self.fs.flush()
|
||||
|
||||
def test_dentry_first_precommit(self):
|
||||
"""
|
||||
That the MDS won't write a dentry with new damage to CDentry::first
|
||||
to the directory object.
|
||||
"""
|
||||
|
||||
fscid = self.fs.id
|
||||
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
|
||||
self.mount_a.umount() # allow immediate scatter write back
|
||||
self.fs.flush()
|
||||
# now just twiddle some inode metadata on a regular file
|
||||
self.mount_a.mount_wait()
|
||||
self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
|
||||
self.mount_a.umount() # avoid journaling session related things
|
||||
# okay, now cause the dentry to get damaged after loading from the journal
|
||||
self.fs.fail()
|
||||
self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
|
||||
time.sleep(5) # for conf to percolate
|
||||
self.fs.set_joinable()
|
||||
self.fs.wait_for_daemons()
|
||||
rank0 = self.fs.get_rank()
|
||||
self.fs.rank_freeze(True, rank=0)
|
||||
# so now we want to trigger commit but this will crash, so:
|
||||
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
|
||||
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
|
||||
self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
|
||||
self.fs.rank_freeze(False, rank=0)
|
||||
self.delete_mds_coredump(rank0['name'])
|
||||
self.fs.mds_restart(rank0['name'])
|
||||
self.fs.wait_for_daemons()
|
||||
try:
|
||||
p.wait()
|
||||
except CommandFailedError as e:
|
||||
print(e)
|
||||
else:
|
||||
self.fail("flush journal should fail!")
|
||||
self.mount_a.mount_wait()
|
||||
self.mount_a.run_shell_payload("stat a/ && find a/")
|
||||
self.fs.flush()
|
||||
|
@ -368,6 +368,7 @@ class TestDataScan(CephFSTestCase):
|
||||
self.fs.data_scan(["init"])
|
||||
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
|
||||
self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
|
||||
self.fs.data_scan(["scan_links"])
|
||||
|
||||
# Mark the MDS repaired
|
||||
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
|
||||
|
@ -319,8 +319,6 @@ class TestFailover(CephFSTestCase):
|
||||
# Kill the rank 0 daemon's physical process
|
||||
self.fs.mds_stop(original_active)
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
# Wait until the monitor promotes his replacement
|
||||
def promoted():
|
||||
active = self.fs.get_active_names()
|
||||
@ -328,9 +326,7 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
log.info("Waiting for promotion of one of the original standbys {0}".format(
|
||||
original_standbys))
|
||||
self.wait_until_true(
|
||||
promoted,
|
||||
timeout=grace*2)
|
||||
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
|
||||
|
||||
# Start the original rank 0 daemon up again, see that he becomes a standby
|
||||
self.fs.mds_restart(original_active)
|
||||
@ -352,8 +348,6 @@ class TestFailover(CephFSTestCase):
|
||||
if not require_active:
|
||||
self.skipTest("fuse_require_active_mds is not set")
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
# Check it's not laggy to begin with
|
||||
(original_active, ) = self.fs.get_active_names()
|
||||
self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
|
||||
@ -376,7 +370,7 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
return True
|
||||
|
||||
self.wait_until_true(laggy, grace * 2)
|
||||
self.wait_until_true(laggy, self.fs.beacon_timeout)
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.mounts[0].mount_wait()
|
||||
|
||||
@ -388,8 +382,6 @@ class TestFailover(CephFSTestCase):
|
||||
# Need all my standbys up as well as the active daemons
|
||||
self.wait_for_daemon_start()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
standbys = self.mds_cluster.get_standby_daemons()
|
||||
self.assertGreaterEqual(len(standbys), 1)
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
|
||||
@ -397,8 +389,7 @@ class TestFailover(CephFSTestCase):
|
||||
# Kill a standby and check for warning
|
||||
victim = standbys.pop()
|
||||
self.fs.mds_stop(victim)
|
||||
log.info("waiting for insufficient standby daemon warning")
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
|
||||
|
||||
# restart the standby, see that he becomes a standby, check health clears
|
||||
self.fs.mds_restart(victim)
|
||||
@ -412,8 +403,7 @@ class TestFailover(CephFSTestCase):
|
||||
standbys = self.mds_cluster.get_standby_daemons()
|
||||
self.assertGreaterEqual(len(standbys), 1)
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
|
||||
log.info("waiting for insufficient standby daemon warning")
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
|
||||
|
||||
# Set it to 0
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
|
||||
@ -429,7 +419,6 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
self.mount_a.umount_wait()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
|
||||
|
||||
mds_0 = self.fs.get_rank(rank=0, status=status)
|
||||
@ -437,7 +426,7 @@ class TestFailover(CephFSTestCase):
|
||||
self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
|
||||
self.wait_until_true(
|
||||
lambda: "laggy_since" in self.fs.get_rank(),
|
||||
timeout=grace * 2
|
||||
timeout=self.fs.beacon_timeout
|
||||
)
|
||||
|
||||
self.fs.rank_fail(rank=1)
|
||||
@ -450,7 +439,7 @@ class TestFailover(CephFSTestCase):
|
||||
self.fs.rank_signal(signal.SIGCONT, rank=0)
|
||||
self.wait_until_true(
|
||||
lambda: "laggy_since" not in self.fs.get_rank(rank=0),
|
||||
timeout=grace * 2
|
||||
timeout=self.fs.beacon_timeout
|
||||
)
|
||||
|
||||
# mds.b will be stuck at 'reconnect' state if snapserver gets confused
|
||||
|
@ -129,7 +129,7 @@ class TestForwardScrub(CephFSTestCase):
|
||||
# Umount before flush to avoid cap releases putting
|
||||
# things we don't want in the journal later.
|
||||
self.mount_a.umount_wait()
|
||||
self.fs.mds_asok(["flush", "journal"])
|
||||
self.fs.flush()
|
||||
|
||||
# Create a new inode that's just in the log, i.e. would
|
||||
# look orphaned to backward scan if backward scan wisnae
|
||||
@ -163,7 +163,7 @@ class TestForwardScrub(CephFSTestCase):
|
||||
|
||||
# Run a tagging forward scrub
|
||||
tag = "mytag123"
|
||||
self.fs.mds_asok(["tag", "path", "/parent", tag])
|
||||
self.fs.rank_asok(["tag", "path", "/parent", tag])
|
||||
|
||||
# See that the orphan wisnae tagged
|
||||
self.assertUntagged(inos['./parent/flushed/bravo'])
|
||||
@ -175,14 +175,21 @@ class TestForwardScrub(CephFSTestCase):
|
||||
# See that journalled-but-not-flushed file *was* tagged
|
||||
self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
|
||||
|
||||
# Run cephfs-data-scan targeting only orphans
|
||||
# okay, now we are going to run cephfs-data-scan. It's necessary to
|
||||
# have a clean journal otherwise replay will blowup on mismatched
|
||||
# inotable versions (due to scan_links)
|
||||
self.fs.flush()
|
||||
self.fs.fail()
|
||||
self.fs.journal_tool(["journal", "reset", "--force"], 0)
|
||||
|
||||
# Run cephfs-data-scan targeting only orphans
|
||||
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
|
||||
self.fs.data_scan([
|
||||
"scan_inodes",
|
||||
"--filter-tag", tag,
|
||||
self.fs.get_data_pool_name()
|
||||
])
|
||||
self.fs.data_scan(["scan_links"])
|
||||
|
||||
# After in-place injection stats should be kosher again
|
||||
self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
|
||||
|
@ -317,3 +317,43 @@ class TestFragmentation(CephFSTestCase):
|
||||
lambda: _count_fragmented() > 0,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
def test_dir_merge_with_snap_items(self):
|
||||
"""
|
||||
That directory remain fragmented when snapshot items are taken into account.
|
||||
"""
|
||||
split_size = 1000
|
||||
merge_size = 100
|
||||
self._configure(
|
||||
mds_bal_split_size=split_size,
|
||||
mds_bal_merge_size=merge_size,
|
||||
mds_bal_split_bits=1
|
||||
)
|
||||
|
||||
# split the dir
|
||||
create_files = split_size + 50
|
||||
self.mount_a.create_n_files("splitdir/file_", create_files)
|
||||
|
||||
self.wait_until_true(
|
||||
lambda: self.get_splits() == 1,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
frags = self.get_dir_ino("/splitdir")['dirfrags']
|
||||
self.assertEqual(len(frags), 2)
|
||||
self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
|
||||
self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
|
||||
self.assertEqual(
|
||||
sum([len(f['dentries']) for f in frags]), create_files
|
||||
)
|
||||
|
||||
self.assertEqual(self.get_merges(), 0)
|
||||
|
||||
self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_a"])
|
||||
self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_b"])
|
||||
self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
|
||||
|
||||
time.sleep(30)
|
||||
|
||||
self.assertEqual(self.get_merges(), 0)
|
||||
self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 2)
|
||||
|
@ -16,11 +16,7 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
|
||||
|
||||
|
||||
class OverlayWorkload(object):
|
||||
def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
|
||||
self._orig_fs = orig_fs
|
||||
self._recovery_fs = recovery_fs
|
||||
self._orig_mount = orig_mount
|
||||
self._recovery_mount = recovery_mount
|
||||
def __init__(self):
|
||||
self._initial_state = None
|
||||
|
||||
# Accumulate backtraces for every failed validation, and return them. Backtraces
|
||||
@ -51,41 +47,40 @@ class OverlayWorkload(object):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def damage(self):
|
||||
def damage(self, fs):
|
||||
"""
|
||||
Damage the filesystem pools in ways that will be interesting to recover from. By
|
||||
default just wipe everything in the metadata pool
|
||||
"""
|
||||
|
||||
pool = self._orig_fs.get_metadata_pool_name()
|
||||
self._orig_fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
|
||||
pool = fs.get_metadata_pool_name()
|
||||
fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
|
||||
|
||||
def flush(self):
|
||||
def flush(self, fs):
|
||||
"""
|
||||
Called after client unmount, after write: flush whatever you want
|
||||
"""
|
||||
self._orig_fs.mds_asok(["flush", "journal"])
|
||||
self._recovery_fs.mds_asok(["flush", "journal"])
|
||||
fs.rank_asok(["flush", "journal"])
|
||||
|
||||
|
||||
class SimpleOverlayWorkload(OverlayWorkload):
|
||||
"""
|
||||
Single file, single directory, check that it gets recovered and so does its size
|
||||
"""
|
||||
def write(self):
|
||||
self._orig_mount.run_shell(["mkdir", "subdir"])
|
||||
self._orig_mount.write_n_mb("subdir/sixmegs", 6)
|
||||
self._initial_state = self._orig_mount.stat("subdir/sixmegs")
|
||||
def write(self, mount):
|
||||
mount.run_shell(["mkdir", "subdir"])
|
||||
mount.write_n_mb("subdir/sixmegs", 6)
|
||||
self._initial_state = mount.stat("subdir/sixmegs")
|
||||
|
||||
def validate(self):
|
||||
self._recovery_mount.run_shell(["ls", "subdir"])
|
||||
st = self._recovery_mount.stat("subdir/sixmegs")
|
||||
def validate(self, recovery_mount):
|
||||
recovery_mount.run_shell(["ls", "subdir"])
|
||||
st = recovery_mount.stat("subdir/sixmegs")
|
||||
self.assert_equal(st['st_size'], self._initial_state['st_size'])
|
||||
return self._errors
|
||||
|
||||
class TestRecoveryPool(CephFSTestCase):
|
||||
MDSS_REQUIRED = 2
|
||||
CLIENTS_REQUIRED = 2
|
||||
CLIENTS_REQUIRED = 1
|
||||
REQUIRE_RECOVERY_FILESYSTEM = True
|
||||
|
||||
def is_marked_damaged(self, rank):
|
||||
@ -100,95 +95,77 @@ class TestRecoveryPool(CephFSTestCase):
|
||||
|
||||
# First, inject some files
|
||||
|
||||
workload.write()
|
||||
workload.write(self.mount_a)
|
||||
|
||||
# Unmount the client and flush the journal: the tool should also cope with
|
||||
# situations where there is dirty metadata, but we'll test that separately
|
||||
self.mount_a.umount_wait()
|
||||
self.mount_b.umount_wait()
|
||||
workload.flush()
|
||||
|
||||
# Create the alternate pool if requested
|
||||
recovery_fs = self.recovery_fs.name
|
||||
recovery_pool = self.recovery_fs.get_metadata_pool_name()
|
||||
self.recovery_fs.data_scan(['init', '--force-init',
|
||||
'--filesystem', recovery_fs,
|
||||
'--alternate-pool', recovery_pool])
|
||||
self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
|
||||
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
|
||||
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
|
||||
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
|
||||
|
||||
# Stop the MDS
|
||||
self.fs.mds_stop() # otherwise MDS will join once the fs is reset
|
||||
workload.flush(self.fs)
|
||||
self.fs.fail()
|
||||
|
||||
# After recovery, we need the MDS to not be strict about stats (in production these options
|
||||
# are off by default, but in QA we need to explicitly disable them)
|
||||
# Note: these have to be written to ceph.conf to override existing ceph.conf values.
|
||||
self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
|
||||
self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
|
||||
self.fs.mds_restart()
|
||||
|
||||
# Apply any data damage the workload wants
|
||||
workload.damage()
|
||||
workload.damage(self.fs)
|
||||
|
||||
# Create the alternate pool if requested
|
||||
recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
|
||||
recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
|
||||
recovery_fs.create(recover=True, metadata_overlay=True)
|
||||
|
||||
recovery_pool = recovery_fs.get_metadata_pool_name()
|
||||
recovery_fs.mon_manager.raw_cluster_cmd('-s')
|
||||
|
||||
# Reset the MDS map in case multiple ranks were in play: recovery procedure
|
||||
# only understands how to rebuild metadata under rank 0
|
||||
self.fs.reset()
|
||||
|
||||
self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
|
||||
self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
|
||||
self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
|
||||
#self.fs.reset()
|
||||
#self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
|
||||
#self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
|
||||
#self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
|
||||
|
||||
# Run the recovery procedure
|
||||
recovery_fs.data_scan(['init', '--force-init',
|
||||
'--filesystem', recovery_fs.name,
|
||||
'--alternate-pool', recovery_pool])
|
||||
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "session"])
|
||||
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "snap"])
|
||||
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "inode"])
|
||||
if False:
|
||||
with self.assertRaises(CommandFailedError):
|
||||
# Normal reset should fail when no objects are present, we'll use --force instead
|
||||
self.fs.journal_tool(["journal", "reset"], 0)
|
||||
|
||||
self.fs.data_scan(['scan_extents', '--alternate-pool',
|
||||
recovery_fs.data_scan(['scan_extents', '--alternate-pool',
|
||||
recovery_pool, '--filesystem', self.fs.name,
|
||||
self.fs.get_data_pool_name()])
|
||||
self.fs.data_scan(['scan_inodes', '--alternate-pool',
|
||||
recovery_fs.data_scan(['scan_inodes', '--alternate-pool',
|
||||
recovery_pool, '--filesystem', self.fs.name,
|
||||
'--force-corrupt', '--force-init',
|
||||
self.fs.get_data_pool_name()])
|
||||
self.fs.journal_tool(['event', 'recover_dentries', 'list',
|
||||
recovery_fs.data_scan(['scan_links', '--filesystem', recovery_fs.name])
|
||||
recovery_fs.journal_tool(['event', 'recover_dentries', 'list',
|
||||
'--alternate-pool', recovery_pool], 0)
|
||||
|
||||
self.fs.data_scan(['init', '--force-init', '--filesystem',
|
||||
self.fs.name])
|
||||
self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
|
||||
'--force-corrupt', '--force-init',
|
||||
self.fs.get_data_pool_name()])
|
||||
self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
|
||||
|
||||
self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
|
||||
self.fs.journal_tool(['journal', 'reset', '--force'], 0)
|
||||
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
|
||||
recovery_fs + ":0")
|
||||
|
||||
# Mark the MDS repaired
|
||||
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
|
||||
recovery_fs.journal_tool(["journal", "reset", "--force"], 0)
|
||||
|
||||
# Start the MDS
|
||||
self.fs.mds_restart()
|
||||
self.fs.set_joinable()
|
||||
self.recovery_fs.mds_restart()
|
||||
self.fs.wait_for_daemons()
|
||||
self.recovery_fs.wait_for_daemons()
|
||||
status = self.recovery_fs.status()
|
||||
for rank in self.recovery_fs.get_ranks(status=status):
|
||||
self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
|
||||
'injectargs', '--debug-mds=20')
|
||||
self.fs.rank_tell(['scrub', 'start', '/', 'recursive,repair'], rank=rank['rank'], status=status)
|
||||
log.info(str(self.mds_cluster.status()))
|
||||
recovery_fs.set_joinable()
|
||||
status = recovery_fs.wait_for_daemons()
|
||||
|
||||
self.config_set('mds', 'debug_mds', '20')
|
||||
for rank in recovery_fs.get_ranks(status=status):
|
||||
recovery_fs.rank_tell(['scrub', 'start', '/', 'force,recursive,repair'], rank=rank['rank'], status=status)
|
||||
log.info(str(recovery_fs.status()))
|
||||
|
||||
# Mount a client
|
||||
self.mount_a.mount_wait()
|
||||
self.mount_b.mount_wait(cephfs_name=recovery_fs)
|
||||
self.mount_a.mount_wait(cephfs_name=recovery_fs.name)
|
||||
|
||||
# See that the files are present and correct
|
||||
errors = workload.validate()
|
||||
errors = workload.validate(self.mount_a)
|
||||
if errors:
|
||||
log.error("Validation errors found: {0}".format(len(errors)))
|
||||
for e in errors:
|
||||
@ -199,5 +176,4 @@ class TestRecoveryPool(CephFSTestCase):
|
||||
))
|
||||
|
||||
def test_rebuild_simple(self):
|
||||
self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
|
||||
self.mount_a, self.mount_b))
|
||||
self._rebuild_metadata(SimpleOverlayWorkload())
|
||||
|
@ -176,3 +176,12 @@ class TestScrub(CephFSTestCase):
|
||||
|
||||
def test_scrub_dup_inode(self):
|
||||
self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))
|
||||
|
||||
def test_mdsdir_scrub_backtrace(self):
|
||||
damage_count = self._get_damage_count()
|
||||
self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks'])
|
||||
|
||||
out_json = self.fs.run_scrub(["start", "~mdsdir", "recursive"])
|
||||
self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
|
||||
self.assertEqual(self._get_damage_count(), damage_count)
|
||||
self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks'])
|
||||
|
@ -139,8 +139,7 @@ done
|
||||
|
||||
# resume and verify
|
||||
self._resume_scrub(0)
|
||||
out_json = self.fs.get_scrub_status()
|
||||
self.assertTrue("no active" in out_json['status'])
|
||||
self.assertTrue(self.fs.wait_until_scrub_complete(sleep=5, timeout=30))
|
||||
|
||||
checked = self._check_task_status_na()
|
||||
self.assertTrue(checked)
|
||||
@ -168,15 +167,13 @@ done
|
||||
# Kill the rank 0
|
||||
self.fs.mds_stop(original_active)
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
def promoted():
|
||||
active = self.fs.get_active_names()
|
||||
return active and active[0] in original_standbys
|
||||
|
||||
log.info("Waiting for promotion of one of the original standbys {0}".format(
|
||||
original_standbys))
|
||||
self.wait_until_true(promoted, timeout=grace*2)
|
||||
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
|
||||
|
||||
self._check_task_status_na()
|
||||
|
||||
|
@ -19,7 +19,7 @@ def seconds_upto_next_schedule(time_from, timo):
|
||||
ts = int(time_from)
|
||||
return ((int(ts / 60) * 60) + timo) - ts
|
||||
|
||||
class TestSnapSchedules(CephFSTestCase):
|
||||
class TestSnapSchedulesHelper(CephFSTestCase):
|
||||
CLIENTS_REQUIRED = 1
|
||||
|
||||
TEST_VOLUME_NAME = 'snap_vol'
|
||||
@ -54,7 +54,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
result = json.loads(self._fs_cmd("volume", "ls"))
|
||||
if len(result) == 0:
|
||||
self.vol_created = True
|
||||
self.volname = TestSnapSchedules.TEST_VOLUME_NAME
|
||||
self.volname = TestSnapSchedulesHelper.TEST_VOLUME_NAME
|
||||
self._fs_cmd("volume", "create", self.volname)
|
||||
else:
|
||||
self.volname = result[0]['name']
|
||||
@ -69,7 +69,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True)
|
||||
|
||||
def setUp(self):
|
||||
super(TestSnapSchedules, self).setUp()
|
||||
super(TestSnapSchedulesHelper, self).setUp()
|
||||
self.volname = None
|
||||
self.vol_created = False
|
||||
self._create_or_reuse_test_volume()
|
||||
@ -84,7 +84,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
if self.vol_created:
|
||||
self._delete_test_volume()
|
||||
self._disable_snap_schedule()
|
||||
super(TestSnapSchedules, self).tearDown()
|
||||
super(TestSnapSchedulesHelper, self).tearDown()
|
||||
|
||||
def _schedule_to_timeout(self, schedule):
|
||||
mult = schedule[-1]
|
||||
@ -115,7 +115,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
|
||||
def verify(self, dir_path, max_trials):
|
||||
trials = 0
|
||||
snap_path = "{0}/.snap".format(dir_path)
|
||||
snap_path = f'{dir_path}/.snap'
|
||||
while (len(self.create_cbks) or len(self.remove_cbks)) and trials < max_trials:
|
||||
snapshots = set(self.mount_a.ls(path=snap_path))
|
||||
added = snapshots - self.snapshots
|
||||
@ -143,7 +143,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
|
||||
# expected "scheduled" snapshot name
|
||||
ts_name = (datetime.utcfromtimestamp(snap_sched_exec_epoch)
|
||||
+ timedelta(seconds=wait_timo)).strftime(TestSnapSchedules.SNAPSHOT_TS_FORMAT)
|
||||
+ timedelta(seconds=wait_timo)).strftime(TestSnapSchedulesHelper.SNAPSHOT_TS_FORMAT)
|
||||
return (wait_timo, ts_name)
|
||||
|
||||
def verify_schedule(self, dir_path, schedules, retentions=[]):
|
||||
@ -157,7 +157,8 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
self.assertTrue(schedule in json_res['schedule'])
|
||||
for retention in retentions:
|
||||
self.assertTrue(retention in json_res['retention'])
|
||||
|
||||
|
||||
class TestSnapSchedules(TestSnapSchedulesHelper):
|
||||
def remove_snapshots(self, dir_path):
|
||||
snap_path = f'{dir_path}/.snap'
|
||||
|
||||
@ -351,7 +352,7 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
snap_path = f"{dir_path}/.snap"[1:]
|
||||
snapshots = self.mount_a.ls(path=snap_path)
|
||||
fs_count = len(snapshots)
|
||||
log.debug(f'snapshots: {snapshots}');
|
||||
log.debug(f'snapshots: {snapshots}')
|
||||
|
||||
result = self.fs_snap_schedule_cmd('status', path=dir_path,
|
||||
format='json')
|
||||
@ -445,4 +446,50 @@ class TestSnapSchedules(CephFSTestCase):
|
||||
# cleanup
|
||||
self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M')
|
||||
self.remove_snapshots(testdir[1:])
|
||||
self.mount_a.run_shell(['rmdir', testdir[1:]])
|
||||
self.mount_a.run_shell(['rmdir', testdir[1:]])
|
||||
|
||||
class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper):
|
||||
def remove_snapshots(self, dir_path, sdn):
|
||||
snap_path = f'{dir_path}/{sdn}'
|
||||
|
||||
snapshots = self.mount_a.ls(path=snap_path)
|
||||
for snapshot in snapshots:
|
||||
snapshot_path = os.path.join(snap_path, snapshot)
|
||||
log.debug(f'removing snapshot: {snapshot_path}')
|
||||
self.mount_a.run_shell(['rmdir', snapshot_path])
|
||||
|
||||
def get_snap_dir_name(self):
|
||||
from tasks.cephfs.fuse_mount import FuseMount
|
||||
from tasks.cephfs.kernel_mount import KernelMount
|
||||
|
||||
if isinstance(self.mount_a, KernelMount):
|
||||
sdn = self.mount_a.client_config.get('snapdirname', '.snap')
|
||||
elif isinstance(self.mount_a, FuseMount):
|
||||
sdn = self.mount_a.client_config.get('client_snapdir', '.snap')
|
||||
self.fs.set_ceph_conf('client', 'client snapdir', sdn)
|
||||
self.mount_a.remount()
|
||||
return sdn
|
||||
|
||||
def test_snap_dir_name(self):
|
||||
"""Test the correctness of snap directory name"""
|
||||
self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedulesSnapdir.TEST_DIRECTORY])
|
||||
|
||||
# set a schedule on the dir
|
||||
self.fs_snap_schedule_cmd('add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, snap_schedule='1M')
|
||||
self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, retention_spec_or_period='1M')
|
||||
exec_time = time.time()
|
||||
|
||||
timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
|
||||
sdn = self.get_snap_dir_name()
|
||||
log.info(f'expecting snap {TestSnapSchedulesSnapdir.TEST_DIRECTORY}/{sdn}/scheduled-{snap_sfx} in ~{timo}s...')
|
||||
|
||||
# verify snapshot schedule
|
||||
self.verify_schedule(TestSnapSchedulesSnapdir.TEST_DIRECTORY, ['1M'], retentions=[{'M':1}])
|
||||
|
||||
# remove snapshot schedule
|
||||
self.fs_snap_schedule_cmd('remove', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY)
|
||||
|
||||
# remove all scheduled snapshots
|
||||
self.remove_snapshots(TestSnapSchedulesSnapdir.TEST_DIRECTORY, sdn)
|
||||
|
||||
self.mount_a.run_shell(['rmdir', TestSnapSchedulesSnapdir.TEST_DIRECTORY])
|
||||
|
@ -69,8 +69,6 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.set_max_mds(2)
|
||||
status = self.fs.wait_for_daemons()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
# setup subtrees
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
|
||||
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
|
||||
@ -91,7 +89,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=0)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
|
||||
self.fs.rank_fail(rank=0)
|
||||
@ -119,7 +117,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=1) # prevent failover...
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
|
||||
self.fs.rank_signal(signal.SIGKILL, rank=1)
|
||||
@ -167,7 +165,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=1) # prevent failover...
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank1['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
@ -209,7 +207,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank1['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
@ -222,7 +220,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.wait_for_daemon_start([rank1['name']])
|
||||
|
||||
# rollback triggers assertion
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
self.fs.rank_fail(rank=0)
|
||||
self.fs.mds_restart(rank0['name'])
|
||||
@ -243,8 +241,6 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.set_max_mds(3)
|
||||
status = self.fs.wait_for_daemons()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
|
||||
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
|
||||
@ -301,7 +297,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=2)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank2['name']);
|
||||
|
||||
# mksnap should wait for notify ack from mds.2
|
||||
@ -327,7 +323,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
|
||||
last_created = self._get_last_created_snap(rank=0)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank2['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
@ -537,3 +533,62 @@ class TestSnapshots(CephFSTestCase):
|
||||
# after reducing limit we expect the new snapshot creation to fail
|
||||
pass
|
||||
self.delete_dir_and_snaps("accounts", new_limit + 1)
|
||||
|
||||
|
||||
class TestMonSnapsAndFsPools(CephFSTestCase):
|
||||
MDSS_REQUIRED = 3
|
||||
|
||||
def test_disallow_monitor_managed_snaps_for_fs_pools(self):
|
||||
"""
|
||||
Test that creation of monitor managed snaps fails for pools attached
|
||||
to any file-system
|
||||
"""
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.fs.rados(["mksnap", "snap1"], pool=self.fs.get_data_pool_name())
|
||||
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.fs.rados(["mksnap", "snap2"], pool=self.fs.get_metadata_pool_name())
|
||||
|
||||
def test_attaching_pools_with_snaps_to_fs_fails(self):
|
||||
"""
|
||||
Test that attempt to attach pool with snapshots to an fs fails
|
||||
"""
|
||||
test_pool_name = 'snap-test-pool'
|
||||
base_cmd = f'osd pool create {test_pool_name}'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
self.assertEqual(ret, 0)
|
||||
|
||||
self.fs.rados(["mksnap", "snap3"], pool=test_pool_name)
|
||||
|
||||
base_cmd = f'fs add_data_pool {self.fs.name} {test_pool_name}'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
self.assertEqual(ret, errno.EOPNOTSUPP)
|
||||
|
||||
# cleanup
|
||||
self.fs.rados(["rmsnap", "snap3"], pool=test_pool_name)
|
||||
base_cmd = f'osd pool delete {test_pool_name}'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
|
||||
def test_using_pool_with_snap_fails_fs_creation(self):
|
||||
"""
|
||||
Test that using a pool with snaps for fs creation fails
|
||||
"""
|
||||
base_cmd = 'osd pool create test_data_pool'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
self.assertEqual(ret, 0)
|
||||
base_cmd = 'osd pool create test_metadata_pool'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
self.assertEqual(ret, 0)
|
||||
|
||||
self.fs.rados(["mksnap", "snap4"], pool='test_data_pool')
|
||||
|
||||
base_cmd = 'fs new testfs test_metadata_pool test_data_pool'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
self.assertEqual(ret, errno.EOPNOTSUPP)
|
||||
|
||||
# cleanup
|
||||
self.fs.rados(["rmsnap", "snap4"], pool='test_data_pool')
|
||||
base_cmd = 'osd pool delete test_data_pool'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
base_cmd = 'osd pool delete test_metadata_pool'
|
||||
ret = self.run_cluster_cmd_result(base_cmd)
|
||||
|
@@ -9,7 +9,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
FSID='00000000-0000-0000-0000-0000deadbeef'

# images that are used
IMAGE_MASTER=${IMAGE_MASTER:-'quay.ceph.io/ceph-ci/ceph:master'}
IMAGE_MAIN=${IMAGE_MAIN:-'quay.ceph.io/ceph-ci/ceph:main'}
IMAGE_PACIFIC=${IMAGE_PACIFIC:-'quay.ceph.io/ceph-ci/ceph:pacific'}
#IMAGE_OCTOPUS=${IMAGE_OCTOPUS:-'quay.ceph.io/ceph-ci/ceph:octopus'}
IMAGE_DEFAULT=${IMAGE_PACIFIC}
@@ -168,7 +168,7 @@ $SUDO CEPHADM_IMAGE=$IMAGE_PACIFIC $CEPHADM_BIN version \
#$SUDO CEPHADM_IMAGE=$IMAGE_OCTOPUS $CEPHADM_BIN version
#$SUDO CEPHADM_IMAGE=$IMAGE_OCTOPUS $CEPHADM_BIN version \
# | grep 'ceph version 15'
$SUDO $CEPHADM_BIN --image $IMAGE_MASTER version | grep 'ceph version'
$SUDO $CEPHADM_BIN --image $IMAGE_MAIN version | grep 'ceph version'

# try force docker; this won't work if docker isn't installed
systemctl status docker > /dev/null && ( $CEPHADM --docker version | grep 'ceph version' ) || echo "docker not installed"
@@ -24,6 +24,11 @@ for f in $(find $TESTDIR/archive/coredump -type f); do
fi
done

# ceph-crash runs as the unprivileged "ceph" user, but when under test
# the ceph osd daemons are running as root, so their crash files aren't
# readable. let's chown them so they behave as they would in real life.
sudo chown -R ceph:ceph /var/lib/ceph/crash

# let daemon find crashdumps on startup
sudo systemctl restart ceph-crash
sleep 30
@ -387,4 +387,49 @@ if [ -n "${COOKIE}" ]; then
|
||||
unmap_device ${DEV} ${PID}
|
||||
fi
|
||||
|
||||
# test discard granularity with journaling
|
||||
rbd config image set ${POOL}/${IMAGE} rbd_discard_granularity_bytes 4096
|
||||
rbd feature enable ${POOL}/${IMAGE} journaling
|
||||
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
|
||||
get_pid ${POOL}
|
||||
# since a discard will now be pruned to only whole blocks (0..4095, 4096..8191)
|
||||
# let us test all the cases around those alignments. 512 is the smallest
|
||||
# possible block blkdiscard allows us to use. Thus the test checks
|
||||
# 512 before, on the alignment, 512 after.
|
||||
_sudo blkdiscard --offset 0 --length $((4096-512)) ${DEV}
|
||||
_sudo blkdiscard --offset 0 --length 4096 ${DEV}
|
||||
_sudo blkdiscard --offset 0 --length $((4096+512)) ${DEV}
|
||||
_sudo blkdiscard --offset 512 --length $((8192-1024)) ${DEV}
|
||||
_sudo blkdiscard --offset 512 --length $((8192-512)) ${DEV}
|
||||
_sudo blkdiscard --offset 512 --length 8192 ${DEV}
|
||||
# wait for commit log to be empty, 10 seconds should be well enough
|
||||
tries=0
|
||||
queue_length=`rbd journal inspect --pool ${POOL} --image ${IMAGE} | awk '/entries inspected/ {print $1}'`
|
||||
while [ ${tries} -lt 10 ] && [ ${queue_length} -gt 0 ]; do
|
||||
rbd journal inspect --pool ${POOL} --image ${IMAGE} --verbose
|
||||
sleep 1
|
||||
queue_length=`rbd journal inspect --pool ${POOL} --image ${IMAGE} | awk '/entries inspected/ {print $1}'`
|
||||
tries=$((tries+1))
|
||||
done
|
||||
[ ${queue_length} -eq 0 ]
|
||||
unmap_device ${DEV} ${PID}
|
||||
DEV=
|
||||
rbd feature disable ${POOL}/${IMAGE} journaling
|
||||
rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes
|
||||
|
||||
# test that rbd_op_threads setting takes effect
|
||||
EXPECTED=`ceph-conf --show-config-value librados_thread_count`
|
||||
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
|
||||
get_pid ${POOL}
|
||||
ACTUAL=`ps -p ${PID} -T | grep -c io_context_pool`
|
||||
[ ${ACTUAL} -eq ${EXPECTED} ]
|
||||
unmap_device ${DEV} ${PID}
|
||||
EXPECTED=$((EXPECTED * 3 + 1))
|
||||
DEV=`_sudo rbd device --device-type nbd --rbd-op-threads ${EXPECTED} map ${POOL}/${IMAGE}`
|
||||
get_pid ${POOL}
|
||||
ACTUAL=`ps -p ${PID} -T | grep -c io_context_pool`
|
||||
[ ${ACTUAL} -eq ${EXPECTED} ]
|
||||
unmap_device ${DEV} ${PID}
|
||||
DEV=
|
||||
|
||||
echo OK
|
||||
|
@ -24,8 +24,13 @@ start_mirrors ${CLUSTER1}
|
||||
start_mirrors ${CLUSTER2}
|
||||
|
||||
testlog "TEST: verify rx-only direction"
|
||||
[ "$(rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format xml |
|
||||
${XMLSTARLET} sel -t -v '//mirror/peers/peer[1]/uuid')" = "" ]
|
||||
# rx-only peer is added immediately by "rbd mirror pool peer bootstrap import"
|
||||
rbd --cluster ${CLUSTER2} --pool ${POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-only"'
|
||||
# tx-only peer is added asynchronously by mirror_peer_ping class method
|
||||
while ! rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format json | jq -e '.peers | length > 0'; do
|
||||
sleep 1
|
||||
done
|
||||
rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format json | jq -e '.peers[0].direction == "tx-only"'
|
||||
|
||||
create_image_and_enable_mirror ${CLUSTER1} ${POOL} image1
|
||||
|
||||
@ -34,6 +39,10 @@ write_image ${CLUSTER1} ${POOL} image1 100
|
||||
wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} image1
|
||||
|
||||
testlog "TEST: verify rx-tx direction"
|
||||
# both rx-tx peers are added immediately by "rbd mirror pool peer bootstrap import"
|
||||
rbd --cluster ${CLUSTER1} --pool ${PARENT_POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-tx"'
|
||||
rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-tx"'
|
||||
|
||||
create_image ${CLUSTER1} ${PARENT_POOL} image1
|
||||
create_image ${CLUSTER2} ${PARENT_POOL} image2
|
||||
|
||||
|
@@ -1,2 +1,2 @@
3cf40e2dca667f68c6ce3ff5cd94f01e711af894
16.2.11
5a2d516ce4b134bfafc80c4274532ac0d56fc1e2
16.2.12
@ -3,8 +3,10 @@
|
||||
# vim: ts=4 sw=4 smarttab expandtab
|
||||
|
||||
import argparse
|
||||
import grp
|
||||
import logging
|
||||
import os
|
||||
import pwd
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
@ -18,6 +20,7 @@ auth_names = ['client.crash.%s' % socket.gethostname(),
|
||||
'client.crash',
|
||||
'client.admin']
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
@ -29,7 +32,8 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
'--name', '-n',
|
||||
help='ceph name to authenticate as (default: try client.crash, client.admin)')
|
||||
help='ceph name to authenticate as '
|
||||
'(default: try client.crash, client.admin)')
|
||||
parser.add_argument(
|
||||
'--log-level', '-l',
|
||||
help='log level output (default: INFO), support INFO or DEBUG')
|
||||
@ -48,7 +52,8 @@ def post_crash(path):
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
f = open(os.path.join(path, 'meta'), 'rb')
|
||||
stderr = pr.communicate(input=f.read())
|
||||
(_, stderr) = pr.communicate(input=f.read())
|
||||
stderr = stderr.decode()
|
||||
rc = pr.wait()
|
||||
f.close()
|
||||
if rc != 0 or stderr != "":
|
||||
@ -61,6 +66,9 @@ def post_crash(path):
|
||||
def scrape_path(path):
|
||||
for p in os.listdir(path):
|
||||
crashpath = os.path.join(path, p)
|
||||
if not os.access(crashpath, os.R_OK):
|
||||
log.warning('unable to read crash path %s' % (crashpath))
|
||||
continue
|
||||
metapath = os.path.join(crashpath, 'meta')
|
||||
donepath = os.path.join(crashpath, 'done')
|
||||
if os.path.isfile(metapath):
|
||||
@ -79,12 +87,31 @@ def scrape_path(path):
|
||||
(metapath, p, os.path.join('posted/', p))
|
||||
)
|
||||
|
||||
def handler(signum):
|
||||
|
||||
def handler(signum, frame):
|
||||
print('*** Interrupted with signal %d ***' % signum)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def drop_privs():
|
||||
if os.getuid() == 0:
|
||||
try:
|
||||
ceph_uid = pwd.getpwnam("ceph").pw_uid
|
||||
ceph_gid = grp.getgrnam("ceph").gr_gid
|
||||
os.setgroups([])
|
||||
os.setgid(ceph_gid)
|
||||
os.setuid(ceph_uid)
|
||||
except Exception as e:
|
||||
log.error(f"Unable to drop privileges: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
global auth_names
|
||||
|
||||
# run as unprivileged ceph user
|
||||
drop_privs()
|
||||
|
||||
# exit code 0 on SIGINT, SIGTERM
|
||||
signal.signal(signal.SIGINT, handler)
|
||||
signal.signal(signal.SIGTERM, handler)
|
||||
@ -103,7 +130,10 @@ def main():
|
||||
|
||||
log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
|
||||
while True:
|
||||
scrape_path(args.path)
|
||||
try:
|
||||
scrape_path(args.path)
|
||||
except Exception as e:
|
||||
log.error(f"Error scraping {args.path}: {e}")
|
||||
if args.delay == 0:
|
||||
sys.exit(0)
|
||||
time.sleep(args.delay * 60)
|
||||
|
@@ -794,7 +794,7 @@ def get_all_devices_vgs(name_prefix=''):
verbose_on_failure=False
)
vgs = _output_parser(stdout, vg_fields)
return [VolumeGroup(**vg) for vg in vgs]
return [VolumeGroup(**vg) for vg in vgs if vg['vg_name']]

#################################
#
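
The added vg['vg_name'] filter in get_all_devices_vgs() above guards against pvs reporting a physical volume that is not yet part of any volume group, which parses out with an empty vg_name. A hedged, self-contained illustration (field names trimmed to the ones that matter here):

parsed_vgs = [
    {'vg_name': 'ceph-block-0', 'vg_free': '10g'},  # PV that belongs to a VG
    {'vg_name': '', 'vg_free': ''},                 # orphan PV, no VG yet
]
usable = [vg for vg in parsed_vgs if vg['vg_name']]
assert len(usable) == 1  # the empty entry would otherwise build a bogus VolumeGroup
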
@@ -114,16 +114,23 @@ def get_physical_fast_allocs(devices, type_, fast_slots_per_device, new_osds, ar

ret = []
vg_device_map = group_devices_by_vg(devices)
for vg_devices in vg_device_map.values():
for vg_name, vg_devices in vg_device_map.items():
for dev in vg_devices:
if not dev.available_lvm:
continue
# any LV present is considered a taken slot
occupied_slots = len(dev.lvs)
# prior to v15.2.8, db/wal deployments were grouping multiple fast devices into single VGs - we need to
# multiply requested_slots (per device) by the number of devices in the VG in order to ensure that
# abs_size is calculated correctly from vg_size
if vg_name == 'unused_devices':
slots_for_vg = requested_slots
else:
slots_for_vg = len(vg_devices) * requested_slots
dev_size = dev.vg_size[0]
# this only looks at the first vg on device, unsure if there is a better
# way
abs_size = disk.Size(b=int(dev_size / requested_slots))
abs_size = disk.Size(b=int(dev_size / slots_for_vg))
free_size = dev.vg_free[0]
relative_size = int(abs_size) / dev_size
if requested_size:
@@ -149,7 +156,6 @@ def group_devices_by_vg(devices):
result['unused_devices'] = []
for dev in devices:
if len(dev.vgs) > 0:
# already using assumption that a PV only belongs to single VG in other places
vg_name = dev.vgs[0].name
if vg_name in result:
result[vg_name].append(dev)
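
The slots_for_vg arithmetic above can be checked by hand. A worked example with hypothetical sizes, following the comment about pre-v15.2.8 deployments that grouped several fast devices into one VG:

vg_size = 2 * 480 * 1024**3       # one VG spanning two 480 GiB db devices
requested_slots = 2               # slots requested per device
devices_in_vg = 2

old_abs_size = vg_size / requested_slots                    # 480 GiB: over-commits the VG
new_abs_size = vg_size / (devices_in_vg * requested_slots)  # 240 GiB: all four slots fit

assert new_abs_size * devices_in_vg * requested_slots == vg_size
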
@@ -101,16 +101,16 @@ class List(object):
'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e)))
continue

bs_info = _get_bluestore_info(dev)
if bs_info is None:
# None is also returned in the rare event that there is an issue reading info from
# a BlueStore disk, so be sure to log our assumption that it isn't bluestore
logger.info('device {} does not have BlueStore information'.format(dev))
continue
uuid = bs_info['osd_uuid']
if uuid not in result:
result[uuid] = {}
result[uuid].update(bs_info)
bs_info = _get_bluestore_info(dev)
if bs_info is None:
# None is also returned in the rare event that there is an issue reading info from
# a BlueStore disk, so be sure to log our assumption that it isn't bluestore
logger.info('device {} does not have BlueStore information'.format(dev))
continue
uuid = bs_info['osd_uuid']
if uuid not in result:
result[uuid] = {}
result[uuid].update(bs_info)

return result