import ceph pacific 16.2.12 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Thomas Lamprecht 2023-04-17 09:34:51 +02:00
parent 578f8e68e4
commit f7c0226f20
257 changed files with 19106 additions and 13125 deletions

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
# remove cmake/modules/FindPython* once 3.12 is required
project(ceph
VERSION 16.2.11
VERSION 16.2.12
LANGUAGES CXX C ASM)
foreach(policy

View File

@ -32,6 +32,11 @@
in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
the restored file system is expected to have the same ID as before.
* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to
`client_max_retries_on_remount_failure` and move it from mds.yaml.in to
mds-client.yaml.in because this option was only used by MDS client from its
birth.
>=16.2.11
--------

View File

@ -135,7 +135,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 16.2.11
Version: 16.2.12
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-16.2.11.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-16.2.12.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-16.2.11
%autosetup -p1 -n ceph-16.2.12
%build
# Disable lto on systems that do not support symver attribute
@ -1398,7 +1398,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
mkdir -p %{buildroot}/sbin
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
%endif
@ -1577,7 +1577,7 @@ exit 0
%{_bindir}/rbd-replay-many
%{_bindir}/rbdmap
%{_sbindir}/mount.ceph
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
/sbin/mount.ceph
%endif
%if %{with lttng}

View File

@ -1398,7 +1398,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
# firewall templates and /sbin/mount.ceph symlink
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
mkdir -p %{buildroot}/sbin
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
%endif
@ -1577,7 +1577,7 @@ exit 0
%{_bindir}/rbd-replay-many
%{_bindir}/rbdmap
%{_sbindir}/mount.ceph
%if 0%{?suse_version} && !0%{?usrmerged}
%if 0%{?suse_version} && 0%{?suse_version} < 1550
/sbin/mount.ceph
%endif
%if %{with lttng}

View File

@ -1,7 +1,19 @@
ceph (16.2.11-1focal) focal; urgency=medium
ceph (16.2.12-1focal) focal; urgency=medium
-- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Tue, 24 Jan 2023 21:28:06 +0000
-- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com> Thu, 13 Apr 2023 22:05:57 +0000
ceph (16.2.12-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Thu, 13 Apr 2023 21:54:05 +0000
ceph (16.2.12-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Thu, 13 Apr 2023 14:09:23 +0000
ceph (16.2.11-1) stable; urgency=medium

View File

@ -13,7 +13,7 @@ understand what OSD is enabled and needs to be mounted.
.. note:: The execution of this call is fully idempotent, and there is no
side-effects when running multiple times
For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
instead.
New OSDs
@ -29,7 +29,7 @@ need to be supplied. For example::
Activating all OSDs
-------------------
.. note:: For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
.. note:: For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
instead.
It is possible to activate all existing OSDs at once by using the ``--all``

View File

@ -4,45 +4,41 @@ Encryption
==========
Logical volumes can be encrypted using ``dmcrypt`` by specifying the
``--dmcrypt`` flag when creating OSDs. Encryption can be done in different ways,
specially with LVM. ``ceph-volume`` is somewhat opinionated with the way it
sets up encryption with logical volumes so that the process is consistent and
``--dmcrypt`` flag when creating OSDs. When using LVM, logical volumes can be
encrypted in different ways. ``ceph-volume`` does not offer as many options as
LVM does, but it encrypts logical volumes in a way that is consistent and
robust.
In this case, ``ceph-volume lvm`` follows these constraints:
In this case, ``ceph-volume lvm`` follows this constraint:
* only LUKS (version 1) is used
* Logical Volumes are encrypted, while their underlying PVs (physical volumes)
aren't
* Non-LVM devices like partitions are also encrypted with the same OSD key
* Non-LVM devices (such as partitions) are encrypted with the same OSD key.
LUKS
----
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier
to implement but not widely available in all distros Ceph supports. LUKS 1 is
not going to be deprecated in favor of LUKS 2, so in order to have as wide
support as possible, ``ceph-volume`` uses LUKS version 1.
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier to
implement but not widely available in all Linux distributions supported by
Ceph.
.. note:: Version 1 of LUKS is just referenced as "LUKS" whereas version 2 is
referred to as LUKS2
.. note:: Version 1 of LUKS is referred to in this documentation as "LUKS".
Version 2 of LUKS is referred to in this documentation as "LUKS2".
LUKS on LVM
-----------
Encryption is done on top of existing logical volumes (unlike encrypting the
physical device). Any single logical volume can be encrypted while other
volumes can remain unencrypted. This method also allows for flexible logical
Encryption is done on top of existing logical volumes (this is not the same as
encrypting the physical device). Any single logical volume can be encrypted,
leaving other volumes unencrypted. This method also allows for flexible logical
volume setups, since encryption will happen once the LV is created.
Workflow
--------
When setting up the OSD, a secret key will be created, that will be passed
along to the monitor in JSON format as ``stdin`` to prevent the key from being
When setting up the OSD, a secret key is created. That secret key is passed
to the monitor in JSON format as ``stdin`` to prevent the key from being
captured in the logs.
The JSON payload looks something like::
The JSON payload looks something like this::
{
"cephx_secret": CEPHX_SECRET,
@ -51,36 +47,38 @@ The JSON payload looks something like::
}
The naming convention for the keys is **strict**, and they are named like that
for the hardcoded (legacy) names ceph-disk used.
for the hardcoded (legacy) names used by ceph-disk.
* ``cephx_secret`` : The cephx key used to authenticate
* ``dmcrypt_key`` : The secret (or private) key to unlock encrypted devices
* ``cephx_lockbox_secret`` : The authentication key used to retrieve the
``dmcrypt_key``. It is named *lockbox* because ceph-disk used to have an
unencrypted partition named after it, used to store public keys and other
OSD metadata.
unencrypted partition named after it, which was used to store public keys and
other OSD metadata.
The naming convention is strict because Monitors supported the naming
convention by ceph-disk, which used these key names. In order to keep
compatibility and prevent ceph-disk from breaking, ceph-volume will use the same
naming convention *although they don't make sense for the new encryption
convention of ceph-disk, which used these key names. In order to maintain
compatibility and prevent ceph-disk from breaking, ceph-volume uses the same
naming convention *although it does not make sense for the new encryption
workflow*.
After the common steps of setting up the OSD during the prepare stage, either
with :term:`filestore` or :term:`bluestore`, the logical volume is left ready
to be activated, regardless of the state of the device (encrypted or decrypted).
After the common steps of setting up the OSD during the "prepare stage" (either
with :term:`filestore` or :term:`bluestore`), the logical volume is left ready
to be activated, regardless of the state of the device (encrypted or
decrypted).
At activation time, the logical volume will get decrypted and the OSD started
once the process completes correctly.
At the time of its activation, the logical volume is decrypted. The OSD starts
after the process completes correctly.
Summary of the encryption workflow for creating a new OSD:
Summary of the encryption workflow for creating a new OSD
----------------------------------------------------------
#. OSD is created, both lockbox and dmcrypt keys are created, and sent along
with JSON to the monitors, indicating an encrypted OSD.
#. OSD is created. Both lockbox and dmcrypt keys are created and sent to the
monitors in JSON format, indicating an encrypted OSD.
#. All complementary devices (like journal, db, or wal) get created and
encrypted with the same OSD key. Key is stored in the LVM metadata of the
OSD
OSD.
#. Activation continues by ensuring devices are mounted, retrieving the dmcrypt
secret key from the monitors and decrypting before the OSD gets started.
secret key from the monitors, and decrypting before the OSD gets started.
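As a minimal command-line sketch of this workflow (the volume group and logical
volume names here are hypothetical), an encrypted bluestore OSD can be created
with::

    ceph-volume lvm create --bluestore --data ceph-block-vg/osd-block-lv --dmcrypt

Activation then proceeds as for any other ``ceph-volume lvm`` OSD; the dmcrypt
key is fetched from the monitors as described above.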

View File

@ -11,20 +11,28 @@ Compatibility with Podman Versions
Podman and Ceph have different end-of-life strategies. This means that care
must be taken in finding a version of Podman that is compatible with Ceph.
These versions are expected to work:
This table shows which version pairs are expected to work or not work together:
+-----------+---------------------------------------+
| Ceph | Podman |
+-----------+-------+-------+-------+-------+-------+
| | 1.9 | 2.0 | 2.1 | 2.2 | 3.0 |
+===========+=======+=======+=======+=======+=======+
| <= 15.2.5 | True | False | False | False | False |
+-----------+-------+-------+-------+-------+-------+
| >= 15.2.6 | True | True | True | False | False |
+-----------+-------+-------+-------+-------+-------+
| >= 16.2.1 | False | True | True | False | True |
+-----------+-------+-------+-------+-------+-------+
+-----------+-----------------------------------------------+
| Ceph | Podman |
+-----------+-------+-------+-------+-------+-------+-------+
| | 1.9 | 2.0 | 2.1 | 2.2 | 3.0 | > 3.0 |
+===========+=======+=======+=======+=======+=======+=======+
| <= 15.2.5 | True | False | False | False | False | False |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 15.2.6 | True | True | True | False | False | False |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 16.2.1 | False | True | True | False | True | True |
+-----------+-------+-------+-------+-------+-------+-------+
| >= 17.2.0 | False | True | True | False | True | True |
+-----------+-------+-------+-------+-------+-------+-------+
.. note::
While not all podman versions have been actively tested against
all Ceph versions, there are no known issues with using podman
version 3.0 or greater with Ceph Quincy and later releases.
.. warning::
@ -41,17 +49,17 @@ These versions are expected to work:
Stability
---------
Cephadm is under development. Some functionality is incomplete. Be aware
that some of the components of Ceph may not work perfectly with cephadm.
These include:
- RGW
Cephadm is relatively stable but new functionality is still being
added and bugs are occasionally discovered. If issues are found, please
open a tracker issue under the Orchestrator component (https://tracker.ceph.com/projects/orchestrator/issues)
Cephadm support remains under development for the following features:
- Ingress
- Cephadm exporter daemon
- cephfs-mirror
- ceph-exporter deployment
- stretch mode integration
- monitoring stack (moving towards prometheus service discover and providing TLS)
- RGW multisite deployment support (requires lots of manual steps currently)
- cephadm agent
If a cephadm command fails or a service stops running properly, see
:ref:`cephadm-pause` for instructions on how to pause the Ceph cluster's

View File

@ -245,9 +245,10 @@ Many hosts can be added at once using
hostname: node-02
addr: 192.168.0.12
This can be combined with service specifications (below) to create a cluster spec
file to deploy a whole cluster in one command. see ``cephadm bootstrap --apply-spec``
also to do this during bootstrap. Cluster SSH Keys must be copied to hosts prior to adding them.
This can be combined with :ref:`service specifications<orchestrator-cli-service-spec>`
to create a cluster spec file to deploy a whole cluster in one command. See
``cephadm bootstrap --apply-spec`` to do this during bootstrap. Cluster
SSH keys must be copied to hosts prior to adding them.
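For instance, a combined cluster spec might look like the following (the
hostnames, addresses, and chosen services are illustrative only):

.. code-block:: yaml

    service_type: host
    hostname: node-01
    addr: 192.168.0.11
    ---
    service_type: host
    hostname: node-02
    addr: 192.168.0.12
    ---
    service_type: mon
    placement:
      host_pattern: "node-*"

Such a file can be applied with ``ceph orch apply -i cluster.yaml`` or passed
to ``cephadm bootstrap --apply-spec``.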
Setting the initial CRUSH location of host
==========================================

View File

@ -292,7 +292,7 @@ By default, a ``ceph.conf`` file and a copy of the ``client.admin`` keyring
are maintained in ``/etc/ceph`` on all hosts with the ``_admin`` label, which is initially
applied only to the bootstrap host. We usually recommend that one or more other hosts be
given the ``_admin`` label so that the Ceph CLI (e.g., via ``cephadm shell``) is easily
accessible on multiple hosts. To add the ``_admin`` label to additional host(s),
accessible on multiple hosts. To add the ``_admin`` label to additional host(s):
.. prompt:: bash #
@ -310,8 +310,8 @@ Please follow :ref:`deploy_additional_monitors` to deploy additional MONs.
Adding Storage
==============
To add storage to the cluster, either tell Ceph to consume any
available and unused device:
To add storage to the cluster, you can tell Ceph to consume any
available and unused device(s):
.. prompt:: bash #

View File

@ -496,11 +496,20 @@ candidate hosts.
If there are fewer hosts selected by the placement specification than
demanded by ``count``, cephadm will deploy only on the selected hosts.
.. _cephadm-extra-container-args:
Extra Container Arguments
=========================
.. warning::
The arguments provided for extra container args are limited to whatever arguments are available for a `run` command from whichever container engine you are using. Providing any arguments the `run` command does not support (or invalid values for arguments) will cause the daemon to fail to start.
The arguments provided for extra container args are limited to whatever arguments are available for
a `run` command from whichever container engine you are using. Providing any arguments the `run`
command does not support (or invalid values for arguments) will cause the daemon to fail to start.
.. note::
For arguments passed to the process running inside the container rather than for
the container runtime itself, see :ref:`cephadm-extra-entrypoint-args`
Cephadm supports providing extra miscellaneous container arguments for
@ -544,6 +553,30 @@ For example:
- "-v"
- "/opt/ceph_cert/host.cert:/etc/grafana/certs/cert_file:ro"
.. _cephadm-extra-entrypoint-args:
Extra Entrypoint Arguments
==========================
.. note::
For arguments intended for the container runtime rather than the process inside
it, see :ref:`cephadm-extra-container-args`
Similar to extra container args for the container runtime, Cephadm supports
appending to args passed to the entrypoint process running
within a container. For example, to set the collector textfile directory for
the node-exporter service, one could apply a service spec like
.. code-block:: yaml
service_type: node-exporter
service_name: node-exporter
placement:
host_pattern: '*'
extra_entrypoint_args:
- "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2"
.. _orch-rm:
Removing a Service

View File

@ -164,8 +164,10 @@ for RGW with a minumum set of configuration options. The orchestrator will
deploy and manage a combination of haproxy and keepalived to provide load
balancing on a floating virtual IP.
If SSL is used, then SSL must be configured and terminated by the ingress service
and not RGW itself.
If the RGW service is configured with SSL enabled, then the ingress service
will use the `ssl` and `verify none` options in the backend configuration.
Trust verification is disabled because the backends are accessed by IP
address instead of FQDN.
.. image:: ../../images/HAProxy_for_RGW.svg
@ -186,8 +188,7 @@ between all the RGW daemons available.
Prerequisites
-------------
* An existing RGW service, without SSL. (If you want SSL service, the certificate
should be configured on the ingress service, not the RGW service.)
* An existing RGW service.
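For orientation, an ingress spec of the kind deployed in the next section
typically looks like this sketch (the service id, virtual IP, and ports are
illustrative):

.. code-block:: yaml

    service_type: ingress
    service_id: rgw.myrgw
    placement:
      count: 2
    spec:
      backend_service: rgw.myrgw
      virtual_ip: 192.168.0.100/24
      frontend_port: 8080
      monitor_port: 1967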
Deploying
---------

View File

@ -1,22 +1,19 @@
Troubleshooting
===============
You might need to investigate why a cephadm command failed
You may wish to investigate why a cephadm command failed
or why a certain service no longer runs properly.
Cephadm deploys daemons as containers. This means that
troubleshooting those containerized daemons might work
differently than you expect (and that is certainly true if
you expect this troubleshooting to work the way that
troubleshooting does when the daemons involved aren't
containerized).
Cephadm deploys daemons within containers. This means that
troubleshooting those containerized daemons will require
a different process than traditional package-install daemons.
Here are some tools and commands to help you troubleshoot
your Ceph environment.
.. _cephadm-pause:
Pausing or disabling cephadm
Pausing or Disabling cephadm
----------------------------
If something goes wrong and cephadm is behaving badly, you can
@ -45,16 +42,15 @@ See :ref:`cephadm-spec-unmanaged` for information on disabling
individual services.
Per-service and per-daemon events
Per-service and Per-daemon Events
---------------------------------
In order to help with the process of debugging failed daemon
deployments, cephadm stores events per service and per daemon.
In order to facilitate debugging failed daemons,
cephadm stores events per service and per daemon.
These events often contain information relevant to
troubleshooting
your Ceph cluster.
troubleshooting your Ceph cluster.
Listing service events
Listing Service Events
~~~~~~~~~~~~~~~~~~~~~~
To see the events associated with a certain service, run a
@ -82,7 +78,7 @@ This will return something in the following form:
- '2021-02-01T12:09:25.264584 service:alertmanager [ERROR] "Failed to apply: Cannot
place <AlertManagerSpec for service_name=alertmanager> on unknown_host: Unknown hosts"'
Listing daemon events
Listing Daemon Events
~~~~~~~~~~~~~~~~~~~~~
To see the events associated with a certain daemon, run a
@ -106,16 +102,16 @@ This will return something in the following form:
mds.cephfs.hostname.ppdhsz on host 'hostname'"
Checking cephadm logs
Checking Cephadm Logs
---------------------
To learn how to monitor the cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.
To learn how to monitor cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.
If your Ceph cluster has been configured to log events to files, there will exist a
cephadm log file called ``ceph.cephadm.log`` on all monitor hosts (see
:ref:`cephadm-logs` for a more complete explanation of this).
If your Ceph cluster has been configured to log events to files, there will be a
``ceph.cephadm.log`` file on all monitor hosts (see
:ref:`cephadm-logs` for a more complete explanation).
Gathering log files
Gathering Log Files
-------------------
Use journalctl to gather the log files of all daemons:
@ -140,7 +136,7 @@ To fetch all log files of all daemons on a given host, run::
cephadm logs --fsid <fsid> --name "$name" > $name;
done
Collecting systemd status
Collecting Systemd Status
-------------------------
To print the state of a systemd unit, run::
@ -156,7 +152,7 @@ To fetch all state of all daemons of a given host, run::
done
List all downloaded container images
List all Downloaded Container Images
------------------------------------
To list all container images that are downloaded on a host:
@ -170,16 +166,16 @@ To list all container images that are downloaded on a host:
"registry.opensuse.org/opensuse/leap:15.2"
Manually running containers
Manually Running Containers
---------------------------
Cephadm writes small wrappers that run a containers. Refer to
Cephadm uses small wrappers when running containers. Refer to
``/var/lib/ceph/<cluster-fsid>/<service-name>/unit.run`` for the
container execution command.
.. _cephadm-ssh-errors:
SSH errors
SSH Errors
----------
Error message::
@ -191,7 +187,7 @@ Error message::
Please make sure that the host is reachable and accepts connections using the cephadm SSH key
...
Things users can do:
Things Ceph administrators can do:
1. Ensure cephadm has an SSH identity key::
@ -224,7 +220,7 @@ To verify that the public key is in the authorized_keys file, run the following
[root@mon1 ~]# cephadm shell -- ceph cephadm get-pub-key > ~/ceph.pub
[root@mon1 ~]# grep "`cat ~/ceph.pub`" /root/.ssh/authorized_keys
Failed to infer CIDR network error
Failed to Infer CIDR network error
----------------------------------
If you see this error::
@ -241,7 +237,7 @@ This means that you must run a command of this form::
For more detail on operations of this kind, see :ref:`deploy_additional_monitors`
Accessing the admin socket
Accessing the Admin Socket
--------------------------
Each Ceph daemon provides an admin socket that bypasses the
@ -252,12 +248,12 @@ To access the admin socket, first enter the daemon container on the host::
[root@mon1 ~]# cephadm enter --name <daemon-name>
[ceph: root@mon1 /]# ceph --admin-daemon /var/run/ceph/ceph-<daemon-name>.asok config show
Calling miscellaneous ceph tools
Running Various Ceph Tools
--------------------------------
To call miscellaneous like ``ceph-objectstore-tool`` or
``ceph-monstore-tool``, you can run them by calling
``cephadm shell --name <daemon-name>`` like so::
To run Ceph tools like ``ceph-objectstore-tool`` or
``ceph-monstore-tool``, invoke the cephadm CLI with
``cephadm shell --name <daemon-name>``. For example::
root@myhostname # cephadm unit --name mon.myhostname stop
root@myhostname # cephadm shell --name mon.myhostname
@ -272,21 +268,21 @@ To call miscellaneous like ``ceph-objectstore-tool`` or
election_strategy: 1
0: [v2:127.0.0.1:3300/0,v1:127.0.0.1:6789/0] mon.myhostname
This command sets up the environment in a way that is suitable
for extended daemon maintenance and running the deamon interactively.
The cephadm shell sets up the environment in a way that is suitable
for extended daemon maintenance and running daemons interactively.
.. _cephadm-restore-quorum:
Restoring the MON quorum
------------------------
Restoring the Monitor Quorum
----------------------------
In case the Ceph MONs cannot form a quorum, cephadm is not able
to manage the cluster, until the quorum is restored.
If the Ceph monitor daemons (mons) cannot form a quorum, cephadm will not be
able to manage the cluster until quorum is restored.
In order to restore the MON quorum, remove unhealthy MONs
In order to restore the quorum, remove unhealthy monitors
from the monmap by following these steps:
1. Stop all MONs. For each MON host::
1. Stop all mons. For each mon host::
ssh {mon-host}
cephadm unit --name mon.`hostname` stop
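The remaining steps are sketched below from the generic monmap-editing
procedure; hostnames and paths are illustrative and the exact invocations may
differ on your system::

    # on a surviving monitor host, enter the daemon's environment
    cephadm shell --name mon.`hostname`

    # extract the current monmap, drop the unhealthy monitors, re-inject it
    ceph-mon -i `hostname` --extract-monmap /tmp/monmap
    monmaptool /tmp/monmap --rm <unhealthy-mon-name>
    ceph-mon -i `hostname` --inject-monmap /tmp/monmap

Afterwards, start the surviving monitors again and let cephadm redeploy the
removed ones.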
@ -301,18 +297,19 @@ form the monmap by following these steps:
.. _cephadm-manually-deploy-mgr:
Manually deploying a MGR daemon
-------------------------------
cephadm requires a MGR daemon in order to manage the cluster. In case the cluster
the last MGR of a cluster was removed, follow these steps in order to deploy
a MGR ``mgr.hostname.smfvfd`` on a random host of your cluster manually.
Manually Deploying a Manager Daemon
-----------------------------------
At least one manager (mgr) daemon is required by cephadm in order to manage the
cluster. If the last mgr in a cluster has been removed, follow these steps in
order to deploy a manager called (for example)
``mgr.hostname.smfvfd`` on a random host of your cluster manually.
Disable the cephadm scheduler, in order to prevent cephadm from removing the new
MGR. See :ref:`cephadm-enable-cli`::
manager. See :ref:`cephadm-enable-cli`::
ceph config-key set mgr/cephadm/pause true
Then get or create the auth entry for the new MGR::
Then get or create the auth entry for the new manager::
ceph auth get-or-create mgr.hostname.smfvfd mon "profile mgr" osd "allow *" mds "allow *"
@ -338,26 +335,26 @@ Deploy the daemon::
cephadm --image <container-image> deploy --fsid <fsid> --name mgr.hostname.smfvfd --config-json config-json.json
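The ``config-json.json`` file referenced above bundles the daemon's minimal
configuration and keyring. A hedged sketch of its shape (all values are
placeholders)::

    {
      "config": "# minimal ceph.conf\n[global]\n\tfsid = <fsid>\n\tmon_host = <mon-addresses>\n",
      "keyring": "[mgr.hostname.smfvfd]\n\tkey = <key from the auth get-or-create step>\n"
    }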
Analyzing core dumps
Analyzing Core Dumps
---------------------
In case a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run
When a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run
.. prompt:: bash #
ulimit -c unlimited
core dumps will now be written to ``/var/lib/systemd/coredump``.
Core dumps will now be written to ``/var/lib/systemd/coredump``.
.. note::
core dumps are not namespaced by the kernel, which means
Core dumps are not namespaced by the kernel, which means
they will be written to ``/var/lib/systemd/coredump`` on
the container host.
Now, wait for the crash to happen again. (To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``)
Now, wait for the crash to happen again. To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``.
Install debug packages by entering the cephadm shell and install ``ceph-debuginfo``::
Install debug packages including ``ceph-debuginfo`` by entering the cephadm shell::
# cephadm shell --mount /var/lib/systemd/coredump
[ceph: root@host1 /]# dnf install ceph-debuginfo gdb zstd
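Once the debug packages are installed, the dump can be decompressed and opened
(a hedged sketch; the actual core file name and mount point inside the shell
will differ)::

    [ceph: root@host1 /]# unzstd /mnt/coredump/core.ceph-mon.*.zst
    [ceph: root@host1 /]# gdb /usr/bin/ceph-mon /mnt/coredump/core.ceph-mon.<pid>
    (gdb) bt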

Binary file not shown (image changed: 27 KiB before, 40 KiB after).

View File

@ -86,8 +86,17 @@ Interactive Commands
1. m : Filesystem selection
Displays a menu of filesystems for selection.
2. q : Quit
Exit the utility if you are at the home screen (All Filesystem Info),
2. s : Sort field selection
Designates the sort field. 'cap_hit' is the default.
3. l : Client limit
Sets the limit on the number of clients to be displayed.
4. r : Reset
Resets the sort field and limit value to the default.
5. q : Quit
Exit the utility if you are at the home screen (all filesystem info),
otherwise escape back to the home screen.
The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End and mouse.
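If you still need to start the utility, the usual sequence is roughly the
following (a hedged sketch; the ``stats`` ceph-mgr module and a suitable client
auth entry are prerequisites)::

    $ ceph mgr module enable stats
    $ cephfs-top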

View File

@ -229,27 +229,21 @@ backed by the original data pool.
::
ceph fs flag set enable_multiple true --yes-i-really-mean-it
ceph osd pool create cephfs_recovery_meta
ceph fs new cephfs_recovery recovery <data_pool> --allow-dangerous-metadata-overlay
ceph fs new cephfs_recovery recovery <data_pool> --recover --allow-dangerous-metadata-overlay
.. note::
The recovery file system starts with an MDS rank that will initialize the new
metadata pool with some metadata. This is necessary to bootstrap recovery.
However, now we will take the MDS down as we do not want it interacting with
the metadata pool further.
The ``--recover`` flag prevents any MDS from joining the new file system.
Next, we will create the initial metadata for the fs:
::
ceph fs fail cephfs_recovery
Next, we will reset the initial metadata the MDS created:
::
cephfs-table-tool cephfs_recovery:all reset session
cephfs-table-tool cephfs_recovery:all reset snap
cephfs-table-tool cephfs_recovery:all reset inode
cephfs-table-tool cephfs_recovery:0 reset session
cephfs-table-tool cephfs_recovery:0 reset snap
cephfs-table-tool cephfs_recovery:0 reset inode
cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force
Now perform the recovery of the metadata pool from the data pool:
@ -272,7 +266,6 @@ with:
::
cephfs-journal-tool --rank=<fs_name>:0 event recover_dentries list --alternate-pool cephfs_recovery_meta
cephfs-journal-tool --rank cephfs_recovery:0 journal reset --force
After recovery, some recovered directories will have incorrect statistics.
Ensure the parameters ``mds_verify_scatter`` and ``mds_debug_scatterstat`` are
@ -283,20 +276,22 @@ set to false (the default) to prevent the MDS from checking the statistics:
ceph config rm mds mds_verify_scatter
ceph config rm mds mds_debug_scatterstat
(Note, the config may also have been set globally or via a ceph.conf file.)
.. note::
Also verify the config has not been set globally or with a local ceph.conf file.
Now, allow an MDS to join the recovery file system:
::
ceph fs set cephfs_recovery joinable true
Finally, run a forward :doc:`scrub </cephfs/scrub>` to repair the statistics.
Finally, run a forward :doc:`scrub </cephfs/scrub>` to repair recursive statistics.
Ensure you have an MDS running and issue:
::
ceph fs status # get active MDS
ceph tell mds.<id> scrub start / recursive repair
ceph tell mds.cephfs_recovery:0 scrub start / recursive,repair,force
.. note::

View File

@ -3,11 +3,12 @@
FS volumes and subvolumes
=========================
A single source of truth for CephFS exports is implemented in the volumes
module of the :term:`Ceph Manager` daemon (ceph-mgr). The OpenStack shared
file system service (manila_), Ceph Container Storage Interface (CSI_),
The volumes
module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a single
source of truth for CephFS exports. The OpenStack shared
file system service (manila_) and Ceph Container Storage Interface (CSI_)
storage administrators among others can use the common CLI provided by the
ceph-mgr volumes module to manage the CephFS exports.
ceph-mgr volumes module to manage CephFS exports.
The ceph-mgr volumes module implements the following file system export
abstactions:
@ -22,17 +23,17 @@ abstactions:
Some possible use-cases for the export abstractions:
* FS subvolumes used as manila shares or CSI volumes
* FS subvolumes used as Manila shares or CSI volumes
* FS subvolume groups used as manila share groups
* FS subvolume groups used as Manila share groups
Requirements
------------
* Nautilus (14.2.x) or a later version of Ceph
* Nautilus (14.2.x) or later Ceph release
* Cephx client user (see :doc:`/rados/operations/user-management`) with
the following minimum capabilities::
at least the following capabilities::
mon 'allow r'
mgr 'allow rw'
@ -46,41 +47,56 @@ Create a volume using::
$ ceph fs volume create <vol_name> [<placement>]
This creates a CephFS file system and its data and metadata pools. It can also
try to create MDSes for the filesystem using the enabled ceph-mgr orchestrator
module (see :doc:`/mgr/orchestrator`), e.g. rook.
deploy MDS daemons for the filesystem using a ceph-mgr orchestrator
module (see :doc:`/mgr/orchestrator`), for example Rook.
<vol_name> is the volume name (an arbitrary string), and
<placement> is an optional string signifying which hosts should have NFS Ganesha
daemon containers running on them and, optionally, the total number of NFS
Ganesha daemons the cluster (should you want to have more than one NFS Ganesha
daemon running per node). For example, the following placement string means
"deploy NFS Ganesha daemons on nodes host1 and host2 (one daemon per host):
<placement> is an optional string that designates the hosts that should have
an MDS running on them and, optionally, the total number of MDS daemons the cluster
should have. For example, the
following placement string means "deploy MDS on nodes ``host1`` and ``host2`` (one
MDS per host):
"host1,host2"
and this placement specification says to deploy two NFS Ganesha daemons each
on nodes host1 and host2 (for a total of four NFS Ganesha daemons in the
cluster):
and this placement specification says to deploy two MDS daemons on each of
nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the cluster):
"4 host1,host2"
For more details on placement specification refer to the `orchestrator doc
<https://docs.ceph.com/docs/master/mgr/orchestrator/#placement-specification>`_
but keep in mind that specifying the placement via a YAML file is not supported.
For more details on placement specification refer to the :ref:`orchestrator-cli-service-spec`,
but keep in mind that specifying placement via a YAML file is not supported.
Remove a volume using::
To remove a volume, run the following command::
$ ceph fs volume rm <vol_name> [--yes-i-really-mean-it]
This removes a file system and its data and metadata pools. It also tries to
remove MDSes using the enabled ceph-mgr orchestrator module.
remove MDS daemons using the enabled ceph-mgr orchestrator module.
List volumes using::
$ ceph fs volume ls
Fetch the information of a CephFS volume using::
Rename a volume using::
$ ceph fs volume rename <vol_name> <new_vol_name> [--yes-i-really-mean-it]
Renaming a volume can be an expensive operation that requires the following:
- Rename the orchestrator-managed MDS service to match the <new_vol_name>.
This involves launching an MDS service with <new_vol_name> and bringing down
the MDS service with <vol_name>.
- Rename the file system matching <vol_name> to <new_vol_name>
- Change the application tags on the data and metadata pools of the file system
to <new_vol_name>
- Rename the metadata and data pools of the file system.
The CephX IDs authorized for <vol_name> need to be reauthorized for <new_vol_name>. Any
on-going operations of the clients using these IDs may be disrupted. Mirroring is
expected to be disabled on the volume.
To fetch the information of a CephFS volume, run::
$ ceph fs volume info vol_name [--human_readable]
@ -88,15 +104,15 @@ The ``--human_readable`` flag shows used and available pool capacities in KB/MB/
The output format is JSON and contains fields as follows:
* pools: Attributes of data and metadata pools
* avail: The amount of free space available in bytes
* used: The amount of storage consumed in bytes
* name: Name of the pool
* mon_addrs: List of monitor addresses
* used_size: Current used size of the CephFS volume in bytes
* pending_subvolume_deletions: Number of subvolumes pending deletion
* ``pools``: Attributes of data and metadata pools
* ``avail``: The amount of free space available in bytes
* ``used``: The amount of storage consumed in bytes
* ``name``: Name of the pool
* ``mon_addrs``: List of Ceph monitor addresses
* ``used_size``: Current used size of the CephFS volume in bytes
* ``pending_subvolume_deletions``: Number of subvolumes pending deletion
Sample output of volume info command::
Sample output of the ``volume info`` command::
$ ceph fs volume info vol_name
{
@ -133,10 +149,10 @@ Create a subvolume group using::
The command succeeds even if the subvolume group already exists.
When creating a subvolume group you can specify its data pool layout (see
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals and
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and
size in bytes. The size of the subvolume group is specified by setting
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
is created with an octal file mode '755', uid '0', gid '0' and the data pool
is created with octal file mode '755', uid '0', gid '0' and the data pool
layout of its parent directory.
@ -163,49 +179,49 @@ Fetch the metadata of a subvolume group using::
$ ceph fs subvolumegroup info <vol_name> <group_name>
The output format is JSON and contains fields as follows.
The output format is JSON and contains fields as follows:
* atime: access time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* mtime: modification time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ctime: change time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* uid: uid of subvolume group path
* gid: gid of subvolume group path
* mode: mode of subvolume group path
* mon_addrs: list of monitor addresses
* bytes_pcent: quota used in percentage if quota is set, else displays "undefined"
* bytes_quota: quota size in bytes if quota is set, else displays "infinite"
* bytes_used: current used size of the subvolume group in bytes
* created_at: time of creation of subvolume group in the format "YYYY-MM-DD HH:MM:SS"
* data_pool: data pool the subvolume group belongs to
* ``atime``: access time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``mtime``: modification time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``ctime``: change time of the subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ``uid``: uid of the subvolume group path
* ``gid``: gid of the subvolume group path
* ``mode``: mode of the subvolume group path
* ``mon_addrs``: list of monitor addresses
* ``bytes_pcent``: quota used in percentage if quota is set, else displays "undefined"
* ``bytes_quota``: quota size in bytes if quota is set, else displays "infinite"
* ``bytes_used``: current used size of the subvolume group in bytes
* ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS"
* ``data_pool``: data pool to which the subvolume group belongs
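An illustrative example of such output (all values here are hypothetical)::

    $ ceph fs subvolumegroup info cephfs csi
    {
        "atime": "2023-04-13 09:12:33",
        "bytes_pcent": "undefined",
        "bytes_quota": "infinite",
        "bytes_used": 0,
        "created_at": "2023-04-13 09:12:33",
        "ctime": "2023-04-13 09:12:33",
        "data_pool": "cephfs.cephfs.data",
        "gid": 0,
        "mode": 16877,
        "mon_addrs": [
            "192.168.0.11:6789"
        ],
        "mtime": "2023-04-13 09:12:33",
        "uid": 0
    }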
Check the presence of any subvolume group using::
$ ceph fs subvolumegroup exist <vol_name>
The strings returned by the 'exist' command:
The 'exist' command outputs:
* "subvolumegroup exists": if any subvolumegroup is present
* "no subvolumegroup exists": if no subvolumegroup is present
.. note:: It checks for the presence of custom groups and not the default one. To validate the emptiness of the volume, subvolumegroup existence check alone is not sufficient. The subvolume existence also needs to be checked as there might be subvolumes in the default group.
.. note:: This command checks for the presence of custom groups and not presence of the default one. To validate the emptiness of the volume, a subvolumegroup existence check alone is not sufficient. Subvolume existence also needs to be checked as there might be subvolumes in the default group.
Resize a subvolume group using::
$ ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
The command resizes the subvolume group quota using the size specified by 'new_size'.
The '--no_shrink' flag prevents the subvolume group to shrink below the current used
size of the subvolume group.
The command resizes the subvolume group quota using the size specified by ``new_size``.
The ``--no_shrink`` flag prevents the subvolume group from shrinking below the current used
size.
The subvolume group can be resized to an unlimited size by passing 'inf' or 'infinite'
as the new_size.
The subvolume group may be resized to an infinite size by passing ``inf`` or ``infinite``
as the ``new_size``.
Remove a snapshot of a subvolume group using::
$ ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
Using the '--force' flag allows the command to succeed that would otherwise
fail if the snapshot did not exist.
Supplying the ``--force`` flag allows the command to succeed when it would otherwise
fail due to the snapshot not existing.
List snapshots of a subvolume group using::
@ -254,10 +270,10 @@ Resize a subvolume using::
$ ceph fs subvolume resize <vol_name> <subvol_name> <new_size> [--group_name <subvol_group_name>] [--no_shrink]
The command resizes the subvolume quota using the size specified by 'new_size'.
'--no_shrink' flag prevents the subvolume to shrink below the current used size of the subvolume.
The command resizes the subvolume quota using the size specified by ``new_size``.
The ``--no_shrink`` flag prevents the subvolume from shrinking below the current used size of the subvolume.
The subvolume can be resized to an infinite size by passing 'inf' or 'infinite' as the new_size.
The subvolume can be resized to an unlimited (but sparse) logical size by passing ``inf`` or ``infinite`` as ``new_size``.
Authorize cephx auth IDs, the read/read-write access to fs subvolumes::
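    # (hedged sketch of the command form; check "ceph fs subvolume authorize --help" on your release)
    $ ceph fs subvolume authorize <vol_name> <subvol_name> <auth_id> [--group_name <subvol_group_name>] [--access_level=<access_level>]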
@ -285,43 +301,43 @@ Fetch the information of a subvolume using::
$ ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
The output format is json and contains fields as follows.
The output format is JSON and contains fields as follows.
* atime: access time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* mtime: modification time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* ctime: change time of subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* uid: uid of subvolume path
* gid: gid of subvolume path
* mode: mode of subvolume path
* mon_addrs: list of monitor addresses
* bytes_pcent: quota used in percentage if quota is set, else displays "undefined"
* bytes_quota: quota size in bytes if quota is set, else displays "infinite"
* bytes_used: current used size of the subvolume in bytes
* created_at: time of creation of subvolume in the format "YYYY-MM-DD HH:MM:SS"
* data_pool: data pool the subvolume belongs to
* path: absolute path of a subvolume
* type: subvolume type indicating whether it's clone or subvolume
* pool_namespace: RADOS namespace of the subvolume
* features: features supported by the subvolume
* state: current state of the subvolume
* ``atime``: access time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* ``mtime``: modification time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* ``ctime``: change time of the subvolume path in the format "YYYY-MM-DD HH:MM:SS"
* ``uid``: uid of the subvolume path
* ``gid``: gid of the subvolume path
* ``mode``: mode of the subvolume path
* ``mon_addrs``: list of monitor addresses
* ``bytes_pcent``: quota used in percentage if quota is set, else displays ``undefined``
* ``bytes_quota``: quota size in bytes if quota is set, else displays ``infinite``
* ``bytes_used``: current used size of the subvolume in bytes
* ``created_at``: creation time of the subvolume in the format "YYYY-MM-DD HH:MM:SS"
* ``data_pool``: data pool to which the subvolume belongs
* ``path``: absolute path of a subvolume
* ``type``: subvolume type indicating whether it's clone or subvolume
* ``pool_namespace``: RADOS namespace of the subvolume
* ``features``: features supported by the subvolume
* ``state``: current state of the subvolume
If a subvolume has been removed retaining its snapshots, the output only contains fields as follows.
If a subvolume has been removed retaining its snapshots, the output contains only fields as follows.
* type: subvolume type indicating whether it's clone or subvolume
* features: features supported by the subvolume
* state: current state of the subvolume
* ``type``: subvolume type indicating whether it's clone or subvolume
* ``features``: features supported by the subvolume
* ``state``: current state of the subvolume
The subvolume "features" are based on the internal version of the subvolume and is a list containing
a subset of the following features,
A subvolume's ``features`` are based on the internal version of the subvolume and are
a subset of the following:
* "snapshot-clone": supports cloning using a subvolumes snapshot as the source
* "snapshot-autoprotect": supports automatically protecting snapshots, that are active clone sources, from deletion
* "snapshot-retention": supports removing subvolume contents, retaining any existing snapshots
* ``snapshot-clone``: supports cloning using a subvolume's snapshot as the source
* ``snapshot-autoprotect``: supports automatically protecting snapshots that are active clone sources from deletion
* ``snapshot-retention``: supports removing subvolume contents, retaining any existing snapshots
The subvolume "state" is based on the current state of the subvolume and contains one of the following values.
A subvolume's ``state`` is based on the current state of the subvolume and contains one of the following values.
* "complete": subvolume is ready for all operations
* "snapshot-retained": subvolume is removed but its snapshots are retained
* ``complete``: subvolume is ready for all operations
* ``snapshot-retained``: subvolume is removed but its snapshots are retained
List subvolumes using::
@ -333,10 +349,10 @@ Check the presence of any subvolume using::
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
The strings returned by the 'exist' command:
These are the possible results of the ``exist`` command:
* "subvolume exists": if any subvolume of given group_name is present
* "no subvolume exists": if no subvolume of given group_name is present
* ``subvolume exists``: if any subvolume of given group_name is present
* ``no subvolume exists``: if no subvolume of given group_name is present
Set custom metadata on the subvolume as a key-value pair using::
@ -360,7 +376,7 @@ Remove custom metadata set on the subvolume using the metadata key::
$ ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
Using the '--force' flag allows the command to succeed that would otherwise
Using the ``--force`` flag allows the command to succeed that would otherwise
fail if the metadata key did not exist.
Create a snapshot of a subvolume using::
@ -372,7 +388,7 @@ Remove a snapshot of a subvolume using::
$ ceph fs subvolume snapshot rm <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>] [--force]
Using the '--force' flag allows the command to succeed that would otherwise
Using the ``--force`` flag allows the command to succeed that would otherwise
fail if the snapshot did not exist.
.. note:: if the last snapshot within a snapshot retained subvolume is removed, the subvolume is also removed
@ -387,13 +403,13 @@ Fetch the information of a snapshot using::
The output format is JSON and contains fields as follows.
* created_at: time of creation of snapshot in the format "YYYY-MM-DD HH:MM:SS:ffffff"
* data_pool: data pool the snapshot belongs to
* has_pending_clones: "yes" if snapshot clone is in progress otherwise "no"
* pending_clones: list of in progress or pending clones and their target group if exist otherwise this field is not shown
* orphan_clones_count: count of orphan clones if snapshot has orphan clones otherwise this field is not shown
* ``created_at``: creation time of the snapshot in the format "YYYY-MM-DD HH:MM:SS:ffffff"
* ``data_pool``: data pool to which the snapshot belongs
* ``has_pending_clones``: ``yes`` if snapshot clone is in progress, otherwise ``no``
* ``pending_clones``: list of in-progress or pending clones and their target group if any exist, otherwise this field is not shown
* ``orphan_clones_count``: count of orphan clones if the snapshot has orphan clones, otherwise this field is not shown
Sample output when snapshot clones are in progress or pending state::
Sample output when snapshot clones are in progress or pending::
$ ceph fs subvolume snapshot info cephfs subvol snap
{
@ -415,7 +431,7 @@ Sample output when snapshot clones are in progress or pending state::
]
}
Sample output when no snapshot clone is in progress or pending state::
Sample output when no snapshot clone is in progress or pending::
$ ceph fs subvolume snapshot info cephfs subvol snap
{
@ -424,15 +440,15 @@ Sample output when no snapshot clone is in progress or pending state::
"has_pending_clones": "no"
}
Set custom metadata on the snapshot as a key-value pair using::
Set custom key-value metadata on the snapshot by running::
$ ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
.. note:: If the key_name already exists then the old value will get replaced by the new value.
.. note:: The key_name and value should be a string of ASCII characters (as specified in python's string.printable). The key_name is case-insensitive and always stored in lower case.
.. note:: The key_name and value should be strings of ASCII characters (as specified in Python's ``string.printable``). The key_name is case-insensitive and always stored in lowercase.
.. note:: Custom metadata on a snapshots is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot.
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot.
Get custom metadata set on the snapshot using the metadata key::
@ -446,35 +462,35 @@ Remove custom metadata set on the snapshot using the metadata key::
$ ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
Using the '--force' flag allows the command to succeed that would otherwise
Using the ``--force`` flag allows the command to succeed that would otherwise
fail if the metadata key did not exist.
Cloning Snapshots
-----------------
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation involving copying
data from a snapshot to a subvolume. Due to this bulk copy nature, cloning is currently inefficient for very huge
Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation that copies
data from a snapshot to a subvolume. Due to this bulk copying, cloning is inefficient for very large
data sets.
.. note:: Removing a snapshot (source subvolume) would fail if there are pending or in progress clone operations.
Protecting snapshots prior to cloning was a pre-requisite in the Nautilus release, and the commands to protect/unprotect
snapshots were introduced for this purpose. This pre-requisite, and hence the commands to protect/unprotect, is being
deprecated in mainline CephFS, and may be removed from a future release.
Protecting snapshots prior to cloning was a prerequisite in the Nautilus release, and the commands to protect/unprotect
snapshots were introduced for this purpose. This prerequisite, and hence the commands to protect/unprotect, is being
deprecated and may be removed from a future release.
The commands being deprecated are:
The commands being deprecated are::
$ ceph fs subvolume snapshot protect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
$ ceph fs subvolume snapshot unprotect <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
.. note:: Using the above commands would not result in an error, but they serve no useful function.
.. note:: Using the above commands will not result in an error, but they have no useful purpose.
.. note:: Use subvolume info command to fetch subvolume metadata regarding supported "features" to help decide if protect/unprotect of snapshots is required, based on the "snapshot-autoprotect" feature availability.
.. note:: Use the ``subvolume info`` command to fetch subvolume metadata regarding supported ``features`` to help decide if protect/unprotect of snapshots is required, based on the availability of the ``snapshot-autoprotect`` feature.
To initiate a clone operation use::
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name>
If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified as per::
If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified::
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --group_name <subvol_group_name>
@ -486,7 +502,7 @@ Similar to specifying a pool layout when creating a subvolume, pool layout can b
$ ceph fs subvolume snapshot clone <vol_name> <subvol_name> <snap_name> <target_subvol_name> --pool_layout <pool_layout>
Configure maximum number of concurrent clones. The default is set to 4::
Configure the maximum number of concurrent clones. The default is 4::
$ ceph config set mgr mgr/volumes/max_concurrent_clones <value>
@ -496,18 +512,18 @@ To check the status of a clone operation use::
A clone can be in one of the following states:
#. `pending` : Clone operation has not started
#. `in-progress` : Clone operation is in progress
#. `complete` : Clone operation has successfully finished
#. `failed` : Clone operation has failed
#. `canceled` : Clone operation is cancelled by user
#. ``pending`` : Clone operation has not started
#. ``in-progress`` : Clone operation is in progress
#. ``complete`` : Clone operation has successfully finished
#. ``failed`` : Clone operation has failed
#. ``canceled`` : Clone operation is cancelled by user
The reason for a clone failure is shown as below:
#. `errno` : error number
#. `error_msg` : failure error string
#. ``errno`` : error number
#. ``error_msg`` : failure error string
Sample output of an `in-progress` clone operation::
Here is an example of an ``in-progress`` clone::
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
$ ceph fs clone status cephfs clone1
@ -522,9 +538,9 @@ Sample output of an `in-progress` clone operation::
}
}
.. note:: The `failure` section will be shown only if the clone is in failed or cancelled state
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
Sample output of a `failed` clone operation::
Here is an example of a ``failed`` clone::
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
$ ceph fs clone status cephfs clone1
@ -544,11 +560,11 @@ Sample output of a `failed` clone operation::
}
}
(NOTE: since `subvol1` is in default group, `source` section in `clone status` does not include group name)
(NOTE: since ``subvol1`` is in the default group, the ``source`` object's ``clone status`` does not include the group name)
.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed.
For a successful clone operation, `clone status` would look like so::
After a successful clone operation, ``clone status`` will look like the below::
$ ceph fs clone status cephfs clone1
{
@ -557,21 +573,24 @@ For a successful clone operation, `clone status` would look like so::
}
}
or `failed` state when clone is unsuccessful.
If a clone operation is unsuccessful, the ``state`` value will be ``failed``.
On failure of a clone operation, the partial clone needs to be deleted and the clone operation needs to be retriggered.
To delete a partial clone use::
To retry a failed clone operation, the incomplete clone must be deleted and the
clone operation must be issued again. To delete a partial clone use::
$ ceph fs subvolume rm <vol_name> <clone_name> [--group_name <group_name>] --force
.. note:: Cloning only synchronizes directories, regular files and symbolic links. Also, inode timestamps (access and
modification times) are synchronized upto seconds granularity.
.. note:: Cloning synchronizes only directories, regular files and symbolic
links. Inode timestamps (access and modification times) are synchronized up
to seconds granularity.
An `in-progress` or a `pending` clone operation can be canceled. To cancel a clone operation use the `clone cancel` command::
An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel
a clone operation use the ``clone cancel`` command::
$ ceph fs clone cancel <vol_name> <clone_name> [--group_name <group_name>]
On successful cancelation, the cloned subvolume is moved to `canceled` state::
On successful cancellation, the cloned subvolume is moved to the ``canceled``
state::
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
$ ceph fs clone cancel cephfs clone1
@ -587,7 +606,7 @@ On successful cancelation, the cloned subvolume is moved to `canceled` state::
}
}
.. note:: The canceled cloned can be deleted by using --force option in `fs subvolume rm` command.
.. note:: The canceled clone may be deleted by supplying the ``--force`` option to the ``fs subvolume rm`` command.
.. _subvol-pinning:
@ -596,8 +615,8 @@ Pinning Subvolumes and Subvolume Groups
---------------------------------------
Subvolumes and subvolume groups can be automatically pinned to ranks according
to policies. This can help distribute load across MDS ranks in predictable and
Subvolumes and subvolume groups may be automatically pinned to ranks according
to policies. This can distribute load across MDS ranks in predictable and
stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning`
for details on how pinning works.

View File

@ -5,6 +5,60 @@ CephFS allows quotas to be set on any directory in the system. The
quota can restrict the number of *bytes* or the number of *files*
stored beneath that point in the directory hierarchy.
Like most other things in CephFS, quotas are configured using virtual
extended attributes:
* ``ceph.quota.max_files`` -- file limit
* ``ceph.quota.max_bytes`` -- byte limit
If these extended attributes appear on a directory, a quota is configured
there. If they are not present, then no quota is set on that directory
(although one may still be configured on a parent directory).
To set a quota, set the extended attribute on a CephFS directory with a
value::
setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir # 100 MB
setfattr -n ceph.quota.max_files -v 10000 /some/dir # 10,000 files
To view the quota limits::
$ getfattr -n ceph.quota.max_bytes /some/dir
# file: dir1/
ceph.quota.max_bytes="100000000"
$
$ getfattr -n ceph.quota.max_files /some/dir
# file: dir1/
ceph.quota.max_files="10000"
.. note:: Running ``getfattr /some/dir -d -m -`` for a CephFS directory will
print none of the CephFS extended attributes. This is because the CephFS
kernel and FUSE clients hide this information from the ``listxattr(2)``
system call. Instead, a specific CephFS extended attribute can be viewed by
running ``getfattr /some/dir -n ceph.<some-xattr>``.
To remove a quota, set the value of the extended attribute to ``0``::
$ setfattr -n ceph.quota.max_bytes -v 0 /some/dir
$ getfattr /some/dir -n ceph.quota.max_bytes
dir1/: ceph.quota.max_bytes: No such attribute
$
$ setfattr -n ceph.quota.max_files -v 0 /some/dir
$ getfattr dir1/ -n ceph.quota.max_files
dir1/: ceph.quota.max_files: No such attribute
Space Usage Reporting and CephFS Quotas
---------------------------------------
When a quota is set on the root directory of a CephFS mount, the available
space reported by space-usage tools (such as ``df``) is based on the quota
limit. That is, ``available space = quota limit - used space`` instead of
``available space = total space - used space``.
This behaviour can be disabled by setting the following option in the client
section of ``ceph.conf``::
client quota df = false
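The same setting can also be applied centrally with the ``ceph config`` CLI.
The following is a minimal sketch; it assumes the clients read the centralized
configuration rather than only a local ``ceph.conf``::
    $ ceph config set client client_quota_df false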
Limitations
-----------
@ -85,3 +139,11 @@ To remove a quota::
setfattr -n ceph.quota.max_bytes -v 0 /some/dir
setfattr -n ceph.quota.max_files -v 0 /some/dir
.. note:: In cases where CephFS extended attributes are set on a CephFS
directory (for example, ``/some/dir``), running ``getfattr /some/dir -d -m
-`` will not print those CephFS extended attributes. This is because CephFS
kernel and FUSE clients hide this information from the ``listxattr(2)``
system call. You can access a specific CephFS extended attribute by running
``getfattr /some/dir -n ceph.<some-xattr>`` instead.

View File

@ -131,3 +131,15 @@ Control (ongoing) File System Scrubs
{
"return_code": 0
}
Damages
=======
The types of damage that can be reported and repaired by File System Scrub are:
* DENTRY : Inode's dentry is missing.
* DIR_FRAG : Inode's directory fragment(s) is missing.
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
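Damage found by a scrub can be listed, and a repair scrub can then be started,
with commands of the following form. This is a sketch: ``cephfs`` is a
placeholder file system name and rank ``0`` is assumed::
    $ ceph tell mds.cephfs:0 damage ls
    $ ceph tell mds.cephfs:0 scrub start / recursive,repair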

View File

@ -554,7 +554,7 @@ In order to configure connections (from Ceph nodes) to the KDC:
...
6. A new *set parameter* was added in Ceph, ``gss ktab client file`` which
6. A new *set parameter* was added in Ceph, ``gss_ktab_client_file`` which
points to the keytab file related to the Ceph node *(or principal)* in
question.
@ -614,10 +614,10 @@ In order to configure connections (from Ceph nodes) to the KDC:
/etc/ceph/ceph.conf
[global]
...
auth cluster required = gss
auth service required = gss
auth client required = gss
gss ktab client file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
auth_cluster_required = gss
auth_service_required = gss
auth_client_required = gss
gss_ktab_client_file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
...

View File

@ -32,7 +32,7 @@ cephadm/cephadm script into memory.)
for mon or mgr.
- You'll see health warnings from cephadm about stray daemons--that's because
the vstart-launched daemons aren't controlled by cephadm.
- The default image is ``quay.io/ceph-ci/ceph:master``, but you can change
- The default image is ``quay.io/ceph-ci/ceph:main``, but you can change
this by passing ``-o container_image=...`` or ``ceph config set global container_image ...``.

View File

@ -131,3 +131,8 @@ sharing a single pool (via namespaces), their snapshots *will* collide and
deleting one will result in missing file data for others. (This may even be
invisible, not throwing errors to the user.) If each FS gets its own
pool things probably work, but this isn't tested and may not be true.
.. Note:: To avoid snap id collision between mon-managed snapshots and file system
snapshots, pools with mon-managed snapshots are not allowed to be attached
to a file system. Similarly, mon-managed snapshots can't be created in pools
that are already attached to a file system.

View File

@ -87,7 +87,7 @@ The procedure for making changes to the Ceph repository is as follows:
#. :ref:`Push the changes in your local working copy to your fork<push_changes>`.
#. Create a Pull Request to push the change upstream
#. Create a Pull Request to push the change upstream.
#. Create a Pull Request that asks for your changes to be added into the
"upstream Ceph" repository.
@ -513,3 +513,57 @@ the **ptl-tool** have the following form::
client: add timer_lock support
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Miscellaneous
-------------
--set-upstream
^^^^^^^^^^^^^^
If you forget to include the ``--set-upstream origin x`` option in your ``git
push`` command, you will see the following error message:
::
fatal: The current branch {x} has no upstream branch.
To push the current branch and set the remote as upstream, use
git push --set-upstream origin {x}
To set up git to automatically create the upstream branch that corresponds to
the branch in your local working copy, run this command from within the
``ceph/`` directory:
.. prompt:: bash $
git config --global push.autoSetupRemote true
Deleting a Branch Locally
^^^^^^^^^^^^^^^^^^^^^^^^^
To delete the branch named ``localBranchName`` from the local working copy, run
a command of this form:
.. prompt:: bash $
git branch -d localBranchName
Deleting a Branch Remotely
^^^^^^^^^^^^^^^^^^^^^^^^^^
To delete the branch named ``remoteBranchName`` from the remote upstream branch
(which is also your fork of ``ceph/ceph``, as described in :ref:`forking`), run
a command of this form:
.. prompt:: bash $
git push origin --delete remoteBranchName
Searching a File Longitudinally for a String
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To search for the commit that introduced a given string (in this example, that
string is ``foo``) into a given file (in this example, that file is
``file.rst``), run a command of this form:
.. prompt:: bash $
git log -S 'foo' file.rst

View File

@ -89,6 +89,11 @@ click on `New issue`_.
.. _`jump to the Ceph project`: http://tracker.ceph.com/projects/ceph
.. _`New issue`: http://tracker.ceph.com/projects/ceph/issues/new
Slack
-----
Ceph's Slack is https://ceph-storage.slack.com/.
.. _mailing-list:
Mailing lists

View File

@ -129,8 +129,8 @@ all the integration tests, for all the Ceph components.
verify that teuthology can run integration tests, with and without OpenStack
`upgrade <https://github.com/ceph/ceph/tree/master/qa/suites/upgrade>`_
for various versions of Ceph, verify that upgrades can happen
without disrupting an ongoing workload
for various versions of Ceph, verify that upgrades can happen without
disrupting an ongoing workload (`Upgrade Testing`_)
.. _`ceph-deploy man page`: ../../man/8/ceph-deploy
@ -452,6 +452,82 @@ A single test from the rbd/thrash suite can be run by adding the
--suite rbd/thrash \
--filter 'rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}'
.. _upgrade-testing:
Upgrade Testing
^^^^^^^^^^^^^^^
The upgrade suite verifies that upgrades from earlier releases can complete
successfully without disrupting any ongoing workload.
Each release branch's upgrade directory includes "2-x" upgrade testing, which
means that we are able to test the upgrade from the two preceding releases to
the current one.
The upgrade sequence is done in `parallel <https://github.com/ceph/teuthology/blob/main/teuthology/task/parallel.py>`_
with other given workloads.
For instance, the upgrade test directory from the Quincy release branch is as follows:
.. code-block:: none
├── octopus-x
└── pacific-x
It is possible to test upgrades from Octopus (2-x) or from Pacific (1-x) to Quincy (x).
A simple upgrade test consists of the following stages, in order:
.. code-block:: none
├── 0-start.yaml
├── 1-tasks.yaml
├── upgrade-sequence.yaml
└── workload
After starting the cluster with the older release, we begin running the given
``workload`` and the ``upgrade-sequence`` in parallel.
.. code-block:: yaml
- print: "**** done start parallel"
- parallel:
- workload
- upgrade-sequence
- print: "**** done end parallel"
While the ``workload`` directory consists of regular yaml files just as in any
other suite, the ``upgrade-sequence`` is responsible for running the upgrade
and awaiting its completion:
.. code-block:: yaml
- print: "**** done start upgrade, wait"
...
mon.a:
- ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
- while ceph orch upgrade status | jq '.in_progress' | grep true ; do ceph orch ps ; ceph versions ; sleep 30 ; done\
...
- print: "**** done end upgrade, wait..."
It is also possible to upgrade in stages while running workloads between the stages:
.. code-block:: none
├── %
├── 0-cluster
├── 1-ceph-install
├── 2-partial-upgrade
├── 3-thrash
├── 4-workload
├── 5-finish-upgrade.yaml
├── 6-quincy.yaml
└── 8-final-workload
After starting a cluster, we upgrade only two-thirds of it
(``2-partial-upgrade``). The next stages run thrash tests and the given
workload tests. The rest of the cluster is then upgraded
(``5-finish-upgrade.yaml``).
The last stage requires the updated release (``ceph require-osd-release
quincy``, ``ceph osd set-require-min-compat-client quincy``) and runs the
``final-workload``.
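An upgrade suite is scheduled like any other suite. For example, the
Pacific-to-Quincy upgrade tests might be scheduled with a command of the
following form (a sketch; the machine type and branch names shown are
assumptions that depend on your environment)::
    teuthology-suite -v \
        --machine-type smithi \
        --ceph quincy \
        --suite upgrade/pacific-x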
Filtering tests by their description
------------------------------------

View File

@ -4,6 +4,11 @@
.. glossary::
Application
More properly called a :term:`client`, an application is any program
external to Ceph that uses a Ceph Cluster to store and
replicate data.
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
OSD BlueStore is a storage back end used by OSD daemons, and
was designed specifically for use with Ceph. BlueStore was
@ -14,6 +19,22 @@
system interface. Since Luminous (12.2), BlueStore has been
Ceph's default and recommended storage back end.
Bucket
In the context of :term:`RGW`, a bucket is a group of objects.
In a filesystem-based analogy in which objects are the
counterpart of files, buckets are the counterpart of
directories. :ref:`Multisite sync
policies<radosgw-multisite-sync-policy>` can be set on buckets,
to provide fine-grained control of data movement from one zone
to another zone.
The concept of the bucket has been taken from AWS S3. See also
`the AWS S3 page on creating buckets <https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-buckets-s3.html>`_
and `the AWS S3 'Buckets Overview' page <https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html>`_.
OpenStack Swift uses the term "containers" for what RGW and AWS call "buckets".
See `the OpenStack Storage API overview page <https://docs.openstack.org/swift/latest/api/object_api_v1_overview.html>`_.
Ceph
Ceph is a distributed network storage and file system with
distributed metadata management and POSIX semantics.
@ -166,9 +187,14 @@
applications, Ceph Users, and :term:`Ceph Client`\s. Ceph
Storage Clusters receive data from :term:`Ceph Client`\s.
cephx
The Ceph authentication protocol. Cephx operates like Kerberos,
but it has no single point of failure.
CephX
The Ceph authentication protocol. CephX operates like Kerberos,
but it has no single point of failure. See the :ref:`CephX
Configuration Reference<rados-cephx-config-ref>`.
Client
A client is any program external to Ceph that uses a Ceph
Cluster to store and replicate data.
Cloud Platforms
Cloud Stacks
@ -271,6 +297,12 @@
This is the unique identifier of an OSD. This term is used
interchangeably with ``fsid``
Period
In the context of :term:`RGW`, a period is the configuration
state of the :term:`Realm`. The period stores the configuration
state of a multi-site configuration. When the period is updated,
the "epoch" is said thereby to have been changed.
:ref:`Pool<rados_pools>`
A pool is a logical partition used to store objects.
@ -301,6 +333,10 @@
The block storage component of Ceph. Also called "RADOS Block
Device" or :term:`Ceph Block Device`.
:ref:`Realm<rgw-realms>`
In the context of RADOS Gateway (RGW), a realm is a globally
unique namespace that consists of one or more zonegroups.
Releases
Ceph Interim Release
@ -335,6 +371,28 @@
Amazon S3 RESTful API and the OpenStack Swift API. Also called
"RADOS Gateway" and "Ceph Object Gateway".
scrubs
The processes by which Ceph ensures data integrity. During the
process of scrubbing, Ceph generates a catalog of all objects
in a placement group, then ensures that none of the objects are
missing or mismatched by comparing each primary object against
its replicas, which are stored across other OSDs. Any PG
that is determined to have a copy of an object that is
different from the other copies, or that is missing a copy
entirely, is marked "inconsistent".
There are two kinds of scrubbing: light scrubbing and deep
scrubbing (also called "normal scrubbing" and "deep scrubbing",
respectively). Light scrubbing is performed daily and does
nothing more than confirm that a given object exists and that
its metadata is correct. Deep scrubbing is performed weekly and
reads the data and uses checksums to ensure data integrity.
See :ref:`Scrubbing <rados_config_scrubbing>` in the RADOS OSD
Configuration Reference Guide and page 141 of *Mastering Ceph,
second edition* (Fisk, Nick. 2019).
secrets
Secrets are credentials used to perform digital authentication
whenever privileged users must access systems that require
@ -352,5 +410,17 @@
Teuthology
The collection of software that performs scripted tests on Ceph.
User
An individual or a system actor (for example, an application)
that uses Ceph clients to interact with the :term:`Ceph Storage
Cluster`. See :ref:`User<rados-ops-user>` and :ref:`User
Management<user-management>`.
Zone
In the context of :term:`RGW`, a zone is a logical group that
consists of one or more :term:`RGW` instances. A zone's
configuration state is stored in the :term:`period`. See
:ref:`Zones<radosgw-zones>`.
.. _https://github.com/ceph: https://github.com/ceph
.. _Cluster Map: ../architecture#cluster-map

File diff suppressed because it is too large

Image file changed (before: 568 KiB, after: 730 KiB)

View File

@ -2,8 +2,7 @@
Welcome to Ceph
=================
Ceph uniquely delivers **object, block, and file storage in one unified
system**.
Ceph delivers **object, block, and file storage in one unified system**.
.. warning::

View File

@ -4,33 +4,32 @@
Installing Ceph
===============
There are several different ways to install Ceph. Choose the
method that best suits your needs.
There are multiple ways to install Ceph.
Recommended methods
~~~~~~~~~~~~~~~~~~~
:ref:`Cephadm <cephadm>` installs and manages a Ceph cluster using containers and
systemd, with tight integration with the CLI and dashboard GUI.
:ref:`Cephadm <cephadm_deploying_new_cluster>` installs and manages a Ceph
cluster that uses containers and systemd and is tightly integrated with the CLI
and dashboard GUI.
* cephadm only supports Octopus and newer releases.
* cephadm is fully integrated with the new orchestration API and
fully supports the new CLI and dashboard features to manage
cluster deployment.
* cephadm requires container support (podman or docker) and
* cephadm supports only Octopus and newer releases.
* cephadm is fully integrated with the orchestration API and fully supports the
CLI and dashboard features that are used to manage cluster deployment.
* cephadm requires container support (in the form of Podman or Docker) and
Python 3.
`Rook <https://rook.io/>`_ deploys and manages Ceph clusters running
in Kubernetes, while also enabling management of storage resources and
provisioning via Kubernetes APIs. We recommend Rook as the way to run Ceph in
provisioning via Kubernetes APIs. We recommend Rook as the way to run Ceph in
Kubernetes or to connect an existing Ceph storage cluster to Kubernetes.
* Rook only supports Nautilus and newer releases of Ceph.
* Rook supports only Nautilus and newer releases of Ceph.
* Rook is the preferred method for running Ceph on Kubernetes, or for
connecting a Kubernetes cluster to an existing (external) Ceph
cluster.
* Rook supports the new orchestrator API. New management features
in the CLI and dashboard are fully supported.
* Rook supports the orchestrator API. Management features in the CLI and
dashboard are fully supported.
Other methods
~~~~~~~~~~~~~
@ -39,16 +38,20 @@ Other methods
Ceph clusters using Ansible.
* ceph-ansible is widely deployed.
* ceph-ansible is not integrated with the new orchestrator APIs,
introduced in Nautlius and Octopus, which means that newer
management features and dashboard integration are not available.
* ceph-ansible is not integrated with the orchestrator APIs that were
introduced in Nautilus and Octopus, which means that the management features
and dashboard integration introduced in Nautilus and Octopus are not
available in Ceph clusters deployed by means of ceph-ansible.
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a tool for quickly deploying clusters.
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a
tool that can be used to quickly deploy clusters. It is deprecated.
.. IMPORTANT::
ceph-deploy is no longer actively maintained. It is not tested on versions of Ceph newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating systems.
ceph-deploy is not actively maintained. It is not tested on versions of Ceph
newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating
systems.
`DeepSea <https://github.com/SUSE/DeepSea>`_ installs Ceph using Salt.
@ -67,7 +70,7 @@ Ceph can also be :ref:`installed manually <install-manual>`.
Windows
~~~~~~~
For Windows installations, please consult this document:
For Windows installations, consult this document:
`Windows installation guide`_.
.. _Windows installation guide: ./windows-install

View File

@ -312,6 +312,7 @@ function. This will result in a circular locking exception.
.. automethod:: MgrModule.get_perf_schema
.. automethod:: MgrModule.get_counter
.. automethod:: MgrModule.get_mgr_id
.. automethod:: MgrModule.get_daemon_health_metrics
Exposing health checks
----------------------

View File

@ -239,7 +239,7 @@ Create CephFS Export
.. code:: bash
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>]
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
This creates export RADOS objects containing the export block, where
@ -266,6 +266,18 @@ for permissible values.
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
permissible values.
``<sectype>`` specifies which authentication methods will be used when
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
and "none". More than one value can be supplied. The flag may be specified
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
server will negotiate a supported security type with the client, preferring
the supplied methods left-to-right.
.. note:: Specifying values for sectype that require Kerberos will only function on servers
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
is outside the scope of this document.
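For example, the following sketch creates a CephFS export that prefers
Kerberos with privacy but still allows ``sys`` authentication (the cluster,
file system, and pseudo-path names are placeholders):
.. code:: bash
    $ ceph nfs export create cephfs --cluster-id mynfs --pseudo-path /cephfs --fsname myfs --sectype krb5p,sys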
.. note:: Export creation is supported only for NFS Ganesha clusters deployed using nfs interface.
Create RGW Export
@ -285,7 +297,7 @@ To export a *bucket*:
.. code::
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>]
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
For example, to export *mybucket* via NFS cluster *mynfs* at the pseudo-path */bucketdata* to any host in the ``192.168.10.0/24`` network
@ -316,6 +328,18 @@ for permissible values.
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
permissible values.
``<sectype>`` specifies which authentication methods will be used when
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
and "none". More than one value can be supplied. The flag may be specified
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
server will negotiate a supported security type with the client, preferring
the supplied methods left-to-right.
.. note:: Specifying values for sectype that require Kerberos will only function on servers
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
is outside the scope of this document.
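For example, the following sketch exports the *mybucket* bucket from the
example above while preferring Kerberos with integrity but still allowing
``sys`` authentication:
.. code:: bash
    $ ceph nfs export create rgw --cluster-id mynfs --pseudo-path /bucketdata --bucket mybucket --sectype krb5i,sys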
RGW user export
^^^^^^^^^^^^^^^

View File

@ -426,6 +426,22 @@ the asynchronous writes as well as an asynchronous update to the size of the
striped file.
Debugging
^^^^^^^^^
Debugging libcephsqlite can be turned on via::
debug_cephsqlite
If running the ``sqlite3`` command-line tool, use:
.. code:: sh
env CEPH_ARGS='--log_to_file true --log-file sqlite3.log --debug_cephsqlite 20 --debug_ms 1' sqlite3 ...
This will save all the usual Ceph debugging to a file ``sqlite3.log`` for inspection.
.. _SQLite: https://sqlite.org/index.html
.. _SQLite VFS: https://www.sqlite.org/vfs.html
.. _SQLite Backup: https://www.sqlite.org/backup.html

View File

@ -1,107 +1,110 @@
.. _rados-cephx-config-ref:
========================
Cephx Config Reference
CephX Config Reference
========================
The ``cephx`` protocol is enabled by default. Cryptographic authentication has
some computational costs, though they should generally be quite low. If the
network environment connecting your client and server hosts is very safe and
you cannot afford authentication, you can turn it off. **This is not generally
recommended**.
The CephX protocol is enabled by default. The cryptographic authentication that
CephX provides has some computational costs, though they should generally be
quite low. If the network environment connecting your client and server hosts
is very safe and you cannot afford authentication, you can disable it.
**Disabling authentication is not generally recommended**.
.. note:: If you disable authentication, you are at risk of a man-in-the-middle
attack altering your client/server messages, which could lead to disastrous
security effects.
.. note:: If you disable authentication, you will be at risk of a
man-in-the-middle attack that alters your client/server messages, which
could have disastrous security effects.
For creating users, see `User Management`_. For details on the architecture
of Cephx, see `Architecture - High Availability Authentication`_.
For information about creating users, see `User Management`_. For details on
the architecture of CephX, see `Architecture - High Availability
Authentication`_.
Deployment Scenarios
====================
There are two main scenarios for deploying a Ceph cluster, which impact
how you initially configure Cephx. Most first time Ceph users use
``cephadm`` to create a cluster (easiest). For clusters using
other deployment tools (e.g., Chef, Juju, Puppet, etc.), you will need
to use the manual procedures or configure your deployment tool to
How you initially configure CephX depends on your scenario. There are two
common strategies for deploying a Ceph cluster. If you are a first-time Ceph
user, you should probably take the easiest approach: using ``cephadm`` to
deploy a cluster. But if your cluster uses other deployment tools (for example,
Ansible, Chef, Juju, or Puppet), you will need either to use the manual
deployment procedures or to configure your deployment tool so that it will
bootstrap your monitor(s).
Manual Deployment
-----------------
When you deploy a cluster manually, you have to bootstrap the monitor manually
and create the ``client.admin`` user and keyring. To bootstrap monitors, follow
the steps in `Monitor Bootstrapping`_. The steps for monitor bootstrapping are
the logical steps you must perform when using third party deployment tools like
Chef, Puppet, Juju, etc.
When you deploy a cluster manually, it is necessary to bootstrap the monitors
manually and to create the ``client.admin`` user and keyring. To bootstrap
monitors, follow the steps in `Monitor Bootstrapping`_. Follow these steps when
using third-party deployment tools (for example, Chef, Puppet, and Juju).
Enabling/Disabling Cephx
Enabling/Disabling CephX
========================
Enabling Cephx requires that you have deployed keys for your monitors,
OSDs and metadata servers. If you are simply toggling Cephx on / off,
you do not have to repeat the bootstrapping procedures.
Enabling CephX is possible only if the keys for your monitors, OSDs, and
metadata servers have already been deployed. If you are simply toggling CephX
on or off, it is not necessary to repeat the bootstrapping procedures.
Enabling Cephx
Enabling CephX
--------------
When ``cephx`` is enabled, Ceph will look for the keyring in the default search
path, which includes ``/etc/ceph/$cluster.$name.keyring``. You can override
this location by adding a ``keyring`` option in the ``[global]`` section of
your `Ceph configuration`_ file, but this is not recommended.
When CephX is enabled, Ceph will look for the keyring in the default search
path: this path includes ``/etc/ceph/$cluster.$name.keyring``. It is possible
to override this search-path location by adding a ``keyring`` option in the
``[global]`` section of your `Ceph configuration`_ file, but this is not
recommended.
Execute the following procedures to enable ``cephx`` on a cluster with
authentication disabled. If you (or your deployment utility) have already
To enable CephX on a cluster for which authentication has been disabled, carry
out the following procedure. If you (or your deployment utility) have already
generated the keys, you may skip the steps related to generating keys.
#. Create a ``client.admin`` key, and save a copy of the key for your client
host
host:
.. prompt:: bash $
ceph auth get-or-create client.admin mon 'allow *' mds 'allow *' mgr 'allow *' osd 'allow *' -o /etc/ceph/ceph.client.admin.keyring
**Warning:** This will clobber any existing
**Warning:** This step will clobber any existing
``/etc/ceph/client.admin.keyring`` file. Do not perform this step if a
deployment tool has already done it for you. Be careful!
deployment tool has already generated a keyring file for you. Be careful!
#. Create a keyring for your monitor cluster and generate a monitor
secret key.
#. Create a monitor keyring and generate a monitor secret key:
.. prompt:: bash $
ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
#. Copy the monitor keyring into a ``ceph.mon.keyring`` file in every monitor's
``mon data`` directory. For example, to copy it to ``mon.a`` in cluster ``ceph``,
use the following
#. For each monitor, copy the monitor keyring into a ``ceph.mon.keyring`` file
in the monitor's ``mon data`` directory. For example, to copy the monitor
keyring to ``mon.a`` in a cluster called ``ceph``, run the following
command:
.. prompt:: bash $
cp /tmp/ceph.mon.keyring /var/lib/ceph/mon/ceph-a/keyring
#. Generate a secret key for every MGR, where ``{$id}`` is the MGR letter
#. Generate a secret key for every MGR, where ``{$id}`` is the MGR letter:
.. prompt:: bash $
ceph auth get-or-create mgr.{$id} mon 'allow profile mgr' mds 'allow *' osd 'allow *' -o /var/lib/ceph/mgr/ceph-{$id}/keyring
#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number
#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number:
.. prompt:: bash $
ceph auth get-or-create osd.{$id} mon 'allow rwx' osd 'allow *' -o /var/lib/ceph/osd/ceph-{$id}/keyring
#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter
#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter:
.. prompt:: bash $
ceph auth get-or-create mds.{$id} mon 'allow rwx' osd 'allow *' mds 'allow *' mgr 'allow profile mds' -o /var/lib/ceph/mds/ceph-{$id}/keyring
#. Enable ``cephx`` authentication by setting the following options in the
``[global]`` section of your `Ceph configuration`_ file
#. Enable CephX authentication by setting the following options in the
``[global]`` section of your `Ceph configuration`_ file:
.. code-block:: ini
@ -109,23 +112,23 @@ generated the keys, you may skip the steps related to generating keys.
auth_service_required = cephx
auth_client_required = cephx
#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
For details on bootstrapping a monitor manually, see `Manual Deployment`_.
Disabling Cephx
Disabling CephX
---------------
The following procedure describes how to disable Cephx. If your cluster
environment is relatively safe, you can offset the computation expense of
running authentication. **We do not recommend it.** However, it may be easier
during setup and/or troubleshooting to temporarily disable authentication.
The following procedure describes how to disable CephX. If your cluster
environment is safe, you might want to disable CephX in order to offset the
computational expense of running authentication. **We do not recommend doing
so.** However, setup and troubleshooting might be easier if authentication is
temporarily disabled and subsequently re-enabled.
#. Disable ``cephx`` authentication by setting the following options in the
``[global]`` section of your `Ceph configuration`_ file
#. Disable CephX authentication by setting the following options in the
``[global]`` section of your `Ceph configuration`_ file:
.. code-block:: ini
@ -133,8 +136,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth_service_required = none
auth_client_required = none
#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
Configuration Settings
@ -146,8 +148,9 @@ Enablement
``auth_cluster_required``
:Description: If enabled, the Ceph Storage Cluster daemons (i.e., ``ceph-mon``,
``ceph-osd``, ``ceph-mds`` and ``ceph-mgr``) must authenticate with
:Description: If this configuration setting is enabled, the Ceph Storage
Cluster daemons (that is, ``ceph-mon``, ``ceph-osd``,
``ceph-mds``, and ``ceph-mgr``) are required to authenticate with
each other. Valid settings are ``cephx`` or ``none``.
:Type: String
@ -157,9 +160,9 @@ Enablement
``auth_service_required``
:Description: If enabled, the Ceph Storage Cluster daemons require Ceph Clients
to authenticate with the Ceph Storage Cluster in order to access
Ceph services. Valid settings are ``cephx`` or ``none``.
:Description: If this configuration setting is enabled, then Ceph clients can
access Ceph services only if those clients authenticate with the
Ceph Storage Cluster. Valid settings are ``cephx`` or ``none``.
:Type: String
:Required: No
@ -168,9 +171,11 @@ Enablement
``auth_client_required``
:Description: If enabled, the Ceph Client requires the Ceph Storage Cluster to
authenticate with the Ceph Client. Valid settings are ``cephx``
or ``none``.
:Description: If this configuration setting is enabled, then communication
between the Ceph client and Ceph Storage Cluster can be
established only if the Ceph Storage Cluster authenticates
against the Ceph client. Valid settings are ``cephx`` or
``none``.
:Type: String
:Required: No
@ -182,30 +187,108 @@ Enablement
Keys
----
When you run Ceph with authentication enabled, ``ceph`` administrative commands
and Ceph Clients require authentication keys to access the Ceph Storage Cluster.
When Ceph is run with authentication enabled, ``ceph`` administrative commands
and Ceph clients can access the Ceph Storage Cluster only if they use
authentication keys.
The most common way to provide these keys to the ``ceph`` administrative
commands and clients is to include a Ceph keyring under the ``/etc/ceph``
directory. For Octopus and later releases using ``cephadm``, the filename
is usually ``ceph.client.admin.keyring`` (or ``$cluster.client.admin.keyring``).
If you include the keyring under the ``/etc/ceph`` directory, you don't need to
specify a ``keyring`` entry in your Ceph configuration file.
The most common way to make these keys available to ``ceph`` administrative
commands and Ceph clients is to include a Ceph keyring under the ``/etc/ceph``
directory. For Octopus and later releases that use ``cephadm``, the filename is
usually ``ceph.client.admin.keyring``. If the keyring is included in the
``/etc/ceph`` directory, then it is unnecessary to specify a ``keyring`` entry
in the Ceph configuration file.
We recommend copying the Ceph Storage Cluster's keyring file to nodes where you
will run administrative commands, because it contains the ``client.admin`` key.
Because the Ceph Storage Cluster's keyring file contains the ``client.admin``
key, we recommend copying the keyring file to nodes from which you run
administrative commands.
To perform this step manually, execute the following:
To perform this step manually, run the following command:
.. prompt:: bash $
sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
.. tip:: Ensure the ``ceph.keyring`` file has appropriate permissions set
(e.g., ``chmod 644``) on your client machine.
.. tip:: Make sure that the ``ceph.keyring`` file has appropriate permissions
(for example, ``chmod 644``) set on your client machine.
You may specify the key itself in the Ceph configuration file using the ``key``
setting (not recommended), or a path to a keyfile using the ``keyfile`` setting.
You can specify the key itself by using the ``key`` setting in the Ceph
configuration file (this approach is not recommended), or instead specify a
path to a keyfile by using the ``keyfile`` setting in the Ceph configuration
file.
``keyring``
:Description: The path to the keyring file.
:Type: String
:Required: No
:Default: ``/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin``
``keyfile``
:Description: The path to a keyfile (that is, a file containing only the key).
:Type: String
:Required: No
:Default: None
``key``
:Description: The key (that is, the text string of the key itself). We do not
recommend that you use this setting unless you know what you're
doing.
:Type: String
:Required: No
:Default: None
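For example, a non-default keyring location can be specified in the Ceph
configuration file. The following is a sketch; the path is a placeholder, and
keeping keyrings under ``/etc/ceph`` is usually preferable:
.. code-block:: ini
    [client.admin]
        keyring = /etc/ceph/private/ceph.client.admin.keyring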
Daemon Keyrings
---------------
Administrative users or deployment tools (for example, ``cephadm``) generate
daemon keyrings in the same way that they generate user keyrings. By default,
Ceph stores the keyring of a daemon inside that daemon's data directory. The
default keyring locations and the capabilities that are necessary for the
daemon to function are shown below.
``ceph-mon``
:Location: ``$mon_data/keyring``
:Capabilities: ``mon 'allow *'``
``ceph-osd``
:Location: ``$osd_data/keyring``
:Capabilities: ``mgr 'allow profile osd' mon 'allow profile osd' osd 'allow *'``
``ceph-mds``
:Location: ``$mds_data/keyring``
:Capabilities: ``mds 'allow' mgr 'allow profile mds' mon 'allow profile mds' osd 'allow rwx'``
``ceph-mgr``
:Location: ``$mgr_data/keyring``
:Capabilities: ``mon 'allow profile mgr' mds 'allow *' osd 'allow *'``
``radosgw``
:Location: ``$rgw_data/keyring``
:Capabilities: ``mon 'allow rwx' osd 'allow rwx'``
.. note:: The monitor keyring (that is, ``mon.``) contains a key but no
capabilities, and this keyring is not part of the cluster ``auth`` database.
The daemon's data-directory locations default to directories of the form::
/var/lib/ceph/$type/$cluster-$id
For example, ``osd.12`` would have the following data directory::
/var/lib/ceph/osd/ceph-12
It is possible to override these locations, but it is not recommended.
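As an illustration, the entities and capabilities stored in a daemon keyring
can be listed with ``ceph-authtool``. This sketch uses the ``osd.12`` example
from above:
.. prompt:: bash $
   sudo ceph-authtool -l /var/lib/ceph/osd/ceph-12/keyring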
``keyring``
@ -286,16 +369,66 @@ You can override these locations, but it is not recommended.
Signatures
----------
Ceph performs a signature check that provides some limited protection
against messages being tampered with in flight (e.g., by a "man in the
middle" attack).
Ceph performs a signature check that provides some limited protection against
messages being tampered with in flight (for example, by a "man in the middle"
attack).
Like other parts of Ceph authentication, Ceph provides fine-grained control so
you can enable/disable signatures for service messages between clients and
Ceph, and so you can enable/disable signatures for messages between Ceph daemons.
As with other parts of Ceph authentication, signatures admit of fine-grained
control. You can enable or disable signatures for service messages between
clients and Ceph, and for messages between Ceph daemons.
Note that even with signatures enabled data is not encrypted in
flight.
Note that even when signatures are enabled data is not encrypted in flight.
``cephx_require_signatures``
:Description: If this configuration setting is set to ``true``, Ceph requires
signatures on all message traffic between the Ceph client and the
Ceph Storage Cluster, and between daemons within the Ceph Storage
Cluster.
.. note::
**ANTIQUATED NOTE:**
Neither Ceph Argonaut nor Linux kernel versions prior to 3.19
support signatures; if one of these clients is in use, ``cephx_require_signatures``
can be disabled in order to allow the client to connect.
:Type: Boolean
:Required: No
:Default: ``false``
``cephx_cluster_require_signatures``
:Description: If this configuration setting is set to ``true``, Ceph requires
signatures on all message traffic between Ceph daemons within the
Ceph Storage Cluster.
:Type: Boolean
:Required: No
:Default: ``false``
``cephx_service_require_signatures``
:Description: If this configuration setting is set to ``true``, Ceph requires
signatures on all message traffic between Ceph clients and the
Ceph Storage Cluster.
:Type: Boolean
:Required: No
:Default: ``false``
``cephx_sign_messages``
:Description: If this configuration setting is set to ``true``, and if the Ceph
version supports message signing, then Ceph will sign all
messages so that they are more difficult to spoof.
:Type: Boolean
:Default: ``true``
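For example, signature checks can be required for both client and
intra-cluster traffic by adding the following to the ``[global]`` section of
the Ceph configuration file. This is a sketch; apply it only if no clients
that lack signature support need to connect:
.. code-block:: ini
    [global]
        cephx_require_signatures = true
        cephx_sign_messages = true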
``cephx_require_signatures``
@ -346,9 +479,9 @@ Time to Live
``auth_service_ticket_ttl``
:Description: When the Ceph Storage Cluster sends a Ceph Client a ticket for
authentication, the Ceph Storage Cluster assigns the ticket a
time to live.
:Description: When the Ceph Storage Cluster sends a ticket for authentication
to a Ceph client, the Ceph Storage Cluster assigns that ticket a
Time To Live (TTL).
:Type: Double
:Default: ``60*60``
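For example, the ticket lifetime could be raised from the default of one hour
to two hours with the centralized configuration CLI (a sketch; the value shown
is only illustrative):
.. prompt:: bash $
   ceph config set global auth_service_ticket_ttl 7200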

View File

@ -549,33 +549,35 @@ testing purposes, and are not recommended for use by operators.
Runtime Changes
===============
In most cases, Ceph allows you to make changes to the configuration of
a daemon at runtime. This capability is quite useful for
increasing/decreasing logging output, enabling/disabling debug
settings, and even for runtime optimization.
In most cases, Ceph permits changes to the configuration of a daemon at
runtime. This can be used for increasing or decreasing the amount of logging
output, for enabling or disabling debug settings, and for runtime optimization.
Generally speaking, configuration options can be updated in the usual
way via the ``ceph config set`` command. For example, do enable the debug log level on a specific OSD:
Configuration options can be updated via the ``ceph config set`` command. For
example, to enable the debug log level on a specific OSD, run a command of this form:
.. prompt:: bash $
ceph config set osd.123 debug_ms 20
Note that if the same option is also customized in a local
configuration file, the monitor setting will be ignored (it has a
lower priority than the local config file).
.. note:: If an option has been customized in a local configuration file, the
`central config
<https://ceph.io/en/news/blog/2018/new-mimic-centralized-configuration-management/>`_
setting will be ignored (it has a lower priority than the local
configuration file).
Override values
---------------
You can also temporarily set an option using the `tell` or `daemon`
interfaces on the Ceph CLI. These *override* values are ephemeral in
that they only affect the running process and are discarded/lost if
the daemon or process restarts.
Options can be set temporarily by using the `tell` or `daemon` interfaces on
the Ceph CLI. These *override* values are ephemeral, which means that they
affect only the current instance of the daemon and revert to persistently
configured values when the daemon restarts.
Override values can be set in two ways:
#. From any host, we can send a message to a daemon over the network with:
#. From any host, send a message to a daemon with a command of the following
form:
.. prompt:: bash $
@ -587,16 +589,16 @@ Override values can be set in two ways:
ceph tell osd.123 config set debug_osd 20
The `tell` command can also accept a wildcard for the daemon
identifier. For example, to adjust the debug level on all OSD
daemons:
The ``tell`` command can also accept a wildcard as the daemon identifier.
For example, to adjust the debug level on all OSD daemons, run a command of
this form:
.. prompt:: bash $
ceph tell osd.* config set debug_osd 20
#. From the host the process is running on, we can connect directly to
the process via a socket in ``/var/run/ceph`` with:
#. On the host where the daemon is running, connect to the daemon via a socket
in ``/var/run/ceph`` by running a command of this form:
.. prompt:: bash $
@ -608,8 +610,8 @@ Override values can be set in two ways:
ceph daemon osd.4 config set debug_osd 20
Note that in the ``ceph config show`` command output these temporary
values will be shown with a source of ``override``.
.. note:: In the output of the ``ceph config show`` command, these temporary
values are shown with a source of ``override``.
Viewing runtime settings

View File

@ -1,4 +1,3 @@
.. _ceph-conf-common-settings:
Common Settings
@ -7,30 +6,33 @@ Common Settings
The `Hardware Recommendations`_ section provides some hardware guidelines for
configuring a Ceph Storage Cluster. It is possible for a single :term:`Ceph
Node` to run multiple daemons. For example, a single node with multiple drives
may run one ``ceph-osd`` for each drive. Ideally, you will have a node for a
particular type of process. For example, some nodes may run ``ceph-osd``
daemons, other nodes may run ``ceph-mds`` daemons, and still other nodes may
run ``ceph-mon`` daemons.
usually runs one ``ceph-osd`` for each drive. Ideally, each node will be
assigned to a particular type of process. For example, some nodes might run
``ceph-osd`` daemons, other nodes might run ``ceph-mds`` daemons, and still
other nodes might run ``ceph-mon`` daemons.
Each node has a name. The name of a node can be found in its ``host`` setting.
Monitors also specify a network address and port (that is, a domain name or IP
address) that can be found in the ``addr`` setting. A basic configuration file
typically specifies only minimal settings for each instance of monitor daemons.
For example:
Each node has a name identified by the ``host`` setting. Monitors also specify
a network address and port (i.e., domain name or IP address) identified by the
``addr`` setting. A basic configuration file will typically specify only
minimal settings for each instance of monitor daemons. For example:
.. code-block:: ini
[global]
mon_initial_members = ceph1
mon_host = 10.0.0.1
[global]
mon_initial_members = ceph1
mon_host = 10.0.0.1
.. important:: The ``host`` setting is the short name of the node (i.e., not
an fqdn). It is **NOT** an IP address either. Enter ``hostname -s`` on
the command line to retrieve the name of the node. Do not use ``host``
settings for anything other than initial monitors unless you are deploying
Ceph manually. You **MUST NOT** specify ``host`` under individual daemons
when using deployment tools like ``chef`` or ``cephadm``, as those tools
will enter the appropriate values for you in the cluster map.
.. important:: The ``host`` setting's value is the short name of the node. It
is not an FQDN. It is **NOT** an IP address. To retrieve the name of the
node, enter ``hostname -s`` on the command line. Unless you are deploying
Ceph manually, do not use ``host`` settings for anything other than initial
monitor setup. **DO NOT** specify the ``host`` setting under individual
daemons when using deployment tools like ``chef`` or ``cephadm``. Such tools
are designed to enter the appropriate values for you in the cluster map.
.. _ceph-network-config:
@ -38,34 +40,35 @@ minimal settings for each instance of monitor daemons. For example:
Networks
========
See the `Network Configuration Reference`_ for a detailed discussion about
configuring a network for use with Ceph.
For more about configuring a network for use with Ceph, see the `Network
Configuration Reference`_ .
Monitors
========
Production Ceph clusters typically provision a minimum of three :term:`Ceph Monitor`
daemons to ensure availability should a monitor instance crash. A minimum of
three ensures that the Paxos algorithm can determine which version
of the :term:`Ceph Cluster Map` is the most recent from a majority of Ceph
Ceph production clusters typically provision at least three :term:`Ceph
Monitor` daemons to ensure availability in the event of a monitor instance
crash. A minimum of three :term:`Ceph Monitor` daemons ensures that the Paxos
algorithm is able to determine which version of the :term:`Ceph Cluster Map` is
the most recent. It makes this determination by consulting a majority of Ceph
Monitors in the quorum.
.. note:: You may deploy Ceph with a single monitor, but if the instance fails,
the lack of other monitors may interrupt data service availability.
the lack of other monitors might interrupt data-service availability.
Ceph Monitors normally listen on port ``3300`` for the new v2 protocol, and ``6789`` for the old v1 protocol.
Ceph Monitors normally listen on port ``3300`` for the new v2 protocol, and on
port ``6789`` for the old v1 protocol.
By default, Ceph expects to store monitor data under the
following path::
By default, Ceph expects to store monitor data on the following path::
/var/lib/ceph/mon/$cluster-$id
/var/lib/ceph/mon/$cluster-$id
You or a deployment tool (e.g., ``cephadm``) must create the corresponding
directory. With metavariables fully expressed and a cluster named "ceph", the
foregoing directory would evaluate to::
You or a deployment tool (for example, ``cephadm``) must create the
corresponding directory. With metavariables fully expressed and a cluster named
"ceph", the path specified in the above example evaluates to::
/var/lib/ceph/mon/ceph-a
/var/lib/ceph/mon/ceph-a
For additional details, see the `Monitor Config Reference`_.
@ -74,22 +77,22 @@ For additional details, see the `Monitor Config Reference`_.
.. _ceph-osd-config:
Authentication
==============
.. versionadded:: Bobtail 0.56
For Bobtail (v 0.56) and beyond, you should expressly enable or disable
authentication in the ``[global]`` section of your Ceph configuration file.
Authentication is explicitly enabled or disabled in the ``[global]`` section of
the Ceph configuration file, as shown here:
.. code-block:: ini
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
Additionally, you should enable message signing. See `Cephx Config Reference`_ for details.
In addition, you should enable message signing. For details, see `Cephx Config
Reference`_.
.. _Cephx Config Reference: ../auth-config-ref
@ -100,65 +103,68 @@ Additionally, you should enable message signing. See `Cephx Config Reference`_ f
OSDs
====
Ceph production clusters typically deploy :term:`Ceph OSD Daemons` where one node
has one OSD daemon running a Filestore on one storage device. The BlueStore back
end is now default, but when using Filestore you specify a journal size. For example:
When Ceph production clusters deploy :term:`Ceph OSD Daemons`, the typical
arrangement is that one node has one OSD daemon running Filestore on one
storage device. BlueStore is now the default back end, but when using Filestore
you must specify a journal size. For example:
.. code-block:: ini
[osd]
osd_journal_size = 10000
[osd]
osd_journal_size = 10000
[osd.0]
host = {hostname} #manual deployments only.
[osd.0]
host = {hostname} #manual deployments only.
By default, Ceph expects to store a Ceph OSD Daemon's data at the
following path::
By default, Ceph expects to store a Ceph OSD Daemon's data on the following
path::
/var/lib/ceph/osd/$cluster-$id
/var/lib/ceph/osd/$cluster-$id
You or a deployment tool (e.g., ``cephadm``) must create the corresponding
directory. With metavariables fully expressed and a cluster named "ceph", this
example would evaluate to::
You or a deployment tool (for example, ``cephadm``) must create the
corresponding directory. With metavariables fully expressed and a cluster named
"ceph", the path specified in the above example evaluates to::
/var/lib/ceph/osd/ceph-0
/var/lib/ceph/osd/ceph-0
You may override this path using the ``osd_data`` setting. We recommend not
changing the default location. Create the default directory on your OSD host.
You can override this path using the ``osd_data`` setting. We recommend that
you do not change the default location. To create the default directory on your
OSD host, run the following commands:
.. prompt:: bash $
ssh {osd-host}
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
ssh {osd-host}
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
The ``osd_data`` path ideally leads to a mount point with a device that is
separate from the device that contains the operating system and
daemons. If an OSD is to use a device other than the OS device, prepare it for
use with Ceph, and mount it to the directory you just created
The ``osd_data`` path ought to lead to a mount point that has mounted on it a
device that is distinct from the device that contains the operating system and
the daemons. To use a device distinct from the device that contains the
operating system and the daemons, prepare it for use with Ceph and mount it on
the directory you just created by running the following commands:
.. prompt:: bash $
ssh {new-osd-host}
sudo mkfs -t {fstype} /dev/{disk}
sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
ssh {new-osd-host}
sudo mkfs -t {fstype} /dev/{disk}
sudo mount -o user_xattr /dev/{disk} /var/lib/ceph/osd/ceph-{osd-number}
We recommend using the ``xfs`` file system when running
:command:`mkfs`. (``btrfs`` and ``ext4`` are not recommended and are no
longer tested.)
We recommend using the ``xfs`` file system when running :command:`mkfs`. (The
``btrfs`` and ``ext4`` file systems are not recommended and are no longer
tested.)
See the `OSD Config Reference`_ for additional configuration details.
For additional configuration details, see `OSD Config Reference`_.
Heartbeats
==========
During runtime operations, Ceph OSD Daemons check up on other Ceph OSD Daemons
and report their findings to the Ceph Monitor. You do not have to provide any
settings. However, if you have network latency issues, you may wish to modify
the settings.
and report their findings to the Ceph Monitor. This process does not require
you to provide any settings. However, if you have network latency issues, you
might want to modify the default settings.
See `Configuring Monitor/OSD Interaction`_ for additional details.
For additional details, see `Configuring Monitor/OSD Interaction`_.
.. _ceph-logging-and-debugging:
@ -166,9 +172,9 @@ See `Configuring Monitor/OSD Interaction`_ for additional details.
Logs / Debugging
================
Sometimes you may encounter issues with Ceph that require
modifying logging output and using Ceph's debugging. See `Debugging and
Logging`_ for details on log rotation.
You might sometimes encounter issues with Ceph that require you to use Ceph's
logging and debugging features. For details on log rotation, see `Debugging and
Logging`_.
.. _Debugging and Logging: ../../troubleshooting/log-and-debug
@ -186,33 +192,30 @@ Example ceph.conf
Running Multiple Clusters (DEPRECATED)
======================================
Each Ceph cluster has an internal name that is used as part of configuration
and log file names as well as directory and mountpoint names. This name
defaults to "ceph". Previous releases of Ceph allowed one to specify a custom
name instead, for example "ceph2". This was intended to faciliate running
multiple logical clusters on the same physical hardware, but in practice this
was rarely exploited and should no longer be attempted. Prior documentation
could also be misinterpreted as requiring unique cluster names in order to
use ``rbd-mirror``.
Each Ceph cluster has an internal name. This internal name is used in
configuration settings and in the names of log files, directories, and
mountpoints. This name defaults to "ceph". Previous
releases of Ceph allowed one to specify a custom name instead, for example
"ceph2". This option was intended to facilitate the running of multiple logical
clusters on the same physical hardware, but in practice it was rarely
exploited. Custom cluster names should no longer be attempted. Old
documentation might lead readers to wrongly think that unique cluster names are
required to use ``rbd-mirror``. They are not required.
Custom cluster names are now considered deprecated and the ability to deploy
them has already been removed from some tools, though existing custom name
deployments continue to operate. The ability to run and manage clusters with
custom names may be progressively removed by future Ceph releases, so it is
strongly recommended to deploy all new clusters with the default name "ceph".
them has already been removed from some tools, although existing custom-name
deployments continue to operate. The ability to run and manage clusters with
custom names might be progressively removed by future Ceph releases, so **it is
strongly recommended to deploy all new clusters with the default name "ceph"**.
Some Ceph CLI commands accept an optional ``--cluster`` (cluster name) option. This
option is present purely for backward compatibility and need not be accomodated
by new tools and deployments.
Some Ceph CLI commands accept a ``--cluster`` (cluster name) option. This
option is present only for the sake of backward compatibility. New tools and
deployments cannot be relied upon to accommodate this option.
If you do need to allow multiple clusters to exist on the same host, please use
If you need to allow multiple clusters to exist on the same host, use
:ref:`cephadm`, which uses containers to fully isolate each cluster.
.. _Hardware Recommendations: ../../../start/hardware-recommendations
.. _Network Configuration Reference: ../network-config-ref
.. _OSD Config Reference: ../osd-config-ref
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction

View File

@ -2,15 +2,19 @@
Looking up Monitors through DNS
===============================
Since version 11.0.0 RADOS supports looking up Monitors through DNS.
Since Ceph version 11.0.0 (Kraken), RADOS has supported looking up monitors
through DNS.
This way daemons and clients do not require a *mon host* configuration directive in their ceph.conf configuration file.
The addition of the ability to look up monitors through DNS means that daemons
and clients do not require a *mon host* configuration directive in their
``ceph.conf`` configuration file.
Using DNS SRV TCP records clients are able to look up the monitors.
With a DNS update, clients and daemons can be made aware of changes
in the monitor topology. To be more precise and technical, clients look up the
monitors by using ``DNS SRV TCP`` records.
This allows for less configuration on clients and monitors. Using a DNS update clients and daemons can be made aware of changes in the monitor topology.
By default clients and daemons will look for the TCP service called *ceph-mon* which is configured by the *mon_dns_srv_name* configuration directive.
By default, clients and daemons look for the TCP service called *ceph-mon*,
which is configured by the *mon_dns_srv_name* configuration directive.
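As a quick sanity check (the domain shown is only an example), the SRV records that clients would consult can be queried directly:

.. prompt:: bash $

   dig +short SRV _ceph-mon._tcp.example.com

Each returned record names a monitor host and the port on which to contact it, so updating these records is sufficient to make clients aware of a changed monitor topology.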
``mon dns srv name``

View File

@ -92,8 +92,7 @@ Similarly, two options control whether IPv4 and IPv6 addresses are used:
.. note:: The ability to bind to multiple ports has paved the way for
dual-stack IPv4 and IPv6 support. That said, dual-stack support is
not yet tested as of Nautilus v14.2.0 and likely needs some
additional code changes to work correctly.
not yet supported as of Quincy v17.2.0.
Connection modes
----------------

View File

@ -196,6 +196,8 @@ See `Pool & PG Config Reference`_ for details.
.. index:: OSD; scrubbing
.. _rados_config_scrubbing:
Scrubbing
=========

View File

@ -4,13 +4,12 @@
.. index:: pools; configuration
When you create pools and set the number of placement groups (PGs) for each, Ceph
uses default values when you don't specifically override the defaults. **We
recommend** overriding some of the defaults. Specifically, we recommend setting
a pool's replica size and overriding the default number of placement groups. You
can specifically set these values when running `pool`_ commands. You can also
override the defaults by adding new ones in the ``[global]`` section of your
Ceph configuration file.
Ceph uses default values to determine how many placement groups (PGs) will be
assigned to each pool. We recommend overriding some of the defaults.
Specifically, we recommend setting a pool's replica size and overriding the
default number of placement groups. You can set these values when running
`pool`_ commands. You can also override the defaults by adding new ones in the
``[global]`` section of your Ceph configuration file.
.. literalinclude:: pool-pg.conf
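On a running cluster the same defaults can also be inspected or changed centrally. A sketch (the values shown are illustrative, not recommendations):

.. prompt:: bash $

   ceph config get osd osd_pool_default_size
   ceph config set global osd_pool_default_size 3
   ceph config set global osd_pool_default_pg_num 128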

View File

@ -2,64 +2,65 @@
BlueStore Migration
=====================
Each OSD can run either BlueStore or FileStore, and a single Ceph
cluster can contain a mix of both. Users who have previously deployed
FileStore are likely to want to transition to BlueStore in order to
take advantage of the improved performance and robustness. There are
several strategies for making such a transition.
Each OSD must be formatted as either Filestore or BlueStore. However, a Ceph
cluster can operate with a mixture of both Filestore OSDs and BlueStore OSDs.
Because BlueStore is superior to Filestore in performance and robustness, and
because Filestore is not supported by Ceph releases beginning with Reef, users
deploying Filestore OSDs should transition to BlueStore. There are several
strategies for making the transition to BlueStore.
An individual OSD cannot be converted in place in isolation, however:
BlueStore and FileStore are simply too different for that to be
practical. "Conversion" will rely either on the cluster's normal
replication and healing support or tools and strategies that copy OSD
content from an old (FileStore) device to a new (BlueStore) one.
BlueStore is so different from Filestore that an individual OSD cannot be
converted in place. Instead, the conversion process must use either (1) the
cluster's normal replication and healing support, or (2) tools and strategies
that copy OSD content from an old (Filestore) device to a new (BlueStore) one.
Deploying new OSDs with BlueStore
=================================
Deploy new OSDs with BlueStore
==============================
Use BlueStore when deploying new OSDs (for example, when the cluster is
expanded). Because this is the default behavior, no specific change is
needed.
Any new OSDs (e.g., when the cluster is expanded) can be deployed
using BlueStore. This is the default behavior so no specific change
is needed.
Similarly, use BlueStore for any OSDs that have been reprovisioned after
a failed drive was replaced.
Similarly, any OSDs that are reprovisioned after replacing a failed drive
can use BlueStore.
Converting existing OSDs
========================
Convert existing OSDs
=====================
"Mark-``out``" replacement
--------------------------
Mark out and replace
--------------------
The simplest approach is to verify that the cluster is healthy and
then follow these steps for each Filestore OSD in succession: mark the OSD
``out``, wait for the data to replicate across the cluster, reprovision the OSD,
mark the OSD back ``in``, and wait for recovery to complete before proceeding
to the next OSD. This approach is easy to automate, but it entails unnecessary
data migration that carries costs in time and SSD wear.
The simplest approach is to mark out each device in turn, wait for the
data to replicate across the cluster, reprovision the OSD, and mark
it back in again. It is simple and easy to automate. However, it requires
more data migration than should be necessary, so it is not optimal.
#. Identify a FileStore OSD to replace::
#. Identify a Filestore OSD to replace::
ID=<osd-id-number>
DEVICE=<disk-device>
You can tell whether a given OSD is FileStore or BlueStore with:
#. Determine whether a given OSD is Filestore or BlueStore:
.. prompt:: bash $
.. prompt:: bash $
ceph osd metadata $ID | grep osd_objectstore
ceph osd metadata $ID | grep osd_objectstore
You can get a current count of filestore vs bluestore with:
#. Get a current count of Filestore and BlueStore OSDs:
.. prompt:: bash $
.. prompt:: bash $
ceph osd count-metadata osd_objectstore
ceph osd count-metadata osd_objectstore
#. Mark the filestore OSD out:
#. Mark a Filestore OSD ``out``:
.. prompt:: bash $
ceph osd out $ID
#. Wait for the data to migrate off the OSD in question:
#. Wait for the data to migrate off this OSD:
.. prompt:: bash $
@ -71,7 +72,9 @@ more data migration than should be necessary, so it is not optimal.
systemctl kill ceph-osd@$ID
#. Make note of which device this OSD is using:
.. _osd_id_retrieval:
#. Note which device the OSD is using:
.. prompt:: bash $
@ -83,24 +86,27 @@ more data migration than should be necessary, so it is not optimal.
umount /var/lib/ceph/osd/ceph-$ID
#. Destroy the OSD data. Be *EXTREMELY CAREFUL* as this will destroy
the contents of the device; be certain the data on the device is
not needed (i.e., that the cluster is healthy) before proceeding:
#. Destroy the OSD's data. Be *EXTREMELY CAREFUL*! These commands will destroy
the contents of the device; you must be certain that the data on the device is
not needed (in other words, that the cluster is healthy) before proceeding:
.. prompt:: bash $
ceph-volume lvm zap $DEVICE
#. Tell the cluster the OSD has been destroyed (and a new OSD can be
reprovisioned with the same ID):
#. Tell the cluster that the OSD has been destroyed (and that a new OSD can be
reprovisioned with the same OSD ID):
.. prompt:: bash $
ceph osd destroy $ID --yes-i-really-mean-it
#. Reprovision a BlueStore OSD in its place with the same OSD ID.
This requires you to identify which device to wipe based on what you saw
mounted above. BE CAREFUL! :
#. Provision a BlueStore OSD in place by using the same OSD ID. This requires
you to identify which device to wipe, and to make certain that you target
the correct and intended device, using the information that was retrieved in
the :ref:`"Note which device the OSD is using" <osd_id_retrieval>` step. BE
CAREFUL! Note that you may need to modify these commands when dealing with
hybrid OSDs:
.. prompt:: bash $
@ -108,12 +114,15 @@ more data migration than should be necessary, so it is not optimal.
#. Repeat.
You can allow the refilling of the replacement OSD to happen
concurrently with the draining of the next OSD, or follow the same
procedure for multiple OSDs in parallel, as long as you ensure the
cluster is fully clean (all data has all replicas) before destroying
any OSDs. Failure to do so will reduce the redundancy of your data
and increase the risk of (or potentially even cause) data loss.
You may opt to (1) have the balancing of the replacement BlueStore OSD take
place concurrently with the draining of the next Filestore OSD, or instead
(2) follow the same procedure for multiple OSDs in parallel. In either case,
however, you must ensure that the cluster is fully clean (in other words, that
all data has all replicas) before destroying any OSDs. If you opt to reprovision
multiple OSDs in parallel, be **very** careful to destroy OSDs only within a
single CRUSH failure domain (for example, ``host`` or ``rack``). Failure to
satisfy this requirement will reduce the redundancy and availability of your
data and increase the risk of data loss (or even guarantee data loss).
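One way to confirm that it is safe to proceed before destroying anything (a sketch, not a required part of the procedure) is to ask the cluster directly and wait until it agrees:

.. prompt:: bash $

   while ! ceph osd safe-to-destroy osd.$ID ; do sleep 60 ; done

This loop returns only once all of the data that the OSD held is fully replicated elsewhere.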
Advantages:
@ -123,55 +132,53 @@ Advantages:
Disadvantages:
* Data is copied over the network twice: once to some other OSD in the
cluster (to maintain the desired number of replicas), and then again
back to the reprovisioned BlueStore OSD.
* Data is copied over the network twice: once to another OSD in the cluster (to
maintain the specified number of replicas), and again back to the
reprovisioned BlueStore OSD.
"Whole host" replacement
------------------------
Whole host replacement
----------------------
If you have a spare host in the cluster, or sufficient free space to evacuate
an entire host for use as a spare, then the conversion can be done on a
host-by-host basis so that each stored copy of the data is migrated only once.
If you have a spare host in the cluster, or have sufficient free space
to evacuate an entire host in order to use it as a spare, then the
conversion can be done on a host-by-host basis with each stored copy of
the data migrating only once.
To use this approach, you need an empty host that has no OSDs provisioned.
There are two ways to do this: either by using a new, empty host that is not
yet part of the cluster, or by offloading data from an existing host that is
already part of the cluster.
First, you need an empty host that has no data. There are two ways to do this: either by starting with a new, empty host that isn't yet part of the cluster, or by offloading data from an existing host that is in the cluster.
Using a new, empty host
^^^^^^^^^^^^^^^^^^^^^^^
Use a new, empty host
^^^^^^^^^^^^^^^^^^^^^
Ideally the host will have roughly the same capacity as each of the other hosts
you will be converting. Add the host to the CRUSH hierarchy, but do not attach
it to the root:
Ideally the host should have roughly the
same capacity as other hosts you will be converting (although it
doesn't strictly matter). ::
NEWHOST=<empty-host-name>
Add the host to the CRUSH hierarchy, but do not attach it to the root:
.. prompt:: bash $
NEWHOST=<empty-host-name>
ceph osd crush add-bucket $NEWHOST host
Make sure the ceph packages are installed.
Make sure that Ceph packages are installed on the new host.
Use an existing host
^^^^^^^^^^^^^^^^^^^^
Using an existing host
^^^^^^^^^^^^^^^^^^^^^^
If you would like to use an existing host
that is already part of the cluster, and there is sufficient free
space on that host so that all of its data can be migrated off,
then you can instead do::
OLDHOST=<existing-cluster-host-to-offload>
If you would like to use an existing host that is already part of the cluster,
and if there is sufficient free space on that host so that all of its data can
be migrated off to other cluster hosts, you can do the following (instead of
using a new, empty host):
.. prompt:: bash $
OLDHOST=<existing-cluster-host-to-offload>
ceph osd crush unlink $OLDHOST default
where "default" is the immediate ancestor in the CRUSH map. (For
smaller clusters with unmodified configurations this will normally
be "default", but it might also be a rack name.) You should now
be "default", but it might instead be a rack name.) You should now
see the host at the top of the OSD tree output with no parent:
.. prompt:: bash $
@ -192,15 +199,18 @@ see the host at the top of the OSD tree output with no parent:
2 ssd 1.00000 osd.2 up 1.00000 1.00000
...
If everything looks good, jump directly to the "Wait for data
migration to complete" step below and proceed from there to clean up
the old OSDs.
If everything looks good, jump directly to the :ref:`"Wait for the data
migration to complete" <bluestore_data_migration_step>` step below and proceed
from there to clean up the old OSDs.
Migration process
^^^^^^^^^^^^^^^^^
If you're using a new host, start at step #1. For an existing host,
jump to step #5 below.
If you're using a new host, start at :ref:`the first step
<bluestore_migration_process_first_step>`. If you're using an existing host,
jump to :ref:`this step <bluestore_data_migration_step>`.
.. _bluestore_migration_process_first_step:
#. Provision new BlueStore OSDs for all devices:
@ -208,14 +218,14 @@ jump to step #5 below.
ceph-volume lvm create --bluestore --data /dev/$DEVICE
#. Verify OSDs join the cluster with:
#. Verify that the new OSDs have joined the cluster:
.. prompt:: bash $
ceph osd tree
You should see the new host ``$NEWHOST`` with all of the OSDs beneath
it, but the host should *not* be nested beneath any other node in
it, but the host should *not* be nested beneath any other node in the
hierarchy (like ``root default``). For example, if ``newhost`` is
the empty host, you might see something like::
@ -244,13 +254,16 @@ jump to step #5 below.
ceph osd crush swap-bucket $NEWHOST $OLDHOST
At this point all data on ``$OLDHOST`` will start migrating to OSDs
on ``$NEWHOST``. If there is a difference in the total capacity of
the old and new hosts you may also see some data migrate to or from
other nodes in the cluster, but as long as the hosts are similarly
sized this will be a relatively small amount of data.
At this point all data on ``$OLDHOST`` will begin migrating to the OSDs on
``$NEWHOST``. If there is a difference between the total capacity of the
old hosts and the total capacity of the new hosts, you may also see some
data migrate to or from other nodes in the cluster. Provided that the hosts
are similarly sized, however, this will be a relatively small amount of
data.
#. Wait for data migration to complete:
.. _bluestore_data_migration_step:
#. Wait for the data migration to complete:
.. prompt:: bash $
@ -261,8 +274,8 @@ jump to step #5 below.
.. prompt:: bash $
ssh $OLDHOST
systemctl kill ceph-osd.target
umount /var/lib/ceph/osd/ceph-*
systemctl kill ceph-osd.target
umount /var/lib/ceph/osd/ceph-*
#. Destroy and purge the old OSDs:
@ -270,69 +283,71 @@ jump to step #5 below.
for osd in `ceph osd ls-tree $OLDHOST`; do
ceph osd purge $osd --yes-i-really-mean-it
done
done
#. Wipe the old OSD devices. This requires you to identify which
devices are to be wiped manually (BE CAREFUL!). For each device:
#. Wipe the old OSDs. This requires you to identify which devices are to be
wiped manually. BE CAREFUL! For each device:
.. prompt:: bash $
ceph-volume lvm zap $DEVICE
#. Use the now-empty host as the new host, and repeat::
#. Use the now-empty host as the new host, and repeat:
NEWHOST=$OLDHOST
.. prompt:: bash $
NEWHOST=$OLDHOST
Advantages:
* Data is copied over the network only once.
* Converts an entire host's OSDs at once.
* Can parallelize to converting multiple hosts at a time.
* No spare devices are required on each host.
* An entire host's OSDs are converted at once.
* Can be parallelized, to make possible the conversion of multiple hosts at the same time.
* No host involved in this process needs to have a spare device.
Disadvantages:
* A spare host is required.
* An entire host's worth of OSDs will be migrating data at a time. This
is like likely to impact overall cluster performance.
* An entire host's worth of OSDs will be migrating data at a time. This
is likely to impact overall cluster performance.
* All migrated data still makes one full hop over the network.
Per-OSD device copy
-------------------
A single logical OSD can be converted by using the ``copy`` function
of ``ceph-objectstore-tool``. This requires that the host have a free
device (or devices) to provision a new, empty BlueStore OSD. For
example, if each host in your cluster has 12 OSDs, then you'd need a
13th available device so that each OSD can be converted in turn before the
old device is reclaimed to convert the next OSD.
included in ``ceph-objectstore-tool``. This requires that the host have one or more free
devices to provision a new, empty BlueStore OSD. For
example, if each host in your cluster has twelve OSDs, then you need a
thirteenth unused device so that each OSD can be converted before the
previous OSD's device is reclaimed to convert the next OSD.
Caveats:
* This strategy requires that a blank BlueStore OSD be prepared
without allocating a new OSD ID, something that the ``ceph-volume``
tool doesn't support. More importantly, the setup of *dmcrypt* is
closely tied to the OSD identity, which means that this approach
does not work with encrypted OSDs.
* This approach requires that we prepare an empty BlueStore OSD but that we do not allocate
a new OSD ID to it. The ``ceph-volume`` tool does not support such an operation. **IMPORTANT:**
because the setup of *dmcrypt* is closely tied to the identity of the OSD, this approach does not
work with encrypted OSDs.
* The device must be manually partitioned.
* Tooling not implemented!
* Not documented!
* An unsupported user-contributed script that demonstrates this process may be found here:
https://github.com/ceph/ceph/blob/master/src/script/contrib/ceph-migrate-bluestore.bash
Advantages:
* Little or no data migrates over the network during the conversion.
* Provided that the 'noout' or the 'norecover'/'norebalance' flags are set on the OSD or the
cluster while the conversion process is underway, little or no data migrates over the
network during the conversion.
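For example (a sketch only; adapt to your environment), the relevant flags can be set cluster-wide or for just the single OSD being converted:

.. prompt:: bash $

   ceph osd set norecover
   ceph osd set norebalance
   ceph osd add-noout osd.$ID

The corresponding ``ceph osd unset ...`` and ``ceph osd rm-noout osd.$ID`` commands clear the flags once the conversion is finished.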
Disadvantages:
* Tooling not fully implemented.
* Process not documented.
* Each host must have a spare or empty device.
* The OSD is offline during the conversion, which means new writes will
be written to only a subset of the OSDs. This increases the risk of data
loss due to a subsequent failure. (However, if there is a failure before
conversion is complete, the original FileStore OSD can be started to provide
access to its original data.)
* Tooling is not fully implemented, supported, or documented.
* Each host must have an appropriate spare or empty device for staging.
* The OSD is offline during the conversion, which means new writes to PGs
with the OSD in their acting set may not be ideally redundant until the
subject OSD comes up and recovers. This increases the risk of data
loss due to an overlapping failure. However, if another OSD fails before
conversion and startup have completed, the original Filestore OSD can be
started to provide access to its original data.

View File

@ -584,11 +584,11 @@ output::
A dump of the monitor state:
.. prompt:: bash $
.. prompt:: bash $
ceph mon dump
ceph mon dump
::
::
dumped monmap epoch 2
epoch 2

View File

@ -1,14 +1,14 @@
.. _ecpool:
=============
==============
Erasure code
=============
==============
By default, Ceph `pools <../pools>`_ are created with the type "replicated". In
replicated-type pools, every object is copied to multiple disks (this
multiple copying is the "replication").
replicated-type pools, every object is copied to multiple disks. This
multiple copying is the method of data protection known as "replication".
In contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
By contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
pools use a method of data protection that is different from replication. In
erasure coding, data is broken into fragments of two kinds: data blocks and
parity blocks. If a drive fails or becomes corrupted, the parity blocks are
@ -16,17 +16,17 @@ used to rebuild the data. At scale, erasure coding saves space relative to
replication.
In this documentation, data blocks are referred to as "data chunks"
and parity blocks are referred to as "encoding chunks".
and parity blocks are referred to as "coding chunks".
Erasure codes are also called "forward error correction codes". The
first forward error correction code was developed in 1950 by Richard
Hamming at Bell Laboratories.
Creating a sample erasure coded pool
Creating a sample erasure-coded pool
------------------------------------
The simplest erasure coded pool is equivalent to `RAID5
The simplest erasure-coded pool is similar to `RAID5
<https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5>`_ and
requires at least three hosts:
@ -47,12 +47,13 @@ requires at least three hosts:
ABCDEFGHI
Erasure code profiles
Erasure-code profiles
---------------------
The default erasure code profile can sustain the loss of two OSDs. This erasure
code profile is equivalent to a replicated pool of size three, but requires
2TB to store 1TB of data instead of 3TB to store 1TB of data. The default
The default erasure-code profile can sustain the overlapping loss of two OSDs
without losing data. This erasure-code profile is equivalent to a replicated
pool of size three, but with different storage requirements: instead of
requiring 3TB to store 1TB, it requires only 2TB to store 1TB. The default
profile can be displayed with this command:
.. prompt:: bash $
@ -68,26 +69,27 @@ profile can be displayed with this command:
technique=reed_sol_van
.. note::
The default erasure-coded pool, the profile of which is displayed here, is
not the same as the simplest erasure-coded pool.
The profile just displayed is for the *default* erasure-coded pool, not the
*simplest* erasure-coded pool. These two pools are not the same:
The default erasure-coded pool has two data chunks (k) and two coding chunks
(m). The profile of the default erasure-coded pool is "k=2 m=2".
The default erasure-coded pool has two data chunks (K) and two coding chunks
(M). The profile of the default erasure-coded pool is "k=2 m=2".
The simplest erasure-coded pool has two data chunks (k) and one coding chunk
(m). The profile of the simplest erasure-coded pool is "k=2 m=1".
The simplest erasure-coded pool has two data chunks (K) and one coding chunk
(M). The profile of the simplest erasure-coded pool is "k=2 m=1".
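To list the profiles that exist on a cluster and inspect one of them (only ``default`` is guaranteed to be present):

.. prompt:: bash $

   ceph osd erasure-code-profile ls
   ceph osd erasure-code-profile get default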
Choosing the right profile is important because the profile cannot be modified
after the pool is created. If you find that you need an erasure-coded pool with
a profile different than the one you have created, you must create a new pool
with a different (and presumably more carefully-considered) profile. When the
new pool is created, all objects from the wrongly-configured pool must be moved
to the newly-created pool. There is no way to alter the profile of a pool after its creation.
with a different (and presumably more carefully considered) profile. When the
new pool is created, all objects from the wrongly configured pool must be moved
to the newly created pool. There is no way to alter the profile of a pool after
the pool has been created.
The most important parameters of the profile are *K*, *M* and
The most important parameters of the profile are *K*, *M*, and
*crush-failure-domain* because they define the storage overhead and
the data durability. For example, if the desired architecture must
sustain the loss of two racks with a storage overhead of 67% overhead,
sustain the loss of two racks with a storage overhead of 67%,
the following profile can be defined:
.. prompt:: bash $
@ -106,7 +108,7 @@ the following profile can be defined:
The *NYAN* object will be divided in three (*K=3*) and two additional
*chunks* will be created (*M=2*). The value of *M* defines how many
OSD can be lost simultaneously without losing any data. The
OSDs can be lost simultaneously without losing any data. The
*crush-failure-domain=rack* will create a CRUSH rule that ensures
no two *chunks* are stored in the same rack.
@ -155,51 +157,53 @@ no two *chunks* are stored in the same rack.
+------+
More information can be found in the `erasure code profiles
More information can be found in the `erasure-code profiles
<../erasure-code-profile>`_ documentation.
Erasure Coding with Overwrites
------------------------------
By default, erasure coded pools only work with uses like RGW that
perform full object writes and appends.
By default, erasure-coded pools work only with operations that
perform full object writes and appends (for example, RGW).
Since Luminous, partial writes for an erasure coded pool may be
Since Luminous, partial writes for an erasure-coded pool may be
enabled with a per-pool setting. This lets RBD and CephFS store their
data in an erasure coded pool:
data in an erasure-coded pool:
.. prompt:: bash $
ceph osd pool set ec_pool allow_ec_overwrites true
This can only be enabled on a pool residing on bluestore OSDs, since
bluestore's checksumming is used to detect bitrot or other corruption
during deep-scrub. In addition to being unsafe, using filestore with
ec overwrites yields low performance compared to bluestore.
This can be enabled only on a pool residing on BlueStore OSDs, since
BlueStore's checksumming is used during deep scrubs to detect bitrot
or other corruption. Using Filestore with EC overwrites is not only
unsafe, but it also results in lower performance compared to BlueStore.
Erasure coded pools do not support omap, so to use them with RBD and
CephFS you must instruct them to store their data in an ec pool, and
Erasure-coded pools do not support omap, so to use them with RBD and
CephFS you must instruct them to store their data in an EC pool and
their metadata in a replicated pool. For RBD, this means using the
erasure coded pool as the ``--data-pool`` during image creation:
erasure-coded pool as the ``--data-pool`` during image creation:
.. prompt:: bash $
rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
For CephFS, an erasure coded pool can be set as the default data pool during
For CephFS, an erasure-coded pool can be set as the default data pool during
file system creation or via `file layouts <../../../cephfs/file-layouts>`_.
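For instance (the pool, file system, and mountpoint names are illustrative), an erasure-coded pool with overwrites enabled can be added to an existing CephFS and then selected for a directory with a file layout:

.. prompt:: bash $

   ceph fs add_data_pool cephfs ec_pool
   setfattr -n ceph.dir.layout.pool -v ec_pool /mnt/cephfs/ecdir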
Erasure coded pool and cache tiering
------------------------------------
Erasure-coded pools and cache tiering
-------------------------------------
Erasure coded pools require more resources than replicated pools and
lack some functionalities such as omap. To overcome these
limitations, one can set up a `cache tier <../cache-tiering>`_
before the erasure coded pool.
Erasure-coded pools require more resources than replicated pools and
lack some of the functionality supported by replicated pools (for example, omap).
To overcome these limitations, one can set up a `cache tier <../cache-tiering>`_
before setting up the erasure-coded pool.
For instance, if the pool *hot-storage* is made of fast storage:
For example, if the pool *hot-storage* is made of fast storage, the following commands
will place the *hot-storage* pool as a tier of *ecpool* in *writeback*
mode:
.. prompt:: bash $
@ -207,56 +211,60 @@ For instance, if the pool *hot-storage* is made of fast storage:
ceph osd tier cache-mode hot-storage writeback
ceph osd tier set-overlay ecpool hot-storage
will place the *hot-storage* pool as tier of *ecpool* in *writeback*
mode so that every write and read to the *ecpool* are actually using
the *hot-storage* and benefit from its flexibility and speed.
The result is that every write and read to the *ecpool* actually uses
the *hot-storage* pool and benefits from its flexibility and speed.
More information can be found in the `cache tiering
<../cache-tiering>`_ documentation.
<../cache-tiering>`_ documentation. Note, however, that cache tiering
is deprecated and may be removed completely in a future release.
Erasure coded pool recovery
Erasure-coded pool recovery
---------------------------
If an erasure coded pool loses some shards, it must recover them from the others.
This generally involves reading from the remaining shards, reconstructing the data, and
writing it to the new peer.
In Octopus, erasure coded pools can recover as long as there are at least *K* shards
If an erasure-coded pool loses any data shards, it must recover them from others.
This recovery involves reading from the remaining shards, reconstructing the data, and
writing new shards.
In Octopus and later releases, erasure-coded pools can recover as long as there are at least *K* shards
available. (With fewer than *K* shards, you have actually lost data!)
Prior to Octopus, erasure coded pools required at least *min_size* shards to be
available, even if *min_size* is greater than *K*. (We generally recommend min_size
be *K+2* or more to prevent loss of writes and data.)
This conservative decision was made out of an abundance of caution when designing the new pool
mode but also meant pools with lost OSDs but no data loss were unable to recover and go active
without manual intervention to change the *min_size*.
Prior to Octopus, erasure-coded pools required that at least ``min_size`` shards be
available, even if ``min_size`` was greater than ``K``. This was a conservative
decision made out of an abundance of caution when designing the new pool
mode. As a result, however, pools with lost OSDs but without complete data loss were
unable to recover and go active without manual intervention to temporarily change
the ``min_size`` setting.
We recommend that ``min_size`` be ``K+2`` or greater to prevent loss of writes and
loss of data.
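The setting can be checked or adjusted on an existing erasure-coded pool (the pool name is an example), keeping the *K+2* guidance above in mind:

.. prompt:: bash $

   ceph osd pool get ecpool min_size
   ceph osd pool set ecpool min_size 4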
Glossary
--------
*chunk*
when the encoding function is called, it returns chunks of the same
size. Data chunks which can be concatenated to reconstruct the original
object and coding chunks which can be used to rebuild a lost chunk.
When the encoding function is called, it returns chunks of the same size as each other. There are two
kinds of chunks: (1) *data chunks*, which can be concatenated to reconstruct the original object, and
(2) *coding chunks*, which can be used to rebuild a lost chunk.
*K*
the number of data *chunks*, i.e. the number of *chunks* in which the
original object is divided. For instance if *K* = 2 a 10KB object
will be divided into *K* objects of 5KB each.
The number of data chunks into which an object is divided. For example, if *K* = 2, then a 10KB object
is divided into two objects of 5KB each.
*M*
the number of coding *chunks*, i.e. the number of additional *chunks*
computed by the encoding functions. If there are 2 coding *chunks*,
it means 2 OSDs can be out without losing data.
The number of coding chunks computed by the encoding function. *M* is equal to the number of OSDs that can
be missing from the cluster without the cluster suffering data loss. For example, if there are two coding
chunks, then two OSDs can be missing without data loss.
Table of content
----------------
Table of contents
-----------------
.. toctree::
:maxdepth: 1
:maxdepth: 1
erasure-code-profile
erasure-code-jerasure
erasure-code-isa
erasure-code-lrc
erasure-code-shec
erasure-code-clay
erasure-code-profile
erasure-code-jerasure
erasure-code-isa
erasure-code-lrc
erasure-code-shec
erasure-code-clay

File diff suppressed because it is too large

View File

@ -3,35 +3,36 @@
=========================
High availability and high reliability require a fault-tolerant approach to
managing hardware and software issues. Ceph has no single point-of-failure, and
can service requests for data in a "degraded" mode. Ceph's `data placement`_
introduces a layer of indirection to ensure that data doesn't bind directly to
particular OSD addresses. This means that tracking down system faults requires
finding the `placement group`_ and the underlying OSDs at root of the problem.
managing hardware and software issues. Ceph has no single point of failure and
it can service requests for data even when in a "degraded" mode. Ceph's `data
placement`_ introduces a layer of indirection to ensure that data doesn't bind
directly to specific OSDs. For this reason, tracking system faults
requires finding the `placement group`_ (PG) and the underlying OSDs at the
root of the problem.
.. tip:: A fault in one part of the cluster may prevent you from accessing a
particular object, but that doesn't mean that you cannot access other objects.
.. tip:: A fault in one part of the cluster might prevent you from accessing a
particular object, but that doesn't mean that you are prevented from accessing other objects.
When you run into a fault, don't panic. Just follow the steps for monitoring
your OSDs and placement groups. Then, begin troubleshooting.
your OSDs and placement groups, and then begin troubleshooting.
Ceph is generally self-repairing. However, when problems persist, monitoring
OSDs and placement groups will help you identify the problem.
Ceph is self-repairing. However, when problems persist, monitoring OSDs and
placement groups will help you identify the problem.
Monitoring OSDs
===============
An OSD's status is either in the cluster (``in``) or out of the cluster
(``out``); and, it is either up and running (``up``), or it is down and not
running (``down``). If an OSD is ``up``, it may be either ``in`` the cluster
(you can read and write data) or it is ``out`` of the cluster. If it was
``in`` the cluster and recently moved ``out`` of the cluster, Ceph will migrate
placement groups to other OSDs. If an OSD is ``out`` of the cluster, CRUSH will
not assign placement groups to the OSD. If an OSD is ``down``, it should also be
An OSD's status is as follows: it is either in the cluster (``in``) or out of the cluster
(``out``); likewise, it is either up and running (``up``) or down and not
running (``down``). If an OSD is ``up``, it can be either ``in`` the cluster
(if so, you can read and write data) or ``out`` of the cluster. If the OSD was previously
``in`` the cluster but was recently moved ``out`` of the cluster, Ceph will migrate its
PGs to other OSDs. If an OSD is ``out`` of the cluster, CRUSH will
not assign any PGs to that OSD. If an OSD is ``down``, it should also be
``out``.
.. note:: If an OSD is ``down`` and ``in``, there is a problem and the cluster
will not be in a healthy state.
.. note:: If an OSD is ``down`` and ``in``, then there is a problem and the cluster
is not in a healthy state.
.. ditaa::
@ -50,72 +51,71 @@ not assign placement groups to the OSD. If an OSD is ``down``, it should also be
| | | |
+----------------+ +----------------+
If you execute a command such as ``ceph health``, ``ceph -s`` or ``ceph -w``,
you may notice that the cluster does not always echo back ``HEALTH OK``. Don't
panic. With respect to OSDs, you should expect that the cluster will **NOT**
echo ``HEALTH OK`` in a few expected circumstances:
If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
you might notice that the cluster does not always show ``HEALTH OK``. Don't
panic. There are certain circumstances in which it is expected and normal that
the cluster will **NOT** show ``HEALTH OK``:
#. You haven't started the cluster yet (it won't respond).
#. You have just started or restarted the cluster and it's not ready yet,
because the placement groups are getting created and the OSDs are in
the process of peering.
#. You just added or removed an OSD.
#. You just have modified your cluster map.
#. You haven't started the cluster yet.
#. You have just started or restarted the cluster and it's not ready to show
health statuses yet, because the PGs are in the process of being created and
the OSDs are in the process of peering.
#. You have just added or removed an OSD.
#. You have just modified your cluster map.
An important aspect of monitoring OSDs is to ensure that when the cluster
is up and running that all OSDs that are ``in`` the cluster are ``up`` and
running, too. To see if all OSDs are running, execute:
Checking to see if OSDs are ``up`` and running is an important aspect of monitoring them:
whenever the cluster is up and running, every OSD that is ``in`` the cluster should also
be ``up`` and running. To see if all of the cluster's OSDs are running, run the following
command:
.. prompt:: bash $
ceph osd stat
ceph osd stat
The result should tell you the total number of OSDs (x),
how many are ``up`` (y), how many are ``in`` (z) and the map epoch (eNNNN). ::
The output provides the following information: the total number of OSDs (x),
how many OSDs are ``up`` (y), how many OSDs are ``in`` (z), and the map epoch (eNNNN). ::
x osds: y up, z in; epoch: eNNNN
x osds: y up, z in; epoch: eNNNN
If the number of OSDs that are ``in`` the cluster is more than the number of
OSDs that are ``up``, execute the following command to identify the ``ceph-osd``
If the number of OSDs that are ``in`` the cluster is greater than the number of
OSDs that are ``up``, run the following command to identify the ``ceph-osd``
daemons that are not running:
.. prompt:: bash $
ceph osd tree
ceph osd tree
::
#ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 2.00000 pool openstack
-3 2.00000 rack dell-2950-rack-A
-2 2.00000 host dell-2950-A1
0 ssd 1.00000 osd.0 up 1.00000 1.00000
1 ssd 1.00000 osd.1 down 1.00000 1.00000
#ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 2.00000 pool openstack
-3 2.00000 rack dell-2950-rack-A
-2 2.00000 host dell-2950-A1
0 ssd 1.00000 osd.0 up 1.00000 1.00000
1 ssd 1.00000 osd.1 down 1.00000 1.00000
.. tip:: The ability to search through a well-designed CRUSH hierarchy may help
you troubleshoot your cluster by identifying the physical locations faster.
.. tip:: Searching through a well-designed CRUSH hierarchy to identify the physical
locations of particular OSDs might help you troubleshoot your cluster.
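On recent releases, ``ceph osd tree`` can also be asked to show only the OSDs in a given state, which is a quick way to locate the daemons that need attention:

.. prompt:: bash $

   ceph osd tree down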
If an OSD is ``down``, start it:
If an OSD is ``down``, start it by running the following command:
.. prompt:: bash $
sudo systemctl start ceph-osd@1
sudo systemctl start ceph-osd@1
See `OSD Not Running`_ for problems associated with OSDs that stopped, or won't
restart.
For problems associated with OSDs that have stopped or won't restart, see `OSD Not Running`_.
PG Sets
=======
When CRUSH assigns placement groups to OSDs, it looks at the number of replicas
for the pool and assigns the placement group to OSDs such that each replica of
the placement group gets assigned to a different OSD. For example, if the pool
requires three replicas of a placement group, CRUSH may assign them to
``osd.1``, ``osd.2`` and ``osd.3`` respectively. CRUSH actually seeks a
pseudo-random placement that will take into account failure domains you set in
your `CRUSH map`_, so you will rarely see placement groups assigned to nearest
neighbor OSDs in a large cluster.
When CRUSH assigns a PG to OSDs, it takes note of how many replicas of the PG
are required by the pool and then assigns each replica to a different OSD.
For example, if the pool requires three replicas of a PG, CRUSH might assign
them individually to ``osd.1``, ``osd.2`` and ``osd.3``. CRUSH seeks a
pseudo-random placement that takes into account the failure domains that you
have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
immediately adjacent OSDs in a large cluster.
Ceph processes a client request using the **Acting Set**, which is the set of
OSDs that will actually handle the requests since they have a full and working
@ -123,56 +123,55 @@ version of a placement group shard. The set of OSDs that should contain a shard
of a particular placement group is known as the **Up Set**, i.e. where data is
moved/copied to (or planned to be).
In some cases, an OSD in the Acting Set is ``down`` or otherwise not able to
service requests for objects in the placement group. When these situations
arise, don't panic. Common examples include:
Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
service requests for objects in the PG. When this kind of situation
arises, don't panic. Common examples of such a situation include:
- You added or removed an OSD. Then, CRUSH reassigned the placement group to
other OSDs--thereby changing the composition of the Acting Set and spawning
the migration of data with a "backfill" process.
- You added or removed an OSD, CRUSH reassigned the PG to
other OSDs, and this reassignment changed the composition of the Acting Set and triggered
the migration of data by means of a "backfill" process.
- An OSD was ``down``, was restarted, and is now ``recovering``.
- An OSD in the Acting Set is ``down`` or unable to service requests,
and another OSD has temporarily assumed its duties.
In most cases, the Up Set and the Acting Set are identical. When they are not,
it may indicate that Ceph is migrating the PG (it's remapped), an OSD is
recovering, or that there is a problem (i.e., Ceph usually echoes a "HEALTH
WARN" state with a "stuck stale" message in such scenarios).
Typically, the Up Set and the Acting Set are identical. When they are not, it
might indicate that Ceph is migrating the PG (in other words, that the PG has
been remapped), that an OSD is recovering, or that there is a problem with the
cluster (in such scenarios, Ceph usually shows a "HEALTH WARN" state with a
"stuck stale" message).
To retrieve a list of placement groups, execute:
To retrieve a list of PGs, run the following command:
.. prompt:: bash $
ceph pg dump
ceph pg dump
To view which OSDs are within the Acting Set or the Up Set for a given placement
group, execute:
To see which OSDs are within the Acting Set and the Up Set for a specific PG, run the following command:
.. prompt:: bash $
ceph pg map {pg-num}
ceph pg map {pg-num}
The result should tell you the osdmap epoch (eNNN), the placement group number
({pg-num}), the OSDs in the Up Set (up[]), and the OSDs in the acting set
The output provides the following information: the osdmap epoch (eNNN), the PG number
({pg-num}), the OSDs in the Up Set (up[]), and the OSDs in the Acting Set
(acting[])::
osdmap eNNN pg {raw-pg-num} ({pg-num}) -> up [0,1,2] acting [0,1,2]
osdmap eNNN pg {raw-pg-num} ({pg-num}) -> up [0,1,2] acting [0,1,2]
.. note:: If the Up Set and Acting Set do not match, this may be an indicator
that the cluster rebalancing itself or of a potential problem with
.. note:: If the Up Set and the Acting Set do not match, this might indicate
that the cluster is rebalancing itself or that there is a problem with
the cluster.
Peering
=======
Before you can write data to a placement group, it must be in an ``active``
state, and it **should** be in a ``clean`` state. For Ceph to determine the
current state of a placement group, the primary OSD of the placement group
(i.e., the first OSD in the acting set), peers with the secondary and tertiary
OSDs to establish agreement on the current state of the placement group
(assuming a pool with 3 replicas of the PG).
Before you can write data to a PG, it must be in an ``active`` state and it
will preferably be in a ``clean`` state. For Ceph to determine the current
state of a PG, peering must take place. That is, the primary OSD of the PG
(that is, the first OSD in the Acting Set) must peer with the secondary and
tertiary OSDs so that consensus on the current state of the PG can be established. In
the following diagram, we assume a pool with three replicas of the PG:
.. ditaa::
@ -192,104 +191,105 @@ OSDs to establish agreement on the current state of the placement group
|<-----------------------------|
| Peering |
The OSDs also report their status to the monitor. See `Configuring Monitor/OSD
Interaction`_ for details. To troubleshoot peering issues, see `Peering
The OSDs also report their status to the monitor. For details, see `Configuring
Monitor/OSD Interaction`_. To troubleshoot peering issues, see `Peering
Failure`_.
Monitoring Placement Group States
=================================
Monitoring PG States
====================
If you execute a command such as ``ceph health``, ``ceph -s`` or ``ceph -w``,
you may notice that the cluster does not always echo back ``HEALTH OK``. After
you check to see if the OSDs are running, you should also check placement group
states. You should expect that the cluster will **NOT** echo ``HEALTH OK`` in a
number of placement group peering-related circumstances:
If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
you might notice that the cluster does not always show ``HEALTH OK``. After
first checking to see if the OSDs are running, you should also check PG
states. There are certain PG-peering-related circumstances in which it is expected
and normal that the cluster will **NOT** show ``HEALTH OK``:
#. You have just created a pool and placement groups haven't peered yet.
#. The placement groups are recovering.
#. You have just created a pool and the PGs haven't peered yet.
#. The PGs are recovering.
#. You have just added an OSD to or removed an OSD from the cluster.
#. You have just modified your CRUSH map and your placement groups are migrating.
#. There is inconsistent data in different replicas of a placement group.
#. Ceph is scrubbing a placement group's replicas.
#. You have just modified your CRUSH map and your PGs are migrating.
#. There is inconsistent data in different replicas of a PG.
#. Ceph is scrubbing a PG's replicas.
#. Ceph doesn't have enough storage capacity to complete backfilling operations.
If one of the foregoing circumstances causes Ceph to echo ``HEALTH WARN``, don't
panic. In many cases, the cluster will recover on its own. In some cases, you
may need to take action. An important aspect of monitoring placement groups is
to ensure that when the cluster is up and running that all placement groups are
``active``, and preferably in the ``clean`` state. To see the status of all
placement groups, execute:
If one of these circumstances causes Ceph to show ``HEALTH WARN``, don't
panic. In many cases, the cluster will recover on its own. In some cases, however, you
might need to take action. An important aspect of monitoring PGs is to check their
status as ``active`` and ``clean``: that is, it is important to ensure that, when the
cluster is up and running, all PGs are ``active`` and (preferably) ``clean``.
To see the status of every PG, run the following command:
.. prompt:: bash $
ceph pg stat
ceph pg stat
The result should tell you the total number of placement groups (x), how many
placement groups are in a particular state such as ``active+clean`` (y) and the
The output provides the following information: the total number of PGs (x), how many
PGs are in a particular state such as ``active+clean`` (y), and the
amount of data stored (z). ::
x pgs: y active+clean; z bytes data, aa MB used, bb GB / cc GB avail
x pgs: y active+clean; z bytes data, aa MB used, bb GB / cc GB avail
.. note:: It is common for Ceph to report multiple states for placement groups.
.. note:: It is common for Ceph to report multiple states for PGs (for example,
``active+clean``, ``active+clean+remapped``, and ``active+clean+scrubbing``).
In addition to the placement group states, Ceph will also echo back the amount of
storage capacity used (aa), the amount of storage capacity remaining (bb), and the total
storage capacity for the placement group. These numbers can be important in a
few cases:
Here Ceph shows not only the PG states, but also storage capacity used (aa),
the amount of storage capacity remaining (bb), and the total storage capacity
of the PG. These values can be important in a few cases:
- You are reaching your ``near full ratio`` or ``full ratio``.
- Your data is not getting distributed across the cluster due to an
error in your CRUSH configuration.
- The cluster is reaching its ``near full ratio`` or ``full ratio``.
- Data is not being distributed across the cluster due to an error in the
CRUSH configuration.
.. topic:: Placement Group IDs
Placement group IDs consist of the pool number (not pool name) followed
by a period (.) and the placement group ID--a hexadecimal number. You
can view pool numbers and their names from the output of ``ceph osd
lspools``. For example, the first pool created corresponds to
pool number ``1``. A fully qualified placement group ID has the
PG IDs consist of the pool number (not the pool name) followed by a period
(.) and a hexadecimal number. You can view pool numbers and their names
in the output of ``ceph osd lspools``. For example, the first pool that was
created corresponds to pool number ``1``. A fully qualified PG ID has the
following form::
{pool-num}.{pg-id}
{pool-num}.{pg-id}
And it typically looks like this::
It typically resembles the following::
1.1f
1.1701b
To retrieve a list of placement groups, execute the following:
To retrieve a list of PGs, run the following command:
.. prompt:: bash $
ceph pg dump
ceph pg dump
You can also format the output in JSON format and save it to a file:
To format the output in JSON format and save it to a file, run the following command:
.. prompt:: bash $
ceph pg dump -o {filename} --format=json
ceph pg dump -o {filename} --format=json
To query a particular placement group, execute the following:
To query a specific PG, run the following command:
.. prompt:: bash $
ceph pg {poolnum}.{pg-id} query
ceph pg {poolnum}.{pg-id} query
Ceph will output the query in JSON format.
The following subsections describe the common pg states in detail.
The following subsections describe the most common PG states in detail.
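It can also be useful to list only the PGs that are stuck in problematic states (the state names shown are examples of the accepted values):

.. prompt:: bash $

   ceph pg dump_stuck inactive
   ceph pg dump_stuck unclean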
Creating
--------
When you create a pool, it will create the number of placement groups you
specified. Ceph will echo ``creating`` when it is creating one or more
placement groups. Once they are created, the OSDs that are part of a placement
group's Acting Set will peer. Once peering is complete, the placement group
status should be ``active+clean``, which means a Ceph client can begin writing
to the placement group.
PGs are created when you create a pool: the command that creates a pool
specifies the total number of PGs for that pool, and when the pool is created
all of those PGs are created as well. Ceph will echo ``creating`` while it is
creating PGs. After the PG(s) are created, the OSDs that are part of a PG's
Acting Set will peer. Once peering is complete, the PG status should be
``active+clean``. This status means that Ceph clients can begin writing to the
PG.
.. ditaa::
@ -300,43 +300,38 @@ to the placement group.
Peering
-------
When Ceph is Peering a placement group, Ceph is bringing the OSDs that
store the replicas of the placement group into **agreement about the state**
of the objects and metadata in the placement group. When Ceph completes peering,
this means that the OSDs that store the placement group agree about the current
state of the placement group. However, completion of the peering process does
**NOT** mean that each replica has the latest contents.
When a PG peers, the OSDs that store the replicas of its data converge on an
agreed state of the data and metadata within that PG. When peering is complete,
those OSDs agree about the state of that PG. However, completion of the peering
process does **NOT** mean that each replica has the latest contents.
.. topic:: Authoritative History
Ceph will **NOT** acknowledge a write operation to a client, until
all OSDs of the acting set persist the write operation. This practice
ensures that at least one member of the acting set will have a record
of every acknowledged write operation since the last successful
peering operation.
Ceph will **NOT** acknowledge a write operation to a client until that write
operation is persisted by every OSD in the Acting Set. This practice ensures
that at least one member of the Acting Set will have a record of every
acknowledged write operation since the last successful peering operation.
With an accurate record of each acknowledged write operation, Ceph can
construct and disseminate a new authoritative history of the placement
group--a complete, and fully ordered set of operations that, if performed,
would bring an OSDs copy of a placement group up to date.
Given an accurate record of each acknowledged write operation, Ceph can
construct a new authoritative history of the PG--that is, a complete and
fully ordered set of operations that, if performed, would bring an OSD's
copy of the PG up to date.
Active
------
Once Ceph completes the peering process, a placement group may become
``active``. The ``active`` state means that the data in the placement group is
generally available in the primary placement group and the replicas for read
and write operations.
After Ceph has completed the peering process, a PG should become ``active``.
The ``active`` state means that the data in the PG is generally available for
read and write operations in the primary and replica OSDs.
Clean
-----
When a placement group is in the ``clean`` state, the primary OSD and the
replica OSDs have successfully peered and there are no stray replicas for the
placement group. Ceph replicated all objects in the placement group the correct
number of times.
When a PG is in the ``clean`` state, all OSDs holding its data and metadata
have successfully peered and there are no stray replicas. Ceph has replicated
all objects in the PG the correct number of times.
Degraded
@ -344,143 +339,147 @@ Degraded
When a client writes an object to the primary OSD, the primary OSD is
responsible for writing the replicas to the replica OSDs. After the primary OSD
writes the object to storage, the placement group will remain in a ``degraded``
writes the object to storage, the PG will remain in a ``degraded``
state until the primary OSD has received an acknowledgement from the replica
OSDs that Ceph created the replica objects successfully.
The reason a placement group can be ``active+degraded`` is that an OSD may be
``active`` even though it doesn't hold all of the objects yet. If an OSD goes
``down``, Ceph marks each placement group assigned to the OSD as ``degraded``.
The OSDs must peer again when the OSD comes back online. However, a client can
still write a new object to a ``degraded`` placement group if it is ``active``.
The reason that a PG can be ``active+degraded`` is that an OSD can be
``active`` even if it doesn't yet hold all of the PG's objects. If an OSD goes
``down``, Ceph marks each PG assigned to the OSD as ``degraded``. The PGs must
peer again when the OSD comes back online. However, a client can still write a
new object to a ``degraded`` PG if it is ``active``.
If an OSD is ``down`` and the ``degraded`` condition persists, Ceph may mark the
If an OSD is ``down`` and the ``degraded`` condition persists, Ceph might mark the
``down`` OSD as ``out`` of the cluster and remap the data from the ``down`` OSD
to another OSD. The time between being marked ``down`` and being marked ``out``
is controlled by ``mon osd down out interval``, which is set to ``600`` seconds
is determined by ``mon_osd_down_out_interval``, which is set to ``600`` seconds
by default.
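This interval can be inspected or changed at runtime. For example, to extend it to 900 seconds (the value is illustrative only):

.. prompt:: bash $

   ceph config get mon mon_osd_down_out_interval
   ceph config set mon mon_osd_down_out_interval 900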
A placement group can also be ``degraded``, because Ceph cannot find one or more
objects that Ceph thinks should be in the placement group. While you cannot
read or write to unfound objects, you can still access all of the other objects
in the ``degraded`` placement group.
A PG can also be in the ``degraded`` state because there are one or more
objects that Ceph expects to find in the PG but that Ceph cannot find. Although
you cannot read or write to unfound objects, you can still access all of the other
objects in the ``degraded`` PG.
Recovering
----------
Ceph was designed for fault-tolerance at a scale where hardware and software
problems are ongoing. When an OSD goes ``down``, its contents may fall behind
the current state of other replicas in the placement groups. When the OSD is
back ``up``, the contents of the placement groups must be updated to reflect the
current state. During that time period, the OSD may reflect a ``recovering``
state.
Ceph was designed for fault-tolerance, because hardware and other server
problems are expected or even routine. When an OSD goes ``down``, its contents
might fall behind the current state of other replicas in the PGs. When the OSD
has returned to the ``up`` state, the contents of the PGs must be updated to
reflect that current state. During that time period, the OSD might be in a
``recovering`` state.
Recovery is not always trivial, because a hardware failure might cause a
cascading failure of multiple OSDs. For example, a network switch for a rack or
cabinet may fail, which can cause the OSDs of a number of host machines to fall
behind the current state of the cluster. Each one of the OSDs must recover once
the fault is resolved.
cabinet might fail, which can cause the OSDs of a number of host machines to
fall behind the current state of the cluster. In such a scenario, general
recovery is possible only if each of the OSDs recovers after the fault has been
resolved.
Ceph provides a number of settings to balance the resource contention between
new service requests and the need to recover data objects and restore the
placement groups to the current state. The ``osd recovery delay start`` setting
allows an OSD to restart, re-peer and even process some replay requests before
starting the recovery process. The ``osd
recovery thread timeout`` sets a thread timeout, because multiple OSDs may fail,
restart and re-peer at staggered rates. The ``osd recovery max active`` setting
limits the number of recovery requests an OSD will entertain simultaneously to
prevent the OSD from failing to serve . The ``osd recovery max chunk`` setting
limits the size of the recovered data chunks to prevent network congestion.
Ceph provides a number of settings that determine how the cluster balances the
resource contention between the need to process new service requests and the
need to recover data objects and restore the PGs to the current state. The
``osd_recovery_delay_start`` setting allows an OSD to restart, re-peer, and
even process some replay requests before starting the recovery process. The
``osd_recovery_thread_timeout`` setting determines the duration of a thread
timeout, because multiple OSDs might fail, restart, and re-peer at staggered
rates. The ``osd_recovery_max_active`` setting limits the number of recovery
requests an OSD can entertain simultaneously, in order to prevent the OSD from
failing to serve requests. The ``osd_recovery_max_chunk`` setting limits the size of
the recovered data chunks, in order to prevent network congestion.
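As an illustrative sketch, these recovery options can be inspected and tuned
at runtime with the ``ceph config`` commands (the values shown are examples
only, not recommendations):

.. prompt:: bash $

   ceph config get osd osd_recovery_max_active
   ceph config set osd osd_recovery_max_active 3
   ceph config set osd osd_recovery_max_chunk 8388608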
Back Filling
------------
When a new OSD joins the cluster, CRUSH will reassign placement groups from OSDs
in the cluster to the newly added OSD. Forcing the new OSD to accept the
reassigned placement groups immediately can put excessive load on the new OSD.
Back filling the OSD with the placement groups allows this process to begin in
the background. Once backfilling is complete, the new OSD will begin serving
requests when it is ready.
When a new OSD joins the cluster, CRUSH will reassign PGs from OSDs that are
already in the cluster to the newly added OSD. It can put excessive load on the
new OSD to force it to immediately accept the reassigned PGs. Back filling the
OSD with the PGs allows this process to begin in the background. After the
backfill operations have completed, the new OSD will begin serving requests as
soon as it is ready.
During the backfill operations, you may see one of several states:
During the backfill operations, you might see one of several states:
``backfill_wait`` indicates that a backfill operation is pending, but is not
underway yet; ``backfilling`` indicates that a backfill operation is underway;
and, ``backfill_toofull`` indicates that a backfill operation was requested,
but couldn't be completed due to insufficient storage capacity. When a
placement group cannot be backfilled, it may be considered ``incomplete``.
yet underway; ``backfilling`` indicates that a backfill operation is currently
underway; and ``backfill_toofull`` indicates that a backfill operation was
requested but couldn't be completed due to insufficient storage capacity. When
a PG cannot be backfilled, it might be considered ``incomplete``.
The ``backfill_toofull`` state may be transient. It is possible that as PGs
are moved around, space may become available. The ``backfill_toofull`` is
similar to ``backfill_wait`` in that as soon as conditions change
backfill can proceed.
The ``backfill_toofull`` state might be transient. It might happen that, as PGs
are moved around, space becomes available. The ``backfill_toofull`` state is
similar to ``backfill_wait`` in that backfill operations can proceed as soon as
conditions change.
Ceph provides a number of settings to manage the load spike associated with
reassigning placement groups to an OSD (especially a new OSD). By default,
``osd_max_backfills`` sets the maximum number of concurrent backfills to and from
an OSD to 1. The ``backfill full ratio`` enables an OSD to refuse a
backfill request if the OSD is approaching its full ratio (90%, by default) and
change with ``ceph osd set-backfillfull-ratio`` command.
If an OSD refuses a backfill request, the ``osd backfill retry interval``
enables an OSD to retry the request (after 30 seconds, by default). OSDs can
also set ``osd backfill scan min`` and ``osd backfill scan max`` to manage scan
intervals (64 and 512, by default).
Ceph provides a number of settings to manage the load spike associated with the
reassignment of PGs to an OSD (especially a new OSD). The ``osd_max_backfills``
setting specifies the maximum number of concurrent backfills to and from an OSD
(default: 1). The ``backfill_full_ratio`` setting allows an OSD to refuse a
backfill request if the OSD is approaching its full ratio (default: 90%). This
setting can be changed with the ``ceph osd set-backfillfull-ratio`` command. If
an OSD refuses a backfill request, the ``osd_backfill_retry_interval`` setting
allows an OSD to retry the request after a certain interval (default: 30
seconds). OSDs can also set ``osd_backfill_scan_min`` and
``osd_backfill_scan_max`` in order to manage scan intervals (default: 64 and
512, respectively).
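For example, the backfill throttles can be adjusted at runtime as sketched
below (the values shown simply restate the defaults):

.. prompt:: bash $

   ceph config set osd osd_max_backfills 1
   ceph osd set-backfillfull-ratio 0.90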
Remapped
--------
When the Acting Set that services a placement group changes, the data migrates
from the old acting set to the new acting set. It may take some time for a new
primary OSD to service requests. So it may ask the old primary to continue to
service requests until the placement group migration is complete. Once data
migration completes, the mapping uses the primary OSD of the new acting set.
When the Acting Set that services a PG changes, the data migrates from the old
Acting Set to the new Acting Set. Because it might take time for the new
primary OSD to begin servicing requests, the old primary OSD might be required
to continue servicing requests until the PG data migration is complete. After
data migration has completed, the mapping uses the primary OSD of the new
Acting Set.
Stale
-----
While Ceph uses heartbeats to ensure that hosts and daemons are running, the
``ceph-osd`` daemons may also get into a ``stuck`` state where they are not
reporting statistics in a timely manner (e.g., a temporary network fault). By
default, OSD daemons report their placement group, up through, boot and failure
statistics every half second (i.e., ``0.5``), which is more frequent than the
heartbeat thresholds. If the **Primary OSD** of a placement group's acting set
fails to report to the monitor or if other OSDs have reported the primary OSD
``down``, the monitors will mark the placement group ``stale``.
Although Ceph uses heartbeats in order to ensure that hosts and daemons are
running, the ``ceph-osd`` daemons might enter a ``stuck`` state where they are
not reporting statistics in a timely manner (for example, there might be a
temporary network fault). By default, OSD daemons report their PG, up through,
boot, and failure statistics every half second (that is, in accordance with a
value of ``0.5``), which is more frequent than the reports defined by the
heartbeat thresholds. If the primary OSD of a PG's Acting Set fails to report
to the monitor or if other OSDs have reported the primary OSD ``down``, the
monitors will mark the PG ``stale``.
When you start your cluster, it is common to see the ``stale`` state until
the peering process completes. After your cluster has been running for awhile,
seeing placement groups in the ``stale`` state indicates that the primary OSD
for those placement groups is ``down`` or not reporting placement group statistics
to the monitor.
When you start your cluster, it is common to see the ``stale`` state until the
peering process completes. After your cluster has been running for a while,
however, seeing PGs in the ``stale`` state indicates that the primary OSD for
those PGs is ``down`` or not reporting PG statistics to the monitor.
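To list the PGs that are currently in a given state, you can run a command of
the following form (substitute another state, such as ``degraded`` or
``remapped``, as needed):

.. prompt:: bash $

   ceph pg ls stale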
Identifying Troubled PGs
========================
As previously noted, a placement group is not necessarily problematic just
because its state is not ``active+clean``. Generally, Ceph's ability to self
repair may not be working when placement groups get stuck. The stuck states
include:
As previously noted, a PG is not necessarily having problems just because its
state is not ``active+clean``. When PGs are stuck, this might indicate that
Ceph cannot perform self-repairs. The stuck states include:
- **Unclean**: Placement groups contain objects that are not replicated the
desired number of times. They should be recovering.
- **Inactive**: Placement groups cannot process reads or writes because they
are waiting for an OSD with the most up-to-date data to come back ``up``.
- **Stale**: Placement groups are in an unknown state, because the OSDs that
host them have not reported to the monitor cluster in a while (configured
by ``mon osd report timeout``).
- **Unclean**: PGs contain objects that have not been replicated the desired
number of times. Under normal conditions, it can be assumed that these PGs
are recovering.
- **Inactive**: PGs cannot process reads or writes because they are waiting for
an OSD that has the most up-to-date data to come back ``up``.
- **Stale**: PG are in an unknown state, because the OSDs that host them have
not reported to the monitor cluster for a certain period of time (determined
by ``mon_osd_report_timeout``).
To identify stuck placement groups, execute the following:
To identify stuck PGs, run the following command:
.. prompt:: bash $
ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded]
ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded]
See `Placement Group Subsystem`_ for additional details. To troubleshoot
stuck placement groups, see `Troubleshooting PG Errors`_.
For more detail, see `Placement Group Subsystem`_. To troubleshoot stuck PGs,
see `Troubleshooting PG Errors`_.
Finding an Object Location
@ -491,55 +490,54 @@ To store object data in the Ceph Object Store, a Ceph client must:
#. Set an object name
#. Specify a `pool`_
The Ceph client retrieves the latest cluster map and the CRUSH algorithm
calculates how to map the object to a `placement group`_, and then calculates
how to assign the placement group to an OSD dynamically. To find the object
location, all you need is the object name and the pool name. For example:
The Ceph client retrieves the latest cluster map, the CRUSH algorithm
calculates how to map the object to a PG, and then the algorithm calculates how
to dynamically assign the PG to an OSD. To find the object location given only
the object name and the pool name, run a command of the following form:
.. prompt:: bash $
ceph osd map {poolname} {object-name} [namespace]
ceph osd map {poolname} {object-name} [namespace]
.. topic:: Exercise: Locate an Object
As an exercise, let's create an object. Specify an object name, a path
to a test file containing some object data and a pool name using the
As an exercise, let's create an object. We can specify an object name, a path
to a test file that contains some object data, and a pool name by using the
``rados put`` command on the command line. For example:
.. prompt:: bash $
rados put {object-name} {file-path} --pool=data
rados put test-object-1 testfile.txt --pool=data
rados put {object-name} {file-path} --pool=data
rados put test-object-1 testfile.txt --pool=data
To verify that the Ceph Object Store stored the object, execute the
following:
To verify that the Ceph Object Store stored the object, run the
following command:
.. prompt:: bash $
rados -p data ls
Now, identify the object location:
To identify the object location, run the following commands:
.. prompt:: bash $
ceph osd map {pool-name} {object-name}
ceph osd map data test-object-1
Ceph should output the object's location. For example::
Ceph should output the object's location. For example::
osdmap e537 pool 'data' (1) object 'test-object-1' -> pg 1.d1743484 (1.4) -> up ([0,1], p0) acting ([0,1], p0)
osdmap e537 pool 'data' (1) object 'test-object-1' -> pg 1.d1743484 (1.4) -> up ([0,1], p0) acting ([0,1], p0)
To remove the test object, simply delete it using the ``rados rm``
command. For example:
To remove the test object, simply delete it by running the ``rados rm``
command. For example:
.. prompt:: bash $
rados rm test-object-1 --pool=data
As the cluster evolves, the object location may change dynamically. One benefit
of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform
the migration manually. See the `Architecture`_ section for details.
of Ceph's dynamic rebalancing is that Ceph spares you the burden of manually
performing the migration. For details, see the `Architecture`_ section.
.. _data placement: ../data-placement
.. _pool: ../pools

View File

@ -2,9 +2,9 @@
Monitoring a Cluster
======================
Once you have a running cluster, you may use the ``ceph`` tool to monitor your
After you have a running cluster, you can use the ``ceph`` tool to monitor your
cluster. Monitoring a cluster typically involves checking OSD status, monitor
status, placement group status and metadata server status.
status, placement group status, and metadata server status.
Using the command line
======================
@ -13,11 +13,11 @@ Interactive mode
----------------
To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
with no arguments. For example:
with no arguments. For example:
.. prompt:: bash $
ceph
ceph
.. prompt:: ceph>
:prompts: ceph>
@ -30,8 +30,9 @@ with no arguments. For example:
Non-default paths
-----------------
If you specified non-default locations for your configuration or keyring,
you may specify their locations:
If you specified non-default locations for your configuration or keyring when
you installed the cluster, you may specify their locations to the ``ceph`` tool
by running the following command:
.. prompt:: bash $
@ -40,30 +41,32 @@ you may specify their locations:
Checking a Cluster's Status
===========================
After you start your cluster, and before you start reading and/or
writing data, check your cluster's status first.
After you start your cluster, and before you start reading and/or writing data,
you should check your cluster's status.
To check a cluster's status, execute the following:
To check a cluster's status, run the following command:
.. prompt:: bash $
ceph status
Or:
Alternatively, you can run the following command:
.. prompt:: bash $
ceph -s
In interactive mode, type ``status`` and press **Enter**:
In interactive mode, this operation is performed by typing ``status`` and
pressing **Enter**:
.. prompt:: ceph>
:prompts: ceph>
ceph> status
status
Ceph will print the cluster status. For example, a tiny Ceph demonstration
cluster with one of each service may print the following:
Ceph will print the cluster status. For example, a tiny Ceph "demonstration
cluster" that is running one instance of each service (monitor, manager, and
OSD) might print the following:
::
@ -84,33 +87,35 @@ cluster with one of each service may print the following:
pgs: 16 active+clean
.. topic:: How Ceph Calculates Data Usage
How Ceph Calculates Data Usage
------------------------------
The ``usage`` value reflects the *actual* amount of raw storage used. The
``xxx GB / xxx GB`` value means the amount available (the lesser number)
of the overall storage capacity of the cluster. The notional number reflects
the size of the stored data before it is replicated, cloned or snapshotted.
Therefore, the amount of data actually stored typically exceeds the notional
amount stored, because Ceph creates replicas of the data and may also use
storage capacity for cloning and snapshotting.
The ``usage`` value reflects the *actual* amount of raw storage used. The ``xxx
GB / xxx GB`` value means the amount available (the lesser number) out of the
cluster's overall storage capacity. The notional number reflects the size
of the stored data before it is replicated, cloned or snapshotted. Therefore,
the amount of data actually stored typically exceeds the notional amount
stored, because Ceph creates replicas of the data and may also use storage
capacity for cloning and snapshotting.
Watching a Cluster
==================
In addition to local logging by each daemon, Ceph clusters maintain
a *cluster log* that records high level events about the whole system.
This is logged to disk on monitor servers (as ``/var/log/ceph/ceph.log`` by
default), but can also be monitored via the command line.
Each daemon in the Ceph cluster maintains a log of events, and the Ceph cluster
itself maintains a *cluster log* that records high-level events about the
entire Ceph cluster. These events are logged to disk on monitor servers (in
the default location ``/var/log/ceph/ceph.log``), and they can be monitored via
the command line.
To follow the cluster log, use the following command:
To follow the cluster log, run the following command:
.. prompt:: bash $
ceph -w
Ceph will print the status of the system, followed by each log message as it
is emitted. For example:
Ceph will print the status of the system, followed by each log message as it is
added. For example:
::
@ -135,21 +140,20 @@ is emitted. For example:
2017-07-24 08:15:14.258143 mon.a mon.0 172.21.9.34:6789/0 39 : cluster [INF] Activating manager daemon x
2017-07-24 08:15:15.446025 mon.a mon.0 172.21.9.34:6789/0 47 : cluster [INF] Manager daemon x is now available
In addition to using ``ceph -w`` to print log lines as they are emitted,
use ``ceph log last [n]`` to see the most recent ``n`` lines from the cluster
log.
Instead of printing log lines as they are added, you might want to print only
the most recent lines. Run ``ceph log last [n]`` to see the most recent ``n``
lines from the cluster log.
Monitoring Health Checks
========================
Ceph continuously runs various *health checks* against its own status. When
a health check fails, this is reflected in the output of ``ceph status`` (or
``ceph health``). In addition, messages are sent to the cluster log to
indicate when a check fails, and when the cluster recovers.
Ceph continuously runs various *health checks*. When
a health check fails, this failure is reflected in the output of ``ceph status`` and
``ceph health``. The cluster log receives messages that
indicate when a check has failed and when the cluster has recovered.
For example, when an OSD goes down, the ``health`` section of the status
output may be updated as follows:
output is updated as follows:
::
@ -157,7 +161,7 @@ output may be updated as follows:
1 osds down
Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded
At this time, cluster log messages are also emitted to record the failure of the
At the same time, cluster log messages are emitted to record the failure of the
health checks:
::
@ -166,7 +170,7 @@ health checks:
2017-07-25 10:09:01.302624 mon.a mon.0 172.21.9.34:6789/0 94 : cluster [WRN] Health check failed: Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded (PG_DEGRADED)
When the OSD comes back online, the cluster log records the cluster's return
to a health state:
to a healthy state:
::
@ -177,21 +181,23 @@ to a health state:
Network Performance Checks
--------------------------
Ceph OSDs send heartbeat ping messages amongst themselves to monitor daemon availability. We
also use the response times to monitor network performance.
While it is possible that a busy OSD could delay a ping response, we can assume
that if a network switch fails multiple delays will be detected between distinct pairs of OSDs.
Ceph OSDs send heartbeat ping messages to each other in order to monitor daemon
availability and network performance. If a single delayed response is detected,
this might indicate nothing more than a busy OSD. But if multiple delays
between distinct pairs of OSDs are detected, this might indicate a failed
network switch, a NIC failure, or a layer 1 failure.
By default we will warn about ping times which exceed 1 second (1000 milliseconds).
By default, a heartbeat time that exceeds 1 second (1000 milliseconds) raises a
health check (a ``HEALTH_WARN``). For example:
::
HEALTH_WARN Slow OSD heartbeats on back (longest 1118.001ms)
The health detail will add the combination of OSDs are seeing the delays and by how much. There is a limit of 10
detail line items.
::
In the output of the ``ceph health detail`` command, you can see which OSDs are
experiencing delays and how long the delays are. The output of ``ceph health
detail`` is limited to ten lines. Here is an example of the output you can
expect from the ``ceph health detail`` command::
[WRN] OSD_SLOW_PING_TIME_BACK: Slow OSD heartbeats on back (longest 1118.001ms)
Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.1 [dc1,rack1] 1118.001 msec possibly improving
@ -199,11 +205,15 @@ detail line items.
Slow OSD heartbeats on back from osd.2 [dc1,rack2] to osd.1 [dc1,rack1] 1015.321 msec
Slow OSD heartbeats on back from osd.1 [dc1,rack1] to osd.0 [dc1,rack1] 1010.456 msec
To see even more detail and a complete dump of network performance information the ``dump_osd_network`` command can be used. Typically, this would be
sent to a mgr, but it can be limited to a particular OSD's interactions by issuing it to any OSD. The current threshold which defaults to 1 second
(1000 milliseconds) can be overridden as an argument in milliseconds.
To see more detail and to collect a complete dump of network performance
information, use the ``dump_osd_network`` command. This command is usually sent
to a Ceph Manager Daemon, but it can be used to collect information about a
specific OSD's interactions by sending it to that OSD. The default threshold
for a slow heartbeat is 1 second (1000 milliseconds), but this can be
overridden by providing a number of milliseconds as an argument.
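For example, to query a single OSD's heartbeat data with a threshold of 0, you
can send the command to that OSD's admin socket on the node where it runs
(``osd.2`` here is only an example):

.. prompt:: bash $

   ceph daemon osd.2 dump_osd_network 0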
The following command will show all gathered network performance data by specifying a threshold of 0 and sending to the mgr.
To show all network performance data with a specified threshold of 0, send the
following command to the mgr:
.. prompt:: bash $
@ -287,26 +297,26 @@ The following command will show all gathered network performance data by specify
Muting health checks
Muting Health Checks
--------------------
Health checks can be muted so that they do not affect the overall
reported status of the cluster. Alerts are specified using the health
check code (see :ref:`health-checks`):
Health checks can be muted so that they have no effect on the overall
reported status of the cluster. For example, if the cluster has raised a
single health check and you then mute that health check, the cluster will
report a status of ``HEALTH_OK``.
To mute a specific health check, use the health check code that corresponds to that health check (see :ref:`health-checks`), and
run the following command:
.. prompt:: bash $
ceph health mute <code>
For example, if there is a health warning, muting it will make the
cluster report an overall status of ``HEALTH_OK``. For example, to
mute an ``OSD_DOWN`` alert,:
For example, to mute an ``OSD_DOWN`` health check, run the following command:
.. prompt:: bash $
ceph health mute OSD_DOWN
Mutes are reported as part of the short and long form of the ``ceph health`` command.
Mutes are reported as part of the short and long form of the ``ceph health`` command's output.
For example, in the above scenario, the cluster would report:
.. prompt:: bash $
@ -327,7 +337,7 @@ For example, in the above scenario, the cluster would report:
(MUTED) OSD_DOWN 1 osds down
osd.1 is down
A mute can be explicitly removed with:
A mute can be removed by running the following command:
.. prompt:: bash $
@ -339,56 +349,50 @@ For example:
ceph health unmute OSD_DOWN
A health check mute may optionally have a TTL (time to live)
associated with it, such that the mute will automatically expire
after the specified period of time has elapsed. The TTL is specified as an optional
duration argument, e.g.:
A "health mute" can have a TTL (**T**\ime **T**\o **L**\ive)
associated with it: this means that the mute will automatically expire
after a specified period of time. The TTL is specified as an optional
duration argument, as seen in the following examples:
.. prompt:: bash $
ceph health mute OSD_DOWN 4h # mute for 4 hours
ceph health mute MON_DOWN 15m # mute for 15 minutes
ceph health mute MON_DOWN 15m # mute for 15 minutes
Normally, if a muted health alert is resolved (e.g., in the example
above, the OSD comes back up), the mute goes away. If the alert comes
Normally, if a muted health check is resolved (for example, if the OSD that raised the ``OSD_DOWN`` health check
in the example above has come back up), the mute goes away. If the health check comes
back later, it will be reported in the usual way.
It is possible to make a mute "sticky" such that the mute will remain even if the
alert clears. For example:
It is possible to make a health mute "sticky": this means that the mute will remain even if the
health check clears. For example, to make a health mute "sticky", you might run the following command:
.. prompt:: bash $
ceph health mute OSD_DOWN 1h --sticky # ignore any/all down OSDs for next hour
Most health mutes also disappear if the extent of an alert gets worse. For example,
if there is one OSD down, and the alert is muted, the mute will disappear if one
or more additional OSDs go down. This is true for any health alert that involves
a count indicating how much or how many of something is triggering the warning or
error.
Most health mutes disappear if the unhealthy condition that triggered the health check gets worse.
For example, suppose that there is one OSD down and the health check is muted. In that case, if
one or more additional OSDs go down, then the health mute disappears. This behavior occurs in any health check with a threshold value.
Detecting configuration issues
Detecting Configuration Issues
==============================
In addition to the health checks that Ceph continuously runs on its
own status, there are some configuration issues that may only be detected
by an external tool.
Use the `ceph-medic`_ tool to run these additional checks on your Ceph
cluster's configuration.
Although Ceph continuously monitors itself, some configuration issues can be
detected only with an external tool called ``ceph-medic``.
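A typical invocation is sketched below; it assumes that ``ceph-medic`` is
installed and can reach the cluster nodes over SSH:

.. prompt:: bash $

   ceph-medic check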
Checking a Cluster's Usage Stats
================================
To check a cluster's data usage and data distribution among pools, you can
use the ``df`` option. It is similar to Linux ``df``. Execute
the following:
To check a cluster's data usage and data distribution among pools, use the
``df`` command. This option is similar to Linux's ``df`` command. Run the
following command:
.. prompt:: bash $
ceph df
The output of ``ceph df`` looks like this::
The output of ``ceph df`` resembles the following::
CLASS SIZE AVAIL USED RAW USED %RAW USED
ssd 202 GiB 200 GiB 2.0 GiB 2.0 GiB 1.00
@ -401,10 +405,6 @@ The output of ``ceph df`` looks like this::
cephfs.a.data 3 32 0 B 0 B 0 B 0 0 B 0 B 0 B 0 99 GiB N/A N/A 0 0 B 0 B
test 4 32 22 MiB 22 MiB 50 KiB 248 19 MiB 19 MiB 50 KiB 0 297 GiB N/A N/A 248 0 B 0 B
- **CLASS:** for example, "ssd" or "hdd"
- **SIZE:** The amount of storage capacity managed by the cluster.
- **AVAIL:** The amount of free space available in the cluster.
@ -644,4 +644,3 @@ directly to the host in question ).
.. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#viewing-a-configuration-at-runtime
.. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity
.. _ceph-medic: http://docs.ceph.com/ceph-medic/master/

View File

@ -6,50 +6,52 @@
Running Ceph with systemd
==========================
=========================
For all distributions that support systemd (CentOS 7, Fedora, Debian
Jessie 8 and later, SUSE), ceph daemons are now managed using native
systemd files instead of the legacy sysvinit scripts. For example:
In all distributions that support systemd (CentOS 7, Fedora, Debian
Jessie 8 and later, and SUSE), systemd files (and NOT legacy SysVinit scripts)
are used to manage Ceph daemons. Ceph daemons therefore behave like any other daemons
that can be controlled by the ``systemctl`` command, as in the following examples:
.. prompt:: bash $
sudo systemctl start ceph.target # start all daemons
sudo systemctl status ceph-osd@12 # check status of osd.12
To list the Ceph systemd units on a node, execute:
To list all of the Ceph systemd units on a node, run the following command:
.. prompt:: bash $
sudo systemctl status ceph\*.service ceph\*.target
Starting all Daemons
Starting all daemons
--------------------
To start all daemons on a Ceph Node (irrespective of type), execute the
following:
To start all of the daemons on a Ceph node (regardless of their type), run the
following command:
.. prompt:: bash $
sudo systemctl start ceph.target
Stopping all Daemons
Stopping all daemons
--------------------
To stop all daemons on a Ceph Node (irrespective of type), execute the
following:
To stop all of the daemons on a Ceph node (regardless of their type), run the
following command:
.. prompt:: bash $
sudo systemctl stop ceph\*.service ceph\*.target
Starting all Daemons by Type
Starting all daemons by type
----------------------------
To start all daemons of a particular type on a Ceph Node, execute one of the
following:
To start all of the daemons of a particular type on a Ceph node, run one of the
following commands:
.. prompt:: bash $
@ -58,24 +60,24 @@ following:
sudo systemctl start ceph-mds.target
Stopping all Daemons by Type
Stopping all daemons by type
----------------------------
To stop all daemons of a particular type on a Ceph Node, execute one of the
following:
To stop all of the daemons of a particular type on a Ceph node, run one of the
following commands:
.. prompt:: bash $
sudo systemctl stop ceph-mon\*.service ceph-mon.target
sudo systemctl stop ceph-osd\*.service ceph-osd.target
sudo systemctl stop ceph-mon\*.service ceph-mon.target
sudo systemctl stop ceph-mds\*.service ceph-mds.target
Starting a Daemon
Starting a daemon
-----------------
To start a specific daemon instance on a Ceph Node, execute one of the
following:
To start a specific daemon instance on a Ceph node, run one of the
following commands:
.. prompt:: bash $
@ -92,11 +94,11 @@ For example:
sudo systemctl start ceph-mds@ceph-server
Stopping a Daemon
Stopping a daemon
-----------------
To stop a specific daemon instance on a Ceph Node, execute one of the
following:
To stop a specific daemon instance on a Ceph node, run one of the
following commands:
.. prompt:: bash $
@ -194,15 +196,14 @@ For example::
.. index:: sysvinit; operating a cluster
Running Ceph with sysvinit
Running Ceph with SysVinit
==========================
Each time you to **start**, **restart**, and **stop** Ceph daemons (or your
entire cluster) you must specify at least one option and one command. You may
also specify a daemon type or a daemon instance. ::
{commandline} [options] [commands] [daemons]
Each time you start, restart, or stop Ceph daemons, you must specify at least one option and one command.
Likewise, each time you start, restart, or stop your entire cluster, you must specify at least one option and one command.
In both cases, you can also specify a daemon type or a daemon instance. ::
{commandline} [options] [commands] [daemons]
The ``ceph`` options include:
@ -213,12 +214,12 @@ The ``ceph`` options include:
+-----------------+----------+-------------------------------------------------+
| ``--valgrind`` | ``N/A`` | (Dev and QA only) Use `Valgrind`_ debugging. |
+-----------------+----------+-------------------------------------------------+
| ``--allhosts`` | ``-a`` | Execute on all nodes in ``ceph.conf.`` |
| ``--allhosts`` | ``-a`` | Execute on all nodes listed in ``ceph.conf``. |
| | | Otherwise, it only executes on ``localhost``. |
+-----------------+----------+-------------------------------------------------+
| ``--restart`` | ``N/A`` | Automatically restart daemon if it core dumps. |
+-----------------+----------+-------------------------------------------------+
| ``--norestart`` | ``N/A`` | Don't restart a daemon if it core dumps. |
| ``--norestart`` | ``N/A`` | Do not restart a daemon if it core dumps. |
+-----------------+----------+-------------------------------------------------+
| ``--conf`` | ``-c`` | Use an alternate configuration file. |
+-----------------+----------+-------------------------------------------------+
@ -232,7 +233,7 @@ The ``ceph`` commands include:
+------------------+------------------------------------------------------------+
| ``stop`` | Stop the daemon(s). |
+------------------+------------------------------------------------------------+
| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9`` |
| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9``. |
+------------------+------------------------------------------------------------+
| ``killall`` | Kill all daemons of a particular type. |
+------------------+------------------------------------------------------------+
@ -241,15 +242,12 @@ The ``ceph`` commands include:
| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
+------------------+------------------------------------------------------------+
For subsystem operations, the ``ceph`` service can target specific daemon types
by adding a particular daemon type for the ``[daemons]`` option. Daemon types
include:
The ``[daemons]`` option allows the ``ceph`` service to target specific daemon types
in order to perform subsystem operations. Daemon types include:
- ``mon``
- ``osd``
- ``mds``
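For example, the daemon type (or a specific daemon instance) is appended to
the command line as sketched below (``osd.0`` is only an example):

.. prompt:: bash $

   sudo /etc/init.d/ceph -a start osd
   sudo /etc/init.d/ceph stop osd.0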
.. _Valgrind: http://www.valgrind.org/
.. _initctl: http://manpages.ubuntu.com/manpages/raring/en/man8/initctl.8.html

View File

@ -1,59 +1,60 @@
============================
Repairing PG inconsistencies
Repairing PG Inconsistencies
============================
Sometimes a placement group might become "inconsistent". To return the
placement group to an active+clean state, you must first determine which
of the placement groups has become inconsistent and then run the "pg
repair" command on it. This page contains commands for diagnosing placement
groups and the command for repairing placement groups that have become
Sometimes a Placement Group (PG) might become ``inconsistent``. To return the PG
to an ``active+clean`` state, you must first determine which of the PGs has become
inconsistent and then run the ``pg repair`` command on it. This page contains
commands for diagnosing PGs and the command for repairing PGs that have become
inconsistent.
.. highlight:: console
Commands for Diagnosing Placement-group Problems
================================================
The commands in this section provide various ways of diagnosing broken placement groups.
Commands for Diagnosing PG Problems
===================================
The commands in this section provide various ways of diagnosing broken PGs.
The following command provides a high-level (low detail) overview of the health of the ceph cluster:
To see a high-level (low-detail) overview of Ceph cluster health, run the
following command:
.. prompt:: bash #
ceph health detail
The following command provides more detail on the status of the placement groups:
To see more detail on the status of the PGs, run the following command:
.. prompt:: bash #
ceph pg dump --format=json-pretty
The following command lists inconsistent placement groups:
To see a list of inconsistent PGs, run the following command:
.. prompt:: bash #
rados list-inconsistent-pg {pool}
The following command lists inconsistent rados objects:
To see a list of inconsistent RADOS objects, run the following command:
.. prompt:: bash #
rados list-inconsistent-obj {pgid}
The following command lists inconsistent snapsets in the given placement group:
To see a list of inconsistent snapsets in a specific PG, run the following
command:
.. prompt:: bash #
rados list-inconsistent-snapset {pgid}
Commands for Repairing Placement Groups
=======================================
The form of the command to repair a broken placement group is:
Commands for Repairing PGs
==========================
The form of the command to repair a broken PG is as follows:
.. prompt:: bash #
ceph pg repair {pgid}
Where ``{pgid}`` is the id of the affected placement group.
Here ``{pgid}`` represents the id of the affected PG.
For example:
@ -61,21 +62,57 @@ For example:
ceph pg repair 1.4
More Information on Placement Group Repair
==========================================
Ceph stores and updates the checksums of objects stored in the cluster. When a scrub is performed on a placement group, the OSD attempts to choose an authoritative copy from among its replicas. Among all of the possible cases, only one case is consistent. After a deep scrub, Ceph calculates the checksum of an object read from the disk and compares it to the checksum previously recorded. If the current checksum and the previously recorded checksums do not match, that is an inconsistency. In the case of replicated pools, any mismatch between the checksum of any replica of an object and the checksum of the authoritative copy means that there is an inconsistency.
.. note:: PG IDs have the form ``N.xxxxx``, where ``N`` is the number of the
pool that contains the PG. The command ``ceph osd lspools`` and the
command ``ceph osd dump | grep pool`` return a list of pool numbers.
The "pg repair" command attempts to fix inconsistencies of various kinds. If "pg repair" finds an inconsistent placement group, it attempts to overwrite the digest of the inconsistent copy with the digest of the authoritative copy. If "pg repair" finds an inconsistent replicated pool, it marks the inconsistent copy as missing. Recovery, in the case of replicated pools, is beyond the scope of "pg repair".
More Information on PG Repair
=============================
Ceph stores and updates the checksums of objects stored in the cluster. When a
scrub is performed on a PG, the OSD attempts to choose an authoritative copy
from among its replicas. Only one of the possible cases is consistent. After
performing a deep scrub, Ceph calculates the checksum of an object that is read
from disk and compares it to the checksum that was previously recorded. If the
current checksum and the previously recorded checksum do not match, that
mismatch is considered to be an inconsistency. In the case of replicated pools,
any mismatch between the checksum of any replica of an object and the checksum
of the authoritative copy means that there is an inconsistency. The discovery
of these inconsistencies causes a PG's state to be set to ``inconsistent``.
For erasure coded and bluestore pools, Ceph will automatically repair if osd_scrub_auto_repair (configuration default "false") is set to true and at most osd_scrub_auto_repair_num_errors (configuration default 5) errors are found.
The ``pg repair`` command attempts to fix inconsistencies of various kinds. If
``pg repair`` finds an inconsistent PG, it attempts to overwrite the digest of
the inconsistent copy with the digest of the authoritative copy. If ``pg
repair`` finds an inconsistent replicated pool, it marks the inconsistent copy
as missing. In the case of replicated pools, recovery is beyond the scope of
``pg repair``.
"pg repair" will not solve every problem. Ceph does not automatically repair placement groups when inconsistencies are found in them.
In the case of erasure-coded and BlueStore pools, Ceph will automatically
perform repairs if ``osd_scrub_auto_repair`` (default ``false``) is set to
``true`` and if no more than ``osd_scrub_auto_repair_num_errors`` (default
``5``) errors are found.
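For example, automatic repair during scrubs can be enabled as follows (the
error limit shown simply restates the default):

.. prompt:: bash #

   ceph config set osd osd_scrub_auto_repair true
   ceph config set osd osd_scrub_auto_repair_num_errors 5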
The checksum of an object or an omap is not always available. Checksums are calculated incrementally. If a replicated object is updated non-sequentially, the write operation involved in the update changes the object and invalidates its checksum. The whole object is not read while recalculating the checksum. "ceph pg repair" is able to repair things even when checksums are not available to it, as in the case of filestore. When replicated filestore pools are in question, users might prefer manual repair to "ceph pg repair".
The ``pg repair`` command will not solve every problem. Ceph does not
automatically repair PGs when they are found to contain inconsistencies.
The material in this paragraph is relevant for filestore, and bluestore has its own internal checksums. The matched-record checksum and the calculated checksum cannot prove that the authoritative copy is in fact authoritative. In the case that there is no checksum available, "pg repair" favors the data on the primary. this might or might not be the uncorrupted replica. This is why human intervention is necessary when an inconsistency is discovered. Human intervention sometimes means using the "ceph-objectstore-tool".
The checksum of a RADOS object or an omap is not always available. Checksums
are calculated incrementally. If a replicated object is updated
non-sequentially, the write operation involved in the update changes the object
and invalidates its checksum. The whole object is not read while the checksum
is recalculated. The ``pg repair`` command is able to make repairs even when
checksums are not available to it, as in the case of Filestore. Users working
with replicated Filestore pools might prefer manual repair to ``ceph pg
repair``.
This material is relevant for Filestore, but not for BlueStore, which has its
own internal checksums. The matched-record checksum and the calculated checksum
cannot prove that any specific copy is in fact authoritative. If there is no
checksum available, ``pg repair`` favors the data on the primary, but this
might not be the uncorrupted replica. Because of this uncertainty, human
intervention is necessary when an inconsistency is discovered. This
intervention sometimes involves use of ``ceph-objectstore-tool``.
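A minimal sketch of such an intervention is shown below. It assumes that the
affected OSD has been stopped first; the data path and PG ID are purely
illustrative:

.. prompt:: bash #

   systemctl stop ceph-osd@0
   ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 --pgid 1.4 --op list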
External Links
==============
https://ceph.io/geen-categorie/ceph-manually-repair-object/ - This page contains a walkthrough of the repair of a placement group, and is recommended reading if you want to repair a placement
group but have never done so.
https://ceph.io/geen-categorie/ceph-manually-repair-object/ - This page
contains a walkthrough of the repair of a PG. It is recommended reading if you
want to repair a PG but have never done so.

View File

@ -1,52 +1,56 @@
.. _upmap:
Using the pg-upmap
==================
Using pg-upmap
==============
Starting in Luminous v12.2.z there is a new *pg-upmap* exception table
In Luminous v12.2.z and later releases, there is a *pg-upmap* exception table
in the OSDMap that allows the cluster to explicitly map specific PGs to
specific OSDs. This allows the cluster to fine-tune the data
distribution to, in most cases, perfectly distributed PGs across OSDs.
specific OSDs. This allows the cluster to fine-tune the data distribution to,
in most cases, uniformly distribute PGs across OSDs.
The key caveat to this new mechanism is that it requires that all
clients understand the new *pg-upmap* structure in the OSDMap.
However, there is an important caveat when it comes to this new feature: it
requires all clients to understand the new *pg-upmap* structure in the OSDMap.
Enabling
--------
New clusters will have this module on by default. The cluster must only
have luminous (and newer) clients. You can the turn the balancer off with:
In order to use ``pg-upmap``, the cluster cannot have any pre-Luminous clients.
By default, new clusters enable the *balancer module*, which makes use of
``pg-upmap``. If you want to use a different balancer or you want to make your
own custom ``pg-upmap`` entries, you might want to turn off the balancer in
order to avoid conflict:
.. prompt:: bash $
ceph balancer off
To allow use of the feature on existing clusters, you must tell the
cluster that it only needs to support luminous (and newer) clients with:
To allow use of the new feature on an existing cluster, you must restrict the
cluster to supporting only Luminous (and newer) clients. To do so, run the
following command:
.. prompt:: bash $
ceph osd set-require-min-compat-client luminous
This command will fail if any pre-luminous clients or daemons are
connected to the monitors. You can see what client versions are in
use with:
This command will fail if any pre-Luminous clients or daemons are connected to
the monitors. To see which client versions are in use, run the following
command:
.. prompt:: bash $
ceph features
Balancer module
-----------------
The `balancer` module for ceph-mgr will automatically balance
the number of PGs per OSD. See :ref:`balancer`
---------------
The `balancer` module for ``ceph-mgr`` will automatically balance the number of
PGs per OSD. See :ref:`balancer`.
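For example, the balancer module can be inspected and switched to upmap mode
as follows:

.. prompt:: bash $

   ceph balancer status
   ceph balancer mode upmap
   ceph balancer on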
Offline optimization
--------------------
Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
Upmap entries are updated with an offline optimizer that is built into
``osdmaptool``.
#. Grab the latest copy of your osdmap:
@ -64,27 +68,28 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
[--upmap-active]
It is highly recommended that optimization be done for each pool
individually, or for sets of similarly-utilized pools. You can
specify the ``--upmap-pool`` option multiple times. "Similar pools"
means pools that are mapped to the same devices and store the same
kind of data (e.g., RBD image pools, yes; RGW index pool and RGW
data pool, no).
individually, or for sets of similarly utilized pools. You can specify the
``--upmap-pool`` option multiple times. "Similarly utilized pools" means
pools that are mapped to the same devices and that store the same kind of
data (for example, RBD image pools are considered to be similarly utilized;
an RGW index pool and an RGW data pool are not considered to be similarly
utilized).
The ``max-optimizations`` value is the maximum number of upmap entries to
identify in the run. The default is `10` like the ceph-mgr balancer module,
but you should use a larger number if you are doing offline optimization.
If it cannot find any additional changes to make it will stop early
(i.e., when the pool distribution is perfect).
The ``max-optimizations`` value determines the maximum number of upmap
entries to identify. The default is `10` (as is the case with the
``ceph-mgr`` balancer module), but you should use a larger number if you are
doing offline optimization. If it cannot find any additional changes to
make (that is, if the pool distribution is perfect), it will stop early.
The ``max-deviation`` value defaults to `5`. If an OSD PG count
varies from the computed target number by less than or equal
to this amount it will be considered perfect.
The ``max-deviation`` value defaults to `5`. If an OSD's PG count varies
from the computed target number by no more than this amount it will be
considered perfect.
The ``--upmap-active`` option simulates the behavior of the active
balancer in upmap mode. It keeps cycling until the OSDs are balanced
and reports how many rounds and how long each round is taking. The
elapsed time for rounds indicates the CPU load ceph-mgr will be
consuming when it tries to compute the next optimization plan.
The ``--upmap-active`` option simulates the behavior of the active balancer
in upmap mode. It keeps cycling until the OSDs are balanced and reports how
many rounds have occurred and how long each round takes. The elapsed time
for rounds indicates the CPU load that ``ceph-mgr`` consumes when it computes
the next optimization plan.
#. Apply the changes:
@ -92,14 +97,13 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``.
source out.txt
The proposed changes are written to the output file ``out.txt`` in
the example above. These are normal ceph CLI commands that can be
run to apply the changes to the cluster.
In the above example, the proposed changes are written to the output file
``out.txt``. The commands in this procedure are normal Ceph CLI commands
that can be run in order to apply the changes to the cluster.
The above steps can be repeated as many times as necessary to achieve a perfect
distribution of PGs for each set of pools.
The above steps can be repeated as many times as necessary to achieve
a perfect distribution of PGs for each set of pools.
You can see some (gory) details about what the tool is doing by
passing ``--debug-osd 10`` and even more with ``--debug-crush 10``
to ``osdmaptool``.
To see some (gory) details about what the tool is doing, you can pass
``--debug-osd 10`` to ``osdmaptool``. To see even more details, pass
``--debug-crush 10`` to ``osdmaptool``.
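Putting the steps together, a complete offline-optimization pass might look
like the following sketch (the pool name ``rbd`` and the limits shown are
illustrative only):

.. prompt:: bash $

   ceph osd getmap -o om
   osdmaptool om --upmap out.txt --upmap-pool rbd --upmap-max 100 --upmap-deviation 5
   source out.txt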

View File

@ -4,10 +4,11 @@
User Management
=================
This document describes :term:`Ceph Client` users, and their authentication and
authorization with the :term:`Ceph Storage Cluster`. Users are either
individuals or system actors such as applications, which use Ceph clients to
interact with the Ceph Storage Cluster daemons.
This document describes :term:`Ceph Client` users, and describes the process by
which they perform authentication and authorization so that they can access the
:term:`Ceph Storage Cluster`. Users are either individuals or system actors
(for example, applications) that use Ceph clients to interact with the Ceph
Storage Cluster daemons.
.. ditaa::
+-----+
@ -24,19 +25,21 @@ interact with the Ceph Storage Cluster daemons.
actor
When Ceph runs with authentication and authorization enabled (enabled by
default), you must specify a user name and a keyring containing the secret key
of the specified user (usually via the command line). If you do not specify a
user name, Ceph will use ``client.admin`` as the default user name. If you do
not specify a keyring, Ceph will look for a keyring via the ``keyring`` setting
in the Ceph configuration. For example, if you execute the ``ceph health``
command without specifying a user or keyring:
When Ceph runs with authentication and authorization enabled (both are enabled
by default), you must specify a user name and a keyring that contains the
secret key of the specified user (usually these are specified via the command
line). If you do not specify a user name, Ceph will use ``client.admin`` as the
default user name. If you do not specify a keyring, Ceph will look for a
keyring via the ``keyring`` setting in the Ceph configuration. For example, if
you execute the ``ceph health`` command without specifying a user or a keyring,
Ceph will assume that the keyring is in ``/etc/ceph/ceph.client.admin.keyring``
and will attempt to use that keyring. The following illustrates this behavior:
.. prompt:: bash $
ceph health
Ceph interprets the command like this:
Ceph will interpret the command like this:
.. prompt:: bash $
@ -45,118 +48,122 @@ Ceph interprets the command like this:
Alternatively, you may use the ``CEPH_ARGS`` environment variable to avoid
re-entry of the user name and secret.
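For example, the following sketch sets ``CEPH_ARGS`` so that subsequent
``ceph`` commands run as a hypothetical ``client.user1`` with its keyring:

.. prompt:: bash $

   export CEPH_ARGS="--id user1 --keyring /etc/ceph/ceph.client.user1.keyring"
   ceph health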
For details on configuring the Ceph Storage Cluster to use authentication,
see `Cephx Config Reference`_. For details on the architecture of Cephx, see
For details on configuring the Ceph Storage Cluster to use authentication, see
`Cephx Config Reference`_. For details on the architecture of Cephx, see
`Architecture - High Availability Authentication`_.
Background
==========
Irrespective of the type of Ceph client (e.g., Block Device, Object Storage,
Filesystem, native API, etc.), Ceph stores all data as objects within `pools`_.
Ceph users must have access to pools in order to read and write data.
Additionally, Ceph users must have execute permissions to use Ceph's
administrative commands. The following concepts will help you understand Ceph
user management.
No matter what type of Ceph client is used (for example: Block Device, Object
Storage, Filesystem, native API), Ceph stores all data as RADOS objects within
`pools`_. Ceph users must have access to a given pool in order to read and
write data, and Ceph users must have execute permissions in order to use Ceph's
administrative commands. The following concepts will help you understand
Ceph's user management.
.. _rados-ops-user:
User
----
A user is either an individual or a system actor such as an application.
A user is either an individual or a system actor (for example, an application).
Creating users allows you to control who (or what) can access your Ceph Storage
Cluster, its pools, and the data within pools.
Cluster, its pools, and the data within those pools.
Ceph has the notion of a ``type`` of user. For the purposes of user management,
the type will always be ``client``. Ceph identifies users in period (.)
delimited form consisting of the user type and the user ID: for example,
Ceph has the concept of a ``type`` of user. For purposes of user management,
the type will always be ``client``. Ceph identifies users in a "period-
delimited form" that consists of the user type and the user ID: for example,
``TYPE.ID``, ``client.admin``, or ``client.user1``. The reason for user typing
is that Ceph Monitors, OSDs, and Metadata Servers also use the Cephx protocol,
but they are not clients. Distinguishing the user type helps to distinguish
between client users and other users--streamlining access control, user
monitoring and traceability.
is that the Cephx protocol is used not only by clients but also non-clients,
such as Ceph Monitors, OSDs, and Metadata Servers. Distinguishing the user type
helps to distinguish between client users and other users. This distinction
streamlines access control, user monitoring, and traceability.
Sometimes Ceph's user type may seem confusing, because the Ceph command line
Sometimes Ceph's user type might seem confusing, because the Ceph command line
allows you to specify a user with or without the type, depending upon your
command line usage. If you specify ``--user`` or ``--id``, you can omit the
type. So ``client.user1`` can be entered simply as ``user1``. If you specify
``--name`` or ``-n``, you must specify the type and name, such as
``client.user1``. We recommend using the type and name as a best practice
wherever possible.
type. For example, ``client.user1`` can be entered simply as ``user1``. On the
other hand, if you specify ``--name`` or ``-n``, you must supply the type and
name: for example, ``client.user1``. We recommend using the type and name as a
best practice wherever possible.
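For example, both of the following invocations refer to the same user; the
first omits the type (which is assumed to be ``client``), and the second
supplies it explicitly:

.. prompt:: bash $

   ceph --id user1 health
   ceph --name client.user1 health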
.. note:: A Ceph Storage Cluster user is not the same as a Ceph Object Storage
user or a Ceph File System user. The Ceph Object Gateway uses a Ceph Storage
Cluster user to communicate between the gateway daemon and the storage
cluster, but the gateway has its own user management functionality for end
users. The Ceph File System uses POSIX semantics. The user space associated
with the Ceph File System is not the same as a Ceph Storage Cluster user.
cluster, but the Ceph Object Gateway has its own user-management
functionality for end users. The Ceph File System uses POSIX semantics, and
the user space associated with the Ceph File System is not the same as the
user space associated with a Ceph Storage Cluster user.
Authorization (Capabilities)
----------------------------
Ceph uses the term "capabilities" (caps) to describe authorizing an
authenticated user to exercise the functionality of the monitors, OSDs and
Ceph uses the term "capabilities" (caps) to describe the permissions granted to
an authenticated user to exercise the functionality of the monitors, OSDs, and
metadata servers. Capabilities can also restrict access to data within a pool,
a namespace within a pool, or a set of pools based on their application tags.
A Ceph administrative user sets a user's capabilities when creating or updating
a user.
A Ceph administrative user specifies the capabilities of a user when creating
or updating that user.
Capability syntax follows the form::
Capability syntax follows this form::
{daemon-type} '{cap-spec}[, {cap-spec} ...]'
{daemon-type} '{cap-spec}[, {cap-spec} ...]'
- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` access
settings or ``profile {name}``. For example::
settings, and can be applied in aggregate from pre-defined profiles with
``profile {name}``. For example::
mon 'allow {access-spec} [network {network/prefix}]'
mon 'allow {access-spec} [network {network/prefix}]'
mon 'profile {name}'
mon 'profile {name}'
The ``{access-spec}`` syntax is as follows: ::
* | all | [r][w][x]
The optional ``{network/prefix}`` is a standard network name and
prefix length in CIDR notation (e.g., ``10.3.0.0/16``). If present,
the use of this capability is restricted to clients connecting from
this network.
The optional ``{network/prefix}`` is a standard network name and prefix
length in CIDR notation (for example, ``10.3.0.0/16``). If
``{network/prefix}`` is present, the monitor capability can be used only by
clients that connect from the specified network.
- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, ``class-read``,
``class-write`` access settings or ``profile {name}``. Additionally, OSD
capabilities also allow for pool and namespace settings. ::
- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, and
``class-read`` and ``class-write`` access settings. OSD capabilities can be
applied in aggregate from pre-defined profiles with ``profile {name}``. In
addition, OSD capabilities allow for pool and namespace settings. ::
osd 'allow {access-spec} [{match-spec}] [network {network/prefix}]'
osd 'allow {access-spec} [{match-spec}] [network {network/prefix}]'
osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]] [network {network/prefix}]'
osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]] [network {network/prefix}]'
The ``{access-spec}`` syntax is either of the following: ::
There are two alternative forms of the ``{access-spec}`` syntax: ::
* | all | [r][w][x] [class-read] [class-write]
class {class name} [{method name}]
The optional ``{match-spec}`` syntax is either of the following: ::
There are two alternative forms of the optional ``{match-spec}`` syntax::
pool={pool-name} [namespace={namespace-name}] [object_prefix {prefix}]
[namespace={namespace-name}] tag {application} {key}={value}
The optional ``{network/prefix}`` is a standard network name and
prefix length in CIDR notation (e.g., ``10.3.0.0/16``). If present,
the use of this capability is restricted to clients connecting from
this network.
The optional ``{network/prefix}`` is a standard network name and prefix
length in CIDR notation (for example, ``10.3.0.0/16``). If
``{network/prefix}`` is present, the OSD capability can be used only by
clients that connect from the specified network.
- **Manager Caps:** Manager (``ceph-mgr``) capabilities include
``r``, ``w``, ``x`` access settings or ``profile {name}``. For example: ::
- **Manager Caps:** Manager (``ceph-mgr``) capabilities include ``r``, ``w``,
``x`` access settings, and can be applied in aggregate from pre-defined
profiles with ``profile {name}``. For example::
mgr 'allow {access-spec} [network {network/prefix}]'
mgr 'allow {access-spec} [network {network/prefix}]'
mgr 'profile {name} [{key1} {match-type} {value1} ...] [network {network/prefix}]'
mgr 'profile {name} [{key1} {match-type} {value1} ...] [network {network/prefix}]'
Manager capabilities can also be specified for specific commands,
all commands exported by a built-in manager service, or all commands
exported by a specific add-on module. For example: ::
Manager capabilities can also be specified for specific commands, for all
commands exported by a built-in manager service, or for all commands exported
by a specific add-on module. For example::
mgr 'allow command "{command-prefix}" [with {key1} {match-type} {value1} ...] [network {network/prefix}]'
@ -176,15 +183,14 @@ Capability syntax follows the form::
= | prefix | regex
- **Metadata Server Caps:** For administrators, use ``allow *``. For all
other users, such as CephFS clients, consult :doc:`/cephfs/client-auth`
- **Metadata Server Caps:** For administrators, use ``allow *``. For all other
users (for example, CephFS clients), consult :doc:`/cephfs/client-auth`.
.. note:: The Ceph Object Gateway daemon (``radosgw``) is a client of the
Ceph Storage Cluster, so it is not represented as a Ceph Storage
Cluster daemon type.
Ceph Storage Cluster. For this reason, it is not represented as
a Ceph Storage Cluster daemon type.
The following entries describe each access capability.
The following entries describe access capabilities.
``allow``
@ -206,7 +212,7 @@ The following entries describe each access capability.
``x``
:Description: Gives the user the capability to call class methods
(i.e., both read and write) and to conduct ``auth``
(that is, both read and write) and to conduct ``auth``
operations on monitors.
@ -224,75 +230,76 @@ The following entries describe each access capability.
``*``, ``all``
:Description: Gives the user read, write and execute permissions for a
particular daemon/pool, and the ability to execute
:Description: Gives the user read, write, and execute permissions for a
particular daemon/pool, as well as the ability to execute
admin commands.
The following entries describe valid capability profiles:
``profile osd`` (Monitor only)
:Description: Gives a user permissions to connect as an OSD to other OSDs or
monitors. Conferred on OSDs to enable OSDs to handle replication
monitors. Conferred on OSDs in order to enable OSDs to handle replication
heartbeat traffic and status reporting.
``profile mds`` (Monitor only)
:Description: Gives a user permissions to connect as a MDS to other MDSs or
:Description: Gives a user permissions to connect as an MDS to other MDSs or
monitors.
``profile bootstrap-osd`` (Monitor only)
:Description: Gives a user permissions to bootstrap an OSD. Conferred on
deployment tools such as ``ceph-volume``, ``cephadm``, etc.
so that they have permissions to add keys, etc. when
deployment tools such as ``ceph-volume`` and ``cephadm``
so that they have permissions to add keys when
bootstrapping an OSD.
``profile bootstrap-mds`` (Monitor only)
:Description: Gives a user permissions to bootstrap a metadata server.
Conferred on deployment tools such as ``cephadm``, etc.
so they have permissions to add keys, etc. when bootstrapping
Conferred on deployment tools such as ``cephadm``
so that they have permissions to add keys when bootstrapping
a metadata server.
``profile bootstrap-rbd`` (Monitor only)
:Description: Gives a user permissions to bootstrap an RBD user.
Conferred on deployment tools such as ``cephadm``, etc.
so they have permissions to add keys, etc. when bootstrapping
Conferred on deployment tools such as ``cephadm``
so that they have permissions to add keys when bootstrapping
an RBD user.
``profile bootstrap-rbd-mirror`` (Monitor only)
:Description: Gives a user permissions to bootstrap an ``rbd-mirror`` daemon
user. Conferred on deployment tools such as ``cephadm``, etc.
so they have permissions to add keys, etc. when bootstrapping
an ``rbd-mirror`` daemon.
user. Conferred on deployment tools such as ``cephadm`` so that
they have permissions to add keys when bootstrapping an
``rbd-mirror`` daemon.
``profile rbd`` (Manager, Monitor, and OSD)
:Description: Gives a user permissions to manipulate RBD images. When used
as a Monitor cap, it provides the minimal privileges required
by an RBD client application; this includes the ability
to blocklist other client users. When used as an OSD cap, it
provides read-write access to the specified pool to an
RBD client application. The Manager cap supports optional
``pool`` and ``namespace`` keyword arguments.
:Description: Gives a user permissions to manipulate RBD images. When used as a
Monitor cap, it provides the user with the minimal privileges
required by an RBD client application; such privileges include
the ability to blocklist other client users. When used as an OSD
cap, it provides an RBD client application with read-write access
to the specified pool. The Manager cap supports optional ``pool``
and ``namespace`` keyword arguments.
``profile rbd-mirror`` (Monitor only)
:Description: Gives a user permissions to manipulate RBD images and retrieve
RBD mirroring config-key secrets. It provides the minimal
privileges required for the ``rbd-mirror`` daemon.
privileges required for the user to manipulate the ``rbd-mirror``
daemon.
``profile rbd-read-only`` (Manager and OSD)
:Description: Gives a user read-only permissions to RBD images. The Manager
cap supports optional ``pool`` and ``namespace`` keyword
arguments.
:Description: Gives a user read-only permissions to RBD images. The Manager cap
supports optional ``pool`` and ``namespace`` keyword arguments.
``profile simple-rados-client`` (Monitor only)
@ -303,27 +310,27 @@ The following entries describe valid capability profiles:
:Description: Gives a user read-only permissions for monitor, OSD, and PG data.
Intended for use by direct librados client applications. Also
includes permission to add blocklist entries to build HA
applications.
includes permissions to add blocklist entries to build
high-availability (HA) applications.
``profile fs-client`` (Monitor only)
:Description: Gives a user read-only permissions for monitor, OSD, PG, and MDS
data. Intended for CephFS clients.
data. Intended for CephFS clients.
``profile role-definer`` (Monitor and Auth)
:Description: Gives a user **all** permissions for the auth subsystem, read-only
access to monitors, and nothing else. Useful for automation
tools. Do not assign this unless you really, **really** know what
you're doing as the security ramifications are substantial and
access to monitors, and nothing else. Useful for automation
tools. Do not assign this unless you really, **really** know what
you're doing, as the security ramifications are substantial and
pervasive.
``profile crash`` (Monitor only)
:Description: Gives a user read-only access to monitors, used in conjunction
with the manager ``crash`` module when collecting daemon crash
dumps for later analysis.
:Description: Gives a user read-only access to monitors. Used in conjunction
with the manager ``crash`` module to upload daemon crash
dumps into monitor storage for later analysis.
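To show how the capability syntax and the profiles above come together in practice, here
is a minimal sketch that creates a client identity for an RBD workload (the user name
``client.rbd-user`` and the pool name ``vms`` are placeholders invented for this
example)::

    ceph auth get-or-create client.rbd-user \
        mon 'profile rbd' \
        osd 'profile rbd pool=vms' \
        mgr 'profile rbd pool=vms'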
Pool
----
@ -353,7 +360,8 @@ by users who have access to the namespace.
.. note:: Namespaces are primarily useful for applications written on top of
``librados`` where the logical grouping can alleviate the need to create
different pools. Ceph Object Gateway (from ``luminous``) uses namespaces for various
different pools. Ceph Object Gateway (in releases beginning with
Luminous) uses namespaces for various
metadata objects.
The rationale for namespaces is that pools can be a computationally expensive

View File

@ -1,32 +1,40 @@
.. _radosgw_keycloak:
=================================
Keycloak integration with RadosGW
Integrating Keycloak with RadosGW
=================================
Keycloak can be setup as an OpenID Connect Identity Provider, which can be used by mobile/ web apps
to authenticate their users. The Web token returned as a result of authentication can be used by the
mobile/ web app to call AssumeRoleWithWebIdentity to get back a set of temporary S3 credentials,
which can be used by the app to make S3 calls.
If Keycloak is set up as an OpenID Connect Identity Provider, it can be used by
mobile apps and web apps to authenticate their users. By using the web token
returned by the authentication process, a mobile app or web app can call
AssumeRoleWithWebIdentity, receive a set of temporary S3 credentials, and use
those credentials to make S3 calls.
Setting up Keycloak
====================
===================
Installing and bringing up Keycloak can be found here: https://www.keycloak.org/docs/latest/server_installation/.
Documentation for installing and operating Keycloak can be found here:
https://www.keycloak.org/guides.
Configuring Keycloak to talk to RGW
===================================
The following configurables have to be added for RGW to talk to Keycloak::
To configure Keycloak to talk to RGW, add the following configurables::
[client.radosgw.gateway]
rgw sts key = {sts key for encrypting/ decrypting the session token}
rgw s3 auth use sts = true
Example showing how to fetch a web token from Keycloak
======================================================
Fetching a web token with Keycloak
==================================
Several examples of apps authenticating with Keycloak are given here: https://github.com/keycloak/keycloak-quickstarts/blob/latest/docs/getting-started.md
Taking the example of app-profile-jee-jsp app given in the link above, its client id and client secret, can be used to fetch the
access token (web token) for an application using grant type 'client_credentials' as given below::
Several examples of apps authenticating with Keycloak can be found here:
https://github.com/keycloak/keycloak-quickstarts/blob/latest/docs/getting-started.md.
Here you might consider the example of the app-profile-jee-jsp app (in the link
above). To fetch the access token (web token) for such an application using the
grant type 'client_credentials', one can use client id and client secret as
follows::
KC_REALM=demo
KC_CLIENT=<client id>
@ -48,8 +56,9 @@ access token (web token) for an application using grant type 'client_credentials
KC_ACCESS_TOKEN=$(echo $KC_RESPONSE| jq -r .access_token)
An access token can also be fetched for a particular user with grant type 'password', using client id, client secret, username and its password
as given below::
It is also possible to fetch an access token for a particular user with the
grant type 'password'. To fetch such an access token, use client id, client
secret, username, and password as follows::
KC_REALM=demo
KC_USERNAME=<username>
@ -75,43 +84,45 @@ as given below::
KC_ACCESS_TOKEN=$(echo $KC_RESPONSE| jq -r .access_token)
KC_ACCESS_TOKEN can be used to invoke AssumeRoleWithWebIdentity as given in
``KC_ACCESS_TOKEN`` can be used to invoke ``AssumeRoleWithWebIdentity``: see
:doc:`STS`.
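As a rough sketch of that call (the endpoint URL, role ARN, and session name below are
placeholders, not values taken from this document), the token can be passed to the AWS
CLI as follows::

    aws --endpoint-url http://localhost:8000 sts assume-role-with-web-identity \
        --role-arn arn:aws:iam:::role/S3Access \
        --role-session-name keycloak-demo \
        --web-identity-token "$KC_ACCESS_TOKEN"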
Attaching tags to a user in Keycloak
====================================
Adding tags to a user in Keycloak
=================================
We need to create a user in keycloak, and add tags to it as its attributes.
To create a user in Keycloak and add tags to it as its attributes, follow these
steps:
Add a user as shown below:
#. Add a user:
.. image:: ../images/keycloak-adduser.png
:align: center
.. image:: ../images/keycloak-adduser.png
:align: center
Add user details as shown below:
#. Add user details:
.. image:: ../images/keycloak-userdetails.png
:align: center
.. image:: ../images/keycloak-userdetails.png
:align: center
Add user credentials as shown below:
#. Add user credentials:
.. image:: ../images/keycloak-usercredentials.png
:align: center
.. image:: ../images/keycloak-usercredentials.png
:align: center
Add tags to the 'attributes' tab of the user as shown below:
#. Add tags to the 'attributes' tab of the user:
.. image:: ../images/keycloak-usertags.png
:align: center
.. image:: ../images/keycloak-usertags.png
:align: center
Add a protocol mapper for the user attribute to a client as shown below:
#. Add a protocol mapper that maps the user attribute to a client:
.. image:: ../images/keycloak-userclientmapper.png
:align: center
.. image:: ../images/keycloak-userclientmapper.png
:align: center
After these steps have been completed, the tag 'Department' will appear in the
JWT (web token), under the 'https://aws.amazon.com/tags' namespace.
After following the steps shown above, the tag 'Department' will appear in the JWT (web token), under 'https://aws.amazon.com/tags' namespace.
The tags can be verified using token introspection of the JWT. The command to introspect a token using client id and client secret is shown below::
Tags can be verified by performing token introspection on a JWT. To introspect
a token, use ``client id`` and ``client secret`` as follows::
KC_REALM=demo
KC_CLIENT=<client id>

View File

@ -1,3 +1,5 @@
.. _radosgw-multisite-sync-policy:
=====================
Multisite Sync Policy
=====================

File diff suppressed because it is too large

View File

@ -123,6 +123,18 @@ Then provide the zone placement info for that target:
--index-pool default.rgw.temporary.index \
--data-extra-pool default.rgw.temporary.non-ec
.. note:: With default placement target settings, RGW stores an object's first data chunk in the RADOS "head" object along
with xattr metadata. The ``--placement-inline-data=false`` flag may be passed with the ``zone placement add`` or
``zone placement modify`` commands to change this behavior for new objects stored on the target.
When data is stored inline (default), it may provide an advantage for read/write workloads since the first chunk of
an object's data can be retrieved/stored in a single librados call along with object metadata. On the other hand, a
target that does not store data inline can provide a performance benefit for RGW client delete requests when
the BlueStore DB is located on faster storage than bucket data since it eliminates the need to access
slower devices synchronously while processing the client request. In that case, data associated with the deleted
objects is removed asynchronously in the background by garbage collection.
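As a hedged example (the zone name ``default`` and placement target ``temporary`` are
assumed from the surrounding example rather than verified here), disabling inline data
for that target might look like this::

    radosgw-admin zone placement modify \
        --rgw-zone default \
        --placement-id temporary \
        --placement-inline-data=false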
.. _adding_a_storage_class:
Adding a Storage Class
----------------------

View File

@ -40,7 +40,7 @@ The following table describes the support status for current Amazon S3 functiona
+---------------------------------+-----------------+----------------------------------------+
| **Bucket Lifecycle** | Supported | |
+---------------------------------+-----------------+----------------------------------------+
| **Bucket Replication** | Partial | Only permitted across zones |
| **Bucket Replication** | Partial | Permitted only across zones |
+---------------------------------+-----------------+----------------------------------------+
| **Policy (Buckets, Objects)** | Supported | ACLs & bucket policies are supported |
+---------------------------------+-----------------+----------------------------------------+

View File

@ -43,10 +43,13 @@ An example of the session tags that are passed in by the IDP in the web token is
"active": true
}
Steps to configure Keycloak to pass tags in the web token are described here:doc:`keycloak`.
Steps to configure Keycloak to pass tags in the web token are described here:
:ref:`radosgw_keycloak`.
The trust policy must have 'sts:TagSession' permission if the web token passed in by the federated user contains session tags, otherwise
the AssumeRoleWithWebIdentity action will fail. An example of the trust policy with sts:TagSession is as follows:
The trust policy must have 'sts:TagSession' permission if the web token passed
in by the federated user contains session tags, otherwise the
AssumeRoleWithWebIdentity action will fail. An example of the trust policy with
sts:TagSession is as follows:
.. code-block:: python

View File

@ -12,21 +12,21 @@ iSCSI Initiator for Linux
Install the iSCSI initiator and multipath tools:
::
.. prompt:: bash #
# yum install iscsi-initiator-utils
# yum install device-mapper-multipath
yum install iscsi-initiator-utils
yum install device-mapper-multipath
**Configuring:**
#. Create the default ``/etc/multipath.conf`` file and enable the
``multipathd`` service:
::
.. prompt:: bash #
# mpathconf --enable --with_multipathd y
mpathconf --enable --with_multipathd y
#. Add the following to ``/etc/multipath.conf`` file:
#. Add the following to the ``/etc/multipath.conf`` file:
::
@ -47,45 +47,72 @@ Install the iSCSI initiator and multipath tools:
#. Restart the ``multipathd`` service:
::
.. prompt:: bash #
# systemctl reload multipathd
systemctl reload multipathd
**iSCSI Discovery and Setup:**
#. If CHAP was setup on the iSCSI gateway, provide a CHAP username and
password by updating the ``/etc/iscsi/iscsid.conf`` file accordingly.
#. Enable CHAP authentication and provide the initiator CHAP username
and password by uncommenting and setting the following options in
the ``/etc/iscsi/iscsid.conf`` file:
::
node.session.auth.authmethod = CHAP
node.session.auth.username = myusername
node.session.auth.password = mypassword
If you intend to use mutual (bidirectional) authentication, provide the
target CHAP username and password:
::
node.session.auth.username_in = mytgtusername
node.session.auth.password_in = mytgtpassword
#. Discover the target portals:
.. prompt:: bash #
iscsiadm -m discovery -t st -p 192.168.56.101
::
# iscsiadm -m discovery -t st -p 192.168.56.101
192.168.56.101:3260,1 iqn.2003-01.org.linux-iscsi.rheln1
192.168.56.102:3260,2 iqn.2003-01.org.linux-iscsi.rheln1
#. Login to target:
#. Log in to the target:
::
.. prompt:: bash #
# iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -l
**Multipath IO Setup:**
The multipath daemon (``multipathd``), will set up devices automatically
based on the ``multipath.conf`` settings. Running the ``multipath``
command show devices setup in a failover configuration with a priority
group for each path.
#. The multipath daemon (``multipathd``) uses the ``multipath.conf`` settings
to set up devices automatically. Running the ``multipath`` command shows
that the devices have been set up in a failover configuration. Notice that
each path has been placed into its own priority group:
::
.. prompt:: bash #
# multipath -ll
mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
size=1.0G features='0' hwhandler='1 alua' wp=rw
|-+- policy='queue-length 0' prio=50 status=active
| `- 28:0:0:1 sde 8:64 active ready running
`-+- policy='queue-length 0' prio=10 status=enabled
`- 29:0:0:1 sdc 8:32 active ready running
multipath -ll
You should now be able to use the RBD image like you would a normal
multipathd iSCSI disk.
::
mpathbt (360014059ca317516a69465c883a29603) dm-1 LIO-ORG ,IBLOCK
size=1.0G features='0' hwhandler='1 alua' wp=rw
|-+- policy='queue-length 0' prio=50 status=active
| `- 28:0:0:1 sde 8:64 active ready running
`-+- policy='queue-length 0' prio=10 status=enabled
`- 29:0:0:1 sdc 8:32 active ready running
You should now be able to use the RBD image in the same way that you would
use a normal multipath iSCSI disk.
#. Log out of the target:
.. prompt:: bash #
iscsiadm -m node -T iqn.2003-01.org.linux-iscsi.rheln1 -u

View File

@ -9,53 +9,68 @@
Exclusive locks are mechanisms designed to prevent multiple processes from
accessing the same Rados Block Device (RBD) in an uncoordinated fashion.
Exclusive locks are used heavily in virtualization (where they prevent VMs from
clobbering each other's writes) and in RBD mirroring (where they are a
prerequisite for journaling).
clobbering each other's writes) and in `RBD mirroring`_ (where they are a
prerequisite for journaling in journal-based mirroring and fast generation of
incremental diffs in snapshot-based mirroring).
By default, exclusive locks are enabled on newly created images. This default
The ``exclusive-lock`` feature is enabled on newly created images. This default
can be overridden via the ``rbd_default_features`` configuration option or the
``--image-feature`` option for ``rbd create``.
``--image-feature`` and ``--image-shared`` options of the ``rbd create`` command.
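For example, an image that is meant to be opened by several clients at once, and that
therefore should not have ``exclusive-lock`` (or the features that depend on it)
enabled, can be created roughly as follows (pool and image names are placeholders)::

    rbd create --size 10G --image-shared mypool/myimage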
.. note::
Many image features, including ``object-map`` and ``fast-diff``, depend upon
exclusive locking. Disabling the ``exclusive-lock`` feature will negatively
affect the performance of some operations.
In order to ensure proper exclusive locking operations, any client using an RBD
image whose ``exclusive-lock`` feature is enabled must have a CephX identity
whose capabilities include ``profile rbd``.
To maintain multi-client access, the ``exclusive-lock`` feature implements
automatic cooperative lock transitions between clients. It ensures that only
a single client can write to an RBD image at any given time and thus protects
internal image structures such as the object map, the journal or the `PWL
cache`_ from concurrent modification.
Exclusive locking is mostly transparent to the user.
Exclusive locking is mostly transparent to the user:
#. Whenever any ``librbd`` client process or kernel RBD client
starts using an RBD image on which exclusive locking has been
enabled, it obtains an exclusive lock on the image before the first
write.
* Whenever a client (a ``librbd`` process or, in case of a ``krbd`` client,
a client node's kernel) needs to handle a write to an RBD image on which
exclusive locking has been enabled, it first acquires an exclusive lock on
the image. If the lock is already held by some other client, that client is
requested to release it.
#. Whenever any such client process terminates gracefully, the process
relinquishes the lock automatically.
* Whenever a client that holds an exclusive lock on an RBD image gets
a request to release the lock, it stops handling writes, flushes its caches
and releases the lock.
#. This graceful termination enables another, subsequent, process to acquire
the lock and to write to the image.
* Whenever a client that holds an exclusive lock on an RBD image terminates
gracefully, the lock is also released gracefully.
* A graceful release of an exclusive lock on an RBD image (whether by request
or due to client termination) enables another, subsequent, client to acquire
the lock and start handling writes.
.. warning::
By default, the ``exclusive-lock`` feature does not prevent two or more
concurrently running clients from opening the same RBD image and writing to
it in turns (whether on the same node or not). In effect, their writes just
get linearized as the lock is automatically transitioned back and forth in
a cooperative fashion.
.. note::
It is possible for two or more concurrently running processes to open the
image and to read from it. The client acquires the exclusive lock only when
attempting to write to the image. To disable transparent lock transitions
between multiple clients, the client must acquire the lock by using the
``RBD_LOCK_MODE_EXCLUSIVE`` flag.
To disable automatic lock transitions between clients, the
``RBD_LOCK_MODE_EXCLUSIVE`` flag may be specified when acquiring the
exclusive lock. This is exposed by the ``--exclusive`` option of the ``rbd
device map`` command.
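A minimal sketch of that option (pool and image names are placeholders)::

    rbd device map --exclusive mypool/myimage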
Blocklisting
============
Sometimes a client process (or, in case of a krbd client, a client node's
kernel) that previously held an exclusive lock on an image does not terminate
gracefully, but dies abruptly. This may be because the client process received
a ``KILL`` or ``ABRT`` signal, or because the client node underwent a hard
reboot or suffered a power failure. In cases like this, the exclusive lock is
never gracefully released. This means that any new process that starts and
attempts to use the device must break the previously held exclusive lock.
Sometimes a client that previously held an exclusive lock on an RBD image does
not terminate gracefully, but dies abruptly. This may be because the client
process received a ``KILL`` or ``ABRT`` signal, or because the client node
underwent a hard reboot or suffered a power failure. In cases like this, the
lock is never gracefully released. This means that any new client that comes up
and attempts to write to the image must break the previously held exclusive
lock.
However, a process (or kernel thread) may hang or merely lose network
connectivity to the Ceph cluster for some amount of time. In that case,
@ -78,9 +93,12 @@ Ceph Monitor.
Blocklisting is thus a form of storage-level resource `fencing`_.
In order for blocklisting to work, the client must have the ``osd
blocklist`` capability. This capability is included in the ``profile
rbd`` capability profile, which should be set generally on all Ceph
:ref:`client identities <user-management>` using RBD.
.. note::
In order for blocklisting to work, the client must have the ``osd
blocklist`` capability. This capability is included in the ``profile
rbd`` capability profile, which should be set generally on all Ceph
:ref:`client identities <user-management>` using RBD.
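If a stale lock had to be broken, the resulting blocklist entries can be listed and, once
the affected client is known to be gone, removed again (a sketch; the address/nonce value
is a placeholder)::

    ceph osd blocklist ls
    ceph osd blocklist rm 192.168.0.10:0/3710147553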
.. _RBD mirroring: ../rbd-mirroring
.. _PWL cache: ../rbd-persistent-write-log-cache
.. _fencing: https://en.wikipedia.org/wiki/Fencing_(computing)

View File

@ -535,13 +535,273 @@ As noted earlier, you can make documentation contributions using the `Fork and
Pull`_ approach.
Squash Extraneous Commits
-------------------------
Each pull request ought to be associated with only a single commit. If you have
made more than one commit to the feature branch that you are working in, you
will need to "squash" the multiple commits. "Squashing" is the colloquial term
for a particular kind of "interactive rebase". Squashing can be done in a great
number of ways, but the example here will deal with a situation in which there
are three commits and the changes in all three of the commits are kept. The three
commits will be squashed into a single commit.
#. Make the commits that you will later squash.
#. Make the first commit.
::
doc/glossary: improve "CephX" entry
Improve the glossary entry for "CephX".
Signed-off-by: Zac Dover <zac.dover@proton.me>
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# On branch wip-doc-2023-03-28-glossary-cephx
# Changes to be committed:
# modified: glossary.rst
#
#. Make the second commit.
::
doc/glossary: add link to architecture doc
Add a link to a section in the architecture document, which link
will be used in the process of improving the "CephX" glossary entry.
Signed-off-by: Zac Dover <zac.dover@proton.me>
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# On branch wip-doc-2023-03-28-glossary-cephx
# Your branch is up to date with 'origin/wip-doc-2023-03-28-glossary-cephx'.
#
# Changes to be committed:
# modified: architecture.rst
#. Make the third commit.
::
doc/glossary: link to Arch doc in "CephX" glossary
Link to the Architecture document from the "CephX" entry in the
Glossary.
Signed-off-by: Zac Dover <zac.dover@proton.me>
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# On branch wip-doc-2023-03-28-glossary-cephx
# Your branch is up to date with 'origin/wip-doc-2023-03-28-glossary-cephx'.
#
# Changes to be committed:
# modified: glossary.rst
#. There are now three commits in the feature branch. We will now begin the
process of squashing them into a single commit.
#. Run the command ``git rebase -i main``, which rebases the current branch
(the feature branch) against the ``main`` branch:
.. prompt:: bash
git rebase -i main
#. A list of the commits that have been made to the feature branch now
appear, and looks like this:
::
pick d395e500883 doc/glossary: improve "CephX" entry
pick b34986e2922 doc/glossary: add link to architecture doc
pick 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
# Rebase 0793495b9d1..74d0719735c onto 0793495b9d1 (3 commands)
#
# Commands:
# p, pick <commit> = use commit
# r, reword <commit> = use commit, but edit the commit message
# e, edit <commit> = use commit, but stop for amending
# s, squash <commit> = use commit, but meld into previous commit
# f, fixup [-C | -c] <commit> = like "squash" but keep only the previous
# commit's log message, unless -C is used, in which case
# keep only this commit's message; -c is same as -C but
# opens the editor
# x, exec <command> = run command (the rest of the line) using shell
# b, break = stop here (continue rebase later with 'git rebase --continue')
# d, drop <commit> = remove commit
# l, label <label> = label current HEAD with a name
# t, reset <label> = reset HEAD to a label
# m, merge [-C <commit> | -c <commit>] <label> [# <oneline>]
# create a merge commit using the original merge commit's
# message (or the oneline, if no original merge commit was
# specified); use -c <commit> to reword the commit message
# u, update-ref <ref> = track a placeholder for the <ref> to be updated
# to this position in the new commits. The <ref> is
# updated at the end of the rebase
#
# These lines can be re-ordered; they are executed from top to bottom.
#
# If you remove a line here THAT COMMIT WILL BE LOST.
Find the part of the screen that says "pick". This is the part that you will
alter. There are three commits that are currently labeled "pick". We will
choose one of them to remain labeled "pick", and we will label the other two
commits "squash".
#. Label two of the three commits ``squash``:
::
pick d395e500883 doc/glossary: improve "CephX" entry
squash b34986e2922 doc/glossary: add link to architecture doc
squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
# Rebase 0793495b9d1..74d0719735c onto 0793495b9d1 (3 commands)
#
# Commands:
# p, pick <commit> = use commit
# r, reword <commit> = use commit, but edit the commit message
# e, edit <commit> = use commit, but stop for amending
# s, squash <commit> = use commit, but meld into previous commit
# f, fixup [-C | -c] <commit> = like "squash" but keep only the previous
# commit's log message, unless -C is used, in which case
# keep only this commit's message; -c is same as -C but
# opens the editor
# x, exec <command> = run command (the rest of the line) using shell
# b, break = stop here (continue rebase later with 'git rebase --continue')
# d, drop <commit> = remove commit
# l, label <label> = label current HEAD with a name
# t, reset <label> = reset HEAD to a label
# m, merge [-C <commit> | -c <commit>] <label> [# <oneline>]
# create a merge commit using the original merge commit's
# message (or the oneline, if no original merge commit was
# specified); use -c <commit> to reword the commit message
# u, update-ref <ref> = track a placeholder for the <ref> to be updated
# to this position in the new commits. The <ref> is
# updated at the end of the rebase
#
# These lines can be re-ordered; they are executed from top to bottom.
#
# If you remove a line here THAT COMMIT WILL BE LOST.
#. Now we create a commit message that applies to all the commits that have
been squashed together:
#. When you save and close the list of commits that you have designated for
squashing, a list of all three commit messages appears, and it looks
like this:
::
# This is a combination of 3 commits.
# This is the 1st commit message:
doc/glossary: improve "CephX" entry
Improve the glossary entry for "CephX".
Signed-off-by: Zac Dover <zac.dover@proton.me>
# This is the commit message #2:
doc/glossary: add link to architecture doc
Add a link to a section in the architecture document, which link
will be used in the process of improving the "CephX" glossary entry.
Signed-off-by: Zac Dover <zac.dover@proton.me>
# This is the commit message #3:
doc/glossary: link to Arch doc in "CephX" glossary
Link to the Architecture document from the "CephX" entry in the
Glossary.
Signed-off-by: Zac Dover <zac.dover@proton.me>
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# Date: Tue Mar 28 18:42:11 2023 +1000
#
# interactive rebase in progress; onto 0793495b9d1
# Last commands done (3 commands done):
# squash b34986e2922 doc/glossary: add link to architecture doc
# squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
# No commands remaining.
# You are currently rebasing branch 'wip-doc-2023-03-28-glossary-cephx' on '0793495b9d1'.
#
# Changes to be committed:
# modified: doc/architecture.rst
# modified: doc/glossary.rst
#. The commit messages have been revised into the simpler form presented here:
::
doc/glossary: improve "CephX" entry
Improve the glossary entry for "CephX".
Signed-off-by: Zac Dover <zac.dover@proton.me>
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# Date: Tue Mar 28 18:42:11 2023 +1000
#
# interactive rebase in progress; onto 0793495b9d1
# Last commands done (3 commands done):
# squash b34986e2922 doc/glossary: add link to architecture doc
# squash 74d0719735c doc/glossary: link to Arch doc in "CephX" glossary
# No commands remaining.
# You are currently rebasing branch 'wip-doc-2023-03-28-glossary-cephx' on '0793495b9d1'.
#
# Changes to be committed:
# modified: doc/architecture.rst
# modified: doc/glossary.rst
#. Force push the squashed commit from your local working copy to the remote
   branch. The force push is necessary because squashing rewrote the branch's
   history: the tip of the remote branch is no longer an ancestor of the new
   commit, so an ordinary push would be rejected:
.. prompt:: bash $
git push -f
::
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 722 bytes | 722.00 KiB/s, done.
Total 5 (delta 4), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.
To github.com:zdover23/ceph.git
+ b34986e2922...02e3a5cb763 wip-doc-2023-03-28-glossary-cephx -> wip-doc-2023-03-28-glossary-cephx (forced update)
Notify Us
---------
After you make a pull request, please email ceph-docs@redhat.com.
If some time has passed and the pull request that you raised has not been
reviewed, contact the component lead and ask what's taking so long. See
:ref:`clt` for a list of component leads.
Documentation Style Guide
=========================
@ -778,6 +1038,27 @@ Link to target with inline text::
documentation<external_link_with_inline_text>`. If this seems inconsistent
and confusing to you, then you're right. It is inconsistent and confusing.
Escaping Bold Characters within Words
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This section explains how to make certain letters within a word bold while
leaving the other letters in the word regular (non-bold).
The following single-line paragraph provides an example of this:
**C**\eph **F**\ile **S**\ystem.
In reStructuredText, the following formula will not work:
::
**C**eph **F**ile **S**ystem
The bolded notation must be turned off by means of the escape character (\\), as shown here:
::
**C**\eph **F**\ile **S**\ystem
.. _Python Sphinx: https://www.sphinx-doc.org
.. _restructuredText: http://docutils.sourceforge.net/rst.html

View File

@ -318,7 +318,7 @@ local g = import 'grafonnet/grafana.libsonnet';
.addTemplate(
$.addTemplateSchema('ceph_hosts',
'$datasource',
'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)',
1,
false,
3,
@ -719,5 +719,30 @@ local g = import 'grafonnet/grafana.libsonnet';
11,
9
),
$.addTableSchema(
'$datasource',
'This table shows the 10 hosts with the highest number of slow ops',
{ col: 2, desc: true },
[
$.overviewStyle('Instance', 'instance', 'string', 'short'),
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
$.overviewStyle('', '/.*/', 'hidden', 'short'),
],
'Top Slow Ops per Host',
'table'
)
.addTarget(
$.addTargetSchema(
|||
topk(10,
(sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
)
||| % $.matchers(),
'',
'table',
1,
true
)
) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
]),
}

View File

@ -300,6 +300,31 @@ local g = import 'grafonnet/grafana.libsonnet';
.addTargets([$.addTargetSchema(
'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
)]),
$.addTableSchema(
'$datasource',
'This table shows the 10 OSDs with the highest number of slow ops',
{ col: 2, desc: true },
[
$.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
$.overviewStyle('', '/.*/', 'hidden', 'short'),
],
'Top Slow Ops',
'table'
)
.addTarget(
$.addTargetSchema(
|||
topk(10,
(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
)
||| % $.matchers(),
'',
'table',
1,
true
)
) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
]),
'osd-device-details.json':
local OsdDeviceDetailsPanel(title,

View File

@ -23,12 +23,6 @@
"id": "singlestat",
"name": "Singlestat",
"version": "5.0.0"
},
{
"type": "panel",
"id": "vonage-status-panel",
"name": "Status Panel",
"version": "1.0.8"
}
],
"annotations": {
@ -64,7 +58,7 @@
},
"gridPos": {
"h": 3,
"w": 2,
"w": 6,
"x": 0,
"y": 0
},
@ -157,8 +151,8 @@
"fontFormat": "Regular",
"gridPos": {
"h": 3,
"w": 2,
"x": 2,
"w": 6,
"x": 6,
"y": 0
},
"id": 43,
@ -167,6 +161,19 @@
"isHideAlertsOnDisable": false,
"isIgnoreOKColors": false,
"links": [],
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"targets": [
{
"aggregation": "Last",
@ -249,7 +256,178 @@
}
],
"title": "OSDs",
"type": "vonage-status-panel"
"type": "stat"
},
{
"clusterName": "",
"colorMode": "Panel",
"colors": {
"crit": "rgba(245, 54, 54, 0.9)",
"disable": "rgba(128, 128, 128, 0.9)",
"ok": "rgba(50, 128, 45, 0.9)",
"warn": "rgba(237, 129, 40, 0.9)"
},
"cornerRadius": 1,
"datasource": "$datasource",
"displayName": "",
"flipCard": false,
"flipTime": 5,
"fontFormat": "Regular",
"gridPos": {
"h": 3,
"w": 6,
"x": 12,
"y": 0
},
"id": 41,
"isAutoScrollOnOverflow": false,
"isGrayOnNoData": false,
"isHideAlertsOnDisable": false,
"isIgnoreOKColors": false,
"links": [],
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"targets": [
{
"aggregation": "Last",
"alias": "In Quorum",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_mon_quorum_status)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "In Quorum",
"refId": "A",
"units": "none",
"valueHandler": "Text Only"
},
{
"aggregation": "Last",
"alias": "Total",
"crit": 1,
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Total",
"refId": "B",
"units": "none",
"valueHandler": "Text Only",
"warn": 2
},
{
"aggregation": "Last",
"alias": "MONs out of Quorum",
"crit": 1.6,
"decimals": 2,
"displayAliasType": "Warning / Critical",
"displayType": "Annotation",
"displayValueWithAlias": "Never",
"expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "MONs out of Quorum",
"refId": "C",
"units": "none",
"valueHandler": "Number Threshold",
"warn": 1.1
}
],
"title": "Monitors",
"type": "stat"
},
{
"colorMode": "Panel",
"colors": {
"crit": "rgba(245, 54, 54, 0.9)",
"disable": "rgba(128, 128, 128, 0.9)",
"ok": "rgba(50, 128, 45, 0.9)",
"warn": "rgba(237, 129, 40, 0.9)"
},
"cornerRadius": 1,
"datasource": "$datasource",
"displayName": "",
"flipCard": false,
"flipTime": 5,
"fontFormat": "Regular",
"gridPos": {
"h": 3,
"w": 6,
"x": 18,
"y": 0
},
"id": 68,
"isAutoScrollOnOverflow": false,
"isGrayOnNoData": false,
"isHideAlertsOnDisable": false,
"isIgnoreOKColors": false,
"links": [],
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"targets": [
{
"aggregation": "Last",
"alias": "Active",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mgr_status == 1) or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"instant": true,
"legendFormat": "Active",
"refId": "A",
"units": "none",
"valueHandler": "Number Threshold"
},
{
"aggregation": "Last",
"alias": "Standby",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mgr_status == 0) or vector(0)",
"format": "time_series",
"instant": true,
"intervalFactor": 1,
"legendFormat": "Standby",
"refId": "B",
"units": "none",
"valueHandler": "Number Threshold"
}
],
"title": "MGRs",
"type": "stat"
},
{
"cacheTimeout": null,
@ -272,9 +450,9 @@
},
"gridPos": {
"h": 6,
"w": 4,
"x": 4,
"y": 0
"w": 6,
"x": 0,
"y": 6
},
"id": 47,
"interval": null,
@ -342,9 +520,9 @@
"fill": 0,
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 0
"w": 9,
"x": 6,
"y": 6
},
"id": 53,
"legend": {
@ -498,9 +676,9 @@
"fill": 0,
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 0
"w": 9,
"x": 15,
"y": 6
},
"id": 66,
"legend": {
@ -595,149 +773,6 @@
}
]
},
{
"clusterName": "",
"colorMode": "Panel",
"colors": {
"crit": "rgba(245, 54, 54, 0.9)",
"disable": "rgba(128, 128, 128, 0.9)",
"ok": "rgba(50, 128, 45, 0.9)",
"warn": "rgba(237, 129, 40, 0.9)"
},
"cornerRadius": 1,
"datasource": "$datasource",
"displayName": "",
"flipCard": false,
"flipTime": 5,
"fontFormat": "Regular",
"gridPos": {
"h": 3,
"w": 2,
"x": 0,
"y": 3
},
"id": 41,
"isAutoScrollOnOverflow": false,
"isGrayOnNoData": false,
"isHideAlertsOnDisable": false,
"isIgnoreOKColors": false,
"links": [],
"targets": [
{
"aggregation": "Last",
"alias": "In Quorum",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_mon_quorum_status)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "In Quorum",
"refId": "A",
"units": "none",
"valueHandler": "Text Only"
},
{
"aggregation": "Last",
"alias": "Total",
"crit": 1,
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Total",
"refId": "B",
"units": "none",
"valueHandler": "Text Only",
"warn": 2
},
{
"aggregation": "Last",
"alias": "MONs out of Quorum",
"crit": 1.6,
"decimals": 2,
"displayAliasType": "Warning / Critical",
"displayType": "Annotation",
"displayValueWithAlias": "Never",
"expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "MONs out of Quorum",
"refId": "C",
"units": "none",
"valueHandler": "Number Threshold",
"warn": 1.1
}
],
"title": "Monitors",
"type": "vonage-status-panel"
},
{
"colorMode": "Panel",
"colors": {
"crit": "rgba(245, 54, 54, 0.9)",
"disable": "rgba(128, 128, 128, 0.9)",
"ok": "rgba(50, 128, 45, 0.9)",
"warn": "rgba(237, 129, 40, 0.9)"
},
"cornerRadius": 0,
"datasource": "$datasource",
"displayName": "",
"flipCard": false,
"flipTime": 5,
"fontFormat": "Regular",
"gridPos": {
"h": 3,
"w": 2,
"x": 2,
"y": 3
},
"id": 68,
"isAutoScrollOnOverflow": false,
"isGrayOnNoData": false,
"isHideAlertsOnDisable": false,
"isIgnoreOKColors": false,
"links": [],
"targets": [
{
"aggregation": "Last",
"alias": "Active",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mgr_status == 1) or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Active",
"refId": "A",
"units": "none",
"valueHandler": "Number Threshold"
},
{
"aggregation": "Last",
"alias": "Standby",
"decimals": 2,
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mgr_status == 0) or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Standby",
"refId": "B",
"units": "none",
"valueHandler": "Number Threshold"
}
],
"title": "MGRs",
"type": "vonage-status-panel"
},
{
"aliasColors": {},
"bars": false,
@ -749,7 +784,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 6
"y": 9
},
"id": 45,
"legend": {
@ -841,7 +876,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 6
"y": 9
},
"id": 62,
"legend": {

View File

@ -1119,6 +1119,91 @@
"show": true
}
]
},
{
"columns": [ ],
"datasource": "$datasource",
"description": "This table shows the 10 hosts with the highest number of slow ops",
"gridPos": {
"h": 8,
"w": 4,
"x": 0,
"y": 40
},
"id": 15,
"links": [ ],
"sort": {
"col": 2,
"desc": true
},
"styles": [
{
"alias": "Instance",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "instance",
"thresholds": [ ],
"type": "string",
"unit": "short",
"valueMaps": [ ]
},
{
"alias": "Slow Ops",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "Value",
"thresholds": [ ],
"type": "number",
"unit": "none",
"valueMaps": [ ]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "/.*/",
"thresholds": [ ],
"type": "hidden",
"unit": "short",
"valueMaps": [ ]
}
],
"targets": [
{
"expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n",
"format": "table",
"instant": true,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Top Slow Ops per Host",
"transform": "table",
"type": "table"
}
],
"refresh": "30s",
@ -1195,7 +1280,7 @@
"multi": false,
"name": "ceph_hosts",
"options": [ ],
"query": "label_values({}, instance)",
"query": "label_values(instance)",
"refresh": 1,
"regex": "([^.:]*).*",
"sort": 3,

View File

@ -860,6 +860,91 @@
"show": true
}
]
},
{
"columns": [ ],
"datasource": "$datasource",
"description": "This table shows the 10 OSDs with the highest number of slow ops",
"gridPos": {
"h": 8,
"w": 4,
"x": 0,
"y": 20
},
"id": 13,
"links": [ ],
"sort": {
"col": 2,
"desc": true
},
"styles": [
{
"alias": "OSD ID",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "ceph_daemon",
"thresholds": [ ],
"type": "string",
"unit": "short",
"valueMaps": [ ]
},
{
"alias": "Slow Ops",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "Value",
"thresholds": [ ],
"type": "number",
"unit": "none",
"valueMaps": [ ]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "/.*/",
"thresholds": [ ],
"type": "hidden",
"unit": "short",
"valueMaps": [ ]
}
],
"targets": [
{
"expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n",
"format": "table",
"instant": true,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Top Slow Ops",
"transform": "table",
"type": "table"
}
],
"refresh": "30s",

View File

@ -629,6 +629,17 @@
description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
},
},
{
alert: 'CephDaemonSlowOps',
'for': '30s',
expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0',
labels: { severity: 'warning', type: 'ceph_default' },
annotations: {
documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
summary: '{{ $labels.ceph_daemon }} operations are slow to complete',
description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)',
},
},
],
},
{

View File

@ -563,6 +563,16 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
- alert: "CephDaemonSlowOps"
for: "30s"
expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
labels:
severity: 'warning'
type: 'ceph_default'
annotations:
summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
- name: "cephadm"
rules:
- alert: "CephadmUpgradeFailed"

View File

@ -679,6 +679,33 @@ tests:
summary: OSD operations are slow to complete
description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
# slow daemon ops
- interval : 1m
input_series:
- series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
values: '1+0x120'
promql_expr_test:
- expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
eval_time: 1m
exp_samples:
- labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
job="ceph", type="SLOW_OPS"}'
value: 1
alert_rule_test:
- eval_time: 20m
alertname: CephDaemonSlowOps
exp_alerts:
- exp_labels:
instance: ceph:9283
ceph_daemon: "osd.1"
job: ceph
severity: warning
type: ceph_default
exp_annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
summary: osd.1 operations are slow to complete
description: "osd.1 operations are taking too long to process (complaint time exceeded)"
# CEPHADM orchestrator alert triggers
- interval: 30s
input_series:

View File

@ -10,5 +10,5 @@ tasks:
all:
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
- sudo dnf -y module reset container-tools
- sudo dnf -y module install container-tools
- sudo dnf -y module install container-tools --allowerasing --nobest
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf

View File

@ -9,5 +9,5 @@ tasks:
all:
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
- sudo dnf -y module reset container-tools
- sudo dnf -y module install container-tools:3.0
- sudo dnf -y module install container-tools:3.0 --allowerasing --nobest
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf

View File

@ -9,5 +9,5 @@ tasks:
all:
- sudo cp /etc/containers/registries.conf /etc/containers/registries.conf.backup
- sudo dnf -y module reset container-tools
- sudo dnf -y module install container-tools:rhel8
- sudo dnf -y module install container-tools:rhel8 --allowerasing --nobest
- sudo cp /etc/containers/registries.conf.backup /etc/containers/registries.conf

View File

@ -18,6 +18,7 @@ overrides:
- Metadata damage detected
- MDS_READ_ONLY
- force file system read-only
- with standby daemon mds
tasks:
- cephfs_test_runner:
modules:

View File

@ -5,6 +5,7 @@ overrides:
- bad backtrace on inode
- inode table repaired for inode
- Scrub error on inode
- Scrub error on dir
- Metadata damage detected
tasks:
- cephfs_test_runner:

View File

@ -4,5 +4,6 @@ overrides:
- but it is still running
- objects unfound and apparently lost
- MDS_SLOW_METADATA_IO
- MDS_TRIM
tasks:
- thrashosds:

View File

@ -27,8 +27,7 @@ tasks:
- tox: [ client.0 ]
- keystone:
client.0:
sha1: 17.0.0.0rc2
force-branch: master
force-branch: stable/xena
projects:
- name: rgwcrypt
description: Encryption Tenant
@ -69,8 +68,7 @@ tasks:
description: Swift Service
- barbican:
client.0:
sha1: 5.0.1
force-branch: master
force-branch: stable/xena
use-keystone-role: client.0
keystone_authtoken:
auth_plugin: password

View File

@ -8,8 +8,7 @@ tasks:
- tox: [ client.0 ]
- keystone:
client.0:
sha1: 17.0.0.0rc2
force-branch: master
force-branch: stable/xena
services:
- name: swift
type: object-store
@ -20,7 +19,7 @@ tasks:
use-keystone-role: client.0
- tempest:
client.0:
sha1: train-last
sha1: 30.0.0
force-branch: master
use-keystone-role: client.0
auth:
@ -49,6 +48,10 @@ tasks:
- .*test_container_synchronization.*
- .*test_object_services.PublicObjectTest.test_access_public_container_object_without_using_creds
- .*test_object_services.ObjectTest.test_create_object_with_transfer_encoding
- .*test_container_services.ContainerTest.test_create_container_with_remove_metadata_key
- .*test_container_services.ContainerTest.test_create_container_with_remove_metadata_value
- .*test_object_expiry.ObjectExpiryTest.test_get_object_after_expiry_time
- .*test_object_expiry.ObjectExpiryTest.test_get_object_at_expiry_time
overrides:
ceph:

View File

@ -157,12 +157,6 @@ def fix_barbican_api(ctx, cclient):
'/prop_dir =/ s#etc/barbican#{}/etc/barbican#'.format(get_barbican_dir(ctx)),
'bin/barbican-api'])
def copy_policy_json(ctx, cclient):
run_in_barbican_dir(ctx, cclient,
['cp',
get_barbican_dir(ctx)+'/etc/barbican/policy.json',
get_barbican_dir(ctx)])
def create_barbican_conf(ctx, cclient):
barbican_host, barbican_port = ctx.barbican.endpoints[cclient]
barbican_url = 'http://{host}:{port}'.format(host=barbican_host,
@ -174,6 +168,14 @@ def create_barbican_conf(ctx, cclient):
'echo -n -e "[DEFAULT]\nhost_href=' + barbican_url + '\n" ' + \
'>barbican.conf'])
log.info("run barbican db upgrade")
config_path = get_barbican_dir(ctx) + '/barbican.conf'
run_in_barbican_venv(ctx, cclient, ['barbican-manage', '--config-file', config_path,
'db', 'upgrade'])
log.info("run barbican db sync_secret_stores")
run_in_barbican_venv(ctx, cclient, ['barbican-manage', '--config-file', config_path,
'db', 'sync_secret_stores'])
@contextlib.contextmanager
def configure_barbican(ctx, config):
"""
@ -189,7 +191,6 @@ def configure_barbican(ctx, config):
set_authtoken_params(ctx, cclient, cconfig)
fix_barbican_api(ctx, cclient)
fix_barbican_api_paste(ctx, cclient)
copy_policy_json(ctx, cclient)
create_barbican_conf(ctx, cclient)
try:
yield

View File

@ -1564,23 +1564,33 @@ class CephManager:
Accepts arguments same as that of teuthology.orchestra.run.run()
"""
prefixcmd = []
timeoutcmd = kwargs.pop('timeoutcmd', None)
if timeoutcmd is not None:
prefixcmd += ['timeout', str(timeoutcmd)]
if self.cephadm:
prefixcmd += ['ceph']
cmd = prefixcmd + list(kwargs['args'])
return shell(self.ctx, self.cluster, self.controller,
args=['ceph'] + list(kwargs['args']),
args=cmd,
stdout=StringIO(),
check_status=kwargs.get('check_status', True))
if self.rook:
elif self.rook:
prefixcmd += ['ceph']
cmd = prefixcmd + list(kwargs['args'])
return toolbox(self.ctx, self.cluster,
args=['ceph'] + list(kwargs['args']),
args=cmd,
stdout=StringIO(),
check_status=kwargs.get('check_status', True))
testdir = teuthology.get_testdir(self.ctx)
prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage',
f'{testdir}/archive/coverage', 'timeout', '120', 'ceph',
'--cluster', self.cluster]
kwargs['args'] = prefix + list(kwargs['args'])
return self.controller.run(**kwargs)
else:
testdir = teuthology.get_testdir(self.ctx)
prefix = prefixcmd + ['sudo', 'adjust-ulimits', 'ceph-coverage',
f'{testdir}/archive/coverage', 'timeout', '120', 'ceph',
'--cluster', self.cluster]
kwargs['args'] = prefix + list(kwargs['args'])
return self.controller.run(**kwargs)
def raw_cluster_cmd(self, *args, **kwargs) -> str:
"""

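Read together, the hunk above amounts to the following shape for run_cluster_cmd. This is a sketch only; the shell and toolbox helpers, StringIO, and the teuthology import are assumed from the surrounding module, and the coverage prefix is copied from the context lines shown above.

    def run_cluster_cmd(self, **kwargs):
        prefixcmd = []
        # new optional keyword: wrap the whole invocation in `timeout <seconds>`
        timeoutcmd = kwargs.pop('timeoutcmd', None)
        if timeoutcmd is not None:
            prefixcmd += ['timeout', str(timeoutcmd)]
        if self.cephadm:
            return shell(self.ctx, self.cluster, self.controller,
                         args=prefixcmd + ['ceph'] + list(kwargs['args']),
                         stdout=StringIO(),
                         check_status=kwargs.get('check_status', True))
        elif self.rook:
            return toolbox(self.ctx, self.cluster,
                           args=prefixcmd + ['ceph'] + list(kwargs['args']),
                           stdout=StringIO(),
                           check_status=kwargs.get('check_status', True))
        else:
            testdir = teuthology.get_testdir(self.ctx)
            kwargs['args'] = prefixcmd + ['sudo', 'adjust-ulimits', 'ceph-coverage',
                                          f'{testdir}/archive/coverage', 'timeout', '120',
                                          'ceph', '--cluster', self.cluster] + list(kwargs['args'])
            return self.controller.run(**kwargs)

    # typical caller, as used later in this series:
    # mon_manager.run_cluster_cmd(args=[...], wait=False, timeoutcmd=30)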
View File

@ -161,6 +161,7 @@ class CephTestCase(unittest.TestCase):
log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
return False
log.info(f"waiting {timeout}s for health warning matching {pattern}")
self.wait_until_true(seen_health_warning, timeout)
def wait_for_health_clear(self, timeout):

View File

@ -141,14 +141,15 @@ def download_cephadm(ctx, config, ref):
else:
ctx.cluster.run(
args=[
'git', 'archive',
'--remote=' + git_url,
ref,
'src/cephadm/cephadm',
run.Raw('|'),
'tar', '-xO', 'src/cephadm/cephadm',
'git', 'clone', git_url, 'testrepo',
run.Raw('&&'),
'cd', 'testrepo',
run.Raw('&&'),
'git', 'show', f'{ref}:src/cephadm/cephadm',
run.Raw('>'),
ctx.cephadm,
run.Raw('&&'),
'ls', '-l', ctx.cephadm,
],
)
# sanity-check the resulting file and set executable bit

View File

@ -72,9 +72,6 @@ class CephFSTestCase(CephTestCase):
# Whether to create the default filesystem during setUp
REQUIRE_FILESYSTEM = True
# requires REQUIRE_FILESYSTEM = True
REQUIRE_RECOVERY_FILESYSTEM = False
# create a backup filesystem if required.
# required REQUIRE_FILESYSTEM enabled
REQUIRE_BACKUP_FILESYSTEM = False
@ -192,20 +189,6 @@ class CephFSTestCase(CephTestCase):
self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
self.backup_fs.wait_for_daemons()
if self.REQUIRE_RECOVERY_FILESYSTEM:
if not self.REQUIRE_FILESYSTEM:
self.skipTest("Recovery filesystem requires a primary filesystem as well")
# After Octopus is EOL, we can remove this setting:
self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
'enable_multiple', 'true',
'--yes-i-really-mean-it')
self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
self.recovery_fs.set_metadata_overlay(True)
self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
self.recovery_fs.create()
self.recovery_fs.getinfo(refresh=True)
self.recovery_fs.wait_for_daemons()
# Load an config settings of interest
for setting in self.LOAD_SETTINGS:
setattr(self, setting, float(self.fs.mds_asok(

View File

@ -473,6 +473,17 @@ class MDSCluster(CephCluster):
for fs in self.status().get_filesystems():
Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
@property
def beacon_timeout(self):
"""
Generate an acceptable timeout for the mons to drive some MDSMap change
because of missed beacons from some MDS. This involves looking up the
grace period in use by the mons and adding an acceptable buffer.
"""
grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
return grace*2+15
class Filesystem(MDSCluster):
"""
@ -485,7 +496,6 @@ class Filesystem(MDSCluster):
self.name = name
self.id = None
self.metadata_pool_name = None
self.metadata_overlay = False
self.data_pool_name = None
self.data_pools = None
self.fs_config = fs_config
@ -539,11 +549,6 @@ class Filesystem(MDSCluster):
self.get_pool_names(status = status, refresh = refresh)
return status
def set_metadata_overlay(self, overlay):
if self.id is not None:
raise RuntimeError("cannot specify fscid when configuring overlay")
self.metadata_overlay = overlay
def deactivate(self, rank):
if rank < 0:
raise RuntimeError("invalid rank")
@ -644,7 +649,7 @@ class Filesystem(MDSCluster):
target_size_ratio = 0.9
target_size_ratio_ec = 0.9
def create(self):
def create(self, recover=False, metadata_overlay=False):
if self.name is None:
self.name = "cephfs"
if self.metadata_pool_name is None:
@ -656,7 +661,7 @@ class Filesystem(MDSCluster):
# will use the ec pool to store the data and a small amount of
# metadata still goes to the primary data pool for all files.
if not self.metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile:
if not metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile:
self.target_size_ratio = 0.05
log.debug("Creating filesystem '{0}'".format(self.name))
@ -683,16 +688,14 @@ class Filesystem(MDSCluster):
else:
raise
if self.metadata_overlay:
self.mon_manager.raw_cluster_cmd('fs', 'new',
self.name, self.metadata_pool_name, data_pool_name,
'--allow-dangerous-metadata-overlay')
else:
self.mon_manager.raw_cluster_cmd('fs', 'new',
self.name,
self.metadata_pool_name,
data_pool_name)
args = ["fs", "new", self.name, self.metadata_pool_name, data_pool_name]
if recover:
args.append('--recover')
if metadata_overlay:
args.append('--allow-dangerous-metadata-overlay')
self.mon_manager.raw_cluster_cmd(*args)
if not recover:
if self.ec_profile and 'disabled' not in self.ec_profile:
ec_data_pool_name = data_pool_name + "_ec"
log.debug("EC profile is %s", self.ec_profile)
@ -1070,6 +1073,9 @@ class Filesystem(MDSCluster):
def rank_freeze(self, yes, rank=0):
self.mon_manager.raw_cluster_cmd("mds", "freeze", "{}:{}".format(self.id, rank), str(yes).lower())
def rank_repaired(self, rank):
self.mon_manager.raw_cluster_cmd("mds", "repaired", "{}:{}".format(self.id, rank))
def rank_fail(self, rank=0):
self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
@ -1119,6 +1125,9 @@ class Filesystem(MDSCluster):
if timeout is None:
timeout = DAEMON_WAIT_TIMEOUT
if self.id is None:
status = self.getinfo(refresh=True)
if status is None:
status = self.status()
@ -1233,12 +1242,12 @@ class Filesystem(MDSCluster):
out.append((rank, f(perf)))
return out
def read_cache(self, path, depth=None):
def read_cache(self, path, depth=None, rank=None):
cmd = ["dump", "tree", path]
if depth is not None:
cmd.append(depth.__str__())
result = self.mds_asok(cmd)
if len(result) == 0:
result = self.rank_asok(cmd, rank=rank)
if result is None or len(result) == 0:
raise RuntimeError("Path not found in cache: {0}".format(path))
return result
@ -1623,6 +1632,9 @@ class Filesystem(MDSCluster):
def get_scrub_status(self, rank=0):
return self.run_scrub(["status"], rank)
def flush(self, rank=0):
return self.rank_tell(["flush", "journal"], rank=rank)
def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30,
timeout=300, reverse=False):
# time out after "timeout" seconds and assume as done

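A short sketch of how tests later in this series consume the two additions above; both patterns appear verbatim in the test changes that follow.

    # beacon_timeout replaces the ad-hoc grace*2 waits when expecting the mons
    # to mark an MDS laggy:
    self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(),
                         timeout=self.fs.beacon_timeout)

    # Filesystem.create() now takes the recovery knobs directly, instead of the
    # removed set_metadata_overlay() helper:
    recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
    recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
    recovery_fs.create(recover=True, metadata_overlay=True)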
View File

@ -28,7 +28,9 @@ class KernelMount(CephFSMount):
client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt,
cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet)
self.client_config = config
self.rbytes = config.get('rbytes', False)
self.snapdirname = config.get('snapdirname', '.snap')
self.inst = None
self.addr = None
@ -86,6 +88,8 @@ class KernelMount(CephFSMount):
opts += ",rbytes"
else:
opts += ",norbytes"
if self.snapdirname != '.snap':
opts += f',snapdirname={self.snapdirname}'
if mntopts:
opts += ',' + ','.join(mntopts)

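For illustration, the option string that the kernel mount now builds for a client config such as {'rbytes': False, 'snapdirname': '.snapshots'}; the leading options are a stand-in for whatever is assembled earlier in the method.

    config = {'rbytes': False, 'snapdirname': '.snapshots'}   # illustrative values
    rbytes = config.get('rbytes', False)
    snapdirname = config.get('snapdirname', '.snap')
    opts = "name=admin"                        # stand-in for the options built earlier
    opts += ",rbytes" if rbytes else ",norbytes"
    if snapdirname != '.snap':
        opts += f',snapdirname={snapdirname}'
    # -> "name=admin,norbytes,snapdirname=.snapshots"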
View File

@ -3,6 +3,7 @@ import json
import logging
import errno
import re
import time
from teuthology.contextutil import MaxWhileTries
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import wait
@ -562,3 +563,99 @@ class TestDamage(CephFSTestCase):
self.fs.mon_manager.raw_cluster_cmd(
'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
"damage", "rm", str(entry['id']))
def test_dentry_first_existing(self):
"""
That the MDS won't abort when the dentry is already known to be damaged.
"""
def verify_corrupt():
info = self.fs.read_cache("/a", 0)
log.debug('%s', info)
self.assertEqual(len(info), 1)
dirfrags = info[0]['dirfrags']
self.assertEqual(len(dirfrags), 1)
dentries = dirfrags[0]['dentries']
self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c'])
self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD
self.mount_a.run_shell_payload("mkdir -p a/b")
self.fs.flush()
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
time.sleep(5) # for conf to percolate
self.mount_a.run_shell_payload("mv a/b a/c; sync .")
self.mount_a.umount()
verify_corrupt()
self.fs.fail()
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False)
self.fs.set_joinable()
status = self.fs.status()
self.fs.flush()
self.assertFalse(self.fs.status().hadfailover(status))
verify_corrupt()
def test_dentry_first_preflush(self):
"""
That the MDS won't write a dentry with new damage to CDentry::first
to the journal.
"""
rank0 = self.fs.get_rank()
self.fs.rank_freeze(True, rank=0)
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d")
self.fs.flush()
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
time.sleep(5) # for conf to percolate
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])
self.fs.mds_restart(rank0['name'])
self.fs.wait_for_daemons()
p.wait()
self.mount_a.run_shell_payload("stat a/ && find a/")
self.fs.flush()
def test_dentry_first_precommit(self):
"""
That the MDS won't write a dentry with new damage to CDentry::first
to the directory object.
"""
fscid = self.fs.id
self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .")
self.mount_a.umount() # allow immediate scatter write back
self.fs.flush()
# now just twiddle some inode metadata on a regular file
self.mount_a.mount_wait()
self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .")
self.mount_a.umount() # avoid journaling session related things
# okay, now cause the dentry to get damaged after loading from the journal
self.fs.fail()
self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0")
time.sleep(5) # for conf to percolate
self.fs.set_joinable()
self.fs.wait_for_daemons()
rank0 = self.fs.get_rank()
self.fs.rank_freeze(True, rank=0)
# so now we want to trigger commit but this will crash, so:
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])
self.fs.mds_restart(rank0['name'])
self.fs.wait_for_daemons()
try:
p.wait()
except CommandFailedError as e:
print(e)
else:
self.fail("flush journal should fail!")
self.mount_a.mount_wait()
self.mount_a.run_shell_payload("stat a/ && find a/")
self.fs.flush()

View File

@ -368,6 +368,7 @@ class TestDataScan(CephFSTestCase):
self.fs.data_scan(["init"])
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
self.fs.data_scan(["scan_links"])
# Mark the MDS repaired
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

View File

@ -319,8 +319,6 @@ class TestFailover(CephFSTestCase):
# Kill the rank 0 daemon's physical process
self.fs.mds_stop(original_active)
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
# Wait until the monitor promotes his replacement
def promoted():
active = self.fs.get_active_names()
@ -328,9 +326,7 @@ class TestFailover(CephFSTestCase):
log.info("Waiting for promotion of one of the original standbys {0}".format(
original_standbys))
self.wait_until_true(
promoted,
timeout=grace*2)
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
# Start the original rank 0 daemon up again, see that he becomes a standby
self.fs.mds_restart(original_active)
@ -352,8 +348,6 @@ class TestFailover(CephFSTestCase):
if not require_active:
self.skipTest("fuse_require_active_mds is not set")
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
# Check it's not laggy to begin with
(original_active, ) = self.fs.get_active_names()
self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
@ -376,7 +370,7 @@ class TestFailover(CephFSTestCase):
return True
self.wait_until_true(laggy, grace * 2)
self.wait_until_true(laggy, self.fs.beacon_timeout)
with self.assertRaises(CommandFailedError):
self.mounts[0].mount_wait()
@ -388,8 +382,6 @@ class TestFailover(CephFSTestCase):
# Need all my standbys up as well as the active daemons
self.wait_for_daemon_start()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
standbys = self.mds_cluster.get_standby_daemons()
self.assertGreaterEqual(len(standbys), 1)
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
@ -397,8 +389,7 @@ class TestFailover(CephFSTestCase):
# Kill a standby and check for warning
victim = standbys.pop()
self.fs.mds_stop(victim)
log.info("waiting for insufficient standby daemon warning")
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
# restart the standby, see that he becomes a standby, check health clears
self.fs.mds_restart(victim)
@ -412,8 +403,7 @@ class TestFailover(CephFSTestCase):
standbys = self.mds_cluster.get_standby_daemons()
self.assertGreaterEqual(len(standbys), 1)
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
log.info("waiting for insufficient standby daemon warning")
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
# Set it to 0
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
@ -429,7 +419,6 @@ class TestFailover(CephFSTestCase):
self.mount_a.umount_wait()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
mds_0 = self.fs.get_rank(rank=0, status=status)
@ -437,7 +426,7 @@ class TestFailover(CephFSTestCase):
self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
self.wait_until_true(
lambda: "laggy_since" in self.fs.get_rank(),
timeout=grace * 2
timeout=self.fs.beacon_timeout
)
self.fs.rank_fail(rank=1)
@ -450,7 +439,7 @@ class TestFailover(CephFSTestCase):
self.fs.rank_signal(signal.SIGCONT, rank=0)
self.wait_until_true(
lambda: "laggy_since" not in self.fs.get_rank(rank=0),
timeout=grace * 2
timeout=self.fs.beacon_timeout
)
# mds.b will be stuck at 'reconnect' state if snapserver gets confused

View File

@ -129,7 +129,7 @@ class TestForwardScrub(CephFSTestCase):
# Umount before flush to avoid cap releases putting
# things we don't want in the journal later.
self.mount_a.umount_wait()
self.fs.mds_asok(["flush", "journal"])
self.fs.flush()
# Create a new inode that's just in the log, i.e. would
# look orphaned to backward scan if backward scan wisnae
@ -163,7 +163,7 @@ class TestForwardScrub(CephFSTestCase):
# Run a tagging forward scrub
tag = "mytag123"
self.fs.mds_asok(["tag", "path", "/parent", tag])
self.fs.rank_asok(["tag", "path", "/parent", tag])
# See that the orphan wisnae tagged
self.assertUntagged(inos['./parent/flushed/bravo'])
@ -175,14 +175,21 @@ class TestForwardScrub(CephFSTestCase):
# See that journalled-but-not-flushed file *was* tagged
self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
# Run cephfs-data-scan targeting only orphans
# okay, now we are going to run cephfs-data-scan. It's necessary to
# have a clean journal otherwise replay will blowup on mismatched
# inotable versions (due to scan_links)
self.fs.flush()
self.fs.fail()
self.fs.journal_tool(["journal", "reset", "--force"], 0)
# Run cephfs-data-scan targeting only orphans
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
self.fs.data_scan([
"scan_inodes",
"--filter-tag", tag,
self.fs.get_data_pool_name()
])
self.fs.data_scan(["scan_links"])
# After in-place injection stats should be kosher again
self.fs.set_ceph_conf('mds', 'mds verify scatter', True)

View File

@ -317,3 +317,43 @@ class TestFragmentation(CephFSTestCase):
lambda: _count_fragmented() > 0,
timeout=30
)
def test_dir_merge_with_snap_items(self):
"""
That a directory remains fragmented when snapshot items are taken into account.

"""
split_size = 1000
merge_size = 100
self._configure(
mds_bal_split_size=split_size,
mds_bal_merge_size=merge_size,
mds_bal_split_bits=1
)
# split the dir
create_files = split_size + 50
self.mount_a.create_n_files("splitdir/file_", create_files)
self.wait_until_true(
lambda: self.get_splits() == 1,
timeout=30
)
frags = self.get_dir_ino("/splitdir")['dirfrags']
self.assertEqual(len(frags), 2)
self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
self.assertEqual(
sum([len(f['dentries']) for f in frags]), create_files
)
self.assertEqual(self.get_merges(), 0)
self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_a"])
self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_b"])
self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
time.sleep(30)
self.assertEqual(self.get_merges(), 0)
self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 2)

View File

@ -16,11 +16,7 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
class OverlayWorkload(object):
def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
self._orig_fs = orig_fs
self._recovery_fs = recovery_fs
self._orig_mount = orig_mount
self._recovery_mount = recovery_mount
def __init__(self):
self._initial_state = None
# Accumulate backtraces for every failed validation, and return them. Backtraces
@ -51,41 +47,40 @@ class OverlayWorkload(object):
"""
raise NotImplementedError()
def damage(self):
def damage(self, fs):
"""
Damage the filesystem pools in ways that will be interesting to recover from. By
default just wipe everything in the metadata pool
"""
pool = self._orig_fs.get_metadata_pool_name()
self._orig_fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
pool = fs.get_metadata_pool_name()
fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
def flush(self):
def flush(self, fs):
"""
Called after client unmount, after write: flush whatever you want
"""
self._orig_fs.mds_asok(["flush", "journal"])
self._recovery_fs.mds_asok(["flush", "journal"])
fs.rank_asok(["flush", "journal"])
class SimpleOverlayWorkload(OverlayWorkload):
"""
Single file, single directory, check that it gets recovered and so does its size
"""
def write(self):
self._orig_mount.run_shell(["mkdir", "subdir"])
self._orig_mount.write_n_mb("subdir/sixmegs", 6)
self._initial_state = self._orig_mount.stat("subdir/sixmegs")
def write(self, mount):
mount.run_shell(["mkdir", "subdir"])
mount.write_n_mb("subdir/sixmegs", 6)
self._initial_state = mount.stat("subdir/sixmegs")
def validate(self):
self._recovery_mount.run_shell(["ls", "subdir"])
st = self._recovery_mount.stat("subdir/sixmegs")
def validate(self, recovery_mount):
recovery_mount.run_shell(["ls", "subdir"])
st = recovery_mount.stat("subdir/sixmegs")
self.assert_equal(st['st_size'], self._initial_state['st_size'])
return self._errors
class TestRecoveryPool(CephFSTestCase):
MDSS_REQUIRED = 2
CLIENTS_REQUIRED = 2
CLIENTS_REQUIRED = 1
REQUIRE_RECOVERY_FILESYSTEM = True
def is_marked_damaged(self, rank):
@ -100,95 +95,77 @@ class TestRecoveryPool(CephFSTestCase):
# First, inject some files
workload.write()
workload.write(self.mount_a)
# Unmount the client and flush the journal: the tool should also cope with
# situations where there is dirty metadata, but we'll test that separately
self.mount_a.umount_wait()
self.mount_b.umount_wait()
workload.flush()
# Create the alternate pool if requested
recovery_fs = self.recovery_fs.name
recovery_pool = self.recovery_fs.get_metadata_pool_name()
self.recovery_fs.data_scan(['init', '--force-init',
'--filesystem', recovery_fs,
'--alternate-pool', recovery_pool])
self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
# Stop the MDS
self.fs.mds_stop() # otherwise MDS will join once the fs is reset
workload.flush(self.fs)
self.fs.fail()
# After recovery, we need the MDS to not be strict about stats (in production these options
# are off by default, but in QA we need to explicitly disable them)
# Note: these have to be written to ceph.conf to override existing ceph.conf values.
self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
self.fs.mds_restart()
# Apply any data damage the workload wants
workload.damage()
workload.damage(self.fs)
# Create the alternate pool if requested
recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
recovery_fs.create(recover=True, metadata_overlay=True)
recovery_pool = recovery_fs.get_metadata_pool_name()
recovery_fs.mon_manager.raw_cluster_cmd('-s')
# Reset the MDS map in case multiple ranks were in play: recovery procedure
# only understands how to rebuild metadata under rank 0
self.fs.reset()
self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
#self.fs.reset()
#self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
#self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
#self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
# Run the recovery procedure
recovery_fs.data_scan(['init', '--force-init',
'--filesystem', recovery_fs.name,
'--alternate-pool', recovery_pool])
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "session"])
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "snap"])
recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "inode"])
if False:
with self.assertRaises(CommandFailedError):
# Normal reset should fail when no objects are present, we'll use --force instead
self.fs.journal_tool(["journal", "reset"], 0)
self.fs.data_scan(['scan_extents', '--alternate-pool',
recovery_fs.data_scan(['scan_extents', '--alternate-pool',
recovery_pool, '--filesystem', self.fs.name,
self.fs.get_data_pool_name()])
self.fs.data_scan(['scan_inodes', '--alternate-pool',
recovery_fs.data_scan(['scan_inodes', '--alternate-pool',
recovery_pool, '--filesystem', self.fs.name,
'--force-corrupt', '--force-init',
self.fs.get_data_pool_name()])
self.fs.journal_tool(['event', 'recover_dentries', 'list',
recovery_fs.data_scan(['scan_links', '--filesystem', recovery_fs.name])
recovery_fs.journal_tool(['event', 'recover_dentries', 'list',
'--alternate-pool', recovery_pool], 0)
self.fs.data_scan(['init', '--force-init', '--filesystem',
self.fs.name])
self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
'--force-corrupt', '--force-init',
self.fs.get_data_pool_name()])
self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
self.fs.journal_tool(['journal', 'reset', '--force'], 0)
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
recovery_fs + ":0")
# Mark the MDS repaired
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
recovery_fs.journal_tool(["journal", "reset", "--force"], 0)
# Start the MDS
self.fs.mds_restart()
self.fs.set_joinable()
self.recovery_fs.mds_restart()
self.fs.wait_for_daemons()
self.recovery_fs.wait_for_daemons()
status = self.recovery_fs.status()
for rank in self.recovery_fs.get_ranks(status=status):
self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
'injectargs', '--debug-mds=20')
self.fs.rank_tell(['scrub', 'start', '/', 'recursive,repair'], rank=rank['rank'], status=status)
log.info(str(self.mds_cluster.status()))
recovery_fs.set_joinable()
status = recovery_fs.wait_for_daemons()
self.config_set('mds', 'debug_mds', '20')
for rank in recovery_fs.get_ranks(status=status):
recovery_fs.rank_tell(['scrub', 'start', '/', 'force,recursive,repair'], rank=rank['rank'], status=status)
log.info(str(recovery_fs.status()))
# Mount a client
self.mount_a.mount_wait()
self.mount_b.mount_wait(cephfs_name=recovery_fs)
self.mount_a.mount_wait(cephfs_name=recovery_fs.name)
# See that the files are present and correct
errors = workload.validate()
errors = workload.validate(self.mount_a)
if errors:
log.error("Validation errors found: {0}".format(len(errors)))
for e in errors:
@ -199,5 +176,4 @@ class TestRecoveryPool(CephFSTestCase):
))
def test_rebuild_simple(self):
self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
self.mount_a, self.mount_b))
self._rebuild_metadata(SimpleOverlayWorkload())

View File

@ -176,3 +176,12 @@ class TestScrub(CephFSTestCase):
def test_scrub_dup_inode(self):
self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))
def test_mdsdir_scrub_backtrace(self):
damage_count = self._get_damage_count()
self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks'])
out_json = self.fs.run_scrub(["start", "~mdsdir", "recursive"])
self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
self.assertEqual(self._get_damage_count(), damage_count)
self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks'])

View File

@ -139,8 +139,7 @@ done
# resume and verify
self._resume_scrub(0)
out_json = self.fs.get_scrub_status()
self.assertTrue("no active" in out_json['status'])
self.assertTrue(self.fs.wait_until_scrub_complete(sleep=5, timeout=30))
checked = self._check_task_status_na()
self.assertTrue(checked)
@ -168,15 +167,13 @@ done
# Kill the rank 0
self.fs.mds_stop(original_active)
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
def promoted():
active = self.fs.get_active_names()
return active and active[0] in original_standbys
log.info("Waiting for promotion of one of the original standbys {0}".format(
original_standbys))
self.wait_until_true(promoted, timeout=grace*2)
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
self._check_task_status_na()

View File

@ -19,7 +19,7 @@ def seconds_upto_next_schedule(time_from, timo):
ts = int(time_from)
return ((int(ts / 60) * 60) + timo) - ts
class TestSnapSchedules(CephFSTestCase):
class TestSnapSchedulesHelper(CephFSTestCase):
CLIENTS_REQUIRED = 1
TEST_VOLUME_NAME = 'snap_vol'
@ -54,7 +54,7 @@ class TestSnapSchedules(CephFSTestCase):
result = json.loads(self._fs_cmd("volume", "ls"))
if len(result) == 0:
self.vol_created = True
self.volname = TestSnapSchedules.TEST_VOLUME_NAME
self.volname = TestSnapSchedulesHelper.TEST_VOLUME_NAME
self._fs_cmd("volume", "create", self.volname)
else:
self.volname = result[0]['name']
@ -69,7 +69,7 @@ class TestSnapSchedules(CephFSTestCase):
self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True)
def setUp(self):
super(TestSnapSchedules, self).setUp()
super(TestSnapSchedulesHelper, self).setUp()
self.volname = None
self.vol_created = False
self._create_or_reuse_test_volume()
@ -84,7 +84,7 @@ class TestSnapSchedules(CephFSTestCase):
if self.vol_created:
self._delete_test_volume()
self._disable_snap_schedule()
super(TestSnapSchedules, self).tearDown()
super(TestSnapSchedulesHelper, self).tearDown()
def _schedule_to_timeout(self, schedule):
mult = schedule[-1]
@ -115,7 +115,7 @@ class TestSnapSchedules(CephFSTestCase):
def verify(self, dir_path, max_trials):
trials = 0
snap_path = "{0}/.snap".format(dir_path)
snap_path = f'{dir_path}/.snap'
while (len(self.create_cbks) or len(self.remove_cbks)) and trials < max_trials:
snapshots = set(self.mount_a.ls(path=snap_path))
added = snapshots - self.snapshots
@ -143,7 +143,7 @@ class TestSnapSchedules(CephFSTestCase):
# expected "scheduled" snapshot name
ts_name = (datetime.utcfromtimestamp(snap_sched_exec_epoch)
+ timedelta(seconds=wait_timo)).strftime(TestSnapSchedules.SNAPSHOT_TS_FORMAT)
+ timedelta(seconds=wait_timo)).strftime(TestSnapSchedulesHelper.SNAPSHOT_TS_FORMAT)
return (wait_timo, ts_name)
def verify_schedule(self, dir_path, schedules, retentions=[]):
@ -158,6 +158,7 @@ class TestSnapSchedules(CephFSTestCase):
for retention in retentions:
self.assertTrue(retention in json_res['retention'])
class TestSnapSchedules(TestSnapSchedulesHelper):
def remove_snapshots(self, dir_path):
snap_path = f'{dir_path}/.snap'
@ -351,7 +352,7 @@ class TestSnapSchedules(CephFSTestCase):
snap_path = f"{dir_path}/.snap"[1:]
snapshots = self.mount_a.ls(path=snap_path)
fs_count = len(snapshots)
log.debug(f'snapshots: {snapshots}');
log.debug(f'snapshots: {snapshots}')
result = self.fs_snap_schedule_cmd('status', path=dir_path,
format='json')
@ -446,3 +447,49 @@ class TestSnapSchedules(CephFSTestCase):
self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M')
self.remove_snapshots(testdir[1:])
self.mount_a.run_shell(['rmdir', testdir[1:]])
class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper):
def remove_snapshots(self, dir_path, sdn):
snap_path = f'{dir_path}/{sdn}'
snapshots = self.mount_a.ls(path=snap_path)
for snapshot in snapshots:
snapshot_path = os.path.join(snap_path, snapshot)
log.debug(f'removing snapshot: {snapshot_path}')
self.mount_a.run_shell(['rmdir', snapshot_path])
def get_snap_dir_name(self):
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.kernel_mount import KernelMount
if isinstance(self.mount_a, KernelMount):
sdn = self.mount_a.client_config.get('snapdirname', '.snap')
elif isinstance(self.mount_a, FuseMount):
sdn = self.mount_a.client_config.get('client_snapdir', '.snap')
self.fs.set_ceph_conf('client', 'client snapdir', sdn)
self.mount_a.remount()
return sdn
def test_snap_dir_name(self):
"""Test the correctness of snap directory name"""
self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedulesSnapdir.TEST_DIRECTORY])
# set a schedule on the dir
self.fs_snap_schedule_cmd('add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, snap_schedule='1M')
self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, retention_spec_or_period='1M')
exec_time = time.time()
timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M')
sdn = self.get_snap_dir_name()
log.info(f'expecting snap {TestSnapSchedulesSnapdir.TEST_DIRECTORY}/{sdn}/scheduled-{snap_sfx} in ~{timo}s...')
# verify snapshot schedule
self.verify_schedule(TestSnapSchedulesSnapdir.TEST_DIRECTORY, ['1M'], retentions=[{'M':1}])
# remove snapshot schedule
self.fs_snap_schedule_cmd('remove', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY)
# remove all scheduled snapshots
self.remove_snapshots(TestSnapSchedulesSnapdir.TEST_DIRECTORY, sdn)
self.mount_a.run_shell(['rmdir', TestSnapSchedulesSnapdir.TEST_DIRECTORY])

View File

@ -69,8 +69,6 @@ class TestSnapshots(CephFSTestCase):
self.fs.set_max_mds(2)
status = self.fs.wait_for_daemons()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
# setup subtrees
self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
@ -91,7 +89,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_freeze(True, rank=0)
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank0['name']);
self.fs.rank_fail(rank=0)
@ -119,7 +117,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_freeze(True, rank=1) # prevent failover...
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank0['name']);
self.fs.rank_signal(signal.SIGKILL, rank=1)
@ -167,7 +165,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_freeze(True, rank=1) # prevent failover...
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank1['name']);
self.mount_a.kill()
@ -209,7 +207,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank1['name']);
self.mount_a.kill()
@ -222,7 +220,7 @@ class TestSnapshots(CephFSTestCase):
self.wait_for_daemon_start([rank1['name']])
# rollback triggers assertion
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank0['name']);
self.fs.rank_fail(rank=0)
self.fs.mds_restart(rank0['name'])
@ -243,8 +241,6 @@ class TestSnapshots(CephFSTestCase):
self.fs.set_max_mds(3)
status = self.fs.wait_for_daemons()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
@ -301,7 +297,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_freeze(True, rank=2)
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank2['name']);
# mksnap should wait for notify ack from mds.2
@ -327,7 +323,7 @@ class TestSnapshots(CephFSTestCase):
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
last_created = self._get_last_created_snap(rank=0)
proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
self.delete_mds_coredump(rank2['name']);
self.mount_a.kill()
@ -537,3 +533,62 @@ class TestSnapshots(CephFSTestCase):
# after reducing limit we expect the new snapshot creation to fail
pass
self.delete_dir_and_snaps("accounts", new_limit + 1)
class TestMonSnapsAndFsPools(CephFSTestCase):
MDSS_REQUIRED = 3
def test_disallow_monitor_managed_snaps_for_fs_pools(self):
"""
Test that creation of monitor managed snaps fails for pools attached
to any file-system
"""
with self.assertRaises(CommandFailedError):
self.fs.rados(["mksnap", "snap1"], pool=self.fs.get_data_pool_name())
with self.assertRaises(CommandFailedError):
self.fs.rados(["mksnap", "snap2"], pool=self.fs.get_metadata_pool_name())
def test_attaching_pools_with_snaps_to_fs_fails(self):
"""
Test that attempt to attach pool with snapshots to an fs fails
"""
test_pool_name = 'snap-test-pool'
base_cmd = f'osd pool create {test_pool_name}'
ret = self.run_cluster_cmd_result(base_cmd)
self.assertEqual(ret, 0)
self.fs.rados(["mksnap", "snap3"], pool=test_pool_name)
base_cmd = f'fs add_data_pool {self.fs.name} {test_pool_name}'
ret = self.run_cluster_cmd_result(base_cmd)
self.assertEqual(ret, errno.EOPNOTSUPP)
# cleanup
self.fs.rados(["rmsnap", "snap3"], pool=test_pool_name)
base_cmd = f'osd pool delete {test_pool_name}'
ret = self.run_cluster_cmd_result(base_cmd)
def test_using_pool_with_snap_fails_fs_creation(self):
"""
Test that using a pool with snaps for fs creation fails
"""
base_cmd = 'osd pool create test_data_pool'
ret = self.run_cluster_cmd_result(base_cmd)
self.assertEqual(ret, 0)
base_cmd = 'osd pool create test_metadata_pool'
ret = self.run_cluster_cmd_result(base_cmd)
self.assertEqual(ret, 0)
self.fs.rados(["mksnap", "snap4"], pool='test_data_pool')
base_cmd = 'fs new testfs test_metadata_pool test_data_pool'
ret = self.run_cluster_cmd_result(base_cmd)
self.assertEqual(ret, errno.EOPNOTSUPP)
# cleanup
self.fs.rados(["rmsnap", "snap4"], pool='test_data_pool')
base_cmd = 'osd pool delete test_data_pool'
ret = self.run_cluster_cmd_result(base_cmd)
base_cmd = 'osd pool delete test_metadata_pool'
ret = self.run_cluster_cmd_result(base_cmd)

View File

@ -9,7 +9,7 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
FSID='00000000-0000-0000-0000-0000deadbeef'
# images that are used
IMAGE_MASTER=${IMAGE_MASTER:-'quay.ceph.io/ceph-ci/ceph:master'}
IMAGE_MAIN=${IMAGE_MAIN:-'quay.ceph.io/ceph-ci/ceph:main'}
IMAGE_PACIFIC=${IMAGE_PACIFIC:-'quay.ceph.io/ceph-ci/ceph:pacific'}
#IMAGE_OCTOPUS=${IMAGE_OCTOPUS:-'quay.ceph.io/ceph-ci/ceph:octopus'}
IMAGE_DEFAULT=${IMAGE_PACIFIC}
@ -168,7 +168,7 @@ $SUDO CEPHADM_IMAGE=$IMAGE_PACIFIC $CEPHADM_BIN version \
#$SUDO CEPHADM_IMAGE=$IMAGE_OCTOPUS $CEPHADM_BIN version
#$SUDO CEPHADM_IMAGE=$IMAGE_OCTOPUS $CEPHADM_BIN version \
# | grep 'ceph version 15'
$SUDO $CEPHADM_BIN --image $IMAGE_MASTER version | grep 'ceph version'
$SUDO $CEPHADM_BIN --image $IMAGE_MAIN version | grep 'ceph version'
# try force docker; this won't work if docker isn't installed
systemctl status docker > /dev/null && ( $CEPHADM --docker version | grep 'ceph version' ) || echo "docker not installed"

View File

@ -24,6 +24,11 @@ for f in $(find $TESTDIR/archive/coredump -type f); do
fi
done
# ceph-crash runs as the unprivileged "ceph" user, but when under test
# the ceph osd daemons are running as root, so their crash files aren't
# readable. let's chown them so they behave as they would in real life.
sudo chown -R ceph:ceph /var/lib/ceph/crash
# let daemon find crashdumps on startup
sudo systemctl restart ceph-crash
sleep 30

View File

@ -387,4 +387,49 @@ if [ -n "${COOKIE}" ]; then
unmap_device ${DEV} ${PID}
fi
# test discard granularity with journaling
rbd config image set ${POOL}/${IMAGE} rbd_discard_granularity_bytes 4096
rbd feature enable ${POOL}/${IMAGE} journaling
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
get_pid ${POOL}
# since a discard will now be pruned to only whole blocks (0..4095, 4096..8191)
# let us test all the cases around those alignments. 512 is the smallest
# possible block blkdiscard allows us to use. Thus the test checks
# 512 before, on the alignment, 512 after.
_sudo blkdiscard --offset 0 --length $((4096-512)) ${DEV}
_sudo blkdiscard --offset 0 --length 4096 ${DEV}
_sudo blkdiscard --offset 0 --length $((4096+512)) ${DEV}
_sudo blkdiscard --offset 512 --length $((8192-1024)) ${DEV}
_sudo blkdiscard --offset 512 --length $((8192-512)) ${DEV}
_sudo blkdiscard --offset 512 --length 8192 ${DEV}
# wait for commit log to be empty, 10 seconds should be well enough
tries=0
queue_length=`rbd journal inspect --pool ${POOL} --image ${IMAGE} | awk '/entries inspected/ {print $1}'`
while [ ${tries} -lt 10 ] && [ ${queue_length} -gt 0 ]; do
rbd journal inspect --pool ${POOL} --image ${IMAGE} --verbose
sleep 1
queue_length=`rbd journal inspect --pool ${POOL} --image ${IMAGE} | awk '/entries inspected/ {print $1}'`
tries=$((tries+1))
done
[ ${queue_length} -eq 0 ]
unmap_device ${DEV} ${PID}
DEV=
rbd feature disable ${POOL}/${IMAGE} journaling
rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes
# test that rbd_op_threads setting takes effect
EXPECTED=`ceph-conf --show-config-value librados_thread_count`
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
get_pid ${POOL}
ACTUAL=`ps -p ${PID} -T | grep -c io_context_pool`
[ ${ACTUAL} -eq ${EXPECTED} ]
unmap_device ${DEV} ${PID}
EXPECTED=$((EXPECTED * 3 + 1))
DEV=`_sudo rbd device --device-type nbd --rbd-op-threads ${EXPECTED} map ${POOL}/${IMAGE}`
get_pid ${POOL}
ACTUAL=`ps -p ${PID} -T | grep -c io_context_pool`
[ ${ACTUAL} -eq ${EXPECTED} ]
unmap_device ${DEV} ${PID}
DEV=
echo OK

View File

@ -24,8 +24,13 @@ start_mirrors ${CLUSTER1}
start_mirrors ${CLUSTER2}
testlog "TEST: verify rx-only direction"
[ "$(rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format xml |
${XMLSTARLET} sel -t -v '//mirror/peers/peer[1]/uuid')" = "" ]
# rx-only peer is added immediately by "rbd mirror pool peer bootstrap import"
rbd --cluster ${CLUSTER2} --pool ${POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-only"'
# tx-only peer is added asynchronously by mirror_peer_ping class method
while ! rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format json | jq -e '.peers | length > 0'; do
sleep 1
done
rbd --cluster ${CLUSTER1} --pool ${POOL} mirror pool info --format json | jq -e '.peers[0].direction == "tx-only"'
create_image_and_enable_mirror ${CLUSTER1} ${POOL} image1
@ -34,6 +39,10 @@ write_image ${CLUSTER1} ${POOL} image1 100
wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} image1
testlog "TEST: verify rx-tx direction"
# both rx-tx peers are added immediately by "rbd mirror pool peer bootstrap import"
rbd --cluster ${CLUSTER1} --pool ${PARENT_POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-tx"'
rbd --cluster ${CLUSTER2} --pool ${PARENT_POOL} mirror pool info --format json | jq -e '.peers[0].direction == "rx-tx"'
create_image ${CLUSTER1} ${PARENT_POOL} image1
create_image ${CLUSTER2} ${PARENT_POOL} image2

View File

@ -1,2 +1,2 @@
3cf40e2dca667f68c6ce3ff5cd94f01e711af894
16.2.11
5a2d516ce4b134bfafc80c4274532ac0d56fc1e2
16.2.12

View File

@ -3,8 +3,10 @@
# vim: ts=4 sw=4 smarttab expandtab
import argparse
import grp
import logging
import os
import pwd
import signal
import socket
import subprocess
@ -18,6 +20,7 @@ auth_names = ['client.crash.%s' % socket.gethostname(),
'client.crash',
'client.admin']
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
@ -29,7 +32,8 @@ def parse_args():
)
parser.add_argument(
'--name', '-n',
help='ceph name to authenticate as (default: try client.crash, client.admin)')
help='ceph name to authenticate as '
'(default: try client.crash, client.admin)')
parser.add_argument(
'--log-level', '-l',
help='log level output (default: INFO), support INFO or DEBUG')
@ -48,7 +52,8 @@ def post_crash(path):
stderr=subprocess.PIPE,
)
f = open(os.path.join(path, 'meta'), 'rb')
stderr = pr.communicate(input=f.read())
(_, stderr) = pr.communicate(input=f.read())
stderr = stderr.decode()
rc = pr.wait()
f.close()
if rc != 0 or stderr != "":
@ -61,6 +66,9 @@ def post_crash(path):
def scrape_path(path):
for p in os.listdir(path):
crashpath = os.path.join(path, p)
if not os.access(crashpath, os.R_OK):
log.warning('unable to read crash path %s' % (crashpath))
continue
metapath = os.path.join(crashpath, 'meta')
donepath = os.path.join(crashpath, 'done')
if os.path.isfile(metapath):
@ -79,12 +87,31 @@ def scrape_path(path):
(metapath, p, os.path.join('posted/', p))
)
def handler(signum):
def handler(signum, frame):
print('*** Interrupted with signal %d ***' % signum)
sys.exit(0)
def drop_privs():
if os.getuid() == 0:
try:
ceph_uid = pwd.getpwnam("ceph").pw_uid
ceph_gid = grp.getgrnam("ceph").gr_gid
os.setgroups([])
os.setgid(ceph_gid)
os.setuid(ceph_uid)
except Exception as e:
log.error(f"Unable to drop privileges: {e}")
sys.exit(1)
def main():
global auth_names
# run as unprivileged ceph user
drop_privs()
# exit code 0 on SIGINT, SIGTERM
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
@ -103,7 +130,10 @@ def main():
log.info("monitoring path %s, delay %ds" % (args.path, args.delay * 60.0))
while True:
scrape_path(args.path)
try:
scrape_path(args.path)
except Exception as e:
log.error(f"Error scraping {args.path}: {e}")
if args.delay == 0:
sys.exit(0)
time.sleep(args.delay * 60)

View File

@ -794,7 +794,7 @@ def get_all_devices_vgs(name_prefix=''):
verbose_on_failure=False
)
vgs = _output_parser(stdout, vg_fields)
return [VolumeGroup(**vg) for vg in vgs]
return [VolumeGroup(**vg) for vg in vgs if vg['vg_name']]
#################################
#

View File

@ -114,16 +114,23 @@ def get_physical_fast_allocs(devices, type_, fast_slots_per_device, new_osds, ar
ret = []
vg_device_map = group_devices_by_vg(devices)
for vg_devices in vg_device_map.values():
for vg_name, vg_devices in vg_device_map.items():
for dev in vg_devices:
if not dev.available_lvm:
continue
# any LV present is considered a taken slot
occupied_slots = len(dev.lvs)
# prior to v15.2.8, db/wal deployments were grouping multiple fast devices into single VGs - we need to
# multiply requested_slots (per device) by the number of devices in the VG in order to ensure that
# abs_size is calculated correctly from vg_size
if vg_name == 'unused_devices':
slots_for_vg = requested_slots
else:
slots_for_vg = len(vg_devices) * requested_slots
dev_size = dev.vg_size[0]
# this only looks at the first vg on device, unsure if there is a better
# way
abs_size = disk.Size(b=int(dev_size / requested_slots))
abs_size = disk.Size(b=int(dev_size / slots_for_vg))
free_size = dev.vg_free[0]
relative_size = int(abs_size) / dev_size
if requested_size:
@ -149,7 +156,6 @@ def group_devices_by_vg(devices):
result['unused_devices'] = []
for dev in devices:
if len(dev.vgs) > 0:
# already using assumption that a PV only belongs to single VG in other places
vg_name = dev.vgs[0].name
if vg_name in result:
result[vg_name].append(dev)

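A worked example of the slot calculation above, with made-up sizes: two 1 TiB fast devices that landed in a single pre-v15.2.8 style VG, and two DB slots requested per device. The disk.Size helper is the one imported by this module.

    from ceph_volume.util import disk

    requested_slots = 2                        # db slots wanted per fast device
    vg_devices = ['/dev/sdx', '/dev/sdy']      # both PVs belong to the same shared VG
    vg_size = 2 * 1024**4                      # the VG spans both devices: 2 TiB

    slots_for_vg = len(vg_devices) * requested_slots      # 4, not 2
    abs_size = disk.Size(b=int(vg_size / slots_for_vg))   # 512 GiB per slot
    # dividing by requested_slots alone would have yielded 1 TiB per slot and
    # over-allocated the shared VG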
View File

@ -101,16 +101,16 @@ class List(object):
'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e)))
continue
bs_info = _get_bluestore_info(dev)
if bs_info is None:
# None is also returned in the rare event that there is an issue reading info from
# a BlueStore disk, so be sure to log our assumption that it isn't bluestore
logger.info('device {} does not have BlueStore information'.format(dev))
continue
uuid = bs_info['osd_uuid']
if uuid not in result:
result[uuid] = {}
result[uuid].update(bs_info)
bs_info = _get_bluestore_info(dev)
if bs_info is None:
# None is also returned in the rare event that there is an issue reading info from
# a BlueStore disk, so be sure to log our assumption that it isn't bluestore
logger.info('device {} does not have BlueStore information'.format(dev))
continue
uuid = bs_info['osd_uuid']
if uuid not in result:
result[uuid] = {}
result[uuid].update(bs_info)
return result

Some files were not shown because too many files have changed in this diff