import ceph pacific 16.2.15 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
Thomas Lamprecht 2024-03-04 16:44:57 +01:00
parent ca55da0300
commit 47fdce5df8
405 changed files with 9800 additions and 3148 deletions

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
# remove cmake/modules/FindPython* once 3.12 is required
project(ceph
VERSION 16.2.14
VERSION 16.2.15
LANGUAGES CXX C ASM)
foreach(policy

View File

@ -32,6 +32,29 @@
in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
the restored file system is expected to have the same ID as before.
>=16.2.15
----------
* `ceph config dump --format <json|xml>` output will display the localized
option names instead of their normalized versions. For example,
"mgr/prometheus/x/server_port" will be displayed instead of
"mgr/prometheus/server_port". This matches the output of the non-pretty-printed
version of the command.
* CEPHFS: The MDS now evicts clients that are not advancing their request tids. Such
clients cause a large buildup of session metadata, which can result in the MDS going
read-only when the RADOS operation that persists it exceeds the size threshold. The
`mds_session_metadata_threshold` config option controls the maximum size that the
(encoded) session metadata can grow to; see the example after this list.
* RADOS: The `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated
because it is prone to false negative results. Its safer replacement is
`pool_is_in_selfmanaged_snaps_mode`.
* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in
fast-diff mode (`whole_object == true` with `fast-diff` image feature enabled
and valid), diff-iterate is now guaranteed to execute locally if exclusive
lock is available. This brings a dramatic performance improvement for QEMU
live disk synchronization and backup use cases.
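As an illustration of the session metadata limit, the option can be inspected and
raised like this (the value shown is an arbitrary example, not a recommended
setting)::

    ceph config get mds mds_session_metadata_threshold
    ceph config set mds mds_session_metadata_threshold 134217728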
>= 16.2.14
----------
@ -132,6 +155,10 @@
* CEPHFS: After recovering a Ceph File System by following the disaster recovery
procedure, the recovered files under the `lost+found` directory can now be deleted.
* core: cache-tiering is now deprecated.
* mgr/snap_schedule: The snap-schedule mgr module now retains one snapshot fewer
than the number specified by the config tunable `mds_max_snaps_per_dir`, so that
a new snapshot can be created and retained during the next scheduled run (see the
example after this list).
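For illustration, the interaction between the tunable and snapshot retention can be
checked as follows (the path is a placeholder)::

    # defaults to 100, so at most 99 scheduled snapshots are retained
    ceph config get mds mds_max_snaps_per_dir
    # keep 24 hourly snapshots for a directory
    ceph fs snap-schedule retention add /volumes/group/dir 24h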
>=16.2.8
--------

View File

@ -1,4 +1,4 @@
Sphinx == 4.4.0
Sphinx == 5.0.2
git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
breathe >= 4.20.0
Jinja2

View File

@ -135,7 +135,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 16.2.14
Version: 16.2.15
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-16.2.15.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-16.2.14
%autosetup -p1 -n ceph-16.2.15
%build
# Disable lto on systems that do not support symver attribute

View File

@ -1,7 +1,13 @@
ceph (16.2.14-1focal) focal; urgency=medium
ceph (16.2.15-1focal) focal; urgency=medium
-- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com> Tue, 29 Aug 2023 16:38:35 +0000
-- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Mon, 26 Feb 2024 19:34:01 +0000
ceph (16.2.15-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Mon, 26 Feb 2024 19:21:07 +0000
ceph (16.2.14-1) stable; urgency=medium

View File

@ -56,12 +56,13 @@ function(build_rocksdb)
endif()
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-Wno-deprecated-copy" HAS_WARNING_DEPRECATED_COPY)
set(rocksdb_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
if(HAS_WARNING_DEPRECATED_COPY)
set(rocksdb_CXX_FLAGS -Wno-deprecated-copy)
string(APPEND rocksdb_CXX_FLAGS " -Wno-deprecated-copy")
endif()
check_cxx_compiler_flag("-Wno-pessimizing-move" HAS_WARNING_PESSIMIZING_MOVE)
if(HAS_WARNING_PESSIMIZING_MOVE)
set(rocksdb_CXX_FLAGS "${rocksdb_CXX_FLAGS} -Wno-pessimizing-move")
string(APPEND rocksdb_CXX_FLAGS " -Wno-pessimizing-move")
endif()
if(rocksdb_CXX_FLAGS)
list(APPEND rocksdb_CMAKE_ARGS -DCMAKE_CXX_FLAGS='${rocksdb_CXX_FLAGS}')

View File

@ -15,7 +15,7 @@ creation of multiple file systems use ``ceph fs flag set enable_multiple true``.
::
fs new <file system name> <metadata pool name> <data pool name>
ceph fs new <file system name> <metadata pool name> <data pool name>
This command creates a new file system. The file system name and metadata pool
name are self-explanatory. The specified data pool is the default data pool and
@ -25,13 +25,13 @@ to accommodate the new file system.
::
fs ls
ceph fs ls
List all file systems by name.
::
fs dump [epoch]
ceph fs dump [epoch]
This dumps the FSMap at the given epoch (default: current) which includes all
file system settings, MDS daemons and the ranks they hold, and the list of
@ -40,7 +40,7 @@ standby MDS daemons.
::
fs rm <file system name> [--yes-i-really-mean-it]
ceph fs rm <file system name> [--yes-i-really-mean-it]
Destroy a CephFS file system. This wipes information about the state of the
file system from the FSMap. The metadata pool and data pools are untouched and
@ -48,28 +48,28 @@ must be destroyed separately.
::
fs get <file system name>
ceph fs get <file system name>
Get information about the named file system, including settings and ranks. This
is a subset of the same information from the ``fs dump`` command.
is a subset of the same information from the ``ceph fs dump`` command.
::
fs set <file system name> <var> <val>
ceph fs set <file system name> <var> <val>
Change a setting on a file system. These settings are specific to the named
file system and do not affect other file systems.
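For example, to allow two active MDS daemons on a file system (the file system name
``cephfs`` is a placeholder)::

    ceph fs set cephfs max_mds 2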
::
fs add_data_pool <file system name> <pool name/id>
ceph fs add_data_pool <file system name> <pool name/id>
Add a data pool to the file system. This pool can be used for file layouts
as an alternate location to store file data.
::
fs rm_data_pool <file system name> <pool name/id>
ceph fs rm_data_pool <file system name> <pool name/id>
This command removes the specified pool from the list of data pools for the
file system. If any files have layouts for the removed data pool, the file
@ -82,7 +82,7 @@ Settings
::
fs set <fs name> max_file_size <size in bytes>
ceph fs set <fs name> max_file_size <size in bytes>
CephFS has a configurable maximum file size, and it's 1TB by default.
You may wish to set this limit higher if you expect to store large files
@ -116,13 +116,13 @@ Taking a CephFS cluster down is done by setting the down flag:
::
fs set <fs_name> down true
ceph fs set <fs_name> down true
To bring the cluster back online:
::
fs set <fs_name> down false
ceph fs set <fs_name> down false
This will also restore the previous value of max_mds. MDS daemons are brought
down in a way such that journals are flushed to the metadata pool and all
@ -133,11 +133,11 @@ Taking the cluster down rapidly for deletion or disaster recovery
-----------------------------------------------------------------
To allow rapidly deleting a file system (for testing) or to quickly bring the
file system and MDS daemons down, use the ``fs fail`` command:
file system and MDS daemons down, use the ``ceph fs fail`` command:
::
fs fail <fs_name>
ceph fs fail <fs_name>
This command sets a file system flag to prevent standbys from
activating on the file system (the ``joinable`` flag).
@ -146,7 +146,7 @@ This process can also be done manually by doing the following:
::
fs set <fs_name> joinable false
ceph fs set <fs_name> joinable false
Then the operator can fail all of the ranks which causes the MDS daemons to
respawn as standbys. The file system will be left in a degraded state.
@ -154,7 +154,7 @@ respawn as standbys. The file system will be left in a degraded state.
::
# For all ranks, 0-N:
mds fail <fs_name>:<n>
ceph mds fail <fs_name>:<n>
Once all ranks are inactive, the file system may also be deleted or left in
this state for other purposes (perhaps disaster recovery).
@ -163,7 +163,7 @@ To bring the cluster back up, simply set the joinable flag:
::
fs set <fs_name> joinable true
ceph fs set <fs_name> joinable true
Daemons
@ -182,34 +182,35 @@ Commands to manipulate MDS daemons:
::
mds fail <gid/name/role>
ceph mds fail <gid/name/role>
Mark an MDS daemon as failed. This is equivalent to what the cluster
would do if an MDS daemon had failed to send a message to the mon
for ``mds_beacon_grace`` seconds. If the daemon was active and a suitable
standby is available, using ``mds fail`` will force a failover to the standby.
standby is available, using ``ceph mds fail`` will force a failover to the
standby.
If the MDS daemon was in reality still running, then using ``mds fail``
If the MDS daemon was in reality still running, then using ``ceph mds fail``
will cause the daemon to restart. If it was active and a standby was
available, then the "failed" daemon will return as a standby.
::
tell mds.<daemon name> command ...
ceph tell mds.<daemon name> command ...
Send a command to the MDS daemon(s). Use ``mds.*`` to send a command to all
daemons. Use ``ceph tell mds.* help`` to learn available commands.
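For example, to list the client sessions held by one daemon (the daemon name ``a``
is a placeholder)::

    ceph tell mds.a session ls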
::
mds metadata <gid/name/role>
ceph mds metadata <gid/name/role>
Get metadata about the given MDS known to the Monitors.
::
mds repaired <role>
ceph mds repaired <role>
Mark the file system rank as repaired. Unlike what the name suggests, this command
does not change an MDS; it manipulates the file system rank which has been
@ -228,14 +229,14 @@ Commands to manipulate required client features of a file system:
::
fs required_client_features <fs name> add reply_encoding
fs required_client_features <fs name> rm reply_encoding
ceph fs required_client_features <fs name> add reply_encoding
ceph fs required_client_features <fs name> rm reply_encoding
To list all CephFS features
::
fs feature ls
ceph fs feature ls
Clients that are missing newly added features will be evicted automatically.
@ -330,7 +331,7 @@ Global settings
::
fs flag set <flag name> <flag val> [<confirmation string>]
ceph fs flag set <flag name> <flag val> [<confirmation string>]
Sets a global CephFS flag (i.e. not specific to a particular file system).
Currently, the only flag setting is 'enable_multiple' which allows having
@ -352,13 +353,13 @@ file system.
::
mds rmfailed
ceph mds rmfailed
This removes a rank from the failed set.
::
fs reset <file system name>
ceph fs reset <file system name>
This command resets the file system state to defaults, except for the name and
pools. Non-zero ranks are saved in the stopped set.
@ -366,7 +367,7 @@ pools. Non-zero ranks are saved in the stopped set.
::
fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
ceph fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
This command creates a file system with a specific **fscid** (file system cluster ID).
You may want to do this when an application expects the file system's ID to be

View File

@ -37,7 +37,7 @@ Options :
.. code:: bash
[build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/cephfs-shell
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell
Commands
========

View File

@ -24,6 +24,16 @@ that directory.
To restrict clients to only mount and work within a certain directory, use
path-based MDS authentication capabilities.
Note that this restriction *only* impacts the filesystem hierarchy -- the metadata
tree managed by the MDS. Clients will still be able to access the underlying
file data in RADOS directly. To segregate clients fully, you must also isolate
untrusted clients in their own RADOS namespace. You can place a client's
filesystem subtree in a particular namespace using `file layouts`_ and then
restrict their RADOS access to that namespace using `OSD capabilities`_.
.. _file layouts: ./file-layouts
.. _OSD capabilities: ../rados/operations/user-management/#authorization-capabilities
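A minimal sketch of that combination, with placeholder pool, namespace, directory,
and client names::

    # pin the directory's file data to a RADOS namespace via its layout
    setfattr -n ceph.dir.layout.pool_namespace -v fsdata /mnt/cephfs/untrusted
    # limit the client's OSD access to that namespace
    ceph auth caps client.untrusted \
        mds 'allow rw path=/untrusted' \
        mon 'allow r' \
        osd 'allow rw pool=cephfs_data namespace=fsdata'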
Syntax
------

View File

@ -38,6 +38,13 @@ below). By default
the start time is last midnight. So when a snapshot schedule with repeat
interval `1h` is added at 13:50
with the default start time, the first snapshot will be taken at 14:00.
The time zone is assumed to be UTC if none is explicitly included in the string.
An explicit time zone will be mapped to UTC at execution.
The start time must be in ISO8601 format. Examples below:
UTC: 2022-08-08T05:30:00 i.e. 5:30 AM UTC, without explicit time zone offset
IDT: 2022-08-08T09:00:00+03:00 i.e. 6:00 AM UTC
EDT: 2022-08-08T05:30:00-04:00 i.e. 9:30 AM UTC
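For example, an hourly schedule starting at the first timestamp above could be added
like this (the path is a placeholder)::

    ceph fs snap-schedule add /volumes/group/dir 1h 2022-08-08T05:30:00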
Retention specifications are identified by path and the retention spec itself. A
retention spec consists of either a number and a time period separated by a
@ -155,6 +162,11 @@ Examples::
snapshot creation is accounted for in the "created_count" field, which is a
cumulative count of the total number of snapshots created so far.
.. note:: The maximum number of snapshots to retain per directory is limited by the
   config tunable `mds_max_snaps_per_dir`. This tunable defaults to 100.
   To ensure a new snapshot can be created, one snapshot less than this will be
   retained. So by default, a maximum of 99 snapshots will be retained.
Active and inactive schedules
-----------------------------
Snapshot schedules can be added for a path that doesn't exist yet in the

View File

@ -60,6 +60,8 @@ Possible -op commands::
* meta-list
* get-osdmap
* set-osdmap
* get-superblock
* set-superblock
* get-inc-osdmap
* set-inc-osdmap
* mark-complete
@ -414,7 +416,7 @@ Options
.. option:: --op arg
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
.. option:: --epoch arg
@ -422,7 +424,7 @@ Options
.. option:: --file arg
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap or set-inc-osdmap
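A hypothetical invocation of the new superblock operations, with placeholder paths
(the OSD must be stopped, as for the other operations)::

    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --op get-superblock --file /tmp/osd.0.superblock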
.. option:: --mon-store-path arg

View File

@ -1314,7 +1314,7 @@ Subcommand ``cache-mode`` specifies the caching mode for cache tier <pool>.
Usage::
ceph osd tier cache-mode <poolname> writeback|readproxy|readonly|none
ceph osd tier cache-mode <poolname> writeback|proxy|readproxy|readonly|none
Subcommand ``remove`` removes the tier <tierpool> (the second one) from base pool
<pool> (the first one).

View File

@ -264,8 +264,8 @@ Pool specific commands
:command:`append` *name* *infile*
Append object name to the cluster with contents from infile.
:command:`rm` *name*
Remove object name.
:command:`rm` [--force-full] *name* ...
Remove the named object(s). With ``--force-full``, objects are removed even when the cluster is marked full.
:command:`listwatchers` *name*
List the watchers of object name.

View File

@ -333,7 +333,7 @@ OSD and run the following command:
ceph-bluestore-tool \
--path <data path> \
--sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
reshard

View File

@ -109,17 +109,6 @@ Async messenger options
:Default: ``3``
``ms_async_max_op_threads``
:Description: Maximum number of worker threads used by each Async Messenger instance.
Set to lower values when your machine has limited CPU count, and increase
when your CPUs are underutilized (i. e. one or more of CPUs are
constantly on 100% load during I/O operations).
:Type: 64-bit Unsigned Integer
:Required: No
:Default: ``5``
``ms_async_send_inline``
:Description: Send messages directly from the thread that generated them instead of
@ -129,5 +118,3 @@ Async messenger options
:Type: Boolean
:Required: No
:Default: ``false``

View File

@ -4,12 +4,41 @@
.. index:: pools; configuration
Ceph uses default values to determine how many placement groups (PGs) will be
assigned to each pool. We recommend overriding some of the defaults.
Specifically, we recommend setting a pool's replica size and overriding the
default number of placement groups. You can set these values when running
`pool`_ commands. You can also override the defaults by adding new ones in the
``[global]`` section of your Ceph configuration file.
The number of placement groups that the CRUSH algorithm assigns to each pool is
determined by the values of variables in the centralized configuration database
in the monitor cluster.
Both containerized deployments of Ceph (deployments made using ``cephadm`` or
Rook) and non-containerized deployments of Ceph rely on the values in the
central configuration database in the monitor cluster to assign placement
groups to pools.
Example Commands
----------------
To see the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
.. prompt:: bash
ceph config get osd osd_pool_default_pg_num
To set the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
.. prompt:: bash
ceph config set osd osd_pool_default_pg_num <value>
Manual Tuning
-------------
In some cases, it might be advisable to override some of the defaults. For
example, you might determine that it is wise to set a pool's replica size and
to override the default number of placement groups in the pool. You can set
these values when running `pool`_ commands.
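For example, with a placeholder pool name and illustrative values:

.. prompt:: bash

   ceph osd pool set mypool size 3
   ceph osd pool set mypool pg_num 128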
See Also
--------
See :ref:`pg-autoscaler`.
.. literalinclude:: pool-pg.conf

View File

@ -1404,6 +1404,31 @@ other performance issue with the OSDs.
The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
field of ``ceph pg ls -f json-detail``.
Stretch Mode
------------
INCORRECT_NUM_BUCKETS_STRETCH_MODE
__________________________________
Stretch mode currently only supports two dividing buckets containing OSDs. This warning
indicates that the number of dividing buckets is not equal to two after stretch mode has
been enabled. You can expect unpredictable failures and MON assertions until the condition
is fixed. We encourage you to fix this by removing the additional dividing buckets or by
bumping the number of dividing buckets to two.
UNEVEN_WEIGHTS_STRETCH_MODE
___________________________
The two dividing buckets must have equal weights when stretch mode is enabled.
This warning indicates that the two dividing buckets have uneven weights after
stretch mode has been enabled. This is not immediately fatal; however, you can expect
Ceph to become confused when trying to process transitions between dividing buckets.
We encourage you to fix this by making the weights equal on both dividing buckets.
This can be done by making sure the combined weight of the OSDs in each dividing
bucket is the same, as shown in the sketch below.
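A minimal sketch of evening out the weights, with placeholder OSD names and an
arbitrary weight value::

    # compare the total weights of the two dividing buckets
    ceph osd tree
    # adjust CRUSH weights until both buckets carry the same total
    ceph osd crush reweight osd.3 0.0900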
Miscellaneous
-------------

View File

@ -127,6 +127,14 @@ Options
:Type: Integer
:Default: ``65000``
``max_header_size``
:Description: The maximum number of header bytes available for a single request.
:Type: Integer
:Default: ``16384``
:Maximum: ``65536``
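If the limit needs to be raised, a hypothetical way to do so is through the frontend
configuration string, assuming ``max_header_size`` is passed like the other frontend
options (the port and value are placeholders)::

    ceph config set client.rgw rgw_frontends "beast port=8080 max_header_size=32768"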
Civetweb
========

View File

@ -55,7 +55,7 @@ download_from() {
exit
fi
url=$url_base/$fname
wget -c --no-verbose -O $fname $url
wget --no-verbose -O $fname $url
if [ $? != 0 -o ! -e $fname ]; then
echo "Download of $url failed"
elif [ $(sha256sum $fname | awk '{print $1}') != $sha256 ]; then
@ -183,8 +183,7 @@ download_boost $boost_version 4eb3b8d442b426dc35346235c8733b5ae35ba431690e38c6a8
https://boostorg.jfrog.io/artifactory/main/release/$boost_version/source \
https://downloads.sourceforge.net/project/boost/boost/$boost_version \
https://download.ceph.com/qa
download_liburing 0.7 8e2842cfe947f3a443af301bdd6d034455536c38a455c7a700d0c1ad165a7543 \
https://github.com/axboe/liburing/archive \
download_liburing 0.7 05d0cf8493d573c76b11abfcf34aabc7153affebe17ff95f9ae88b0de062a59d \
https://git.kernel.dk/cgit/liburing/snapshot
pmdk_version=1.10
download_pmdk $pmdk_version 08dafcf94db5ac13fac9139c92225d9aa5f3724ea74beee4e6ca19a01a2eb20c \

View File

@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet';
$.graphPanelSchema({},
title,
description,
'null',
'null as zero',
false,
formatY1,
'short',

View File

@ -133,7 +133,7 @@ local u = import 'utils.libsonnet';
$.graphPanelSchema({},
title,
'',
'null',
'null as zero',
false,
formatY1,
'short',

View File

@ -140,7 +140,7 @@ local u = import 'utils.libsonnet';
{},
title,
description,
'null',
'null as zero',
false,
formatY1,
formatY2,
@ -658,7 +658,7 @@ local u = import 'utils.libsonnet';
$.graphPanelSchema(aliasColors,
title,
description,
'null',
'null as zero',
false,
formatY1,
formatY2,

View File

@ -87,7 +87,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -185,7 +185,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -283,7 +283,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -400,7 +400,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -498,7 +498,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -596,7 +596,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -93,7 +93,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -186,7 +186,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -285,7 +285,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -87,7 +87,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -180,7 +180,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -266,7 +266,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -352,7 +352,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -445,7 +445,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -531,7 +531,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -636,7 +636,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -754,7 +754,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -893,7 +893,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -1000,7 +1000,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -80,7 +80,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -173,7 +173,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -266,7 +266,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -518,7 +518,7 @@ groups:
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
summary: "Pool growth rate may soon exceed capacity"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"

View File

@ -1499,35 +1499,44 @@ tests:
# trigger percent full prediction on pools 1 and 2 only
- interval: 12h
input_series:
- series: 'ceph_pool_percent_used{pool_id="1"}'
values: '70 75 80 87 92'
- series: 'ceph_pool_percent_used{pool_id="2"}'
values: '22 22 23 23 24'
- series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
- series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
- series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
values: '78 89 79 98 78'
- series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
values: '22 22 23 23 24'
- series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: |
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
group_right ceph_pool_metadata) >= 95
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
group_right() ceph_pool_metadata) >= 95
eval_time: 36h
exp_samples:
- labels: '{name="rbd",pool_id="1",type="replicated"}'
value: 1.424E+02 # 142%
- labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
value: 1.435E+02 # 142%
alert_rule_test:
- eval_time: 48h
alertname: CephPoolGrowthWarning
exp_alerts:
- exp_labels:
name: rbd
instance: 8090
name: default.rgw.index
pool_id: 1
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
exp_annotations:
summary: Pool growth rate may soon exceed capacity
description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
- interval: 1m
input_series:
- series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'

View File

@ -3,6 +3,7 @@ overrides:
conf:
mds:
debug mds: 20
debug mds balancer: 20
debug ms: 1
mds debug frag: true
mds debug scatterstat: true

View File

@ -2,7 +2,10 @@ overrides:
ceph:
log-ignorelist:
- overall HEALTH_
- \(CEPHADM_STRAY_DAEMON\)
- \(FS_DEGRADED\)
- FS_
- \(CEPHADM_
- \(MDS_FAILED\)
- \(MDS_DEGRADED\)
- \(FS_WITH_FAILED_MDS\)
@ -10,3 +13,10 @@ overrides:
- \(MDS_ALL_DOWN\)
- \(MDS_UP_LESS_THAN_MAX\)
- \(FS_INLINE_DATA_DEPRECATED\)
- \(PG_DEGRADED\)
- Degraded data redundancy
- \(PG_
- acting
- MDS_INSUFFICIENT_STANDBY
- deprecated feature inline_data
- compat changed unexpectedly

View File

@ -2,8 +2,10 @@ overrides:
ceph:
log-ignorelist:
- overall HEALTH_
- \(OSD_DOWN\)
- \(OSD_
- OSD_DOWN
- OSD_
- but it is still running
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
- is not responding
- is down
- osds down

View File

@ -0,0 +1,6 @@
os_type: rhel
os_version: "8.5"
overrides:
selinux:
whitelist:
- scontext=system_u:system_r:logrotate_t:s0

View File

@ -0,0 +1,6 @@
os_type: rhel
os_version: "8.6"
overrides:
selinux:
whitelist:
- scontext=system_u:system_r:logrotate_t:s0

View File

@ -1 +1 @@
rhel_8.4.yaml
rhel_8.6.yaml

View File

@ -1 +0,0 @@
.qa/distros/podman/rhel_8.4_container_tools_3.0.yaml

View File

@ -1 +0,0 @@
.qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml

View File

@ -0,0 +1 @@
.qa/distros/podman/rhel_8.6_container_tools_3.0.yaml

View File

@ -0,0 +1 @@
.qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml

View File

@ -1,5 +1,5 @@
os_type: rhel
os_version: "8.4"
os_version: "8.6"
overrides:
selinux:
whitelist:

View File

@ -1,5 +1,5 @@
os_type: rhel
os_version: "8.4"
os_version: "8.6"
overrides:
selinux:
whitelist:

View File

@ -1691,6 +1691,29 @@ function test_wait_for_peered() {
#######################################################################
##
# Wait until the given cluster health condition disappears, checking for up to
# $TIMEOUT seconds.
#
# @param string to grep for in health detail
# @return 0 if the health condition disappeared,
# 1 if it is still present after $TIMEOUT seconds.
#
function wait_for_health_gone() {
local grepstr=$1
local -a delays=($(get_timeout_delays $TIMEOUT .1))
local -i loop=0
while ceph health detail | grep "$grepstr" ; do
if (( $loop >= ${#delays[*]} )) ; then
ceph health detail
return 1
fi
sleep ${delays[$loop]}
loop+=1
done
}
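# Example usage, mirroring the stretch-mode standalone tests, which clear the cause
# of a warning and then wait for it to go away:
#
#   ceph osd crush rm sham
#   wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1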
##
# Wait until the cluster has health condition passed as arg
# again for $TIMEOUT seconds.

View File

@ -0,0 +1,148 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
export BASE_CEPH_ARGS=$CEPH_ARGS
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
TEST_stretched_cluster_failover_add_three_osds(){
local dir=$1
local OSDS=8
setup $dir || return 1
run_mon $dir a --public-addr $CEPH_MON_A || return 1
wait_for_quorum 300 1 || return 1
run_mon $dir b --public-addr $CEPH_MON_B || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
wait_for_quorum 300 2 || return 1
run_mon $dir c --public-addr $CEPH_MON_C || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
wait_for_quorum 300 3 || return 1
run_mon $dir d --public-addr $CEPH_MON_D || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
wait_for_quorum 300 4 || return 1
run_mon $dir e --public-addr $CEPH_MON_E || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
wait_for_quorum 300 5 || return 1
ceph mon set election_strategy connectivity
ceph mon add disallowed_leader e
run_mgr $dir x || return 1
run_mgr $dir y || return 1
run_mgr $dir z || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
for zone in iris pze
do
ceph osd crush add-bucket $zone zone
ceph osd crush move $zone root=default
done
ceph osd crush add-bucket node-2 host
ceph osd crush add-bucket node-3 host
ceph osd crush add-bucket node-4 host
ceph osd crush add-bucket node-5 host
ceph osd crush move node-2 zone=iris
ceph osd crush move node-3 zone=iris
ceph osd crush move node-4 zone=pze
ceph osd crush move node-5 zone=pze
ceph osd crush move osd.0 host=node-2
ceph osd crush move osd.1 host=node-2
ceph osd crush move osd.2 host=node-3
ceph osd crush move osd.3 host=node-3
ceph osd crush move osd.4 host=node-4
ceph osd crush move osd.5 host=node-4
ceph osd crush move osd.6 host=node-5
ceph osd crush move osd.7 host=node-5
ceph mon set_location a zone=iris host=node-2
ceph mon set_location b zone=iris host=node-3
ceph mon set_location c zone=pze host=node-4
ceph mon set_location d zone=pze host=node-5
hostname=$(hostname -s)
ceph osd crush remove $hostname || return 1
ceph osd getcrushmap > crushmap || return 1
crushtool --decompile crushmap > crushmap.txt || return 1
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
cat >> crushmap_modified.txt << EOF
rule stretch_rule {
id 1
type replicated
min_size 1
max_size 10
step take iris
step chooseleaf firstn 2 type host
step emit
step take pze
step chooseleaf firstn 2 type host
step emit
}
# end crush map
EOF
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
ceph osd setcrushmap -i crushmap.bin || return 1
local stretched_poolname=stretched_rbdpool
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
ceph osd pool set $stretched_poolname size 4 || return 1
sleep 3
ceph mon set_location e zone=arbiter host=node-1
ceph mon enable_stretch_mode e stretch_rule zone
kill_daemons $dir KILL mon.c || return 1
kill_daemons $dir KILL mon.d || return 1
kill_daemons $dir KILL osd.4 || return 1
kill_daemons $dir KILL osd.5 || return 1
kill_daemons $dir KILL osd.6 || return 1
kill_daemons $dir KILL osd.7 || return 1
ceph -s
sleep 3
run_osd $dir 8 || return 1
run_osd $dir 9 || return 1
run_osd $dir 10 || return 1
ceph -s
sleep 3
teardown $dir || return 1
}
main mon-stretch-fail-recovery "$@"

View File

@ -0,0 +1,145 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
export BASE_CEPH_ARGS=$CEPH_ARGS
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
TEST_stretched_cluster_uneven_weight() {
local dir=$1
local OSDS=4
local weight=0.09000
setup $dir || return 1
run_mon $dir a --public-addr $CEPH_MON_A || return 1
wait_for_quorum 300 1 || return 1
run_mon $dir b --public-addr $CEPH_MON_B || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
wait_for_quorum 300 2 || return 1
run_mon $dir c --public-addr $CEPH_MON_C || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
wait_for_quorum 300 3 || return 1
run_mon $dir d --public-addr $CEPH_MON_D || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
wait_for_quorum 300 4 || return 1
run_mon $dir e --public-addr $CEPH_MON_E || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
wait_for_quorum 300 5 || return 1
ceph mon set election_strategy connectivity
ceph mon add disallowed_leader e
run_mgr $dir x || return 1
run_mgr $dir y || return 1
run_mgr $dir z || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
for zone in iris pze
do
ceph osd crush add-bucket $zone zone
ceph osd crush move $zone root=default
done
ceph osd crush add-bucket node-2 host
ceph osd crush add-bucket node-3 host
ceph osd crush add-bucket node-4 host
ceph osd crush add-bucket node-5 host
ceph osd crush move node-2 zone=iris
ceph osd crush move node-3 zone=iris
ceph osd crush move node-4 zone=pze
ceph osd crush move node-5 zone=pze
ceph osd crush move osd.0 host=node-2
ceph osd crush move osd.1 host=node-3
ceph osd crush move osd.2 host=node-4
ceph osd crush move osd.3 host=node-5
ceph mon set_location a zone=iris host=node-2
ceph mon set_location b zone=iris host=node-3
ceph mon set_location c zone=pze host=node-4
ceph mon set_location d zone=pze host=node-5
hostname=$(hostname -s)
ceph osd crush remove $hostname || return 1
ceph osd getcrushmap > crushmap || return 1
crushtool --decompile crushmap > crushmap.txt || return 1
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
cat >> crushmap_modified.txt << EOF
rule stretch_rule {
id 1
type replicated
min_size 1
max_size 10
step take iris
step chooseleaf firstn 2 type host
step emit
step take pze
step chooseleaf firstn 2 type host
step emit
}
# end crush map
EOF
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
ceph osd setcrushmap -i crushmap.bin || return 1
local stretched_poolname=stretched_rbdpool
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
ceph osd pool set $stretched_poolname size 4 || return 1
ceph mon set_location e zone=arbiter host=node-1 || return 1
ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter stretch mode
# reweight to a more round decimal.
ceph osd crush reweight osd.0 $weight
ceph osd crush reweight osd.1 $weight
ceph osd crush reweight osd.2 $weight
ceph osd crush reweight osd.3 $weight
# Firstly, we test for stretch mode buckets != 2
ceph osd crush add-bucket sham zone || return 1
ceph osd crush move sham root=default || return 1
wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
ceph osd crush rm sham # clear the health warn
wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
# Next, we test for uneven weights across buckets
ceph osd crush reweight osd.0 0.07000
wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
ceph osd crush reweight osd.0 $weight # clear the health warn
wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
teardown $dir || return 1
}
main mon-stretched-cluster-uneven-weight "$@"

View File

@ -19,6 +19,7 @@ overrides:
- MDS_READ_ONLY
- force file system read-only
- with standby daemon mds
- MDS abort because newly corrupt dentry
tasks:
- cephfs_test_runner:
modules:

View File

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
ceph:
conf:
client:
client_shutdown_timeout: 120

View File

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
ceph:
conf:
client:
client_shutdown_timeout: 120

View File

@ -0,0 +1,13 @@
tasks:
- check-counter:
counters:
mgr:
- name: "finisher-volumes.complete_latency.avgcount"
min: 4
- name: "finisher-volumes.queue_len"
expected_val: 0
- cephfs_test_runner:
fail_on_skip: false
modules:
- tasks.cephfs.test_volumes.TestPerModuleFinsherThread

View File

@ -0,0 +1 @@
.qa/objectstore/bluestore-bitmap.yaml

View File

@ -0,0 +1,7 @@
overrides:
ceph:
conf:
global:
ms die on skipped message: false
client:
rbd default features: 37

View File

@ -0,0 +1 @@
../.qa/

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=crc,rxbounce

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=crc

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=legacy,rxbounce

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=legacy

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=secure

View File

@ -0,0 +1 @@
../.qa/

View File

@ -0,0 +1 @@
../.qa/

View File

@ -2,6 +2,7 @@ overrides:
ceph:
conf:
global:
mon warn on pool no app: false
ms die on skipped message: false
client:
rbd default features: 37

View File

@ -0,0 +1,19 @@
overrides:
ceph:
conf:
global:
osd pool default size: 1
osd:
osd shutdown pgref assert: true
roles:
- [mon.a, mgr.x, osd.0, client.0]
tasks:
- install:
extra_system_packages:
- fio
- ceph:
- workunit:
clients:
all:
- rbd/krbd_watch_errors.sh

View File

@ -1,3 +1,28 @@
overrides:
ceph:
log-ignorelist:
- \(HOST_IN_MAINTENANCE\)
- \(OSD_DOWN\)
- \(MON_DOWN\)
- down
- overall HEALTH_
- \(CEPHADM_STRAY_DAEMON\)
- stray daemon
- \(FS_DEGRADED\)
- \(MDS_FAILED\)
- \(MDS_DEGRADED\)
- \(FS_WITH_FAILED_MDS\)
- \(MDS_DAMAGE\)
- \(MDS_ALL_DOWN\)
- \(MDS_UP_LESS_THAN_MAX\)
- \(FS_INLINE_DATA_DEPRECATED\)
- \(PG_DEGRADED\)
- Degraded data redundancy
- \(PG_
- acting
- MDS_INSUFFICIENT_STANDBY
- deprecated feature inline_data
- compat changed unexpectedly
roles:
# 3 osd roles on host.a are required for the cephadm task. It checks if the cluster is healthy.
# More daemons will be deployed on both hosts in e2e tests.

View File

@ -24,6 +24,21 @@ openstack:
size: 10 # GB
overrides:
ceph:
log-ignorelist:
- slow requests
- \(PG_
- PG_
- \(CEPHADM_STRAY_DAEMON\)
- slow request
- \(MDS_
- MDS_
- osds down
- OSD_
- \(OSD_
- client
- FS_
- \(FS_
- degraded
conf:
osd:
osd shutdown pgref assert: true

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,11 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
- \(CEPHADM_STRAY_DAEMON\)
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,11 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(PG_AVAILABILITY\)
- mon down
- mons down
- out of quorum
tasks:
- cephadm:
conf:

View File

@ -3,6 +3,23 @@ overrides:
log-ignorelist:
- but it is still running
- objects unfound and apparently lost
- \(MON_DOWN\)
- \(OSDMAP_FLAGS\)
- flag\(s\) set
- \(CACHE_POOL_NO_HIT_SET\)
- \(CACHE_
- \(PG_
- \(OSD_
- \(POOL_
- \(CEPHADM_STRAY_DAEMON\)
- PG_
- CACHE_
- degraded
- backfill
- mons down
- OSD_
- is down
- acting
conf:
osd:
osd debug reject backfill probability: .3

View File

@ -1,3 +1,14 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(PG_
- mons down
- pg inactive
- out of quorum
- \(OSD_
- osds down
- osd down
tasks:
- cephadm.shell:
env: [sha1]

View File

@ -1,3 +1,9 @@
overrides:
ceph:
log-ignorelist:
- Replacing daemon mds
- FS_DEGRADED
- \(CEPHADM_STRAY_DAEMON\)
roles:
- - host.a
- osd.0

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(CEPHADM_PAUSED\)
- mons down
roles:
- - host.a
- osd.0

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- mons down
- \(MGR_DOWN\)
- out of quorum
roles:
- - host.a
- osd.0

View File

@ -11,6 +11,15 @@ overrides:
- \(POOL_APP_NOT_ENABLED\)
- \(PG_AVAILABILITY\)
- \(PG_DEGRADED\)
- \(MON_DOWN\)
- \(CEPHADM_STRAY_DAEMON\)
- missing hit_sets
- do not have an application enabled
- application not enabled on pool
- pool application
- mons down
- out of quorum
- needs hit_set_type to be set but it is not
conf:
client:
debug ms: 1

View File

@ -2,6 +2,7 @@ overrides:
ceph:
log-ignorelist:
- \(PG_AVAILABILITY\)
- \(POOL_APP_NOT_ENABLED\)
conf:
osd:
osd_class_load_list: "*"

View File

@ -8,6 +8,13 @@ overrides:
- \(OSD_
- \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
- \(MON_DOWN\)
- mons down
- application not enabled on pool
- do not have an application enabled
- pool application
- out of quorum
- needs hit_set_type to be set but it is not
tasks:
- workunit:
clients:

View File

@ -0,0 +1,43 @@
tasks:
- install:
- ceph:
wait-for-scrub: false
- check-counter:
counters:
mgr:
- name: "finisher-balancer.complete_latency.avgcount"
min: 1
- name: "finisher-balancer.queue_len"
expected_val: 0
- name: "finisher-crash.complete_latency.avgcount"
min: 2
- name: "finisher-crash.queue_len"
expected_val: 0
- name: "finisher-devicehealth.complete_latency.avgcount"
min: 1
- name: "finisher-devicehealth.queue_len"
expected_val: 0
- name: "finisher-iostat.complete_latency.avgcount"
min: 1
- name: "finisher-iostat.queue_len"
expected_val: 0
- name: "finisher-pg_autoscaler.complete_latency.avgcount"
min: 1
- name: "finisher-pg_autoscaler.queue_len"
expected_val: 0
- name: "finisher-progress.complete_latency.avgcount"
min: 2
- name: "finisher-progress.queue_len"
expected_val: 0
- name: "finisher-status.complete_latency.avgcount"
min: 2
- name: "finisher-status.queue_len"
expected_val: 0
- name: "finisher-telemetry.complete_latency.avgcount"
min: 1
- name: "finisher-telemetry.queue_len"
expected_val: 0
- workunit:
clients:
client.0:
- mgr/test_per_module_finisher.sh

View File

@ -13,4 +13,4 @@ tasks:
- workunit:
clients:
client.0:
- mgr
- mgr/test_localpool.sh

View File

@ -0,0 +1,18 @@
roles:
- - mon.a
- mgr.x
- osd.0
- osd.1
- osd.2
- client.0
openstack:
- volumes: # attached to each instance
count: 3
size: 10 # GB
tasks:
- install:
- workunit:
basedir: qa/standalone
clients:
all:
- mon-stretch

View File

@ -4,6 +4,8 @@ overrides:
osd:
osd_class_load_list: "*"
osd_class_default_list: "*"
log-ignorelist:
- \(POOL_APP_NOT_ENABLED\)
tasks:
- workunit:
clients:

View File

@ -0,0 +1,13 @@
overrides:
ceph:
conf:
mgr:
debug rbd: 20
tasks:
- install:
extra_system_packages:
- fio
- workunit:
clients:
client.0:
- rbd/rbd_support_module_recovery.sh

View File

@ -0,0 +1,5 @@
tasks:
- workunit:
clients:
client.0:
- rgw/run-bucket-check.sh

View File

@ -6,7 +6,7 @@ workload:
- sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-nautilus
rgw_server: client.1
stages: prepare
- print: "**** done rgw ragweed prepare 2-workload"

View File

@ -5,7 +5,7 @@ rgw-final-workload:
full_sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-nautilus
rgw_server: client.1
stages: check
- print: "**** done ragweed check 4-final-workload"

View File

@ -5,7 +5,7 @@ rgw-final-workload:
full_sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-octopus
rgw_server: client.1
stages: check
- print: "**** done ragweed check 4-final-workload"

View File

@ -123,7 +123,7 @@ workload_pacific:
- rados/test.sh
- cls
env:
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done rados/test.sh & cls workload_pacific"
- sequential:
- rgw: [client.0]

View File

@ -7,4 +7,6 @@ stress-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
env:
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done cls/test_cls_rbd.sh 4-workload"

View File

@ -3,7 +3,7 @@ meta:
librbd python api tests
tasks:
- workunit:
tag: v16.2.7
branch: pacific
clients:
client.0:
- rbd/test_librbd_python.sh

View File

@ -232,6 +232,7 @@ class OSDThrasher(Thrasher):
self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
self.random_eio = self.config.get('random_eio')
self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3)
num_osds = self.in_osds + self.out_osds
self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)
@ -798,6 +799,19 @@ class OSDThrasher(Thrasher):
else:
self.cancel_force_recovery()
def reset_purged_snaps_last(self):
"""
Run reset_purged_snaps_last
"""
self.log('reset_purged_snaps_last')
for osd in self.in_osds:
try:
self.ceph_manager.raw_cluster_cmd(
'tell', "osd.%s" % (str(osd)),
'reset_purged_snaps_last')
except CommandFailedError:
self.log('Failed to reset_purged_snaps_last, ignoring')
def all_up(self):
"""
Make sure all osds are up and not out.
@ -1248,6 +1262,8 @@ class OSDThrasher(Thrasher):
actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
if self.chance_force_recovery > 0:
actions.append((self.force_cancel_recovery, self.chance_force_recovery))
if self.chance_reset_purged_snaps_last > 0:
actions.append((self.reset_purged_snaps_last, self.chance_reset_purged_snaps_last))
for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
for scenario in [

View File

@ -2,6 +2,8 @@
# make logging friendly to teuthology
log_to_file = true
log_to_stderr = false
log to journald = false
mon cluster log to file = true
mon cluster log file level = debug
mon clock drift allowed = 1.000

View File

@ -811,7 +811,7 @@ class CephFSMount(object):
))
p.wait()
def open_background(self, basename="background_file", write=True):
def open_background(self, basename="background_file", write=True, content="content"):
"""
Open a file for writing, then block such that the client
will hold a capability.
@ -828,12 +828,11 @@ class CephFSMount(object):
import time
with open("{path}", 'w') as f:
f.write('content')
f.write("{content}")
f.flush()
f.write('content2')
while True:
time.sleep(1)
""").format(path=path)
""").format(path=path, content=content)
else:
pyscript = dedent("""
import time
@ -849,7 +848,10 @@ class CephFSMount(object):
# This wait would not be sufficient if the file had already
# existed, but it's simple and in practice users of open_background
# are not using it on existing files.
self.wait_for_visible(basename)
if write:
self.wait_for_visible(basename, size=len(content))
else:
self.wait_for_visible(basename)
return rproc
@ -887,19 +889,27 @@ class CephFSMount(object):
if nr_links == 2:
return
def wait_for_visible(self, basename="background_file", timeout=30):
def wait_for_visible(self, basename="background_file", size=None, timeout=30):
i = 0
args = ['stat']
if size is not None:
args += ['--printf=%s']
args += [os.path.join(self.hostfs_mntpt, basename)]
while i < timeout:
r = self.client_remote.run(args=[
'stat', os.path.join(self.hostfs_mntpt, basename)
], check_status=False)
if r.exitstatus == 0:
log.debug("File {0} became visible from {1} after {2}s".format(
basename, self.client_id, i))
return
else:
time.sleep(1)
i += 1
p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False)
if p.exitstatus == 0:
if size is not None:
s = p.stdout.getvalue().strip()
if int(s) == size:
log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s")
return
else:
log.error(f"File {basename} became visible but with size {int(s)} not {size}")
else:
log.info(f"File {basename} became visible from {self.client_id} after {i}s")
return
time.sleep(1)
i += 1
raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
i, basename, self.client_id))

View File

@ -1,6 +1,8 @@
"""
Before running this testsuite, add path to cephfs-shell module to $PATH and
export $PATH.
NOTE: For running these tests locally (using vstart_runner.py), export the
path to the src/tools/cephfs/shell/cephfs-shell module to $PATH. Running
"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir
will update the environment without the hassle of typing the path correctly.
"""
from io import StringIO
from os import path

View File

@ -9,7 +9,9 @@ from textwrap import dedent
from tasks.ceph_test_case import TestTimeoutError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
from tasks.cephfs.fuse_mount import FuseMount
from teuthology.exceptions import CommandFailedError
import os
from io import StringIO
log = logging.getLogger(__name__)
@ -157,29 +159,49 @@ class TestClientLimits(CephFSTestCase):
a fraction of second (0.5) by default when throttling condition is met.
"""
max_caps_per_client = 500
cap_acquisition_throttle = 250
subdir_count = 4
files_per_dir = 25
self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
# throttle in a way so that two dir reads are already hitting it.
throttle_value = (files_per_dir * 3) // 2
# Create 1500 files split across 6 directories, 250 each.
for i in range(1, 7):
self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
# activate throttling logic by setting max per client to a low value
self.config_set('mds', 'mds_max_caps_per_client', 1)
self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value)
# Create files split across {subdir_count} directories, {per_dir_count} in each dir
for i in range(1, subdir_count+1):
self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True)
mount_a_client_id = self.mount_a.get_global_id()
# recursive readdir
self.mount_a.run_shell_payload("find | wc")
# validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
# recursive readdir. macOS wants an explicit directory for `find`.
proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO())
# return code may be None if the command got interrupted
self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue())
# validate that the throttle condition was hit at least once
cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
# validate cap_acquisition decay counter after readdir to NOT exceed the throttle value
# plus one batch that could have been taken immediately before querying
# assuming the batch is equal to the per dir file count.
cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value)
# make sure that the throttle was reported in the events
def historic_ops_have_event(expected_event):
ops_dump = self.fs.rank_tell(['dump_historic_ops'])
# reverse the events and the ops assuming that later ops would be throttled
for op in reversed(ops_dump['ops']):
for ev in reversed(op.get('type_data', {}).get('events', [])):
if ev['event'] == expected_event:
return True
return False
self.assertTrue(historic_ops_have_event('cap_acquisition_throttle'))
def test_client_release_bug(self):
"""
When a client has a bug (which we will simulate) preventing it from releasing caps,
@ -219,6 +241,55 @@ class TestClientLimits(CephFSTestCase):
self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
rproc.wait()
def test_client_blocklisted_oldest_tid(self):
"""
that a client is blocklisted when its encoded session metadata exceeds the
configured threshold (due to an ever-growing `completed_requests` list caused
by an unidentified bug in the client or the MDS).
"""
# num of requests client issues
max_requests = 10000
# The debug hook to inject the failure only exists in the fuse client
if not isinstance(self.mount_a, FuseMount):
self.skipTest("Require FUSE client to inject client release failure")
self.config_set('client', 'client inject fixed oldest tid', 'true')
self.mount_a.teardown()
self.mount_a.mount_wait()
self.config_set('mds', 'mds_max_completed_requests', max_requests);
# Create lots of files
self.mount_a.create_n_files("testdir/file1", max_requests + 100)
# Create a few files synchronously. This makes sure previous requests are completed
self.mount_a.create_n_files("testdir/file2", 5, True)
# Wait for the health warnings. Assume the MDS can handle at least 10 requests per second
self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
# set the threshold low so that it has a high probability of
# hitting.
self.config_set('mds', 'mds_session_metadata_threshold', 5000)
# Create a large number of files synchronously. This should hit the session metadata threshold
# causing the client to get blocklisted.
with self.assertRaises(CommandFailedError):
self.mount_a.create_n_files("testdir/file2", 100000, True)
# the client address should now be on the OSD blocklist
self.assertTrue(self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()))
# the mds should bump up the relevant perf counter
pd = self.perf_dump()
self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0)
# reset the config
self.config_set('client', 'client inject fixed oldest tid', 'false')
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
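# Hedged sketch of checking the blocklist outside the test harness, mirroring the
# is_addr_blocklisted() assertion above; it assumes the Pacific-era
# `ceph osd blocklist ls` command and an address of the form "1.2.3.4:0/123456".
import subprocess

def is_blocklisted(addr: str) -> bool:
    out = subprocess.check_output(["ceph", "osd", "blocklist", "ls"], text=True)
    # each entry line starts with the blocklisted address followed by its expiry
    return any(line.split()[0].startswith(addr)
               for line in out.splitlines() if line.strip())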
def test_client_oldest_tid(self):
"""
When a client does not advance its oldest tid, the MDS should notice that

View File

@ -10,8 +10,10 @@ from textwrap import dedent
import time
import distutils.version as version
import re
import string
import os
from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
@ -221,8 +223,10 @@ class TestClientRecovery(CephFSTestCase):
# Capability release from stale session
# =====================================
if write:
cap_holder = self.mount_a.open_background()
content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
cap_holder = self.mount_a.open_background(content=content)
else:
content = ''
self.mount_a.run_shell(["touch", "background_file"])
self.mount_a.umount_wait()
self.mount_a.mount_wait()
@ -233,7 +237,7 @@ class TestClientRecovery(CephFSTestCase):
# Wait for the file to be visible from another client, indicating
# that mount_a has completed its network ops
self.mount_b.wait_for_visible()
self.mount_b.wait_for_visible(size=len(content))
# Simulate client death
self.mount_a.suspend_netns()
@ -264,11 +268,9 @@ class TestClientRecovery(CephFSTestCase):
"Capability handover took {0}, expected approx {1}".format(
cap_waited, session_timeout
))
self.mount_a._kill_background(cap_holder)
finally:
# teardown() doesn't quite handle this case cleanly, so help it out
self.mount_a.resume_netns()
self.mount_a.resume_netns() # allow the mount to recover, otherwise the background proc is unkillable
self.mount_a._kill_background(cap_holder)
def test_stale_read_caps(self):
self._test_stale_caps(False)
@ -319,9 +321,9 @@ class TestClientRecovery(CephFSTestCase):
cap_waited, session_timeout / 2.0
))
self.mount_a._kill_background(cap_holder)
finally:
self.mount_a.resume_netns()
self.mount_a.resume_netns() # allow the mount to recover, otherwise the background proc is unkillable
self.mount_a._kill_background(cap_holder)
def test_trim_caps(self):
# Trim capability when reconnecting MDS
@ -387,7 +389,6 @@ class TestClientRecovery(CephFSTestCase):
self.mount_b.check_filelock(do_flock=flockable)
# Tear down the background process
self.mount_a._kill_background(lock_holder)
def test_filelock_eviction(self):
@ -416,7 +417,6 @@ class TestClientRecovery(CephFSTestCase):
# succeed
self.wait_until_true(lambda: lock_taker.finished, timeout=10)
finally:
# Tear down the background process
self.mount_a._kill_background(lock_holder)
# teardown() doesn't quite handle this case cleanly, so help it out
@ -751,24 +751,27 @@ class TestClientOnLaggyOSD(CephFSTestCase):
# it takes time for laggy-client entries to appear in the cluster log;
# wait up to 6 minutes for them to become visible, and finally restart
# the client
tries = 6
while True:
try:
with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except AssertionError:
tries -= 1
if tries:
continue
raise
with contextutil.safe_while(sleep=5, tries=6) as proceed:
while proceed():
try:
with self.assert_cluster_log("1 client(s) laggy due to"
" laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except (AssertionError, CommandFailedError) as e:
log.debug(f'{e}, retrying')
# clear lagginess, expect to get the warning cleared and make sure
# client gets evicted
self.clear_laggy_params(osd)
self.wait_for_health_clear(60)
self.assert_session_count(1)
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
self.mount_a.create_destroy()
self.clear_laggy_params(osd)
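# Illustrative sketch of the retry idiom adopted above: teuthology's safe_while()
# yields a callable that sleeps between attempts and raises MaxWhileTries once the
# tries are exhausted; condition_met() is a hypothetical placeholder check.
from teuthology import contextutil

def wait_for_condition(condition_met):
    with contextutil.safe_while(sleep=5, tries=6) as proceed:
        while proceed():
            if condition_met():
                break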
def test_client_eviction_if_config_is_unset(self):
"""
@ -800,6 +803,11 @@ class TestClientOnLaggyOSD(CephFSTestCase):
time.sleep(session_timeout)
self.assert_session_count(1)
# make sure warning wasn't seen in cluster log
with self.assert_cluster_log("laggy due to laggy OSDs",
timeout=120, present=False):
pass
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()

View File

@ -608,8 +608,9 @@ class TestDamage(CephFSTestCase):
self.fs.flush()
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
time.sleep(5) # for conf to percolate
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])
@ -642,9 +643,10 @@ class TestDamage(CephFSTestCase):
rank0 = self.fs.get_rank()
self.fs.rank_freeze(True, rank=0)
# so now we want to trigger commit but this will crash, so:
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])

View File

@ -14,9 +14,12 @@ class TestClusterAffinity(CephFSTestCase):
CLIENTS_REQUIRED = 0
MDSS_REQUIRED = 4
def _verify_join_fs(self, target, status=None):
def _verify_join_fs(self, target, status=None, fs=None):
fs_select = fs
if fs_select is None:
fs_select = self.fs
if status is None:
status = self.fs.wait_for_daemons(timeout=30)
status = fs_select.wait_for_daemons(timeout=30)
log.debug("%s", status)
target = sorted(target, key=operator.itemgetter('name'))
log.info("target = %s", target)
@ -37,11 +40,14 @@ class TestClusterAffinity(CephFSTestCase):
return
self.fail("no entity")
def _verify_init(self):
status = self.fs.status()
def _verify_init(self, fs=None):
fs_select = fs
if fs_select is None:
fs_select = self.fs
status = fs_select.status()
log.info("status = {0}".format(status))
target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()]
self._verify_join_fs(target, status=status)
self._verify_join_fs(target, status=status, fs=fs_select)
return (status, target)
def _reach_target(self, target):
@ -109,12 +115,21 @@ class TestClusterAffinity(CephFSTestCase):
fs2 = self.mds_cluster.newfs(name="cephfs2")
status, target = self._verify_init()
active = self.fs.get_active_names(status=status)[0]
status2, _ = self._verify_init(fs=fs2)
active2 = fs2.get_active_names(status=status2)[0]
standbys = [info['name'] for info in status.get_standbys()]
victim = standbys.pop()
# Set a bogus fs on the others
for mds in standbys:
self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
self._change_target_state(target, mds, {'join_fscid': fs2.id})
# The active MDS for cephfs2 will be replaced by the MDS for which
# file system affinity has been set. Also, set the affinity for
# the earlier active MDS so that it is not chosen by the monitors
# as an active MDS for the existing file system.
log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})')
self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2')
self._change_target_state(target, active2, {'join_fscid': fs2.id})
self.fs.rank_fail()
self._change_target_state(target, victim, {'state': 'up:active'})
self._reach_target(target)
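# Hedged sketch of the equivalent CLI step for giving an MDS daemon affinity to a
# file system; the daemon name and fs name below are placeholders. The monitors
# then prefer daemons whose mds_join_fs matches when assigning ranks for that fs.
import subprocess

def set_mds_affinity(mds_name: str, fs_name: str) -> None:
    subprocess.check_call(
        ["ceph", "config", "set", f"mds.{mds_name}", "mds_join_fs", fs_name])

# e.g. set_mds_affinity("b", "cephfs2") mirrors the config_set() calls above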

Some files were not shown because too many files have changed in this diff.