import ceph pacific 16.2.15 source

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
Thomas Lamprecht 2024-03-04 16:44:57 +01:00
parent ca55da0300
commit 47fdce5df8
405 changed files with 9800 additions and 3148 deletions

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
# remove cmake/modules/FindPython* once 3.12 is required
project(ceph
VERSION 16.2.14
VERSION 16.2.15
LANGUAGES CXX C ASM)
foreach(policy

View File

@ -32,6 +32,29 @@
in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
the restored file system is expected to have the same ID as before.
>=16.2.15
----------
* `ceph config dump --format <json|xml>` output will display the localized
option names instead of their normalized versions. For example,
"mgr/prometheus/x/server_port" will be displayed instead of
"mgr/prometheus/server_port". This matches the output of the non-pretty-printed
version of the command.
* CEPHFS: The MDS now evicts clients that are not advancing their request tids. Such
clients cause a large buildup of session metadata, which can result in the MDS going
read-only when the RADOS operation that persists it exceeds the size threshold. The
`mds_session_metadata_threshold` config option controls the maximum size that the
(encoded) session metadata can grow to; see the example after this list.
* RADOS: The `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated
because it is prone to false negative results. Its safer replacement is
`pool_is_in_selfmanaged_snaps_mode`.
* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in
fast-diff mode (`whole_object == true` with `fast-diff` image feature enabled
and valid), diff-iterate is now guaranteed to execute locally if exclusive
lock is available. This brings a dramatic performance improvement for QEMU
live disk synchronization and backup use cases.
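As an illustration of the session metadata limit, the option can be inspected and
raised like this (the value shown is an arbitrary example, not a recommended
setting)::

    ceph config get mds mds_session_metadata_threshold
    ceph config set mds mds_session_metadata_threshold 134217728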
>= 16.2.14
----------
@ -132,6 +155,10 @@
* CEPHFS: After recovering a Ceph File System by following the disaster recovery
procedure, the recovered files under the `lost+found` directory can now be deleted.
* core: cache-tiering is now deprecated.
* mgr/snap_schedule: The snap-schedule mgr module now retains one snapshot fewer
than the number specified by the config tunable `mds_max_snaps_per_dir`, so that
a new snapshot can be created and retained during the next scheduled run (see the
example after this list).
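For illustration, the interaction between the tunable and snapshot retention can be
checked as follows (the path is a placeholder)::

    # defaults to 100, so at most 99 scheduled snapshots are retained
    ceph config get mds mds_max_snaps_per_dir
    # keep 24 hourly snapshots for a directory
    ceph fs snap-schedule retention add /volumes/group/dir 24h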
>=16.2.8
--------

View File

@ -1,4 +1,4 @@
Sphinx == 4.4.0
Sphinx == 5.0.2
git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
breathe >= 4.20.0
Jinja2

View File

@ -135,7 +135,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 16.2.14
Version: 16.2.15
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-16.2.15.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-16.2.14
%autosetup -p1 -n ceph-16.2.15
%build
# Disable lto on systems that do not support symver attribute

View File

@ -1,7 +1,13 @@
ceph (16.2.14-1focal) focal; urgency=medium
ceph (16.2.15-1focal) focal; urgency=medium
-- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com> Tue, 29 Aug 2023 16:38:35 +0000
-- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Mon, 26 Feb 2024 19:34:01 +0000
ceph (16.2.15-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Mon, 26 Feb 2024 19:21:07 +0000
ceph (16.2.14-1) stable; urgency=medium

View File

@ -56,12 +56,13 @@ function(build_rocksdb)
endif()
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-Wno-deprecated-copy" HAS_WARNING_DEPRECATED_COPY)
set(rocksdb_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
if(HAS_WARNING_DEPRECATED_COPY)
set(rocksdb_CXX_FLAGS -Wno-deprecated-copy)
string(APPEND rocksdb_CXX_FLAGS " -Wno-deprecated-copy")
endif()
check_cxx_compiler_flag("-Wno-pessimizing-move" HAS_WARNING_PESSIMIZING_MOVE)
if(HAS_WARNING_PESSIMIZING_MOVE)
set(rocksdb_CXX_FLAGS "${rocksdb_CXX_FLAGS} -Wno-pessimizing-move")
string(APPEND rocksdb_CXX_FLAGS " -Wno-pessimizing-move")
endif()
if(rocksdb_CXX_FLAGS)
list(APPEND rocksdb_CMAKE_ARGS -DCMAKE_CXX_FLAGS='${rocksdb_CXX_FLAGS}')

View File

@ -15,7 +15,7 @@ creation of multiple file systems use ``ceph fs flag set enable_multiple true``.
::
fs new <file system name> <metadata pool name> <data pool name>
ceph fs new <file system name> <metadata pool name> <data pool name>
This command creates a new file system. The file system name and metadata pool
name are self-explanatory. The specified data pool is the default data pool and
@ -25,13 +25,13 @@ to accommodate the new file system.
::
fs ls
ceph fs ls
List all file systems by name.
::
fs dump [epoch]
ceph fs dump [epoch]
This dumps the FSMap at the given epoch (default: current) which includes all
file system settings, MDS daemons and the ranks they hold, and the list of
@ -40,7 +40,7 @@ standby MDS daemons.
::
fs rm <file system name> [--yes-i-really-mean-it]
ceph fs rm <file system name> [--yes-i-really-mean-it]
Destroy a CephFS file system. This wipes information about the state of the
file system from the FSMap. The metadata pool and data pools are untouched and
@ -48,28 +48,28 @@ must be destroyed separately.
::
fs get <file system name>
ceph fs get <file system name>
Get information about the named file system, including settings and ranks. This
is a subset of the same information from the ``fs dump`` command.
is a subset of the same information from the ``ceph fs dump`` command.
::
fs set <file system name> <var> <val>
ceph fs set <file system name> <var> <val>
Change a setting on a file system. These settings are specific to the named
file system and do not affect other file systems.
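For example, to allow two active MDS daemons on a file system (the file system name
``cephfs`` is a placeholder)::

    ceph fs set cephfs max_mds 2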
::
fs add_data_pool <file system name> <pool name/id>
ceph fs add_data_pool <file system name> <pool name/id>
Add a data pool to the file system. This pool can be used for file layouts
as an alternate location to store file data.
::
fs rm_data_pool <file system name> <pool name/id>
ceph fs rm_data_pool <file system name> <pool name/id>
This command removes the specified pool from the list of data pools for the
file system. If any files have layouts for the removed data pool, the file
@ -82,7 +82,7 @@ Settings
::
fs set <fs name> max_file_size <size in bytes>
ceph fs set <fs name> max_file_size <size in bytes>
CephFS has a configurable maximum file size, and it's 1TB by default.
You may wish to set this limit higher if you expect to store large files
@ -116,13 +116,13 @@ Taking a CephFS cluster down is done by setting the down flag:
::
fs set <fs_name> down true
ceph fs set <fs_name> down true
To bring the cluster back online:
::
fs set <fs_name> down false
ceph fs set <fs_name> down false
This will also restore the previous value of max_mds. MDS daemons are brought
down in a way such that journals are flushed to the metadata pool and all
@ -133,11 +133,11 @@ Taking the cluster down rapidly for deletion or disaster recovery
-----------------------------------------------------------------
To allow rapidly deleting a file system (for testing) or to quickly bring the
file system and MDS daemons down, use the ``fs fail`` command:
file system and MDS daemons down, use the ``ceph fs fail`` command:
::
fs fail <fs_name>
ceph fs fail <fs_name>
This command sets a file system flag to prevent standbys from
activating on the file system (the ``joinable`` flag).
@ -146,7 +146,7 @@ This process can also be done manually by doing the following:
::
fs set <fs_name> joinable false
ceph fs set <fs_name> joinable false
Then the operator can fail all of the ranks which causes the MDS daemons to
respawn as standbys. The file system will be left in a degraded state.
@ -154,7 +154,7 @@ respawn as standbys. The file system will be left in a degraded state.
::
# For all ranks, 0-N:
mds fail <fs_name>:<n>
ceph mds fail <fs_name>:<n>
Once all ranks are inactive, the file system may also be deleted or left in
this state for other purposes (perhaps disaster recovery).
@ -163,7 +163,7 @@ To bring the cluster back up, simply set the joinable flag:
::
fs set <fs_name> joinable true
ceph fs set <fs_name> joinable true
Daemons
@ -182,34 +182,35 @@ Commands to manipulate MDS daemons:
::
mds fail <gid/name/role>
ceph mds fail <gid/name/role>
Mark an MDS daemon as failed. This is equivalent to what the cluster
would do if an MDS daemon had failed to send a message to the mon
for ``mds_beacon_grace`` seconds. If the daemon was active and a suitable
standby is available, using ``mds fail`` will force a failover to the standby.
standby is available, using ``ceph mds fail`` will force a failover to the
standby.
If the MDS daemon was in reality still running, then using ``mds fail``
If the MDS daemon was in reality still running, then using ``ceph mds fail``
will cause the daemon to restart. If it was active and a standby was
available, then the "failed" daemon will return as a standby.
::
tell mds.<daemon name> command ...
ceph tell mds.<daemon name> command ...
Send a command to the MDS daemon(s). Use ``mds.*`` to send a command to all
daemons. Use ``ceph tell mds.* help`` to learn available commands.
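For example, to list the client sessions held by one daemon (the daemon name ``a``
is a placeholder)::

    ceph tell mds.a session ls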
::
mds metadata <gid/name/role>
ceph mds metadata <gid/name/role>
Get metadata about the given MDS known to the Monitors.
::
mds repaired <role>
ceph mds repaired <role>
Mark the file system rank as repaired. Unlike what the name suggests, this command
does not change an MDS; it manipulates the file system rank which has been
@ -228,14 +229,14 @@ Commands to manipulate required client features of a file system:
::
fs required_client_features <fs name> add reply_encoding
fs required_client_features <fs name> rm reply_encoding
ceph fs required_client_features <fs name> add reply_encoding
ceph fs required_client_features <fs name> rm reply_encoding
To list all CephFS features
::
fs feature ls
ceph fs feature ls
Clients that are missing newly added features will be evicted automatically.
@ -330,7 +331,7 @@ Global settings
::
fs flag set <flag name> <flag val> [<confirmation string>]
ceph fs flag set <flag name> <flag val> [<confirmation string>]
Sets a global CephFS flag (i.e. not specific to a particular file system).
Currently, the only flag setting is 'enable_multiple' which allows having
@ -352,13 +353,13 @@ file system.
::
mds rmfailed
ceph mds rmfailed
This removes a rank from the failed set.
::
fs reset <file system name>
ceph fs reset <file system name>
This command resets the file system state to defaults, except for the name and
pools. Non-zero ranks are saved in the stopped set.
@ -366,7 +367,7 @@ pools. Non-zero ranks are saved in the stopped set.
::
fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
ceph fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
This command creates a file system with a specific **fscid** (file system cluster ID).
You may want to do this when an application expects the file system's ID to be

View File

@ -37,7 +37,7 @@ Options :
.. code:: bash
[build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/cephfs-shell
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell
Commands
========

View File

@ -24,6 +24,16 @@ that directory.
To restrict clients to only mount and work within a certain directory, use
path-based MDS authentication capabilities.
Note that this restriction *only* impacts the filesystem hierarchy -- the metadata
tree managed by the MDS. Clients will still be able to access the underlying
file data in RADOS directly. To segregate clients fully, you must also isolate
untrusted clients in their own RADOS namespace. You can place a client's
filesystem subtree in a particular namespace using `file layouts`_ and then
restrict their RADOS access to that namespace using `OSD capabilities`_.
.. _file layouts: ./file-layouts
.. _OSD capabilities: ../rados/operations/user-management/#authorization-capabilities
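A minimal sketch of that combination, with placeholder pool, namespace, directory,
and client names::

    # pin the directory's file data to a RADOS namespace via its layout
    setfattr -n ceph.dir.layout.pool_namespace -v fsdata /mnt/cephfs/untrusted
    # limit the client's OSD access to that namespace
    ceph auth caps client.untrusted \
        mds 'allow rw path=/untrusted' \
        mon 'allow r' \
        osd 'allow rw pool=cephfs_data namespace=fsdata'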
Syntax
------

View File

@ -38,6 +38,13 @@ below). By default
the start time is last midnight. So when a snapshot schedule with repeat
interval `1h` is added at 13:50
with the default start time, the first snapshot will be taken at 14:00.
The time zone is assumed to be UTC if none is explicitly included in the string.
An explicit time zone will be mapped to UTC at execution.
The start time must be in ISO8601 format. Examples below:
UTC: 2022-08-08T05:30:00 i.e. 5:30 AM UTC, without explicit time zone offset
IDT: 2022-08-08T09:00:00+03:00 i.e. 6:00 AM UTC
EDT: 2022-08-08T05:30:00-04:00 i.e. 9:30 AM UTC
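For example, an hourly schedule starting at the first timestamp above could be added
like this (the path is a placeholder)::

    ceph fs snap-schedule add /volumes/group/dir 1h 2022-08-08T05:30:00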
Retention specifications are identified by path and the retention spec itself. A
retention spec consists of either a number and a time period separated by a
@ -155,6 +162,11 @@ Examples::
snapshot creation is accounted for in the "created_count" field, which is a
cumulative count of the total number of snapshots created so far.
.. note:: The maximum number of snapshots to retain per directory is limited by the
   config tunable `mds_max_snaps_per_dir`. This tunable defaults to 100.
   To ensure a new snapshot can be created, one snapshot less than this will be
   retained. So by default, a maximum of 99 snapshots will be retained.
Active and inactive schedules
-----------------------------
Snapshot schedules can be added for a path that doesn't exist yet in the

View File

@ -60,6 +60,8 @@ Possible -op commands::
* meta-list
* get-osdmap
* set-osdmap
* get-superblock
* set-superblock
* get-inc-osdmap
* set-inc-osdmap
* mark-complete
@ -414,7 +416,7 @@ Options
.. option:: --op arg
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
.. option:: --epoch arg
@ -422,7 +424,7 @@ Options
.. option:: --file arg
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap or set-inc-osdmap
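A hypothetical invocation of the new superblock operations, with placeholder paths
(the OSD must be stopped, as for the other operations)::

    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --op get-superblock --file /tmp/osd.0.superblock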
.. option:: --mon-store-path arg

View File

@ -1314,7 +1314,7 @@ Subcommand ``cache-mode`` specifies the caching mode for cache tier <pool>.
Usage::
ceph osd tier cache-mode <poolname> writeback|readproxy|readonly|none
ceph osd tier cache-mode <poolname> writeback|proxy|readproxy|readonly|none
Subcommand ``remove`` removes the tier <tierpool> (the second one) from base pool
<pool> (the first one).

View File

@ -264,8 +264,8 @@ Pool specific commands
:command:`append` *name* *infile*
Append object name to the cluster with contents from infile.
:command:`rm` *name*
Remove object name.
:command:`rm` [--force-full] *name* ...
Remove the named object(s). With ``--force-full``, objects are removed even when the cluster is marked full.
:command:`listwatchers` *name*
List the watchers of object name.

View File

@ -333,7 +333,7 @@ OSD and run the following command:
ceph-bluestore-tool \
--path <data path> \
--sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
reshard

View File

@ -109,17 +109,6 @@ Async messenger options
:Default: ``3``
``ms_async_max_op_threads``
:Description: Maximum number of worker threads used by each Async Messenger instance.
Set to lower values when your machine has limited CPU count, and increase
when your CPUs are underutilized (i. e. one or more of CPUs are
constantly on 100% load during I/O operations).
:Type: 64-bit Unsigned Integer
:Required: No
:Default: ``5``
``ms_async_send_inline``
:Description: Send messages directly from the thread that generated them instead of
@ -129,5 +118,3 @@ Async messenger options
:Type: Boolean
:Required: No
:Default: ``false``

View File

@ -4,12 +4,41 @@
.. index:: pools; configuration
Ceph uses default values to determine how many placement groups (PGs) will be
assigned to each pool. We recommend overriding some of the defaults.
Specifically, we recommend setting a pool's replica size and overriding the
default number of placement groups. You can set these values when running
`pool`_ commands. You can also override the defaults by adding new ones in the
``[global]`` section of your Ceph configuration file.
The number of placement groups that the CRUSH algorithm assigns to each pool is
determined by the values of variables in the centralized configuration database
in the monitor cluster.
Both containerized deployments of Ceph (deployments made using ``cephadm`` or
Rook) and non-containerized deployments of Ceph rely on the values in the
central configuration database in the monitor cluster to assign placement
groups to pools.
Example Commands
----------------
To see the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
.. prompt:: bash
ceph config get osd osd_pool_default_pg_num
To set the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
.. prompt:: bash
ceph config set osd osd_pool_default_pg_num <value>
Manual Tuning
-------------
In some cases, it might be advisable to override some of the defaults. For
example, you might determine that it is wise to set a pool's replica size and
to override the default number of placement groups in the pool. You can set
these values when running `pool`_ commands.
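For example, with a placeholder pool name and illustrative values:

.. prompt:: bash

   ceph osd pool set mypool size 3
   ceph osd pool set mypool pg_num 128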
See Also
--------
See :ref:`pg-autoscaler`.
.. literalinclude:: pool-pg.conf

View File

@ -1404,6 +1404,31 @@ other performance issue with the OSDs.
The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
field of ``ceph pg ls -f json-detail``.
Stretch Mode
------------
INCORRECT_NUM_BUCKETS_STRETCH_MODE
__________________________________
Stretch mode currently only supports two dividing buckets containing OSDs. This warning
indicates that the number of dividing buckets is not equal to two after stretch mode has
been enabled. You can expect unpredictable failures and MON assertions until the condition
is fixed. We encourage you to fix this by removing the additional dividing buckets or by
bumping the number of dividing buckets to two.
UNEVEN_WEIGHTS_STRETCH_MODE
___________________________
The two dividing buckets must have equal weights when stretch mode is enabled.
This warning indicates that the two dividing buckets have uneven weights after
stretch mode has been enabled. This is not immediately fatal; however, you can expect
Ceph to become confused when trying to process transitions between dividing buckets.
We encourage you to fix this by making the weights equal on both dividing buckets.
This can be done by making sure the combined weight of the OSDs in each dividing
bucket is the same, as shown in the sketch below.
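A minimal sketch of evening out the weights, with placeholder OSD names and an
arbitrary weight value::

    # compare the total weights of the two dividing buckets
    ceph osd tree
    # adjust CRUSH weights until both buckets carry the same total
    ceph osd crush reweight osd.3 0.0900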
Miscellaneous
-------------

View File

@ -127,6 +127,14 @@ Options
:Type: Integer
:Default: ``65000``
``max_header_size``
:Description: The maximum number of header bytes available for a single request.
:Type: Integer
:Default: ``16384``
:Maximum: ``65536``
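If the limit needs to be raised, a hypothetical way to do so is through the frontend
configuration string, assuming ``max_header_size`` is passed like the other frontend
options (the port and value are placeholders)::

    ceph config set client.rgw rgw_frontends "beast port=8080 max_header_size=32768"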
Civetweb
========

View File

@ -55,7 +55,7 @@ download_from() {
exit
fi
url=$url_base/$fname
wget -c --no-verbose -O $fname $url
wget --no-verbose -O $fname $url
if [ $? != 0 -o ! -e $fname ]; then
echo "Download of $url failed"
elif [ $(sha256sum $fname | awk '{print $1}') != $sha256 ]; then
@ -183,8 +183,7 @@ download_boost $boost_version 4eb3b8d442b426dc35346235c8733b5ae35ba431690e38c6a8
https://boostorg.jfrog.io/artifactory/main/release/$boost_version/source \
https://downloads.sourceforge.net/project/boost/boost/$boost_version \
https://download.ceph.com/qa
download_liburing 0.7 8e2842cfe947f3a443af301bdd6d034455536c38a455c7a700d0c1ad165a7543 \
https://github.com/axboe/liburing/archive \
download_liburing 0.7 05d0cf8493d573c76b11abfcf34aabc7153affebe17ff95f9ae88b0de062a59d \
https://git.kernel.dk/cgit/liburing/snapshot
pmdk_version=1.10
download_pmdk $pmdk_version 08dafcf94db5ac13fac9139c92225d9aa5f3724ea74beee4e6ca19a01a2eb20c \

View File

@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet';
$.graphPanelSchema({},
title,
description,
'null',
'null as zero',
false,
formatY1,
'short',

View File

@ -133,7 +133,7 @@ local u = import 'utils.libsonnet';
$.graphPanelSchema({},
title,
'',
'null',
'null as zero',
false,
formatY1,
'short',

View File

@ -140,7 +140,7 @@ local u = import 'utils.libsonnet';
{},
title,
description,
'null',
'null as zero',
false,
formatY1,
formatY2,
@ -658,7 +658,7 @@ local u = import 'utils.libsonnet';
$.graphPanelSchema(aliasColors,
title,
description,
'null',
'null as zero',
false,
formatY1,
formatY2,

View File

@ -87,7 +87,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -185,7 +185,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -283,7 +283,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -400,7 +400,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -498,7 +498,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -596,7 +596,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -93,7 +93,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -186,7 +186,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -285,7 +285,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -87,7 +87,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -180,7 +180,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -266,7 +266,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -352,7 +352,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -445,7 +445,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -531,7 +531,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -636,7 +636,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -754,7 +754,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -893,7 +893,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -1000,7 +1000,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -80,7 +80,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -173,7 +173,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
@ -266,7 +266,7 @@
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null",
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,

View File

@ -518,7 +518,7 @@ groups:
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
summary: "Pool growth rate may soon exceed capacity"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"

View File

@ -1499,35 +1499,44 @@ tests:
# trigger percent full prediction on pools 1 and 2 only
- interval: 12h
input_series:
- series: 'ceph_pool_percent_used{pool_id="1"}'
values: '70 75 80 87 92'
- series: 'ceph_pool_percent_used{pool_id="2"}'
values: '22 22 23 23 24'
- series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
- series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
- series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
values: '78 89 79 98 78'
- series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
values: '22 22 23 23 24'
- series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
values: '1 1 1 1 1'
- series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: |
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
group_right ceph_pool_metadata) >= 95
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
group_right() ceph_pool_metadata) >= 95
eval_time: 36h
exp_samples:
- labels: '{name="rbd",pool_id="1",type="replicated"}'
value: 1.424E+02 # 142%
- labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
value: 1.435E+02 # 142%
alert_rule_test:
- eval_time: 48h
alertname: CephPoolGrowthWarning
exp_alerts:
- exp_labels:
name: rbd
instance: 8090
name: default.rgw.index
pool_id: 1
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
exp_annotations:
summary: Pool growth rate may soon exceed capacity
description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
- interval: 1m
input_series:
- series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'

View File

@ -3,6 +3,7 @@ overrides:
conf:
mds:
debug mds: 20
debug mds balancer: 20
debug ms: 1
mds debug frag: true
mds debug scatterstat: true

View File

@ -2,7 +2,10 @@ overrides:
ceph:
log-ignorelist:
- overall HEALTH_
- \(CEPHADM_STRAY_DAEMON\)
- \(FS_DEGRADED\)
- FS_
- \(CEPHADM_
- \(MDS_FAILED\)
- \(MDS_DEGRADED\)
- \(FS_WITH_FAILED_MDS\)
@ -10,3 +13,10 @@ overrides:
- \(MDS_ALL_DOWN\)
- \(MDS_UP_LESS_THAN_MAX\)
- \(FS_INLINE_DATA_DEPRECATED\)
- \(PG_DEGRADED\)
- Degraded data redundancy
- \(PG_
- acting
- MDS_INSUFFICIENT_STANDBY
- deprecated feature inline_data
- compat changed unexpectedly

View File

@ -2,8 +2,10 @@ overrides:
ceph:
log-ignorelist:
- overall HEALTH_
- \(OSD_DOWN\)
- \(OSD_
- OSD_DOWN
- OSD_
- but it is still running
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
- is not responding
- is down
- osds down

View File

@ -0,0 +1,6 @@
os_type: rhel
os_version: "8.5"
overrides:
selinux:
whitelist:
- scontext=system_u:system_r:logrotate_t:s0

View File

@ -0,0 +1,6 @@
os_type: rhel
os_version: "8.6"
overrides:
selinux:
whitelist:
- scontext=system_u:system_r:logrotate_t:s0

View File

@ -1 +1 @@
rhel_8.4.yaml
rhel_8.6.yaml

View File

@ -1 +0,0 @@
.qa/distros/podman/rhel_8.4_container_tools_3.0.yaml

View File

@ -1 +0,0 @@
.qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml

View File

@ -0,0 +1 @@
.qa/distros/podman/rhel_8.6_container_tools_3.0.yaml

View File

@ -0,0 +1 @@
.qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml

View File

@ -1,5 +1,5 @@
os_type: rhel
os_version: "8.4"
os_version: "8.6"
overrides:
selinux:
whitelist:

View File

@ -1,5 +1,5 @@
os_type: rhel
os_version: "8.4"
os_version: "8.6"
overrides:
selinux:
whitelist:

View File

@ -1691,6 +1691,29 @@ function test_wait_for_peered() {
#######################################################################
##
# Wait until the given cluster health condition disappears, checking for up to
# $TIMEOUT seconds.
#
# @param string to grep for in health detail
# @return 0 if the health condition disappeared,
# 1 if it is still present after $TIMEOUT seconds.
#
function wait_for_health_gone() {
local grepstr=$1
local -a delays=($(get_timeout_delays $TIMEOUT .1))
local -i loop=0
while ceph health detail | grep "$grepstr" ; do
if (( $loop >= ${#delays[*]} )) ; then
ceph health detail
return 1
fi
sleep ${delays[$loop]}
loop+=1
done
}
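# Example usage, mirroring the stretch-mode standalone tests, which clear the cause
# of a warning and then wait for it to go away:
#
#   ceph osd crush rm sham
#   wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1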
##
# Wait until the cluster has health condition passed as arg
# again for $TIMEOUT seconds.

View File

@ -0,0 +1,148 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
export BASE_CEPH_ARGS=$CEPH_ARGS
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
TEST_stretched_cluster_failover_add_three_osds(){
local dir=$1
local OSDS=8
setup $dir || return 1
run_mon $dir a --public-addr $CEPH_MON_A || return 1
wait_for_quorum 300 1 || return 1
run_mon $dir b --public-addr $CEPH_MON_B || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
wait_for_quorum 300 2 || return 1
run_mon $dir c --public-addr $CEPH_MON_C || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
wait_for_quorum 300 3 || return 1
run_mon $dir d --public-addr $CEPH_MON_D || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
wait_for_quorum 300 4 || return 1
run_mon $dir e --public-addr $CEPH_MON_E || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
wait_for_quorum 300 5 || return 1
ceph mon set election_strategy connectivity
ceph mon add disallowed_leader e
run_mgr $dir x || return 1
run_mgr $dir y || return 1
run_mgr $dir z || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
for zone in iris pze
do
ceph osd crush add-bucket $zone zone
ceph osd crush move $zone root=default
done
ceph osd crush add-bucket node-2 host
ceph osd crush add-bucket node-3 host
ceph osd crush add-bucket node-4 host
ceph osd crush add-bucket node-5 host
ceph osd crush move node-2 zone=iris
ceph osd crush move node-3 zone=iris
ceph osd crush move node-4 zone=pze
ceph osd crush move node-5 zone=pze
ceph osd crush move osd.0 host=node-2
ceph osd crush move osd.1 host=node-2
ceph osd crush move osd.2 host=node-3
ceph osd crush move osd.3 host=node-3
ceph osd crush move osd.4 host=node-4
ceph osd crush move osd.5 host=node-4
ceph osd crush move osd.6 host=node-5
ceph osd crush move osd.7 host=node-5
ceph mon set_location a zone=iris host=node-2
ceph mon set_location b zone=iris host=node-3
ceph mon set_location c zone=pze host=node-4
ceph mon set_location d zone=pze host=node-5
hostname=$(hostname -s)
ceph osd crush remove $hostname || return 1
ceph osd getcrushmap > crushmap || return 1
crushtool --decompile crushmap > crushmap.txt || return 1
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
cat >> crushmap_modified.txt << EOF
rule stretch_rule {
id 1
type replicated
min_size 1
max_size 10
step take iris
step chooseleaf firstn 2 type host
step emit
step take pze
step chooseleaf firstn 2 type host
step emit
}
# end crush map
EOF
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
ceph osd setcrushmap -i crushmap.bin || return 1
local stretched_poolname=stretched_rbdpool
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
ceph osd pool set $stretched_poolname size 4 || return 1
sleep 3
ceph mon set_location e zone=arbiter host=node-1
ceph mon enable_stretch_mode e stretch_rule zone
kill_daemons $dir KILL mon.c || return 1
kill_daemons $dir KILL mon.d || return 1
kill_daemons $dir KILL osd.4 || return 1
kill_daemons $dir KILL osd.5 || return 1
kill_daemons $dir KILL osd.6 || return 1
kill_daemons $dir KILL osd.7 || return 1
ceph -s
sleep 3
run_osd $dir 8 || return 1
run_osd $dir 9 || return 1
run_osd $dir 10 || return 1
ceph -s
sleep 3
teardown $dir || return 1
}
main mon-stretch-fail-recovery "$@"

View File

@ -0,0 +1,145 @@
#!/usr/bin/env bash
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
export BASE_CEPH_ARGS=$CEPH_ARGS
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
TEST_stretched_cluster_uneven_weight() {
local dir=$1
local OSDS=4
local weight=0.09000
setup $dir || return 1
run_mon $dir a --public-addr $CEPH_MON_A || return 1
wait_for_quorum 300 1 || return 1
run_mon $dir b --public-addr $CEPH_MON_B || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
wait_for_quorum 300 2 || return 1
run_mon $dir c --public-addr $CEPH_MON_C || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
wait_for_quorum 300 3 || return 1
run_mon $dir d --public-addr $CEPH_MON_D || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
wait_for_quorum 300 4 || return 1
run_mon $dir e --public-addr $CEPH_MON_E || return 1
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
wait_for_quorum 300 5 || return 1
ceph mon set election_strategy connectivity
ceph mon add disallowed_leader e
run_mgr $dir x || return 1
run_mgr $dir y || return 1
run_mgr $dir z || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
for zone in iris pze
do
ceph osd crush add-bucket $zone zone
ceph osd crush move $zone root=default
done
ceph osd crush add-bucket node-2 host
ceph osd crush add-bucket node-3 host
ceph osd crush add-bucket node-4 host
ceph osd crush add-bucket node-5 host
ceph osd crush move node-2 zone=iris
ceph osd crush move node-3 zone=iris
ceph osd crush move node-4 zone=pze
ceph osd crush move node-5 zone=pze
ceph osd crush move osd.0 host=node-2
ceph osd crush move osd.1 host=node-3
ceph osd crush move osd.2 host=node-4
ceph osd crush move osd.3 host=node-5
ceph mon set_location a zone=iris host=node-2
ceph mon set_location b zone=iris host=node-3
ceph mon set_location c zone=pze host=node-4
ceph mon set_location d zone=pze host=node-5
hostname=$(hostname -s)
ceph osd crush remove $hostname || return 1
ceph osd getcrushmap > crushmap || return 1
crushtool --decompile crushmap > crushmap.txt || return 1
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
cat >> crushmap_modified.txt << EOF
rule stretch_rule {
id 1
type replicated
min_size 1
max_size 10
step take iris
step chooseleaf firstn 2 type host
step emit
step take pze
step chooseleaf firstn 2 type host
step emit
}
# end crush map
EOF
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
ceph osd setcrushmap -i crushmap.bin || return 1
local stretched_poolname=stretched_rbdpool
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
ceph osd pool set $stretched_poolname size 4 || return 1
ceph mon set_location e zone=arbiter host=node-1 || return 1
ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter stretch mode
# reweight to a more round decimal.
ceph osd crush reweight osd.0 $weight
ceph osd crush reweight osd.1 $weight
ceph osd crush reweight osd.2 $weight
ceph osd crush reweight osd.3 $weight
# Firstly, we test for stretch mode buckets != 2
ceph osd crush add-bucket sham zone || return 1
ceph osd crush move sham root=default || return 1
wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
ceph osd crush rm sham # clear the health warn
wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
# Next, we test for uneven weights across buckets
ceph osd crush reweight osd.0 0.07000
wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
ceph osd crush reweight osd.0 $weight # clear the health warn
wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
teardown $dir || return 1
}
main mon-stretched-cluster-uneven-weight "$@"

View File

@ -19,6 +19,7 @@ overrides:
- MDS_READ_ONLY
- force file system read-only
- with standby daemon mds
- MDS abort because newly corrupt dentry
tasks:
- cephfs_test_runner:
modules:

View File

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
ceph:
conf:
client:
client_shutdown_timeout: 120

View File

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
ceph:
conf:
client:
client_shutdown_timeout: 120

View File

@ -0,0 +1,13 @@
tasks:
- check-counter:
counters:
mgr:
- name: "finisher-volumes.complete_latency.avgcount"
min: 4
- name: "finisher-volumes.queue_len"
expected_val: 0
- cephfs_test_runner:
fail_on_skip: false
modules:
- tasks.cephfs.test_volumes.TestPerModuleFinsherThread

View File

@ -0,0 +1 @@
.qa/objectstore/bluestore-bitmap.yaml

View File

@ -0,0 +1,7 @@
overrides:
ceph:
conf:
global:
ms die on skipped message: false
client:
rbd default features: 37

View File

@ -0,0 +1 @@
../.qa/

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=crc,rxbounce

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=crc

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=legacy,rxbounce

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=legacy

View File

@ -0,0 +1,5 @@
overrides:
ceph:
conf:
client:
rbd default map options: ms_mode=secure

View File

@ -0,0 +1 @@
../.qa/

View File

@ -0,0 +1 @@
../.qa/

View File

@ -2,6 +2,7 @@ overrides:
ceph:
conf:
global:
mon warn on pool no app: false
ms die on skipped message: false
client:
rbd default features: 37

View File

@ -0,0 +1,19 @@
overrides:
ceph:
conf:
global:
osd pool default size: 1
osd:
osd shutdown pgref assert: true
roles:
- [mon.a, mgr.x, osd.0, client.0]
tasks:
- install:
extra_system_packages:
- fio
- ceph:
- workunit:
clients:
all:
- rbd/krbd_watch_errors.sh

View File

@ -1,3 +1,28 @@
overrides:
ceph:
log-ignorelist:
- \(HOST_IN_MAINTENANCE\)
- \(OSD_DOWN\)
- \(MON_DOWN\)
- down
- overall HEALTH_
- \(CEPHADM_STRAY_DAEMON\)
- stray daemon
- \(FS_DEGRADED\)
- \(MDS_FAILED\)
- \(MDS_DEGRADED\)
- \(FS_WITH_FAILED_MDS\)
- \(MDS_DAMAGE\)
- \(MDS_ALL_DOWN\)
- \(MDS_UP_LESS_THAN_MAX\)
- \(FS_INLINE_DATA_DEPRECATED\)
- \(PG_DEGRADED\)
- Degraded data redundancy
- \(PG_
- acting
- MDS_INSUFFICIENT_STANDBY
- deprecated feature inline_data
- compat changed unexpectedly
roles:
# 3 osd roles on host.a are required for the cephadm task. It checks if the cluster is healthy.
# More daemons will be deployed on both hosts in e2e tests.

View File

@ -24,6 +24,21 @@ openstack:
size: 10 # GB
overrides:
ceph:
log-ignorelist:
- slow requests
- \(PG_
- PG_
- \(CEPHADM_STRAY_DAEMON\)
- slow request
- \(MDS_
- MDS_
- osds down
- OSD_
- \(OSD_
- client
- FS_
- \(FS_
- degraded
conf:
osd:
osd shutdown pgref assert: true

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,11 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(PG_
- but it is still running
- \(CEPHADM_STRAY_DAEMON\)
tasks:
- cephadm.shell:
host.a:

View File

@ -1,3 +1,11 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(PG_AVAILABILITY\)
- mon down
- mons down
- out of quorum
tasks:
- cephadm:
conf:

View File

@ -3,6 +3,23 @@ overrides:
log-ignorelist:
- but it is still running
- objects unfound and apparently lost
- \(MON_DOWN\)
- \(OSDMAP_FLAGS\)
- flag\(s\) set
- \(CACHE_POOL_NO_HIT_SET\)
- \(CACHE_
- \(PG_
- \(OSD_
- \(POOL_
- \(CEPHADM_STRAY_DAEMON\)
- PG_
- CACHE_
- degraded
- backfill
- mons down
- OSD_
- is down
- acting
conf:
osd:
osd debug reject backfill probability: .3

View File

@ -1,3 +1,14 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(PG_
- mons down
- pg inactive
- out of quorum
- \(OSD_
- osds down
- osd down
tasks:
- cephadm.shell:
env: [sha1]

View File

@ -1,3 +1,9 @@
overrides:
ceph:
log-ignorelist:
- Replacing daemon mds
- FS_DEGRADED
- \(CEPHADM_STRAY_DAEMON\)
roles:
- - host.a
- osd.0

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- \(OSD_DOWN\)
- \(CEPHADM_PAUSED\)
- mons down
roles:
- - host.a
- osd.0

View File

@ -1,3 +1,10 @@
overrides:
ceph:
log-ignorelist:
- \(MON_DOWN\)
- mons down
- \(MGR_DOWN\)
- out of quorum
roles:
- - host.a
- osd.0

View File

@ -11,6 +11,15 @@ overrides:
- \(POOL_APP_NOT_ENABLED\)
- \(PG_AVAILABILITY\)
- \(PG_DEGRADED\)
- \(MON_DOWN\)
- \(CEPHADM_STRAY_DAEMON\)
- missing hit_sets
- do not have an application enabled
- application not enabled on pool
- pool application
- mons down
- out of quorum
- needs hit_set_type to be set but it is not
conf:
client:
debug ms: 1

View File

@ -2,6 +2,7 @@ overrides:
ceph:
log-ignorelist:
- \(PG_AVAILABILITY\)
- \(POOL_APP_NOT_ENABLED\)
conf:
osd:
osd_class_load_list: "*"

View File

@ -8,6 +8,13 @@ overrides:
- \(OSD_
- \(OBJECT_
- \(POOL_APP_NOT_ENABLED\)
- \(MON_DOWN\)
- mons down
- application not enabled on pool
- do not have an application enabled
- pool application
- out of quorum
- needs hit_set_type to be set but it is not
tasks:
- workunit:
clients:

View File

@ -0,0 +1,43 @@
tasks:
- install:
- ceph:
wait-for-scrub: false
- check-counter:
counters:
mgr:
- name: "finisher-balancer.complete_latency.avgcount"
min: 1
- name: "finisher-balancer.queue_len"
expected_val: 0
- name: "finisher-crash.complete_latency.avgcount"
min: 2
- name: "finisher-crash.queue_len"
expected_val: 0
- name: "finisher-devicehealth.complete_latency.avgcount"
min: 1
- name: "finisher-devicehealth.queue_len"
expected_val: 0
- name: "finisher-iostat.complete_latency.avgcount"
min: 1
- name: "finisher-iostat.queue_len"
expected_val: 0
- name: "finisher-pg_autoscaler.complete_latency.avgcount"
min: 1
- name: "finisher-pg_autoscaler.queue_len"
expected_val: 0
- name: "finisher-progress.complete_latency.avgcount"
min: 2
- name: "finisher-progress.queue_len"
expected_val: 0
- name: "finisher-status.complete_latency.avgcount"
min: 2
- name: "finisher-status.queue_len"
expected_val: 0
- name: "finisher-telemetry.complete_latency.avgcount"
min: 1
- name: "finisher-telemetry.queue_len"
expected_val: 0
- workunit:
clients:
client.0:
- mgr/test_per_module_finisher.sh

View File

@ -13,4 +13,4 @@ tasks:
- workunit:
clients:
client.0:
- mgr
- mgr/test_localpool.sh

View File

@ -0,0 +1,18 @@
roles:
- - mon.a
- mgr.x
- osd.0
- osd.1
- osd.2
- client.0
openstack:
- volumes: # attached to each instance
count: 3
size: 10 # GB
tasks:
- install:
- workunit:
basedir: qa/standalone
clients:
all:
- mon-stretch

View File

@ -4,6 +4,8 @@ overrides:
osd:
osd_class_load_list: "*"
osd_class_default_list: "*"
log-ignorelist:
- \(POOL_APP_NOT_ENABLED\)
tasks:
- workunit:
clients:

View File

@ -0,0 +1,13 @@
overrides:
ceph:
conf:
mgr:
debug rbd: 20
tasks:
- install:
extra_system_packages:
- fio
- workunit:
clients:
client.0:
- rbd/rbd_support_module_recovery.sh

View File

@ -0,0 +1,5 @@
tasks:
- workunit:
clients:
client.0:
- rgw/run-bucket-check.sh

View File

@ -6,7 +6,7 @@ workload:
- sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-nautilus
rgw_server: client.1
stages: prepare
- print: "**** done rgw ragweed prepare 2-workload"

View File

@ -5,7 +5,7 @@ rgw-final-workload:
full_sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-nautilus
rgw_server: client.1
stages: check
- print: "**** done ragweed check 4-final-workload"

View File

@ -5,7 +5,7 @@ rgw-final-workload:
full_sequential:
- ragweed:
client.1:
default-branch: ceph-pacific
default-branch: ceph-octopus
rgw_server: client.1
stages: check
- print: "**** done ragweed check 4-final-workload"

View File

@ -123,7 +123,7 @@ workload_pacific:
- rados/test.sh
- cls
env:
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done rados/test.sh & cls workload_pacific"
- sequential:
- rgw: [client.0]

View File

@ -7,4 +7,6 @@ stress-tasks:
clients:
client.0:
- cls/test_cls_rbd.sh
env:
CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done cls/test_cls_rbd.sh 4-workload"

View File

@ -3,7 +3,7 @@ meta:
librbd python api tests
tasks:
- workunit:
tag: v16.2.7
branch: pacific
clients:
client.0:
- rbd/test_librbd_python.sh

View File

@ -232,6 +232,7 @@ class OSDThrasher(Thrasher):
self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
self.random_eio = self.config.get('random_eio')
self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3)
num_osds = self.in_osds + self.out_osds
self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)
@ -798,6 +799,19 @@ class OSDThrasher(Thrasher):
else:
self.cancel_force_recovery()
def reset_purged_snaps_last(self):
"""
Run reset_purged_snaps_last
"""
self.log('reset_purged_snaps_last')
for osd in self.in_osds:
try:
self.ceph_manager.raw_cluster_cmd(
'tell', "osd.%s" % (str(osd)),
'reset_purged_snaps_last')
except CommandFailedError:
self.log('Failed to reset_purged_snaps_last, ignoring')
def all_up(self):
"""
Make sure all osds are up and not out.
@ -1248,6 +1262,8 @@ class OSDThrasher(Thrasher):
actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
if self.chance_force_recovery > 0:
actions.append((self.force_cancel_recovery, self.chance_force_recovery))
if self.chance_reset_purged_snaps_last > 0:
actions.append((self.reset_purged_snaps_last, self.chance_reset_purged_snaps_last))
for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
for scenario in [

View File

@ -2,6 +2,8 @@
# make logging friendly to teuthology
log_to_file = true
log_to_stderr = false
log to journald = false
mon cluster log to file = true
mon cluster log file level = debug
mon clock drift allowed = 1.000

View File

@ -811,7 +811,7 @@ class CephFSMount(object):
))
p.wait()
def open_background(self, basename="background_file", write=True):
def open_background(self, basename="background_file", write=True, content="content"):
"""
Open a file for writing, then block such that the client
will hold a capability.
@ -828,12 +828,11 @@ class CephFSMount(object):
import time
with open("{path}", 'w') as f:
f.write('content')
f.write("{content}")
f.flush()
f.write('content2')
while True:
time.sleep(1)
""").format(path=path)
""").format(path=path, content=content)
else:
pyscript = dedent("""
import time
@ -849,7 +848,10 @@ class CephFSMount(object):
# This wait would not be sufficient if the file had already
# existed, but it's simple and in practice users of open_background
# are not using it on existing files.
self.wait_for_visible(basename)
if write:
self.wait_for_visible(basename, size=len(content))
else:
self.wait_for_visible(basename)
return rproc
@ -887,19 +889,27 @@ class CephFSMount(object):
if nr_links == 2:
return
def wait_for_visible(self, basename="background_file", timeout=30):
def wait_for_visible(self, basename="background_file", size=None, timeout=30):
i = 0
args = ['stat']
if size is not None:
args += ['--printf=%s']
args += [os.path.join(self.hostfs_mntpt, basename)]
while i < timeout:
r = self.client_remote.run(args=[
'stat', os.path.join(self.hostfs_mntpt, basename)
], check_status=False)
if r.exitstatus == 0:
log.debug("File {0} became visible from {1} after {2}s".format(
basename, self.client_id, i))
return
else:
time.sleep(1)
i += 1
p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False)
if p.exitstatus == 0:
if size is not None:
s = p.stdout.getvalue().strip()
if int(s) == size:
log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s")
return
else:
log.error(f"File {basename} became visible but with size {int(s)} not {size}")
else:
log.info(f"File {basename} became visible from {self.client_id} after {i}s")
return
time.sleep(1)
i += 1
raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
i, basename, self.client_id))

View File

@ -1,6 +1,8 @@
"""
Before running this testsuite, add path to cephfs-shell module to $PATH and
export $PATH.
NOTE: For running these tests locally (using vstart_runner.py), export the
path to the src/tools/cephfs/shell/cephfs-shell module to $PATH. Running
"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir
will update the environment without the hassle of typing the path correctly.
"""
from io import StringIO
from os import path

View File

@ -9,7 +9,9 @@ from textwrap import dedent
from tasks.ceph_test_case import TestTimeoutError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
from tasks.cephfs.fuse_mount import FuseMount
from teuthology.exceptions import CommandFailedError
import os
from io import StringIO
log = logging.getLogger(__name__)
@ -157,29 +159,49 @@ class TestClientLimits(CephFSTestCase):
a fraction of second (0.5) by default when throttling condition is met.
"""
max_caps_per_client = 500
cap_acquisition_throttle = 250
subdir_count = 4
files_per_dir = 25
self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
# throttle in a way so that two dir reads are already hitting it.
throttle_value = (files_per_dir * 3) // 2
# Create 1500 files split across 6 directories, 250 each.
for i in range(1, 7):
self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
# activate throttling logic by setting max per client to a low value
self.config_set('mds', 'mds_max_caps_per_client', 1)
self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value)
# Create files split across {subdir_count} directories, {per_dir_count} in each dir
for i in range(1, subdir_count+1):
self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True)
mount_a_client_id = self.mount_a.get_global_id()
# recursive readdir
self.mount_a.run_shell_payload("find | wc")
# validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
# recursive readdir. macOS wants an explicit directory for `find`.
proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO())
# return code may be None if the command got interrupted
self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue())
# validate that the throttle condition was hit at least once
cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)
# validate cap_acquisition decay counter after readdir to NOT exceed the throttle value
# plus one batch that could have been taken immediately before querying
# assuming the batch is equal to the per dir file count.
cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value)
# make sure that the throttle was reported in the events
def historic_ops_have_event(expected_event):
ops_dump = self.fs.rank_tell(['dump_historic_ops'])
# reverse the events and the ops assuming that later ops would be throttled
for op in reversed(ops_dump['ops']):
for ev in reversed(op.get('type_data', {}).get('events', [])):
if ev['event'] == expected_event:
return True
return False
self.assertTrue(historic_ops_have_event('cap_acquisition_throttle'))
def test_client_release_bug(self):
"""
When a client has a bug (which we will simulate) preventing it from releasing caps,
@ -219,6 +241,55 @@ class TestClientLimits(CephFSTestCase):
self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
rproc.wait()
def test_client_blocklisted_oldest_tid(self):
"""
that a client is blocklisted when its encoded session metadata exceeds the
configured threshold (due to an ever-growing `completed_requests` list caused
by an unidentified bug in the client or the MDS).
"""
# num of requests client issues
max_requests = 10000
# The debug hook to inject the failure only exists in the fuse client
if not isinstance(self.mount_a, FuseMount):
self.skipTest("Require FUSE client to inject client release failure")
self.config_set('client', 'client inject fixed oldest tid', 'true')
self.mount_a.teardown()
self.mount_a.mount_wait()
self.config_set('mds', 'mds_max_completed_requests', max_requests);
# Create lots of files
self.mount_a.create_n_files("testdir/file1", max_requests + 100)
# Create a few files synchronously. This makes sure previous requests are completed
self.mount_a.create_n_files("testdir/file2", 5, True)
# Wait for the health warnings. Assume the MDS can handle at least 10 requests per second
self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
# set the threshold low so that it has a high probability of
# hitting.
self.config_set('mds', 'mds_session_metadata_threshold', 5000)
# Create a large number of files synchronously. This should hit the session metadata threshold
# causing the client to get blocklisted.
with self.assertRaises(CommandFailedError):
self.mount_a.create_n_files("testdir/file2", 100000, True)
# the client address should now be on the OSD blocklist
self.assertTrue(self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()))
# the mds should bump up the relevant perf counter
pd = self.perf_dump()
self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0)
# reset the config
self.config_set('client', 'client inject fixed oldest tid', 'false')
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
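# Hedged sketch of checking the blocklist outside the test harness, mirroring the
# is_addr_blocklisted() assertion above; it assumes the Pacific-era
# `ceph osd blocklist ls` command and an address of the form "1.2.3.4:0/123456".
import subprocess

def is_blocklisted(addr: str) -> bool:
    out = subprocess.check_output(["ceph", "osd", "blocklist", "ls"], text=True)
    # each entry line starts with the blocklisted address followed by its expiry
    return any(line.split()[0].startswith(addr)
               for line in out.splitlines() if line.strip())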
def test_client_oldest_tid(self):
"""
When a client does not advance its oldest tid, the MDS should notice that

View File

@ -10,8 +10,10 @@ from textwrap import dedent
import time
import distutils.version as version
import re
import string
import os
from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
@ -221,8 +223,10 @@ class TestClientRecovery(CephFSTestCase):
# Capability release from stale session
# =====================================
if write:
cap_holder = self.mount_a.open_background()
content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
cap_holder = self.mount_a.open_background(content=content)
else:
content = ''
self.mount_a.run_shell(["touch", "background_file"])
self.mount_a.umount_wait()
self.mount_a.mount_wait()
@ -233,7 +237,7 @@ class TestClientRecovery(CephFSTestCase):
# Wait for the file to be visible from another client, indicating
# that mount_a has completed its network ops
self.mount_b.wait_for_visible()
self.mount_b.wait_for_visible(size=len(content))
# Simulate client death
self.mount_a.suspend_netns()
@ -264,11 +268,9 @@ class TestClientRecovery(CephFSTestCase):
"Capability handover took {0}, expected approx {1}".format(
cap_waited, session_timeout
))
self.mount_a._kill_background(cap_holder)
finally:
# teardown() doesn't quite handle this case cleanly, so help it out
self.mount_a.resume_netns()
self.mount_a.resume_netns() # allow the mount to recover, otherwise the background proc is unkillable
self.mount_a._kill_background(cap_holder)
def test_stale_read_caps(self):
self._test_stale_caps(False)
@ -319,9 +321,9 @@ class TestClientRecovery(CephFSTestCase):
cap_waited, session_timeout / 2.0
))
self.mount_a._kill_background(cap_holder)
finally:
self.mount_a.resume_netns()
self.mount_a.resume_netns() # allow the mount to recover, otherwise the background proc is unkillable
self.mount_a._kill_background(cap_holder)
def test_trim_caps(self):
# Trim capability when reconnecting MDS
@ -387,7 +389,6 @@ class TestClientRecovery(CephFSTestCase):
self.mount_b.check_filelock(do_flock=flockable)
# Tear down the background process
self.mount_a._kill_background(lock_holder)
def test_filelock_eviction(self):
@ -416,7 +417,6 @@ class TestClientRecovery(CephFSTestCase):
# succeed
self.wait_until_true(lambda: lock_taker.finished, timeout=10)
finally:
# Tear down the background process
self.mount_a._kill_background(lock_holder)
# teardown() doesn't quite handle this case cleanly, so help it out
@ -751,24 +751,27 @@ class TestClientOnLaggyOSD(CephFSTestCase):
# it takes time for laggy-client entries to appear in the cluster log;
# wait up to 6 minutes for them to become visible, and finally restart
# the client
tries = 6
while True:
try:
with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except AssertionError:
tries -= 1
if tries:
continue
raise
with contextutil.safe_while(sleep=5, tries=6) as proceed:
while proceed():
try:
with self.assert_cluster_log("1 client(s) laggy due to"
" laggy OSDs",
timeout=55):
# make sure clients weren't evicted
self.assert_session_count(2)
break
except (AssertionError, CommandFailedError) as e:
log.debug(f'{e}, retrying')
# clear lagginess, expect to get the warning cleared and make sure
# client gets evicted
self.clear_laggy_params(osd)
self.wait_for_health_clear(60)
self.assert_session_count(1)
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
self.mount_a.create_destroy()
self.clear_laggy_params(osd)
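# Illustrative sketch of the retry idiom adopted above: teuthology's safe_while()
# yields a callable that sleeps between attempts and raises MaxWhileTries once the
# tries are exhausted; condition_met() is a hypothetical placeholder check.
from teuthology import contextutil

def wait_for_condition(condition_met):
    with contextutil.safe_while(sleep=5, tries=6) as proceed:
        while proceed():
            if condition_met():
                break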
def test_client_eviction_if_config_is_unset(self):
"""
@ -800,6 +803,11 @@ class TestClientOnLaggyOSD(CephFSTestCase):
time.sleep(session_timeout)
self.assert_session_count(1)
# make sure warning wasn't seen in cluster log
with self.assert_cluster_log("laggy due to laggy OSDs",
timeout=120, present=False):
pass
finally:
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()

View File

@ -608,8 +608,9 @@ class TestDamage(CephFSTestCase):
self.fs.flush()
self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
time.sleep(5) # for conf to percolate
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])
@ -642,9 +643,10 @@ class TestDamage(CephFSTestCase):
rank0 = self.fs.get_rank()
self.fs.rank_freeze(True, rank=0)
# so now we want to trigger commit but this will crash, so:
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
self.fs.rank_freeze(False, rank=0)
self.delete_mds_coredump(rank0['name'])

View File

@ -14,9 +14,12 @@ class TestClusterAffinity(CephFSTestCase):
CLIENTS_REQUIRED = 0
MDSS_REQUIRED = 4
def _verify_join_fs(self, target, status=None):
def _verify_join_fs(self, target, status=None, fs=None):
fs_select = fs
if fs_select is None:
fs_select = self.fs
if status is None:
status = self.fs.wait_for_daemons(timeout=30)
status = fs_select.wait_for_daemons(timeout=30)
log.debug("%s", status)
target = sorted(target, key=operator.itemgetter('name'))
log.info("target = %s", target)
@ -37,11 +40,14 @@ class TestClusterAffinity(CephFSTestCase):
return
self.fail("no entity")
def _verify_init(self):
status = self.fs.status()
def _verify_init(self, fs=None):
fs_select = fs
if fs_select is None:
fs_select = self.fs
status = fs_select.status()
log.info("status = {0}".format(status))
target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()]
self._verify_join_fs(target, status=status)
self._verify_join_fs(target, status=status, fs=fs_select)
return (status, target)
def _reach_target(self, target):
@ -109,12 +115,21 @@ class TestClusterAffinity(CephFSTestCase):
fs2 = self.mds_cluster.newfs(name="cephfs2")
status, target = self._verify_init()
active = self.fs.get_active_names(status=status)[0]
status2, _ = self._verify_init(fs=fs2)
active2 = fs2.get_active_names(status=status2)[0]
standbys = [info['name'] for info in status.get_standbys()]
victim = standbys.pop()
# Set a bogus fs on the others
for mds in standbys:
self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
self._change_target_state(target, mds, {'join_fscid': fs2.id})
# The active MDS for cephfs2 will be replaced by the MDS for which
# file system affinity has been set. Also, set the affinity for
# the earlier active MDS so that it is not chosen by the monitors
# as an active MDS for the existing file system.
log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})')
self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2')
self._change_target_state(target, active2, {'join_fscid': fs2.id})
self.fs.rank_fail()
self._change_target_state(target, victim, {'state': 'up:active'})
self._reach_target(target)
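# Hedged sketch of the equivalent CLI step for giving an MDS daemon affinity to a
# file system; the daemon name and fs name below are placeholders. The monitors
# then prefer daemons whose mds_join_fs matches when assigning ranks for that fs.
import subprocess

def set_mds_affinity(mds_name: str, fs_name: str) -> None:
    subprocess.check_call(
        ["ceph", "config", "set", f"mds.{mds_name}", "mds_join_fs", fs_name])

# e.g. set_mds_affinity("b", "cephfs2") mirrors the config_set() calls above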

Some files were not shown because too many files have changed in this diff.