Mirror of https://git.proxmox.com/git/ceph.git (synced 2025-04-30 12:16:12 +00:00)

commit 47fdce5df8, parent ca55da0300

    import ceph pacific 16.2.15 source

    Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
 # remove cmake/modules/FindPython* once 3.12 is required
 
 project(ceph
-  VERSION 16.2.14
+  VERSION 16.2.15
   LANGUAGES CXX C ASM)
 
 foreach(policy
@@ -32,6 +32,29 @@
 in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
 the restored file system is expected to have the same ID as before.
 
+>=16.2.15
+----------
+* `ceph config dump --format <json|xml>` output will display the localized
+  option names instead of its normalized version. For e.g.,
+  "mgr/prometheus/x/server_port" will be displayed instead of
+  "mgr/prometheus/server_port". This matches the output of the non pretty-print
+  formatted version of the command.
+
+* CEPHFS: MDS evicts clients which are not advancing their request tids which causes
+  a large buildup of session metadata resulting in the MDS going read-only due to
+  the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold`
+  config controls the maximum size that a (encoded) session metadata can grow.
+
+* RADOS: `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated
+  due to being prone to false negative results. It's safer replacement is
+  `pool_is_in_selfmanaged_snaps_mode`.
+
+* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in
+  fast-diff mode (`whole_object == true` with `fast-diff` image feature enabled
+  and valid), diff-iterate is now guaranteed to execute locally if exclusive
+  lock is available. This brings a dramatic performance improvement for QEMU
+  live disk synchronization and backup use cases.
+
 >= 16.2.14
 ----------
 
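For context on the notes above, the new behaviour can be exercised roughly as follows; the threshold value is an arbitrary illustration, not a recommendation from this release::

    # Localized option names such as mgr/prometheus/x/server_port now also
    # appear in the structured output of "ceph config dump".
    ceph config dump --format json-pretty

    # Cap the encoded per-session MDS metadata size (example value only).
    ceph config set mds mds_session_metadata_threshold 16777216
    ceph config get mds mds_session_metadata_threshold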
@@ -132,6 +155,10 @@
 * CEPHFS: After recovering a Ceph File System post following the disaster recovery
   procedure, the recovered files under `lost+found` directory can now be deleted.
 * core: cache-tiering is now deprecated.
+* mgr/snap_schedule: The snap-schedule mgr module now retains one less snapshot
+  than the number mentioned against the config tunable `mds_max_snaps_per_dir`
+  so that a new snapshot can be created and retained during the next schedule
+  run.
 
 >=16.2.8
 --------
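The snap-schedule change above is tied to the per-directory snapshot cap; a quick way to check the cap it works against (assuming default settings) is::

    # Defaults to 100; the module now retains one snapshot fewer than this.
    ceph config get mds mds_max_snaps_per_dir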
@@ -1,4 +1,4 @@
-Sphinx == 4.4.0
+Sphinx == 5.0.2
 git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
 breathe >= 4.20.0
 Jinja2
@@ -135,7 +135,7 @@
 # main package definition
 #################################################################################
 Name: ceph
-Version: 16.2.14
+Version: 16.2.15
 Release: 0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch: 2
@@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
 Group: System/Filesystems
 %endif
 URL: http://ceph.com/
-Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
+Source0: %{?_remote_tarball_prefix}ceph-16.2.15.tar.bz2
 %if 0%{?suse_version}
 # _insert_obs_source_lines_here
 ExclusiveArch: x86_64 aarch64 ppc64le s390x
@@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-16.2.14
+%autosetup -p1 -n ceph-16.2.15
 
 %build
 # Disable lto on systems that do not support symver attribute
@@ -1,7 +1,13 @@
-ceph (16.2.14-1focal) focal; urgency=medium
+ceph (16.2.15-1focal) focal; urgency=medium
 
 
- -- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com> Tue, 29 Aug 2023 16:38:35 +0000
+ -- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Mon, 26 Feb 2024 19:34:01 +0000
 
+ceph (16.2.15-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.io> Mon, 26 Feb 2024 19:21:07 +0000
+
 ceph (16.2.14-1) stable; urgency=medium
 
@@ -56,12 +56,13 @@ function(build_rocksdb)
   endif()
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-Wno-deprecated-copy" HAS_WARNING_DEPRECATED_COPY)
+  set(rocksdb_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   if(HAS_WARNING_DEPRECATED_COPY)
-    set(rocksdb_CXX_FLAGS -Wno-deprecated-copy)
+    string(APPEND rocksdb_CXX_FLAGS " -Wno-deprecated-copy")
   endif()
   check_cxx_compiler_flag("-Wno-pessimizing-move" HAS_WARNING_PESSIMIZING_MOVE)
   if(HAS_WARNING_PESSIMIZING_MOVE)
-    set(rocksdb_CXX_FLAGS "${rocksdb_CXX_FLAGS} -Wno-pessimizing-move")
+    string(APPEND rocksdb_CXX_FLAGS " -Wno-pessimizing-move")
   endif()
   if(rocksdb_CXX_FLAGS)
     list(APPEND rocksdb_CMAKE_ARGS -DCMAKE_CXX_FLAGS='${rocksdb_CXX_FLAGS}')
@@ -15,7 +15,7 @@ creation of multiple file systems use ``ceph fs flag set enable_multiple true``.
 
 ::
 
-    fs new <file system name> <metadata pool name> <data pool name>
+    ceph fs new <file system name> <metadata pool name> <data pool name>
 
 This command creates a new file system. The file system name and metadata pool
 name are self-explanatory. The specified data pool is the default data pool and
@@ -25,13 +25,13 @@ to accommodate the new file system.
 
 ::
 
-    fs ls
+    ceph fs ls
 
 List all file systems by name.
 
 ::
 
-    fs dump [epoch]
+    ceph fs dump [epoch]
 
 This dumps the FSMap at the given epoch (default: current) which includes all
 file system settings, MDS daemons and the ranks they hold, and the list of
@@ -40,7 +40,7 @@ standby MDS daemons.
 
 ::
 
-    fs rm <file system name> [--yes-i-really-mean-it]
+    ceph fs rm <file system name> [--yes-i-really-mean-it]
 
 Destroy a CephFS file system. This wipes information about the state of the
 file system from the FSMap. The metadata pool and data pools are untouched and
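As an illustration of the renamed commands in the hunks above, a minimal session might look like this (pool and file system names are invented for the example)::

    # Create the pools, then the file system that uses them.
    ceph osd pool create cephfs_metadata
    ceph osd pool create cephfs_data
    ceph fs new cephfs cephfs_metadata cephfs_data

    # Confirm the file system exists and inspect the FSMap.
    ceph fs ls
    ceph fs dump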
@@ -48,28 +48,28 @@ must be destroyed separately.
 
 ::
 
-    fs get <file system name>
+    ceph fs get <file system name>
 
 Get information about the named file system, including settings and ranks. This
-is a subset of the same information from the ``fs dump`` command.
+is a subset of the same information from the ``ceph fs dump`` command.
 
 ::
 
-    fs set <file system name> <var> <val>
+    ceph fs set <file system name> <var> <val>
 
 Change a setting on a file system. These settings are specific to the named
 file system and do not affect other file systems.
 
 ::
 
-    fs add_data_pool <file system name> <pool name/id>
+    ceph fs add_data_pool <file system name> <pool name/id>
 
 Add a data pool to the file system. This pool can be used for file layouts
 as an alternate location to store file data.
 
 ::
 
-    fs rm_data_pool <file system name> <pool name/id>
+    ceph fs rm_data_pool <file system name> <pool name/id>
 
 This command removes the specified pool from the list of data pools for the
 file system. If any files have layouts for the removed data pool, the file
@@ -82,7 +82,7 @@ Settings
 
 ::
 
-    fs set <fs name> max_file_size <size in bytes>
+    ceph fs set <fs name> max_file_size <size in bytes>
 
 CephFS has a configurable maximum file size, and it's 1TB by default.
 You may wish to set this limit higher if you expect to store large files
@@ -116,13 +116,13 @@ Taking a CephFS cluster down is done by setting the down flag:
 
 ::
 
-    fs set <fs_name> down true
+    ceph fs set <fs_name> down true
 
 To bring the cluster back online:
 
 ::
 
-    fs set <fs_name> down false
+    ceph fs set <fs_name> down false
 
 This will also restore the previous value of max_mds. MDS daemons are brought
 down in a way such that journals are flushed to the metadata pool and all
@@ -133,11 +133,11 @@ Taking the cluster down rapidly for deletion or disaster recovery
 -----------------------------------------------------------------
 
 To allow rapidly deleting a file system (for testing) or to quickly bring the
-file system and MDS daemons down, use the ``fs fail`` command:
+file system and MDS daemons down, use the ``ceph fs fail`` command:
 
 ::
 
-    fs fail <fs_name>
+    ceph fs fail <fs_name>
 
 This command sets a file system flag to prevent standbys from
 activating on the file system (the ``joinable`` flag).
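A sketch of the shutdown workflow documented above, with an assumed file system name::

    # Cleanly take the file system down, then bring it back online.
    ceph fs set cephfs down true
    ceph fs set cephfs down false

    # Or, for rapid teardown or disaster recovery, fail it outright.
    ceph fs fail cephfs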
@@ -146,7 +146,7 @@ This process can also be done manually by doing the following:
 
 ::
 
-    fs set <fs_name> joinable false
+    ceph fs set <fs_name> joinable false
 
 Then the operator can fail all of the ranks which causes the MDS daemons to
 respawn as standbys. The file system will be left in a degraded state.
@@ -154,7 +154,7 @@ respawn as standbys. The file system will be left in a degraded state.
 ::
 
     # For all ranks, 0-N:
-    mds fail <fs_name>:<n>
+    ceph mds fail <fs_name>:<n>
 
 Once all ranks are inactive, the file system may also be deleted or left in
 this state for other purposes (perhaps disaster recovery).
@@ -163,7 +163,7 @@ To bring the cluster back up, simply set the joinable flag:
 
 ::
 
-    fs set <fs_name> joinable true
+    ceph fs set <fs_name> joinable true
 
 
 Daemons
@@ -182,34 +182,35 @@ Commands to manipulate MDS daemons:
 
 ::
 
-    mds fail <gid/name/role>
+    ceph mds fail <gid/name/role>
 
 Mark an MDS daemon as failed. This is equivalent to what the cluster
 would do if an MDS daemon had failed to send a message to the mon
 for ``mds_beacon_grace`` second. If the daemon was active and a suitable
-standby is available, using ``mds fail`` will force a failover to the standby.
+standby is available, using ``ceph mds fail`` will force a failover to the
+standby.
 
-If the MDS daemon was in reality still running, then using ``mds fail``
+If the MDS daemon was in reality still running, then using ``ceph mds fail``
 will cause the daemon to restart. If it was active and a standby was
 available, then the "failed" daemon will return as a standby.
 
 
 ::
 
-    tell mds.<daemon name> command ...
+    ceph tell mds.<daemon name> command ...
 
 Send a command to the MDS daemon(s). Use ``mds.*`` to send a command to all
 daemons. Use ``ceph tell mds.* help`` to learn available commands.
 
 ::
 
-    mds metadata <gid/name/role>
+    ceph mds metadata <gid/name/role>
 
 Get metadata about the given MDS known to the Monitors.
 
 ::
 
-    mds repaired <role>
+    ceph mds repaired <role>
 
 Mark the file system rank as repaired. Unlike the name suggests, this command
 does not change a MDS; it manipulates the file system rank which has been
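To illustrate the daemon commands above, assuming an MDS daemon named ``a`` holding rank 0 of file system ``cephfs``::

    # Force a failover, either by role or by daemon name.
    ceph mds fail cephfs:0
    ceph mds fail a

    # Discover and run daemon commands, and fetch daemon metadata.
    ceph tell mds.* help
    ceph tell mds.a session ls
    ceph mds metadata a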
@@ -228,14 +229,14 @@ Commands to manipulate required client features of a file system:
 
 ::
 
-    fs required_client_features <fs name> add reply_encoding
-    fs required_client_features <fs name> rm reply_encoding
+    ceph fs required_client_features <fs name> add reply_encoding
+    ceph fs required_client_features <fs name> rm reply_encoding
 
 To list all CephFS features
 
 ::
 
-    fs feature ls
+    ceph fs feature ls
 
 Clients that are missing newly added features will be evicted automatically.
 
@@ -330,7 +331,7 @@ Global settings
 
 ::
 
-    fs flag set <flag name> <flag val> [<confirmation string>]
+    ceph fs flag set <flag name> <flag val> [<confirmation string>]
 
 Sets a global CephFS flag (i.e. not specific to a particular file system).
 Currently, the only flag setting is 'enable_multiple' which allows having
@@ -352,13 +353,13 @@ file system.
 
 ::
 
-    mds rmfailed
+    ceph mds rmfailed
 
 This removes a rank from the failed set.
 
 ::
 
-    fs reset <file system name>
+    ceph fs reset <file system name>
 
 This command resets the file system state to defaults, except for the name and
 pools. Non-zero ranks are saved in the stopped set.
@@ -366,7 +367,7 @@ pools. Non-zero ranks are saved in the stopped set.
 
 ::
 
-    fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
+    ceph fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
 
 This command creates a file system with a specific **fscid** (file system cluster ID).
 You may want to do this when an application expects the file system's ID to be
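An assumed example of the feature commands above, again using a file system named ``cephfs``::

    # List known CephFS features, then require (and later drop) one of them.
    ceph fs feature ls
    ceph fs required_client_features cephfs add reply_encoding
    ceph fs required_client_features cephfs rm reply_encoding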
@@ -37,7 +37,7 @@ Options :
 .. code:: bash
 
     [build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2
-    [build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/cephfs-shell
+    [build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell
 
 Commands
 ========
@@ -24,6 +24,16 @@ that directory.
 To restrict clients to only mount and work within a certain directory, use
 path-based MDS authentication capabilities.
 
+Note that this restriction *only* impacts the filesystem hierarchy -- the metadata
+tree managed by the MDS. Clients will still be able to access the underlying
+file data in RADOS directly. To segregate clients fully, you must also isolate
+untrusted clients in their own RADOS namespace. You can place a client's
+filesystem subtree in a particular namespace using `file layouts`_ and then
+restrict their RADOS access to that namespace using `OSD capabilities`_
+
+.. _file layouts: ./file-layouts
+.. _OSD capabilities: ../rados/operations/user-management/#authorization-capabilities
+
 Syntax
 ------
 
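A rough sketch of the isolation pattern described in the added note; the client name, mount point, pool and namespace below are assumptions for illustration only::

    # Pin a client's subtree to its own RADOS namespace via a file layout.
    setfattr -n ceph.dir.layout.pool_namespace -v ns-alice /mnt/cephfs/alice

    # Restrict that client's OSD access to the same namespace.
    ceph auth caps client.alice \
        mds 'allow rw path=/alice' \
        mon 'allow r' \
        osd 'allow rw pool=cephfs_data namespace=ns-alice'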
|
@ -38,6 +38,13 @@ below). By default
|
|||||||
the start time is last midnight. So when a snapshot schedule with repeat
|
the start time is last midnight. So when a snapshot schedule with repeat
|
||||||
interval `1h` is added at 13:50
|
interval `1h` is added at 13:50
|
||||||
with the default start time, the first snapshot will be taken at 14:00.
|
with the default start time, the first snapshot will be taken at 14:00.
|
||||||
|
The time zone is assumed to be UTC if none is explicitly included in the string.
|
||||||
|
An explicit time zone will be mapped to UTC at execution.
|
||||||
|
The start time must be in ISO8601 format. Examples below:
|
||||||
|
|
||||||
|
UTC: 2022-08-08T05:30:00 i.e. 5:30 AM UTC, without explicit time zone offset
|
||||||
|
IDT: 2022-08-08T09:00:00+03:00 i.e. 6:00 AM UTC
|
||||||
|
EDT: 2022-08-08T05:30:00-04:00 i.e. 9:30 AM UTC
|
||||||
|
|
||||||
Retention specifications are identified by path and the retention spec itself. A
|
Retention specifications are identified by path and the retention spec itself. A
|
||||||
retention spec consists of either a number and a time period separated by a
|
retention spec consists of either a number and a time period separated by a
|
||||||
@ -155,6 +162,11 @@ Examples::
|
|||||||
snapshot creation is accounted for in the "created_count" field, which is a
|
snapshot creation is accounted for in the "created_count" field, which is a
|
||||||
cumulative count of the total number of snapshots created so far.
|
cumulative count of the total number of snapshots created so far.
|
||||||
|
|
||||||
|
.. note: The maximum number of snapshots to retain per directory is limited by the
|
||||||
|
config tunable `mds_max_snaps_per_dir`. This tunable defaults to 100.
|
||||||
|
To ensure a new snapshot can be created, one snapshot less than this will be
|
||||||
|
retained. So by default, a maximum of 99 snapshots will be retained.
|
||||||
|
|
||||||
Active and inactive schedules
|
Active and inactive schedules
|
||||||
-----------------------------
|
-----------------------------
|
||||||
Snapshot schedules can be added for a path that doesn't exist yet in the
|
Snapshot schedules can be added for a path that doesn't exist yet in the
|
||||||
|
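Tying the two snap-schedule additions together, an illustrative session (path, start time and retention are assumed values)::

    # Hourly snapshots from 05:30 UTC (ISO8601; UTC is assumed when no
    # offset is given), keeping 24 hourly and 7 daily snapshots.
    ceph fs snap-schedule add / 1h 2022-08-08T05:30:00
    ceph fs snap-schedule retention add / 24h7d
    ceph fs snap-schedule status /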
@@ -60,6 +60,8 @@ Possible -op commands::
 * meta-list
 * get-osdmap
 * set-osdmap
+* get-superblock
+* set-superblock
 * get-inc-osdmap
 * set-inc-osdmap
 * mark-complete
@@ -414,7 +416,7 @@ Options
 
 .. option:: --op arg
 
-    Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
+    Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
 
 .. option:: --epoch arg
 
@@ -422,7 +424,7 @@ Options
 
 .. option:: --file arg
 
-    path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap
+    path of file to export, export-remove, import, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap or set-inc-osdmap
 
 .. option:: --mon-store-path arg
 
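The new superblock operations can be exercised roughly as follows; the OSD path and file name are placeholders, and the OSD must be stopped first::

    # Dump the OSD superblock to a file, then write it back.
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --op get-superblock --file superblock.bin
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --op set-superblock --file superblock.bin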
@@ -1314,7 +1314,7 @@ Subcommand ``cache-mode`` specifies the caching mode for cache tier <pool>.
 
 Usage::
 
-    ceph osd tier cache-mode <poolname> writeback|readproxy|readonly|none
+    ceph osd tier cache-mode <poolname> writeback|proxy|readproxy|readonly|none
 
 Subcommand ``remove`` removes the tier <tierpool> (the second one) from base pool
 <pool> (the first one).
@@ -264,8 +264,8 @@ Pool specific commands
 :command:`append` *name* *infile*
   Append object name to the cluster with contents from infile.
 
-:command:`rm` *name*
-  Remove object name.
+:command:`rm` [--force-full] *name* ...
+  Remove object(s) with name(s). With ``--force-full`` will remove when cluster is marked full.
 
 :command:`listwatchers` *name*
   List the watchers of object name.
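A short, assumed example of the extended ``rm`` syntax documented above::

    # Remove several objects at once; --force-full overrides the full-cluster guard.
    rados -p mypool rm obj-a obj-b
    rados -p mypool rm --force-full obj-a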
@@ -333,7 +333,7 @@ OSD and run the following command:
 
     ceph-bluestore-tool \
       --path <data path> \
-      --sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
+      --sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
       reshard
 
 
@@ -109,17 +109,6 @@ Async messenger options
 :Default: ``3``
 
 
-``ms_async_max_op_threads``
-
-:Description: Maximum number of worker threads used by each Async Messenger instance.
-              Set to lower values when your machine has limited CPU count, and increase
-              when your CPUs are underutilized (i. e. one or more of CPUs are
-              constantly on 100% load during I/O operations).
-:Type: 64-bit Unsigned Integer
-:Required: No
-:Default: ``5``
-
-
 ``ms_async_send_inline``
 
 :Description: Send messages directly from the thread that generated them instead of
@@ -129,5 +118,3 @@ Async messenger options
 :Type: Boolean
 :Required: No
 :Default: ``false``
-
-
@@ -4,12 +4,41 @@
 
 .. index:: pools; configuration
 
-Ceph uses default values to determine how many placement groups (PGs) will be
-assigned to each pool. We recommend overriding some of the defaults.
-Specifically, we recommend setting a pool's replica size and overriding the
-default number of placement groups. You can set these values when running
-`pool`_ commands. You can also override the defaults by adding new ones in the
-``[global]`` section of your Ceph configuration file.
+The number of placement groups that the CRUSH algorithm assigns to each pool is
+determined by the values of variables in the centralized configuration database
+in the monitor cluster.
+
+Both containerized deployments of Ceph (deployments made using ``cephadm`` or
+Rook) and non-containerized deployments of Ceph rely on the values in the
+central configuration database in the monitor cluster to assign placement
+groups to pools.
+
+Example Commands
+----------------
+
+To see the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
+
+.. prompt:: bash
+
+   ceph config get osd osd_pool_default_pg_num
+
+To set the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
+
+.. prompt:: bash
+
+   ceph config set osd osd_pool_default_pg_num
+
+Manual Tuning
+-------------
+In some cases, it might be advisable to override some of the defaults. For
+example, you might determine that it is wise to set a pool's replica size and
+to override the default number of placement groups in the pool. You can set
+these values when running `pool`_ commands.
+
+See Also
+--------
+
+See :ref:`pg-autoscaler`.
 
 
 .. literalinclude:: pool-pg.conf
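Filling in the values omitted in the examples above purely for illustration (the numbers are assumptions, not recommendations)::

    # Inspect and change the defaults applied to newly created pools.
    ceph config get osd osd_pool_default_pg_num
    ceph config set osd osd_pool_default_pg_num 128
    ceph config set osd osd_pool_default_size 3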
@@ -1404,6 +1404,31 @@ other performance issue with the OSDs.
 The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
 field of ``ceph pg ls -f json-detail``.
 
+Stretch Mode
+------------
+
+INCORRECT_NUM_BUCKETS_STRETCH_MODE
+__________________________________
+
+Stretch mode currently only support 2 dividing buckets with OSDs, this warning suggests
+that the number of dividing buckets is not equal to 2 after stretch mode is enabled.
+You can expect unpredictable failures and MON assertions until the condition is fixed.
+
+We encourage you to fix this by removing additional dividing buckets or bump the
+number of dividing buckets to 2.
+
+UNEVEN_WEIGHTS_STRETCH_MODE
+___________________________
+
+The 2 dividing buckets must have equal weights when stretch mode is enabled.
+This warning suggests that the 2 dividing buckets have uneven weights after
+stretch mode is enabled. This is not immediately fatal, however, you can expect
+Ceph to be confused when trying to process transitions between dividing buckets.
+
+We encourage you to fix this by making the weights even on both dividing buckets.
+This can be done by making sure the combined weight of the OSDs on each dividing
+bucket are the same.
+
 Miscellaneous
 -------------
 
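A hedged sketch of how the two new warnings might be checked in practice::

    # Both warning codes end in _STRETCH_MODE.
    ceph health detail | grep STRETCH_MODE

    # Compare the combined CRUSH weights of the two dividing buckets.
    ceph osd crush tree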
@@ -127,6 +127,14 @@ Options
 :Type: Integer
 :Default: ``65000``
 
+``max_header_size``
+
+:Description: The maximum number of header bytes available for a single request.
+
+:Type: Integer
+:Default: ``16384``
+:Maximum: ``65536``
+
 
 Civetweb
 ========
@@ -55,7 +55,7 @@ download_from() {
 exit
 fi
 url=$url_base/$fname
-wget -c --no-verbose -O $fname $url
+wget --no-verbose -O $fname $url
 if [ $? != 0 -o ! -e $fname ]; then
 echo "Download of $url failed"
 elif [ $(sha256sum $fname | awk '{print $1}') != $sha256 ]; then
@@ -183,8 +183,7 @@ download_boost $boost_version 4eb3b8d442b426dc35346235c8733b5ae35ba431690e38c6a8
 https://boostorg.jfrog.io/artifactory/main/release/$boost_version/source \
 https://downloads.sourceforge.net/project/boost/boost/$boost_version \
 https://download.ceph.com/qa
-download_liburing 0.7 8e2842cfe947f3a443af301bdd6d034455536c38a455c7a700d0c1ad165a7543 \
+download_liburing 0.7 05d0cf8493d573c76b11abfcf34aabc7153affebe17ff95f9ae88b0de062a59d \
-https://github.com/axboe/liburing/archive \
 https://git.kernel.dk/cgit/liburing/snapshot
 pmdk_version=1.10
 download_pmdk $pmdk_version 08dafcf94db5ac13fac9139c92225d9aa5f3724ea74beee4e6ca19a01a2eb20c \
@@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet';
 $.graphPanelSchema({},
 title,
 description,
-'null',
+'null as zero',
 false,
 formatY1,
 'short',
@@ -133,7 +133,7 @@ local u = import 'utils.libsonnet';
 $.graphPanelSchema({},
 title,
 '',
-'null',
+'null as zero',
 false,
 formatY1,
 'short',
@@ -140,7 +140,7 @@ local u = import 'utils.libsonnet';
 {},
 title,
 description,
-'null',
+'null as zero',
 false,
 formatY1,
 formatY2,
@@ -658,7 +658,7 @@ local u = import 'utils.libsonnet';
 $.graphPanelSchema(aliasColors,
 title,
 description,
-'null',
+'null as zero',
 false,
 formatY1,
 formatY2,
|
@ -87,7 +87,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -185,7 +185,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -283,7 +283,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -400,7 +400,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -498,7 +498,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -596,7 +596,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
|
@ -93,7 +93,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -186,7 +186,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -285,7 +285,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
|
@ -87,7 +87,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -180,7 +180,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -266,7 +266,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -352,7 +352,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -445,7 +445,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -531,7 +531,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -636,7 +636,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -754,7 +754,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -893,7 +893,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -1000,7 +1000,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
|
@ -80,7 +80,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -173,7 +173,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
@ -266,7 +266,7 @@
|
|||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
"links": [ ],
|
"links": [ ],
|
||||||
"nullPointMode": "null",
|
"nullPointMode": "null as zero",
|
||||||
"percentage": false,
|
"percentage": false,
|
||||||
"pointradius": 5,
|
"pointradius": 5,
|
||||||
"points": false,
|
"points": false,
|
||||||
|
@@ -518,7 +518,7 @@ groups:
 annotations:
 description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
 summary: "Pool growth rate may soon exceed capacity"
-expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
+expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
 labels:
 oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
 severity: "warning"
@@ -1499,35 +1499,44 @@ tests:
 # trigger percent full prediction on pools 1 and 2 only
 - interval: 12h
 input_series:
-- series: 'ceph_pool_percent_used{pool_id="1"}'
+- series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
-values: '70 75 80 87 92'
-- series: 'ceph_pool_percent_used{pool_id="2"}'
-values: '22 22 23 23 24'
-- series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
 values: '1 1 1 1 1'
-- series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+- series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+values: '78 89 79 98 78'
+- series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+values: '1 1 1 1 1'
+- series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
+values: '22 22 23 23 24'
+- series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+values: '1 1 1 1 1'
+- series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+values: '1 1 1 1 1'
+- series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
+values: '1 1 1 1 1'
+- series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
 values: '1 1 1 1 1'
 promql_expr_test:
 - expr: |
-(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
+(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
-group_right ceph_pool_metadata) >= 95
+group_right() ceph_pool_metadata) >= 95
 eval_time: 36h
 exp_samples:
-- labels: '{name="rbd",pool_id="1",type="replicated"}'
+- labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
-value: 1.424E+02 # 142%
+value: 1.435E+02 # 142%
 alert_rule_test:
 - eval_time: 48h
 alertname: CephPoolGrowthWarning
 exp_alerts:
 - exp_labels:
-name: rbd
+instance: 8090
+name: default.rgw.index
 pool_id: 1
 severity: warning
 type: ceph_default
 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
 exp_annotations:
 summary: Pool growth rate may soon exceed capacity
-description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
 - interval: 1m
 input_series:
 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
|
@ -3,6 +3,7 @@ overrides:
|
|||||||
conf:
|
conf:
|
||||||
mds:
|
mds:
|
||||||
debug mds: 20
|
debug mds: 20
|
||||||
|
debug mds balancer: 20
|
||||||
debug ms: 1
|
debug ms: 1
|
||||||
mds debug frag: true
|
mds debug frag: true
|
||||||
mds debug scatterstat: true
|
mds debug scatterstat: true
|
||||||
|
@ -2,7 +2,10 @@ overrides:
|
|||||||
ceph:
|
ceph:
|
||||||
log-ignorelist:
|
log-ignorelist:
|
||||||
- overall HEALTH_
|
- overall HEALTH_
|
||||||
|
- \(CEPHADM_STRAY_DAEMON\)
|
||||||
- \(FS_DEGRADED\)
|
- \(FS_DEGRADED\)
|
||||||
|
- FS_
|
||||||
|
- \(CEPHADM_
|
||||||
- \(MDS_FAILED\)
|
- \(MDS_FAILED\)
|
||||||
- \(MDS_DEGRADED\)
|
- \(MDS_DEGRADED\)
|
||||||
- \(FS_WITH_FAILED_MDS\)
|
- \(FS_WITH_FAILED_MDS\)
|
||||||
@ -10,3 +13,10 @@ overrides:
|
|||||||
- \(MDS_ALL_DOWN\)
|
- \(MDS_ALL_DOWN\)
|
||||||
- \(MDS_UP_LESS_THAN_MAX\)
|
- \(MDS_UP_LESS_THAN_MAX\)
|
||||||
- \(FS_INLINE_DATA_DEPRECATED\)
|
- \(FS_INLINE_DATA_DEPRECATED\)
|
||||||
|
- \(PG_DEGRADED\)
|
||||||
|
- Degraded data redundancy
|
||||||
|
- \(PG_
|
||||||
|
- acting
|
||||||
|
- MDS_INSUFFICIENT_STANDBY
|
||||||
|
- deprecated feature inline_data
|
||||||
|
- compat changed unexpectedly
|
||||||
|
@ -2,8 +2,10 @@ overrides:
|
|||||||
ceph:
|
ceph:
|
||||||
log-ignorelist:
|
log-ignorelist:
|
||||||
- overall HEALTH_
|
- overall HEALTH_
|
||||||
- \(OSD_DOWN\)
|
- OSD_DOWN
|
||||||
- \(OSD_
|
- OSD_
|
||||||
- but it is still running
|
- but it is still running
|
||||||
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
|
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
|
||||||
- is not responding
|
- is not responding
|
||||||
|
- is down
|
||||||
|
- osds down
|
||||||
|
6
ceph/qa/distros/all/rhel_8.5.yaml
Normal file
6
ceph/qa/distros/all/rhel_8.5.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
os_type: rhel
|
||||||
|
os_version: "8.5"
|
||||||
|
overrides:
|
||||||
|
selinux:
|
||||||
|
whitelist:
|
||||||
|
- scontext=system_u:system_r:logrotate_t:s0
|
6
ceph/qa/distros/all/rhel_8.6.yaml
Normal file
6
ceph/qa/distros/all/rhel_8.6.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
os_type: rhel
|
||||||
|
os_version: "8.6"
|
||||||
|
overrides:
|
||||||
|
selinux:
|
||||||
|
whitelist:
|
||||||
|
- scontext=system_u:system_r:logrotate_t:s0
|
@ -1 +1 @@
|
|||||||
rhel_8.4.yaml
|
rhel_8.6.yaml
|
@ -1 +0,0 @@
|
|||||||
.qa/distros/podman/rhel_8.4_container_tools_3.0.yaml
|
|
@ -1 +0,0 @@
|
|||||||
.qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml
|
|
@ -0,0 +1 @@
|
|||||||
|
.qa/distros/podman/rhel_8.6_container_tools_3.0.yaml
|
@ -0,0 +1 @@
|
|||||||
|
.qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml
|
@ -1,5 +1,5 @@
|
|||||||
os_type: rhel
|
os_type: rhel
|
||||||
os_version: "8.4"
|
os_version: "8.6"
|
||||||
overrides:
|
overrides:
|
||||||
selinux:
|
selinux:
|
||||||
whitelist:
|
whitelist:
|
@ -1,5 +1,5 @@
|
|||||||
os_type: rhel
|
os_type: rhel
|
||||||
os_version: "8.4"
|
os_version: "8.6"
|
||||||
overrides:
|
overrides:
|
||||||
selinux:
|
selinux:
|
||||||
whitelist:
|
whitelist:
|
@@ -1691,6 +1691,29 @@ function test_wait_for_peered() {
 
 #######################################################################
 
+##
+# Wait until the cluster's health condition disappeared.
+# $TIMEOUT default
+#
+# @param string to grep for in health detail
+# @return 0 if the cluster health doesn't matches request,
+# 1 otherwise if after $TIMEOUT seconds health condition remains.
+#
+function wait_for_health_gone() {
+    local grepstr=$1
+    local -a delays=($(get_timeout_delays $TIMEOUT .1))
+    local -i loop=0
+
+    while ceph health detail | grep "$grepstr" ; do
+        if (( $loop >= ${#delays[*]} )) ; then
+            ceph health detail
+            return 1
+        fi
+        sleep ${delays[$loop]}
+        loop+=1
+    done
+}
+
 ##
 # Wait until the cluster has health condition passed as arg
 # again for $TIMEOUT seconds.
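A minimal usage sketch for the helper added above, inside a standalone test; the health string is only an example::

    # Wait for a transient warning to clear before continuing the test.
    wait_for_health_gone "PG_DEGRADED" || return 1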
ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh (new executable file, 148 lines)
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
|
||||||
|
function run() {
|
||||||
|
local dir=$1
|
||||||
|
shift
|
||||||
|
|
||||||
|
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
|
||||||
|
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
|
||||||
|
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
|
||||||
|
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
|
||||||
|
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
|
||||||
|
export CEPH_ARGS
|
||||||
|
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
|
||||||
|
|
||||||
|
export BASE_CEPH_ARGS=$CEPH_ARGS
|
||||||
|
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
|
||||||
|
|
||||||
|
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
|
||||||
|
for func in $funcs ; do
|
||||||
|
setup $dir || return 1
|
||||||
|
$func $dir || return 1
|
||||||
|
teardown $dir || return 1
|
||||||
|
done
|
||||||
|
}
|
||||||
|
TEST_stretched_cluster_failover_add_three_osds(){
|
||||||
|
local dir=$1
|
||||||
|
local OSDS=8
|
||||||
|
setup $dir || return 1
|
||||||
|
|
||||||
|
run_mon $dir a --public-addr $CEPH_MON_A || return 1
|
||||||
|
wait_for_quorum 300 1 || return 1
|
||||||
|
|
||||||
|
run_mon $dir b --public-addr $CEPH_MON_B || return 1
|
||||||
|
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
|
||||||
|
wait_for_quorum 300 2 || return 1
|
||||||
|
|
||||||
|
run_mon $dir c --public-addr $CEPH_MON_C || return 1
|
||||||
|
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
|
||||||
|
    wait_for_quorum 300 3 || return 1

    run_mon $dir d --public-addr $CEPH_MON_D || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
    wait_for_quorum 300 4 || return 1

    run_mon $dir e --public-addr $CEPH_MON_E || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
    wait_for_quorum 300 5 || return 1

    ceph mon set election_strategy connectivity
    ceph mon add disallowed_leader e

    run_mgr $dir x || return 1
    run_mgr $dir y || return 1
    run_mgr $dir z || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    for zone in iris pze
    do
        ceph osd crush add-bucket $zone zone
        ceph osd crush move $zone root=default
    done

    ceph osd crush add-bucket node-2 host
    ceph osd crush add-bucket node-3 host
    ceph osd crush add-bucket node-4 host
    ceph osd crush add-bucket node-5 host

    ceph osd crush move node-2 zone=iris
    ceph osd crush move node-3 zone=iris
    ceph osd crush move node-4 zone=pze
    ceph osd crush move node-5 zone=pze

    ceph osd crush move osd.0 host=node-2
    ceph osd crush move osd.1 host=node-2
    ceph osd crush move osd.2 host=node-3
    ceph osd crush move osd.3 host=node-3
    ceph osd crush move osd.4 host=node-4
    ceph osd crush move osd.5 host=node-4
    ceph osd crush move osd.6 host=node-5
    ceph osd crush move osd.7 host=node-5

    ceph mon set_location a zone=iris host=node-2
    ceph mon set_location b zone=iris host=node-3
    ceph mon set_location c zone=pze host=node-4
    ceph mon set_location d zone=pze host=node-5

    hostname=$(hostname -s)
    ceph osd crush remove $hostname || return 1
    ceph osd getcrushmap > crushmap || return 1
    crushtool --decompile crushmap > crushmap.txt || return 1
    sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
    cat >> crushmap_modified.txt << EOF
rule stretch_rule {
        id 1
        type replicated
        min_size 1
        max_size 10
        step take iris
        step chooseleaf firstn 2 type host
        step emit
        step take pze
        step chooseleaf firstn 2 type host
        step emit
}

# end crush map
EOF

    crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
    ceph osd setcrushmap -i crushmap.bin || return 1
    local stretched_poolname=stretched_rbdpool
    ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
    ceph osd pool set $stretched_poolname size 4 || return 1

    sleep 3

    ceph mon set_location e zone=arbiter host=node-1
    ceph mon enable_stretch_mode e stretch_rule zone

    kill_daemons $dir KILL mon.c || return 1
    kill_daemons $dir KILL mon.d || return 1

    kill_daemons $dir KILL osd.4 || return 1
    kill_daemons $dir KILL osd.5 || return 1
    kill_daemons $dir KILL osd.6 || return 1
    kill_daemons $dir KILL osd.7 || return 1

    ceph -s

    sleep 3

    run_osd $dir 8 || return 1
    run_osd $dir 9 || return 1
    run_osd $dir 10 || return 1

    ceph -s

    sleep 3

    teardown $dir || return 1
}

main mon-stretch-fail-recovery "$@"
ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh (new executable file, 145 lines)
@ -0,0 +1,145 @@
#!/usr/bin/env bash

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

function run() {
    local dir=$1
    shift

    export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
    export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
    export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
    export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
    export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "

    export BASE_CEPH_ARGS=$CEPH_ARGS
    CEPH_ARGS+="--mon-host=$CEPH_MON_A"

    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}

TEST_stretched_cluster_uneven_weight() {
    local dir=$1
    local OSDS=4
    local weight=0.09000
    setup $dir || return 1

    run_mon $dir a --public-addr $CEPH_MON_A || return 1
    wait_for_quorum 300 1 || return 1

    run_mon $dir b --public-addr $CEPH_MON_B || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
    wait_for_quorum 300 2 || return 1

    run_mon $dir c --public-addr $CEPH_MON_C || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
    wait_for_quorum 300 3 || return 1

    run_mon $dir d --public-addr $CEPH_MON_D || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
    wait_for_quorum 300 4 || return 1

    run_mon $dir e --public-addr $CEPH_MON_E || return 1
    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
    wait_for_quorum 300 5 || return 1

    ceph mon set election_strategy connectivity
    ceph mon add disallowed_leader e

    run_mgr $dir x || return 1
    run_mgr $dir y || return 1
    run_mgr $dir z || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    for zone in iris pze
    do
        ceph osd crush add-bucket $zone zone
        ceph osd crush move $zone root=default
    done

    ceph osd crush add-bucket node-2 host
    ceph osd crush add-bucket node-3 host
    ceph osd crush add-bucket node-4 host
    ceph osd crush add-bucket node-5 host

    ceph osd crush move node-2 zone=iris
    ceph osd crush move node-3 zone=iris
    ceph osd crush move node-4 zone=pze
    ceph osd crush move node-5 zone=pze

    ceph osd crush move osd.0 host=node-2
    ceph osd crush move osd.1 host=node-3
    ceph osd crush move osd.2 host=node-4
    ceph osd crush move osd.3 host=node-5

    ceph mon set_location a zone=iris host=node-2
    ceph mon set_location b zone=iris host=node-3
    ceph mon set_location c zone=pze host=node-4
    ceph mon set_location d zone=pze host=node-5

    hostname=$(hostname -s)
    ceph osd crush remove $hostname || return 1
    ceph osd getcrushmap > crushmap || return 1
    crushtool --decompile crushmap > crushmap.txt || return 1
    sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
    cat >> crushmap_modified.txt << EOF
rule stretch_rule {
        id 1
        type replicated
        min_size 1
        max_size 10
        step take iris
        step chooseleaf firstn 2 type host
        step emit
        step take pze
        step chooseleaf firstn 2 type host
        step emit
}
# end crush map
EOF

    crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
    ceph osd setcrushmap -i crushmap.bin || return 1
    local stretched_poolname=stretched_rbdpool
    ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
    ceph osd pool set $stretched_poolname size 4 || return 1

    ceph mon set_location e zone=arbiter host=node-1 || return 1
    ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter stretch mode

    # reweight to a more round decimal.
    ceph osd crush reweight osd.0 $weight
    ceph osd crush reweight osd.1 $weight
    ceph osd crush reweight osd.2 $weight
    ceph osd crush reweight osd.3 $weight

    # Firstly, we test for stretch mode buckets != 2
    ceph osd crush add-bucket sham zone || return 1
    ceph osd crush move sham root=default || return 1
    wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1

    ceph osd crush rm sham # clear the health warn
    wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1

    # Next, we test for uneven weights across buckets
    ceph osd crush reweight osd.0 0.07000
    wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1

    ceph osd crush reweight osd.0 $weight # clear the health warn
    wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1

    teardown $dir || return 1
}

main mon-stretched-cluster-uneven-weight "$@"
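The test above drives two stretch-mode health warnings: INCORRECT_NUM_BUCKETS_STRETCH_MODE (by adding a third `sham` zone) and UNEVEN_WEIGHTS_STRETCH_MODE (by reweighting osd.0 from 0.09 to 0.07). As a rough, hypothetical illustration of the conditions being exercised — not the monitor's actual implementation — the sketch below sums CRUSH weights per zone and reports the corresponding warning names.

```python
# Hypothetical sketch of the conditions the standalone test exercises;
# not Ceph monitor code. osd_weights maps zone -> list of OSD CRUSH weights.

def stretch_mode_warnings(osd_weights):
    zone_totals = {zone: round(sum(ws), 5) for zone, ws in osd_weights.items()}
    warnings = []
    if len(zone_totals) != 2:
        # the test adds a third "sham" zone to trigger this
        warnings.append("INCORRECT_NUM_BUCKETS_STRETCH_MODE")
    elif len(set(zone_totals.values())) != 1:
        # the test reweights osd.0 from 0.09 to 0.07 to trigger this
        warnings.append("UNEVEN_WEIGHTS_STRETCH_MODE")
    return warnings

# After `ceph osd crush reweight osd.0 0.07000` in the test:
print(stretch_mode_warnings({"iris": [0.07, 0.09], "pze": [0.09, 0.09]}))
# -> ['UNEVEN_WEIGHTS_STRETCH_MODE']
```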
@ -19,6 +19,7 @@ overrides:
      - MDS_READ_ONLY
      - force file system read-only
      - with standby daemon mds
+      - MDS abort because newly corrupt dentry
tasks:
- cephfs_test_runner:
    modules:

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
  ceph:
    conf:
      client:
        client_shutdown_timeout: 120

@ -0,0 +1,6 @@
# Lengthen the timeout for thrashed MDS
overrides:
  ceph:
    conf:
      client:
        client_shutdown_timeout: 120

@ -0,0 +1,13 @@
tasks:
- check-counter:
    counters:
      mgr:
        - name: "finisher-volumes.complete_latency.avgcount"
          min: 4
        - name: "finisher-volumes.queue_len"
          expected_val: 0

- cephfs_test_runner:
    fail_on_skip: false
    modules:
      - tasks.cephfs.test_volumes.TestPerModuleFinsherThread
ceph/qa/suites/krbd/singleton-msgr-failures/% (new file, 0 lines)

@ -0,0 +1 @@
.qa/objectstore/bluestore-bitmap.yaml

ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml (new file, 7 lines)
@ -0,0 +1,7 @@
overrides:
  ceph:
    conf:
      global:
        ms die on skipped message: false
      client:
        rbd default features: 37

ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa (new symbolic link)
@ -0,0 +1 @@
../.qa/

@ -0,0 +1,5 @@
overrides:
  ceph:
    conf:
      client:
        rbd default map options: ms_mode=crc,rxbounce

@ -0,0 +1,5 @@
overrides:
  ceph:
    conf:
      client:
        rbd default map options: ms_mode=crc

@ -0,0 +1,5 @@
overrides:
  ceph:
    conf:
      client:
        rbd default map options: ms_mode=legacy,rxbounce

@ -0,0 +1,5 @@
overrides:
  ceph:
    conf:
      client:
        rbd default map options: ms_mode=legacy

@ -0,0 +1,5 @@
overrides:
  ceph:
    conf:
      client:
        rbd default map options: ms_mode=secure

ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa (new symbolic link)
@ -0,0 +1 @@
../.qa/

ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa (new symbolic link)
@ -0,0 +1 @@
../.qa/
@ -2,6 +2,7 @@ overrides:
  ceph:
    conf:
      global:
+        mon warn on pool no app: false
        ms die on skipped message: false
      client:
        rbd default features: 37

ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml (new file, 19 lines)
@ -0,0 +1,19 @@
overrides:
  ceph:
    conf:
      global:
        osd pool default size: 1
      osd:
        osd shutdown pgref assert: true
roles:
- [mon.a, mgr.x, osd.0, client.0]

tasks:
- install:
    extra_system_packages:
      - fio
- ceph:
- workunit:
    clients:
      all:
        - rbd/krbd_watch_errors.sh
|
|||||||
|
overrides:
|
||||||
|
ceph:
|
||||||
|
log-ignorelist:
|
||||||
|
- \(HOST_IN_MAINTENANCE\)
|
||||||
|
- \(OSD_DOWN\)
|
||||||
|
- \(MON_DOWN\)
|
||||||
|
- down
|
||||||
|
- overall HEALTH_
|
||||||
|
- \(CEPHADM_STRAY_DAEMON\)
|
||||||
|
- stray daemon
|
||||||
|
- \(FS_DEGRADED\)
|
||||||
|
- \(MDS_FAILED\)
|
||||||
|
- \(MDS_DEGRADED\)
|
||||||
|
- \(FS_WITH_FAILED_MDS\)
|
||||||
|
- \(MDS_DAMAGE\)
|
||||||
|
- \(MDS_ALL_DOWN\)
|
||||||
|
- \(MDS_UP_LESS_THAN_MAX\)
|
||||||
|
- \(FS_INLINE_DATA_DEPRECATED\)
|
||||||
|
- \(PG_DEGRADED\)
|
||||||
|
- Degraded data redundancy
|
||||||
|
- \(PG_
|
||||||
|
- acting
|
||||||
|
- MDS_INSUFFICIENT_STANDBY
|
||||||
|
- deprecated feature inline_data
|
||||||
|
- compat changed unexpectedly
|
||||||
roles:
|
roles:
|
||||||
# 3 osd roles on host.a is required for cephadm task. It checks if the cluster is healthy.
|
# 3 osd roles on host.a is required for cephadm task. It checks if the cluster is healthy.
|
||||||
# More daemons will be deployed on both hosts in e2e tests.
|
# More daemons will be deployed on both hosts in e2e tests.
|
||||||
|
@ -24,6 +24,21 @@ openstack:
|
|||||||
size: 10 # GB
|
size: 10 # GB
|
||||||
overrides:
|
overrides:
|
||||||
ceph:
|
ceph:
|
||||||
|
log-ignorelist:
|
||||||
|
- slow requests
|
||||||
|
- \(PG_
|
||||||
|
- PG_
|
||||||
|
- \(CEPHADM_STRAY_DAEMON\)
|
||||||
|
- slow request
|
||||||
|
- \(MDS_
|
||||||
|
- MDS_
|
||||||
|
- osds down
|
||||||
|
- OSD_
|
||||||
|
- \(OSD_
|
||||||
|
- client
|
||||||
|
- FS_
|
||||||
|
- \(FS_
|
||||||
|
- degraded
|
||||||
conf:
|
conf:
|
||||||
osd:
|
osd:
|
||||||
osd shutdown pgref assert: true
|
osd shutdown pgref assert: true
|
||||||
|
@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(PG_
+      - but it is still running
tasks:
- cephadm.shell:
    host.a:

@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(PG_
+      - but it is still running
tasks:
- cephadm.shell:
    host.a:

@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(PG_
+      - but it is still running
tasks:
- cephadm.shell:
    host.a:

@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(PG_
+      - but it is still running
tasks:
- cephadm.shell:
    host.a:

@ -1,3 +1,11 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(PG_
+      - but it is still running
+      - \(CEPHADM_STRAY_DAEMON\)
tasks:
- cephadm.shell:
    host.a:

@ -1,3 +1,11 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(PG_AVAILABILITY\)
+      - mon down
+      - mons down
+      - out of quorum
tasks:
- cephadm:
    conf:

@ -3,6 +3,23 @@ overrides:
    log-ignorelist:
      - but it is still running
      - objects unfound and apparently lost
+      - \(MON_DOWN\)
+      - \(OSDMAP_FLAGS\)
+      - flag\(s\) set
+      - \(CACHE_POOL_NO_HIT_SET\)
+      - \(CACHE_
+      - \(PG_
+      - \(OSD_
+      - \(POOL_
+      - \(CEPHADM_STRAY_DAEMON\)
+      - PG_
+      - CACHE_
+      - degraded
+      - backfill
+      - mons down
+      - OSD_
+      - is down
+      - acting
    conf:
      osd:
        osd debug reject backfill probability: .3

@ -1,3 +1,14 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(PG_
+      - mons down
+      - pg inactive
+      - out of quorum
+      - \(OSD_
+      - osds down
+      - osd down
tasks:
- cephadm.shell:
    env: [sha1]

@ -1,3 +1,9 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - Replacing daemon mds
+      - FS_DEGRADED
+      - \(CEPHADM_STRAY_DAEMON\)
roles:
- - host.a
  - osd.0

@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - \(OSD_DOWN\)
+      - \(CEPHADM_PAUSED\)
+      - mons down
roles:
- - host.a
  - osd.0

@ -1,3 +1,10 @@
+overrides:
+  ceph:
+    log-ignorelist:
+      - \(MON_DOWN\)
+      - mons down
+      - \(MGR_DOWN\)
+      - out of quorum
roles:
- - host.a
  - osd.0
@ -11,6 +11,15 @@ overrides:
      - \(POOL_APP_NOT_ENABLED\)
      - \(PG_AVAILABILITY\)
      - \(PG_DEGRADED\)
+      - \(MON_DOWN\)
+      - \(CEPHADM_STRAY_DAEMON\)
+      - missing hit_sets
+      - do not have an application enabled
+      - application not enabled on pool
+      - pool application
+      - mons down
+      - out of quorum
+      - needs hit_set_type to be set but it is not
    conf:
      client:
        debug ms: 1

@ -2,6 +2,7 @@ overrides:
  ceph:
    log-ignorelist:
      - \(PG_AVAILABILITY\)
+      - \(POOL_APP_NOT_ENABLED\)
    conf:
      osd:
        osd_class_load_list: "*"

@ -8,6 +8,13 @@ overrides:
      - \(OSD_
      - \(OBJECT_
      - \(POOL_APP_NOT_ENABLED\)
+      - \(MON_DOWN\)
+      - mons down
+      - application not enabled on pool
+      - do not have an application enabled
+      - pool application
+      - out of quorum
+      - needs hit_set_type to be set but it is not
tasks:
- workunit:
    clients:
@ -0,0 +1,43 @@
tasks:
- install:
- ceph:
    wait-for-scrub: false
- check-counter:
    counters:
      mgr:
        - name: "finisher-balancer.complete_latency.avgcount"
          min: 1
        - name: "finisher-balancer.queue_len"
          expected_val: 0
        - name: "finisher-crash.complete_latency.avgcount"
          min: 2
        - name: "finisher-crash.queue_len"
          expected_val: 0
        - name: "finisher-devicehealth.complete_latency.avgcount"
          min: 1
        - name: "finisher-devicehealth.queue_len"
          expected_val: 0
        - name: "finisher-iostat.complete_latency.avgcount"
          min: 1
        - name: "finisher-iostat.queue_len"
          expected_val: 0
        - name: "finisher-pg_autoscaler.complete_latency.avgcount"
          min: 1
        - name: "finisher-pg_autoscaler.queue_len"
          expected_val: 0
        - name: "finisher-progress.complete_latency.avgcount"
          min: 2
        - name: "finisher-progress.queue_len"
          expected_val: 0
        - name: "finisher-status.complete_latency.avgcount"
          min: 2
        - name: "finisher-status.queue_len"
          expected_val: 0
        - name: "finisher-telemetry.complete_latency.avgcount"
          min: 1
        - name: "finisher-telemetry.queue_len"
          expected_val: 0
- workunit:
    clients:
      client.0:
        - mgr/test_per_module_finisher.sh
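The check-counter block above asserts, after mgr/test_per_module_finisher.sh has run, that each per-module finisher completed at least `min` operations and left an empty queue. The sketch below shows roughly how such `min` / `expected_val` assertions can be evaluated against a flattened perf-counter dump; the helper and the dump format are illustrative assumptions, not the teuthology check-counter task itself.

```python
# Rough sketch of applying min / expected_val specs to a flat counter dump;
# illustrative only, not the check-counter task's actual code.

def check_counters(dump, specs):
    """dump: {"finisher-balancer.complete_latency.avgcount": 3, ...}
    specs: list of dicts shaped like the YAML entries above."""
    failures = []
    for spec in specs:
        val = dump.get(spec["name"], 0)
        if "min" in spec and val < spec["min"]:
            failures.append(f'{spec["name"]}={val} is below min {spec["min"]}')
        if "expected_val" in spec and val != spec["expected_val"]:
            failures.append(f'{spec["name"]}={val} != {spec["expected_val"]}')
    return failures

print(check_counters(
    {"finisher-balancer.complete_latency.avgcount": 3,
     "finisher-balancer.queue_len": 0},
    [{"name": "finisher-balancer.complete_latency.avgcount", "min": 1},
     {"name": "finisher-balancer.queue_len", "expected_val": 0}]))
# -> [] (all assertions satisfied)
```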
@ -13,4 +13,4 @@ tasks:
- workunit:
    clients:
      client.0:
-        - mgr
+        - mgr/test_localpool.sh

ceph/qa/suites/rados/standalone/workloads/mon-stretch.yaml (new file, 18 lines)
@ -0,0 +1,18 @@
roles:
- - mon.a
  - mgr.x
  - osd.0
  - osd.1
  - osd.2
  - client.0
openstack:
  - volumes: # attached to each instance
      count: 3
      size: 10 # GB
tasks:
- install:
- workunit:
    basedir: qa/standalone
    clients:
      all:
        - mon-stretch

@ -4,6 +4,8 @@ overrides:
    osd:
      osd_class_load_list: "*"
      osd_class_default_list: "*"
+    log-ignorelist:
+      - \(POOL_APP_NOT_ENABLED\)
tasks:
- workunit:
    clients:

@ -0,0 +1,13 @@
overrides:
  ceph:
    conf:
      mgr:
        debug rbd: 20
tasks:
- install:
    extra_system_packages:
      - fio
- workunit:
    clients:
      client.0:
        - rbd/rbd_support_module_recovery.sh

ceph/qa/suites/rgw/verify/tasks/bucket-check.yaml (new file, 5 lines)
@ -0,0 +1,5 @@
tasks:
- workunit:
    clients:
      client.0:
        - rgw/run-bucket-check.sh
@ -6,7 +6,7 @@ workload:
  - sequential:
    - ragweed:
        client.1:
-          default-branch: ceph-pacific
+          default-branch: ceph-nautilus
          rgw_server: client.1
          stages: prepare
    - print: "**** done rgw ragweed prepare 2-workload"

@ -5,7 +5,7 @@ rgw-final-workload:
  full_sequential:
  - ragweed:
      client.1:
-        default-branch: ceph-pacific
+        default-branch: ceph-nautilus
        rgw_server: client.1
        stages: check
  - print: "**** done ragweed check 4-final-workload"

@ -5,7 +5,7 @@ rgw-final-workload:
  full_sequential:
  - ragweed:
      client.1:
-        default-branch: ceph-pacific
+        default-branch: ceph-octopus
        rgw_server: client.1
        stages: check
  - print: "**** done ragweed check 4-final-workload"

@ -123,7 +123,7 @@ workload_pacific:
      - rados/test.sh
      - cls
    env:
-      CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
+      CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
  - print: "**** done rados/test.sh & cls workload_pacific"
  - sequential:
    - rgw: [client.0]

@ -7,4 +7,6 @@ stress-tasks:
    clients:
      client.0:
      - cls/test_cls_rbd.sh
+    env:
+      CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done cls/test_cls_rbd.sh 4-workload"

@ -3,7 +3,7 @@ meta:
  librbd python api tests
tasks:
- workunit:
-    tag: v16.2.7
+    branch: pacific
    clients:
      client.0:
      - rbd/test_librbd_python.sh
@ -232,6 +232,7 @@ class OSDThrasher(Thrasher):
        self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
        self.random_eio = self.config.get('random_eio')
        self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
+        self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3)

        num_osds = self.in_osds + self.out_osds
        self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)

@ -798,6 +799,19 @@ class OSDThrasher(Thrasher):
        else:
            self.cancel_force_recovery()

+    def reset_purged_snaps_last(self):
+        """
+        Run reset_purged_snaps_last
+        """
+        self.log('reset_purged_snaps_last')
+        for osd in self.in_osds:
+            try:
+                self.ceph_manager.raw_cluster_cmd(
+                    'tell', "osd.%s" % (str(osd)),
+                    'reset_purged_snaps_last')
+            except CommandFailedError:
+                self.log('Failed to reset_purged_snaps_last, ignoring')
+
    def all_up(self):
        """
        Make sure all osds are up and not out.

@ -1248,6 +1262,8 @@ class OSDThrasher(Thrasher):
            actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
        if self.chance_force_recovery > 0:
            actions.append((self.force_cancel_recovery, self.chance_force_recovery))
+        if self.chance_reset_purged_snaps_last > 0:
+            actions.append((self.reset_purged_snaps_last, self.chance_reset_purged_snaps_last))

        for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
            for scenario in [
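The thrasher registers the new `reset_purged_snaps_last` method as another `(action, weight)` pair next to `force_cancel_recovery`, gated by `chance_reset_purged_snaps_last` (default 0.3). The sketch below shows one conventional way such weighted pairs can be sampled; it is illustrative only and does not reproduce the actual OSDThrasher selection code.

```python
# Sketch of weighted selection over (action, chance) pairs like the ones the
# diff appends above; not the real OSDThrasher implementation.
import random

def choose_action(actions):
    """actions: list of (callable, weight) pairs, e.g.
    [(reset_purged_snaps_last, 0.3), (force_cancel_recovery, 0.3), ...]"""
    total = sum(weight for _, weight in actions)
    roll = random.uniform(0, total)
    for action, weight in actions:
        roll -= weight
        if roll <= 0:
            return action
    return actions[-1][0]

# Usage example with stand-in callables:
picked = choose_action([(lambda: "reset_purged_snaps_last", 0.3),
                        (lambda: "force_cancel_recovery", 0.3)])
print(picked())
```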
@ -2,6 +2,8 @@
# make logging friendly to teuthology
log_to_file = true
log_to_stderr = false
+log to journald = false
+mon cluster log to file = true
mon cluster log file level = debug

mon clock drift allowed = 1.000
@ -811,7 +811,7 @@ class CephFSMount(object):
        ))
        p.wait()

-    def open_background(self, basename="background_file", write=True):
+    def open_background(self, basename="background_file", write=True, content="content"):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

@ -828,12 +828,11 @@ class CephFSMount(object):
                import time

                with open("{path}", 'w') as f:
-                    f.write('content')
+                    f.write("{content}")
                    f.flush()
-                    f.write('content2')
                    while True:
                        time.sleep(1)
-                """).format(path=path)
+                """).format(path=path, content=content)
        else:
            pyscript = dedent("""
                import time

@ -849,6 +848,9 @@ class CephFSMount(object):
        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
+        if write:
+            self.wait_for_visible(basename, size=len(content))
+        else:
            self.wait_for_visible(basename)

        return rproc

@ -887,17 +889,25 @@ class CephFSMount(object):
        if nr_links == 2:
            return

-    def wait_for_visible(self, basename="background_file", timeout=30):
+    def wait_for_visible(self, basename="background_file", size=None, timeout=30):
        i = 0
+        args = ['stat']
+        if size is not None:
+            args += ['--printf=%s']
+        args += [os.path.join(self.hostfs_mntpt, basename)]
        while i < timeout:
-            r = self.client_remote.run(args=[
-                'stat', os.path.join(self.hostfs_mntpt, basename)
-            ], check_status=False)
-            if r.exitstatus == 0:
-                log.debug("File {0} became visible from {1} after {2}s".format(
-                    basename, self.client_id, i))
-                return
-            else:
+            p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False)
+            if p.exitstatus == 0:
+                if size is not None:
+                    s = p.stdout.getvalue().strip()
+                    if int(s) == size:
+                        log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s")
+                        return
+                    else:
+                        log.error(f"File {basename} became visible but with size {int(s)} not {size}")
+                else:
+                    log.info(f"File {basename} became visible from {self.client_id} after {i}s")
+                    return
            time.sleep(1)
            i += 1
@ -1,6 +1,8 @@
"""
-Before running this testsuite, add path to cephfs-shell module to $PATH and
-export $PATH.
+NOTE: For running this tests locally (using vstart_runner.py), export the
+path to src/tools/cephfs/shell/cephfs-shell module to $PATH. Running
+"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir
+will update the environment without hassles of typing the path correctly.
"""
from io import StringIO
from os import path

@ -9,7 +9,9 @@ from textwrap import dedent
from tasks.ceph_test_case import TestTimeoutError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
import os
+from io import StringIO


log = logging.getLogger(__name__)
@ -157,29 +159,49 @@ class TestClientLimits(CephFSTestCase):
        a fraction of second (0.5) by default when throttling condition is met.
        """

-        max_caps_per_client = 500
-        cap_acquisition_throttle = 250
+        subdir_count = 4
+        files_per_dir = 25

-        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
-        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
+        # throttle in a way so that two dir reads are already hitting it.
+        throttle_value = (files_per_dir * 3) // 2

-        # Create 1500 files split across 6 directories, 250 each.
-        for i in range(1, 7):
-            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
+        # activate throttling logic by setting max per client to a low value
+        self.config_set('mds', 'mds_max_caps_per_client', 1)
+        self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value)
+
+        # Create files split across {subdir_count} directories, {per_dir_count} in each dir
+        for i in range(1, subdir_count+1):
+            self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True)

        mount_a_client_id = self.mount_a.get_global_id()

-        # recursive readdir
-        self.mount_a.run_shell_payload("find | wc")
-
-        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
-        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
-        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
+        # recursive readdir. macOs wants an explicit directory for `find`.
+        proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO())
+        # return code may be None if the command got interrupted
+        self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue())

        # validate the throttle condition to be hit atleast once
        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)

+        # validate cap_acquisition decay counter after readdir to NOT exceed the throttle value
+        # plus one batch that could have been taken immediately before querying
+        # assuming the batch is equal to the per dir file count.
+        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
+        self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value)
+
+        # make sure that the throttle was reported in the events
+        def historic_ops_have_event(expected_event):
+            ops_dump = self.fs.rank_tell(['dump_historic_ops'])
+            # reverse the events and the ops assuming that later ops would be throttled
+            for op in reversed(ops_dump['ops']):
+                for ev in reversed(op.get('type_data', {}).get('events', [])):
+                    if ev['event'] == expected_event:
+                        return True
+            return False
+
+        self.assertTrue(historic_ops_have_event('cap_acquisition_throttle'))
+
    def test_client_release_bug(self):
        """
        When a client has a bug (which we will simulate) preventing it from releasing caps,
@ -219,6 +241,55 @@ class TestClientLimits(CephFSTestCase):
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        rproc.wait()

+    def test_client_blocklisted_oldest_tid(self):
+        """
+        that a client is blocklisted when its encoded session metadata exceeds the
+        configured threshold (due to ever growing `completed_requests` caused due
+        to an unidentified bug (in the client or the MDS)).
+        """
+
+        # num of requests client issues
+        max_requests = 10000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to inject client release failure")
+
+        self.config_set('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount_wait()
+
+        self.config_set('mds', 'mds_max_completed_requests', max_requests);
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume mds can handle 10 request per second at least
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
+
+        # set the threshold low so that it has a high probability of
+        # hitting.
+        self.config_set('mds', 'mds_session_metadata_threshold', 5000);
+
+        # Create lot many files synchronously. This would hit the session metadata threshold
+        # causing the client to get blocklisted.
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.create_n_files("testdir/file2", 100000, True)
+
+        self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr())
+        # the mds should bump up the relevant perf counter
+        pd = self.perf_dump()
+        self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0)
+
+        # reset the config
+        self.config_set('client', 'client inject fixed oldest tid', 'false')
+
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount_wait()
+
    def test_client_oldest_tid(self):
        """
        When a client does not advance its oldest tid, the MDS should notice that
@ -10,8 +10,10 @@ from textwrap import dedent
import time
import distutils.version as version
import re
+import string
import os

+from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount

@ -221,8 +223,10 @@ class TestClientRecovery(CephFSTestCase):
        # Capability release from stale session
        # =====================================
        if write:
-            cap_holder = self.mount_a.open_background()
+            content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
+            cap_holder = self.mount_a.open_background(content=content)
        else:
+            content = ''
            self.mount_a.run_shell(["touch", "background_file"])
            self.mount_a.umount_wait()
            self.mount_a.mount_wait()

@ -233,7 +237,7 @@ class TestClientRecovery(CephFSTestCase):

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
-        self.mount_b.wait_for_visible()
+        self.mount_b.wait_for_visible(size=len(content))

        # Simulate client death
        self.mount_a.suspend_netns()

@ -264,11 +268,9 @@ class TestClientRecovery(CephFSTestCase):
                    "Capability handover took {0}, expected approx {1}".format(
                        cap_waited, session_timeout
                    ))
-
-            self.mount_a._kill_background(cap_holder)
        finally:
-            # teardown() doesn't quite handle this case cleanly, so help it out
-            self.mount_a.resume_netns()
+            self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable
+            self.mount_a._kill_background(cap_holder)

    def test_stale_read_caps(self):
        self._test_stale_caps(False)

@ -319,9 +321,9 @@ class TestClientRecovery(CephFSTestCase):
                    cap_waited, session_timeout / 2.0
                ))
-
-            self.mount_a._kill_background(cap_holder)
        finally:
-            self.mount_a.resume_netns()
+            self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable
+            self.mount_a._kill_background(cap_holder)

    def test_trim_caps(self):
        # Trim capability when reconnecting MDS

@ -387,7 +389,6 @@ class TestClientRecovery(CephFSTestCase):

        self.mount_b.check_filelock(do_flock=flockable)

-        # Tear down the background process
        self.mount_a._kill_background(lock_holder)

    def test_filelock_eviction(self):

@ -416,7 +417,6 @@ class TestClientRecovery(CephFSTestCase):
            # succeed
            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
        finally:
-            # Tear down the background process
            self.mount_a._kill_background(lock_holder)

            # teardown() doesn't quite handle this case cleanly, so help it out
@ -751,24 +751,27 @@ class TestClientOnLaggyOSD(CephFSTestCase):
            # it takes time to have laggy clients entries in cluster log,
            # wait for 6 minutes to see if it is visible, finally restart
            # the client
-            tries = 6
-            while True:
-                try:
-                    with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
-                                                 timeout=55):
-                        # make sure clients weren't evicted
-                        self.assert_session_count(2)
-                        break
-                except AssertionError:
-                    tries -= 1
-                    if tries:
-                        continue
-                    raise
+            with contextutil.safe_while(sleep=5, tries=6) as proceed:
+                while proceed():
+                    try:
+                        with self.assert_cluster_log("1 client(s) laggy due to"
+                                                     " laggy OSDs",
+                                                     timeout=55):
+                            # make sure clients weren't evicted
+                            self.assert_session_count(2)
+                            break
+                    except (AssertionError, CommandFailedError) as e:
+                        log.debug(f'{e}, retrying')
+
+            # clear lagginess, expect to get the warning cleared and make sure
+            # client gets evicted
+            self.clear_laggy_params(osd)
+            self.wait_for_health_clear(60)
+            self.assert_session_count(1)
        finally:
            self.mount_a.kill_cleanup()
            self.mount_a.mount_wait()
            self.mount_a.create_destroy()
-            self.clear_laggy_params(osd)

    def test_client_eviction_if_config_is_unset(self):
        """

@ -800,6 +803,11 @@ class TestClientOnLaggyOSD(CephFSTestCase):

            time.sleep(session_timeout)
            self.assert_session_count(1)
+
+            # make sure warning wasn't seen in cluster log
+            with self.assert_cluster_log("laggy due to laggy OSDs",
+                                         timeout=120, present=False):
+                pass
        finally:
            self.mount_a.kill_cleanup()
            self.mount_a.mount_wait()
@ -608,6 +608,7 @@ class TestDamage(CephFSTestCase):
        self.fs.flush()
        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
        time.sleep(5) # for conf to percolate
+        with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
            p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")

@ -642,6 +643,7 @@ class TestDamage(CephFSTestCase):
        rank0 = self.fs.get_rank()
        self.fs.rank_freeze(True, rank=0)
        # so now we want to trigger commit but this will crash, so:
+        with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
            c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
            p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
@ -14,9 +14,12 @@ class TestClusterAffinity(CephFSTestCase):
    CLIENTS_REQUIRED = 0
    MDSS_REQUIRED = 4

-    def _verify_join_fs(self, target, status=None):
+    def _verify_join_fs(self, target, status=None, fs=None):
+        fs_select = fs
+        if fs_select is None:
+            fs_select = self.fs
        if status is None:
-            status = self.fs.wait_for_daemons(timeout=30)
+            status = fs_select.wait_for_daemons(timeout=30)
        log.debug("%s", status)
        target = sorted(target, key=operator.itemgetter('name'))
        log.info("target = %s", target)

@ -37,11 +40,14 @@ class TestClusterAffinity(CephFSTestCase):
                return
        self.fail("no entity")

-    def _verify_init(self):
-        status = self.fs.status()
+    def _verify_init(self, fs=None):
+        fs_select = fs
+        if fs_select is None:
+            fs_select = self.fs
+        status = fs_select.status()
        log.info("status = {0}".format(status))
        target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()]
-        self._verify_join_fs(target, status=status)
+        self._verify_join_fs(target, status=status, fs=fs_select)
        return (status, target)

    def _reach_target(self, target):

@ -109,12 +115,21 @@ class TestClusterAffinity(CephFSTestCase):
        fs2 = self.mds_cluster.newfs(name="cephfs2")
        status, target = self._verify_init()
        active = self.fs.get_active_names(status=status)[0]
+        status2, _ = self._verify_init(fs=fs2)
+        active2 = fs2.get_active_names(status=status2)[0]
        standbys = [info['name'] for info in status.get_standbys()]
        victim = standbys.pop()
        # Set a bogus fs on the others
        for mds in standbys:
            self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
            self._change_target_state(target, mds, {'join_fscid': fs2.id})
+        # The active MDS for cephfs2 will be replaced by the MDS for which
+        # file system affinity has been set. Also, set the affinity for
+        # the earlier active MDS so that it is not chosen by the monitors
+        # as an active MDS for the existing file system.
+        log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})')
+        self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2')
+        self._change_target_state(target, active2, {'join_fscid': fs2.id})
        self.fs.rank_fail()
        self._change_target_state(target, victim, {'state': 'up:active'})
        self._reach_target(target)
Some files were not shown because too many files have changed in this diff.