import preliminary last stable release 14.2.22

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Thomas Lamprecht 2021-07-05 19:42:40 +02:00
parent 886a8c9442
commit d500a7f9ff
934 changed files with 181812 additions and 24420 deletions

ceph/.gitmodules vendored
View File

@ -58,3 +58,6 @@
[submodule "src/c-ares"]
path = src/c-ares
url = https://github.com/ceph/c-ares.git
[submodule "src/spawn"]
path = src/spawn
url = https://github.com/ceph/spawn.git

View File

@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.5.1)
project(ceph CXX C ASM)
set(VERSION 14.2.20)
set(VERSION 14.2.22)
if(POLICY CMP0028)
cmake_policy(SET CMP0028 NEW)
@ -123,6 +123,7 @@ cmake_pop_check_state()
CHECK_FUNCTION_EXISTS(eventfd HAVE_EVENTFD)
CHECK_FUNCTION_EXISTS(getprogname HAVE_GETPROGNAME)
CHECK_FUNCTION_EXISTS(gettid HAVE_GETTID)
CHECK_INCLUDE_FILES("linux/types.h" HAVE_LINUX_TYPES_H)
CHECK_INCLUDE_FILES("linux/version.h" HAVE_LINUX_VERSION_H)

View File

@ -1,6 +1,13 @@
14.2.17
14.2.19
-------
* $pid expansion in config paths like `admin_socket` will now properly expand
to the daemon pid for commands like `ceph-mds` or `ceph-osd`. Previously only
`ceph-fuse`/`rbd-nbd` expanded `$pid` with the actual daemon pid.
* OSD: the option ``osd_fast_shutdown_notify_mon`` has been introduced to allow
the OSD to notify the monitor it is shutting down even if ``osd_fast_shutdown``
is enabled. This helps with the monitor logs on larger clusters, which may otherwise
get many 'osd.X reported immediately failed by osd.Y' messages that confuse tools.
14.2.18
-------
* This release fixes issues loading the dashboard and volumes manager
modules in some environments.
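As a quick illustration of the two 14.2.19 items above, a minimal shell sketch (the socket path and the use of the config database here are assumptions for illustration, not recommended values):

# hypothetical ceph.conf snippet: $pid now expands to the daemon pid for
# ceph-osd/ceph-mds too, not only for ceph-fuse/rbd-nbd
cat >> /etc/ceph/ceph.conf <<'EOF'
[osd]
admin_socket = /var/run/ceph/$cluster-$name.$pid.asok
EOF
# keep osd_fast_shutdown, but still let OSDs notify the monitor on shutdown to
# avoid noisy 'osd.X reported immediately failed by osd.Y' log entries
ceph config set osd osd_fast_shutdown_notify_mon true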

View File

@ -1,7 +1,7 @@
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
pkgver=14.2.20
pkgver=14.2.22
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
@ -64,7 +64,7 @@ makedepends="
xmlstarlet
yasm
"
source="ceph-14.2.20.tar.bz2"
source="ceph-14.2.22.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
@ -117,7 +117,7 @@ _sysconfdir=/etc
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
builddir=$srcdir/ceph-14.2.20
builddir=$srcdir/ceph-14.2.22
build() {
export CEPH_BUILD_VIRTUALENV=$builddir

View File

@ -23,7 +23,7 @@
#################################################################################
%bcond_with make_check
%bcond_without ceph_test_package
%ifarch s390 s390x
%ifarch s390
%bcond_with tcmalloc
%else
%bcond_without tcmalloc
@ -109,7 +109,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 14.2.20
Version: 14.2.22
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -125,7 +125,7 @@ License: LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-14.2.20.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-14.2.22.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@ -293,6 +293,7 @@ BuildRequires: pyOpenSSL%{_python_buildid}
%else
BuildRequires: python%{_python_buildid}-pyOpenSSL
%endif
BuildRequires: golang-github-prometheus
BuildRequires: libtool-ltdl-devel
BuildRequires: python%{_python_buildid}-cherrypy
BuildRequires: python%{_python_buildid}-jwt
@ -306,6 +307,7 @@ BuildRequires: xmlsec1-openssl
BuildRequires: xmlsec1-openssl-devel
%endif
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -659,6 +661,9 @@ Requires: librgw2 = %{_epoch_prefix}%{version}-%{release}
%if 0%{?rhel} || 0%{?fedora}
Requires: mailcap
%endif
%if 0%{?weak_deps}
Recommends: gawk
%endif
%description radosgw
RADOS is a distributed object store used by the Ceph distributed
storage system. This package provides a REST gateway to the
@ -1142,7 +1147,7 @@ This package provides Cephs default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-14.2.20
%autosetup -p1 -n ceph-14.2.22
%build
# LTO can be enabled as soon as the following GCC bug is fixed:
@ -1870,6 +1875,8 @@ fi
%{_bindir}/radosgw-token
%{_bindir}/radosgw-es
%{_bindir}/radosgw-object-expirer
%{_bindir}/rgw-gap-list
%{_bindir}/rgw-gap-list-comparator
%{_bindir}/rgw-orphan-list
%{_mandir}/man8/radosgw.8*
%dir %{_localstatedir}/lib/ceph/radosgw

View File

@ -23,7 +23,7 @@
#################################################################################
%bcond_with make_check
%bcond_without ceph_test_package
%ifarch s390 s390x
%ifarch s390
%bcond_with tcmalloc
%else
%bcond_without tcmalloc
@ -293,6 +293,7 @@ BuildRequires: pyOpenSSL%{_python_buildid}
%else
BuildRequires: python%{_python_buildid}-pyOpenSSL
%endif
BuildRequires: golang-github-prometheus
BuildRequires: libtool-ltdl-devel
BuildRequires: python%{_python_buildid}-cherrypy
BuildRequires: python%{_python_buildid}-jwt
@ -306,6 +307,7 @@ BuildRequires: xmlsec1-openssl
BuildRequires: xmlsec1-openssl-devel
%endif
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -659,6 +661,9 @@ Requires: librgw2 = %{_epoch_prefix}%{version}-%{release}
%if 0%{?rhel} || 0%{?fedora}
Requires: mailcap
%endif
%if 0%{?weak_deps}
Recommends: gawk
%endif
%description radosgw
RADOS is a distributed object store used by the Ceph distributed
storage system. This package provides a REST gateway to the
@ -1870,6 +1875,8 @@ fi
%{_bindir}/radosgw-token
%{_bindir}/radosgw-es
%{_bindir}/radosgw-object-expirer
%{_bindir}/rgw-gap-list
%{_bindir}/rgw-gap-list-comparator
%{_bindir}/rgw-orphan-list
%{_mandir}/man8/radosgw.8*
%dir %{_localstatedir}/lib/ceph/radosgw

View File

@ -1,7 +1,19 @@
ceph (14.2.20-1xenial) xenial; urgency=medium
ceph (14.2.22-1xenial) xenial; urgency=medium
-- Jenkins Build Slave User <jenkins-build@confusa08.front.sepia.ceph.com> Mon, 19 Apr 2021 10:22:46 -0400
-- Jenkins Build Slave User <jenkins-build@braggi10.front.sepia.ceph.com> Tue, 29 Jun 2021 22:18:42 +0000
ceph (14.2.22-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.com> Tue, 29 Jun 2021 22:09:07 +0000
ceph (14.2.21-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.com> Thu, 13 May 2021 17:23:05 +0000
ceph (14.2.20-1) stable; urgency=medium

View File

@ -230,6 +230,7 @@ macro(build_boost version)
INTERFACE_LINK_LIBRARIES "${dependencies}")
unset(dependencies)
endif()
set(Boost_${c}_FOUND "TRUE")
endforeach()
# for header-only libraries

View File

@ -0,0 +1,22 @@
# libzstd - build it statically
function(build_Zstd)
set(ZSTD_C_FLAGS "-fPIC -Wno-unused-variable -O3")
include(ExternalProject)
ExternalProject_Add(zstd_ext
SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/zstd/build/cmake
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${ZSTD_C_FLAGS}
-DCMAKE_AR=${CMAKE_AR}
-DCMAKE_POSITION_INDEPENDENT_CODE=${ENABLE_SHARED}
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libzstd
BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --target libzstd_static
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libzstd/lib/libzstd.a"
INSTALL_COMMAND "")
add_library(Zstd::Zstd STATIC IMPORTED)
set_target_properties(Zstd::Zstd PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/src/zstd/lib"
IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/libzstd/lib/libzstd.a")
add_dependencies(Zstd::Zstd zstd_ext)
endfunction()

View File

@ -0,0 +1,51 @@
# Try to find libzstd
#
# Once done, this will define
#
# Zstd_FOUND
# Zstd_INCLUDE_DIRS
# Zstd_LIBRARIES
# Zstd_VERSION_STRING
# Zstd_VERSION_MAJOR
# Zstd_VERSION_MINOR
# Zstd_VERSION_RELEASE
find_path(Zstd_INCLUDE_DIR
NAMES zstd.h
HINTS ${Zstd_ROOT_DIR}/include)
if(Zstd_INCLUDE_DIR AND EXISTS "${Zstd_INCLUDE_DIR}/zstd.h")
foreach(ver "MAJOR" "MINOR" "RELEASE")
file(STRINGS "${Zstd_INCLUDE_DIR}/zstd.h" Zstd_VER_${ver}_LINE
REGEX "^#define[ \t]+ZSTD_VERSION_${ver}[ \t]+[0-9]+$")
string(REGEX REPLACE "^#define[ \t]+ZSTD_VERSION_${ver}[ \t]+([0-9]+)$"
"\\1" Zstd_VERSION_${ver} "${Zstd_VER_${ver}_LINE}")
unset(${Zstd_VER_${ver}_LINE})
endforeach()
set(Zstd_VERSION_STRING
"${Zstd_VERSION_MAJOR}.${Zstd_VERSION_MINOR}.${Zstd_VERSION_RELEASE}")
endif()
find_library(Zstd_LIBRARY
NAMES "${CMAKE_STATIC_LIBRARY_PREFIX}zstd.${CMAKE_STATIC_LIBRARY_SUFFIX}" zstd
HINTS ${Zstd_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Zstd
REQUIRED_VARS Zstd_LIBRARY Zstd_INCLUDE_DIR
VERSION_VAR Zstd_VERSION_STRING)
mark_as_advanced(
Zstd_LIBRARY
Zstd_INCLUDE_DIR)
if(Zstd_FOUND AND NOT (TARGET Zstd::Zstd))
set(Zstd_INCLUDE_DIRS ${Zstd_INCLUDE_DIR})
set(Zstd_LIBRARIES ${Zstd_LIBRARY})
add_library (Zstd::Zstd UNKNOWN IMPORTED)
set_target_properties(Zstd::Zstd PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${Zstd_INCLUDE_DIR}
IMPORTED_LINK_INTERFACE_LANGUAGES "C"
IMPORTED_LOCATION ${Zstd_LIBRARY}
VERSION "${Zstd_VERSION_STRING}")
endif()

View File

@ -78,7 +78,9 @@ case "$1" in
fi
if ! dpkg-statoverride --list /var/log/ceph >/dev/null
then
chown -R $SERVER_USER:$SERVER_GROUP /var/log/ceph
# take care not to touch cephadm log subdirs
chown $SERVER_USER:$SERVER_GROUP /var/log/ceph
chown $SERVER_USER:$SERVER_GROUP /var/log/ceph/*.log* || true
# members of group ceph can log here, but cannot remove
# others' files. non-members cannot read any logs.
chmod u=rwx,g=rwxs,o=t /var/log/ceph

View File

@ -51,30 +51,32 @@ Build-Depends: cmake (>= 3.5),
libxml2-dev,
librabbitmq-dev,
librdkafka-dev,
# Make-Check libxmlsec1
# Make-Check libxmlsec1-nss
# Make-Check libxmlsec1-openssl
# Make-Check libxmlsec1-dev
# Make-Check libzstd-dev,
# Make-Check libxmlsec1,
# Make-Check libxmlsec1-nss,
# Make-Check libxmlsec1-openssl,
# Make-Check libxmlsec1-dev,
lsb-release,
parted,
patch,
pkg-config,
python (>= 2.7),
python-all-dev,
python-cherrypy3,
# Make-Check python-jwt,
# Make-Check python-nose,
# Make-Check python-pecan,
# Make-Check python-bcrypt,
# Make-Check python-six,
# Make-Check python-tox,
# Make-Check python-coverage,
# Make-Check python-openssl,
# Make-Check python-prettytable,
# Make-Check python-requests,
python-cherrypy3 | python3-cherrypy3,
# Make-Check prometheus,
# Make-Check python3-jwt,
# Make-Check python3-nose,
# Make-Check python3-pecan,
# Make-Check python3-bcrypt,
# Make-Check python3-six,
# Make-Check tox,
# Make-Check python3-coverage,
# Make-Check python3-openssl,
# Make-Check python3-prettytable,
# Make-Check python3-requests,
python-setuptools,
python-sphinx,
# Make-Check python-werkzeug,
python3-sphinx,
# Make-Check python3-werkzeug,
python3-all-dev,
python3-setuptools,
# Make-Check socat,
@ -827,6 +829,7 @@ Depends: ceph-common (= ${binary:Version}),
mime-support,
${misc:Depends},
${shlibs:Depends},
Suggests: gawk,
Recommends: ntp | time-daemon,
Description: REST gateway for RADOS distributed object store
RADOS is a distributed object store used by the Ceph distributed

View File

@ -4,6 +4,8 @@ usr/bin/radosgw
usr/bin/radosgw-es
usr/bin/radosgw-object-expirer
usr/bin/radosgw-token
usr/bin/rgw-gap-list
usr/bin/rgw-gap-list-comparator
usr/bin/rgw-orphan-list
usr/share/man/man8/ceph-diff-sorted.8
usr/share/man/man8/radosgw.8

View File

@ -6,7 +6,7 @@ if test -e build; then
exit 1
fi
PYBUILD="2"
PYBUILD="3"
if [ -r /etc/os-release ]; then
source /etc/os-release
case "$ID" in

View File

@ -15,7 +15,7 @@ Synopsis
| **ceph-volume** **inventory**
| **ceph-volume** **lvm** [ *trigger* | *create* | *activate* | *prepare*
| *zap* | *list* | *batch*]
| *zap* | *list* | *batch* | *new-wal* | *new-db* | *migrate* ]
| **ceph-volume** **simple** [ *trigger* | *scan* | *activate* ]
@ -243,6 +243,71 @@ Positional arguments:
``/path/to/sda1`` or ``/path/to/sda`` for regular devices.
**new-wal**
Attaches the given logical volume to OSD as a WAL. Logical volume
name format is vg/lv. Fails if the OSD already has a WAL attached.
Usage::
ceph-volume lvm new-wal --osd-id OSD_ID --osd-fsid OSD_FSID --target TARGET_LV
Optional arguments:
* [-h, --help] show the help message and exit
Required arguments:
* --osd-id OSD_ID OSD id to attach new WAL to
* --osd-fsid OSD_FSID OSD fsid to attach new WAL to
* --target TARGET_LV logical volume name to attach as WAL
**new-db**
Attaches the given logical volume to OSD as a DB. Logical volume
name format is vg/lv. Fails if the OSD already has a DB attached.
Usage::
ceph-volume lvm new-db --osd-id OSD_ID --osd-fsid OSD_FSID --target <target lv>
Optional arguments:
* [-h, --help] show the help message and exit
Required arguments:
* --osd-id OSD_ID OSD id to attach new DB to
* --osd-fsid OSD_FSID OSD fsid to attach new DB to
* --target TARGET_LV logical volume name to attach as DB
**migrate**
Moves BlueFS data from the source volume(s) to the target one; the source volumes
(except the main, i.e. data or block, one) are removed on success. Only LVM
volumes are permitted as the target, either one that is already attached or a new
one. In the latter case it is attached to the OSD, replacing one of the source
devices. The following replacement rules apply (in order of precedence, stopping
at the first match):
- if the source list has a DB volume, the target device replaces it.
- if the source list has a WAL volume, the target device replaces it.
- if the source list has only the slow volume, the operation is not permitted and
  requires explicit allocation via the new-db/new-wal commands.
Usage::
ceph-volume lvm migrate --osd-id OSD_ID --osd-fsid OSD_FSID --target TARGET_LV --from {data|db|wal} [{data|db|wal} ...]
Optional arguments:
* [-h, --help] show the help message and exit
Required arguments:
* --osd-id OSD_ID OSD id to perform migration at
* --osd-fsid OSD_FSID OSD fsid to perform migration at
* --target TARGET_LV logical volume to move data to
* --from TYPE_LIST list of source device type names, e.g. --from db wal
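A hedged end-to-end sketch of the new-db and migrate commands documented above; the OSD id, fsid and vg/lv names are placeholders:

# attach a fresh logical volume as the DB of osd.1
ceph-volume lvm new-db --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db
# then move the BlueFS data currently on the slow (data) device onto it
ceph-volume lvm migrate --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db --from data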
simple
------

View File

@ -904,11 +904,16 @@ data should remain readable and writeable, although data redundancy
may be reduced as some PGs may end up in a degraded (but active)
state. It will return a success code if it is okay to stop the
OSD(s), or an error code and informative message if it is not or if no
conclusion can be drawn at the current time.
conclusion can be drawn at the current time. When ``--max <num>`` is
provided, up to <num> OSD IDs will be returned (including the provided
OSDs) that can all be stopped simultaneously. This allows larger sets
of stoppable OSDs to be generated easily by providing a single
starting OSD and a max. Additional OSDs are drawn from adjacent locations
in the CRUSH hierarchy.
Usage::
ceph osd ok-to-stop <id> [<ids>...]
ceph osd ok-to-stop <id> [<ids>...] [--max <num>]
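For example (ids illustrative), asking whether osd.3 plus up to nine further OSDs drawn from adjacent CRUSH locations can be stopped together:

ceph osd ok-to-stop 3 --max 10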
Subcommand ``pause`` pauses osd.

View File

@ -791,6 +791,14 @@ Trimming requires that the placement groups are ``active + clean``.
:Default: 500
``paxos service trim max multiplier``
:Description: The factor by which paxos service trim max will be multiplied
to get a new upper bound when trim sizes are high (0 disables it)
:Type: Integer
:Default: ``20``
``mon max log epochs``
:Description: The maximum number of log epochs to trim during a single proposal
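Assuming the usual underscore form of these option names, they could be tuned at runtime roughly as follows (values illustrative):

# factor applied to paxos_service_trim_max when trim sizes are high; 0 disables it
ceph config set mon paxos_service_trim_max_multiplier 20
ceph config set mon mon_max_log_epochs 500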

View File

@ -88,7 +88,7 @@ Similarly, two options control whether IPv4 and IPv6 addresses are used:
* ``ms_bind_ipv6`` [default: false] controls whether a daemon binds
to an IPv6 address
.. note: The ability to bind to multiple ports has paved the way for
.. note:: The ability to bind to multiple ports has paved the way for
dual-stack IPv4 and IPv6 support. That said, dual-stack support is
not yet tested as of Nautilus v14.2.0 and likely needs some
additional code changes to work correctly.

View File

@ -201,6 +201,27 @@ following option to the ``[global]`` section of your Ceph configuration file.
We prefer that the cluster network is **NOT** reachable from the public network
or the Internet for added security.
IPv4/IPv6 Dual Stack Mode
-------------------------
If you want to run in an IPv4/IPv6 dual stack mode and want to define your public and/or
cluster networks, then you need to specify both your IPv4 and IPv6 networks for each:
.. code-block:: ini
[global]
# ... elided configuration
public network = {IPv4 public-network/netmask}, {IPv6 public-network/netmask}
This is so Ceph can find a valid IP address for both address families.
If you want an IPv4-only or an IPv6-only stack, make sure you set the `ms bind`
options accordingly.
.. note::
Binding to IPv4 is enabled by default, so if you just add the option to bind to IPv6
you'll actually put yourself into dual stack mode. If you want just IPv6, then disable IPv4 and
enable IPv6. See `Bind`_ below.
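A minimal sketch of the bind options for the modes described above (the networks themselves are elided; set them as in the ini snippet earlier in this section):

# dual stack: IPv4 binding stays on by default, additionally bind IPv6
ceph config set global ms_bind_ipv6 true
# IPv6 only: also turn the IPv4 default off
ceph config set global ms_bind_ipv4 false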
Ceph Daemons
============
@ -336,11 +357,16 @@ addresses.
:Default: ``7300``
:Required: No.
``ms bind ipv4``
:Description: Enables Ceph daemons to bind to IPv4 addresses.
:Type: Boolean
:Default: ``true``
:Required: No
``ms bind ipv6``
:Description: Enables Ceph daemons to bind to IPv6 addresses. Currently the
messenger *either* uses IPv4 or IPv6, but it cannot do both.
:Description: Enables Ceph daemons to bind to IPv6 addresses.
:Type: Boolean
:Default: ``false``
:Required: No

View File

@ -147,13 +147,21 @@ function install_pkg_on_ubuntu {
}
function install_boost_on_ubuntu {
local codename=$1
if dpkg -s ceph-libboost1.67-dev &> /dev/null; then
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y remove 'ceph-libboost.*1.67.*'
$SUDO rm /etc/apt/sources.list.d/ceph-libboost1.67.list
fi
local project=libboost
local ver=1.72
local installed_ver=$(apt -qq list --installed ceph-libboost*-dev 2>/dev/null |
grep -e 'libboost[0-9].[0-9]\+-dev' |
cut -d' ' -f2 |
cut -d'.' -f1,2)
if test -n "$installed_ver"; then
if echo "$installed_ver" | grep -q "^$ver"; then
return
else
$SUDO env DEBIAN_FRONTEND=noninteractive apt-get -y remove "ceph-libboost.*${installed_ver}.*"
$SUDO rm -f /etc/apt/sources.list.d/ceph-libboost${installed_ver}.list
fi
fi
local codename=$1
local project=libboost
local sha1=1d7c7a00cc3f37e340bae0360191a757b44ec80c
install_pkg_on_ubuntu \
$project \

View File

@ -1,7 +1,21 @@
#!/bin/sh -e
SCRIPTNAME="$(basename "${0}")"
BASEDIR="$(readlink -f "$(dirname "${0}")")"
if [ ! -d .git ]; then
echo "no .git present. run this from the base dir of the git checkout."
echo "$SCRIPTNAME: Full path to the script: $BASEDIR/$SCRIPTNAME"
echo "$SCRIPTNAME: No .git present. Run this from the base dir of the git checkout."
exit 1
fi
# Running the script from a directory containing a colon anywhere in the path
# will expose us to the dreaded "[BUG] npm run [command] failed if the directory
# path contains colon" bug https://github.com/npm/cli/issues/633
# (see https://tracker.ceph.com/issues/39556 for details)
if [[ "$BASEDIR" == *:* ]] ; then
echo "$SCRIPTNAME: Full path to the script: $BASEDIR/$SCRIPTNAME"
echo "$SCRIPTNAME: The path to the script contains a colon. Their presence has been known to break the script."
exit 1
fi
@ -67,7 +81,7 @@ build_dashboard_frontend() {
$CURR_DIR/src/tools/setup-virtualenv.sh $TEMP_DIR
$TEMP_DIR/bin/pip install nodeenv
$TEMP_DIR/bin/nodeenv -p --node=10.13.0
$TEMP_DIR/bin/nodeenv --verbose -p --node=10.13.0
cd src/pybind/mgr/dashboard/frontend
DEFAULT_LANG=`jq -r .config.locale package.json`

View File

@ -37,7 +37,7 @@
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1557386759572,
"iteration": 1615564911000,
"links": [],
"panels": [
{
@ -182,7 +182,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (mode) (\n irate(node_cpu{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~\"($ceph_hosts).*\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts).*\"}[1m]))\n) * 100",
"expr": "sum by (mode) (\n irate(node_cpu{instance=~\"($ceph_hosts)([\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[1m])\n) / scalar(\n sum(irate(node_cpu{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[1m]) or\n irate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[1m]))\n) * 100",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{mode}}",
@ -283,14 +283,14 @@
"steppedLine": false,
"targets": [
{
"expr": "(node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"})- (\n (node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n (node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n (node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n (node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"})\n )\n \n",
"expr": "(node_memory_MemTotal{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"})- (\n (node_memory_MemFree{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) + \n (node_memory_Cached{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) + \n (node_memory_Buffers{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) +\n (node_memory_Slab{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"})\n )\n \n",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "used",
"refId": "D"
},
{
"expr": "node_memory_MemFree{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]].*\"} ",
"expr": "node_memory_MemFree{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_MemFree_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} ",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
@ -298,7 +298,7 @@
"refId": "A"
},
{
"expr": "(node_memory_Cached{instance=~\"[[ceph_hosts]].*\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]].*\"}) + \n(node_memory_Buffers{instance=~\"[[ceph_hosts]].*\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]].*\"}) +\n(node_memory_Slab{instance=~\"[[ceph_hosts]].*\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]].*\"}) \n",
"expr": "(node_memory_Cached{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Cached_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) + \n(node_memory_Buffers{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Buffers_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) +\n(node_memory_Slab{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_Slab_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}) \n",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
@ -306,7 +306,7 @@
"refId": "C"
},
{
"expr": "node_memory_MemTotal{instance=~\"[[ceph_hosts]].*\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]].*\"} ",
"expr": "node_memory_MemTotal{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} or node_memory_MemTotal_bytes{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"} ",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
@ -401,7 +401,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (device) (\n irate(node_network_receive_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
"expr": "sum by (device) (\n irate(node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\.:].*)?\",device!=\"lo\"}[1m]) or \n irate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\",device!=\"lo\"}[1m])\n)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}.rx",
@ -410,7 +410,7 @@
"textEditor": true
},
{
"expr": "sum by (device) (\n irate(node_network_transmit_bytes{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts).*\",device!=\"lo\"}[1m])\n)",
"expr": "sum by (device) (\n irate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\.:].*)?\",device!=\"lo\"}[1m]) or\n irate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\",device!=\"lo\"}[1m])\n)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}.tx",
@ -501,7 +501,7 @@
"steppedLine": false,
"targets": [
{
"expr": "irate(node_network_receive_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
"expr": "irate(node_network_receive_drop{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m]) or irate(node_network_receive_drop_total{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m])",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
@ -509,7 +509,7 @@
"refId": "A"
},
{
"expr": "irate(node_network_transmit_drop{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_drop_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
"expr": "irate(node_network_transmit_drop{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m]) or irate(node_network_transmit_drop_total{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m])",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}.tx",
@ -621,7 +621,7 @@
"tableColumn": "",
"targets": [
{
"expr": "sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts).*\"})",
"expr": "sum(ceph_osd_stat_bytes and on (ceph_daemon) ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"})",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
@ -685,7 +685,7 @@
"steppedLine": false,
"targets": [
{
"expr": "irate(node_network_receive_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_receive_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
"expr": "irate(node_network_receive_errs{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m]) or irate(node_network_receive_errs_total{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m])",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
@ -693,7 +693,7 @@
"refId": "A"
},
{
"expr": "irate(node_network_transmit_errs{instance=~\"[[ceph_hosts]].*\"}[1m]) or irate(node_network_transmit_errs_total{instance=~\"[[ceph_hosts]].*\"}[1m])",
"expr": "irate(node_network_transmit_errs{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m]) or irate(node_network_transmit_errs_total{instance=~\"[[ceph_hosts]]([\\\\.:].*)?\"}[1m])",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}.tx",
@ -798,7 +798,7 @@
"steppedLine": false,
"targets": [
{
"expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts).*\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts).*\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) writes",
@ -807,7 +807,7 @@
"textEditor": true
},
{
"expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts).*\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
@ -899,14 +899,14 @@
"steppedLine": false,
"targets": [
{
"expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts).*\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) write",
"refId": "B"
},
{
"expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts).*\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts).*\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{device}}({{ceph_daemon}}) read",
@ -992,7 +992,7 @@
"steppedLine": false,
"targets": [
{
"expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts).*\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts).*\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance,device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
@ -1083,7 +1083,7 @@
"steppedLine": false,
"targets": [
{
"expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts).*\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts).*\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
"format": "time_series",
"hide": false,
"intervalFactor": 1,

View File

@ -131,7 +131,6 @@
"#d44a3a"
],
"datasource": "$datasource",
"decimals": 0,
"description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
"decimals": 2,
"format": "percentunit",
@ -215,7 +214,6 @@
"#d44a3a"
],
"datasource": "$datasource",
"decimals": 0,
"description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
"decimals": 2,
"format": "percentunit",
@ -433,7 +431,7 @@
"tableColumn": "",
"targets": [
{
"expr" : "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
"expr" : "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device, ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
"format": "time_series",
"instant": true,
"intervalFactor": 1,

View File

@ -0,0 +1,440 @@
{
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "5.0.0"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": "5.0.0"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1534386107523,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 0
},
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum[30s]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{source_zone}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Replication (throughput) from Source Zone",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"unit": "bytes",
"format": "Bps",
"decimals": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 7,
"w": 7.4,
"x": 8.3,
"y": 0
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count[30s]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{source_zone}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Replication (objects) from Source Zone",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"decimals": null,
"label": "Objects/s",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 0
},
"id": 3,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum[30s]) * 1000)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{source_zone}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Polling Request Latency from Source Zone",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"unit": "s",
"format": "ms",
"decimals": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 7
},
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors[30s]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{source_zone}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Unsuccessful Object Replications from Source Zone",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"decimals": null,
"label": "Count/s",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"refresh": "15s",
"schemaVersion": 16,
"style": "dark",
"tags": [
"overview"
],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "$datasource",
"hide": 2,
"includeAll": true,
"label": null,
"multi": false,
"name": "rgw_servers",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {
"tags": [],
"text": "default",
"value": "default"
},
"hide": 0,
"label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"15s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "RGW Sync Overview",
"uid": "rgw-sync-overview",
"version": 2
}

View File

@ -0,0 +1,769 @@
rule_files:
- ceph_default_alerts.yml
evaluation_interval: 5m
tests:
# health error
- interval: 5m
input_series:
- series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
values: '2 2 2 2 2 2 2'
promql_expr_test:
- expr: ceph_health_status == 2
eval_time: 5m
exp_samples:
- labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
value: 2
alert_rule_test:
- eval_time: 1m
alertname: health error
- eval_time: 6m
alertname: health error
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
type: ceph_default
severity: critical
exp_annotations:
description: >
Ceph in HEALTH_ERROR state for more than 5 minutes.
Please check "ceph health detail" for more information.
# health warning
- interval: 5m
input_series:
- series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: ceph_health_status == 1
eval_time: 15m
exp_samples:
- labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
value: 1
alert_rule_test:
- eval_time: 10m
alertname: health warn
- eval_time: 20m
alertname: health warn
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
type: ceph_default
severity: warning
exp_annotations:
description: >
Ceph has been in HEALTH_WARN for more than 15 minutes.
Please check "ceph health detail" for more information.
# low monitor quorum count
- interval: 1m
input_series:
- series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
job="ceph"}'
values: '1 1 1 1 1'
- series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
job="ceph"}'
values: '1 1 1 1 1'
- series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
job="ceph"}'
values: '0 0 0 0 0'
- series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
(dev)",hostname="ceph",instance="ceph:9283",job="ceph",
public_addr="172.20.0.2",rank="0"}'
values: '1 1 1 1 1'
- series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
(dev)",hostname="ceph",instance="ceph:9283",job="ceph",
public_addr="172.20.0.2",rank="1"}'
values: '1 1 1 1 1'
- series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
(dev)",hostname="ceph",instance="ceph:9283",job="ceph",
public_addr="172.20.0.2",rank="2"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: sum(ceph_mon_quorum_status) < 3
eval_time: 1m
exp_samples:
- labels: '{}'
value: 2
alert_rule_test:
- eval_time: 1m
alertname: low monitor quorum count
exp_alerts:
- exp_labels:
type: ceph_default
severity: critical
exp_annotations:
description: |
Monitor count in quorum is below three.
Only 2 of 3 monitors are active.
The following monitors are down:
- mon.c on ceph
# 10% OSDs down
- interval: 1m
input_series:
- series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
values: '0 0 0 0 0'
- series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
eval_time: 1m
exp_samples:
- labels: '{}'
value: 3.333333333333333E+01
alert_rule_test:
- eval_time: 1m
alertname: 10% OSDs down
exp_alerts:
- exp_labels:
type: ceph_default
severity: critical
exp_annotations:
description: |
33.33% or 1 of 3 OSDs are down (≥ 10%).
The following OSDs are down:
- osd.1 on ceph
# OSD down
- interval: 1m
input_series:
- series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: count(ceph_osd_up == 0) > 0
eval_time: 1m
exp_samples:
- labels: '{}'
value: 1
alert_rule_test:
- eval_time: 15m
alertname: OSD down
exp_alerts:
- exp_labels:
type: ceph_default
severity: warning
exp_annotations:
description: |
1 OSD down for more than 15 minutes.
1 of 3 OSDs are down.
The following OSD is down:
- osd.1 on ceph
# OSDs near full
- interval: 1m
input_series:
- series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
,job="ceph"}'
values: '1076310016 1076310016 1076310016 1076310016 1076310016
1076310016'
- series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
,job="ceph"}'
values: '1076310016 1076310016 1076310016 1076310016 1076310016
1076310016'
- series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
,job="ceph"}'
values: '1076310016 1076310016 1076310016 1076310016 1076310016
100856561909.76'
- series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
,job="ceph"}'
values: '108447916032 108447916032 108447916032 108447916032 108447916032
108447916032'
- series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
,job="ceph"}'
values: '108447916032 108447916032 108447916032 108447916032 108447916032
108447916032'
- series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
,job="ceph"}'
values: '108447916032 108447916032 108447916032 108447916032 108447916032
108447916032'
- series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
promql_expr_test:
- expr: |
(
((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
ceph_osd_metadata
) * 100 > 90
eval_time: 5m
exp_samples:
- labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
job="ceph"}'
value: 9.3E+01
alert_rule_test:
- eval_time: 10m
alertname: OSDs near full
exp_alerts:
- exp_labels:
ceph_daemon: osd.2
hostname: ceph
instance: ceph:9283
job: ceph
type: ceph_default
severity: critical
exp_annotations:
description: >
OSD osd.2 on ceph is dangerously full: 93%
# flapping OSD
- interval: 1s
input_series:
- series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
values: '1+1x100'
- series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
values: '1+0x100'
- series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
values: '1+0x100'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
promql_expr_test:
- expr: |
(
rate(ceph_osd_up[5m])
* on(ceph_daemon) group_left(hostname) ceph_osd_metadata
) * 60 > 1
eval_time: 1m
exp_samples:
- labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
job="ceph"}'
value: 1.2200000000000001E+01
alert_rule_test:
- eval_time: 5m
alertname: flapping OSD
exp_alerts:
- exp_labels:
ceph_daemon: osd.0
hostname: ceph
instance: ceph:9283
job: ceph
severity: warning
type: ceph_default
exp_annotations:
description: >
OSD osd.0 on ceph was
marked down and back up at 20.1 times once a
minute for 5 minutes.
# high pg count deviation
- interval: 1m
input_series:
- series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
job="ceph"}'
values: '100 100 100 100 100 160'
- series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
job="ceph"}'
values: '100 100 100 100 100 320'
- series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
job="ceph"}'
values: '100 100 100 100 100 160'
- series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
job="ceph"}'
values: '100 100 100 100 100 160'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
- series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
ceph_version="ceph version 17.0.0-189-g3558fd72
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
public_addr="172.20.0.2"}'
values: '1 1 1 1 1 1'
promql_expr_test:
- expr: |
abs(
(
(ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
by (job)
) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
eval_time: 5m
exp_samples:
- labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
job="ceph"}'
value: 6E-01
alert_rule_test:
- eval_time: 10m
alertname: high pg count deviation
exp_alerts:
- exp_labels:
ceph_daemon: osd.1
hostname: ceph
instance: ceph:9283
job: ceph
severity: warning
type: ceph_default
exp_annotations:
description: >
OSD osd.1 on ceph deviates
by more than 30% from average PG count.
# pgs inactive
- interval: 1m
input_series:
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="1"}'
values: '1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="2"}'
values: '1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="3"}'
values: '1 1 1 1 1 1 1 1'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '1 1 1 1 1 1 1 1'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '32 32 32 32 32 32 32 32'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '33 32 32 32 32 33 33 32'
- series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '32 32 32 32 32 32 32 32'
- series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '32 32 32 32 32 32 32 32'
promql_expr_test:
- expr: ceph_pool_metadata * on(pool_id,instance) group_left()
(ceph_pg_total - ceph_pg_active) > 0
eval_time: 5m
exp_samples:
- labels: '{instance="ceph:9283", job="ceph",
name="device_health_metrics",
pool_id="3"}'
value: 1
alert_rule_test:
- eval_time: 5m
alertname: pgs inactive
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
name: device_health_metrics
pool_id: 3
severity: critical
type: ceph_default
exp_annotations:
description: >
1 PGs have been inactive for more than 5 minutes in pool
device_health_metrics.
Inactive placement groups aren't able to serve read/write
requests.
#pgs unclean
- interval: 1m
input_series:
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="2"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="3"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
32 32 32'
- series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
33 33'
- series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
32 32'
- series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
32 32'
promql_expr_test:
- expr: ceph_pool_metadata * on(pool_id,instance) group_left()
(ceph_pg_total - ceph_pg_clean) > 0
eval_time: 15m
exp_samples:
- labels: '{instance="ceph:9283", job="ceph",
name="device_health_metrics", pool_id="3"}'
value: 1
alert_rule_test:
- eval_time: 16m
alertname: pgs unclean
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
name: device_health_metrics
pool_id: 3
severity: warning
type: ceph_default
exp_annotations:
description: >
1 PGs haven't been clean for more than 15 minutes in pool
device_health_metrics.
Unclean PGs haven't been able to completely recover from a
previous failure.
# root volume full
- interval: 1m
input_series:
- series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
--live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
mountpoint="/"}'
values: '35336400896 35336400896 35336400896 35336400896 35336400896
3525385519.104 3533640089'
- series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
--live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
mountpoint="/"}'
values: '73445531648 73445531648 73445531648 73445531648 73445531648
73445531648 73445531648'
promql_expr_test:
- expr: node_filesystem_avail_bytes{mountpoint="/"} /
node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
eval_time: 5m
exp_samples:
- labels: '{device="/dev/mapper/fedora_localhost --live-home",
fstype="ext4", instance="node-exporter", job="node-exporter",
mountpoint="/"}'
value: 4.8E+00
alert_rule_test:
- eval_time: 10m
alertname: root volume full
exp_alerts:
- exp_labels:
device: /dev/mapper/fedora_localhost --live-home
fstype: ext4
instance: node-exporter
job: node-exporter
mountpoint: /
severity: critical
type: ceph_default
exp_annotations:
description: >
Root volume (OSD and MON store) is dangerously full: 4.811% free.
# network packets dropped
- interval: 1s
input_series:
- series: 'node_network_receive_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
- series: 'node_network_transmit_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
promql_expr_test:
- expr: |
(
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
value: 1.2E+02
alert_rule_test:
- eval_time: 5m
alertname: network packets dropped
exp_alerts:
- exp_labels:
device: eth0
instance: node-exporter
job: node-exporter
severity: warning
type: ceph_default
exp_annotations:
description: >
Node node-exporter experiences packet drop > 0.01% or >
10 packets/s on interface eth0.
# network packets errors
- interval: 1s
input_series:
- series: 'node_network_receive_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
- series: 'node_network_transmit_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
promql_expr_test:
- expr: |
(
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
value: 1.2E+02
alert_rule_test:
- eval_time: 5m
alertname: network packet errors
exp_alerts:
- exp_labels:
device: eth0
instance: node-exporter
job: node-exporter
severity: warning
type: ceph_default
exp_annotations:
description: >
Node node-exporter experiences packet errors > 0.01% or > 10
packets/s on interface eth0.
# MTU Mismatch
- interval: 1m
input_series:
- series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
job="node-exporter"}'
values: '1500 1500 1500 1500 1500'
- series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
job="node-exporter"}'
values: '1500 1500 1500 1500 1500'
- series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
job="node-exporter"}'
values: '1500 1500 1500 1500 1500'
- series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
job="node-exporter"}'
values: '1500 1500 1500 1500 1500'
- series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
job="node-exporter"}'
values: '9000 9000 9000 9000 9000'
promql_expr_test:
- expr: node_network_mtu_bytes{device!="lo"} != on() group_left()
(quantile(0.5, node_network_mtu_bytes{device!="lo"}))
eval_time: 1m
exp_samples:
- labels: '{__name__="node_network_mtu_bytes", device="eth4",
instance="node-exporter", job="node-exporter"}'
value: 9000
alert_rule_test:
- eval_time: 1m
alertname: MTU Mismatch
exp_alerts:
- exp_labels:
device: eth4
instance: node-exporter
job: node-exporter
oid: 1.3.6.1.4.1.50495.15.1.2.8.5
severity: warning
type: ceph_default
exp_annotations:
description: >
Node node-exporter has a different MTU size (9000)
than the median value on device eth4.
# pool full
- interval: 1m
input_series:
- series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '0 0 0 0 0 0 0 0 0'
- series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '1850 1850 1850 1850 1850 1850 1850'
- series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '900 900 23524 23524 23524 23524 23524 23524
23524'
- series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}'
values: '106287063040 106287063040 106287063040 106287063040 106287063040
106287063040 106287063040'
- series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}'
values: '106287063040 106287063040 106287063040 106287063040 106287063040
106287063040 106287063040'
- series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}'
values: '37.5 37.5 37.5 37.5 37.5 37.5 37.5'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="device_health_metrics",pool_id="1"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name=".rgw.root",pool_id="2"}'
values: '1 1 1 1 1 1 1 1 1'
- series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
name="default.rgw.log",pool_id="3"}'
values: '1 1 1 1 1 1 1 1 1'
promql_expr_test:
- expr: |
ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
* on(pool_id) group_right ceph_pool_metadata * 100 > 90
eval_time: 1m
exp_samples:
- labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log",
pool_id="3"}'
value: 9.6E+01
alert_rule_test:
- eval_time: 2m
alertname: pool full
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
name: default.rgw.log
pool_id: 3
severity: critical
type: ceph_default
exp_annotations:
description: Pool default.rgw.log at 96% capacity.
# slow OSD ops
- interval: 1m
input_series:
- series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
values: '1+0x120'
promql_expr_test:
- expr: ceph_healthcheck_slow_ops > 0
eval_time: 1m
exp_samples:
- labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
job="ceph"}'
value: 1
alert_rule_test:
- eval_time: 20m
alertname: Slow OSD Ops
exp_alerts:
- exp_labels:
instance: ceph:9283
job: ceph
severity: warning
type: ceph_default
exp_annotations:
description: >
1 OSD requests are taking too long to process
(osd_op_complaint_time exceeded)
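These unit tests are written for Prometheus' promtool, which replays the input_series and checks both the PromQL samples and the resulting alerts. A minimal sketch of running them locally, assuming the alert rules and the tests above are saved as prometheus_alerts.yml and test_alerts.yml (both file names are placeholders, not taken from this diff):

# Validate the alert rule syntax first, then replay the unit tests above.
# File names are assumptions; adjust them to wherever the files live.
promtool check rules prometheus_alerts.yml
promtool test rules test_alerts.yml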
View File
@ -6,6 +6,7 @@ overrides:
conf:
osd:
filestore flush min: 0
osd heartbeat grace: 60
tasks:
- check-counter:
counters:
View File
@ -146,9 +146,7 @@ R/O, unpartitioned:
$ blockdev --setrw $DEV
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw $DEV
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw $DEV # succeeds but effectively ignored
$ blockdev --getro $DEV
1
$ dd if=/dev/urandom of=$DEV bs=1k seek=1 count=1 status=none
@ -182,15 +180,11 @@ R/O, partitioned:
$ blockdev --setrw ${DEV}p1
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw ${DEV}p1
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw ${DEV}p1 # succeeds but effectively ignored
$ blockdev --setrw ${DEV}p2
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw ${DEV}p2
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw ${DEV}p2 # succeeds but effectively ignored
$ blockdev --getro ${DEV}p1
1
$ blockdev --getro ${DEV}p2
@ -227,9 +221,7 @@ Unpartitioned:
$ blockdev --setrw $DEV
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw $DEV
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw $DEV # succeeds but effectively ignored
$ blockdev --getro $DEV
1
$ dd if=/dev/urandom of=$DEV bs=1k seek=1 count=1 status=none
@ -263,15 +255,11 @@ Partitioned:
$ blockdev --setrw ${DEV}p1
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw ${DEV}p1
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw ${DEV}p1 # succeeds but effectively ignored
$ blockdev --setrw ${DEV}p2
.*BLKROSET: Permission denied (re)
[1]
$ sudo blockdev --setrw ${DEV}p2
.*BLKROSET: Read-only file system (re)
[1]
$ sudo blockdev --setrw ${DEV}p2 # succeeds but effectively ignored
$ blockdev --getro ${DEV}p1
1
$ blockdev --getro ${DEV}p2
View File
@ -57,6 +57,34 @@ function get_osds_up() {
echo $osds
}
function TEST_reweight_vs_classes() {
local dir=$1
# CrushWrapper::update_item (and ceph osd crush set) must rebuild the shadow
# tree too. https://tracker.ceph.com/issues/48065
run_mon $dir a || return 1
run_osd $dir 0 || return 1
run_osd $dir 1 || return 1
run_osd $dir 2 || return 1
ceph osd crush set-device-class ssd osd.0 || return 1
ceph osd crush class ls-osd ssd | grep 0 || return 1
ceph osd crush set-device-class ssd osd.1 || return 1
ceph osd crush class ls-osd ssd | grep 1 || return 1
ceph osd crush reweight osd.0 1
h=`hostname -s`
ceph osd crush dump | jq ".buckets[] | select(.name==\"$h\") | .items[0].weight" | grep 65536
ceph osd crush dump | jq ".buckets[] | select(.name==\"$h~ssd\") | .items[0].weight" | grep 65536
ceph osd crush set 0 2 host=$h
ceph osd crush dump | jq ".buckets[] | select(.name==\"$h\") | .items[0].weight" | grep 131072
ceph osd crush dump | jq ".buckets[] | select(.name==\"$h~ssd\") | .items[0].weight" | grep 131072
}
function TEST_classes() {
local dir=$1
View File
@ -264,6 +264,8 @@ function TEST_0_osd() {
ceph osd ok-to-stop 3 || return 1
! ceph osd ok-to-stop 0 1 || return 1
! ceph osd ok-to-stop 2 3 || return 1
ceph osd ok-to-stop 0 --max 2 | grep '[0]' || return 1
ceph osd ok-to-stop 1 --max 2 | grep '[1]' || return 1
# with min_size 2 we can stop 1 osds
ceph osd pool set ec min_size 2 || return 1
@ -274,6 +276,11 @@ function TEST_0_osd() {
! ceph osd ok-to-stop 0 1 2 || return 1
! ceph osd ok-to-stop 1 2 3 || return 1
ceph osd ok-to-stop 0 --max 2 | grep '[0,1]' || return 1
ceph osd ok-to-stop 0 --max 20 | grep '[0,1]' || return 1
ceph osd ok-to-stop 2 --max 2 | grep '[2,3]' || return 1
ceph osd ok-to-stop 2 --max 20 | grep '[2,3]' || return 1
# we should get the same result with one of the osds already down
kill_daemons $dir TERM osd.0 || return 1
ceph osd down 0 || return 1
View File
@ -2,7 +2,10 @@ overrides:
ceph:
conf:
mgr:
debug client: 10
debug mgr: 20
debug ms: 1
debug finisher: 20
debug client: 20
log-whitelist:
- OSD full dropping all updates
- OSD near full
View File
@ -7,4 +7,6 @@ tasks:
mon.a:
- ceph fs dump --format=json-pretty
- ceph fs set cephfs min_compat_client mimic
- sleep:
duration: 5
- fs.clients_evicted:
View File
@ -7,6 +7,8 @@ tasks:
mon.a:
- ceph fs dump --format=json-pretty
- ceph fs set cephfs min_compat_client mimic
- sleep:
duration: 5
- fs.clients_evicted:
clients:
client.0: False
View File
@ -3,6 +3,9 @@ overrides:
log-whitelist:
- SLOW_OPS
- slow request
conf:
osd:
osd heartbeat grace: 60
tasks:
- workunit:
clients:
View File
@ -6,6 +6,7 @@ overrides:
conf:
osd:
filestore flush min: 0
osd heartbeat grace: 60
tasks:
- workunit:
clients:
View File
@ -0,0 +1,4 @@
openstack:
- volumes: # attached to each instance
count: 4
size: 30 # GB
View File
@ -0,0 +1,21 @@
meta:
- desc: |
Install and run ceph on one node,
with a separate client 1.
Upgrade client 1 to nautilus
Run tests against old cluster
roles:
- - mon.a
- mon.b
- mon.c
- osd.0
- osd.1
- osd.2
- client.0
- mgr.x
- - client.1
overrides:
ceph:
log-whitelist:
- failed to encode map
fs: xfs
View File
@ -0,0 +1,11 @@
tasks:
- install:
branch: nautilus
exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev','python34-cephfs','python34-rados']
- print: "**** done install nautilus"
- install.upgrade:
exclude_packages: ['ceph-test', 'ceph-test-dbg','libcephfs1', 'python-ceph']
client.1:
- print: "**** done install.upgrade to -x on client.1"
- ceph:
- print: "**** done ceph task"
View File
@ -0,0 +1,6 @@
overrides:
ceph:
conf:
client:
rbd default features: 61
View File
@ -0,0 +1,6 @@
overrides:
ceph:
conf:
client:
rbd default features: 1
View File
@ -0,0 +1,34 @@
tasks:
- parallel:
- workunit:
branch: nautilus
clients:
client.0:
- rbd/notify_master.sh
env:
RBD_FEATURES: "61"
- workunit:
branch: pacific
clients:
client.1:
- rbd/notify_slave.sh
env:
RBD_FEATURES: "61"
RBD_DISABLE_UPDATE_FEATURES: "1"
- print: "**** done rbd: old librbd -> new librbd"
- parallel:
- workunit:
branch: nautilus
clients:
client.0:
- rbd/notify_slave.sh
env:
RBD_FEATURES: "61"
- workunit:
branch: pacific
clients:
client.1:
- rbd/notify_master.sh
env:
RBD_FEATURES: "61"
- print: "**** done rbd: new librbd -> old librbd"
View File
@ -0,0 +1 @@
../../../../../../distros/all/ubuntu_18.04.yaml
View File
@ -1,9 +1,9 @@
meta:
- desc: |
Run ceph on two nodes, using one of them as a client,
with a separate client-only node.
with a separate client-only node.
Use xfs beneath the osds.
install ceph/nautilus v14.2.2 point version
install ceph/nautilus v14.2.20 point version
run workload and upgrade-sequence in parallel
(every point release should be tested)
run workload and upgrade-sequence in parallel
@ -32,8 +32,8 @@ overrides:
- cache pools at or near target size
- filesystem is degraded
- OBJECT_MISPLACED
### ref: https://tracker.ceph.com/issues/40251
#removed see ^ - failed to encode map
### ref: https://tracker.ceph.com/issues/40251
#removed see ^ - failed to encode map
fs: xfs
@ -46,7 +46,7 @@ overrides:
osd:
osd map max advance: 1000
osd_class_default_list: "*"
osd_class_load_list: "*"
osd_class_load_list: "*"
client:
rgw_crypt_require_ssl: false
rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
@ -70,19 +70,19 @@ openstack:
size: 30 # GB
tasks:
# v14.2.0 removed per http://tracker.ceph.com/issues/40251
- print: "**** done nautilus v14.2.2 about to install"
- print: "**** done nautilus v14.2.20 about to install"
- install:
tag: v14.2.2
tag: v14.2.20
# line below can be removed its from jewel test
#exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
- print: "**** done v14.2.2 install"
- print: "**** done v14.2.20 install"
- ceph:
fs: xfs
add_osds_to_crush: true
- print: "**** done ceph xfs"
- sequential:
- workload
- print: "**** done workload v14.2.2"
- print: "**** done workload v14.2.20"
# v14.2.1 removed per http://tracker.ceph.com/issues/40251
View File
@ -1,11 +1,11 @@
meta:
- desc: install ceph/nautilus v14.2.2
- desc: install ceph/nautilus v14.2.20
tasks:
- install:
tag: v14.2.2
tag: v14.2.20
exclude_packages: ['librados3']
extra_packages: ['librados2']
- print: "**** done install nautilus v14.2.2"
- print: "**** done install nautilus v14.2.20"
- ceph:
- exec:
osd.0:
View File
@ -3,7 +3,7 @@ meta:
librbd python api tests
tasks:
- workunit:
tag: v14.2.10
tag: v14.2.20
clients:
client.0:
- rbd/test_librbd_python.sh
View File
@ -3,6 +3,7 @@ ceph manager -- Thrasher and CephManager objects
"""
from functools import wraps
import contextlib
import errno
import random
import signal
import time
@ -2560,13 +2561,22 @@ class CephManager:
Loop until quorum size is reached.
"""
self.log('waiting for quorum size %d' % size)
start = time.time()
while not len(self.get_mon_quorum()) == size:
if timeout is not None:
assert time.time() - start < timeout, \
('failed to reach quorum size %d '
'before timeout expired' % size)
time.sleep(3)
sleep = 3
with safe_while(sleep=sleep,
tries=timeout // sleep,
action=f'wait for quorum size {size}') as proceed:
while proceed():
try:
if len(self.get_mon_quorum()) == size:
break
except CommandFailedError as e:
# could fail instead of being blocked if the rotating key of the
# connected monitor is not updated yet after they form the
# quorum
if e.exitstatus == errno.EACCES:
pass
else:
raise
self.log("quorum is size %d" % size)
def get_mon_health(self, debug=False):
View File
@ -176,6 +176,9 @@ class CephFSTestCase(CephTestCase):
for m in self.mounts:
m.teardown()
# To prevent failover messages during Unwind of ceph task
self.mds_cluster.delete_all_filesystems()
for i, m in enumerate(self.mounts):
m.client_id = self._original_client_ids[i]
View File
@ -6,6 +6,7 @@ import logging
import errno
import time
from teuthology.exceptions import CommandFailedError
from teuthology.contextutil import safe_while
import os
from tasks.cephfs.cephfs_test_case import CephFSTestCase
@ -30,22 +31,46 @@ class TestScrubControls(CephFSTestCase):
self.assertEqual(res['return_code'], expected)
def _get_scrub_status(self):
return self.fs.rank_tell(["scrub", "status"])
def _check_task_status(self, expected_status):
task_status = self.fs.get_task_status("scrub status")
active = self.fs.get_active_names()
log.debug("current active={0}".format(active))
self.assertTrue(task_status[active[0]].startswith(expected_status))
def _check_task_status(self, expected_status, timo=120):
""" check scrub status for current active mds in ceph status """
with safe_while(sleep=1, tries=timo, action='wait for task status') as proceed:
while proceed():
active = self.fs.get_active_names()
log.debug("current active={0}".format(active))
task_status = self.fs.get_task_status("scrub status")
try:
if task_status[active[0]].startswith(expected_status):
return True
except KeyError:
pass
def _check_task_status_na(self, timo=120):
""" check absence of scrub status in ceph status """
with safe_while(sleep=1, tries=timo, action='wait for task status') as proceed:
while proceed():
active = self.fs.get_active_names()
log.debug("current active={0}".format(active))
task_status = self.fs.get_task_status("scrub status")
if not active[0] in task_status:
return True
def create_scrub_data(self, test_dir):
for i in range(32):
dirname = "dir.{0}".format(i)
dirpath = os.path.join(test_dir, dirname)
self.mount_a.run_shell_payload(f"""
set -e
mkdir -p {dirpath}
for ((i = 0; i < 32; i++)); do
dd if=/dev/urandom of={dirpath}/filename.$i bs=1M conv=fdatasync count=1
done
""")
def test_scrub_abort(self):
test_dir = "scrub_control_test_path"
abs_test_path = "/{0}".format(test_dir)
log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
client_path = os.path.join(self.mount_a.mountpoint, test_dir)
log.info("client_path: {0}".format(client_path))
log.info("Cloning repo into place")
TestScrubChecks.clone_repo(self.mount_a, client_path)
self.create_scrub_data(test_dir)
out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
self.assertNotEqual(out_json, None)
@ -56,8 +81,8 @@ class TestScrubControls(CephFSTestCase):
self.assertTrue("no active" in out_json['status'])
# sleep enough to fetch updated task status
time.sleep(10)
self._check_task_status("idle")
checked = self._check_task_status_na()
self.assertTrue(checked)
def test_scrub_pause_and_resume(self):
test_dir = "scrub_control_test_path"
@ -67,8 +92,7 @@ class TestScrubControls(CephFSTestCase):
client_path = os.path.join(self.mount_a.mountpoint, test_dir)
log.info("client_path: {0}".format(client_path))
log.info("Cloning repo into place")
_ = TestScrubChecks.clone_repo(self.mount_a, client_path)
self.create_scrub_data(test_dir)
out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
self.assertNotEqual(out_json, None)
@ -78,25 +102,22 @@ class TestScrubControls(CephFSTestCase):
out_json = self._get_scrub_status()
self.assertTrue("PAUSED" in out_json['status'])
# sleep enough to fetch updated task status
time.sleep(10)
self._check_task_status("paused")
checked = self._check_task_status("paused")
self.assertTrue(checked)
# resume and verify
self._resume_scrub(0)
out_json = self._get_scrub_status()
self.assertFalse("PAUSED" in out_json['status'])
checked = self._check_task_status_na()
self.assertTrue(checked)
def test_scrub_pause_and_resume_with_abort(self):
test_dir = "scrub_control_test_path"
abs_test_path = "/{0}".format(test_dir)
log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
client_path = os.path.join(self.mount_a.mountpoint, test_dir)
log.info("client_path: {0}".format(client_path))
log.info("Cloning repo into place")
_ = TestScrubChecks.clone_repo(self.mount_a, client_path)
self.create_scrub_data(test_dir)
out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
self.assertNotEqual(out_json, None)
@ -106,9 +127,8 @@ class TestScrubControls(CephFSTestCase):
out_json = self._get_scrub_status()
self.assertTrue("PAUSED" in out_json['status'])
# sleep enough to fetch updated task status
time.sleep(10)
self._check_task_status("paused")
checked = self._check_task_status("paused")
self.assertTrue(checked)
# abort and verify
self._abort_scrub(0)
@ -116,26 +136,37 @@ class TestScrubControls(CephFSTestCase):
self.assertTrue("PAUSED" in out_json['status'])
self.assertTrue("0 inodes" in out_json['status'])
# sleep enough to fetch updated task status
time.sleep(10)
self._check_task_status("paused")
# scrub status should still be paused...
checked = self._check_task_status("paused")
self.assertTrue(checked)
# resume and verify
self._resume_scrub(0)
out_json = self._get_scrub_status()
self.assertTrue("no active" in out_json['status'])
# sleep enough to fetch updated task status
time.sleep(10)
self._check_task_status("idle")
checked = self._check_task_status_na()
self.assertTrue(checked)
def test_scrub_task_status_on_mds_failover(self):
# sleep enough to fetch updated task status
time.sleep(10)
(original_active, ) = self.fs.get_active_names()
original_standbys = self.mds_cluster.get_standby_daemons()
self._check_task_status("idle")
test_dir = "scrub_control_test_path"
abs_test_path = "/{0}".format(test_dir)
self.create_scrub_data(test_dir)
out_json = self.fs.rank_tell(["scrub", "start", abs_test_path, "recursive"])
self.assertNotEqual(out_json, None)
# pause and verify
self._pause_scrub(0)
out_json = self._get_scrub_status()
self.assertTrue("PAUSED" in out_json['status'])
checked = self._check_task_status("paused")
self.assertTrue(checked)
# Kill the rank 0
self.fs.mds_stop(original_active)
@ -150,12 +181,7 @@ class TestScrubControls(CephFSTestCase):
original_standbys))
self.wait_until_true(promoted, timeout=grace*2)
mgr_beacon_grace = float(self.fs.get_config("mgr_service_beacon_grace", service_type="mon"))
def status_check():
task_status = self.fs.get_task_status("scrub status")
return original_active not in task_status
self.wait_until_true(status_check, timeout=mgr_beacon_grace*2)
self._check_task_status_na()
class TestScrubChecks(CephFSTestCase):
"""
View File
@ -562,6 +562,9 @@ vc.disconnect()
self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")])
self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")])
# A folder with non-ascii characters
self.mount_a.run_shell(["mkdir", os.path.join(mount_path, u"f\u00F6n")])
self._volume_client_python(self.mount_b, dedent("""
vp = VolumePath("{group_id}", u"{volume_id}")
vc.delete_volume(vp)
View File
@ -3649,6 +3649,48 @@ class TestVolumes(CephFSTestCase):
# verify trash dir is clean
self._wait_for_trash_empty()
def test_subvolume_snapshot_clone_retain_suid_guid(self):
subvolume = self._generate_random_subvolume_name()
snapshot = self._generate_random_snapshot_name()
clone = self._generate_random_clone_name()
# create subvolume
self._fs_cmd("subvolume", "create", self.volname, subvolume)
# Create a file with suid, guid bits set along with executable bit.
args = ["subvolume", "getpath", self.volname, subvolume]
args = tuple(args)
subvolpath = self._fs_cmd(*args)
self.assertNotEqual(subvolpath, None)
subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline
file_path = subvolpath
file_path = os.path.join(subvolpath, "test_suid_file")
self.mount_a.run_shell(["touch", file_path])
self.mount_a.run_shell(["chmod", "u+sx,g+sx", file_path])
# snapshot subvolume
self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
# schedule a clone
self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
# check clone status
self._wait_for_clone_to_complete(clone)
# verify clone
self._verify_clone(subvolume, snapshot, clone)
# remove snapshot
self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
# remove subvolumes
self._fs_cmd("subvolume", "rm", self.volname, subvolume)
self._fs_cmd("subvolume", "rm", self.volname, clone)
# verify trash dir is clean
self._wait_for_trash_empty()
def test_subvolume_snapshot_reconf_max_concurrent_clones(self):
"""
Validate 'max_concurrent_clones' config option
View File
@ -107,6 +107,16 @@ class RgwApiCredentialsTest(RgwTestCase):
data['message'])
class RgwSiteTest(RgwTestCase):
AUTH_ROLES = ['rgw-manager']
def test_get_realms(self):
data = self._get('/api/rgw/site?query=realms')
self.assertStatus(200)
self.assertSchema(data, JList(str))
class RgwBucketTest(RgwTestCase):
AUTH_ROLES = ['rgw-manager']
View File
@ -166,8 +166,7 @@ class TestProgress(MgrTestCase):
# Wait for a progress event to pop up
self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
timeout=self.EVENT_CREATION_PERIOD*2,
period=1)
timeout=self.EVENT_CREATION_PERIOD*2)
ev = self._get_osd_in_out_events('out')[0]
log.info(json.dumps(ev, indent=1))
self.assertIn("Rebalancing after osd.0 marked out", ev['message'])
@ -182,13 +181,12 @@ class TestProgress(MgrTestCase):
# First Event should complete promptly
self.wait_until_true(lambda: self._is_complete(initial_event['id']),
timeout=self.EVENT_CREATION_PERIOD)
timeout=self.RECOVERY_PERIOD)
try:
# Wait for progress event marked in to pop up
self.wait_until_equal(lambda: self._osd_in_out_events_count('in'), 1,
timeout=self.EVENT_CREATION_PERIOD*2,
period=1)
timeout=self.EVENT_CREATION_PERIOD*2)
except RuntimeError as ex:
if not "Timed out after" in str(ex):
raise ex
@ -261,7 +259,7 @@ class TestProgress(MgrTestCase):
# Event should complete promptly
self.wait_until_true(lambda: self._is_complete(ev['id']),
timeout=self.EVENT_CREATION_PERIOD)
timeout=self.RECOVERY_PERIOD)
self.assertTrue(self._is_quiet())
def test_osd_came_back(self):
@ -274,10 +272,11 @@ class TestProgress(MgrTestCase):
ev1 = self._simulate_failure()
ev2 = self._simulate_back_in([0], ev1)
# Wait for progress event to ultimately complete
self.wait_until_true(lambda: self._is_complete(ev2['id']),
timeout=self.RECOVERY_PERIOD)
if ev2 is not None:
# Wait for progress event to ultimately complete
self.wait_until_true(lambda: self._is_complete(ev2['id']),
timeout=self.RECOVERY_PERIOD)
self.assertTrue(self._is_quiet())
@ -364,8 +363,8 @@ class TestProgress(MgrTestCase):
'osd', 'out', '0')
# Wait for a progress event to pop up
self.wait_until_equal(lambda: len(self._all_events()), 1,
timeout=self.EVENT_CREATION_PERIOD*2)
self.wait_until_equal(lambda: self._osd_in_out_events_count('out'), 1,
timeout=self.RECOVERY_PERIOD)
ev = self._all_events()[0]
View File
@ -14,6 +14,8 @@
#!/usr/bin/env bash
# mount a NFS share for storing logs
sed -i 's/archive.ubuntu.com/old-releases.ubuntu.com/' /etc/apt/sources.list
sed -i 's/security.ubuntu.com/old-releases.ubuntu.com/' /etc/apt/sources.list
apt-get update
apt-get -y install nfs-common
mkdir /mnt/log
View File
@ -298,6 +298,7 @@ class LocalRemote(object):
stderr=subprocess.PIPE,
stdin=subprocess.PIPE,
cwd=cwd,
env=env,
shell=True)
else:
# Sanity check that we've got a list of strings
@ -917,7 +918,9 @@ class LocalContext(object):
self.daemons.daemons[prefixed_type][svc_id] = LocalDaemon(svc_type, svc_id)
def __del__(self):
shutil.rmtree(self.teuthology_config['test_path'])
path = self.teuthology_config['test_path']
if path is not None:
shutil.rmtree(path)
def exec_test():
# Parse arguments
View File
@ -485,21 +485,148 @@ test_purge() {
echo "testing trash purge..."
remove_images
rbd trash ls | wc -l | grep 0
rbd trash purge
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd create $RBD_CREATE_ARGS --size 256 testimg2
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash ls | wc -l | grep 2
rbd trash purge
rbd trash ls | wc -l | grep 0
rbd create $RBD_CREATE_ARGS foo -s 1
rbd create $RBD_CREATE_ARGS bar -s 1
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd create $RBD_CREATE_ARGS --size 256 testimg2
rbd trash mv testimg1 --expires-at "1 hour"
rbd trash mv testimg2 --expires-at "3 hours"
rbd trash ls | wc -l | grep 2
rbd trash purge
rbd trash ls | wc -l | grep 2
rbd trash purge --expired-before "now + 2 hours"
rbd trash ls | wc -l | grep 1
rbd trash ls | grep testimg2
rbd trash purge --expired-before "now + 4 hours"
rbd trash ls | wc -l | grep 0
rbd trash mv foo --expires-at "10 sec"
rbd trash mv bar --expires-at "30 sec"
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd snap create testimg1@snap # pin testimg1
rbd create $RBD_CREATE_ARGS --size 256 testimg2
rbd create $RBD_CREATE_ARGS --size 256 testimg3
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash mv testimg3
rbd trash ls | wc -l | grep 3
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 1
rbd trash ls | grep testimg1
ID=$(rbd trash ls | awk '{ print $1 }')
rbd snap purge --image-id $ID
rbd trash purge
rbd trash ls | wc -l | grep 0
rbd trash purge --expired-before "now + 10 sec"
rbd trash ls | grep -v foo | wc -l | grep 1
rbd trash ls | grep bar
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd create $RBD_CREATE_ARGS --size 256 testimg2
rbd snap create testimg2@snap # pin testimg2
rbd create $RBD_CREATE_ARGS --size 256 testimg3
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash mv testimg3
rbd trash ls | wc -l | grep 3
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 1
rbd trash ls | grep testimg2
ID=$(rbd trash ls | awk '{ print $1 }')
rbd snap purge --image-id $ID
rbd trash purge
rbd trash ls | wc -l | grep 0
LAST_IMG=$(rbd trash ls | grep bar | awk '{print $1;}')
rbd trash rm $LAST_IMG --force --no-progress | grep -v '.' | wc -l | grep 0
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd create $RBD_CREATE_ARGS --size 256 testimg2
rbd create $RBD_CREATE_ARGS --size 256 testimg3
rbd snap create testimg3@snap # pin testimg3
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash mv testimg3
rbd trash ls | wc -l | grep 3
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 1
rbd trash ls | grep testimg3
ID=$(rbd trash ls | awk '{ print $1 }')
rbd snap purge --image-id $ID
rbd trash purge
rbd trash ls | wc -l | grep 0
# test purging a clone with a chain of parents
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd snap create testimg1@snap
rbd clone --rbd-default-clone-format=2 testimg1@snap testimg2
rbd snap rm testimg1@snap
rbd create $RBD_CREATE_ARGS --size 256 testimg3
rbd snap create testimg2@snap
rbd clone --rbd-default-clone-format=2 testimg2@snap testimg4
rbd clone --rbd-default-clone-format=2 testimg2@snap testimg5
rbd snap rm testimg2@snap
rbd snap create testimg4@snap
rbd clone --rbd-default-clone-format=2 testimg4@snap testimg6
rbd snap rm testimg4@snap
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash mv testimg3
rbd trash mv testimg4
rbd trash ls | wc -l | grep 4
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 3
rbd trash ls | grep testimg1
rbd trash ls | grep testimg2
rbd trash ls | grep testimg4
rbd trash mv testimg6
rbd trash ls | wc -l | grep 4
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 2
rbd trash ls | grep testimg1
rbd trash ls | grep testimg2
rbd trash mv testimg5
rbd trash ls | wc -l | grep 3
rbd trash purge
rbd trash ls | wc -l | grep 0
rbd create $RBD_CREATE_ARGS --size 256 testimg1
rbd snap create testimg1@snap
rbd clone --rbd-default-clone-format=2 testimg1@snap testimg2
rbd snap rm testimg1@snap
rbd create $RBD_CREATE_ARGS --size 256 testimg3
rbd snap create testimg3@snap # pin testimg3
rbd snap create testimg2@snap
rbd clone --rbd-default-clone-format=2 testimg2@snap testimg4
rbd clone --rbd-default-clone-format=2 testimg2@snap testimg5
rbd snap rm testimg2@snap
rbd snap create testimg4@snap
rbd clone --rbd-default-clone-format=2 testimg4@snap testimg6
rbd snap rm testimg4@snap
rbd trash mv testimg1
rbd trash mv testimg2
rbd trash mv testimg3
rbd trash mv testimg4
rbd trash ls | wc -l | grep 4
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 4
rbd trash mv testimg6
rbd trash ls | wc -l | grep 5
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 3
rbd trash ls | grep testimg1
rbd trash ls | grep testimg2
rbd trash ls | grep testimg3
rbd trash mv testimg5
rbd trash ls | wc -l | grep 4
rbd trash purge 2>&1 | grep 'some expired images could not be removed'
rbd trash ls | wc -l | grep 1
rbd trash ls | grep testimg3
ID=$(rbd trash ls | awk '{ print $1 }')
rbd snap purge --image-id $ID
rbd trash purge
rbd trash ls | wc -l | grep 0
}
test_deep_copy_clone() {
View File
@ -1,12 +1,13 @@
#!/usr/bin/env bash
set -ex
# set -x
set -e
# if defined, debug messages will be displayed and prepended with the string
# debug="DEBUG"
huge_size=2222 # in megabytes
big_size=6 # in megabytes
huge_size=5100 # in megabytes
big_size=7 # in megabytes
huge_obj=/tmp/huge_obj.temp.$$
big_obj=/tmp/big_obj.temp.$$
@ -160,7 +161,6 @@ mys3uploadkill() {
exit 1
fi
set -v
local_file="$1"
remote_bkt="$2"
remote_obj="$3"
@ -229,8 +229,16 @@ mys3cmd ls s3://multipart-bkt
bkt="incomplete-mp-bkt-1"
mys3cmd mb s3://$bkt
mys3uploadkill $huge_obj $bkt incomplete-mp-obj-1 $fifo 20
mys3uploadkill $huge_obj $bkt incomplete-mp-obj-2 $fifo 100
mys3uploadkill $huge_obj $bkt incomplete-mp-obj-c $fifo 20
# generate an incomplete multipart with more than 1,000 parts
mys3uploadkill $huge_obj $bkt incomplete-mp-obj-b $fifo 1005
# generate more than 1000 incomplete multipart uploads
for c in $(seq 1005) ;do
mys3uploadkill $huge_obj $bkt incomplete-mp-obj-c-$c $fifo 3
done
####################################
# resharded bucket
View File
@ -22,6 +22,10 @@ source src/script/run-make.sh
set -e
function in_jenkins() {
test -n "$JENKINS_HOME"
}
function run() {
# to prevent OSD EMFILE death on tests, make sure ulimit >= 1024
$DRY_RUN ulimit -n $(ulimit -Hn)
@ -35,9 +39,16 @@ function run() {
$DRY_RUN sudo /sbin/sysctl -q -w fs.aio-max-nr=$((65536 * 16))
CHECK_MAKEOPTS=${CHECK_MAKEOPTS:-$DEFAULT_MAKEOPTS}
if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then
rm -fr ${TMPDIR:-/tmp}/ceph-asok.*
return 1
if in_jenkins; then
if ! ctest $CHECK_MAKEOPTS --no-compress-output --output-on-failure -T Test; then
# do not return failure, as the jenkins publisher will take care of this
rm -fr ${TMPDIR:-/tmp}/ceph-asok.*
fi
else
if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then
rm -fr ${TMPDIR:-/tmp}/ceph-asok.*
return 1
fi
fi
}
View File
@ -1,2 +1,2 @@
36274af6eb7f2a5055f2d53ad448f2694e9046a0
v14.2.20
ca74598065096e6fcbd8433c8779a2be0c889351
v14.2.22
View File
@ -401,7 +401,7 @@ target_link_libraries(ceph-common ${ceph_common_deps})
# appease dpkg-shlibdeps
set_target_properties(ceph-common PROPERTIES
SOVERSION 0
INSTALL_RPATH "")
SKIP_RPATH TRUE)
if(NOT APPLE AND NOT FREEBSD)
# Apple uses Mach-O, not ELF. so this option does not apply to APPLE.
#
@ -692,6 +692,12 @@ if(WITH_RBD)
add_subdirectory(rbd_replay)
endif(WITH_RBD)
if(WITH_BOOST_CONTEXT)
set(SPAWN_BUILD_TESTS OFF CACHE INTERNAL "disable building of spawn unit tests")
set(SPAWN_INSTALL OFF CACHE INTERNAL "disable installation of spawn headers")
add_subdirectory(spawn)
endif()
# RadosGW
if(WITH_KVS)
add_subdirectory(key_value_store)
View File
@ -1134,3 +1134,15 @@ def get_device_lvs(device, name_prefix=''):
lvs = _output_parser(stdout, LV_FIELDS)
return [Volume(**lv) for lv in lvs if lv['lv_name'] and
lv['lv_name'].startswith(name_prefix)]
def get_lv_by_fullname(full_name):
"""
returns LV by the specified LV's full name (formatted as vg_name/lv_name)
"""
try:
vg_name, lv_name = full_name.split('/')
res_lv = get_first_lv(filters={'lv_name': lv_name,
'vg_name': vg_name})
except ValueError:
res_lv = None
return res_lv
View File
@ -106,7 +106,7 @@ def get_physical_fast_allocs(devices, type_, fast_slots_per_device, new_osds, ar
requested_slots = fast_slots_per_device
requested_size = getattr(args, '{}_size'.format(type_), 0)
if requested_size == 0:
if not requested_size or requested_size == 0:
# no size argument was specified, check ceph.conf
get_size_fct = getattr(prepare, 'get_{}_size'.format(type_))
requested_size = get_size_fct(lv_format=False)
@ -126,6 +126,7 @@ def get_physical_fast_allocs(devices, type_, fast_slots_per_device, new_osds, ar
if requested_size:
if requested_size <= abs_size:
abs_size = requested_size
relative_size = int(abs_size) / dev_size
else:
mlogger.error(
'{} was requested for {}, but only {} can be fulfilled'.format(
View File
@ -9,6 +9,7 @@ from . import trigger
from . import listing
from . import zap
from . import batch
from . import migrate
class LVM(object):
@ -30,6 +31,9 @@ class LVM(object):
'trigger': trigger.Trigger,
'list': listing.List,
'zap': zap.Zap,
'migrate': migrate.Migrate,
'new-wal': migrate.NewWAL,
'new-db': migrate.NewDB,
}
def __init__(self, argv):
View File
@ -0,0 +1,674 @@
from __future__ import print_function
import argparse
import logging
import os
from textwrap import dedent
from ceph_volume.util import system, disk, merge_dict
from ceph_volume.util.device import Device
from ceph_volume import decorators, terminal, process
from ceph_volume.api import lvm as api
from ceph_volume.systemd import systemctl
logger = logging.getLogger(__name__)
mlogger = terminal.MultiLogger(__name__)
def get_cluster_name(osd_id, osd_fsid):
"""
From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the
system that match those tag values, then return cluster_name for the first
one.
"""
lv_tags = {}
lv_tags['ceph.osd_id'] = osd_id
lv_tags['ceph.osd_fsid'] = osd_fsid
lvs = api.get_lvs(tags=lv_tags)
if not lvs:
mlogger.error(
'Unable to find any LV for source OSD: id:{} fsid:{}'.format(
osd_id, osd_fsid) )
raise SystemExit('Unexpected error, terminating')
return next(iter(lvs)).tags["ceph.cluster_name"]
def get_osd_path(osd_id, osd_fsid):
return '/var/lib/ceph/osd/{}-{}'.format(
get_cluster_name(osd_id, osd_fsid), osd_id)
def find_associated_devices(osd_id, osd_fsid):
"""
From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the
system that match those tag values, further detect if any partitions are
part of the OSD, and then return the set of LVs and partitions (if any).
"""
lv_tags = {}
lv_tags['ceph.osd_id'] = osd_id
lv_tags['ceph.osd_fsid'] = osd_fsid
lvs = api.get_lvs(tags=lv_tags)
if not lvs:
mlogger.error(
'Unable to find any LV for source OSD: id:{} fsid:{}'.format(
osd_id, osd_fsid) )
raise SystemExit('Unexpected error, terminating')
devices = set(ensure_associated_lvs(lvs, lv_tags))
return [(Device(path), type) for path, type in devices if path]
def ensure_associated_lvs(lvs, lv_tags):
"""
Go through each LV and ensure if backing devices (journal, wal, block)
are LVs or partitions, so that they can be accurately reported.
"""
# look for many LVs for each backing type, because it is possible to
# receive a filtering for osd.1, and have multiple failed deployments
# leaving many journals with osd.1 - usually, only a single LV will be
# returned
block_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'block'}))
db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'}))
wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'}))
backing_devices = [(block_lvs, 'block'), (db_lvs, 'db'),
(wal_lvs, 'wal')]
verified_devices = []
for lv in lvs:
# go through each lv and append it, otherwise query `blkid` to find
# a physical device. Do this for each type (journal,db,wal) regardless
# if they have been processed in the previous LV, so that bad devices
# with the same ID can be caught
for ceph_lvs, type in backing_devices:
if ceph_lvs:
verified_devices.extend([(l.lv_path, type) for l in ceph_lvs])
continue
# must be a disk partition, by querying blkid by the uuid we are
# ensuring that the device path is always correct
try:
device_uuid = lv.tags['ceph.{}_uuid'.format(type)]
except KeyError:
# Bluestore will not have ceph.journal_uuid, and Filestore
# will not have ceph.db_uuid
continue
osd_device = disk.get_device_from_partuuid(device_uuid)
if not osd_device:
# if the osd_device is not found by the partuuid, then it is
# not possible to ensure this device exists anymore, so skip it
continue
verified_devices.append((osd_device, type))
return verified_devices
class VolumeTagTracker(object):
def __init__(self, devices, target_lv):
self.target_lv = target_lv
self.data_device = self.db_device = self.wal_device = None
for device, type in devices:
if type == 'block':
self.data_device = device
elif type == 'db':
self.db_device = device
elif type == 'wal':
self.wal_device = device
if not self.data_device:
mlogger.error('Data device not found')
raise SystemExit(
"Unexpected error, terminating")
if not self.data_device.is_lv:
mlogger.error('Data device isn\'t LVM')
raise SystemExit(
"Unexpected error, terminating")
self.old_target_tags = self.target_lv.tags.copy()
self.old_data_tags = (
self.data_device.lv_api.tags.copy()
if self.data_device.is_lv else None)
self.old_db_tags = (
self.db_device.lv_api.tags.copy()
if self.db_device and self.db_device.is_lv else None)
self.old_wal_tags = (
self.wal_device.lv_api.tags.copy()
if self.wal_device and self.wal_device.is_lv else None)
def update_tags_when_lv_create(self, create_type):
tags = {}
if not self.data_device.is_lv:
mlogger.warning(
'Data device is not LVM, wouldn\'t update LVM tags')
else:
tags["ceph.{}_uuid".format(create_type)] = self.target_lv.lv_uuid
tags["ceph.{}_device".format(create_type)] = self.target_lv.lv_path
self.data_device.lv_api.set_tags(tags)
tags = self.data_device.lv_api.tags.copy()
tags["ceph.type"] = create_type
self.target_lv.set_tags(tags)
aux_dev = None
if create_type == "db" and self.wal_device:
aux_dev = self.wal_device
elif create_type == "wal" and self.db_device:
aux_dev = self.db_device
else:
return
if not aux_dev.is_lv:
mlogger.warning(
'{} device is not LVM, wouldn\'t update LVM tags'.format(
create_type.upper()))
else:
tags = {}
tags["ceph.{}_uuid".format(create_type)] = self.target_lv.lv_uuid
tags["ceph.{}_device".format(create_type)] = self.target_lv.lv_path
aux_dev.lv_api.set_tags(tags)
def remove_lvs(self, source_devices, target_type):
remaining_devices = [self.data_device, self.db_device, self.wal_device]
outdated_tags = []
for device, type in source_devices:
if type == "block" or type == target_type:
continue
remaining_devices.remove(device)
if device.is_lv:
outdated_tags.append("ceph.{}_uuid".format(type))
outdated_tags.append("ceph.{}_device".format(type))
device.lv_api.clear_tags()
if len(outdated_tags) > 0:
for d in remaining_devices:
if d and d.is_lv:
d.lv_api.clear_tags(outdated_tags)
def replace_lvs(self, source_devices, target_type):
remaining_devices = [self.data_device]
if self.db_device:
remaining_devices.append(self.db_device)
if self.wal_device:
remaining_devices.append(self.wal_device)
outdated_tags = []
for device, type in source_devices:
if type == "block":
continue
remaining_devices.remove(device)
if device.is_lv:
outdated_tags.append("ceph.{}_uuid".format(type))
outdated_tags.append("ceph.{}_device".format(type))
device.lv_api.clear_tags()
new_tags = {}
new_tags["ceph.{}_uuid".format(target_type)] = self.target_lv.lv_uuid
new_tags["ceph.{}_device".format(target_type)] = self.target_lv.lv_path
for d in remaining_devices:
if d and d.is_lv:
if len(outdated_tags) > 0:
d.lv_api.clear_tags(outdated_tags)
d.lv_api.set_tags(new_tags)
if not self.data_device.is_lv:
mlogger.warning(
'Data device is not LVM, wouldn\'t properly update target LVM tags')
else:
tags = self.data_device.lv_api.tags.copy()
tags["ceph.type"] = target_type
tags["ceph.{}_uuid".format(target_type)] = self.target_lv.lv_uuid
tags["ceph.{}_device".format(target_type)] = self.target_lv.lv_path
self.target_lv.set_tags(tags)
def undo(self):
mlogger.info(
'Undoing lv tag set')
if self.data_device:
if self.old_data_tags:
self.data_device.lv_api.set_tags(self.old_data_tags)
else:
self.data_device.lv_api.clear_tags()
if self.db_device:
if self.old_db_tags:
self.db_device.lv_api.set_tags(self.old_db_tags)
else:
self.db_device.lv_api.clear_tags()
if self.wal_device:
if self.old_wal_tags:
self.wal_device.lv_api.set_tags(self.old_wal_tags)
else:
self.wal_device.lv_api.clear_tags()
if self.old_target_tags:
self.target_lv.set_tags(self.old_target_tags)
else:
self.target_lv.clear_tags()
class Migrate(object):
help = 'Migrate BlueFS data from source device(s) to another LVM device'
def __init__(self, argv):
self.argv = argv
self.osd_id = None
def get_source_devices(self, devices, target_type=""):
ret = []
for device, type in devices:
if type == target_type:
continue
if type == 'block':
if 'data' not in self.args.from_:
continue
elif type == 'db':
if 'db' not in self.args.from_:
continue
elif type == 'wal':
if 'wal' not in self.args.from_:
continue
ret.append([device, type])
if ret == []:
mlogger.error('Source device list is empty')
raise SystemExit(
'Unable to migrate to : {}'.format(self.args.target))
return ret
# ceph-bluestore-tool uses the following replacement rules
# (in the order of precedence, stop on the first match)
# if source list has DB volume - target device replaces it.
# if source list has WAL volume - target device replaces it.
# if source list has slow volume only - operation isn't permitted,
# requires explicit allocation via new-db/new-wal command.
def get_target_type_by_source(self, devices):
ret = None
for device, type in devices:
if type == 'db':
return 'db'
elif type == 'wal':
ret = 'wal'
return ret
def get_filename_by_type(self, type):
filename = 'block'
if type == 'db' or type == 'wal':
filename += '.' + type
return filename
def get_source_args(self, osd_path, devices):
ret = []
for device, type in devices:
ret = ret + ["--devs-source", os.path.join(
osd_path, self.get_filename_by_type(type))]
return ret
@decorators.needs_root
def migrate_to_new(self, osd_id, osd_fsid, devices, target_lv):
source_devices = self.get_source_devices(devices)
target_type = self.get_target_type_by_source(source_devices)
if not target_type:
mlogger.error(
"Unable to determine new volume type,"
" please use new-db or new-wal command before.")
raise SystemExit(
"Unable to migrate to : {}".format(self.args.target))
target_path = target_lv.lv_path
try:
tag_tracker = VolumeTagTracker(devices, target_lv)
# we need to update lvm tags for all the remaining volumes
# and clear for ones which to be removed
# ceph-bluestore-tool removes source volume(s) other than block one
# and attaches target one after successful migration
tag_tracker.replace_lvs(source_devices, target_type)
osd_path = get_osd_path(osd_id, osd_fsid)
source_args = self.get_source_args(osd_path, source_devices)
mlogger.info("Migrate to new, Source: {} Target: {}".format(
source_args, target_path))
stdout, stderr, exit_code = process.call([
'ceph-bluestore-tool',
'--path',
osd_path,
'--dev-target',
target_path,
'--command',
'bluefs-bdev-migrate'] +
source_args)
if exit_code != 0:
mlogger.error(
'Failed to migrate device, error code:{}'.format(exit_code))
raise SystemExit(
'Failed to migrate to : {}'.format(self.args.target))
else:
system.chown(os.path.join(osd_path, "block.{}".format(
target_type)))
terminal.success('Migration successful.')
except:
tag_tracker.undo()
raise
return
@decorators.needs_root
def migrate_to_existing(self, osd_id, osd_fsid, devices, target_lv):
target_type = target_lv.tags["ceph.type"]
if target_type == "wal":
mlogger.error("Migrate to WAL is not supported")
raise SystemExit(
"Unable to migrate to : {}".format(self.args.target))
target_filename = self.get_filename_by_type(target_type)
if (target_filename == ""):
mlogger.error(
"Target Logical Volume doesn't have proper volume type "
"(ceph.type LVM tag): {}".format(target_type))
raise SystemExit(
"Unable to migrate to : {}".format(self.args.target))
osd_path = get_osd_path(osd_id, osd_fsid)
source_devices = self.get_source_devices(devices, target_type)
target_path = os.path.join(osd_path, target_filename)
tag_tracker = VolumeTagTracker(devices, target_lv)
try:
# ceph-bluestore-tool removes source volume(s) other than
# block and target ones after successful migration
tag_tracker.remove_lvs(source_devices, target_type)
source_args = self.get_source_args(osd_path, source_devices)
mlogger.info("Migrate to existing, Source: {} Target: {}".format(
source_args, target_path))
stdout, stderr, exit_code = process.call([
'ceph-bluestore-tool',
'--path',
osd_path,
'--dev-target',
target_path,
'--command',
'bluefs-bdev-migrate'] +
source_args)
if exit_code != 0:
mlogger.error(
'Failed to migrate device, error code:{}'.format(exit_code))
raise SystemExit(
'Failed to migrate to : {}'.format(self.args.target))
else:
terminal.success('Migration successful.')
except:
tag_tracker.undo()
raise
return
@decorators.needs_root
def migrate_osd(self):
if self.args.osd_id:
osd_is_running = systemctl.osd_is_active(self.args.osd_id)
if osd_is_running:
mlogger.error('OSD is running, stop it with: '
'systemctl stop ceph-osd@{}'.format(
self.args.osd_id))
raise SystemExit(
'Unable to migrate devices associated with OSD ID: {}'
.format(self.args.osd_id))
target_lv = api.get_lv_by_fullname(self.args.target)
if not target_lv:
mlogger.error(
'Target path "{}" is not a Logical Volume'.format(
self.args.target))
raise SystemExit(
'Unable to migrate to : {}'.format(self.args.target))
devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
if (not target_lv.used_by_ceph):
self.migrate_to_new(self.args.osd_id, self.args.osd_fsid,
devices,
target_lv)
else:
if (target_lv.tags['ceph.osd_id'] != self.args.osd_id or
target_lv.tags['ceph.osd_fsid'] != self.args.osd_fsid):
mlogger.error(
'Target Logical Volume isn\'t used by the specified OSD: '
'{} FSID: {}'.format(self.args.osd_id,
self.args.osd_fsid))
raise SystemExit(
'Unable to migrate to : {}'.format(self.args.target))
self.migrate_to_existing(self.args.osd_id, self.args.osd_fsid,
devices,
target_lv)
def parse_argv(self):
sub_command_help = dedent("""
Moves BlueFS data from the source volume(s) to the target one; source
volumes (except the main, i.e. data or block, one) are removed on
success. Only LVM volumes are permitted as the target, either one that
is already attached or a new logical volume. In the latter case it is
attached to the OSD, replacing one of the source devices. The following
replacement rules apply (in the order of precedence, stop on the first
match):
* if source list has DB volume - target device replaces it.
* if source list has WAL volume - target device replaces it.
* if source list has slow volume only - operation is not permitted,
requires explicit allocation via new-db/new-wal command.
Example calls for supported scenarios:
Moves BlueFS data from main device to LV already attached as DB:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from data --target vgname/db
Moves BlueFS data from shared main device to LV which will be attached
as a new DB:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from data --target vgname/new_db
Moves BlueFS data from DB device to new LV, DB is replaced:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from db --target vgname/new_db
Moves BlueFS data from main and DB devices to new LV, DB is replaced:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from data db --target vgname/new_db
Moves BlueFS data from main, DB and WAL devices to new LV, WAL is
removed and DB is replaced:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from data db wal --target vgname/new_db
Moves BlueFS data from main, DB and WAL devices to main device, WAL
and DB are removed:
ceph-volume lvm migrate --osd-id 1 --osd-fsid <uuid> --from db wal --target vgname/data
""")
parser = argparse.ArgumentParser(
prog='ceph-volume lvm migrate',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=sub_command_help,
)
parser.add_argument(
'--osd-id',
required=True,
help='Specify an OSD ID to detect associated devices for migration',
)
parser.add_argument(
'--osd-fsid',
required=True,
help='Specify an OSD FSID to detect associated devices for migration',
)
parser.add_argument(
'--target',
required=True,
help='Specify target Logical Volume (LV) to migrate data to',
)
parser.add_argument(
'--from',
nargs='*',
dest='from_',
required=True,
choices=['data', 'db', 'wal'],
help='Copy BlueFS data from DB device',
)
if len(self.argv) == 0:
print(sub_command_help)
return
self.args = parser.parse_args(self.argv)
def main(self):
self.parse_argv()
self.migrate_osd()
class NewVolume(object):
def __init__(self, create_type, argv):
self.create_type = create_type
self.argv = argv
def make_parser(self, prog, sub_command_help):
parser = argparse.ArgumentParser(
prog=prog,
formatter_class=argparse.RawDescriptionHelpFormatter,
description=sub_command_help,
)
parser.add_argument(
'--osd-id',
required=True,
help='Specify an OSD ID to attach new volume to',
)
parser.add_argument(
'--osd-fsid',
required=True,
help='Specify an OSD FSID to attach new volume to',
)
parser.add_argument(
'--target',
required=True,
help='Specify target Logical Volume (LV) to attach',
)
return parser
@decorators.needs_root
def make_new_volume(self, osd_id, osd_fsid, devices, target_lv):
osd_path = get_osd_path(osd_id, osd_fsid)
mlogger.info(
'Making new volume at {} for OSD: {} ({})'.format(
target_lv.lv_path, osd_id, osd_path))
tag_tracker = VolumeTagTracker(devices, target_lv)
try:
tag_tracker.update_tags_when_lv_create(self.create_type)
stdout, stderr, exit_code = process.call([
'ceph-bluestore-tool',
'--path',
osd_path,
'--dev-target',
target_lv.lv_path,
'--command',
'bluefs-bdev-new-{}'.format(self.create_type)
])
if exit_code != 0:
mlogger.error(
'failed to attach new volume, error code:{}'.format(
exit_code))
raise SystemExit(
"Failed to attach new volume: {}".format(
self.args.target))
else:
system.chown(os.path.join(osd_path, "block.{}".format(
self.create_type)))
terminal.success('New volume attached.')
except:
tag_tracker.undo()
raise
return
@decorators.needs_root
def new_volume(self):
if self.args.osd_id:
osd_is_running = systemctl.osd_is_active(self.args.osd_id)
if osd_is_running:
mlogger.error('OSD is running, stop it with:'
' systemctl stop ceph-osd@{}'.format(self.args.osd_id))
raise SystemExit(
'Unable to attach new volume for OSD: {}'.format(
self.args.osd_id))
target_lv = api.get_lv_by_fullname(self.args.target)
if not target_lv:
mlogger.error(
'Target path {} is not a Logical Volume'.format(
self.args.target))
raise SystemExit(
'Unable to attach new volume : {}'.format(self.args.target))
if target_lv.used_by_ceph:
mlogger.error(
'Target Logical Volume is already used by ceph: {}'.format(
self.args.target))
raise SystemExit(
'Unable to attach new volume : {}'.format(self.args.target))
else:
devices = find_associated_devices(self.args.osd_id,
self.args.osd_fsid)
self.make_new_volume(
self.args.osd_id,
self.args.osd_fsid,
devices,
target_lv)
class NewWAL(NewVolume):
help = 'Allocate new WAL volume for OSD at specified Logical Volume'
def __init__(self, argv):
super(NewWAL, self).__init__("wal", argv)
def main(self):
sub_command_help = dedent("""
Attaches the given logical volume to the given OSD as a WAL volume.
Logical volume format is vg/lv. Fails if the OSD already has an attached WAL.
Example:
Attach vgname/lvname as a WAL volume to OSD 1
ceph-volume lvm new-wal --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_wal
""")
parser = self.make_parser('ceph-volume lvm new-wal', sub_command_help)
if len(self.argv) == 0:
print(sub_command_help)
return
self.args = parser.parse_args(self.argv)
self.new_volume()
class NewDB(NewVolume):
help = 'Allocate new DB volume for OSD at specified Logical Volume'
def __init__(self, argv):
super(NewDB, self).__init__("db", argv)
def main(self):
sub_command_help = dedent("""
Attaches the given logical volume to the given OSD as a DB volume.
Logical volume format is vg/lv. Fails if the OSD already has an attached DB.
Example:
Attach vgname/lvname as a DB volume to OSD 1
ceph-volume lvm new-db --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db
""")
parser = self.make_parser('ceph-volume lvm new-db', sub_command_help)
if len(self.argv) == 0:
print(sub_command_help)
return
self.args = parser.parse_args(self.argv)
self.new_volume()
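For orientation, the new-db/new-wal plumbing above ultimately reduces to a single ceph-bluestore-tool invocation; the sketch below is an assumption-laden illustration (the helper name and the use of subprocess are not from the source, which uses its own process wrapper and LV tag bookkeeping):
import subprocess

def attach_new_bluefs_device(osd_path, target_lv_path, create_type):
    # create_type is 'db' or 'wal', matching the new-db/new-wal subcommand
    cmd = [
        'ceph-bluestore-tool',
        '--path', osd_path,
        '--dev-target', target_lv_path,
        '--command', 'bluefs-bdev-new-{}'.format(create_type),
    ]
    # a non-zero exit code means the attach failed and any LV tag changes
    # made beforehand have to be rolled back by the caller
    return subprocess.call(cmd)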

File diff suppressed because it is too large

View File

@ -24,7 +24,7 @@ def get_partuuid(device):
device
"""
out, err, rc = process.call(
['blkid', '-s', 'PARTUUID', '-o', 'value', device]
['blkid', '-c', '/dev/null', '-s', 'PARTUUID', '-o', 'value', device]
)
return ' '.join(out).strip()
@ -98,7 +98,7 @@ def blkid(device):
PART_ENTRY_UUID PARTUUID
"""
out, err, rc = process.call(
['blkid', '-p', device]
['blkid', '-c', '/dev/null', '-p', device]
)
return _blkid_parser(' '.join(out))
@ -110,7 +110,7 @@ def get_part_entry_type(device):
used for udev rules, but it is useful in this case as it is the only
consistent way to retrieve the GUID used by ceph-disk to identify devices.
"""
out, err, rc = process.call(['blkid', '-p', '-o', 'udev', device])
out, err, rc = process.call(['blkid', '-c', '/dev/null', '-p', '-o', 'udev', device])
for line in out:
if 'ID_PART_ENTRY_TYPE=' in line:
return line.split('=')[-1].strip()
@ -123,7 +123,7 @@ def get_device_from_partuuid(partuuid):
device is
"""
out, err, rc = process.call(
['blkid', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
['blkid', '-c', '/dev/null', '-t', 'PARTUUID="%s"' % partuuid, '-o', 'device']
)
return ' '.join(out).strip()
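The common thread in the hunks above is that every blkid call gains '-c /dev/null', which disables the blkid cache file so the device is probed directly rather than answered from a possibly stale cache. A minimal standalone sketch of the same pattern (using subprocess here is an assumption; ceph-volume goes through its own process helper):
import subprocess

def get_partuuid(device):
    # '-c /dev/null' bypasses the blkid cache so stale entries cannot be
    # returned for a device whose partition table has changed
    out = subprocess.check_output(
        ['blkid', '-c', '/dev/null', '-s', 'PARTUUID', '-o', 'value', device])
    return out.decode().strip()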

View File

@ -1213,18 +1213,6 @@ def main():
errno.errorcode.get(ret, 'Unknown'), outs),
file=sys.stderr)
if ret < 0:
ret = -ret
errstr = errno.errorcode.get(ret, 'Unknown')
print(u'Error {0}: {1}'.format(errstr, outs), file=sys.stderr)
if len(targets) > 1:
final_ret = ret
else:
return ret
if outs:
print(prefix + outs, file=sys.stderr)
sys.stdout.flush()
if parsed_args.output_file:
@ -1250,12 +1238,23 @@ def main():
except IOError as e:
if e.errno != errno.EPIPE:
raise e
final_e = None
try:
sys.stdout.flush()
except IOError as e:
if e.errno != errno.EPIPE:
raise e
final_e = e
if ret < 0:
ret = -ret
errstr = errno.errorcode.get(ret, 'Unknown')
print(u'Error {0}: {1}'.format(errstr, outs), file=sys.stderr)
final_ret = ret
elif outs:
print(prefix + outs, file=sys.stderr)
if final_e:
raise final_e
# Block until command completion (currently scrub and deep_scrub only)
if block:

View File

@ -109,6 +109,14 @@ int obtain_monmap(MonitorDBStore &store, bufferlist &bl)
}
}
if (store.exists("mon_sync", "temp_newer_monmap")) {
dout(10) << __func__ << " found temp_newer_monmap" << dendl;
int err = store.get("mon_sync", "temp_newer_monmap", bl);
ceph_assert(err == 0);
ceph_assert(bl.length() > 0);
return 0;
}
if (store.exists("mkfs", "monmap")) {
dout(10) << __func__ << " found mkfs monmap" << dendl;
int err = store.get("mkfs", "monmap", bl);

View File

@ -125,12 +125,24 @@
#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
#ifndef S_IXUGO
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
#endif
void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
Client *client = static_cast<Client*>(p);
client->flush_set_callback(oset);
}
bool Client::is_reserved_vino(vinodeno_t &vino) {
if (MDS_IS_PRIVATE_INO(vino.ino)) {
ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
return true;
}
return false;
}
// -------------
@ -3192,7 +3204,7 @@ void Client::put_cap_ref(Inode *in, int cap)
int put_nref = 0;
int drop = last & ~in->caps_issued();
if (in->snapid == CEPH_NOSNAP) {
if ((last & CEPH_CAP_FILE_WR) &&
if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
!in->cap_snaps.empty() &&
in->cap_snaps.rbegin()->second.writing) {
ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
@ -3661,9 +3673,9 @@ void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
capsnap.context = old_snapc;
capsnap.issued = in->caps_issued();
capsnap.dirty = in->caps_dirty();
capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
capsnap.uid = in->uid;
capsnap.gid = in->gid;
capsnap.mode = in->mode;
@ -3672,7 +3684,7 @@ void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
capsnap.xattr_version = in->xattr_version;
capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
if (used & CEPH_CAP_FILE_WR) {
ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
capsnap.writing = 1;
@ -3707,6 +3719,7 @@ void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
}
if (used & CEPH_CAP_FILE_BUFFER) {
capsnap.writing = 1;
ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
<< " WRBUFFER, delaying" << dendl;
} else {
@ -3715,13 +3728,6 @@ void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
}
}
void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
{
ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
in->cap_snaps.at(seq).dirty_data = 0;
flush_snaps(in);
}
void Client::send_flush_snap(Inode *in, MetaSession *session,
snapid_t follows, CapSnap& capsnap)
{
@ -3789,7 +3795,7 @@ void Client::flush_snaps(Inode *in)
<< " on " << *in << dendl;
if (capsnap.dirty_data || capsnap.writing)
break;
capsnap.flush_tid = ++last_flush_tid;
session->flushing_caps_tids.insert(capsnap.flush_tid);
in->flushing_cap_tids[capsnap.flush_tid] = 0;
@ -4336,7 +4342,7 @@ void Client::trim_caps(MetaSession *s, uint64_t max)
++q;
if (dn->lru_is_expireable()) {
if (can_invalidate_dentries &&
dn->dir->parent_inode->ino == MDS_INO_ROOT) {
dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
// Only issue one of these per DN for inodes in root: handle
// others more efficiently by calling for root-child DNs at
// the end of this function.
@ -4349,10 +4355,10 @@ void Client::trim_caps(MetaSession *s, uint64_t max)
all = false;
}
}
if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
_schedule_ino_release_callback(in.get());
}
if (all && in->ino != MDS_INO_ROOT) {
if (all && in->ino != CEPH_INO_ROOT) {
ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
trimmed++;
}
@ -4738,25 +4744,19 @@ void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool
ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
<< " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
}
if (!first_realm)
first_realm = realm;
else
put_snap_realm(realm);
}
for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
q != dirty_realms.end();
++q) {
SnapRealm *realm = q->first;
for (auto &[realm, snapc] : dirty_realms) {
// if there are new snaps ?
if (has_new_snaps(q->second, realm->get_snap_context())) {
if (has_new_snaps(snapc, realm->get_snap_context())) {
ldout(cct, 10) << " flushing caps on " << *realm << dendl;
xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
while (!r.end()) {
Inode *in = *r;
++r;
queue_cap_snap(in, q->second);
for (auto&& in : realm->inodes_with_caps) {
queue_cap_snap(in, snapc);
}
} else {
ldout(cct, 10) << " no new snap on " << *realm << dendl;
@ -5383,8 +5383,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const M
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
if (perms.uid() == 0)
if (perms.uid() == 0) {
// Root overrides permission checks, but exec still requires at least one exec bit set
if((want & MAY_EXEC) && !(in->mode & S_IXUGO))
return -EACCES;
return 0;
}
if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
int ret = _posix_acl_permission(in, perms, want);
@ -8659,33 +8663,44 @@ int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
* the resulting Inode object in one operation, so that caller
* can safely assume inode will still be there after return.
*/
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
if (unmounting)
return -ENOTCONN;
if (is_reserved_vino(vino))
return -ESTALE;
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
filepath path(ino);
filepath path(vino.ino);
req->set_filepath(path);
/*
* The MDS expects either a "real" snapid here or 0. The special value
* carveouts for the snapid are all at the end of the range so we can
* just look for any snapid below this value.
*/
if (vino.snapid < CEPH_NOSNAP)
req->head.args.lookupino.snapid = vino.snapid;
int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
if (r == 0 && inode != NULL) {
vinodeno_t vino(ino, CEPH_NOSNAP);
unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
ceph_assert(p != inode_map.end());
*inode = p->second;
_ll_get(*inode);
}
ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
return r;
}
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
vinodeno_t vino(ino, CEPH_NOSNAP);
std::lock_guard lock(client_lock);
return _lookup_ino(ino, perms, inode);
return _lookup_vino(vino, perms, inode);
}
/**
@ -9055,8 +9070,15 @@ void Client::lock_fh_pos(Fh *f)
void Client::unlock_fh_pos(Fh *f)
{
ceph_assert(client_lock.is_locked_by_me());
ldout(cct, 10) << __func__ << " " << f << dendl;
f->pos_locked = false;
if (!f->pos_waiters.empty()) {
// only wake up the oldest waiter
auto cond = f->pos_waiters.front();
cond->SignalOne();
}
}
int Client::uninline_data(Inode *in, Context *onfinish)
@ -10805,56 +10827,59 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
return r;
}
int Client::ll_lookup_vino(
vinodeno_t vino,
const UserPerm& perms,
Inode **inode)
{
ceph_assert(inode != NULL);
if (unmounting)
return -ENOTCONN;
if (is_reserved_vino(vino))
return -ESTALE;
std::lock_guard lock(client_lock);
ldout(cct, 3) << __func__ << vino << dendl;
// Check the cache first
unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
if (p != inode_map.end()) {
*inode = p->second;
_ll_get(*inode);
return 0;
}
uint64_t snapid = vino.snapid;
// for snapdir, find the non-snapped dir inode
if (snapid == CEPH_SNAPDIR)
vino.snapid = CEPH_NOSNAP;
int r = _lookup_vino(vino, perms, inode);
if (r)
return r;
ceph_assert(*inode != NULL);
if (snapid == CEPH_SNAPDIR) {
Inode *tmp = *inode;
// open the snapdir and put the inode ref
*inode = open_snapdir(tmp);
_ll_forget(tmp, 1);
_ll_get(*inode);
}
return 0;
}
int Client::ll_lookup_inode(
struct inodeno_t ino,
const UserPerm& perms,
Inode **inode)
{
ceph_assert(inode != NULL);
std::lock_guard lock(client_lock);
ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
if (unmounting)
return -ENOTCONN;
// Num1: get inode and *inode
int r = _lookup_ino(ino, perms, inode);
if (r)
return r;
ceph_assert(*inode != NULL);
if (!(*inode)->dentries.empty()) {
ldout(cct, 8) << __func__ << " dentry already present" << dendl;
return 0;
}
if ((*inode)->is_root()) {
ldout(cct, 8) << "ino is root, no parent" << dendl;
return 0;
}
// Num2: Request the parent inode, so that we can look up the name
Inode *parent;
r = _lookup_parent(*inode, perms, &parent);
if (r) {
_ll_forget(*inode, 1);
return r;
}
ceph_assert(parent != NULL);
// Num3: Finally, get the name (dentry) of the requested inode
r = _lookup_name(*inode, parent, perms);
if (r) {
// Unexpected error
_ll_forget(parent, 1);
_ll_forget(*inode, 1);
return r;
}
_ll_forget(parent, 1);
return 0;
vinodeno_t vino(ino, CEPH_NOSNAP);
return ll_lookup_vino(vino, perms, inode);
}
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
@ -11066,6 +11091,9 @@ Inode *Client::ll_get_inode(vinodeno_t vino)
if (unmounting)
return NULL;
if (is_reserved_vino(vino))
return NULL;
unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
if (p == inode_map.end())
return NULL;
@ -14262,6 +14290,10 @@ int Client::check_pool_perm(Inode *in, int need)
if (!cct->_conf->client_check_pool_perm)
return 0;
/* Only need to do this for regular files */
if (!in->is_file())
return 0;
int64_t pool_id = in->layout.pool_id;
std::string pool_ns = in->layout.pool_ns;
std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
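Stepping back from the Client.cc hunks above, the inode_permission() change is easy to mis-read: root still bypasses ordinary permission checks, but an exec request now succeeds only if at least one exec bit is set on the inode. A small Python sketch of that decision (illustrative only; the mode constants mirror the stat module):
import errno
import stat

def root_inode_permission(mode, want_exec):
    # root may read and write regardless of mode bits, but exec is only
    # granted when at least one of the user/group/other exec bits is set
    if want_exec and not (mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)):
        return -errno.EACCES
    return 0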

View File

@ -479,6 +479,7 @@ public:
int ll_lookup(Inode *parent, const char *name, struct stat *attr,
Inode **out, const UserPerm& perms);
int ll_lookup_inode(struct inodeno_t ino, const UserPerm& perms, Inode **inode);
int ll_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode);
int ll_lookupx(Inode *parent, const char *name, Inode **out,
struct ceph_statx *stx, unsigned want, unsigned flags,
const UserPerm& perms);
@ -664,7 +665,6 @@ public:
void wait_sync_caps(ceph_tid_t want);
void queue_cap_snap(Inode *in, SnapContext &old_snapc);
void finish_cap_snap(Inode *in, CapSnap &capsnap, int used);
void _flushed_cap_snap(Inode *in, snapid_t seq);
void _schedule_invalidate_dentry_callback(Dentry *dn, bool del);
void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
@ -1012,6 +1012,7 @@ private:
static const VXattr _common_vxattrs[];
bool is_reserved_vino(vinodeno_t &vino);
void fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off);
@ -1182,7 +1183,7 @@ private:
int _ll_getattr(Inode *in, int caps, const UserPerm& perms);
int _lookup_parent(Inode *in, const UserPerm& perms, Inode **parent=NULL);
int _lookup_name(Inode *in, Inode *parent, const UserPerm& perms);
int _lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode=NULL);
int _lookup_vino(vinodeno_t ino, const UserPerm& perms, Inode **inode=NULL);
bool _ll_forget(Inode *in, uint64_t count);

View File

@ -163,7 +163,7 @@ struct Inode {
version_t inline_version;
bufferlist inline_data;
bool is_root() const { return ino == MDS_INO_ROOT; }
bool is_root() const { return ino == CEPH_INO_ROOT; }
bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
bool is_file() const { return (mode & S_IFMT) == S_IFREG; }

View File

@ -86,6 +86,10 @@ if (WITH_RADOSGW)
otp/cls_otp_types.cc
)
add_library(cls_otp_client STATIC ${cls_otp_client_srcs})
if (WITH_BOOST_CONTEXT)
target_include_directories(cls_otp_client PRIVATE
$<TARGET_PROPERTY:spawn,INTERFACE_INCLUDE_DIRECTORIES>)
endif()
endif (WITH_RADOSGW)
# cls_refcount

View File

@ -35,6 +35,9 @@ static std::string bucket_index_prefixes[] = { "", /* special handling for the o
/* this must be the last index */
"9999_",};
static const std::string BI_PREFIX_END = string(1, BI_PREFIX_CHAR) +
bucket_index_prefixes[BI_BUCKET_LAST_INDEX];
static bool bi_is_objs_index(const string& s) {
return ((unsigned char)s[0] != BI_PREFIX_CHAR);
}
@ -2322,29 +2325,29 @@ static int rgw_bi_put_op(cls_method_context_t hctx, bufferlist *in, bufferlist *
return 0;
}
static int list_plain_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max,
list<rgw_cls_bi_entry> *entries, bool *pmore)
static int list_plain_entries(cls_method_context_t hctx,
const string& filter,
const string& start_after_key,
const string& end_key,
uint32_t max,
list<rgw_cls_bi_entry> *entries,
bool *end_key_reached,
bool *pmore)
{
string filter = name;
string start_key = marker;
string end_key; // stop listing at bi_log_prefix
bi_log_prefix(end_key);
int count = 0;
map<string, bufferlist> keys;
int ret = cls_cxx_map_get_vals(hctx, start_key, filter, max, &keys, pmore);
int ret = cls_cxx_map_get_vals(hctx, start_after_key, filter, max, &keys,
pmore);
if (ret < 0) {
return ret;
}
map<string, bufferlist>::iterator iter;
for (iter = keys.begin(); iter != keys.end(); ++iter) {
if (iter->first >= end_key) {
/* past the end of plain namespace */
if (pmore) {
*pmore = false;
}
*end_key_reached = false;
for (auto iter = keys.begin(); iter != keys.end(); ++iter) {
if (!end_key.empty() && iter->first >= end_key) {
*end_key_reached = true;
*pmore = true;
return count;
}
@ -2363,13 +2366,12 @@ static int list_plain_entries(cls_method_context_t hctx, const string& name, con
return -EIO;
}
CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__, escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
CLS_LOG(20, "%s(): entry.idx=%s e.key.name=%s", __func__,
escape_str(entry.idx).c_str(), escape_str(e.key.name).c_str());
if (!name.empty() && e.key.name != name) {
if (!filter.empty() && e.key.name != filter) {
/* we are skipping the rest of the entries */
if (pmore) {
*pmore = false;
}
*pmore = false;
return count;
}
@ -2378,12 +2380,54 @@ static int list_plain_entries(cls_method_context_t hctx, const string& name, con
if (count >= (int)max) {
return count;
}
start_key = entry.idx;
}
return count;
}
static int list_plain_entries(cls_method_context_t hctx,
const string& name,
const string& marker,
uint32_t max,
list<rgw_cls_bi_entry> *entries,
bool *pmore) {
string start_after_key = marker;
string end_key;
bi_log_prefix(end_key);
int r;
bool end_key_reached;
bool more;
if (start_after_key < end_key) {
// listing ascii plain namespace
int r = list_plain_entries(hctx, name, start_after_key, end_key, max,
entries, &end_key_reached, &more);
if (r < 0) {
return r;
}
if (r >= (int)max || !end_key_reached || !more) {
if (pmore) {
*pmore = more;
}
return r;
}
start_after_key = BI_PREFIX_END;
max = max - r;
}
// listing non-ascii plain namespace
r = list_plain_entries(hctx, name, start_after_key, {}, max, entries,
&end_key_reached, &more);
if (r < 0) {
return r;
}
if (pmore) {
*pmore = more;
}
return r;
}
static int list_instance_entries(cls_method_context_t hctx, const string& name, const string& marker, uint32_t max,
list<rgw_cls_bi_entry> *entries, bool *pmore)
{

View File

@ -5,8 +5,7 @@
#define CEPH_ASYNC_OP_TRACKER_H
#include "common/ceph_mutex.h"
struct Context;
#include "include/Context.h"
class AsyncOpTracker {
public:
@ -27,4 +26,23 @@ private:
};
class C_TrackedOp : public Context {
public:
C_TrackedOp(AsyncOpTracker& async_op_tracker, Context* on_finish)
: m_async_op_tracker(async_op_tracker), m_on_finish(on_finish) {
m_async_op_tracker.start_op();
}
void finish(int r) override {
if (m_on_finish != nullptr) {
m_on_finish->complete(r);
}
m_async_op_tracker.finish_op();
}
private:
AsyncOpTracker& m_async_op_tracker;
Context* m_on_finish;
};
#endif // CEPH_ASYNC_OP_TRACKER_H

View File

@ -168,7 +168,7 @@ elseif(HAVE_ARMV8_CRC)
crc32c_aarch64.c)
endif(HAVE_INTEL)
add_library(crc32 ${crc32_srcs})
add_library(crc32 STATIC ${crc32_srcs})
if(HAVE_ARMV8_CRC)
set_target_properties(crc32 PROPERTIES
COMPILE_FLAGS "${CMAKE_C_FLAGS} ${ARMV8_CRC_COMPILE_FLAGS}")

View File

@ -145,6 +145,14 @@ LogClientTemp::~LogClientTemp()
parent.do_log(type, ss);
}
void LogChannel::set_log_to_monitors(bool v)
{
if (log_to_monitors != v) {
parent->reset();
log_to_monitors = v;
}
}
void LogChannel::update_config(map<string,string> &log_to_monitors,
map<string,string> &log_to_syslog,
map<string,string> &log_channels,
@ -342,6 +350,15 @@ version_t LogClient::queue(LogEntry &entry)
return entry.seq;
}
void LogClient::reset()
{
std::lock_guard l(log_lock);
if (log_queue.size()) {
log_queue.clear();
}
last_log_sent = last_log;
}
uint64_t LogClient::get_next_seq()
{
std::lock_guard l(log_lock);

View File

@ -134,9 +134,7 @@ public:
do_log(CLOG_SEC, s);
}
void set_log_to_monitors(bool v) {
log_to_monitors = v;
}
void set_log_to_monitors(bool v);
void set_log_to_syslog(bool v) {
log_to_syslog = v;
}
@ -253,6 +251,7 @@ public:
const EntityName& get_myname();
entity_name_t get_myrank();
version_t queue(LogEntry &entry);
void reset();
private:
Message *_get_mon_log_message();

View File

@ -22,31 +22,28 @@
#ifndef HAVE_BOOST_CONTEXT
// hide the dependencies on boost::context and boost::coroutines
namespace boost::asio {
// hide the dependency on boost::context
namespace spawn {
struct yield_context;
}
#else // HAVE_BOOST_CONTEXT
#ifndef BOOST_COROUTINES_NO_DEPRECATION_WARNING
#define BOOST_COROUTINES_NO_DEPRECATION_WARNING
#endif
#include <boost/asio/spawn.hpp>
#include <spawn/spawn.hpp>
#endif // HAVE_BOOST_CONTEXT
/// optional-like wrapper for a boost::asio::yield_context and its associated
/// optional-like wrapper for a spawn::yield_context and its associated
/// boost::asio::io_context. operations that take an optional_yield argument
/// will, when passed a non-empty yield context, suspend this coroutine instead
/// of blocking the thread of execution
class optional_yield {
boost::asio::io_context *c = nullptr;
boost::asio::yield_context *y = nullptr;
spawn::yield_context *y = nullptr;
public:
/// construct with a valid io and yield_context
explicit optional_yield(boost::asio::io_context& c,
boost::asio::yield_context& y) noexcept
spawn::yield_context& y) noexcept
: c(&c), y(&y) {}
/// type tag to construct an empty object
@ -60,7 +57,7 @@ class optional_yield {
boost::asio::io_context& get_io_context() const noexcept { return *c; }
/// return a reference to the yield_context. only valid if non-empty
boost::asio::yield_context& get_yield_context() const noexcept { return *y; }
spawn::yield_context& get_yield_context() const noexcept { return *y; }
};
// type tag object to construct an empty optional_yield

View File

@ -108,8 +108,8 @@ static ceph::spinlock debug_lock;
static raw_combined *create(unsigned len,
unsigned align,
int mempool = mempool::mempool_buffer_anon) {
if (!align)
align = sizeof(size_t);
// posix_memalign() requires a multiple of sizeof(void *)
align = std::max<unsigned>(align, sizeof(void *));
size_t rawlen = round_up_to(sizeof(buffer::raw_combined),
alignof(buffer::raw_combined));
size_t datalen = round_up_to(len, alignof(buffer::raw_combined));
@ -169,8 +169,8 @@ static ceph::spinlock debug_lock;
MEMPOOL_CLASS_HELPERS();
raw_posix_aligned(unsigned l, unsigned _align) : raw(l) {
align = _align;
ceph_assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
// posix_memalign() requires a multiple of sizeof(void *)
align = std::max<unsigned>(_align, sizeof(void *));
#ifdef DARWIN
data = (char *) valloc(len);
#else
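The two buffer.cc hunks above share one idea: posix_memalign() only accepts alignments that are power-of-two multiples of sizeof(void *), so a zero or too-small requested alignment is bumped up before allocating. A trivial sketch of the rule (the pointer size of 8 is an assumption for LP64 platforms):
def effective_alignment(requested, pointer_size=8):
    # bump small or zero alignments up to at least sizeof(void *)
    return max(requested, pointer_size)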

View File

@ -651,9 +651,6 @@ int md_config_t::parse_argv(ConfigValues& values,
else if (ceph_argparse_flag(args, i, "--no-mon-config", (char*)NULL)) {
values.no_mon_config = true;
}
else if (ceph_argparse_flag(args, i, "--log-early", (char*)NULL)) {
values.log_early = true;
}
else if (ceph_argparse_flag(args, i, "--mon-config", (char*)NULL)) {
values.no_mon_config = false;
}

View File

@ -28,7 +28,6 @@ public:
string cluster;
ceph::logging::SubsystemMap subsys;
bool no_mon_config = false;
bool log_early = false;
// Set of configuration options that have changed since the last
// apply_changes
using changed_set_t = std::set<std::string>;

View File

@ -3,7 +3,6 @@
#include <ifaddrs.h>
#include <stdlib.h>
#include <string.h>
#include <boost/algorithm/string/predicate.hpp>
#if defined(__FreeBSD__)
#include <sys/types.h>
#include <sys/socket.h>
@ -29,54 +28,23 @@ void netmask_ipv4(const struct in_addr *addr,
out->s_addr = addr->s_addr & mask;
}
static bool match_numa_node(const string& if_name, int numa_node)
bool matches_ipv4_in_subnet(const struct ifaddrs& addrs,
const struct sockaddr_in* net,
unsigned int prefix_len)
{
#ifdef WITH_SEASTAR
return true;
#else
int if_node = -1;
int r = get_iface_numa_node(if_name, &if_node);
if (r < 0) {
if (addrs.ifa_addr == nullptr)
return false;
}
return if_node == numa_node;
#endif
}
const struct ifaddrs *find_ipv4_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr_in *net,
unsigned int prefix_len,
int numa_node) {
struct in_addr want, temp;
if (addrs.ifa_addr->sa_family != net->sin_family)
return false;
struct in_addr want;
netmask_ipv4(&net->sin_addr, prefix_len, &want);
for (; addrs != NULL; addrs = addrs->ifa_next) {
if (addrs->ifa_addr == NULL)
continue;
if (strcmp(addrs->ifa_name, "lo") == 0 || boost::starts_with(addrs->ifa_name, "lo:"))
continue;
if (numa_node >= 0 && !match_numa_node(addrs->ifa_name, numa_node))
continue;
if (addrs->ifa_addr->sa_family != net->sin_family)
continue;
struct in_addr *cur = &((struct sockaddr_in*)addrs->ifa_addr)->sin_addr;
netmask_ipv4(cur, prefix_len, &temp);
if (temp.s_addr == want.s_addr) {
return addrs;
}
}
return NULL;
struct in_addr *cur = &((struct sockaddr_in*)addrs.ifa_addr)->sin_addr;
struct in_addr temp;
netmask_ipv4(cur, prefix_len, &temp);
return temp.s_addr == want.s_addr;
}
void netmask_ipv6(const struct in6_addr *addr,
unsigned int prefix_len,
struct in6_addr *out) {
@ -90,59 +58,25 @@ void netmask_ipv6(const struct in6_addr *addr,
memset(out->s6_addr+prefix_len/8+1, 0, 16-prefix_len/8-1);
}
bool matches_ipv6_in_subnet(const struct ifaddrs& addrs,
const struct sockaddr_in6* net,
unsigned int prefix_len)
{
if (addrs.ifa_addr == nullptr)
return false;
const struct ifaddrs *find_ipv6_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr_in6 *net,
unsigned int prefix_len,
int numa_node) {
struct in6_addr want, temp;
if (addrs.ifa_addr->sa_family != net->sin6_family)
return false;
struct in6_addr want;
netmask_ipv6(&net->sin6_addr, prefix_len, &want);
for (; addrs != NULL; addrs = addrs->ifa_next) {
if (addrs->ifa_addr == NULL)
continue;
if (strcmp(addrs->ifa_name, "lo") == 0 || boost::starts_with(addrs->ifa_name, "lo:"))
continue;
if (numa_node >= 0 && !match_numa_node(addrs->ifa_name, numa_node))
continue;
if (addrs->ifa_addr->sa_family != net->sin6_family)
continue;
struct in6_addr *cur = &((struct sockaddr_in6*)addrs->ifa_addr)->sin6_addr;
if (IN6_IS_ADDR_LINKLOCAL(cur))
continue;
netmask_ipv6(cur, prefix_len, &temp);
if (IN6_ARE_ADDR_EQUAL(&temp, &want))
return addrs;
}
return NULL;
struct in6_addr temp;
struct in6_addr *cur = &((struct sockaddr_in6*)addrs.ifa_addr)->sin6_addr;
if (IN6_IS_ADDR_LINKLOCAL(cur))
return false;
netmask_ipv6(cur, prefix_len, &temp);
return IN6_ARE_ADDR_EQUAL(&temp, &want);
}
const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr *net,
unsigned int prefix_len,
int numa_node) {
switch (net->sa_family) {
case AF_INET:
return find_ipv4_in_subnet(addrs, (struct sockaddr_in*)net, prefix_len,
numa_node);
case AF_INET6:
return find_ipv6_in_subnet(addrs, (struct sockaddr_in6*)net, prefix_len,
numa_node);
}
return NULL;
}
bool parse_network(const char *s, struct sockaddr_storage *network, unsigned int *prefix_len) {
char *slash = strchr((char*)s, '/');
if (!slash) {

View File

@ -801,6 +801,7 @@ OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over thi
OPTION(osd_target_transaction_size, OPT_INT) // to adjust various transactions that batch smaller items
OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
OPTION(osd_fast_shutdown, OPT_BOOL)
OPTION(osd_fast_shutdown_notify_mon, OPT_BOOL) // tell mon the OSD is shutting down on osd_fast_shutdown
OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
OPTION(osd_pg_object_context_cache_count, OPT_INT)

View File

@ -2198,16 +2198,23 @@ std::vector<Option> get_global_options() {
.add_service("mon")
.set_description(""),
Option("paxos_service_trim_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("paxos_service_trim_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(250)
.add_service("mon")
.set_description(""),
Option("paxos_service_trim_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
Option("paxos_service_trim_max", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(500)
.add_service("mon")
.set_description(""),
Option("paxos_service_trim_max_multiplier", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(20)
.set_min(0)
.add_service("mon")
.set_description("factor by which paxos_service_trim_max will be multiplied to get a new upper bound when trim sizes are high (0 disables it)")
.set_flag(Option::FLAG_RUNTIME),
Option("paxos_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.add_service("mon")
@ -3931,6 +3938,12 @@ std::vector<Option> get_global_options() {
.set_description("Fast, immediate shutdown")
.set_long_description("Setting this to false makes the OSD do a slower teardown of all state when it receives a SIGINT or SIGTERM or when shutting down for any other reason. That slow shutdown is primarilyy useful for doing memory leak checking with valgrind."),
Option("osd_fast_shutdown_notify_mon", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description("Tell mon about OSD shutdown on immediate shutdown")
.set_long_description("Tell the monitor the OSD is shutting down on immediate shutdown. This helps with cluster log messages from other OSDs reporting it immediately failed.")
.add_see_also({"osd_fast_shutdown", "osd_mon_shutdown_timeout"}),
Option("osd_fast_fail_on_connection_refused", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description(""),
@ -4418,9 +4431,9 @@ std::vector<Option> get_global_options() {
.set_description(""),
Option("bluefs_buffered_io", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_default(true)
.set_description("Enabled buffered IO for bluefs reads.")
.set_long_description("When this option is enabled, bluefs will in some cases perform buffered reads. This allows the kernel page cache to act as a secondary cache for things like RocksDB compaction. For example, if the rocksdb block cache isn't large enough to hold blocks from the compressed SST files itself, they can be read from page cache instead of from the disk. This option previously was enabled by default, however in some test cases it appears to cause excessive swap utilization by the linux kernel and a large negative performance impact after several hours of run time. Please exercise caution when enabling."),
.set_long_description("When this option is enabled, bluefs will in some cases perform buffered reads. This allows the kernel page cache to act as a secondary cache for things like RocksDB compaction. For example, if the rocksdb block cache isn't large enough to hold blocks from the compressed SST files itself, they can be read from page cache instead of from the disk."),
Option("bluefs_sync_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
@ -4744,7 +4757,7 @@ std::vector<Option> get_global_options() {
.set_description("How frequently we trim the bluestore cache"),
Option("bluestore_cache_trim_max_skip_pinned", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(64)
.set_default(1000)
.set_description("Max pinned cache entries we consider before giving up"),
Option("bluestore_cache_type", Option::TYPE_STR, Option::LEVEL_DEV)

View File

@ -14,6 +14,7 @@
#include "common/pick_address.h"
#include "include/ipaddr.h"
#include "include/scope_guard.h"
#include "include/str_list.h"
#include "common/ceph_context.h"
#ifndef WITH_SEASTAR
@ -25,12 +26,111 @@
#include "common/numa.h"
#include <netdb.h>
#include <net/if.h>
#include <netinet/in.h>
#include <string>
#include <string.h>
#include <vector>
#define dout_subsys ceph_subsys_
namespace {
bool matches_with_name(const ifaddrs& ifa, const std::string& if_name)
{
return if_name.compare(ifa.ifa_name) == 0;
}
static int is_loopback_addr(sockaddr* addr)
{
if (addr->sa_family == AF_INET) {
const sockaddr_in* sin = (struct sockaddr_in *)(addr);
const in_addr_t net = ntohl(sin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT;
return net == IN_LOOPBACKNET ? 1 : 0;
} else if (addr->sa_family == AF_INET6) {
sockaddr_in6* sin6 = (struct sockaddr_in6 *)(addr);
return IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) ? 1 : 0;
} else {
return -1;
}
}
static int grade_addr(const ifaddrs& ifa)
{
if (ifa.ifa_addr == nullptr) {
return -1;
}
int score = 0;
if (ifa.ifa_flags & IFF_UP) {
score += 4;
}
switch (is_loopback_addr(ifa.ifa_addr)) {
case 0:
// prefer non-loopback addresses
score += 2;
break;
case 1:
score += 0;
break;
default:
score = -1;
break;
}
return score;
}
bool matches_with_net(const ifaddrs& ifa,
const sockaddr* net,
unsigned int prefix_len,
unsigned ipv)
{
switch (net->sa_family) {
case AF_INET:
if (ipv & CEPH_PICK_ADDRESS_IPV4) {
return matches_ipv4_in_subnet(ifa, (struct sockaddr_in*)net, prefix_len);
}
break;
case AF_INET6:
if (ipv & CEPH_PICK_ADDRESS_IPV6) {
return matches_ipv6_in_subnet(ifa, (struct sockaddr_in6*)net, prefix_len);
}
break;
}
return false;
}
bool matches_with_net(CephContext *cct,
const ifaddrs& ifa,
const std::string& s,
unsigned ipv)
{
struct sockaddr_storage net;
unsigned int prefix_len;
if (!parse_network(s.c_str(), &net, &prefix_len)) {
lderr(cct) << "unable to parse network: " << s << dendl;
exit(1);
}
return matches_with_net(ifa, (sockaddr*)&net, prefix_len, ipv);
}
int grade_with_numa_node(const ifaddrs& ifa, int numa_node)
{
#if defined(WITH_SEASTAR) || defined(_WIN32)
return 0;
#else
if (numa_node < 0) {
return 0;
}
int if_node = -1;
int r = get_iface_numa_node(ifa.ifa_name, &if_node);
if (r < 0) {
return 0;
}
return if_node == numa_node ? 1 : 0;
#endif
}
}
const struct sockaddr *find_ip_in_subnet_list(
CephContext *cct,
const struct ifaddrs *ifa,
@ -39,86 +139,41 @@ const struct sockaddr *find_ip_in_subnet_list(
const std::string &interfaces,
int numa_node)
{
std::list<string> nets;
get_str_list(networks, nets);
std::list<string> ifs;
get_str_list(interfaces, ifs);
// filter interfaces by name
const struct ifaddrs *filtered = nullptr;
if (ifs.empty()) {
filtered = ifa;
} else {
if (nets.empty()) {
const auto ifs = get_str_list(interfaces);
const auto nets = get_str_list(networks);
if (!ifs.empty() && nets.empty()) {
lderr(cct) << "interface names specified but not network names" << dendl;
exit(1);
}
const struct ifaddrs *t = ifa;
struct ifaddrs *head = 0;
while (t) {
bool match = false;
for (auto& i : ifs) {
if (strcmp(i.c_str(), t->ifa_name) == 0) {
match = true;
break;
}
}
if (match) {
struct ifaddrs *n = new ifaddrs;
memcpy(n, t, sizeof(*t));
n->ifa_next = head;
head = n;
}
t = t->ifa_next;
}
if (!head) {
lderr(cct) << "no interfaces matching " << ifs << dendl;
exit(1);
}
filtered = head;
}
struct sockaddr *r = nullptr;
for (auto& s : nets) {
struct sockaddr_storage net;
unsigned int prefix_len;
if (!parse_network(s.c_str(), &net, &prefix_len)) {
lderr(cct) << "unable to parse network: " << s << dendl;
exit(1);
int best_score = 0;
const sockaddr* best_addr = nullptr;
for (const auto* addr = ifa; addr != nullptr; addr = addr->ifa_next) {
if (!ifs.empty() &&
std::none_of(std::begin(ifs), std::end(ifs),
[&](const auto& if_name) {
return matches_with_name(*addr, if_name);
})) {
continue;
}
switch (net.ss_family) {
case AF_INET:
if (!(ipv & CEPH_PICK_ADDRESS_IPV4)) {
continue;
}
break;
case AF_INET6:
if (!(ipv & CEPH_PICK_ADDRESS_IPV6)) {
continue;
}
break;
if (!nets.empty() &&
std::none_of(std::begin(nets), std::end(nets),
[&](const auto& net) {
return matches_with_net(cct, *addr, net, ipv);
})) {
continue;
}
const struct ifaddrs *found = find_ip_in_subnet(
filtered,
(struct sockaddr *) &net, prefix_len, numa_node);
if (found) {
r = found->ifa_addr;
break;
int score = grade_addr(*addr);
if (score < 0) {
continue;
}
score += grade_with_numa_node(*addr, numa_node);
if (score > best_score) {
best_score = score;
best_addr = addr->ifa_addr;
}
}
if (filtered != ifa) {
while (filtered) {
struct ifaddrs *t = filtered->ifa_next;
delete filtered;
filtered = t;
}
}
return r;
return best_addr;
}
#ifndef WITH_SEASTAR
@ -141,8 +196,8 @@ struct Observer : public md_config_obs_t {
static void fill_in_one_address(CephContext *cct,
const struct ifaddrs *ifa,
const string networks,
const string interfaces,
const string &networks,
const string &interfaces,
const char *conf_var,
int numa_node = -1)
{
@ -187,8 +242,6 @@ static void fill_in_one_address(CephContext *cct,
void pick_addresses(CephContext *cct, int needs)
{
struct ifaddrs *ifa;
int r = getifaddrs(&ifa);
auto public_addr = cct->_conf.get_val<entity_addr_t>("public_addr");
auto public_network = cct->_conf.get_val<std::string>("public_network");
auto public_network_interface =
@ -198,33 +251,33 @@ void pick_addresses(CephContext *cct, int needs)
auto cluster_network_interface =
cct->_conf.get_val<std::string>("cluster_network_interface");
struct ifaddrs *ifa;
int r = getifaddrs(&ifa);
if (r < 0) {
string err = cpp_strerror(errno);
lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
exit(1);
}
auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
if ((needs & CEPH_PICK_ADDRESS_PUBLIC) &&
public_addr.is_blank_ip() && !public_network.empty()) {
fill_in_one_address(cct, ifa, public_network, public_network_interface,
"public_addr");
"public_addr");
}
if ((needs & CEPH_PICK_ADDRESS_CLUSTER) && cluster_addr.is_blank_ip()) {
if (!cluster_network.empty()) {
fill_in_one_address(cct, ifa, cluster_network, cluster_network_interface,
"cluster_addr");
"cluster_addr");
} else {
if (!public_network.empty()) {
lderr(cct) << "Public network was set, but cluster network was not set " << dendl;
lderr(cct) << " Using public network also for cluster network" << dendl;
fill_in_one_address(cct, ifa, public_network, public_network_interface,
"cluster_addr");
"cluster_addr");
}
}
}
freeifaddrs(ifa);
}
#endif // !WITH_SEASTAR
@ -232,13 +285,15 @@ static int fill_in_one_address(
CephContext *cct,
const struct ifaddrs *ifa,
unsigned ipv,
const string networks,
const string interfaces,
const string &networks,
const string &interfaces,
entity_addrvec_t *addrs,
int numa_node = -1)
{
const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, ipv, networks,
interfaces, numa_node);
const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, ipv,
networks,
interfaces,
numa_node);
if (!found) {
std::string ip_type = "";
if ((ipv & CEPH_PICK_ADDRESS_IPV4) && (ipv & CEPH_PICK_ADDRESS_IPV6)) {
@ -352,33 +407,29 @@ int pick_addresses(
!networks.empty()) {
int ipv4_r = !(ipv & CEPH_PICK_ADDRESS_IPV4) ? 0 : -1;
int ipv6_r = !(ipv & CEPH_PICK_ADDRESS_IPV6) ? 0 : -1;
// first try on preferred numa node (if >= 0), then anywhere.
while (true) {
// note: pass in ipv to filter the matching addresses
if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
networks, interfaces, addrs,
preferred_numa_node);
}
if (ipv & CEPH_PICK_ADDRESS_IPV6) {
ipv6_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV6,
networks, interfaces, addrs,
preferred_numa_node);
}
if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
!(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
networks, interfaces, addrs,
preferred_numa_node);
}
if (ipv4_r >= 0 && ipv6_r >= 0) {
break;
}
if (preferred_numa_node < 0) {
return ipv4_r >= 0 && ipv6_r >= 0 ? 0 : -1;
}
preferred_numa_node = -1; // try any numa node
// note: pass in ipv to filter the matching addresses
if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
networks, interfaces,
addrs,
preferred_numa_node);
}
if (ipv & CEPH_PICK_ADDRESS_IPV6) {
ipv6_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV6,
networks, interfaces,
addrs,
preferred_numa_node);
}
if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
!(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
networks, interfaces,
addrs,
preferred_numa_node);
}
if (ipv4_r < 0 || ipv6_r < 0) {
return -1;
}
}
@ -461,20 +512,15 @@ std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
return {};
}
const unsigned int prefix_len = max(sizeof(in_addr::s_addr), sizeof(in6_addr::s6_addr)) * CHAR_BIT;
const struct ifaddrs *found = find_ip_in_subnet(
ifa,
(const struct sockaddr *) &network, prefix_len);
std::string result;
if (found) {
result = found->ifa_name;
auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
const unsigned int prefix_len = std::max(sizeof(in_addr::s_addr), sizeof(in6_addr::s6_addr)) * CHAR_BIT;
for (auto addr = ifa; addr != nullptr; addr = addr->ifa_next) {
if (matches_with_net(*ifa, (const struct sockaddr *) &network, prefix_len,
CEPH_PICK_ADDRESS_IPV4 | CEPH_PICK_ADDRESS_IPV6)) {
return addr->ifa_name;
}
}
freeifaddrs(ifa);
return result;
return {};
}
@ -486,8 +532,8 @@ bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_add
lderr(cct) << "unable to fetch interfaces and addresses: " << cpp_strerror(errno) << dendl;
exit(1);
}
auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
bool found = false;
for (struct ifaddrs *addrs = ifa; addrs != nullptr; addrs = addrs->ifa_next) {
if (addrs->ifa_addr) {
entity_addr_t a;
@ -495,16 +541,12 @@ bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_add
for (auto& p : ls) {
if (a.is_same_host(p)) {
*match = p;
found = true;
goto out;
return true;
}
}
}
}
out:
freeifaddrs(ifa);
return found;
return false;
}
int get_iface_numa_node(

View File

@ -73,6 +73,20 @@ std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
*/
bool have_local_addr(CephContext *cct, const std::list<entity_addr_t>& ls, entity_addr_t *match);
/**
* filter the addresses in @c ifa with specified interfaces, networks and IPv
*
* @param cct
* @param ifa a list of network interface addresses to be filtered
* @param ipv bitmask of CEPH_PICK_ADDRESS_IPV4 and CEPH_PICK_ADDRESS_IPV6.
* it is used to filter the @c networks
* @param networks a comma separated list of networks as the allow list. only
* the addresses in the specified networks are allowed. all addresses
* are accepted if it is empty.
* @param interfaces a comma separated list of interfaces for the allow list.
* all addresses are accepted if it is empty
* @param numa_node if non-negative, prefer addresses on interfaces attached to this NUMA node
*/
const struct sockaddr *find_ip_in_subnet_list(
CephContext *cct,
const struct ifaddrs *ifa,
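Taken together, the doc comment above and the grading logic added to pick_address.cc earlier boil down to: filter candidate interface addresses by interface name and by network, score each survivor, and return the best-scoring address. The Python sketch below mirrors that selection under stated assumptions (the +4 for an interface that is up, +2 for a non-loopback address and +1 for a NUMA match come from the hunk; the candidate object shape and helper names are hypothetical):
import ipaddress

def in_subnet(addr, net):
    # e.g. in_subnet('10.1.2.3', '10.1.0.0/16') -> True
    return ipaddress.ip_address(addr) in ipaddress.ip_network(net)

def pick_best_address(candidates, ifs, nets, numa_node=-1):
    # candidates: objects with .name, .addr, .is_up, .is_loopback and
    # .numa_node attributes (a hypothetical shape for this sketch)
    best_score, best_addr = 0, None
    for c in candidates:
        if ifs and c.name not in ifs:
            continue                      # filtered out by interface name
        if nets and not any(in_subnet(c.addr, n) for n in nets):
            continue                      # filtered out by allowed networks
        score = (4 if c.is_up else 0) + (0 if c.is_loopback else 2)
        if numa_node >= 0 and c.numa_node == numa_node:
            score += 1                    # prefer the requested NUMA node
        if score > best_score:
            best_score, best_addr = score, c.addr
    return best_addr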

View File

@ -1,39 +1,18 @@
# zstd
# libzstd - build it statically
set(ZSTD_C_FLAGS "-fPIC -Wno-unused-variable -O3")
option(WITH_SYSTEM_ZSTD "use prebuilt libzstd in system" OFF)
include(ExternalProject)
ExternalProject_Add(zstd_ext
SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/zstd/build/cmake
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${ZSTD_C_FLAGS}
-DCMAKE_AR=${CMAKE_AR}
-DCMAKE_POSITION_INDEPENDENT_CODE=${ENABLE_SHARED}
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libzstd
BUILD_COMMAND $(MAKE) libzstd_static
INSTALL_COMMAND "true")
# force zstd make to be called on each time
ExternalProject_Add_Step(zstd_ext forcebuild
DEPENDEES configure
DEPENDERS build
COMMAND "true"
ALWAYS 1)
add_library(zstd STATIC IMPORTED)
set_target_properties(zstd PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/src/zstd/lib"
IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/libzstd/lib/libzstd.a")
add_dependencies(zstd zstd_ext)
if(WITH_SYSTEM_ZSTD)
find_package(Zstd 1.4.4 REQUIRED)
else()
include(BuildZstd)
build_Zstd()
endif()
set(zstd_sources
CompressionPluginZstd.cc
)
CompressionPluginZstd.cc)
add_library(ceph_zstd SHARED ${zstd_sources})
target_link_libraries(ceph_zstd PRIVATE zstd)
target_link_libraries(ceph_zstd PRIVATE Zstd::Zstd)
set_target_properties(ceph_zstd PROPERTIES
VERSION 2.0.0
SOVERSION 2

View File

@ -46,7 +46,7 @@ class ZstdCompressor : public Compressor {
inbuf.size = p.get_ptr_and_advance(left, (const char**)&inbuf.src);
left -= inbuf.size;
ZSTD_EndDirective const zed = (left==0) ? ZSTD_e_end : ZSTD_e_continue;
size_t r = ZSTD_compress_generic(s, &outbuf, &inbuf, zed);
size_t r = ZSTD_compressStream2(s, &outbuf, &inbuf, zed);
if (ZSTD_isError(r)) {
return -EINVAL;
}

View File

@ -119,6 +119,5 @@ int CrushLocation::init_on_startup()
loc.clear();
loc.insert(make_pair<std::string,std::string>("host", hostname));
loc.insert(make_pair<std::string,std::string>("root", "default"));
lgeneric_dout(cct, 10) << "crush_location is (default) " << loc << dendl;
return 0;
}

View File

@ -1376,6 +1376,12 @@ int CrushWrapper::update_item(
<< ((float)old_iweight/(float)0x10000) << " -> " << weight
<< dendl;
adjust_item_weight_in_loc(cct, item, iweight, loc);
ret = rebuild_roots_with_classes(cct);
if (ret < 0) {
ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
<< cpp_strerror(ret) << dendl;
return ret;
}
ret = 1;
}
if (get_item_name(item) != name) {
@ -1559,6 +1565,12 @@ int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight,
}
}
}
int ret = rebuild_roots_with_classes(cct);
if (ret < 0) {
ldout(cct, 0) << __func__ << " unable to rebuild roots with classes: "
<< cpp_strerror(ret) << dendl;
return ret;
}
return changed;
}

View File

@ -145,11 +145,6 @@ void global_pre_init(
// command line (as passed by caller)
conf.parse_argv(args);
if (conf->log_early &&
!cct->_log->is_started()) {
cct->_log->start();
}
if (!cct->_log->is_started()) {
cct->_log->start();
}

View File

@ -20,9 +20,9 @@
typedef void (*signal_handler_t)(int);
#ifdef HAVE_SIGDESCR_NP
#if defined(HAVE_SIGDESCR_NP)
# define sig_str(signum) sigdescr_np(signum)
#elif HAVE_REENTRANT_STRSIGNAL
#elif defined(HAVE_REENTRANT_STRSIGNAL)
# define sig_str(signum) strsignal(signum)
#else
# define sig_str(signum) sys_siglist[signum]

View File

@ -47,9 +47,15 @@
#define CEPH_MONC_PROTOCOL 15 /* server/client */
#define CEPH_INO_ROOT 1
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */
#define CEPH_INO_ROOT 1
/*
* hidden .ceph dir, which is no longer created but
* recognised in existing filesystems so that we
* don't try to fragment it.
*/
#define CEPH_INO_CEPH 2
#define CEPH_INO_GLOBAL_SNAPREALM 3
#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31

Some files were not shown because too many files have changed in this diff