update sources to v12.2.0

This commit is contained in:
Fabian Grünbichler 2017-08-30 08:42:40 +02:00
parent 913cc16a67
commit b5b8bbf502
118 changed files with 2665 additions and 757 deletions

View File

@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
set(VERSION 12.1.4)
set(VERSION 12.2.0)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
@ -274,7 +274,7 @@ find_package(snappy REQUIRED)
option(WITH_LZ4 "LZ4 compression support" OFF)
if(WITH_LZ4)
find_package(LZ4 REQUIRED)
find_package(LZ4 1.7 REQUIRED)
set(HAVE_LZ4 ${LZ4_FOUND})
endif(WITH_LZ4)

View File

@ -1,7 +1,7 @@
# Contributor: John Coyle <dx9err@gmail.com>
# Maintainer: John Coyle <dx9err@gmail.com>
pkgname=ceph
pkgver=12.1.4
pkgver=12.2.0
pkgrel=0
pkgdesc="Ceph is a distributed object store and file system"
pkgusers="ceph"
@ -63,7 +63,7 @@ makedepends="
xmlstarlet
yasm
"
source="ceph-12.1.4.tar.bz2"
source="ceph-12.2.0.tar.bz2"
subpackages="
$pkgname-base
$pkgname-common
@ -116,7 +116,7 @@ _sysconfdir=/etc
_udevrulesdir=/etc/udev/rules.d
_python_sitelib=/usr/lib/python2.7/site-packages
builddir=$srcdir/ceph-12.1.4
builddir=$srcdir/ceph-12.2.0
build() {
export CEPH_BUILD_VIRTUALENV=$builddir

View File

@ -61,7 +61,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 12.1.4
Version: 12.2.0
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -76,7 +76,7 @@ License: LGPL-2.1 and CC-BY-SA-1.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: http://ceph.com/download/ceph-12.1.4.tar.bz2
Source0: http://ceph.com/download/ceph-12.2.0.tar.bz2
%if 0%{?suse_version}
%if 0%{?is_opensuse}
ExclusiveArch: x86_64 aarch64 ppc64 ppc64le
@ -772,7 +772,7 @@ python-rbd, python-rgw or python-cephfs instead.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-12.1.4
%autosetup -p1 -n ceph-12.2.0
%build
%if 0%{with cephfs_java}
@ -975,6 +975,8 @@ rm -rf %{buildroot}
%{_mandir}/man8/ceph-detect-init.8*
%{_mandir}/man8/ceph-create-keys.8*
%{_mandir}/man8/ceph-disk.8*
%{_mandir}/man8/ceph-volume.8*
%{_mandir}/man8/ceph-volume-systemd.8*
%{_mandir}/man8/ceph-run.8*
%{_mandir}/man8/crushtool.8*
%{_mandir}/man8/osdmaptool.8*

View File

@ -975,6 +975,8 @@ rm -rf %{buildroot}
%{_mandir}/man8/ceph-detect-init.8*
%{_mandir}/man8/ceph-create-keys.8*
%{_mandir}/man8/ceph-disk.8*
%{_mandir}/man8/ceph-volume.8*
%{_mandir}/man8/ceph-volume-systemd.8*
%{_mandir}/man8/ceph-run.8*
%{_mandir}/man8/crushtool.8*
%{_mandir}/man8/osdmaptool.8*

View File

@ -5,11 +5,30 @@
# LZ4_FOUND
# LZ4_INCLUDE_DIR
# LZ4_LIBRARY
# LZ4_VERSION_STRING
# LZ4_VERSION_MAJOR
# LZ4_VERSION_MINOR
# LZ4_VERSION_RELEASE
find_path(LZ4_INCLUDE_DIR NAMES lz4.h)
if(LZ4_INCLUDE_DIR AND EXISTS "${LZ4_INCLUDE_DIR}/lz4.h")
foreach(ver "MAJOR" "MINOR" "RELEASE")
file(STRINGS "${LZ4_INCLUDE_DIR}/lz4.h" LZ4_VER_${ver}_LINE
REGEX "^#define[ \t]+LZ4_VERSION_${ver}[ \t]+[0-9]+[ \t]+.*$")
string(REGEX REPLACE "^#define[ \t]+LZ4_VERSION_${ver}[ \t]+([0-9]+)[ \t]+.*$"
"\\1" LZ4_VERSION_${ver} "${LZ4_VER_${ver}_LINE}")
unset(${LZ4_VER_${ver}_LINE})
endforeach()
set(LZ4_VERSION_STRING
"${LZ4_VERSION_MAJOR}.${LZ4_VERSION_MINOR}.${LZ4_VERSION_RELEASE}")
endif()
find_library(LZ4_LIBRARY NAMES lz4)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LZ4 DEFAULT_MSG LZ4_LIBRARY LZ4_INCLUDE_DIR)
find_package_handle_standard_args(LZ4
REQUIRED_VARS LZ4_LIBRARY LZ4_INCLUDE_DIR
VERSION_VAR LZ4_VERSION_STRING)
mark_as_advanced(LZ4_INCLUDE_DIR LZ4_LIBRARY)
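The version detection added above keys off the ``LZ4_VERSION_MAJOR/MINOR/RELEASE``
defines shipped in ``lz4.h``. As a rough illustration only (the header path and
values are assumptions, not part of this change), the lines the regex matches
look like this:

    $ grep -E '#define[[:space:]]+LZ4_VERSION_(MAJOR|MINOR|RELEASE)' /usr/include/lz4.h
    #define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
    #define LZ4_VERSION_MINOR    7    /* for new (non-breaking) interface capabilities */
    #define LZ4_VERSION_RELEASE  5    /* for tweaks, bug-fixes, or development */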

View File

@ -11,8 +11,11 @@ usr/lib/libosd_tp.so*
usr/lib/python*/dist-packages/ceph_disk*
usr/sbin/ceph-disk
usr/sbin/ceph-volume
usr/sbin/ceph-volume-systemd
usr/lib/python*/dist-packages/ceph_volume/*
usr/lib/python*/dist-packages/ceph_volume-*
usr/share/man/man8/ceph-clsinfo.8
usr/share/man/man8/ceph-disk.8
usr/share/man/man8/ceph-volume.8
usr/share/man/man8/ceph-volume-systemd.8
usr/share/man/man8/ceph-osd.8

View File

@ -1,3 +1,9 @@
ceph (12.2.0-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.com> Mon, 28 Aug 2017 16:30:16 +0000
ceph (12.1.4-1) stable; urgency=medium
* New upstream release

View File

@ -0,0 +1,30 @@
.. _ceph-volume:
ceph-volume
===========
Deploy OSDs with different device technologies like lvm or physical disks using
pluggable tools (:doc:`lvm/index` itself is treated like a plugin). It tries to
follow the workflow of ``ceph-disk`` for deploying OSDs, with a predictable
and robust way of preparing, activating, and starting OSDs.
:ref:`Overview <ceph-volume-overview>` |
:ref:`Plugin Guide <ceph-volume-plugins>` |
**Command Line Subcommands**
Although currently there is support for ``lvm``, the plan is to support other
technologies, including plain disks.
* :ref:`ceph-volume-lvm`
.. toctree::
:hidden:
:maxdepth: 3
:caption: Contents:
intro
lvm/index
lvm/activate
lvm/prepare
lvm/scan
lvm/systemd

View File

@ -0,0 +1,19 @@
.. _ceph-volume-overview:
Overview
--------
The ``ceph-volume`` tool aims to be a single purpose command line tool to deploy
logical volumes as OSDs, trying to maintain a similar API to ``ceph-disk`` when
preparing, activating, and creating OSDs.
It deviates from ``ceph-disk`` by not interacting or relying on the udev rules
that come installed for Ceph. These rules allow automatic detection of
previously setup devices that are in turn fed into ``ceph-disk`` to activate
them.
``ceph-volume lvm``
-------------------
By making use of :term:`LVM tags`, the :ref:`ceph-volume-lvm` sub-command is
able to store and later re-discover and query devices associated with OSDs so
that they can later be activated.

View File

@ -0,0 +1,74 @@
.. _ceph-volume-lvm-activate:
``activate``
============
Once :ref:`ceph-volume-lvm-prepare` is completed, and all the various steps
that entails are done, the volume is ready to get "activated".
This activation process enables a systemd unit that persists the OSD ID and its
UUID (also called ``fsid`` in Ceph CLI tools), so that at boot time it can
understand what OSD is enabled and needs to be mounted.
.. note:: The execution of this call is fully idempotent, and there are no
   side-effects when running it multiple times
New OSDs
--------
To activate newly prepared OSDs both the :term:`OSD id` and :term:`OSD uuid`
need to be supplied. For example::
ceph-volume lvm activate --filestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
.. note:: The UUID is stored in the ``osd_fsid`` file in the OSD path, which is
generated when :ref:`ceph-volume-lvm-prepare` is used.
requiring uuids
^^^^^^^^^^^^^^^
The :term:`OSD uuid` is required as an extra step to ensure that the
right OSD is being activated. It is entirely possible that a previous OSD with
the same id exists and would end up activating the incorrect one.
Discovery
---------
With either existing OSDs or new ones being activated, a *discovery* process is
performed using :term:`LVM tags` to enable the systemd units.
The systemd unit will capture the :term:`OSD id` and :term:`OSD uuid` and
persist it. Internally, the activation will enable it like::
systemctl enable ceph-volume@lvm-$id-$uuid
For example::
systemctl enable ceph-volume@lvm-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
Would start the discovery process for the OSD with an id of ``0`` and a UUID of
``8715BEB4-15C5-49DE-BA6F-401086EC7B41``.
.. note:: for more details on the systemd workflow see :ref:`ceph-volume-systemd`
The systemd unit will look for the matching OSD device, and by looking at its
:term:`LVM tags` will proceed to:
#. mount the device in the corresponding location (by convention this is
   ``/var/lib/ceph/osd/<cluster name>-<osd id>/``)
#. ensure that all required devices are ready for that OSD
#. start the ``ceph-osd@0`` systemd unit
Existing OSDs
-------------
For existing OSDs that have been deployed with different tooling, the only way
to port them over to the new mechanism is to prepare them again (losing data).
See :ref:`ceph-volume-lvm-existing-osds` for details on how to proceed.
Summary
-------
To recap the ``activate`` process:
#. require both :term:`OSD id` and :term:`OSD uuid`
#. enable the system unit with matching id and uuid
#. the systemd unit will ensure all devices are ready and mounted (if needed)
#. the matching ``ceph-osd`` systemd unit will get started
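As an illustrative walk-through (reusing the example id and uuid from this
document; the unit names are assumptions based on the conventions described
here), the whole flow could be exercised and verified like this::

    ceph-volume lvm activate --filestore 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
    systemctl is-enabled ceph-volume@lvm-0-0263644D-0BF1-4D6D-BC34-28BD98AE3BC8
    systemctl is-active ceph-osd@0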

View File

@ -0,0 +1,24 @@
.. _ceph-volume-lvm:
``lvm``
=======
Implements the functionality needed to deploy OSDs from the ``lvm`` subcommand:
``ceph-volume lvm``
**Command Line Subcommands**
* :ref:`ceph-volume-lvm-prepare`
* :ref:`ceph-volume-lvm-activate`
.. not yet implemented
.. * :ref:`ceph-volume-lvm-scan`
**Internal functionality**
There are other aspects of the ``lvm`` subcommand that are internal and not
exposed to the user; these sections explain how these pieces work together,
clarifying the workflows of the tool.
:ref:`Systemd Units <ceph-volume-systemd>` |
:ref:`lvm <ceph-volume-lvm-api>`

View File

@ -0,0 +1,145 @@
.. _ceph-volume-lvm-prepare:
``prepare``
===========
This subcommand allows a :term:`filestore` setup (:term:`bluestore` support is
planned) and currently consumes only logical volumes for both the data and
journal. It will not create or modify the logical volumes except for adding
extra metadata.
.. note:: This is part of a two step process to deploy an OSD. If looking for
a single-call way, please see :ref:`ceph-volume-lvm-create`
To help identify volumes, the tool assigns a few pieces of metadata, using
:term:`LVM tags`, as part of the process of preparing a volume (or volumes) to
work with Ceph.
:term:`LVM tags` make volumes easy to discover later, and help identify them as
part of a Ceph system and what role they have (journal, filestore, bluestore,
etc...)
Although initially only :term:`filestore` is supported (and it is the default),
the back end can be specified with:
* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
* ``--bluestore``
.. when available, this will need to be updated to:
.. * :ref:`--bluestore <ceph-volume-prepare_bluestore>`
.. _ceph-volume-lvm-prepare_filestore:
``filestore``
-------------
This is the default OSD backend and allows preparation of logical volumes for
a :term:`filestore` OSD.
The process is *very* strict: it requires two logical volumes that are ready to
be used. No special preparation is needed for these volumes other than
following the minimum size requirements for data and journal.
The API call looks like::
ceph-volume lvm prepare --filestore --data data --journal journal
The journal *must* be a logical volume, just like the data volume, and that
argument is always required even if both live under the same group.
A generated uuid is used to ask the cluster for a new OSD. These two pieces are
crucial for identifying an OSD and will later be used throughout the
:ref:`ceph-volume-lvm-activate` process.
The OSD data directory is created using the following convention::
/var/lib/ceph/osd/<cluster name>-<osd id>
At this point the data volume is mounted at this location, and the journal
volume is linked::
ln -s /path/to/journal /var/lib/ceph/osd/<cluster_name>-<osd-id>/journal
The monmap is fetched using the bootstrap key from the OSD::
/usr/bin/ceph --cluster ceph --name client.bootstrap-osd
--keyring /var/lib/ceph/bootstrap-osd/ceph.keyring
mon getmap -o /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap
``ceph-osd`` will be called to populate the OSD directory, that is already
mounted, re-using all the pieces of information from the initial steps::
ceph-osd --cluster ceph --mkfs --mkkey -i <osd id> \
--monmap /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap --osd-data \
/var/lib/ceph/osd/<cluster name>-<osd id> --osd-journal /var/lib/ceph/osd/<cluster name>-<osd id>/journal \
--osd-uuid <osd uuid> --keyring /var/lib/ceph/osd/<cluster name>-<osd id>/keyring \
--setuser ceph --setgroup ceph
.. _ceph-volume-lvm-existing-osds:
Existing OSDs
-------------
For existing clusters that want to use this new system and have OSDs that are
already running, there are a few things to take into account:
.. warning:: this process will forcefully format the data device, destroying
existing data, if any.
* OSD paths should follow this convention::
/var/lib/ceph/osd/<cluster name>-<osd id>
* Preferably, no other mechanisms to mount the volume should exist, and any
  that do (like ``fstab`` mount points) should be removed
* There is currently no support for encrypted volumes
The one-time process for an existing OSD with an ID of 0, using a ``"ceph"``
cluster name, would look like::
ceph-volume lvm prepare --filestore --osd-id 0 --osd-fsid E3D291C1-E7BF-4984-9794-B60D9FA139CB
The command line tool will not contact the monitor to generate an OSD ID and
will format the LVM device in addition to storing the metadata on it so that it
can later be started (for detailed metadata description see
:ref:`ceph-volume-lvm-tags`).
.. _ceph-volume-lvm-prepare_bluestore:
``bluestore``
-------------
This subcommand is planned but not currently implemented.
Storing metadata
----------------
The following tags will get applied as part of the preparation process
regardless of the type of volume (journal or data) and also regardless of the
OSD backend:
* ``cluster_fsid``
* ``data_device``
* ``journal_device``
* ``encrypted``
* ``osd_fsid``
* ``osd_id``
* ``block``
* ``db``
* ``wal``
* ``lockbox_device``
.. note:: For the complete lvm tag conventions see :ref:`ceph-volume-lvm-tag-api`
Summary
-------
To recap the ``prepare`` process:
#. Accept only logical volumes for data and journal (both required)
#. Generate a UUID for the OSD
#. Ask the monitor for an OSD ID, reusing the generated UUID
#. OSD data directory is created and data volume mounted
#. Journal is symlinked from data volume to journal location
#. monmap is fetched for activation
#. the data device is mounted and the data directory is populated by ``ceph-osd``
#. data and journal volumes are assigned all the Ceph metadata using lvm tags
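As an illustrative sketch only (the volume group and logical volume names below
are made up), a prepare call on two pre-created volumes and a quick look at the
resulting metadata could look like::

    ceph-volume lvm prepare --filestore --data ceph-vg/data-0 --journal ceph-vg/journal-0
    # the metadata listed above ends up as LVM tags on both volumes
    lvs -o lv_name,lv_tags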

View File

@ -0,0 +1,9 @@
scan
====
This sub-command will allow discovering Ceph volumes previously set up by the
tool by looking into the system's logical volumes and their tags.
As part of the :ref:`ceph-volume-lvm-prepare` process, the logical volumes are assigned
a few tags with important pieces of information.
.. note:: This sub-command is not yet implemented

View File

@ -0,0 +1,46 @@
.. _ceph-volume-systemd:
systemd
=======
As part of the :ref:`ceph-volume-lvm-activate` process, a few systemd units will get enabled
that will use the OSD id and uuid as part of their name. These units will be
run when the system boots, and will proceed to activate their corresponding
volumes.
The API for activation requires both the :term:`OSD id` and :term:`OSD uuid`,
which get persisted by systemd. Internally, the activation process enables the
systemd unit using the following convention::
ceph-volume@<type>-<extra metadata>
Where ``type`` is the sub-command used to parse the extra metadata, and ``extra
metadata`` is any additional information needed by the sub-command to be able
to activate the OSD. For example, an OSD with an ID of 0 for the ``lvm``
sub-command would look like::
systemctl enable ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
Process
-------
The systemd unit is a :term:`systemd oneshot` service, meant to start at boot after the
local filesystem is ready to be used.
Upon startup, it will identify the logical volume using :term:`LVM tags`,
finding a matching ID and later ensuring it is the right one with
the :term:`OSD uuid`.
After identifying the correct volume it will then proceed to mount it by using
the OSD destination conventions, that is::
/var/lib/ceph/osd/<cluster name>-<osd id>
For our example OSD with an id of ``0``, that means the identified device will
be mounted at::
/var/lib/ceph/osd/ceph-0
Once that process is complete, a call will be made to start the OSD::
systemctl start ceph-osd@0
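To verify that sequence by hand, one could check the mount and the OSD unit (a
sketch; the unit instance name follows the example used earlier in this
document)::

    findmnt /var/lib/ceph/osd/ceph-0
    systemctl status ceph-volume@lvm-0-0A3E1ED2-DA8A-4F0E-AA95-61DEC71768D6
    systemctl is-active ceph-osd@0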

View File

@ -18,6 +18,8 @@ if tags.has('man'):
'install/*',
'mon/*',
'rados/*',
'mgr/*',
'ceph-volume/*',
'radosgw/*',
'rbd/*',
'start/*']

View File

@ -0,0 +1,13 @@
===================================
ceph-volume developer documentation
===================================
.. rubric:: Contents
.. toctree::
:maxdepth: 1
plugins
lvm
systemd

View File

@ -0,0 +1,127 @@
.. _ceph-volume-lvm-api:
LVM
===
The backend of ``ceph-volume lvm`` is LVM itself; it relies heavily on the usage
of tags, which is a way for LVM to allow extending its volume metadata. These
values can later be queried against devices, and that is how they get
discovered later.
.. warning:: These APIs are not meant to be public, but are documented so that
it is clear what the tool is doing behind the scenes. Do not alter
any of these values.
.. _ceph-volume-lvm-tag-api:
Tag API
-------
The process of identifying logical volumes as part of Ceph relies on applying
tags on all volumes. It follows a naming convention for the namespace that
looks like::
ceph.<tag name>=<tag value>
All tags are prefixed by the ``ceph`` keyword to claim ownership of that
namespace and make it easily identifiable. This is how the OSD ID would be used
in the context of lvm tags::
ceph.osd_id=0
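Because these are plain LVM tags, they can be inspected (or, purely for
illustration, applied by hand) with stock LVM tooling; the volume group and
logical volume names below are made up::

    # list the ceph.* tags attached to each logical volume
    lvs -o lv_name,vg_name,lv_tags
    # how a tag of this form gets attached to a volume
    lvchange --addtag ceph.osd_id=0 ceph-vg/data-0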
.. _ceph-volume-lvm-tags:
Metadata
--------
The following describes all the metadata from Ceph OSDs that is stored on an
LVM volume:
``type``
--------
Describes if the device is an OSD or a journal, with the ability to expand to
other types when supported (for example a lockbox)
Example::
ceph.type=osd
``cluster_fsid``
----------------
Example::
ceph.cluster_fsid=7146B649-AE00-4157-9F5D-1DBFF1D52C26
``data_device``
---------------
Example::
ceph.data_device=/dev/ceph/data-0
``journal_device``
------------------
Example::
ceph.journal_device=/dev/ceph/journal-0
``encrypted``
-------------
Example for enabled encryption with ``luks``::
ceph.encrypted=luks
For plain dmcrypt::
ceph.encrypted=dmcrypt
For disabled encryption::
ceph.encrypted=0
``osd_fsid``
------------
Example::
ceph.osd_fsid=88ab9018-f84b-4d62-90b4-ce7c076728ff
``osd_id``
----------
Example::
ceph.osd_id=1
``block``
---------
Just used on :term:`bluestore` backends.
Example::
ceph.block=/dev/mapper/vg-block-0
``db``
------
Just used on :term:`bluestore` backends.
Example::
ceph.db=/dev/mapper/vg-db-0
``wal``
-------
Just used on :term:`bluestore` backends.
Example::
ceph.wal=/dev/mapper/vg-wal-0
``lockbox_device``
------------------
Only used when encryption is enabled, to store keys in an unencrypted
volume.
Example::
ceph.lockbox_device=/dev/mapper/vg-lockbox-0

View File

@ -0,0 +1,65 @@
.. _ceph-volume-plugins:
Plugins
=======
``ceph-volume`` started initially to provide support for using ``lvm`` as
the underlying system for an OSD. It is included as part of the tool but it is
treated like a plugin.
This modularity allows for other device or device-like technologies to
consume and re-use the utilities and workflows provided.
Adding Plugins
--------------
As a Python tool, plugins are registered via ``setuptools`` entry points. For a new plugin to be
available, it should have an entry similar to this in its ``setup.py`` file:
.. code-block:: python
setup(
...
entry_points = dict(
ceph_volume_handlers = [
'my_command = my_package.my_module:MyClass',
],
),
``MyClass`` should be a class that accepts ``sys.argv`` as its argument;
``ceph-volume`` will pass that in at instantiation and call its ``main``
method.
This is how a plugin for ``ZFS`` could look, for example:
.. code-block:: python
import argparse

class ZFS(object):
help_menu = 'Deploy OSDs with ZFS'
_help = """
Use ZFS as the underlying technology for OSDs
--verbose Increase the verbosity level
"""
def __init__(self, argv):
self.argv = argv
def main(self):
parser = argparse.ArgumentParser()
args = parser.parse_args(self.argv)
...
And its entry point (via ``setuptools``) in ``setup.py`` would look like:
.. code-block:: python
entry_points = {
'ceph_volume_handlers': [
'zfs = ceph_volume_zfs.zfs:ZFS',
],
},
After installation, the ``zfs`` subcommand would be listed and could be used
as::
ceph-volume zfs

View File

@ -0,0 +1,37 @@
.. _ceph-volume-systemd-api:
systemd
=======
The workflow to *"activate"* an OSD relies on systemd unit files and their
ability to persist information as a suffix to the instance name.
``ceph-volume`` exposes the following convention for unit files::
ceph-volume@<sub command>-<extra metadata>
For example, this is how enabling an OSD could look for the
:ref:`ceph-volume-lvm` sub command::
systemctl enable ceph-volume@lvm-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
These 3 pieces of persisted information are needed by the sub-command so that
it understands what OSD it needs to activate.
Since ``lvm`` is not the only subcommand that will be supported, this
is how it will allow other device types to be defined.
At some point for example, for plain disks, it could be::
systemctl enable ceph-volume@disk-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
At startup, the systemd unit will execute a helper script that will parse the
suffix and will end up calling ``ceph-volume`` back. Using the previous
example for lvm, that call will look like::
ceph-volume lvm activate 0 8715BEB4-15C5-49DE-BA6F-401086EC7B41
.. warning:: These workflows are not meant to be public, but are documented so that
it is clear what the tool is doing behind the scenes. Do not alter
any of these values.

View File

@ -39,3 +39,4 @@ in the body of the message.
osd_internals/index*
mds_internals/index*
radosgw/index*
ceph-volume/index*

View File

@ -4,7 +4,7 @@
Ceph is growing rapidly. As firms deploy Ceph, the technical terms such as
"RADOS", "RBD," "RGW" and so forth require corresponding marketing terms
that explain what each component does. The terms in this glossary are
that explain what each component does. The terms in this glossary are
intended to complement the existing technical terminology.
Sometimes more than one term applies to a definition. Generally, the first
@ -12,21 +12,21 @@ term reflects a term consistent with Ceph's marketing, and secondary terms
reflect either technical terms or legacy ways of referring to Ceph systems.
.. glossary::
.. glossary::
Ceph Project
The aggregate term for the people, software, mission and infrastructure
The aggregate term for the people, software, mission and infrastructure
of Ceph.
cephx
The Ceph authentication protocol. Cephx operates like Kerberos, but it
has no single point of failure.
Ceph
Ceph Platform
All Ceph software, which includes any piece of code hosted at
All Ceph software, which includes any piece of code hosted at
`http://github.com/ceph`_.
Ceph System
Ceph Stack
A collection of two or more components of Ceph.
@ -35,7 +35,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Node
Host
Any single machine or server in a Ceph System.
Ceph Storage Cluster
Ceph Object Store
RADOS
@ -45,7 +45,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Ceph Cluster Map
cluster map
The set of maps comprising the monitor map, OSD map, PG map, MDS map and
The set of maps comprising the monitor map, OSD map, PG map, MDS map and
CRUSH map. See `Cluster Map`_ for details.
Ceph Object Storage
@ -56,13 +56,13 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
RADOS Gateway
RGW
The S3/Swift gateway component of Ceph.
Ceph Block Device
RBD
The block storage component of Ceph.
Ceph Block Storage
The block storage "product," service or capabilities when used in
The block storage "product," service or capabilities when used in
conjunction with ``librbd``, a hypervisor such as QEMU or Xen, and a
hypervisor abstraction layer such as ``libvirt``.
@ -73,7 +73,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Cloud Platforms
Cloud Stacks
Third party cloud provisioning platforms such as OpenStack, CloudStack,
Third party cloud provisioning platforms such as OpenStack, CloudStack,
OpenNebula, ProxMox, etc.
Object Storage Device
@ -82,7 +82,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Sometimes, Ceph users use the
term "OSD" to refer to :term:`Ceph OSD Daemon`, though the
proper term is "Ceph OSD".
Ceph OSD Daemon
Ceph OSD Daemons
Ceph OSD
@ -90,7 +90,29 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
disk (:term:`OSD`). Sometimes, Ceph users use the
term "OSD" to refer to "Ceph OSD Daemon", though the
proper term is "Ceph OSD".
OSD id
The integer that defines an OSD. It is generated by the monitors as part
of the creation of a new OSD.
OSD fsid
This is a unique identifier used to further improve the uniqueness of an
OSD and it is found in the OSD path in a file called ``osd_fsid``. This
``fsid`` term is used interchangeably with ``uuid``
OSD uuid
Just like the OSD fsid, this is the OSD unique identifier and is used
interchangeably with ``fsid``.
bluestore
OSD BlueStore is a new back end for OSD daemons (kraken and newer
versions). Unlike :term:`filestore` it stores objects directly on the
Ceph block devices without any file system interface.
filestore
A back end for OSD daemons, where a Journal is needed and files are
written to the filesystem.
Ceph Monitor
MON
The Ceph monitor software.
@ -106,22 +128,22 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Ceph Clients
Ceph Client
The collection of Ceph components which can access a Ceph Storage
Cluster. These include the Ceph Object Gateway, the Ceph Block Device,
the Ceph Filesystem, and their corresponding libraries, kernel modules,
The collection of Ceph components which can access a Ceph Storage
Cluster. These include the Ceph Object Gateway, the Ceph Block Device,
the Ceph Filesystem, and their corresponding libraries, kernel modules,
and FUSEs.
Ceph Kernel Modules
The collection of kernel modules which can be used to interact with the
The collection of kernel modules which can be used to interact with the
Ceph System (e.g,. ``ceph.ko``, ``rbd.ko``).
Ceph Client Libraries
The collection of libraries that can be used to interact with components
The collection of libraries that can be used to interact with components
of the Ceph System.
Ceph Release
Any distinct numbered version of Ceph.
Ceph Point Release
Any ad-hoc release that includes only bug or security fixes.
@ -130,11 +152,11 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
testing, but may contain new features.
Ceph Release Candidate
A major version of Ceph that has undergone initial quality assurance
A major version of Ceph that has undergone initial quality assurance
testing and is ready for beta testers.
Ceph Stable Release
A major version of Ceph where all features from the preceding interim
A major version of Ceph where all features from the preceding interim
releases have been put through quality assurance testing successfully.
Ceph Test Framework
@ -144,7 +166,7 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
CRUSH
Controlled Replication Under Scalable Hashing. It is the algorithm
Ceph uses to compute object storage locations.
ruleset
A set of CRUSH data placement rules that applies to a particular pool(s).
@ -152,5 +174,14 @@ reflect either technical terms or legacy ways of referring to Ceph systems.
Pools
Pools are logical partitions for storing objects.
systemd oneshot
A systemd ``type`` where a command is defined in ``ExecStart`` which will
exit upon completion (it is not intended to daemonize)
LVM tags
Extensible metadata for LVM volumes and groups. It is used to store
Ceph-specific information about devices and their relationship with
OSDs.
.. _http://github.com/ceph: http://github.com/ceph
.. _Cluster Map: ../architecture#cluster-map

View File

@ -102,6 +102,7 @@ about Ceph, see our `Architecture`_ section.
api/index
architecture
Development <dev/index>
ceph-volume/index
release-notes
releases
Glossary <glossary>

View File

@ -12,12 +12,12 @@ default, so it's useful to know about them when setting up your cluster for
production.
Following the same configuration as `Installation (Quick)`_, we will set up a
cluster with ``node1`` as the monitor node, and ``node2`` and ``node3`` for
cluster with ``node1`` as the monitor node, and ``node2`` and ``node3`` for
OSD nodes.
.. ditaa::
.. ditaa::
/------------------\ /----------------\
| Admin Node | | node1 |
| +-------->+ |
@ -43,51 +43,51 @@ Monitor Bootstrapping
Bootstrapping a monitor (a Ceph Storage Cluster, in theory) requires
a number of things:
- **Unique Identifier:** The ``fsid`` is a unique identifier for the cluster,
and stands for File System ID from the days when the Ceph Storage Cluster was
principally for the Ceph Filesystem. Ceph now supports native interfaces,
block devices, and object storage gateway interfaces too, so ``fsid`` is a
- **Unique Identifier:** The ``fsid`` is a unique identifier for the cluster,
and stands for File System ID from the days when the Ceph Storage Cluster was
principally for the Ceph Filesystem. Ceph now supports native interfaces,
block devices, and object storage gateway interfaces too, so ``fsid`` is a
bit of a misnomer.
- **Cluster Name:** Ceph clusters have a cluster name, which is a simple string
without spaces. The default cluster name is ``ceph``, but you may specify
a different cluster name. Overriding the default cluster name is
especially useful when you are working with multiple clusters and you need to
clearly understand which cluster you are working with.
For example, when you run multiple clusters in a `federated architecture`_,
a different cluster name. Overriding the default cluster name is
especially useful when you are working with multiple clusters and you need to
clearly understand which cluster you are working with.
For example, when you run multiple clusters in a `federated architecture`_,
the cluster name (e.g., ``us-west``, ``us-east``) identifies the cluster for
the current CLI session. **Note:** To identify the cluster name on the
command line interface, specify the Ceph configuration file with the
the current CLI session. **Note:** To identify the cluster name on the
command line interface, specify the Ceph configuration file with the
cluster name (e.g., ``ceph.conf``, ``us-west.conf``, ``us-east.conf``, etc.).
Also see CLI usage (``ceph --cluster {cluster-name}``).
- **Monitor Name:** Each monitor instance within a cluster has a unique name.
- **Monitor Name:** Each monitor instance within a cluster has a unique name.
In common practice, the Ceph Monitor name is the host name (we recommend one
Ceph Monitor per host, and no commingling of Ceph OSD Daemons with
Ceph Monitor per host, and no commingling of Ceph OSD Daemons with
Ceph Monitors). You may retrieve the short hostname with ``hostname -s``.
- **Monitor Map:** Bootstrapping the initial monitor(s) requires you to
generate a monitor map. The monitor map requires the ``fsid``, the cluster
- **Monitor Map:** Bootstrapping the initial monitor(s) requires you to
generate a monitor map. The monitor map requires the ``fsid``, the cluster
name (or uses the default), and at least one host name and its IP address.
- **Monitor Keyring**: Monitors communicate with each other via a
secret key. You must generate a keyring with a monitor secret and provide
- **Monitor Keyring**: Monitors communicate with each other via a
secret key. You must generate a keyring with a monitor secret and provide
it when bootstrapping the initial monitor(s).
- **Administrator Keyring**: To use the ``ceph`` CLI tools, you must have
a ``client.admin`` user. So you must generate the admin user and keyring,
and you must also add the ``client.admin`` user to the monitor keyring.
The foregoing requirements do not imply the creation of a Ceph Configuration
file. However, as a best practice, we recommend creating a Ceph configuration
The foregoing requirements do not imply the creation of a Ceph Configuration
file. However, as a best practice, we recommend creating a Ceph configuration
file and populating it with the ``fsid``, the ``mon initial members`` and the
``mon host`` settings.
You can get and set all of the monitor settings at runtime as well. However,
a Ceph Configuration file may contain only those settings that override the
a Ceph Configuration file may contain only those settings that override the
default values. When you add settings to a Ceph configuration file, these
settings override the default settings. Maintaining those settings in a
settings override the default settings. Maintaining those settings in a
Ceph configuration file makes it easier to maintain your cluster.
The procedure is as follows:
@ -97,52 +97,52 @@ The procedure is as follows:
ssh {hostname}
For example::
For example::
ssh node1
#. Ensure you have a directory for the Ceph configuration file. By default,
Ceph uses ``/etc/ceph``. When you install ``ceph``, the installer will
#. Ensure you have a directory for the Ceph configuration file. By default,
Ceph uses ``/etc/ceph``. When you install ``ceph``, the installer will
create the ``/etc/ceph`` directory automatically. ::
ls /etc/ceph
ls /etc/ceph
**Note:** Deployment tools may remove this directory when purging a
cluster (e.g., ``ceph-deploy purgedata {node-name}``, ``ceph-deploy purge
{node-name}``).
#. Create a Ceph configuration file. By default, Ceph uses
#. Create a Ceph configuration file. By default, Ceph uses
``ceph.conf``, where ``ceph`` reflects the cluster name. ::
sudo vim /etc/ceph/ceph.conf
#. Generate a unique ID (i.e., ``fsid``) for your cluster. ::
#. Generate a unique ID (i.e., ``fsid``) for your cluster. ::
uuidgen
#. Add the unique ID to your Ceph configuration file. ::
#. Add the unique ID to your Ceph configuration file. ::
fsid = {UUID}
For example::
For example::
fsid = a7f64266-0894-4f1e-a635-d0aeaca0e993
#. Add the initial monitor(s) to your Ceph configuration file. ::
#. Add the initial monitor(s) to your Ceph configuration file. ::
mon initial members = {hostname}[,{hostname}]
For example::
For example::
mon initial members = node1
#. Add the IP address(es) of the initial monitor(s) to your Ceph configuration
file and save the file. ::
#. Add the IP address(es) of the initial monitor(s) to your Ceph configuration
file and save the file. ::
mon host = {ip-address}[,{ip-address}]
@ -160,18 +160,18 @@ The procedure is as follows:
#. Generate an administrator keyring, generate a ``client.admin`` user and add
the user to the keyring. ::
the user to the keyring. ::
sudo ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *' --cap mgr 'allow *'
#. Add the ``client.admin`` key to the ``ceph.mon.keyring``. ::
#. Add the ``client.admin`` key to the ``ceph.mon.keyring``. ::
ceph-authtool /tmp/ceph.mon.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
#. Generate a monitor map using the hostname(s), host IP address(es) and the FSID.
Save it as ``/tmp/monmap``::
#. Generate a monitor map using the hostname(s), host IP address(es) and the FSID.
Save it as ``/tmp/monmap``::
monmaptool --create --add {hostname} {ip-address} --fsid {uuid} /tmp/monmap
@ -199,7 +199,7 @@ The procedure is as follows:
sudo -u ceph ceph-mon --mkfs -i node1 --monmap /tmp/monmap --keyring /tmp/ceph.mon.keyring
#. Consider settings for a Ceph configuration file. Common settings include
#. Consider settings for a Ceph configuration file. Common settings include
the following::
[global]
@ -215,7 +215,7 @@ The procedure is as follows:
osd pool default size = {n} # Write an object n times.
osd pool default min size = {n} # Allow writing n copy in a degraded state.
osd pool default pg num = {n}
osd pool default pgp num = {n}
osd pool default pgp num = {n}
osd crush chooseleaf type = {n}
In the foregoing example, the ``[global]`` section of the configuration might
@ -233,7 +233,7 @@ The procedure is as follows:
osd pool default size = 2
osd pool default min size = 1
osd pool default pg num = 333
osd pool default pgp num = 333
osd pool default pgp num = 333
osd crush chooseleaf type = 1
#. Touch the ``done`` file.
@ -271,13 +271,13 @@ The procedure is as follows:
0 data,1 metadata,2 rbd,
#. Verify that the monitor is running. ::
#. Verify that the monitor is running. ::
ceph -s
You should see output that the monitor you started is up and running, and
you should see a health error indicating that placement groups are stuck
inactive. It should look something like this::
inactive. It should look something like this::
cluster a7f64266-0894-4f1e-a635-d0aeaca0e993
health HEALTH_ERR 192 pgs stuck inactive; 192 pgs stuck unclean; no osds
@ -295,7 +295,7 @@ Manager daemon configuration
On each node where you run a ceph-mon daemon, you should also set up a ceph-mgr daemon.
See :doc:`../mgr/administrator`
See :ref:`mgr-administrator-guide`
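A minimal sketch of that step, assuming the monitor node from this example
(``node1``) and following the commands from the mgr administrator's guide::

    ceph auth get-or-create mgr.node1 mon 'allow profile mgr' osd 'allow *' mds 'allow *'
    # place the resulting keyring in /var/lib/ceph/mgr/ceph-node1/keyring, then:
    ceph-mgr -i node1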
Adding OSDs
===========
@ -304,7 +304,7 @@ Once you have your initial monitor(s) running, you should add OSDs. Your cluster
cannot reach an ``active + clean`` state until you have enough OSDs to handle the
number of copies of an object (e.g., ``osd pool default size = 2`` requires at
least two OSDs). After bootstrapping your monitor, your cluster has a default
CRUSH map; however, the CRUSH map doesn't have any Ceph OSD Daemons mapped to
CRUSH map; however, the CRUSH map doesn't have any Ceph OSD Daemons mapped to
a Ceph Node.
@ -314,7 +314,7 @@ Short Form
Ceph provides the ``ceph-disk`` utility, which can prepare a disk, partition or
directory for use with Ceph. The ``ceph-disk`` utility creates the OSD ID by
incrementing the index. Additionally, ``ceph-disk`` will add the new OSD to the
CRUSH map under the host for you. Execute ``ceph-disk -h`` for CLI details.
CRUSH map under the host for you. Execute ``ceph-disk -h`` for CLI details.
The ``ceph-disk`` utility automates the steps of the `Long Form`_ below. To
create the first two OSDs with the short form procedure, execute the following
on ``node2`` and ``node3``:
@ -335,7 +335,7 @@ on ``node2`` and ``node3``:
sudo ceph-disk activate {data-path} [--activate-key {path}]
For example::
For example::
sudo ceph-disk activate /dev/hdd1
@ -372,7 +372,7 @@ OSDs with the long form procedure, execute the following steps for each OSD.
``client.bootstrap-osd`` key is present on the machine. You may
alternatively execute this command as ``client.admin`` on a
different host where that key is present.::
ID=$(echo "{\"cephx_secret\": \"$OSD_SECRET\"}" | \
ceph osd new $UUID -i - \
-n client.bootstrap-osd -k /var/lib/ceph/bootstrap-osd/ceph.keyring)
@ -381,7 +381,7 @@ OSDs with the long form procedure, execute the following steps for each OSD.
mkdir /var/lib/ceph/osd/ceph-$ID
#. If the OSD is for a drive other than the OS drive, prepare it
#. If the OSD is for a drive other than the OS drive, prepare it
for use with Ceph, and mount it to the directory you just created. ::
mkfs.xfs /dev/{DEV}
@ -400,15 +400,15 @@ OSDs with the long form procedure, execute the following steps for each OSD.
chown -R ceph:ceph /var/lib/ceph/osd/ceph-$ID
#. After you add an OSD to Ceph, the OSD is in your configuration. However,
it is not yet running. You must start
#. After you add an OSD to Ceph, the OSD is in your configuration. However,
it is not yet running. You must start
your new OSD before it can begin receiving data.
For modern systemd distributions::
systemctl enable ceph-osd@$ID
systemctl start ceph-osd@$ID
For example::
systemctl enable ceph-osd@12
@ -427,11 +427,11 @@ In the below instructions, ``{id}`` is an arbitrary name, such as the hostname o
#. Create a keyring.::
ceph-authtool --create-keyring /var/lib/ceph/mds/{cluster-name}-{id}/keyring --gen-key -n mds.{id}
#. Import the keyring and set caps.::
ceph auth add mds.{id} osd "allow rwx" mds "allow" mon "allow profile mds" -i /var/lib/ceph/mds/{cluster}-{id}/keyring
#. Add to ceph.conf.::
[mds.{id}]
@ -458,24 +458,24 @@ Summary
=======
Once you have your monitor and two OSDs up and running, you can watch the
placement groups peer by executing the following::
placement groups peer by executing the following::
ceph -w
To view the tree, execute the following::
To view the tree, execute the following::
ceph osd tree
You should see output that looks something like this::
You should see output that looks something like this::
# id weight type name up/down reweight
-1 2 root default
-2 2 host node1
0 1 osd.0 up 1
-3 1 host node2
1 1 osd.1 up 1
1 1 osd.1 up 1
To add (or remove) additional monitors, see `Add/Remove Monitors`_.
To add (or remove) additional monitors, see `Add/Remove Monitors`_.
To add (or remove) additional Ceph OSD Daemons, see `Add/Remove OSDs`_.

View File

@ -23,6 +23,8 @@ set(osd_srcs
ceph-clsinfo.rst
ceph-detect-init.rst
ceph-disk.rst
ceph-volume.rst
ceph-volume-systemd.rst
ceph-osd.rst
osdmaptool.rst)

View File

@ -0,0 +1,56 @@
:orphan:
=======================================================
ceph-volume-systemd -- systemd ceph-volume helper tool
=======================================================
.. program:: ceph-volume-systemd
Synopsis
========
| **ceph-volume-systemd** *systemd instance name*
Description
===========
:program:`ceph-volume-systemd` is a systemd helper tool that receives input
from (dynamically created) systemd units so that activation of OSDs can
proceed.
It translates the input into a call to ``ceph-volume`` for activation
purposes only.
Examples
========
Its input is the ``systemd instance name`` (represented by ``%i`` in a systemd
unit), and it should be in the following format::
<ceph-volume subcommand>-<extra metadata>
In the case of ``lvm`` a call could look like::
/usr/bin/ceph-volume-systemd lvm-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
Which in turn will call ``ceph-volume`` in the following way::
ceph-volume lvm trigger 0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
Any other subcommand will need to have implemented a ``trigger`` command that
can consume the extra metadata in this format.
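Purely as an illustration (no such subcommand exists yet), a hypothetical
``disk`` subcommand would be driven the same way::

    /usr/bin/ceph-volume-systemd disk-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
    # which would proxy to:
    ceph-volume disk trigger 0-8715BEB4-15C5-49DE-BA6F-401086EC7B41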
Availability
============
:program:`ceph-volume-systemd` is part of Ceph, a massively scalable,
open-source, distributed storage system. Please refer to the documentation at
http://docs.ceph.com/ for more information.
See also
========
:doc:`ceph-osd <ceph-osd>`\(8),
:doc:`ceph-volume <ceph-volume>`\(8),

View File

@ -0,0 +1,122 @@
:orphan:
========================================
ceph-volume -- Ceph OSD deployment tool
========================================
.. program:: ceph-volume
Synopsis
========
| **ceph-volume** [-h] [--cluster CLUSTER] [--log-level LOG_LEVEL]
| [--log-path LOG_PATH]
| **ceph-volume** **lvm** [ *trigger* | *create* | *activate* | *prepare* ]
Description
===========
:program:`ceph-volume` is a single purpose command line tool to deploy logical
volumes as OSDs, trying to maintain a similar API to ``ceph-disk`` when
preparing, activating, and creating OSDs.
It deviates from ``ceph-disk`` by not interacting or relying on the udev rules
that come installed for Ceph. These rules allow automatic detection of
previously setup devices that are in turn fed into ``ceph-disk`` to activate
them.
Commands
========
lvm
---
By making use of LVM tags, the ``lvm`` sub-command is able to store and later
re-discover and query devices associated with OSDs so that they can later
be activated.
Subcommands:
**activate**
Enables a systemd unit that persists the OSD ID and its UUID (also called
``fsid`` in Ceph CLI tools), so that at boot time it can understand what OSD is
enabled and needs to be mounted.
Usage::
ceph-volume lvm activate --filestore <osd id> <osd fsid>
Optional Arguments:
* [-h, --help] show the help message and exit
* [--bluestore] bluestore objectstore (not yet implemented)
* [--filestore] filestore objectstore (current default)
**prepare**
Prepares a logical volume to be used as an OSD and journal using a ``filestore`` setup
(``bluestore`` support is planned). It will not create or modify the logical volumes
except for adding extra metadata.
Usage::
ceph-volume lvm prepare --filestore --data <data lv> --journal <journal device>
Optional arguments:
* [-h, --help] show the help message and exit
* [--journal JOURNAL] A logical group name, path to a logical volume, or path to a device
* [--journal-size GB] Size (in GB) of the journal
* [--bluestore] Use the bluestore objectstore (not currently supported)
* [--filestore] Use the filestore objectstore (currently the only supported object store)
* [--osd-id OSD_ID] Reuse an existing OSD id
* [--osd-fsid OSD_FSID] Reuse an existing OSD fsid
Required arguments:
* --data A logical group name or a path to a logical volume
**create**
Wraps the two-step process to provision a new osd (calling ``prepare`` first
and then ``activate``) into a single one. The reason to prefer ``prepare`` and
then ``activate`` is to gradually introduce new OSDs into a cluster, and to
avoid large amounts of data being rebalanced.
The single-call process unifies exactly what ``prepare`` and ``activate`` do,
with the convenience of doing it all at once. Flags and general usage are
equivalent to those of the ``prepare`` subcommand.
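Usage (a sketch mirroring the ``prepare`` flags documented above, not an
additional interface)::

    ceph-volume lvm create --filestore --data <data lv> --journal <journal device>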
**trigger**
This subcommand is not meant to be used directly; it is used by systemd to
proxy input to ``ceph-volume lvm activate`` by parsing the input from systemd,
detecting the UUID and ID associated with an OSD.
Usage::
ceph-volume lvm trigger <SYSTEMD-DATA>
The systemd "data" is expected to be in the format of::
<OSD ID>-<OSD UUID>
The lvs associated with the OSD need to have been prepared previously,
so that all needed tags and metadata exist.
Positional arguments:
* <SYSTEMD_DATA> Data from a systemd unit containing ID and UUID of the OSD.
Availability
============
:program:`ceph-volume` is part of Ceph, a massively scalable, open-source, distributed storage system. Please refer to
the documentation at http://docs.ceph.com/ for more information.
See also
========
:doc:`ceph-osd <ceph-osd>`\(8),
:doc:`ceph-disk <ceph-disk>`\(8),

View File

@ -560,17 +560,19 @@ Usage::
ceph osd create {<uuid>} {<id>}
Subcommand ``new`` reuses a previously destroyed OSD *id*. The new OSD will
have the specified *uuid*, and the command expects a JSON file containing
the base64 cephx key for auth entity *client.osd.<id>*, as well as optional
base64 cepx key for dm-crypt lockbox access and a dm-crypt key. Specifying
a dm-crypt requires specifying the accompanying lockbox cephx key.
Subcommand ``new`` can be used to create a new OSD or to recreate a previously
destroyed OSD with a specific *id*. The new OSD will have the specified *uuid*,
and the command expects a JSON file containing the base64 cephx key for auth
entity *client.osd.<id>*, as well as an optional base64 cephx key for dm-crypt
lockbox access and a dm-crypt key. Specifying a dm-crypt key requires specifying
the accompanying lockbox cephx key.
Usage::
ceph osd new {<id>} {<uuid>} -i {<secrets.json>}
The secrets JSON file is expected to maintain a form of the following format::
The secrets JSON file is optional but if provided, is expected to maintain
a form of the following format::
{
"cephx_secret": "AQBWtwhZdBO5ExAAIDyjK2Bh16ZXylmzgYYEjg=="

View File

@ -1,3 +1,4 @@
.. _mgr-administrator-guide:
ceph-mgr administrator's guide
==============================
@ -39,7 +40,7 @@ High availability
-----------------
In general, you should set up a ceph-mgr on each of the hosts
running a ceph-mon daemon to achieve the same level of availability.
running a ceph-mon daemon to achieve the same level of availability.
By default, whichever ceph-mgr instance comes up first will be made
active by the monitors, and the others will be standbys. There is

View File

@ -66,7 +66,10 @@
"ec_hash_error",
"ec_size_error",
"oi_attr_missing",
"oi_attr_corrupted"
"oi_attr_corrupted",
"obj_size_oi_mismatch",
"ss_attr_missing",
"ss_attr_corrupted"
]
},
"minItems": 0,
@ -104,6 +107,9 @@
"osd": {
"type": "integer"
},
"primary": {
"type": "boolean"
},
"size": {
"type": "integer"
},
@ -129,7 +135,10 @@
"ec_hash_error",
"ec_size_error",
"oi_attr_missing",
"oi_attr_corrupted"
"oi_attr_corrupted",
"obj_size_oi_mismatch",
"ss_attr_missing",
"ss_attr_corrupted"
]
},
"minItems": 0,
@ -164,6 +173,7 @@
},
"required": [
"osd",
"primary",
"errors"
]
}

View File

@ -6,6 +6,8 @@
:maxdepth: 1
../../man/8/ceph-disk.rst
../../man/8/ceph-volume.rst
../../man/8/ceph-volume-systemd.rst
../../man/8/ceph.rst
../../man/8/ceph-deploy.rst
../../man/8/ceph-rest-api.rst

View File

@ -36,6 +36,13 @@ Ceph clusters.
configuration file of the same name (e.g. /etc/ceph/remote.conf). See the
`ceph-conf`_ documentation for how to configure multiple clusters.
.. note:: Images in a given pool will be mirrored to a pool with the same name
on the remote cluster. Images using a separate data-pool will use a data-pool
with the same name on the remote cluster. E.g., if an image being mirrored is
in the ``rbd`` pool on the local cluster and using a data-pool called
``rbd-ec``, pools called ``rbd`` and ``rbd-ec`` must exist on the remote
cluster and will be used for mirroring the image.
Enable Mirroring
----------------

View File

@ -25,8 +25,8 @@
echo "Scheduling " $2 " branch"
if [ $2 = "master" ] ; then
# run master branch with --newest option looking for good sha1 7 builds back
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 --newest 7 -e $5 $6
# run master branch with --newest option looking for good sha1 7 builds back with /999 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/999 --newest 7 -e $5 $6
elif [ $2 = "hammer" ] ; then
# run hammer branch with less jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/56 -e $5 $6
@ -34,11 +34,11 @@ elif [ $2 = "jewel" ] ; then
# run jewel branch with /40 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6
elif [ $2 = "kraken" ] ; then
# run kraken branch with /40 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6
# run kraken branch with /999 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/999 -e $5 $6
elif [ $2 = "luminous" ] ; then
# run luminous branch with /40 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/40 -e $5 $6
# run luminous branch with /999 jobs
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/999 -e $5 $6
else
# run NON master branches without --newest
teuthology-suite -v -c $2 -m $3 -k distro -s $4 --subset $(echo "(($(date +%U) % 4) * 7) + $1" | bc)/28 -e $5 $6

View File

@ -1,25 +1,123 @@
#!/bin/sh -ex
#!/usr/bin/env bash
set -e
if [ ! -e Makefile ]; then
if [ ! -e Makefile -o ! -d bin ]; then
echo 'run this from the build dir'
exit 1
fi
if [ ! -d /tmp/ceph-disk-virtualenv -o ! -d /tmp/ceph-detect-init-virtualenv ]; then
echo '/tmp/*-virtualenv directories not built. Please run "make check" first.'
exit 1
fi
if [ `uname` = FreeBSD ]; then
# otherwise module prettytable will not be found
export PYTHONPATH=/usr/local/lib/python2.7/site-packages
exec_mode=+111
KERNCORE="kern.corefile"
COREPATTERN="core.%N.%P"
else
export PYTHONPATH=/usr/lib/python2.7/dist-packages
exec_mode=/111
KERNCORE="kernel.core_pattern"
COREPATTERN="core.%e.%p.%t"
fi
for f in `find ../qa/standalone -perm $exec_mode -type f`
do
echo '--- $f ---'
PATH=$PATH:bin \
CEPH_ROOT=.. \
CEPH_LIB=lib \
$f || exit 1
done
function finish() {
if [ -n "$precore" ]; then
sudo sysctl -w ${KERNCORE}=${precore}
fi
exit 0
}
trap finish TERM HUP INT
PATH=$(pwd)/bin:$PATH
# TODO: Use getopts
dryrun=false
if [[ "$1" = "--dry-run" ]]; then
dryrun=true
shift
fi
all=false
if [ "$1" = "" ]; then
all=true
fi
select=("$@")
location="../qa/standalone"
count=0
errors=0
userargs=""
precore="$(sysctl -n $KERNCORE)"
# If corepattern already set, avoid having to use sudo
if [ "$precore" = "$COREPATTERN" ]; then
precore=""
else
sudo sysctl -w ${KERNCORE}=${COREPATTERN}
fi
ulimit -c unlimited
for f in $(cd $location ; find . -perm $exec_mode -type f)
do
f=$(echo $f | sed 's/\.\///')
# This is tested with misc/test-ceph-helpers.sh
if [[ "$f" = "ceph-helpers.sh" ]]; then
continue
fi
if [[ "$all" = "false" ]]; then
found=false
for c in "${!select[@]}"
do
# Get command and any arguments of subset of tests to run
allargs="${select[$c]}"
arg1=$(echo "$allargs" | cut --delimiter " " --field 1)
# Get user args for this selection for use below
userargs="$(echo $allargs | cut -s --delimiter " " --field 2-)"
if [[ "$arg1" = $(basename $f) ]]; then
found=true
break
fi
if [[ "$arg1" = "$f" ]]; then
found=true
break
fi
done
if [[ "$found" = "false" ]]; then
continue
fi
fi
# Don't run test-failure.sh unless explicitly specified
if [ "$all" = "true" -a "$f" = "special/test-failure.sh" ]; then
continue
fi
cmd="$location/$f $userargs"
count=$(expr $count + 1)
echo "--- $cmd ---"
if [[ "$dryrun" != "true" ]]; then
if ! PATH=$PATH:bin \
CEPH_ROOT=.. \
CEPH_LIB=lib \
LOCALRUN=yes \
$cmd ; then
echo "$f .............. FAILED"
errors=$(expr $errors + 1)
fi
fi
done
if [ -n "$precore" ]; then
sudo sysctl -w ${KERNCORE}=${precore}
fi
if [ "$errors" != "0" ]; then
echo "$errors TESTS FAILED, $count TOTAL TESTS"
exit 1
fi
echo "ALL $count TESTS PASSED"
exit 0

View File

@ -12,7 +12,12 @@ You can run them in a git checkout + build directory as well:
* The qa/run-standalone.sh will run all of them in sequence. This is slow
since there is no parallelism.
* You can run an individual script by passing these environment args. For
example, if you are in the build/ directory,
* You can run individual script(s) by specifying the basename or path below
qa/standalone as arguments to qa/run-standalone.sh.
PATH=$PATH:bin CEPH_ROOT=.. CEPH_LIB=lib ../qa/standalone/mon/misc.sh
../qa/run-standalone.sh misc.sh osd/osd-dup.sh
* You can pass arguments to a selected test by quoting the test name together
  with its arguments as a single argument, as in the example below.
../qa/run-standalone.sh "test-ceph-helpers.sh test_get_last_scrub_stamp"

View File

@ -33,6 +33,7 @@ fi
if [ `uname` = FreeBSD ]; then
SED=gsed
DIFFCOLOPTS=""
KERNCORE="kern.corefile"
else
SED=sed
termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/')
@ -40,6 +41,7 @@ else
termwidth="-W ${termwidth}"
fi
DIFFCOLOPTS="-y $termwidth"
KERNCORE="kernel.core_pattern"
fi
EXTRA_OPTS=""
@ -152,13 +154,43 @@ function test_setup() {
#
function teardown() {
local dir=$1
local dumplogs=$2
kill_daemons $dir KILL
if [ `uname` != FreeBSD ] \
&& [ $(stat -f -c '%T' .) == "btrfs" ]; then
__teardown_btrfs $dir
fi
local cores="no"
local pattern="$(sysctl -n $KERNCORE)"
# See if we have apport core handling
if [ "${pattern:0:1}" = "|" ]; then
# TODO: Where can we get the dumps?
# Not sure where the dumps really are so this will look in the CWD
pattern=""
fi
# Locally core files start with "core", while on teuthology they end with "core"
if ls $(dirname $pattern) | grep -q '^core\|core$' ; then
cores="yes"
if [ -n "$LOCALRUN" ]; then
mkdir /tmp/cores.$$ 2> /dev/null || true
for i in $(ls $(dirname $(sysctl -n $KERNCORE)) | grep '^core\|core$'); do
mv $i /tmp/cores.$$
done
fi
fi
if [ "$cores" = "yes" -o "$dumplogs" = "1" ]; then
display_logs $dir
fi
rm -fr $dir
rm -rf $(get_asok_dir)
if [ "$cores" = "yes" ]; then
echo "ERROR: Failure due to cores found"
if [ -n "$LOCALRUN" ]; then
echo "Find saved core files in /tmp/cores.$$"
fi
return 1
fi
return 0
}
function __teardown_btrfs() {
@ -406,6 +438,7 @@ function run_mon() {
--id $id \
--mon-osd-full-ratio=.99 \
--mon-data-avail-crit=1 \
--mon-data-avail-warn=5 \
--paxos-propose-interval=0.1 \
--osd-crush-chooseleaf-type=0 \
$EXTRA_OPTS \
@ -472,10 +505,15 @@ function test_run_mon() {
function create_rbd_pool() {
ceph osd pool delete rbd rbd --yes-i-really-really-mean-it || return 1
ceph osd pool create rbd $PG_NUM || return 1
create_pool rbd $PG_NUM || return 1
rbd pool init rbd
}
function create_pool() {
ceph osd pool create "$@"
sleep 1
}
#######################################################################
function run_mgr() {
@ -1266,7 +1304,7 @@ function test_get_last_scrub_stamp() {
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
stamp=$(get_last_scrub_stamp 2.0)
stamp=$(get_last_scrub_stamp 1.0)
test -n "$stamp" || return 1
teardown $dir || return 1
}
@ -1466,9 +1504,9 @@ function test_repair() {
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
repair 2.0 || return 1
repair 1.0 || return 1
kill_daemons $dir KILL osd || return 1
! TIMEOUT=1 repair 2.0 || return 1
! TIMEOUT=1 repair 1.0 || return 1
teardown $dir || return 1
}
#######################################################################
@ -1506,9 +1544,9 @@ function test_pg_scrub() {
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
pg_scrub 2.0 || return 1
pg_scrub 1.0 || return 1
kill_daemons $dir KILL osd || return 1
! TIMEOUT=1 pg_scrub 2.0 || return 1
! TIMEOUT=1 pg_scrub 1.0 || return 1
teardown $dir || return 1
}
@ -1581,7 +1619,7 @@ function wait_for_scrub() {
local sname=${3:-last_scrub_stamp}
for ((i=0; i < $TIMEOUT; i++)); do
if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then
if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
return 0
fi
sleep 1
@ -1598,7 +1636,7 @@ function test_wait_for_scrub() {
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
local pgid=2.0
local pgid=1.0
ceph pg repair $pgid
local last_scrub=$(get_last_scrub_stamp $pgid)
wait_for_scrub $pgid "$last_scrub" || return 1
@ -1796,6 +1834,7 @@ function test_flush_pg_stats()
bytes_used=`ceph df detail --format=json | jq "$jq_filter.bytes_used"`
test $raw_bytes_used > 0 || return 1
test $raw_bytes_used == $bytes_used || return 1
teardown $dir
}
#######################################################################
@ -1840,10 +1879,9 @@ function main() {
if run $dir "$@" ; then
code=0
else
display_logs $dir
code=1
fi
teardown $dir || return 1
teardown $dir $code || return 1
return $code
}
@ -1858,7 +1896,7 @@ function run_tests() {
export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+=" --fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
export CEPH_CONF=/dev/null
@ -1866,13 +1904,17 @@ function run_tests() {
local dir=td/ceph-helpers
for func in $funcs ; do
$func $dir || return 1
if ! $func $dir; then
teardown $dir 1
return 1
fi
done
}
if test "$1" = TESTS ; then
shift
run_tests "$@"
exit $?
fi
# NOTE:
@ -1915,6 +1957,37 @@ function jq_success() {
return 1
}
function inject_eio() {
local pooltype=$1
shift
local which=$1
shift
local poolname=$1
shift
local objname=$1
shift
local dir=$1
shift
local shard_id=$1
shift
local -a initial_osds=($(get_osds $poolname $objname))
local osd_id=${initial_osds[$shard_id]}
if [ "$pooltype" != "ec" ]; then
shard_id=""
fi
set_config osd $osd_id filestore_debug_inject_read_err true || return 1
local loop=0
while ( CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.$osd_id) \
inject${which}err $poolname $objname $shard_id | grep -q Invalid ); do
loop=$(expr $loop + 1)
if [ $loop = "10" ]; then
return 1
fi
sleep 1
done
}
# Local Variables:
# compile-command: "cd ../../src ; make -j4 && ../qa/standalone/ceph-helpers.sh TESTS # test_get_config"
# End:

View File

@ -57,7 +57,7 @@ function create_erasure_coded_pool() {
ceph osd erasure-code-profile set myprofile \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 12 12 erasure myprofile \
create_pool $poolname 12 12 erasure myprofile \
|| return 1
wait_for_clean || return 1
}
@ -164,7 +164,7 @@ function TEST_rados_put_get_lrc_advanced() {
mapping=DD_ \
crush-steps='[ [ "chooseleaf", "osd", 0 ] ]' \
layers='[ [ "DDc", "" ] ]' || return 1
ceph osd pool create $poolname 12 12 erasure $profile \
create_pool $poolname 12 12 erasure $profile \
|| return 1
rados_put_get $dir $poolname || return 1
@ -182,7 +182,7 @@ function TEST_rados_put_get_lrc_kml() {
plugin=lrc \
k=4 m=2 l=3 \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 12 12 erasure $profile \
create_pool $poolname 12 12 erasure $profile \
|| return 1
rados_put_get $dir $poolname || return 1
@ -202,7 +202,7 @@ function TEST_rados_put_get_isa() {
ceph osd erasure-code-profile set profile-isa \
plugin=isa \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 1 1 erasure profile-isa \
create_pool $poolname 1 1 erasure profile-isa \
|| return 1
rados_put_get $dir $poolname || return 1
@ -222,7 +222,7 @@ function TEST_rados_put_get_jerasure() {
plugin=jerasure \
k=4 m=2 \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 12 12 erasure $profile \
create_pool $poolname 12 12 erasure $profile \
|| return 1
rados_put_get $dir $poolname || return 1
@ -242,7 +242,7 @@ function TEST_rados_put_get_shec() {
plugin=shec \
k=2 m=1 c=1 \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 12 12 erasure $profile \
create_pool $poolname 12 12 erasure $profile \
|| return 1
rados_put_get $dir $poolname || return 1
@ -318,7 +318,7 @@ function TEST_chunk_mapping() {
mapping='_DD' \
crush-steps='[ [ "choose", "osd", 0 ] ]' || return 1
ceph osd erasure-code-profile get remap-profile
ceph osd pool create remap-pool 12 12 erasure remap-profile \
create_pool remap-pool 12 12 erasure remap-profile \
|| return 1
#

View File

@ -60,7 +60,7 @@ function create_erasure_coded_pool() {
plugin=jerasure \
k=2 m=1 \
crush-failure-domain=osd || return 1
ceph osd pool create $poolname 1 1 erasure myprofile \
create_pool $poolname 1 1 erasure myprofile \
|| return 1
wait_for_clean || return 1
}
@ -142,22 +142,6 @@ function rados_put_get() {
rm $dir/ORIGINAL
}
function inject_eio() {
local objname=$1
shift
local dir=$1
shift
local shard_id=$1
shift
local poolname=pool-jerasure
local -a initial_osds=($(get_osds $poolname $objname))
local osd_id=${initial_osds[$shard_id]}
set_config osd $osd_id filestore_debug_inject_read_err true || return 1
CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.$osd_id) \
injectdataerr $poolname $objname $shard_id || return 1
}
function rados_get_data_eio() {
local dir=$1
shift
@ -170,11 +154,11 @@ function rados_get_data_eio() {
#
local poolname=pool-jerasure
local objname=obj-eio-$$-$shard_id
inject_eio $objname $dir $shard_id || return 1
inject_eio ec data $poolname $objname $dir $shard_id || return 1
rados_put_get $dir $poolname $objname $recovery || return 1
shard_id=$(expr $shard_id + 1)
inject_eio $objname $dir $shard_id || return 1
inject_eio ec data $poolname $objname $dir $shard_id || return 1
# Now 2 out of 3 shards get EIO, so should fail
rados_get $dir $poolname $objname fail || return 1
}

View File

@ -18,4 +18,4 @@
# GNU Library Public License for more details.
#
$CEPH_ROOT/qa/standalone/ceph-helpers.sh TESTS
$CEPH_ROOT/qa/standalone/ceph-helpers.sh TESTS "$@"

View File

@ -40,7 +40,7 @@ function TEST_osd_pool_get_set() {
setup $dir || return 1
run_mon $dir a || return 1
create_rbd_pool || return 1
ceph osd pool create $TEST_POOL 8
create_pool $TEST_POOL 8
local flag
for flag in nodelete nopgchange nosizechange write_fadvise_dontneed noscrub nodeep-scrub; do
@ -82,7 +82,7 @@ function TEST_osd_pool_get_set() {
! ceph osd pool set $TEST_POOL min_size 0 || return 1
local ecpool=erasepool
ceph osd pool create $ecpool 12 12 erasure default || return 1
create_pool $ecpool 12 12 erasure default || return 1
#erasure pool size=k+m, min_size=k
local size=$(ceph osd pool get $ecpool size|awk '{print $2}')
local min_size=$(ceph osd pool get $ecpool min_size|awk '{print $2}')

View File

@ -136,7 +136,7 @@ function TEST_put_get() {
run_osd $dir 1 || return 1
run_osd $dir 2 || return 1
ceph osd pool create hello 8 || return 1
create_pool hello 8 || return 1
echo "hello world" > $dir/hello
rados --pool hello put foo $dir/hello || return 1

View File

@ -98,7 +98,7 @@ function TEST_rm() {
grep "WRONG does not exist" || return 1
ceph osd erasure-code-profile set $profile || return 1
ceph osd pool create poolname 12 12 erasure $profile || return 1
create_pool poolname 12 12 erasure $profile || return 1
! ceph osd erasure-code-profile rm $profile > $dir/out 2>&1 || return 1
grep "poolname.*using.*$profile" $dir/out || return 1
ceph osd pool delete poolname poolname --yes-i-really-really-mean-it || return 1

View File

@ -34,7 +34,7 @@ function TEST_pool_quota() {
run_osd $dir 2 || return 1
local poolname=testquoa
ceph osd pool create $poolname 20
create_pool $poolname 20
local objects=`ceph df detail | grep -w $poolname|awk '{print $3}'`
local bytes=`ceph df detail | grep -w $poolname|awk '{print $4}'`

View File

@ -2,6 +2,8 @@
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
[ `uname` = FreeBSD ] && exit 0
function run() {
local dir=$1
shift
@ -38,7 +40,7 @@ function TEST_filestore_to_bluestore() {
sleep 5
ceph osd pool create foo 16
create_pool foo 16
# write some objects
rados bench -p foo 10 write -b 4096 --no-cleanup || return 1

View File

@ -0,0 +1,129 @@
#! /bin/bash
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
$func $dir || return 1
done
}
function TEST_recovery_scrub() {
local dir=$1
local poolname=test
TESTDATA="testdata.$$"
OSDS=8
PGS=32
OBJECTS=4
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=1 || return 1
run_mgr $dir x || return 1
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd || return 1
done
# Create a pool with $PGS pgs
create_pool $poolname $PGS $PGS
wait_for_clean || return 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1M count=50
for i in $(seq 1 $OBJECTS)
do
rados -p $poolname put obj${i} $TESTDATA
done
rm -f $TESTDATA
ceph osd pool set $poolname size 4
pids=""
for pg in $(seq 0 $(expr $PGS - 1))
do
run_in_background pids pg_scrub $poolid.$(echo "{ obase=16; $pg }" | bc | tr '[:upper:]' '[:lower:]')
done
ceph pg dump pgs
wait_background pids
return_code=$?
if [ $return_code -ne 0 ]; then return $return_code; fi
ERRORS=0
pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
pid=$(cat $pidfile)
if ! kill -0 $pid
then
echo "OSD crash occurred"
tail -100 $dir/osd.0.log
ERRORS=$(expr $ERRORS + 1)
fi
kill_daemons $dir || return 1
declare -a err_strings
err_strings[0]="not scheduling scrubs due to active recovery"
# Test with these two strings once the check in OSD::sched_scrub() is disabled
#err_strings[0]="handle_scrub_reserve_request: failed to reserve remotely"
#err_strings[1]="sched_scrub: failed to reserve locally"
for osd in $(seq 0 $(expr $OSDS - 1))
do
grep "failed to reserve\|not scheduling scrubs" $dir/osd.${osd}.log
done
for err_string in "${err_strings[@]}"
do
found=false
for osd in $(seq 0 $(expr $OSDS - 1))
do
if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
then
found=true
fi
done
if [ "$found" = "false" ]; then
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
teardown $dir || return 1
if [ $ERRORS != "0" ];
then
echo "TEST FAILED WITH $ERRORS ERRORS"
return 1
fi
echo "TEST PASSED"
return 0
}
main osd-recovery-scrub "$@"
# Local Variables:
# compile-command: "cd build ; make -j4 && \
# ../qa/run-standalone.sh osd-recovery-scrub.sh"

File diff suppressed because it is too large

View File

@ -46,7 +46,8 @@ function TEST_scrub_snaps() {
wait_for_clean || return 1
# Create a pool with a single pg
ceph osd pool create $poolname 1 1
create_pool $poolname 1 1
wait_for_clean || return 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
@ -449,15 +450,14 @@ EOF
err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 is missing in clone_size"
err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 is an unexpected clone"
err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 size 1032 != clone_size 1033"
err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 23 errors"
err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 22 errors"
err_strings[23]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head can't decode 'snapset' attr buffer"
err_strings[24]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:1 has no oi or legacy_snaps; cannot convert 1=[[]1[]]:[[]1[]].stray_clone_snaps=[{]1=[[]1[]][}]"
for i in `seq 0 ${#err_strings[@]}`
for err_string in "${err_strings[@]}"
do
if ! grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null;
if ! grep "$err_string" $dir/osd.0.log > /dev/null;
then
echo "Missing log message '${err_strings[$i]}'"
echo "Missing log message '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done

View File

@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -ex
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
function run() {
local dir=$1
shift
export CEPH_MON="127.0.0.1:7202" # git grep '\<7202\>' : there must be only one
export CEPH_ARGS
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
$func $dir || return 1
teardown $dir || return 1
done
}
function TEST_failure_log() {
local dir=$1
cat > $dir/test_failure.log << EOF
This is a fake log file
*
*
*
*
*
This ends the fake log file
EOF
# Test fails
return 1
}
function TEST_failure_core_only() {
local dir=$1
run_mon $dir a || return 1
kill_daemons $dir SEGV mon 5
return 0
}
main test_failure "$@"

View File

@ -2,3 +2,4 @@ overrides:
ceph:
log-whitelist:
- \(MDS_TRIM\)
- Behind on trimming

View File

@ -8,4 +8,6 @@ tasks:
- ceph:
skip_mgr_daemons: true
add_osds_to_crush: true
log-whitelist:
- required past_interval bounds are empty
- print: "**** done ceph"

View File

@ -24,6 +24,8 @@ overrides:
- scrub mismatch
- ScrubResult
- wrongly marked
- (MDS_FAILED)
- \(MDS_FAILED\)
- \(OBJECT_
- is unresponsive
conf:
fs: xfs

View File

@ -23,6 +23,7 @@ tasks:
- \(PG_
- Monitor daemon marked osd
- Behind on trimming
- is unresponsive
conf:
global:
mon warn on pool no app: false

View File

@ -1,11 +0,0 @@
meta:
- desc: |
generate read/write load with rados objects ranging from 1MB to 25MB
workload:
full_sequential:
- workunit:
branch: jewel
clients:
client.0:
- rados/load-gen-big.sh
- print: "**** done rados/load-gen-big.sh 2-workload"

View File

@ -8,4 +8,6 @@ tasks:
- ceph:
skip_mgr_daemons: true
add_osds_to_crush: true
log-whitelist:
- required past_interval bounds are empty
- print: "**** done ceph"

View File

@ -297,7 +297,6 @@ def build_ceph_cluster(ctx, config):
# are taking way more than a minute/monitor to form quorum, so lets
# try the next block which will wait up to 15 minutes to gatherkeys.
execute_ceph_deploy(mon_create_nodes)
execute_ceph_deploy(mgr_create)
# create-keys is explicit now
# http://tracker.ceph.com/issues/16036
@ -307,6 +306,9 @@ def build_ceph_cluster(ctx, config):
'--id', remote.shortname])
estatus_gather = execute_ceph_deploy(gather_keys)
execute_ceph_deploy(mgr_create)
if mds_nodes:
estatus_mds = execute_ceph_deploy(deploy_mds)
if estatus_mds != 0:

View File

@ -142,12 +142,12 @@ def create_pools(ctx, clients):
if ctx.rgw.ec_data_pool:
create_ec_pool(remote, data_pool, client, 64,
ctx.rgw.erasure_code_profile, cluster_name)
ctx.rgw.erasure_code_profile, cluster_name, 'rgw')
else:
create_replicated_pool(remote, data_pool, 64, cluster_name)
create_replicated_pool(remote, data_pool, 64, cluster_name, 'rgw')
if ctx.rgw.cache_pools:
create_cache_pool(remote, data_pool, data_pool + '.cache', 64,
64*1024*1024, cluster_name)
64*1024*1024, cluster_name, 'rgw')
log.debug('Pools created')
yield

View File

@ -409,9 +409,9 @@ def create_zone_pools(ctx, zone):
pool_name = pool_config['val']['data_pool']
if ctx.rgw.ec_data_pool:
create_ec_pool(gateway.remote, pool_name, zone.name, 64,
ctx.rgw.erasure_code_profile, cluster.name)
ctx.rgw.erasure_code_profile, cluster.name, 'rgw')
else:
create_replicated_pool(gateway.remote, pool_name, 64, cluster.name)
create_replicated_pool(gateway.remote, pool_name, 64, cluster.name, 'rgw')
def configure_zone_compression(zone, compression):
""" Set compression type in the zone's default-placement """

View File

@ -24,20 +24,28 @@ def rados(ctx, remote, cmd, wait=True, check_status=False):
else:
return proc
def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name="ceph"):
def create_ec_pool(remote, name, profile_name, pgnum, profile={}, cluster_name="ceph", application=None):
remote.run(args=['sudo', 'ceph'] +
cmd_erasure_code_profile(profile_name, profile) + ['--cluster', cluster_name])
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'create', name,
str(pgnum), str(pgnum), 'erasure', profile_name, '--cluster', cluster_name
])
if application:
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
])
def create_replicated_pool(remote, name, pgnum, cluster_name="ceph"):
def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None):
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum), '--cluster', cluster_name
])
if application:
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
])
def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph", application=None):
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum), '--cluster', cluster_name
])
@ -45,6 +53,10 @@ def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="
'sudo', 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name,
str(size), '--cluster', cluster_name
])
if application:
remote.run(args=[
'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application, '--cluster', cluster_name
])
def cmd_erasure_code_profile(profile_name, profile):
"""

View File

@ -26,6 +26,10 @@ ceph osd crush set-device-class ssd osd.0
ceph osd crush set-device-class hdd osd.1
ceph osd crush rule create-replicated foo-ssd default host ssd
ceph osd crush rule create-replicated foo-hdd default host hdd
ceph osd crush rule ls-by-class ssd | grep 'foo-ssd'
ceph osd crush rule ls-by-class ssd | expect_false grep 'foo-hdd'
ceph osd crush rule ls-by-class hdd | grep 'foo-hdd'
ceph osd crush rule ls-by-class hdd | expect_false grep 'foo-ssd'
ceph osd erasure-code-profile set ec-foo-ssd crush-device-class=ssd m=2 k=2
ceph osd pool create ec-foo 2 erasure ec-foo-ssd
@ -33,6 +37,16 @@ ceph osd pool rm ec-foo ec-foo --yes-i-really-really-mean-it
ceph osd crush rule ls | grep foo
ceph osd crush rule rename foo foo-asdf
ceph osd crush rule rename bar bar-asdf
ceph osd crush rule ls | grep 'foo-asdf'
ceph osd crush rule ls | grep 'bar-asdf'
ceph osd crush rule rm foo 2>&1 | grep 'does not exist'
ceph osd crush rule rm bar 2>&1 | grep 'does not exist'
ceph osd crush rule rename foo-asdf foo
ceph osd crush rule rename bar-asdf bar
ceph osd crush rule ls | expect_false grep 'foo-asdf'
ceph osd crush rule ls | expect_false grep 'bar-asdf'
ceph osd crush rule rm foo
ceph osd crush rule rm foo # idempotent
ceph osd crush rule rm bar

View File

@ -216,6 +216,24 @@ compare_images ${POOL} ${clone_image}
expect_failure "is non-primary" clone_image ${CLUSTER1} ${PARENT_POOL} \
${parent_image} ${parent_snap} ${POOL} ${clone_image}1
testlog "TEST: data pool"
dp_image=test_data_pool
create_image ${CLUSTER2} ${POOL} ${dp_image} 128 --data-pool ${PARENT_POOL}
data_pool=$(get_image_data_pool ${CLUSTER2} ${POOL} ${dp_image})
test "${data_pool}" = "${PARENT_POOL}"
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${dp_image}
data_pool=$(get_image_data_pool ${CLUSTER1} ${POOL} ${dp_image})
test "${data_pool}" = "${PARENT_POOL}"
create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap1'
write_image ${CLUSTER2} ${POOL} ${dp_image} 100
create_snapshot ${CLUSTER2} ${POOL} ${dp_image} 'snap2'
write_image ${CLUSTER2} ${POOL} ${dp_image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${dp_image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${dp_image} 'up+replaying' 'master_position'
compare_images ${POOL} ${dp_image}@snap1
compare_images ${POOL} ${dp_image}@snap2
compare_images ${POOL} ${dp_image}
testlog "TEST: disable mirroring / delete non-primary image"
image2=test2
image3=test3

View File

@ -867,6 +867,16 @@ request_resync_image()
rbd --cluster=${cluster} -p ${pool} mirror image resync ${image}
}
get_image_data_pool()
{
local cluster=$1
local pool=$2
local image=$3
rbd --cluster ${cluster} -p ${pool} info ${image} |
awk '$1 == "data_pool:" {print $2}'
}
#
# Main
#

View File

@ -1,2 +1,2 @@
a5f84b37668fc8e03165aaf5cbb380c78e4deba4
v12.1.4
32ce2a3ae5239ee33d6150705cdb24d43bab910c
v12.2.0

View File

@ -3,11 +3,45 @@ API for CRUD lvm tag operations. Follows the Ceph LVM tag naming convention
that prefixes tags with ``ceph.`` and uses ``=`` for assignment, and provides
set of utilities for interacting with LVM.
"""
import json
from ceph_volume import process
from ceph_volume.exceptions import MultipleLVsError, MultipleVGsError
def _output_parser(output, fields):
"""
Newer versions of LVM allow ``--reportformat=json``, but older versions,
like the one included in Xenial, do not. LVM has the ability to filter and
format its output so we assume the output will be in a format this parser
can handle (using ';' as a delimiter)
:param fields: A string, possibly using ',' to group many items, as it
would be used on the CLI
:param output: The CLI output from the LVM call
"""
field_items = fields.split(',')
report = []
for line in output:
# clear the leading/trailing whitespace
line = line.strip()
# remove the extra '"' in each field
line = line.replace('"', '')
# prevent moving forward with empty contents
if not line:
continue
# splitting on ';' because that is what the lvm call uses as
# '--separator'
output_items = [i.strip() for i in line.split(';')]
# map the output to the fields
report.append(
dict(zip(field_items, output_items))
)
return report
def parse_tags(lv_tags):
"""
Return a dictionary mapping of all the tags associated with
@ -37,49 +71,22 @@ def parse_tags(lv_tags):
def get_api_vgs():
"""
Return the list of group volumes available in the system using flags to include common
metadata associated with them
Return the list of volume groups available in the system using flags to
include common metadata associated with them
Command and sample JSON output, should look like::
Command and sample delimited output should look like::
$ sudo vgs --reportformat=json
{
"report": [
{
"vg": [
{
"vg_name":"VolGroup00",
"pv_count":"1",
"lv_count":"2",
"snap_count":"0",
"vg_attr":"wz--n-",
"vg_size":"38.97g",
"vg_free":"0 "},
{
"vg_name":"osd_vg",
"pv_count":"3",
"lv_count":"1",
"snap_count":"0",
"vg_attr":"wz--n-",
"vg_size":"32.21g",
"vg_free":"9.21g"
}
]
}
]
}
$ sudo vgs --noheadings --separator=';' \
-o vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free
ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m
osd_vg;3;1;0;wz--n-;29.21g;9.21g
"""
fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
stdout, stderr, returncode = process.call(
[
'sudo', 'vgs', '--reportformat=json'
]
['sudo', 'vgs', '--noheadings', '--separator=";"', '-o', fields]
)
report = json.loads(''.join(stdout))
for report_item in report.get('report', []):
# is it possible to get more than one item in "report" ?
return report_item['vg']
return []
return _output_parser(stdout, fields)
def get_api_lvs():
@ -87,37 +94,18 @@ def get_api_lvs():
Return the list of logical volumes available in the system using flags to include common
metadata associated with them
Command and sample JSON output, should look like::
Command and delimited output should look like::
$ sudo lvs -o lv_tags,lv_path,lv_name,vg_name --reportformat=json
{
"report": [
{
"lv": [
{
"lv_tags":"",
"lv_path":"/dev/VolGroup00/LogVol00",
"lv_name":"LogVol00",
"vg_name":"VolGroup00"},
{
"lv_tags":"ceph.osd_fsid=aaa-fff-0000,ceph.osd_fsid=aaa-fff-bbbb,ceph.osd_id=0",
"lv_path":"/dev/osd_vg/OriginLV",
"lv_name":"OriginLV",
"vg_name":"osd_vg"
}
]
}
]
}
$ sudo lvs --noheadings --separator=';' -o lv_tags,lv_path,lv_name,vg_name
;/dev/ubuntubox-vg/root;root;ubuntubox-vg
;/dev/ubuntubox-vg/swap_1;swap_1;ubuntubox-vg
"""
fields = 'lv_tags,lv_path,lv_name,vg_name'
stdout, stderr, returncode = process.call(
['sudo', 'lvs', '-o', 'lv_tags,lv_path,lv_name,vg_name', '--reportformat=json'])
report = json.loads(''.join(stdout))
for report_item in report.get('report', []):
# is it possible to get more than one item in "report" ?
return report_item['lv']
return []
['sudo', 'lvs', '--noheadings', '--separator=";"', '-o', fields]
)
return _output_parser(stdout, fields)
def get_lv(lv_name=None, vg_name=None, lv_path=None, lv_tags=None):

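A minimal standalone sketch of the ';'-delimited parsing introduced above (illustrative only; the helper name is made up and the sample lines are taken from the vgs docstring in this hunk):

# Sketch of what _output_parser does with vgs/lvs-style delimited output.
def output_parser(lines, fields):
    field_items = fields.split(',')
    report = []
    for line in lines:
        line = line.strip().replace('"', '')       # drop padding and quoting
        if not line:                                # skip empty lines
            continue
        values = [item.strip() for item in line.split(';')]
        report.append(dict(zip(field_items, values)))
    return report

fields = 'vg_name,pv_count,lv_count,snap_count,vg_attr,vg_size,vg_free'
sample = [
    '  ubuntubox-vg;1;2;0;wz--n-;299.52g;12.00m',
    '  osd_vg;3;1;0;wz--n-;29.21g;9.21g',
]
for vg in output_parser(sample, fields):
    print(vg['vg_name'], vg['vg_free'])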
View File

@ -1,3 +1,4 @@
from ceph_volume.util import arg_validators
import argparse
@ -14,12 +15,13 @@ def common_parser(prog, description):
required_args = parser.add_argument_group('required arguments')
parser.add_argument(
'--journal',
help='A logical group name, path to a logical volume, or path to a device',
help='A logical volume (vg_name/lv_name), or path to a device',
)
required_args.add_argument(
'--data',
required=True,
help='A logical group name or a path to a logical volume',
type=arg_validators.LVPath(),
help='A logical volume (vg_name/lv_name) for OSD data',
)
parser.add_argument(
'--journal-size',

View File

@ -28,28 +28,18 @@ class Create(object):
all the metadata to the logical volumes using LVM tags, and starting
the OSD daemon.
Most basic Usage looks like (journal will be collocated from the same volume group):
ceph-volume lvm create --data {volume group name}
Example calls for supported scenarios:
Dedicated volume group for Journal(s)
-------------------------------------
Filestore
---------
Existing logical volume (lv) or device:
ceph-volume lvm create --data {logical volume} --journal /path/to/{lv}|{device}
ceph-volume lvm create --filestore --data {vg name/lv name} --journal /path/to/device
Or:
ceph-volume lvm create --data {data volume group} --journal {journal volume group}
Collocated (same group) for data and journal
--------------------------------------------
ceph-volume lvm create --data {volume group}
ceph-volume lvm create --filestore --data {vg name/lv name} --journal {vg name/lv name}
""")
parser = create_parser(

View File

@ -9,20 +9,6 @@ from . import api
from .common import prepare_parser
def canonical_device_path(device):
"""
Ensure that a device is canonical (full path) and that it exists so that
it can be used throughout the prepare/activate process
"""
# FIXME: this is obviously super naive
inferred = os.path.join('/dev', device)
if os.path.exists(os.path.abspath(device)):
return device
elif os.path.exists(inferred):
return inferred
raise RuntimeError('Selected device does not exist: %s' % device)
def prepare_filestore(device, journal, secrets, id_=None, fsid=None):
"""
:param device: The name of the volume group or lvm to work with
@ -65,6 +51,19 @@ class Prepare(object):
def __init__(self, argv):
self.argv = argv
def get_journal_lv(self, argument):
"""
Perform some parsing of the value of ``--journal`` so that the process
can determine correctly if it got a device path or an lv
:param argument: The value of ``--journal``, that will need to be split
to retrieve the actual lv
"""
try:
vg_name, lv_name = argument.split('/')
except (ValueError, AttributeError):
return None
return api.get_lv(lv_name=lv_name, vg_name=vg_name)
@decorators.needs_root
def prepare(self, args):
# FIXME we don't allow re-using a keyring, we always generate one for the
@ -78,66 +77,40 @@ class Prepare(object):
#osd_id = args.osd_id or prepare_utils.create_id(fsid)
# allow re-using an id, in case a prepare failed
osd_id = args.osd_id or prepare_utils.create_id(fsid, json.dumps(secrets))
journal_name = "journal_%s" % fsid
osd_name = "osd_%s" % fsid
vg_name, lv_name = args.data.split('/')
if args.filestore:
data_vg = api.get_vg(vg_name=args.data)
data_lv = api.get_lv(lv_name=args.data)
journal_vg = api.get_vg(vg_name=args.journal)
journal_lv = api.get_lv(lv_name=args.journal)
journal_device = None
# it is possible to pass a device as a journal that is not
# an actual logical volume (or group)
if not args.journal:
if data_lv:
raise RuntimeError('--journal is required when not using a vg for OSD data')
# collocated: carve out the journal from the data vg
if data_vg:
journal_lv = api.create_lv(
name=journal_name,
group=data_vg.name,
size=args.journal_size,
osd_fsid=fsid,
osd_id=osd_id,
type='journal',
cluster_fsid=cluster_fsid
)
data_lv = api.get_lv(lv_name=lv_name, vg_name=vg_name)
# if a volume group was defined for the journal create that first
if journal_vg:
journal_lv = api.create_lv(
name=journal_name,
group=args.journal,
size=args.journal_size,
osd_fsid=fsid,
osd_id=osd_id,
type='journal',
cluster_fsid=cluster_fsid
)
if journal_lv:
journal_device = journal_lv.lv_path
# The journal is probably a device, not in LVM
elif args.journal:
journal_device = canonical_device_path(args.journal)
# At this point we must have a journal_lv or a journal device
# now create the osd from the group if that was found
if data_vg:
# XXX make sure that a there aren't more OSDs than physical
# devices from this volume group
data_lv = api.create_lv(
name=osd_name,
group=args.data,
osd_fsid=fsid,
osd_id=osd_id,
type='data',
journal_device=journal_device,
cluster_fsid=cluster_fsid
)
# we must have either an existing data_lv or a newly created, so lets make
# sure that the tags are correct
if not data_lv:
raise RuntimeError('no data logical volume found with: %s' % args.data)
if not args.journal:
raise RuntimeError('--journal is required when using --filestore')
journal_device = None
journal_lv = self.get_journal_lv(args.journal)
# check if we have an actual path to a device, which is allowed
if not journal_lv:
if os.path.exists(args.journal):
journal_device = args.journal
else:
raise RuntimeError(
'--journal specified an invalid or non-existent device: %s' % args.journal
)
# Otherwise the journal_device is the path to the lv
else:
journal_device = journal_lv.lv_path
journal_lv.set_tags({
'ceph.type': 'journal',
'ceph.osd_fsid': fsid,
'ceph.osd_id': osd_id,
'ceph.cluster_fsid': cluster_fsid,
'ceph.journal_device': journal_device,
'ceph.data_device': data_lv.lv_path,
})
data_lv.set_tags({
'ceph.type': 'data',
'ceph.osd_fsid': fsid,

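The new get_journal_lv helper above only treats --journal as a logical volume when the value splits cleanly into 'vg/lv'; otherwise prepare() falls back to checking for a device path. A minimal standalone sketch of that decision, with a fake lookup standing in for api.get_lv (the names below are illustrative, not the ceph-volume API):

import os

def split_vg_lv(value):
    # Return (vg, lv) when the value looks like 'vg/lv', otherwise None.
    try:
        vg_name, lv_name = value.split('/')
    except (ValueError, AttributeError):
        return None
    return (vg_name, lv_name) if vg_name and lv_name else None

def resolve_journal(value, lookup_lv):
    # Prefer an existing lv, then fall back to an existing device path.
    parts = split_vg_lv(value)
    lv = lookup_lv(*parts) if parts else None   # stand-in for api.get_lv(lv_name=..., vg_name=...)
    if lv:
        return 'lv', lv
    if value and os.path.exists(value):
        return 'device', value
    raise RuntimeError('--journal specified an invalid or non-existent device: %s' % value)

# e.g. resolve_journal('journals/journal1', fake_lookup) or resolve_journal('/dev/sdc1', fake_lookup)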
View File

@ -57,7 +57,6 @@ def main(args=None):
Expected input is similar to::
['/path/to/ceph-volume-systemd', '<osd id>-<osd uuid>-<device type>']
['/path/to/ceph-volume-systemd', '<type>-<extra metadata>']
For example::

View File

@ -24,50 +24,53 @@ class TestParseTags(object):
class TestGetAPIVgs(object):
def test_report_is_emtpy(self, monkeypatch):
monkeypatch.setattr(api.process, 'call', lambda x: ('{}', '', 0))
monkeypatch.setattr(api.process, 'call', lambda x: ('\n\n', '', 0))
assert api.get_api_vgs() == []
def test_report_has_stuff(self, monkeypatch):
report = '{"report":[{"vg":[{"vg_name":"VolGroup00"}]}]}'
report = [' VolGroup00']
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
def test_report_has_stuff_with_empty_attrs(self, monkeypatch):
report = [' VolGroup00 ;;;;;;9g']
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
result = api.get_api_vgs()[0]
assert len(result.keys()) == 7
assert result['vg_name'] == 'VolGroup00'
assert result['vg_free'] == '9g'
def test_report_has_multiple_items(self, monkeypatch):
report = '{"report":[{"vg":[{"vg_name":"VolGroup00"},{"vg_name":"ceph_vg"}]}]}'
report = [' VolGroup00;;;;;;;', ' ceph_vg;;;;;;;']
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}, {'vg_name': 'ceph_vg'}]
def test_does_not_get_poluted_with_non_vg_items(self, monkeypatch):
report = '{"report":[{"vg":[{"vg_name":"VolGroup00"}],"lv":[{"lv":"1"}]}]}'
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_vgs() == [{'vg_name': 'VolGroup00'}]
result = api.get_api_vgs()
assert result[0]['vg_name'] == 'VolGroup00'
assert result[1]['vg_name'] == 'ceph_vg'
class TestGetAPILvs(object):
def test_report_is_emtpy(self, monkeypatch):
monkeypatch.setattr(api.process, 'call', lambda x: ('{}', '', 0))
monkeypatch.setattr(api.process, 'call', lambda x: ('', '', 0))
assert api.get_api_lvs() == []
def test_report_has_stuff(self, monkeypatch):
report = '{"report":[{"lv":[{"lv_name":"VolGroup00"}]}]}'
report = [' ;/path;VolGroup00;root']
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_lvs() == [{'lv_name': 'VolGroup00'}]
result = api.get_api_lvs()
assert result[0]['lv_name'] == 'VolGroup00'
def test_report_has_multiple_items(self, monkeypatch):
report = '{"report":[{"lv":[{"lv_name":"VolName"},{"lv_name":"ceph_lv"}]}]}'
report = [' ;/path;VolName;root', ';/dev/path;ceph_lv;ceph_vg']
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_lvs() == [{'lv_name': 'VolName'}, {'lv_name': 'ceph_lv'}]
def test_does_not_get_poluted_with_non_lv_items(self, monkeypatch):
report = '{"report":[{"lv":[{"lv_name":"VolName"}],"vg":[{"vg":"1"}]}]}'
monkeypatch.setattr(api.process, 'call', lambda x: (report, '', 0))
assert api.get_api_lvs() == [{'lv_name': 'VolName'}]
result = api.get_api_lvs()
assert result[0]['lv_name'] == 'VolName'
assert result[1]['lv_name'] == 'ceph_lv'
@pytest.fixture
def volumes(monkeypatch):
monkeypatch.setattr(process, 'call', lambda x: ('{}', '', 0))
monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
volumes = api.Volumes()
volumes._purge()
return volumes
@ -75,7 +78,7 @@ def volumes(monkeypatch):
@pytest.fixture
def volume_groups(monkeypatch):
monkeypatch.setattr(process, 'call', lambda x: ('{}', '', 0))
monkeypatch.setattr(process, 'call', lambda x: ('', '', 0))
vgs = api.VolumeGroups()
vgs._purge()
return vgs

View File

@ -37,6 +37,21 @@ class TestPrepare(object):
assert 'A logical group name or a path' in stdout
class TestGetJournalLV(object):
@pytest.mark.parametrize('arg', ['', '///', None, '/dev/sda1'])
def test_no_journal_on_invalid_path(self, monkeypatch, arg):
monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: False)
prepare = lvm.prepare.Prepare([])
assert prepare.get_journal_lv(arg) is None
def test_no_journal_lv_found(self, monkeypatch):
# patch it with 0 so we know we are getting to get_lv
monkeypatch.setattr(lvm.prepare.api, 'get_lv', lambda **kw: 0)
prepare = lvm.prepare.Prepare([])
assert prepare.get_journal_lv('vg/lv') == 0
class TestActivate(object):
def test_main_spits_help_with_no_arguments(self, capsys):

View File

@ -11,7 +11,9 @@ osd_scenario: lvm
copy_admin_key: true
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
test_volume: /dev/sdc
- data: test_volume
journal: /dev/sdc
data_vg: test_group
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }

View File

@ -11,7 +11,9 @@ osd_scenario: lvm
copy_admin_key: true
# test-volume is created by tests/functional/lvm_setup.yml from /dev/sda
lvm_volumes:
test_volume: /dev/sdc
- data: test_volume
journal: /dev/sdc
data_vg: test_group
os_tuning_params:
- { name: kernel.pid_max, value: 4194303 }
- { name: fs.file-max, value: 26234859 }

View File

@ -0,0 +1,24 @@
import pytest
import argparse
from ceph_volume.util import arg_validators
invalid_lv_paths = [
'', 'lv_name', '///', '/lv_name', 'lv_name/',
'/dev/lv_group/lv_name'
]
class TestLVPath(object):
def setup(self):
self.validator = arg_validators.LVPath()
@pytest.mark.parametrize('path', invalid_lv_paths)
def test_no_slash_is_an_error(self, path):
with pytest.raises(argparse.ArgumentError):
self.validator(path)
def test_is_valid(self):
path = 'vg/lv'
assert self.validator(path) == path

View File

@ -0,0 +1,29 @@
import argparse
class LVPath(object):
"""
A simple validator to ensure that a logical volume is specified like::
<vg name>/<lv name>
Because for LVM it is better to be explicit about which volume group an lv
belongs to.
"""
def __call__(self, string):
error = None
try:
vg, lv = string.split('/')
except ValueError:
error = "Logical volume must be specified as 'volume_group/logical_volume' but got: %s" % string
raise argparse.ArgumentError(None, error)
if not vg:
error = "Didn't specify a volume group like 'volume_group/logical_volume', got: %s" % string
if not lv:
error = "Didn't specify a logical volume like 'volume_group/logical_volume', got: %s" % string
if error:
raise argparse.ArgumentError(None, error)
return string

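For context, this is roughly how the validator plugs into argparse; the parser and option below are illustrative, and the only assumptions carried over from the hunks above are the arg_validators import path and the 'vg/lv' format:

import argparse
from ceph_volume.util import arg_validators   # import path as used in the test hunk above

parser = argparse.ArgumentParser(prog='example')
parser.add_argument('--data', type=arg_validators.LVPath())

parser.parse_args(['--data', 'vg0/data-lv'])            # accepted: 'volume_group/logical_volume'
try:
    parser.parse_args(['--data', '/dev/vg0/data-lv'])   # rejected: full paths do not validate
except SystemExit:
    pass                                                # argparse reports the ArgumentError and exits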
View File

@ -20,6 +20,7 @@
#include "common/config.h"
#include "common/ceph_argparse.h"
#include "common/errno.h"
#include "common/pick_address.h"
#include "global/global_init.h"
#include "mgr/MgrStandby.h"
@ -52,6 +53,8 @@ int main(int argc, const char **argv)
usage();
}
pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
global_init_daemonize(g_ceph_context);
global_init_chdir(g_ceph_context);
common_init_finish(g_ceph_context);

View File

@ -9378,11 +9378,11 @@ int Client::chdir(const char *relpath, std::string &new_cwd,
cwd.swap(in);
ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
getcwd(new_cwd, perms);
_getcwd(new_cwd, perms);
return 0;
}
void Client::getcwd(string& dir, const UserPerm& perms)
void Client::_getcwd(string& dir, const UserPerm& perms)
{
filepath path;
ldout(cct, 10) << "getcwd " << *cwd << dendl;
@ -9422,6 +9422,12 @@ void Client::getcwd(string& dir, const UserPerm& perms)
dir += path.get_path();
}
void Client::getcwd(string& dir, const UserPerm& perms)
{
Mutex::Locker l(client_lock);
_getcwd(dir, perms);
}
int Client::statfs(const char *path, struct statvfs *stbuf,
const UserPerm& perms)
{

View File

@ -956,6 +956,7 @@ public:
// crap
int chdir(const char *s, std::string &new_cwd, const UserPerm& perms);
void _getcwd(std::string& cwd, const UserPerm& perms);
void getcwd(std::string& cwd, const UserPerm& perms);
// namespace ops

View File

@ -112,14 +112,15 @@ static int getgroups(fuse_req_t req, gid_t **sgids)
return 0;
}
*sgids = (gid_t*)malloc(c*sizeof(**sgids));
if (!*sgids) {
gid_t *gids = new (std::nothrow) gid_t[c];
if (!gids) {
return -ENOMEM;
}
c = fuse_req_getgroups(req, c, *sgids);
c = fuse_req_getgroups(req, c, gids);
if (c < 0) {
free(*sgids);
return c;
delete gids;
} else {
*sgids = gids;
}
return c;
#endif

View File

@ -200,9 +200,7 @@ static int cls_log_list(cls_method_context_t hctx, bufferlist *in, bufferlist *o
}
}
if (ret.truncated) {
ret.marker = marker;
}
ret.marker = marker;
::encode(ret, *out);

View File

@ -144,6 +144,16 @@ public:
}
do_queues();
}
/**
* Has reservations
*
* Return true if there are reservations in progress
*/
bool has_reservation() {
Mutex::Locker l(lock);
return !in_progress.empty();
}
static const unsigned MAX_PRIORITY = (unsigned)-1;
};

View File

@ -224,11 +224,17 @@ void LogChannel::do_log(clog_type prio, const std::string& s)
// seq and who should be set for syslog/graylog/log_to_mon
e.who = parent->get_myinst();
e.name = parent->get_myname();
e.seq = parent->get_next_seq();
e.prio = prio;
e.msg = s;
e.channel = get_log_channel();
// log to monitor?
if (log_to_monitors) {
e.seq = parent->queue(e);
} else {
e.seq = parent->get_next_seq();
}
// log to syslog?
if (do_log_to_syslog()) {
ldout(cct,0) << __func__ << " log to syslog" << dendl;
@ -240,11 +246,6 @@ void LogChannel::do_log(clog_type prio, const std::string& s)
ldout(cct,0) << __func__ << " log to graylog" << dendl;
graylog->log_log_entry(&e);
}
// log to monitor?
if (log_to_monitors) {
parent->queue(e);
}
}
Message *LogClient::get_mon_log_message(bool flush)
@ -268,8 +269,8 @@ bool LogClient::are_pending()
Message *LogClient::_get_mon_log_message()
{
assert(log_lock.is_locked());
if (log_queue.empty())
return NULL;
if (log_queue.empty())
return NULL;
// only send entries that haven't been sent yet during this mon
// session! monclient needs to call reset_session() on mon session
@ -324,6 +325,7 @@ void LogClient::_send_to_mon()
version_t LogClient::queue(LogEntry &entry)
{
Mutex::Locker l(log_lock);
entry.seq = ++last_log;
log_queue.push_back(entry);
if (is_mon) {
@ -335,6 +337,7 @@ version_t LogClient::queue(LogEntry &entry)
uint64_t LogClient::get_next_seq()
{
Mutex::Locker l(log_lock);
return ++last_log;
}

View File

@ -245,7 +245,7 @@ private:
bool is_mon;
Mutex log_lock;
version_t last_log_sent;
std::atomic<uint64_t> last_log;
version_t last_log;
std::deque<LogEntry> log_queue;
std::map<std::string, LogChannelRef> channels;

View File

@ -27,7 +27,7 @@ static void netmask_ipv4(const struct in_addr *addr,
}
const struct sockaddr *find_ipv4_in_subnet(const struct ifaddrs *addrs,
const struct ifaddrs *find_ipv4_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr_in *net,
unsigned int prefix_len) {
struct in_addr want, temp;
@ -49,7 +49,7 @@ const struct sockaddr *find_ipv4_in_subnet(const struct ifaddrs *addrs,
netmask_ipv4(cur, prefix_len, &temp);
if (temp.s_addr == want.s_addr) {
return addrs->ifa_addr;
return addrs;
}
}
@ -71,7 +71,7 @@ static void netmask_ipv6(const struct in6_addr *addr,
}
const struct sockaddr *find_ipv6_in_subnet(const struct ifaddrs *addrs,
const struct ifaddrs *find_ipv6_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr_in6 *net,
unsigned int prefix_len) {
struct in6_addr want, temp;
@ -93,14 +93,14 @@ const struct sockaddr *find_ipv6_in_subnet(const struct ifaddrs *addrs,
netmask_ipv6(cur, prefix_len, &temp);
if (IN6_ARE_ADDR_EQUAL(&temp, &want))
return addrs->ifa_addr;
return addrs;
}
return NULL;
}
const struct sockaddr *find_ip_in_subnet(const struct ifaddrs *addrs,
const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr *net,
unsigned int prefix_len) {
switch (net->sa_family) {

View File

@ -2510,7 +2510,7 @@ std::vector<Option> get_global_options() {
.set_description(""),
Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1500)
.set_default(3000)
.set_description("minimum number of entries to maintain in the PG log")
.add_service("osd")
.add_see_also("osd_max_pg_log_entries")

View File

@ -38,9 +38,10 @@ static const struct sockaddr *find_ip_in_subnet_list(CephContext *cct,
exit(1);
}
const struct sockaddr *found = find_ip_in_subnet(ifa, (struct sockaddr *) &net, prefix_len);
const struct ifaddrs *found = find_ip_in_subnet(ifa,
(struct sockaddr *) &net, prefix_len);
if (found)
return found;
return found->ifa_addr;
}
return NULL;
@ -133,6 +134,32 @@ void pick_addresses(CephContext *cct, int needs)
freeifaddrs(ifa);
}
std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
{
struct ifaddrs *ifa;
int r = getifaddrs(&ifa);
if (r < 0) {
string err = cpp_strerror(errno);
lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
return {};
}
unsigned int prefix_len = 0;
const struct ifaddrs *found = find_ip_in_subnet(ifa,
(const struct sockaddr *) &network, prefix_len);
std::string result;
if (found) {
result = found->ifa_name;
}
freeifaddrs(ifa);
return result;
}
bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_addr_t *match)
{
struct ifaddrs *ifa;

View File

@ -30,6 +30,12 @@ class CephContext;
*/
void pick_addresses(CephContext *cct, int needs);
/**
* Find a network interface whose address matches the address/netmask
* in `network`.
*/
std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network);
/**
* check for a locally configured address
*

View File

@ -70,8 +70,9 @@ void shard_info_wrapper::set_object(const ScrubMap::object& object)
void shard_info_wrapper::encode(bufferlist& bl) const
{
ENCODE_START(2, 1, bl);
ENCODE_START(3, 3, bl);
::encode(errors, bl);
::encode(primary, bl);
if (has_shard_missing()) {
return;
}
@ -87,8 +88,9 @@ void shard_info_wrapper::encode(bufferlist& bl) const
void shard_info_wrapper::decode(bufferlist::iterator& bp)
{
DECODE_START(2, bp);
DECODE_START(3, bp);
::decode(errors, bp);
::decode(primary, bp);
if (has_shard_missing()) {
return;
}
@ -98,8 +100,7 @@ void shard_info_wrapper::decode(bufferlist::iterator& bp)
::decode(omap_digest, bp);
::decode(data_digest_present, bp);
::decode(data_digest, bp);
if (struct_v > 1)
::decode(selected_oi, bp);
::decode(selected_oi, bp);
DECODE_FINISH(bp);
}
@ -120,10 +121,12 @@ void
inconsistent_obj_wrapper::set_auth_missing(const hobject_t& hoid,
const map<pg_shard_t, ScrubMap*>& maps,
map<pg_shard_t, shard_info_wrapper> &shard_map,
int &shallow_errors, int &deep_errors)
int &shallow_errors, int &deep_errors,
const pg_shard_t &primary)
{
for (auto pg_map : maps) {
auto oid_object = pg_map.second->objects.find(hoid);
shard_map[pg_map.first].primary = (pg_map.first == primary);
if (oid_object == pg_map.second->objects.end())
shard_map[pg_map.first].set_missing();
else

View File

@ -78,6 +78,9 @@ public:
void set_ss_attr_corrupted() {
errors |= err_t::SS_ATTR_CORRUPTED;
}
void set_obj_size_oi_mismatch() {
errors |= err_t::OBJ_SIZE_OI_MISMATCH;
}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bp);
};
@ -116,7 +119,8 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t {
void set_auth_missing(const hobject_t& hoid,
const map<pg_shard_t, ScrubMap*>&,
map<pg_shard_t, shard_info_wrapper>&,
int &shallow_errors, int &deep_errors);
int &shallow_errors, int &deep_errors,
const pg_shard_t &primary);
void set_version(uint64_t ver) { version = ver; }
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bp);

View File

@ -27,7 +27,9 @@ const char * Compressor::get_comp_alg_name(int a) {
case COMP_ALG_SNAPPY: return "snappy";
case COMP_ALG_ZLIB: return "zlib";
case COMP_ALG_ZSTD: return "zstd";
#ifdef HAVE_LZ4
case COMP_ALG_LZ4: return "lz4";
#endif
default: return "???";
}
}
@ -39,8 +41,10 @@ boost::optional<Compressor::CompressionAlgorithm> Compressor::get_comp_alg_type(
return COMP_ALG_ZLIB;
if (s == "zstd")
return COMP_ALG_ZSTD;
#ifdef HAVE_LZ4
if (s == "lz4")
return COMP_ALG_LZ4;
#endif
if (s == "" || s == "none")
return COMP_ALG_NONE;

View File

@ -34,7 +34,9 @@ public:
COMP_ALG_SNAPPY = 1,
COMP_ALG_ZLIB = 2,
COMP_ALG_ZSTD = 3,
#ifdef HAVE_LZ4
COMP_ALG_LZ4 = 4,
#endif
COMP_ALG_LAST //the last value for range checks
};
// compression options

View File

@ -587,11 +587,10 @@ int CrushCompiler::parse_bucket(iter_t const& i)
if (verbose) err << "bucket " << name << " id " << maybe_id;
if (sub->children.size() > 2) {
string class_name = string_node(sub->children[3]);
if (!crush.class_exists(class_name)) {
err << " unknown device class '" << class_name << "'" << std::endl;
return -EINVAL;
}
int cid = crush.get_class_id(class_name);
// note that we do not verify class existence here,
// as this bucket might come from an empty shadow tree
// which currently has no OSDs but is still referenced by a rule!
int cid = crush.get_or_create_class_id(class_name);
if (class_id.count(cid) != 0) {
err << "duplicate device class " << class_name << " for bucket " << name << std::endl;
return -ERANGE;
@ -741,7 +740,9 @@ int CrushCompiler::parse_bucket(iter_t const& i)
item_weight[id] = bucketweight;
assert(id != 0);
int r = crush.add_bucket(id, alg, hash, type, size, &items[0], &weights[0], NULL);
int idout;
int r = crush.add_bucket(id, alg, hash, type, size,
&items[0], &weights[0], &idout);
if (r < 0) {
if (r == -EEXIST)
err << "Duplicate bucket id " << id << std::endl;

View File

@ -291,6 +291,33 @@ int CrushWrapper::rename_bucket(const string& srcname,
return set_item_name(oldid, dstname);
}
int CrushWrapper::rename_rule(const string& srcname,
const string& dstname,
ostream *ss)
{
if (!rule_exists(srcname)) {
if (ss) {
*ss << "source rule name '" << srcname << "' does not exist";
}
return -ENOENT;
}
if (rule_exists(dstname)) {
if (ss) {
*ss << "destination rule name '" << dstname << "' already exists";
}
return -EEXIST;
}
int rule_id = get_rule_id(srcname);
auto it = rule_name_map.find(rule_id);
assert(it != rule_name_map.end());
it->second = dstname;
if (have_rmaps) {
rule_name_rmap.erase(srcname);
rule_name_rmap[dstname] = rule_id;
}
return 0;
}
void CrushWrapper::find_takes(set<int>& roots) const
{
for (unsigned i=0; i<crush->max_rules; i++) {
@ -1075,7 +1102,7 @@ int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
// swap names
swap_names(src, dst);
return 0;
return rebuild_roots_with_classes();
}
int CrushWrapper::link_bucket(
@ -1667,7 +1694,7 @@ int CrushWrapper::remove_rule(int ruleno)
crush->rules[ruleno] = NULL;
rule_name_map.erase(ruleno);
have_rmaps = false;
return 0;
return rebuild_roots_with_classes();
}
int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *bucket, int item, int weight)
@ -1678,8 +1705,8 @@ int CrushWrapper::bucket_adjust_item_weight(CephContext *cct, crush_bucket *buck
if (bucket->items[position] == item)
break;
assert(position != bucket->size);
for (auto w : choose_args) {
crush_choose_arg_map arg_map = w.second;
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
for (__u32 j = 0; j < arg->weight_set_size; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
@ -1702,26 +1729,30 @@ int CrushWrapper::add_bucket(
crush_bucket *b = crush_make_bucket(crush, alg, hash, type, size, items,
weights);
assert(b);
assert(idout);
int r = crush_add_bucket(crush, bucketno, b, idout);
int pos = -1 - *idout;
for (auto& p : choose_args) {
crush_choose_arg_map& cmap = p.second;
if (cmap.args) {
if ((int)cmap.size <= *idout) {
if ((int)cmap.size <= pos) {
cmap.args = (crush_choose_arg*)realloc(
cmap.args,
sizeof(crush_choose_arg) * (*idout + 1));
sizeof(crush_choose_arg) * (pos + 1));
assert(cmap.args);
memset(&cmap.args[cmap.size], 0,
sizeof(crush_choose_arg) * (*idout + 1 - cmap.size));
cmap.size = *idout + 1;
sizeof(crush_choose_arg) * (pos + 1 - cmap.size));
cmap.size = pos + 1;
}
} else {
cmap.args = (crush_choose_arg*)calloc(sizeof(crush_choose_arg),
*idout + 1);
cmap.size = *idout + 1;
pos + 1);
assert(cmap.args);
cmap.size = pos + 1;
}
if (size > 0) {
int positions = get_choose_args_positions(cmap);
crush_choose_arg& carg = cmap.args[*idout];
crush_choose_arg& carg = cmap.args[pos];
carg.weight_set = (crush_weight_set*)calloc(sizeof(crush_weight_set),
size);
carg.weight_set_size = positions;
@ -1744,8 +1775,8 @@ int CrushWrapper::bucket_add_item(crush_bucket *bucket, int item, int weight)
if (r < 0) {
return r;
}
for (auto w : choose_args) {
crush_choose_arg_map arg_map = w.second;
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
for (__u32 j = 0; j < arg->weight_set_size; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
@ -1777,8 +1808,8 @@ int CrushWrapper::bucket_remove_item(crush_bucket *bucket, int item)
if (r < 0) {
return r;
}
for (auto w : choose_args) {
crush_choose_arg_map arg_map = w.second;
for (auto &w : choose_args) {
crush_choose_arg_map &arg_map = w.second;
crush_choose_arg *arg = &arg_map.args[-1-bucket->id];
for (__u32 j = 0; j < arg->weight_set_size; j++) {
crush_weight_set *weight_set = &arg->weight_set[j];
@ -1960,8 +1991,10 @@ int CrushWrapper::device_class_clone(
unsigned new_size = -1-bno + 1;
cmap.args = (crush_choose_arg*)realloc(cmap.args,
new_size * sizeof(cmap.args[0]));
assert(cmap.args);
memset(cmap.args + cmap.size, 0,
(new_size - cmap.size) * sizeof(cmap.args[0]));
cmap.size = new_size;
}
auto& o = cmap.args[-1-original_id];
auto& n = cmap.args[-1-bno];
@ -1990,6 +2023,37 @@ int CrushWrapper::device_class_clone(
return 0;
}
int CrushWrapper::get_rules_by_class(const string &class_name, set<int> *rules)
{
assert(rules);
rules->clear();
if (!class_exists(class_name)) {
return -ENOENT;
}
int class_id = get_class_id(class_name);
for (unsigned i = 0; i < crush->max_rules; ++i) {
crush_rule *r = crush->rules[i];
if (!r)
continue;
for (unsigned j = 0; j < r->len; ++j) {
if (r->steps[j].op == CRUSH_RULE_TAKE) {
int step_item = r->steps[j].arg1;
int original_item;
int c;
int res = split_id_class(step_item, &original_item, &c);
if (res < 0) {
return res;
}
if (c != -1 && c == class_id) {
rules->insert(i);
break;
}
}
}
}
return 0;
}
bool CrushWrapper::_class_is_dead(int class_id)
{
for (auto &p: class_map) {
@ -2299,7 +2363,7 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
__u32 choose_args_size;
::decode(choose_args_size, blp);
for (__u32 i = 0; i < choose_args_size; i++) {
uint64_t choose_args_index;
typename decltype(choose_args)::key_type choose_args_index;
::decode(choose_args_index, blp);
crush_choose_arg_map arg_map;
arg_map.size = crush->max_buckets;

View File

@ -539,6 +539,9 @@ public:
ostream *ss);
// rule names
int rename_rule(const string& srcname,
const string& dstname,
ostream *ss);
bool rule_exists(string name) const {
build_rmaps();
return rule_name_rmap.count(name);
@ -1217,6 +1220,7 @@ public:
int rename_class(const string& srcname, const string& dstname);
int populate_classes(
const std::map<int32_t, map<int32_t, int32_t>>& old_class_bucket);
int get_rules_by_class(const string &class_name, set<int> *rules);
bool _class_is_dead(int class_id);
void cleanup_dead_classes();
int rebuild_roots_with_classes();

View File

@ -7,7 +7,7 @@
If there are multiple matches, the first one is returned; this order
is system-dependent and should not be relied on.
*/
const struct sockaddr *find_ip_in_subnet(const struct ifaddrs *addrs,
const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
const struct sockaddr *net,
unsigned int prefix_len);

View File

@ -63,11 +63,12 @@ struct err_t {
OI_ATTR_MISSING = 1 << 14,
OI_ATTR_CORRUPTED = 1 << 15,
SS_ATTR_MISSING = 1 << 16,
SS_ATTR_CORRUPTED = 1 << 17
SS_ATTR_CORRUPTED = 1 << 17,
OBJ_SIZE_OI_MISMATCH = 1 << 18
// When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
};
uint64_t errors = 0;
static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_OI|OI_ATTR_MISSING|OI_ATTR_CORRUPTED|SS_ATTR_MISSING|SS_ATTR_CORRUPTED;
static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_OI|OI_ATTR_MISSING|OI_ATTR_CORRUPTED|SS_ATTR_MISSING|SS_ATTR_CORRUPTED|OBJ_SIZE_OI_MISMATCH;
static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_OI|OMAP_DIGEST_MISMATCH_OI|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
bool has_shard_missing() const {
return errors & SHARD_MISSING;
@ -111,6 +112,9 @@ struct err_t {
bool has_deep_errors() const {
return errors & DEEP_ERRORS;
}
bool has_obj_size_oi_mismatch() const {
return errors & OBJ_SIZE_OI_MISMATCH;
}
};
struct shard_info_t : err_t {
@ -121,6 +125,7 @@ struct shard_info_t : err_t {
bool data_digest_present = false;
uint32_t data_digest = 0;
bool selected_oi = false;
bool primary = false;
};
struct osd_shard_t {

View File

@ -1343,26 +1343,28 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
if (caps_info.allow_all) {
// Flag for auth providers that don't provide cap strings
s->auth_caps.set_allow_all();
}
} else {
bufferlist::iterator p = caps_info.caps.begin();
string auth_cap_str;
try {
::decode(auth_cap_str, p);
bufferlist::iterator p = caps_info.caps.begin();
string auth_cap_str;
try {
::decode(auth_cap_str, p);
dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
std::ostringstream errstr;
if (!s->auth_caps.parse(g_ceph_context, auth_cap_str, &errstr)) {
dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
<< " parsing '" << auth_cap_str << "'" << dendl;
clog->warn() << name << " mds cap '" << auth_cap_str
<< "' does not parse: " << errstr.str();
dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
std::ostringstream errstr;
if (!s->auth_caps.parse(g_ceph_context, auth_cap_str, &errstr)) {
dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
<< " parsing '" << auth_cap_str << "'" << dendl;
clog->warn() << name << " mds cap '" << auth_cap_str
<< "' does not parse: " << errstr.str();
is_valid = false;
}
} catch (buffer::error& e) {
// Assume legacy auth, defaults to:
// * permit all filesystem ops
// * permit no `tell` ops
dout(1) << __func__ << ": cannot decode auth caps bl of length " << caps_info.caps.length() << dendl;
is_valid = false;
}
} catch (buffer::error& e) {
// Assume legacy auth, defaults to:
// * permit all filesystem ops
// * permit no `tell` ops
dout(1) << __func__ << ": cannot decode auth caps bl of length " << caps_info.caps.length() << dendl;
}
}
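For reference, a hedged sketch exercising the same cap-parsing path directly; MDSAuthCaps and its header come from the MDS tree, the CephContext is supplied by the caller, and the capability string is only an example:

#include <iostream>
#include <sstream>
#include "mds/MDSAuthCaps.h"  // assumed in-tree include path

bool demo_parse_caps(CephContext *cct)
{
  MDSAuthCaps caps;
  std::ostringstream err;
  if (!caps.parse(cct, "allow rw", &err)) {
    std::cerr << "cap string rejected: " << err.str() << std::endl;
    return false;
  }
  return true;
}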

View File

@@ -22,13 +22,13 @@
class MStatfs : public PaxosServiceMessage {
static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 0;
static const int COMPAT_VERSION = 1;
public:
uuid_d fsid;
boost::optional<int64_t> data_pool;
MStatfs() : PaxosServiceMessage(CEPH_MSG_STATFS, 0, HEAD_VERSION) {}
MStatfs() : PaxosServiceMessage(CEPH_MSG_STATFS, 0, HEAD_VERSION, COMPAT_VERSION) {}
MStatfs(const uuid_d& f, ceph_tid_t t, boost::optional<int64_t> _data_pool,
version_t v) : PaxosServiceMessage(CEPH_MSG_STATFS, v,
HEAD_VERSION, COMPAT_VERSION),
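A minimal sketch, assuming the surrounding message plumbing, of constructing a statfs request that targets a single pool through the optional data_pool field; the pool id and the caller-supplied fsid/tid/version are placeholders:

#include <boost/optional.hpp>
#include "messages/MStatfs.h"  // assumed in-tree include path

MStatfs *make_statfs(const uuid_d& fsid, ceph_tid_t tid, version_t last_seen)
{
  boost::optional<int64_t> data_pool = 3;  // placeholder: restrict to pool id 3
  // the default-constructed form now advertises COMPAT_VERSION = 1 as well
  return new MStatfs(fsid, tid, data_pool, last_seen);
}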

View File

@@ -521,8 +521,17 @@ class AddDataPoolHandler : public FileSystemCommandHandler
return 0;
}
mon->osdmon()->do_application_enable(poolid,
pg_pool_t::APPLICATION_NAME_CEPHFS);
// if we're running as luminous, we have to set the pool application metadata
if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS ||
mon->osdmon()->pending_inc.new_require_osd_release >= CEPH_RELEASE_LUMINOUS) {
if (!mon->osdmon()->is_writeable()) {
// not allowed to write yet, so retry when we can
mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op));
return -EAGAIN;
}
mon->osdmon()->do_application_enable(poolid, pg_pool_t::APPLICATION_NAME_CEPHFS);
mon->osdmon()->propose_pending();
}
fsmap.modify_filesystem(
fs->fscid,

View File

@@ -509,6 +509,10 @@ COMMAND("osd lspools " \
COMMAND_WITH_FLAG("osd crush rule list", "list crush rules", "osd", "r", "cli,rest",
FLAG(DEPRECATED))
COMMAND("osd crush rule ls", "list crush rules", "osd", "r", "cli,rest")
COMMAND("osd crush rule ls-by-class " \
"name=class,type=CephString,goodchars=[A-Za-z0-9-_.]", \
"list all crush rules that reference the same <class>", \
"osd", "r", "cli,rest")
COMMAND("osd crush rule dump " \
"name=name,type=CephString,goodchars=[A-Za-z0-9-_.],req=false", \
"dump crush rule <name> (default all)", \
@@ -646,6 +650,11 @@ COMMAND("osd crush rule create-erasure " \
COMMAND("osd crush rule rm " \
"name=name,type=CephString,goodchars=[A-Za-z0-9-_.] ", \
"remove crush rule <name>", "osd", "rw", "cli,rest")
COMMAND("osd crush rule rename " \
"name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] " \
"name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]", \
"rename crush rule <srcname> to <dstname>",
"osd", "rw", "cli,rest")
COMMAND("osd crush tree "
"name=shadow,type=CephChoices,strings=--show-shadow,req=false", \
"dump crush buckets and items in a tree view",

View File

@@ -796,6 +796,8 @@ int Monitor::init()
mgr_messenger->add_dispatcher_tail(this); // for auth ms_* calls
bootstrap();
// add features of myself into feature_map
session_map.feature_map.add_mon(con_self->get_features());
return 0;
}
@@ -2707,7 +2709,12 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f)
if (f) {
f->dump_stream("fsid") << monmap->get_fsid();
get_health_status(false, f, nullptr);
if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
get_health_status(false, f, nullptr);
} else {
list<string> health_str;
get_health(health_str, nullptr, f);
}
f->dump_unsigned("election_epoch", get_epoch());
{
f->open_array_section("quorum");

View File

@@ -3261,6 +3261,12 @@ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) cons
dout(30) << __func__ << " osd." << osd << " next=" << next
<< " " << creating_pgs_by_osd_epoch << dendl;
std::lock_guard<std::mutex> l(creating_pgs_lock);
if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
dout(20) << __func__
<< " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
// the subscribers will be updated when the mapping is completed anyway
return next;
}
auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
return next;
@@ -4923,6 +4929,34 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
osdmap.crush->list_rules(&ss);
rdata.append(ss.str());
}
} else if (prefix == "osd crush rule ls-by-class") {
string class_name;
cmd_getval(g_ceph_context, cmdmap, "class", class_name);
if (class_name.empty()) {
ss << "no class specified";
r = -EINVAL;
goto reply;
}
set<int> rules;
r = osdmap.crush->get_rules_by_class(class_name, &rules);
if (r < 0) {
ss << "failed to get rules by class '" << class_name << "'";
goto reply;
}
if (f) {
f->open_array_section("rules");
for (auto &rule: rules) {
f->dump_string("name", osdmap.crush->get_rule_name(rule));
}
f->close_section();
f->flush(rdata);
} else {
ostringstream rs;
for (auto &rule: rules) {
rs << osdmap.crush->get_rule_name(rule) << "\n";
}
rdata.append(rs.str());
}
} else if (prefix == "osd crush rule dump") {
string name;
cmd_getval(g_ceph_context, cmdmap, "name", name);
@@ -5034,14 +5068,24 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
} else if (prefix == "osd crush class ls-osd") {
string name;
cmd_getval(g_ceph_context, cmdmap, "class", name);
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
set<int> osds;
osdmap.crush->get_devices_by_class(name, &osds);
f->open_array_section("osds");
for (auto& osd : osds)
f->dump_int("osd", osd);
f->close_section();
f->flush(rdata);
if (f) {
f->open_array_section("osds");
for (auto &osd: osds)
f->dump_int("osd", osd);
f->close_section();
f->flush(rdata);
} else {
bool first = true;
for (auto &osd : osds) {
if (!first)
ds << "\n";
first = false;
ds << osd;
}
rdata.append(ds);
}
} else if (prefix == "osd erasure-code-profile ls") {
const auto &profiles = osdmap.get_erasure_code_profiles();
if (f)
@@ -5719,15 +5763,20 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
_get_pending_crush(newcrush);
ostringstream err;
CrushTester tester(newcrush, err);
tester.set_min_x(0);
tester.set_max_x(50);
tester.set_rule(crush_rule);
auto start = ceph::coarse_mono_clock::now();
r = tester.test_with_fork(g_conf->mon_lease);
auto duration = ceph::coarse_mono_clock::now() - start;
if (r < 0) {
dout(10) << " tester.test_with_fork returns " << r
<< ": " << err.str() << dendl;
*ss << "crush test failed with " << r << ": " << err.str();
return r;
}
dout(10) << __func__ << " crush smoke test duration: "
<< duration << dendl;
}
unsigned size, min_size;
r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
@@ -6703,6 +6752,11 @@ int OSDMonitor::prepare_command_osd_create(
{
dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
assert(existing_id);
if (osdmap.is_destroyed(id)) {
ss << "ceph osd create has been deprecated. Please use ceph osd new "
"instead.";
return -EINVAL;
}
if (uuid.is_zero()) {
dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
@@ -7294,8 +7348,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
dout(10) << " testing map" << dendl;
stringstream ess;
CrushTester tester(crush, ess);
tester.set_min_x(0);
tester.set_max_x(50);
auto start = ceph::coarse_mono_clock::now();
int r = tester.test_with_fork(g_conf->mon_lease);
auto duration = ceph::coarse_mono_clock::now() - start;
if (r < 0) {
dout(10) << " tester.test_with_fork returns " << r
<< ": " << ess.str() << dendl;
@@ -7303,7 +7360,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
err = r;
goto reply;
}
dout(10) << " crush test result " << ess.str() << dendl;
dout(10) << __func__ << " crush smoke test duration: "
<< duration << ", result: " << ess.str() << dendl;
}
pending_inc.crush = data;
@@ -8470,6 +8528,36 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
get_last_committed() + 1));
return true;
} else if (prefix == "osd crush rule rename") {
string srcname;
string dstname;
cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
if (srcname.empty() || dstname.empty()) {
ss << "must specify both source rule name and destination rule name";
err = -EINVAL;
goto reply;
}
if (srcname == dstname) {
ss << "destination rule name is equal to source rule name";
err = 0;
goto reply;
}
CrushWrapper newcrush;
_get_pending_crush(newcrush);
err = newcrush.rename_rule(srcname, dstname, &ss);
if (err < 0) {
// ss has reason for failure
goto reply;
}
pending_inc.crush.clear();
newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
getline(ss, rs);
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
get_last_committed() + 1));
return true;
} else if (prefix == "osd setmaxosd") {
int64_t newmax;
if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
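As a usage illustration for the two rule commands wired up above (osd crush rule ls-by-class and osd crush rule rename), a hedged sketch issuing them through the librados C++ API; the connected cluster handle, class name and rule names are placeholders, not part of this change:

#include <rados/librados.hpp>
#include <iostream>
#include <string>

int demo_rule_commands(librados::Rados& cluster)  // assumed already connected
{
  librados::bufferlist inbl, outbl;
  std::string outs;

  // list rules that reference the "ssd" device class
  int r = cluster.mon_command(
      "{\"prefix\": \"osd crush rule ls-by-class\", \"class\": \"ssd\"}",
      inbl, &outbl, &outs);
  if (r < 0)
    return r;
  std::cout << outbl.c_str() << std::endl;

  // rename a rule
  return cluster.mon_command(
      "{\"prefix\": \"osd crush rule rename\", "
      "\"srcname\": \"replicated_ssd\", \"dstname\": \"fast_ssd\"}",
      inbl, &outbl, &outs);
}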

View File

@@ -396,7 +396,7 @@ void PGMapDigest::recovery_summary(Formatter *f, list<string> *psl,
} else {
ostringstream ss;
ss << delta_sum.stats.sum.num_objects_unfound
<< "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
<< "/" << delta_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
psl->push_back(ss.str());
}
}
@@ -3097,7 +3097,7 @@ void PGMap::get_health_checks(
snprintf(b, sizeof(b), "%.3lf", pc);
ostringstream ss;
ss << pg_sum.stats.sum.num_objects_unfound
<< "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
<< "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
for (auto& p : pg_stat) {
@@ -3188,7 +3188,7 @@ void PGMap::get_health_checks(
}
if (!error_detail.empty()) {
ostringstream ss;
ss << warn << " stuck requests are blocked > "
ss << error << " stuck requests are blocked > "
<< err_age << " sec";
auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
d.detail.swap(error_detail);
@@ -4567,6 +4567,9 @@ int reweight::by_utilization(
if (pools && pools->count(pg.first.pool()) == 0)
continue;
for (const auto acting : pg.second.acting) {
if (!osdmap.exists(acting)) {
continue;
}
if (acting >= (int)pgs_by_osd.size())
pgs_by_osd.resize(acting);
if (pgs_by_osd[acting] == 0) {

Some files were not shown because too many files have changed in this diff.