import source of Ceph Squid 19.2.1 release

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Thomas Lamprecht 2025-02-07 15:10:43 +01:00
parent 3815e3391b
commit 1852f3843b
1016 changed files with 115945 additions and 60986 deletions


@ -154,6 +154,7 @@ crimson:
- src/crimson/**
- src/test/crimson/**
- qa/suites/crimson-rados/**
- src/seastar/**
dashboard:
- src/pybind/mgr/dashboard/**


@ -27,7 +27,7 @@ b-ranto Boris Ranto <branto@redhat.com>
badone Brad Hubbard <bhubbard@redhat.com>
baruza Barbora Ančincová <bara@redhat.com>
bassamtabbara Bassam Tabbara <bassam.tabbara@quantum.com>
batrick Patrick Donnelly <pdonnell@redhat.com>
batrick Patrick Donnelly <pdonnell@ibm.com>
bigjust Justin Caratzas <jcaratza@redhat.com>
bk201 Kiefer Chang <kiefer.chang@suse.com>
BlaineEXE Blaine Gardner <bgardner@suse.com>

ceph/.gitmodules

@ -50,9 +50,6 @@
[submodule "src/c-ares"]
path = src/c-ares
url = https://github.com/ceph/c-ares.git
[submodule "src/spawn"]
path = src/spawn
url = https://github.com/ceph/spawn.git
[submodule "src/pybind/mgr/rook/rook-client-python"]
path = src/pybind/mgr/rook/rook-client-python
url = https://github.com/ceph/rook-client-python.git


@ -543,7 +543,8 @@ Pan Liu <pan.liu@istuary.com> <liupan1111@gmail.com>
Parth Arora <paarora@redhat.com> parth-gr <paarora@redhat.com>
Pascal de Bruijn <pascal@unilogicnetworks.net>
Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
Patrick Donnelly <pdonnell@redhat.com> <pdonell@redhat.com>
Patrick Donnelly <pdonnell@ibm.com> <pdonnell@redhat.com>
Patrick Donnelly <pdonnell@ibm.com> <batrick@batbytes.com>
Patrick McGarry <patrick@inktank.com>
Patrick McGarry <pmcgarry@redhat.com> <pmcgarry@gmail.com>
Patrick Seidensal <pseidensal@suse.com>


@ -357,6 +357,10 @@ IBM <contact@IBM.com> Neeraj Pratap Singh <Neeraj.Pratap.Singh1@ibm.com>
IBM <contact@IBM.com> Or Ozeri <oro@il.ibm.com>
IBM <contact@IBM.com> Paul Cuzner <pcuzner@ibm.com>
IBM <contact@IBM.com> Samuel Matzek <smatzek@us.ibm.com>
IBM <contact@IBM.com> Shraddha Agrawal <shraddhaag@ibm.com>
IBM <contact@IBM.com> Kushal Deb <Kushal.Deb@ibm.com>
IBM <contact@IBM.com> Shweta Bhosale <Shweta.Bhosale1@ibm.com>
IBM <contact@IBM.com> Patrick Donnelly <pdonnell@ibm.com>
IBM <contact@IBM.com> Sunil Angadi <Sunil.Angadi@ibm.com>
IBM <contact@IBM.com> Teoman Onay <tonay@ibm.com>
IBM <contact@ibm.com> Ulrich Weigand <ulrich.weigand@de.ibm.com>


@ -73,5 +73,5 @@ Yehuda Sadeh <ysadehwe@redhat.com> Yehuda Sadeh <yehuda@inktank.com>
Yuri Weinstein <yuriw@redhat.com> Yuri Weinstein <yuri.weinstein@inktank.com>
Zhi Zhang <zhangz.david@outlook.com> Zhi (David) Zhang <zhangz@yahoo-inc.com>
Zheng Yin <zhengyin@huayun.com> Zheng Yin <zhengyin@chinac.com>
Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Patrick Donnelly <pdonnell@ibm.com> Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Myoungwon Oh <myoungwon.oh@samsung.com> Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <ohmyoungwon@gmail.com>


@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.16)
project(ceph
VERSION 19.2.0
VERSION 19.2.1
LANGUAGES CXX C ASM)
foreach(policy CMP0127 CMP0135)


@ -173,6 +173,11 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
default json format produces a rather massive output in large clusters and
isn't scalable. So we have removed the 'network_ping_times' section from
the output. Details in the tracker: https://tracker.ceph.com/issues/57460
* mgr/REST: The REST manager module will trim requests based on the 'max_requests' option.
Without this feature, and in the absence of manual deletion of old requests,
the accumulation of requests in the array can lead to Out Of Memory (OOM) issues,
resulting in the Manager crashing.
* CephFS: The `subvolume snapshot clone` command now depends on the config option
`snapshot_clone_no_wait` which is used to reject the clone operation when
all the cloner threads are busy. This config option is enabled by default which means
@ -231,6 +236,23 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
confirmation flag when some MDSs exhibit health warning MDS_TRIM or
MDS_CACHE_OVERSIZED. This is to prevent accidental MDS failover causing
further delays in recovery.
* Based on tests performed at scale on a HDD based Ceph cluster, it was found
that scheduling with mClock was not optimal with multiple OSD shards. For
example, in the test cluster with multiple OSD node failures, the client
throughput was found to be inconsistent across test runs coupled with multiple
reported slow requests. However, the same test with a single OSD shard and
with multiple worker threads yielded significantly better results in terms of
consistency of client and recovery throughput across multiple test runs.
Therefore, as an interim measure until the issue with multiple OSD shards
(or multiple mClock queues per OSD) is investigated and fixed, the following
change to the default HDD OSD shard configuration is made:
- osd_op_num_shards_hdd = 1 (was 5)
- osd_op_num_threads_per_shard_hdd = 5 (was 1)
For more details see https://tracker.ceph.com/issues/66289.
* NFS: The export create/apply of CephFS based exports will now have an additional parameter `cmount_path` under FSAL block,
* NFS: The export create/apply of CephFS based exports will now have an additional parameter `cmount_path` under the FSAL block,
which specifies the path within the CephFS to mount this export on. If this and the other
`EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`.
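As a quick check of the new HDD shard defaults described in the mClock note above, the options can be read and, if necessary, overridden through ``ceph config`` (a sketch; it assumes the options are managed in the cluster config store rather than in per-daemon ceph.conf files, and shard-count changes generally take effect only after an OSD restart):
    ceph config get osd osd_op_num_shards_hdd              # 1 on 19.2.x
    ceph config get osd osd_op_num_threads_per_shard_hdd   # 5 on 19.2.x
    # restore the previous defaults only if you have a concrete reason to:
    ceph config set osd osd_op_num_shards_hdd 5
    ceph config set osd osd_op_num_threads_per_shard_hdd 1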
>=18.0.0


@ -45,19 +45,21 @@ out the git submodules associated with it:
## Build Prerequisites
*section last updated 27 Jul 2023*
*section last updated 06 Sep 2024*
Make sure that ``curl`` is installed. The Debian and Ubuntu ``apt`` command is
provided here, but if you use a system with a different package manager, then
you must use whatever command is the proper counterpart of this one:
We provide the Debian and Ubuntu ``apt`` commands in this procedure. If you use
a system with a different package manager, then you will have to use different
commands.
#. Install ``curl``:
apt install curl
Install Debian or RPM package dependencies by running the following command:
#. Install package dependencies by running the ``install-deps.sh`` script:
./install-deps.sh
Install the ``python3-routes`` package:
#. Install the ``python3-routes`` package:
apt install python3-routes
@ -70,44 +72,56 @@ we recommend that you build `.deb` or `.rpm` packages, or refer to
``ceph.spec.in`` or ``debian/rules`` to see which configuration options are
specified for production builds.
To build Ceph, make sure that you are in the top-level `ceph` directory that
contains `do_cmake.sh` and `CONTRIBUTING.rst` and run the following commands:
To build Ceph, follow this procedure:
./do_cmake.sh
cd build
ninja
1. Make sure that you are in the top-level `ceph` directory that
contains `do_cmake.sh` and `CONTRIBUTING.rst`.
2. Run the `do_cmake.sh` script:
``do_cmake.sh`` by default creates a "debug build" of Ceph, which can be up to
five times slower than a non-debug build. Pass
``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to ``do_cmake.sh`` to create a non-debug
build.
./do_cmake.sh
[Ninja](https://ninja-build.org/) is the buildsystem used by the Ceph project
to build test builds. The number of jobs used by `ninja` is derived from the
number of CPU cores of the building host if unspecified. Use the `-j` option to
limit the job number if the build jobs are running out of memory. If you
attempt to run `ninja` and receive a message that reads `g++: fatal error:
Killed signal terminated program cc1plus`, then you have run out of memory.
Using the `-j` option with an argument appropriate to the hardware on which the
`ninja` command is run is expected to result in a successful build. For example,
to limit the job number to 3, run the command `ninja -j 3`. On average, each
`ninja` job run in parallel needs approximately 2.5 GiB of RAM.
``do_cmake.sh`` by default creates a "debug build" of Ceph, which can be
up to five times slower than a non-debug build. Pass
``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to ``do_cmake.sh`` to create a
non-debug build.
3. Move into the `build` directory:
This documentation assumes that your build directory is a subdirectory of the
`ceph.git` checkout. If the build directory is located elsewhere, point
`CEPH_GIT_DIR` to the correct path of the checkout. Additional CMake args can
be specified by setting ARGS before invoking ``do_cmake.sh``. See [cmake
options](#cmake-options) for more details. For example:
cd build
4. Use the `ninja` buildsystem to build the development environment:
ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh
ninja -j3
To build only certain targets, run a command of the following form:
> [IMPORTANT]
>
> [Ninja](https://ninja-build.org/) is the build system used by the Ceph
> project to build test builds. The number of jobs used by `ninja` is
> derived from the number of CPU cores of the building host if unspecified.
> Use the `-j` option to limit the job number if build jobs are running
> out of memory. If you attempt to run `ninja` and receive a message that
> reads `g++: fatal error: Killed signal terminated program cc1plus`, then
> you have run out of memory.
>
> Using the `-j` option with an argument appropriate to the hardware on
> which the `ninja` command is run is expected to result in a successful
> build. For example, to limit the job number to 3, run the command `ninja
> -j3`. On average, each `ninja` job run in parallel needs approximately
> 2.5 GiB of RAM.
ninja [target name]
This documentation assumes that your build directory is a subdirectory of
the `ceph.git` checkout. If the build directory is located elsewhere, point
`CEPH_GIT_DIR` to the correct path of the checkout. Additional CMake args
can be specified by setting ARGS before invoking ``do_cmake.sh``.
See [cmake options](#cmake-options) for more details. For example:
To install:
ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh
ninja install
To build only certain targets, run a command of the following form:
ninja [target name]
5. Install the vstart cluster:
ninja install
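Taken together, the numbered steps above amount to roughly the following shell session (a sketch, assuming you start in the top-level `ceph` checkout and want a non-debug build):
    ./do_cmake.sh -DCMAKE_BUILD_TYPE=RelWithDebInfo
    cd build
    ninja -j3        # cap parallel jobs if the build runs out of memory
    ninja install    # step 5: install the vstart development environment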
### CMake Options


@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker
issue, just add a comment describing what changes you would like to make.
Someone with permissions will make the necessary modifications on your behalf.
For straightforward backports, that's all that you (as the developer of the fix)
need to do. Volunteers from the `Stable Releases and Backports team`_ will
proceed to create Backport issues to track the necessary backports and stage the
backports by opening GitHub PRs with the cherry-picks. If you don't want to
wait, and provided you have sufficient permissions at https://tracker.ceph.com,
you can `create Backport tracker issues` and `stage backports`_ yourself. In
that case, read on.
Authors of pull requests are responsible for creating associated backport pull
requests. As long as you have sufficient permissions at
https://tracker.ceph.com, you can `create Backport tracker issues` and `stage
backports`_ yourself. Read these linked sections to learn how to create
backport tracker issues and how to stage backports:
.. _`create backport tracker issues`:
.. _`backport tracker issue`:
@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting
Under ordinary circumstances, the developer who merges the ``main`` PR will flag
the ``main`` branch tracker issue for backport by changing the Status to "Pending
Backport", and volunteers from the `Stable Releases and Backports team`_
periodically create backport tracker issues by running the
``backport-create-issue`` script. They also do the actual backporting. But that
does take time and you may not want to wait.
Backport".
You might be tempted to forge ahead and create the backport issues yourself.
Please don't do that - it is difficult (bordering on impossible) to get all the
@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the
Milestone tag to the stable release the backport PR is targeting. For example,
if the PR is targeting "nautilus", set the Milestone tag to "nautilus".
If you don't have sufficient GitHub permissions to set the Milestone, don't
worry. Members of the `Stable Releases and Backports team`_ periodically run
a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable
branches and automatically adds the correct Milestone tag if it is missing.
Next, check which component label was applied to the ``main`` PR corresponding to
this backport, and double-check that that label is applied to the backport PR as
well. For example, if the ``main`` PR carries the component label "core", the
backport PR should also get that label.
In general, it is the responsibility of the `Stable Releases and Backports
team`_ to ensure that backport PRs are properly labelled. If in doubt, just
leave the labelling to them.
.. _`backport PR reviewing`:
.. _`backport PR testing`:
.. _`backport PR merging`:
@ -381,9 +366,8 @@ leave the labelling to them.
Reviewing, testing, and merging of backport PRs
-----------------------------------------------
Once your backport PR is open and the Milestone is set properly, the
`Stable Releases and Backports team` will take care of getting the PR
reviewed and tested. Once the PR is reviewed and tested, it will be merged.
Once your backport PR is open, it will be reviewed and tested. When the PR has
been reviewed and tested, it will be merged.
If you would like to facilitate this process, you can solicit reviews and run
integration tests on the PR. In this case, add comments to the PR describing the
@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the
PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches
unnecessarily complicates the release preparation process, which is done by
volunteers.)
Stable Releases and Backports team
----------------------------------
Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers,
which is charged with maintaining the stable releases and backporting bugfixes
from the ``main`` branch to them. (That team maintains a wiki, accessible by
clicking the `Stable Releases and Backports`_ link, which describes various
workflows in the backporting lifecycle.)
.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker
issue). The volunteers from the Stable Releases and Backports team will
backport the fix, run regression tests on it, and include it in one or more
future point releases.
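For the do-it-yourself staging path described above, the mechanics are ordinary git cherry-picking; a minimal sketch (the remote names, branch name, and the <sha1>/<tracker-id> placeholders are illustrative, not mandated by this guide):
    git fetch upstream
    git checkout -b wip-<tracker-id>-squid upstream/squid
    git cherry-pick -x <sha1>      # -x records the original commit hash in the message
    git push -u origin wip-<tracker-id>-squid
    # then open the backport PR against the corresponding stable branch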


@ -181,7 +181,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 19.2.0
Version: 19.2.1
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -197,7 +197,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-19.2.0.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-19.2.1.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x riscv64
@ -432,9 +432,9 @@ BuildRequires: python%{python3_pkgversion}-scipy
BuildRequires: python%{python3_pkgversion}-werkzeug
BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
BuildRequires: jsonnet
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -927,7 +927,6 @@ Requires: parted
Requires: util-linux
Requires: xfsprogs
Requires: python%{python3_pkgversion}-setuptools
Requires: python%{python3_pkgversion}-packaging
Requires: python%{python3_pkgversion}-ceph-common = %{_epoch_prefix}%{version}-%{release}
%description volume
This package contains a tool to deploy OSD with different devices like
@ -1335,7 +1334,7 @@ This package provides a Ceph hardware monitoring agent.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-19.2.0
%autosetup -p1 -n ceph-19.2.1
%build
# Disable lto on systems that do not support symver attribute


@ -432,9 +432,9 @@ BuildRequires: python%{python3_pkgversion}-scipy
BuildRequires: python%{python3_pkgversion}-werkzeug
BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
BuildRequires: jsonnet
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -927,7 +927,6 @@ Requires: parted
Requires: util-linux
Requires: xfsprogs
Requires: python%{python3_pkgversion}-setuptools
Requires: python%{python3_pkgversion}-packaging
Requires: python%{python3_pkgversion}-ceph-common = %{_epoch_prefix}%{version}-%{release}
%description volume
This package contains a tool to deploy OSD with different devices like


@ -1,3 +1,9 @@
ceph (19.2.1-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Fri, 31 Jan 2025 23:14:10 +0000
ceph (19.2.0-1) stable; urgency=medium
* New upstream release


@ -0,0 +1,42 @@
# use an ExternalProject to build isa-l using its makefile
function(build_isal)
set(isal_BINARY_DIR ${CMAKE_BINARY_DIR}/src/isa-l)
set(isal_INSTALL_DIR ${isal_BINARY_DIR}/install)
set(isal_INCLUDE_DIR "${isal_INSTALL_DIR}/include")
set(isal_LIBRARY "${isal_INSTALL_DIR}/lib/libisal.a")
# this include directory won't exist until the install step, but the
# imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES
file(MAKE_DIRECTORY "${isal_INCLUDE_DIR}")
set(configure_cmd env CC=${CMAKE_C_COMPILER} ./configure --prefix=${isal_INSTALL_DIR})
# build a static library with -fPIC that we can link into crypto/compressor plugins
list(APPEND configure_cmd --with-pic --enable-static --disable-shared)
# clear the DESTDIR environment variable from debian/rules,
# because it messes with the internal install paths of arrow's bundled deps
set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR)
include(ExternalProject)
ExternalProject_Add(isal_ext
SOURCE_DIR "${PROJECT_SOURCE_DIR}/src/isa-l"
CONFIGURE_COMMAND ./autogen.sh COMMAND ${configure_cmd}
BUILD_COMMAND ${NO_DESTDIR_COMMAND} make -j3
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${isal_LIBRARY}
INSTALL_COMMAND ${NO_DESTDIR_COMMAND} make install
UPDATE_COMMAND ""
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON)
# add imported library target ISAL::ISAL
add_library(ISAL::ISAL STATIC IMPORTED GLOBAL)
add_dependencies(ISAL::ISAL isal_ext)
set_target_properties(ISAL::ISAL PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${isal_INCLUDE_DIR}
IMPORTED_LINK_INTERFACE_LANGUAGES "C"
IMPORTED_LOCATION ${isal_LIBRARY})
endfunction()


@ -0,0 +1,31 @@
# use an ExternalProject to build isa-l_crypto using its makefile
function(build_isal_crypto)
set(ISAL_CRYPTO_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
set(ISAL_CRYPTO_INCLUDE_DIR "${ISAL_CRYPTO_SOURCE_DIR}/include")
set(ISAL_CRYPTO_LIBRARY "${ISAL_CRYPTO_SOURCE_DIR}/bin/isa-l_crypto.a")
include(FindMake)
find_make("MAKE_EXECUTABLE" "make_cmd")
include(ExternalProject)
ExternalProject_Add(isal_crypto_ext
SOURCE_DIR ${ISAL_CRYPTO_SOURCE_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ${make_cmd} -f <SOURCE_DIR>/Makefile.unx
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${ISAL_CRYPTO_LIBRARY}
INSTALL_COMMAND ""
UPDATE_COMMAND ""
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON)
# add imported library target ISAL::Crypto
add_library(ISAL::Crypto STATIC IMPORTED GLOBAL)
add_dependencies(ISAL::Crypto isal_crypto_ext)
set_target_properties(ISAL::Crypto PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${ISAL_CRYPTO_INCLUDE_DIR}
IMPORTED_LINK_INTERFACE_LANGUAGES "C"
IMPORTED_LOCATION ${ISAL_CRYPTO_LIBRARY})
endfunction()


@ -0,0 +1,218 @@
ARG FROM_IMAGE="quay.io/centos/centos:stream9"
FROM $FROM_IMAGE
# allow FROM_IMAGE to be visible inside this stage
ARG FROM_IMAGE
# Ceph branch name
ARG CEPH_REF="main"
# Ceph SHA1
ARG CEPH_SHA1
# Ceph git repo (ceph-ci.git or ceph.git)
ARG CEPH_GIT_REPO
# (optional) Define the baseurl= for the ganesha.repo
ARG GANESHA_REPO_BASEURL="https://buildlogs.centos.org/centos/\$releasever-stream/storage/\$basearch/nfsganesha-5/"
# (optional) Set to "crimson" to install crimson packages.
ARG OSD_FLAVOR="default"
# (optional) Should be 'true' for CI builds (pull from shaman, etc.)
ARG CI_CONTAINER="true"
RUN /bin/echo -e "\
FROM_IMAGE: ${FROM_IMAGE}\n\
CEPH_REF: ${CEPH_REF}\n\
GANESHA_REPO_BASEURL: ${GANESHA_REPO_BASEURL} \n\
OSD_FLAVOR: ${OSD_FLAVOR} \n\
CI_CONTAINER: ${CI_CONTAINER}"
# Other labels are set automatically by container/build github action
# See: https://github.com/opencontainers/image-spec/blob/main/annotations.md
LABEL org.opencontainers.image.authors="Ceph Release Team <ceph-maintainers@ceph.io>" \
org.opencontainers.image.documentation="https://docs.ceph.com/"
LABEL \
FROM_IMAGE=${FROM_IMAGE} \
CEPH_REF=${CEPH_REF} \
CEPH_SHA1=${CEPH_SHA1} \
CEPH_GIT_REPO=${CEPH_GIT_REPO} \
GANESHA_REPO_BASEURL=${GANESHA_REPO_BASEURL} \
OSD_FLAVOR=${OSD_FLAVOR}
#===================================================================================================
# Install ceph and dependencies, and clean up
# IMPORTANT: in official builds, use '--squash' build option to keep image as small as possible
# keeping run steps separate makes local rebuilds quick, but images are big without squash option
#===================================================================================================
# Pre-reqs
RUN dnf install -y --setopt=install_weak_deps=False epel-release jq
# Add NFS-Ganesha repo
RUN \
echo "[ganesha]" > /etc/yum.repos.d/ganesha.repo && \
echo "name=ganesha" >> /etc/yum.repos.d/ganesha.repo && \
echo "baseurl=${GANESHA_REPO_BASEURL}" >> /etc/yum.repos.d/ganesha.repo && \
echo "gpgcheck=0" >> /etc/yum.repos.d/ganesha.repo && \
echo "enabled=1" >> /etc/yum.repos.d/ganesha.repo
# ISCSI repo
RUN set -ex && \
curl -s -L https://shaman.ceph.com/api/repos/tcmu-runner/main/latest/centos/9/repo?arch=$(arch) -o /etc/yum.repos.d/tcmu-runner.repo && \
case "${CEPH_REF}" in \
quincy|reef) \
curl -fs -L https://download.ceph.com/ceph-iscsi/3/rpm/el9/ceph-iscsi.repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
;;\
main|*) \
curl -fs -L https://shaman.ceph.com/api/repos/ceph-iscsi/main/latest/centos/9/repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
;;\
esac
# Ceph repo
RUN --mount=type=secret,id=prerelease_creds set -ex && \
rpm --import 'https://download.ceph.com/keys/release.asc' && \
ARCH=$(arch); if [ "${ARCH}" == "aarch64" ]; then ARCH="arm64"; fi ;\
IS_RELEASE=0 ;\
if [[ "${CI_CONTAINER}" == "true" ]] ; then \
# TODO: this can return different ceph builds (SHA1) for x86 vs. arm runs. is it important to fix?
REPO_URL=$(curl -fs "https://shaman.ceph.com/api/search/?project=ceph&distros=centos/9/${ARCH}&flavor=${OSD_FLAVOR}&ref=${CEPH_REF}&sha1=latest" | jq -r .[0].url) ;\
else \
IS_RELEASE=1 ;\
source /run/secrets/prerelease_creds; \
REPO_URL="https://${PRERELEASE_USERNAME}:${PRERELEASE_PASSWORD}@download.ceph.com/prerelease/ceph/rpm-${CEPH_REF}/el9/" ;\
fi && \
rpm -Uvh "$REPO_URL/noarch/ceph-release-1-${IS_RELEASE}.el9.noarch.rpm" ; \
if [[ "$IS_RELEASE" == 1 ]] ; then \
sed -i "s;http://download.ceph.com/;https://${PRERELEASE_USERNAME}:${PRERELEASE_PASSWORD}@download.ceph.com/prerelease/ceph/;" /etc/yum.repos.d/ceph.repo ; \
dnf clean expire-cache ; \
fi
# Copr repos
# scikit for mgr-diskprediction-local
# ref: https://github.com/ceph/ceph-container/pull/1821
RUN \
dnf install -y --setopt=install_weak_deps=False dnf-plugins-core && \
dnf copr enable -y tchaikov/python-scikit-learn
# Update package mgr
RUN dnf update -y --setopt=install_weak_deps=False
# Define and install packages
# General
RUN echo "ca-certificates" > packages.txt
# Ceph
# TODO: remove lua-devel and luarocks once they are present in ceph.spec.in
# ref: https://github.com/ceph/ceph/pull/54575#discussion_r1401199635
RUN echo \
"ceph-common \
ceph-exporter \
ceph-grafana-dashboards \
ceph-immutable-object-cache \
ceph-mds \
ceph-mgr-cephadm \
ceph-mgr-dashboard \
ceph-mgr-diskprediction-local \
ceph-mgr-k8sevents \
ceph-mgr-rook \
ceph-mgr \
ceph-mon \
ceph-osd \
ceph-radosgw lua-devel luarocks \
ceph-volume \
cephfs-mirror \
cephfs-top \
kmod \
libradosstriper1 \
rbd-mirror" \
>> packages.txt
# Optional crimson package(s)
RUN if [ "${OSD_FLAVOR}" == "crimson" ]; then \
echo "ceph-crimson-osd" >> packages.txt ; \
fi
# Ceph "Recommends"
RUN echo "nvme-cli python3-saml smartmontools" >> packages.txt
# NFS-Ganesha
RUN echo "\
dbus-daemon \
nfs-ganesha-ceph \
nfs-ganesha-rados-grace \
nfs-ganesha-rados-urls \
nfs-ganesha-rgw \
nfs-ganesha \
rpcbind \
sssd-client" >> packages.txt
# ISCSI
RUN echo "ceph-iscsi tcmu-runner python3-rtslib" >> packages.txt
# Ceph-CSI
# TODO: coordinate with @Madhu-1 to have Ceph-CSI install these itself if unused by ceph
# @adk3798 does cephadm use these?
RUN echo "attr ceph-fuse rbd-nbd" >> packages.txt
# Rook (only if packages must be in ceph container image)
RUN echo "systemd-udev" >> packages.txt
# Util packages (should be kept to only utils that are truly very useful)
# 'sgdisk' (from gdisk) is used in docs and scripts for clearing disks (could be a risk? @travisn @guits @ktdreyer ?)
# 'ps' (from procps-ng) and 'hostname' are very valuable for debugging and CI
# TODO: remove sg3_utils once they are moved to ceph.spec.in with libstoragemgmt
# ref: https://github.com/ceph/ceph-container/pull/2013#issuecomment-1248606472
RUN echo "gdisk hostname procps-ng sg3_utils e2fsprogs lvm2 gcc" >> packages.txt
# scikit
RUN echo "python3-scikit-learn" >> packages.txt
# ceph-node-proxy
RUN echo "ceph-node-proxy" >> packages.txt
RUN echo "=== PACKAGES TO BE INSTALLED ==="; cat packages.txt
RUN echo "=== INSTALLING ===" ; \
dnf install -y --setopt=install_weak_deps=False --setopt=skip_missing_names_on_install=False --enablerepo=crb $(cat packages.txt)
# XXX why isn't this done in the ganesha package?
RUN mkdir -p /var/run/ganesha
# Disable sync with udev since the container can not contact udev
RUN \
sed -i -e 's/udev_rules = 1/udev_rules = 0/' \
-e 's/udev_sync = 1/udev_sync = 0/' \
-e 's/obtain_device_list_from_udev = 1/obtain_device_list_from_udev = 0/' \
/etc/lvm/lvm.conf && \
# validate the sed command worked as expected
grep -sqo "udev_sync = 0" /etc/lvm/lvm.conf && \
grep -sqo "udev_rules = 0" /etc/lvm/lvm.conf && \
grep -sqo "obtain_device_list_from_udev = 0" /etc/lvm/lvm.conf
# CLEAN UP!
RUN set -ex && \
dnf clean all && \
rm -rf /var/cache/dnf/* && \
rm -rf /var/lib/dnf/* && \
rm -f /var/lib/rpm/__db* && \
# remove unnecessary files with big impact
rm -rf /etc/selinux /usr/share/{doc,man,selinux} && \
# don't keep compiled python binaries
find / -xdev \( -name "*.pyc" -o -name "*.pyo" \) -delete && \
rm -f /etc/yum.repos.d/{ceph,ganesha,tcmu-runner,ceph-iscsi}.repo
# Verify that the packages installed haven't been accidentally cleaned, then
# clean the package list and re-clean unnecessary RPM database files
RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.txt
#
# Set some envs in the container for quickly inspecting details about the build at runtime
ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \
CEPH_REF="${CEPH_REF}" \
CEPH_VERSION="${CEPH_REF}" \
CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \
FROM_IMAGE="${FROM_IMAGE}"

ceph/container/build.sh

@ -0,0 +1,198 @@
#!/bin/bash -ex
# vim: ts=4 sw=4 expandtab
# repo auth with write perms must be present (this script does not log into
# repos named by CONTAINER_REPO_*).
# If NO_PUSH is set, no login is necessary
CFILE=${1:-Containerfile}
shift || true
usage() {
cat << EOF
$0 [containerfile] (defaults to 'Containerfile')
For a CI build (from ceph-ci.git, built and pushed to shaman):
CI_CONTAINER: must be 'true'
FLAVOR (OSD flavor, default or crimson)
BRANCH (of Ceph. <remote>/<ref>)
CEPH_SHA1 (of Ceph)
ARCH (of build host, and resulting container)
CONTAINER_REPO_HOSTNAME (quay.ceph.io, for CI, for instance)
CONTAINER_REPO_ORGANIZATION (ceph-ci, for CI, for instance)
CONTAINER_REPO (ceph, for CI, or prerelease-<arch> for release, for instance)
CONTAINER_REPO_USERNAME
CONTAINER_REPO_PASSWORD
PRERELEASE_USERNAME for download.ceph.com:/prerelease/ceph
PRERELEASE_PASSWORD
For a release build: (from ceph.git, built and pushed to download.ceph.com)
CI_CONTAINER: must be 'false'
and you must also add
VERSION (for instance, 19.1.0) for tagging the image
You can avoid the push step (for testing) by setting NO_PUSH to anything
EOF
}
CI_CONTAINER=${CI_CONTAINER:-false}
FLAVOR=${FLAVOR:-default}
# default: current checked-out branch
BRANCH=${BRANCH:-$(git rev-parse --abbrev-ref HEAD)}
# default: current checked-out branch
CEPH_SHA1=${CEPH_SHA1:-$(git rev-parse HEAD)}
# default: build host arch
ARCH=${ARCH:-$(arch)}
if [[ "${ARCH}" == "aarch64" ]] ; then ARCH=arm64; fi
REPO_ARCH=amd64
if [[ "${ARCH}" = arm64 ]] ; then
REPO_ARCH=arm64
fi
if [[ ${CI_CONTAINER} == "true" ]] ; then
CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io}
CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph-ci}
CONTAINER_REPO=${CONTAINER_REPO:-ceph}
else
CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io}
CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph}
CONTAINER_REPO=${CONTAINER_REPO:-prerelease-${REPO_ARCH}}
# default: most-recent annotated tag
VERSION=${VERSION:-$(git describe --abbrev=0)}
fi
# check for existence of all required variables
: "${CI_CONTAINER:?}"
: "${FLAVOR:?}"
: "${BRANCH:?}"
: "${CEPH_SHA1:?}"
: "${ARCH:?}"
if [[ ${NO_PUSH} != "true" ]] ; then
: "${CONTAINER_REPO_HOSTNAME:?}"
: "${CONTAINER_REPO_ORGANIZATION:?}"
: "${CONTAINER_REPO_USERNAME:?}"
: "${CONTAINER_REPO_PASSWORD:?}"
fi
if [[ ${CI_CONTAINER} != "true" ]] ; then : "${VERSION:?}"; fi
# check for valid repo auth (if pushing)
repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}/${CONTAINER_REPO}
MINIMAL_IMAGE=${repopath}:minimal-test
if [[ ${NO_PUSH} != "true" ]] ; then
podman rmi ${MINIMAL_IMAGE} || true
echo "FROM scratch" | podman build -f - -t ${MINIMAL_IMAGE}
if ! podman push ${MINIMAL_IMAGE} ; then
echo "Not authenticated to ${repopath}; need docker/podman login?"
exit 1
fi
podman rmi ${MINIMAL_IMAGE} | true
fi
if [[ -z "${CEPH_GIT_REPO}" ]] ; then
if [[ ${CI_CONTAINER} == "true" ]]; then
CEPH_GIT_REPO=https://github.com/ceph/ceph-ci.git
else
CEPH_GIT_REPO=https://github.com/ceph/ceph.git
fi
fi
# BRANCH will be, say, origin/main. remove <remote>/
BRANCH=${BRANCH##*/}
# podman build only supports secret files.
# This must be removed after podman build
touch prerelease.secret.txt
chmod 600 prerelease.secret.txt
echo -e "\
PRERELEASE_USERNAME=${PRERELEASE_USERNAME}\n
PRERELEASE_PASSWORD=${PRERELEASE_PASSWORD}\n " > prerelease.secret.txt
podman build --pull=newer --squash -f $CFILE -t build.sh.output \
--build-arg FROM_IMAGE=${FROM_IMAGE:-quay.io/centos/centos:stream9} \
--build-arg CEPH_SHA1=${CEPH_SHA1} \
--build-arg CEPH_GIT_REPO=${CEPH_GIT_REPO} \
--build-arg CEPH_REF=${BRANCH:-main} \
--build-arg OSD_FLAVOR=${FLAVOR:-default} \
--build-arg CI_CONTAINER=${CI_CONTAINER:-default} \
--secret=id=prerelease_creds,src=./prerelease.secret.txt \
2>&1
rm ./prerelease.secret.txt
image_id=$(podman image ls localhost/build.sh.output --format '{{.ID}}')
# grab useful image attributes for building the tag
#
# the variable settings are prefixed with "export CEPH_CONTAINER_" so that
# an eval or . can be used to put them into the environment
#
# PATH is removed from the output as it would cause problems for this
# parent script and its children
#
# notes:
#
# we want .Architecture and everything in .Config.Env
#
# printf will not accept "\n" (is this a podman bug?)
# so construct vars with two calls to podman inspect, joined by a newline,
# so that vars will get the output of the first command, newline, output
# of the second command
#
vars="$(podman inspect -f '{{printf "export CEPH_CONTAINER_ARCH=%v" .Architecture}}' ${image_id})
$(podman inspect -f '{{range $index, $value := .Config.Env}}export CEPH_CONTAINER_{{$value}}{{println}}{{end}}' ${image_id})"
vars="$(echo "${vars}" | grep -v PATH)"
eval ${vars}
# remove everything up to and including the last slash
fromtag=${CEPH_CONTAINER_FROM_IMAGE##*/}
# translate : to -
fromtag=${fromtag/:/-}
builddate=$(date +%Y%m%d)
local_tag=${fromtag}-${CEPH_CONTAINER_CEPH_REF}-${CEPH_CONTAINER_ARCH}-${builddate}
repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}/${CONTAINER_REPO}
if [[ ${CI_CONTAINER} == "true" ]] ; then
# ceph-ci conventions for remote tags:
# requires ARCH, BRANCH, CEPH_SHA1, FLAVOR
full_repo_tag=${repopath}:${BRANCH}-${fromtag}-${ARCH}-devel
branch_repo_tag=${repopath}:${BRANCH}
sha1_repo_tag=${repopath}:${CEPH_SHA1}
if [[ "${ARCH}" == "arm64" ]] ; then
branch_repo_tag=${branch_repo_tag}-arm64
sha1_repo_tag=${sha1_repo_tag}-arm64
fi
podman tag ${image_id} ${full_repo_tag}
podman tag ${image_id} ${branch_repo_tag}
podman tag ${image_id} ${sha1_repo_tag}
if [[ ${FLAVOR} == "crimson" && ${ARCH} == "x86_64" ]] ; then
sha1_flavor_repo_tag=${sha1_repo_tag}-${FLAVOR}
podman tag ${image_id} ${sha1_flavor_repo_tag}
if [[ -z "${NO_PUSH}" ]] ; then
podman push ${sha1_flavor_repo_tag}
fi
exit
fi
if [[ -z "${NO_PUSH}" ]] ; then
podman push ${full_repo_tag}
podman push ${branch_repo_tag}
podman push ${sha1_repo_tag}
fi
else
#
# non-CI build. Tags are like v19.1.0-20240701
# push to quay.ceph.io/ceph/prerelease-$REPO_ARCH
#
version_tag=${repopath}:v${VERSION}-${builddate}
podman tag ${image_id} ${version_tag}
if [[ -z "${NO_PUSH}" ]] ; then
podman push ${version_tag}
fi
fi
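A local, push-free invocation of the script above might look like this (a sketch; the values are illustrative and podman plus network access are assumed — see the usage() text at the top of the script for the full list of variables):
    cd container
    NO_PUSH=true CI_CONTAINER=true FLAVOR=default ./build.sh Containerfile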


@ -0,0 +1,252 @@
#!/usr/bin/python3
#
# in default mode:
# make a combined "manifest-list" container out of two arch-specific containers
# searches for latest tags on HOST/{AMD,ARM}64_REPO, makes sure they refer
# to the same Ceph SHA1, and creates a manifest-list ("fat") image on
# MANIFEST_HOST/MANIFEST_REPO with the 'standard' set of tags:
# v<major>
# v<major>.<minor>
# v<major>.<minor>.<micro>
# v<major>.<minor>.<micro>-<YYYYMMDD>
#
# uses scratch local manifest LOCALMANIFEST, defined here; will be destroyed if present
#
# in promote mode (by adding the --promote argument):
# instead of building the manifest-list container, copy it
# (and all of its tags) from the prerelease repo to the release repo
#
# Assumes valid logins to the necessary hosts/repos with permission to write images
#
# Environment variables to set:
# ARCH_SPECIFIC_HOST (default 'quay.ceph.io'): host of prerelease repos
# AMD64_REPO (default 'ceph/prerelease-amd64') prerelease amd64 repo
# ARM64_REPO (default 'ceph/prerelease-arm64') prerelease arm64 repo
# MANIFEST_HOST (default 'quay.ceph.io') prerelease manifest-list host
# MANIFEST_REPO (default 'ceph/prerelease') prerelease manifest-list repo
# RELEASE_MANIFEST_HOST (default 'quay.io') release host
# RELEASE_MANIFEST_REPO (default 'ceph/ceph') release repo
import argparse
from datetime import datetime
import functools
import json
import os
import re
import subprocess
import sys
# Manifest image. Will be destroyed if already present.
LOCALMANIFEST = 'localhost/m'
def dump_vars(names, vardict):
for name in names:
print(f'{name}: {vardict[name]}', file=sys.stderr)
def run_command(args):
print(f'running {args}', file=sys.stderr)
if not isinstance(args, list):
args = args.split()
try:
result = subprocess.run(
args,
capture_output=True,
text=True,
check=True)
return True, result.stdout, result.stderr
except subprocess.CalledProcessError as e:
return False, e.output, e.stderr
def get_command_output(args):
success, stdout, stderr = run_command(args)
return (stdout if success else None)
def run_command_show_failure(args):
success, stdout, stderr = run_command(args)
if not success:
print(f'{args} failed:', file=sys.stderr)
print(f'stdout:\n{stdout}')
print(f'stderr:\n{stderr}')
return success
@functools.lru_cache
def get_tags(path):
cmdout = get_command_output(f'skopeo list-tags docker://{path}')
return json.loads(cmdout)['Tags']
def get_latest_tag(path):
try:
latest_tag = get_tags(path)[-1]
except IndexError:
return None
return latest_tag
@functools.lru_cache
def get_image_inspect(path):
info = json.loads(
get_command_output(f'skopeo inspect docker://{path}')
)
return info
def get_sha1(info):
labels = info.get('Labels', None)
if not labels:
return None
return labels.get('CEPH_SHA1', None)
@functools.lru_cache
def get_all_matching_digest_tags(path, tag):
matching_tags = list()
digest = get_image_inspect(f'{path}:{tag}')['Digest']
for t in get_tags(path):
this_digest = get_image_inspect(f'{path}:{t}')['Digest']
if this_digest == digest:
matching_tags.append(t)
return matching_tags
def parse_args():
ap = argparse.ArgumentParser()
ap.add_argument('-n', '--dry-run', action='store_true', help='do all local manipulations but do not push final containers to MANIFEST_HOST, or in --promote, calculate but do not copy images to release host')
ap.add_argument('-P', '--promote', action='store_true', help='promote newest prerelease manifest container to released (move from MANIFEST_HOST to RELEASE_MANIFEST_HOST')
args = ap.parse_args()
return args
def build_prerelease(sysargs):
global args
arch_specific_host = os.environ.get('ARCH_SPECIFIC_HOST', 'quay.ceph.io')
amd64_repo = os.environ.get('AMD64_REPO', 'ceph/prerelease-amd64')
arm64_repo = os.environ.get('ARM64_REPO', 'ceph/prerelease-arm64')
manifest_host = os.environ.get('MANIFEST_HOST', 'quay.ceph.io')
manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/prerelease')
dump_vars(
('arch_specific_host',
'amd64_repo',
'arm64_repo',
'manifest_host',
'manifest_repo',
),
locals())
repopaths = (
f'{arch_specific_host}/{amd64_repo}',
f'{arch_specific_host}/{arm64_repo}',
)
tags = [get_latest_tag(p) for p in repopaths]
print(f'latest tags: amd64:{tags[0]} arm64:{tags[1]}')
# check that version of latest tag matches
version_re = \
r'v(?P<major>\d+)\.(?P<minor>\d+)\.(?P<micro>\d+)-(?P<date>\d+)'
versions = list()
for tag in tags:
mo = re.match(version_re, tag)
ver = f'{mo.group("major")}.{mo.group("minor")}.{mo.group("micro")}'
versions.append(ver)
if versions[0] != versions[1]:
print(
f'version mismatch: amd64:{versions[0]} arm64:{versions[1]}',
file=sys.stderr,
)
return(1)
major, minor, micro = mo.group(1), mo.group(2), mo.group(3)
print(f'Ceph version: {major}.{minor}.{micro}', file=sys.stderr)
# check that ceph sha1 of two arch images matches
paths_with_tags = [f'{p}:{t}' for (p, t) in zip(repopaths, tags)]
info = [get_image_inspect(p) for p in paths_with_tags]
sha1s = [get_sha1(i) for i in info]
if sha1s[0] != sha1s[1]:
print(
f'sha1 mismatch: amd64: {sha1s[0]} arm64: {sha1s[1]}',
file=sys.stderr,
)
builddate = [i['Created'] for i in info]
print(
f'Build dates: amd64: {builddate[0]} arm64: {builddate[1]}',
file=sys.stderr,
)
return(1)
# create manifest list image with the standard list of tags
# ignore failure on manifest rm
run_command(f'podman manifest rm {LOCALMANIFEST}')
run_command_show_failure(f'podman manifest create {LOCALMANIFEST}')
for p in paths_with_tags:
run_command_show_failure(f'podman manifest add m {p}')
base = f'{manifest_host}/{manifest_repo}'
for t in (
f'v{major}',
f'v{major}.{minor}',
f'v{major}.{minor}.{micro}',
f'v{major}.{minor}.{micro}-{datetime.today().strftime("%Y%m%d")}',
):
if sysargs.dry_run:
print(f'skipping podman manifest push {LOCALMANIFEST} {base}:{t}')
else:
run_command_show_failure(
f'podman manifest push {LOCALMANIFEST} {base}:{t}')
def promote(sysargs):
manifest_host = os.environ.get('MANIFEST_HOST', 'quay.ceph.io')
manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/prerelease')
release_manifest_host = os.environ.get('RELEASE_MANIFEST_HOST', 'quay.io')
release_manifest_repo = os.environ.get('RELEASE_MANIFEST_REPO', 'ceph/ceph')
dump_vars(
('manifest_host',
'manifest_repo',
'release_manifest_host',
'release_manifest_repo',
),
locals())
manifest_path = f'{manifest_host}/{manifest_repo}'
release_path = f'{release_manifest_host}/{release_manifest_repo}'
latest_tag = get_latest_tag(manifest_path)
all_tags = get_all_matching_digest_tags(manifest_path, latest_tag)
copypaths = list()
for t in all_tags:
from_path = f'{manifest_path}:{t}'
to_path = f'{release_path}:{t}'
copypaths.append((from_path, to_path))
if sysargs.dry_run:
for f, t in copypaths:
print(f'dry-run: Would copy: {f} -> {t}')
return(0)
for f, t in copypaths:
print(f'Will copy: {f} -> {t}')
for f, t in copypaths:
run_command_show_failure(f'skopeo copy --multi-arch=all docker://{f} docker://{t}')
def main():
args = parse_args()
if args.promote:
promote(args)
else:
build_prerelease(args)
if (__name__ == '__main__'):
sys.exit(main())
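Example invocations of the script above (a sketch; the file name make-manifest-list.py and the environment overrides are assumptions — the script's defaults are listed in its header comment):
    # build the prerelease manifest list without pushing anything
    ./make-manifest-list.py --dry-run
    # promote the newest prerelease manifest list to the release repo
    RELEASE_MANIFEST_HOST=quay.io RELEASE_MANIFEST_REPO=ceph/ceph ./make-manifest-list.py --promote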


@ -1215,7 +1215,7 @@ exemplary implementations.
Summary
-------
Ceph Storage Clusters are dynamic--like a living organism. Whereas, many storage
Ceph Storage Clusters are dynamic--like a living organism. Although many storage
appliances do not fully utilize the CPU and RAM of a typical commodity server,
Ceph does. From heartbeats, to peering, to rebalancing the cluster or
recovering from faults, Ceph offloads work from clients (and from a centralized


@ -9,3 +9,48 @@ Logical volume name format is vg/lv. Fails if OSD has already got attached DB.
Attach vgname/lvname as a DB volume to OSD 1::
ceph-volume lvm new-db --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db
Reversing BlueFS Spillover to Slow Devices
------------------------------------------
Under certain circumstances, OSD RocksDB databases spill onto slow storage and
the Ceph cluster returns specifics regarding BlueFS spillover warnings. ``ceph
health detail`` returns these spillover warnings. Here is an example of a
spillover warning::
osd.76 spilled over 128 KiB metadata from 'db' device (56 GiB used of 60 GiB) to slow device
To move this DB metadata from the slower device to the faster device, take the
following steps:
#. Expand the database's logical volume (LV):
.. prompt:: bash #
lvextend -l ${size} ${lv}/${db} ${ssd_dev}
#. Stop the OSD:
.. prompt:: bash #
cephadm unit --fsid $cid --name osd.${osd} stop
#. Run the ``bluefs-bdev-expand`` command:
.. prompt:: bash #
cephadm shell --fsid $cid --name osd.${osd} -- ceph-bluestore-tool bluefs-bdev-expand --path /var/lib/ceph/osd/ceph-${osd}
#. Run the ``bluefs-bdev-migrate`` command:
.. prompt:: bash #
cephadm shell --fsid $cid --name osd.${osd} -- ceph-bluestore-tool bluefs-bdev-migrate --path /var/lib/ceph/osd/ceph-${osd} --devs-source /var/lib/ceph/osd/ceph-${osd}/block --dev-target /var/lib/ceph/osd/ceph-${osd}/block.db
#. Restart the OSD:
.. prompt:: bash #
cephadm unit --fsid $cid --name osd.${osd} start
.. note:: *The above procedure was developed by Chris Dunlop on the [ceph-users] mailing list, and can be seen in its original context here:* `[ceph-users] Re: Fixing BlueFS spillover (pacific 16.2.14) <https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/message/POPUFSZGXR3P2RPYPJ4WJ4HGHZ3QESF6/>`_
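The steps above use a few shell variables; a minimal way to set them and to confirm which OSDs are affected is sketched below (the osd id 76 is just the one from the example warning):
    ceph health detail | grep -i spillover   # list OSDs reporting BlueFS spillover
    cid=$(ceph fsid)                         # cluster fsid used by the cephadm commands above
    osd=76                                   # OSD id taken from the warning, e.g. osd.76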


@ -61,6 +61,12 @@ For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` f
ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv
Starting with Ceph Squid, you can opt for TPM2 token enrollment for the created LUKS2 devices with the ``--with-tpm`` flag:
.. prompt:: bash #
ceph-volume lvm prepare --bluestore --dmcrypt --with-tpm --data vg/lv
If a ``block.db`` device or a ``block.wal`` device is needed, it can be
specified with ``--block.db`` or ``--block.wal``. These can be physical
devices, partitions, or logical volumes. ``block.db`` and ``block.wal`` are


@ -1,8 +1,8 @@
.. _cephadm_deploying_new_cluster:
============================
Deploying a new Ceph cluster
============================
==========================================
Using cephadm to Deploy a New Ceph Cluster
==========================================
Cephadm creates a new Ceph cluster by bootstrapping a single
host, expanding the cluster to encompass any additional hosts, and
@ -24,6 +24,10 @@ Requirements
Any modern Linux distribution should be sufficient. Dependencies
are installed automatically by the bootstrap process below.
See `Docker Live Restore <https://docs.docker.com/engine/daemon/live-restore/>`_
for an optional feature that allows restarting Docker Engine without restarting
all running containers.
See the section :ref:`Compatibility With Podman
Versions<cephadm-compatibility-with-podman>` for a table of Ceph versions that
are compatible with Podman. Not every version of Podman is compatible with


@ -375,7 +375,7 @@ One or more hosts have failed the basic cephadm host check, which verifies
that (1) the host is reachable and cephadm can be executed there, and (2)
that the host satisfies basic prerequisites, like a working container
runtime (podman or docker) and working time synchronization.
If this test fails, cephadm will no be able to manage services on that host.
If this test fails, cephadm will not be able to manage services on that host.
You can manually run this check by running the following command:
@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster
# For each host:
cephadm rm-cluster --force --zap-osds --fsid <fsid>
Replacing a device
==================
The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD.
Previously, this process required manual intervention at various stages.
With this new command, all necessary operations are performed automatically, streamlining the replacement process
and improving the overall user experience.
.. note:: This only supports LVM-based deployed OSD(s)
.. prompt:: bash #
ceph orch device replace <host> <device-path>
If the device being replaced is shared by multiple OSDs (for example, a DB/WAL device shared by several OSDs), the orchestrator will warn you.
.. prompt:: bash #
[ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd
Error EINVAL: /dev/vdd is a shared device.
Replacing /dev/vdd implies destroying OSD(s): ['0', '1'].
Please, *be very careful*, this can be a very dangerous operation.
If you know what you are doing, pass --yes-i-really-mean-it
If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``.
.. prompt:: bash #
[ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it
Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced.
``cephadm`` will have ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSDs as ``destroyed``, so that the
existing OSD IDs are preserved:
.. prompt:: bash #
[ceph: root@ceph-1 /]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 0.97659 root default
-3 0.97659 host devel-1
0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000
1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000
2 hdd 0.19530 osd.2 up 1.00000 1.00000
3 hdd 0.19530 osd.3 up 1.00000 1.00000
The device being replaced is then marked as ``being replaced``, which prevents ``cephadm`` from redeploying the OSDs too quickly:
.. prompt:: bash #
[ceph: root@ceph-1 /]# ceph orch device ls
HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS
osd-1 /dev/vdb hdd 200G Yes 13s ago
osd-1 /dev/vdc hdd 200G Yes 13s ago
osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced
osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
If for any reason you need to clear the 'device replace header' on a device, then you can use ``ceph orch device replace <host> <device> --clear``:
.. prompt:: bash #
[ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear
Replacement header cleared on /dev/vdk
[ceph: root@devel-1 /]#
After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``).


@ -355,6 +355,8 @@ Or in YAML:
* See :ref:`orchestrator-host-labels`
.. _cephadm-services-placement-by-pattern-matching:
Placement by pattern matching
-----------------------------


@ -345,7 +345,7 @@ definition and management of the embedded Prometheus service. The endpoint liste
``https://<mgr-ip>:8765/sd/`` (the port is
configurable through the variable ``service_discovery_port``) and returns scrape target
information in `http_sd_config format
<https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config/>`_
<https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config>`_
Customers with external monitoring stack can use `ceph-mgr` service discovery endpoint
to get scraping configuration. Root certificate of the server can be obtained by the


@ -84,6 +84,39 @@ information about interacting with these LEDs, refer to :ref:`devices`.
The current release of `libstoragemgmt`_ (1.8.8) supports SCSI, SAS, and SATA based
local disks only. There is no official support for NVMe devices (PCIe)
Retrieve Exact Size of Block Devices
====================================
Run a command of the following form to discover the exact size of a block
device. The value returned here is used by the orchestrator when comparing high
and low values:
.. prompt:: bash #
cephadm shell ceph-volume inventory </dev/sda> --format json | jq .sys_api.human_readable_size
The exact size in GB is the size reported in TB, multiplied by 1000.
Example
-------
The following provides a specific example of this command based upon the
general form of the command above:
.. prompt:: bash #
cephadm shell ceph-volume inventory /dev/sdc --format json | jq .sys_api.human_readable_size
::
"3.64 TB"
This means that the exact device size is 3.64 * 1000, or 3640GB.
This procedure was developed by Frédéric Nass. See `this thread on the
[ceph-users] mailing list
<https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/message/5BAAYFCQAZZDRSNCUPCVBNEPGJDARRZA/>`_
for discussion of this matter.
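The arithmetic above (value in TB multiplied by 1000) can be scripted when the figure in GB is needed for a spec file; a sketch, assuming jq, awk, and bc are available in the shell:
    size_tb=$(cephadm shell ceph-volume inventory /dev/sdc --format json | jq -r .sys_api.human_readable_size | awk '{print $1}')
    echo "$size_tb * 1000" | bc              # 3.64 -> 3640.00 (GB)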
.. _cephadm-deploy-osds:
Deploy OSDs
@ -445,22 +478,27 @@ for that OSD and also set a specific memory target. For example,
Advanced OSD Service Specifications
===================================
:ref:`orchestrator-cli-service-spec`\s of type ``osd`` are a way to describe a
cluster layout, using the properties of disks. Service specifications give the
user an abstract way to tell Ceph which disks should turn into OSDs with which
configurations, without knowing the specifics of device names and paths.
:ref:`orchestrator-cli-service-spec`\s of type ``osd`` provide a way to use the
properties of disks to describe a Ceph cluster's layout. Service specifications
are an abstraction used to tell Ceph which disks it should transform into OSDs
and which configurations to apply to those OSDs.
:ref:`orchestrator-cli-service-spec`\s make it possible to target these disks
for transformation into OSDs even when the Ceph cluster operator does not know
the specific device names and paths associated with those disks.
Service specifications make it possible to define a yaml or json file that can
be used to reduce the amount of manual work involved in creating OSDs.
:ref:`orchestrator-cli-service-spec`\s make it possible to define a ``.yaml``
or ``.json`` file that can be used to reduce the amount of manual work involved
in creating OSDs.
.. note::
It is recommended that advanced OSD specs include the ``service_id`` field
set. The plain ``osd`` service with no service id is where OSDs created
using ``ceph orch daemon add`` or ``ceph orch apply osd --all-available-devices``
are placed. Not including a ``service_id`` in your OSD spec would mix
the OSDs from your spec with those OSDs and potentially overwrite services
specs created by cephadm to track them. Newer versions of cephadm will even
block creation of advanced OSD specs without the service_id present
We recommend that advanced OSD specs include the ``service_id`` field set.
OSDs created using ``ceph orch daemon add`` or ``ceph orch apply osd
--all-available-devices`` are placed in the plain ``osd`` service. Failing
to include a ``service_id`` in your OSD spec causes the Ceph cluster to mix
the OSDs from your spec with those OSDs, which can potentially result in the
overwriting of service specs created by ``cephadm`` to track them. Newer
versions of ``cephadm`` will even block creation of advanced OSD specs that
do not include the ``service_id``.
For example, instead of running the following command:
@ -468,8 +506,8 @@ For example, instead of running the following command:
ceph orch daemon add osd *<host>*:*<path-to-device>*
for each device and each host, we can define a yaml or json file that allows us
to describe the layout. Here's the most basic example.
for each device and each host, we can define a ``.yaml`` or ``.json`` file that
allows us to describe the layout. Here is the most basic example:
Create a file called (for example) ``osd_spec.yml``:
@ -487,17 +525,18 @@ This means :
#. Turn any available device (ceph-volume decides what 'available' is) into an
OSD on all hosts that match the glob pattern '*'. (The glob pattern matches
against the registered hosts from `host ls`) A more detailed section on
host_pattern is available below.
against the registered hosts from `ceph orch host ls`) See
:ref:`cephadm-services-placement-by-pattern-matching` for more on using
``host_pattern``-matching to turn devices into OSDs.
#. Then pass it to `osd create` like this:
#. Pass ``osd_spec.yml`` to ``osd create`` by using the following command:
.. prompt:: bash [monitor.1]#
ceph orch apply -i /path/to/osd_spec.yml
This instruction will be issued to all the matching hosts, and will deploy
these OSDs.
This instruction is issued to all the matching hosts, and will deploy these
OSDs.
Setups more complex than the one specified by the ``all`` filter are
possible. See :ref:`osd_filters` for details.
@ -666,6 +705,21 @@ This example would deploy all OSDs with encryption enabled.
all: true
encrypted: true
Ceph Squid and later support TPM2 token enrollment for LUKS2 devices.
You can add the `tpm2` option to your OSD spec:
.. code-block:: yaml
service_type: osd
service_id: example_osd_spec_with_tpm2
placement:
host_pattern: '*'
spec:
data_devices:
all: true
encrypted: true
tpm2: true
See a full list in the DriveGroupSpecs
.. py:currentmodule:: ceph.deployment.drive_group
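To apply a spec like the tpm2 example above, save it to a file and feed it to the orchestrator; a sketch (the file name is illustrative, and --dry-run is used first to preview the OSDs that would be created):
    ceph orch apply -i osd_spec_with_tpm2.yml --dry-run
    ceph orch apply -i osd_spec_with_tpm2.yml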


@ -26,7 +26,7 @@ Samba Containers with the following command:
.. prompt:: bash #
orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ...
ceph orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ...
There are a number of additional parameters that the command accepts. See
the Service Specification for a description of these options.


@ -131,7 +131,21 @@ doesn't use ``cephadm shell``) to a version compatible with the new version.
Potential problems
==================
There are a few health alerts that can arise during the upgrade process.
Error: ENOENT: Module not found
-------------------------------
The message ``Error ENOENT: Module not found`` appears in response to the command ``ceph orch upgrade status`` if the orchestrator has crashed:
.. prompt:: bash #
ceph orch upgrade status
::
Error ENOENT: Module not found
This is possibly caused by invalid JSON in a mgr config-key. See `Redmine tracker Issue #67329 <https://tracker.ceph.com/issues/67329>`_ and `the discussion on the [ceph-users] mailing list <https://www.spinics.net/lists/ceph-users/msg83667.html>`_.
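If you want to inspect the mgr config-keys for malformed JSON before following the tracker guidance above, something along these lines can help (a sketch; which key is corrupt depends on the specific failure, so <suspect-key> is a placeholder):
    ceph config-key ls | grep mgr
    ceph config-key get <suspect-key> | python3 -m json.tool   # fails loudly on invalid JSON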
UPGRADE_NO_STANDBY_MGR
----------------------


@ -53,8 +53,7 @@ the MDS server. Even if a single MDS daemon is unable to fully utilize the
hardware, it may be desirable later on to start more active MDS daemons on the
same node to fully utilize the available cores and memory. Additionally, it may
become clear with workloads on the cluster that performance improves with
multiple active MDS on the same node rather than over-provisioning a single
MDS.
multiple active MDS on the same node rather than a single overloaded MDS.
Finally, be aware that CephFS is a highly-available file system by supporting
standby MDS (see also :ref:`mds-standby`) for rapid failover. To get a real


@ -209,3 +209,70 @@ cache. The limit is configured via:
It is not recommended to set this value above 5M but it may be helpful with
some workloads.
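If you do decide to raise the limit, it can be changed at runtime with ``ceph
config set``. A minimal sketch (the value shown is only an example, not a
recommendation):

.. prompt:: bash #

ceph config set mds mds_max_caps_per_client 2000000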
Dealing with "clients failing to respond to cache pressure" messages
--------------------------------------------------------------------
Every second (or every interval set by the ``mds_cache_trim_interval``
configuration parameter), the MDS runs the "cache trim" procedure. One of the
steps of this procedure is "recall client state". During this step, the MDS
checks every client (session) to determine whether it needs to recall caps.
If any of the following are true, then the MDS needs to recall caps:
1. the cache is full (the ``mds_cache_memory_limit`` has been exceeded) and
needs some inodes to be released
2. the client exceeds ``mds_max_caps_per_client`` (1M by default)
3. the client is inactive
To determine whether a client (a session) is inactive, the session's
``cache_liveness`` parameter is checked and compared with the value::
(num_caps >> mds_session_cache_liveness_magnitude)
where ``mds_session_cache_liveness_magnitude`` is a config param (``10`` by
default). If ``cache_liveness`` is smaller than this calculated value, the
session is considered inactive and the MDS sends a "recall caps" request for
all cached caps (the actual recall value is ``num_caps -
mds_min_caps_per_client(100)``).
Under certain circumstances, many "recall caps" requests can be sent so quickly
that the health warning is generated: "clients failing to respond to cache
pressure". If the client does not release the caps fast enough, the MDS repeats
the "recall caps" request one second later. This means that the MDS will send
"recall caps" again and again. The "total" counter of "recall caps" for the
session will grow and grow, and will eventually exceed the "mon warning limit".
A throttling mechanism, controlled by the ``mds_recall_max_decay_threshold``
parameter (126K by default), is available for reducing the rate of "recall
caps" counter growth, but sometimes it is not enough to slow the "recall caps"
counter's growth rate. If altering the ``mds_recall_max_decay_threshold`` value
does not sufficiently reduce the rate of the "recall caps" counter's growth,
decrease ``mds_recall_max_caps`` incrementally until the "clients failing to
respond to cache pressure" messages no longer appear in the logs.
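Both settings can be adjusted at runtime with ``ceph config set``. A minimal
sketch (the values are examples only, not recommendations):

.. prompt:: bash #

ceph config set mds mds_recall_max_decay_threshold 262144
ceph config set mds mds_recall_max_caps 10000

Lower ``mds_recall_max_caps`` gradually and check whether the warnings stop
appearing before lowering it further.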
Example Scenario
~~~~~~~~~~~~~~~~
Here is an example. A client has 20k caps cached. At some point the MDS decides
that the client is inactive (because the session's ``cache_liveness`` value is
low). It starts asking the client to release caps down to the
``mds_min_caps_per_client`` value (100 by default). Every second it sends a
recall_caps request asking the client to release ``caps_num - mds_min_caps_per_client``
caps (but not more than ``mds_recall_max_caps``, which is 30k by default). The
client starts releasing caps, but does so at a rate of (for example) only 100
caps per second.
So in the first second the MDS sends recall_caps = 20k - 100, in the second
second recall_caps = (20k - 100) - 100, in the third second recall_caps =
(20k - 200) - 100, and so on. Every time it sends recall_caps, it also updates
the session's recall_caps value, which tracks how many caps were requested for
recall over the last minute. This counter therefore grows quickly, eventually
exceeding ``mds_recall_warning_threshold`` (128K by default), and Ceph starts to
report the "failing to respond to cache pressure" warning in the status. If
``mds_recall_max_caps`` is then set to 3K, the MDS sends only 3K recall_caps per
second, and the maximum value the session's recall_caps counter can reach (if
the MDS sends 3K every second for at least one minute) is 60 * 3K = 180K. It is
therefore still possible to exceed ``mds_recall_warning_threshold``, but only if
a client fails to "respond" for a long time, which is usually not the case.

View File

@ -24,7 +24,7 @@ This will mount the default ceph filesystem using the drive letter ``x``.
If ``ceph.conf`` is placed at the default location, which is
``%ProgramData%\ceph\ceph.conf``, then this argument becomes optional.
The ``-l`` argument also allows using an empty folder as a mountpoint
The ``-l`` argument also allows using an empty folder as a mount point
instead of a drive letter.
The uid and gid used for mounting the filesystem default to 0 and may be
@ -75,7 +75,7 @@ like so::
ceph-dokan.exe unmap -l x
Note that when unmapping Ceph filesystems, the exact same mountpoint argument
Note that when unmapping Ceph filesystems, the exact same mount point argument
must be used as when the mapping was created.
Limitations

View File

@ -120,7 +120,9 @@ system, run a command of the following form:
.. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``.
This distinguishes them from "monitor commands", which are prefixed with ``fs
mirror``. Be sure (in this context) to use module commands.
mirror``. Enabling mirroring by using monitor commands will result in the mirror daemon
entering the "failed" state due to the absence of the `cephfs_mirror` index object.
So be sure (in this context) to use module commands.
To disable mirroring for a given file system, run a command of the following form:
@ -340,8 +342,9 @@ command is of format `filesystem-name@filesystem-id peer-uuid`::
"last_synced_snap": {
"id": 120,
"name": "snap1",
"sync_duration": 0.079997898999999997,
"sync_time_stamp": "274900.558797s"
"sync_duration": 3,
"sync_time_stamp": "274900.558797s",
"sync_bytes": 52428800
},
"snaps_synced": 2,
"snaps_deleted": 0,
@ -359,6 +362,32 @@ A directory can be in one of the following states::
- `syncing`: The directory is currently being synchronized
- `failed`: The directory has hit upper limit of consecutive failures
When a directory is currently being synchronized, the mirror daemon marks it as `syncing` and
`fs mirror peer status` shows the snapshot being synchronized under the `current_syncing_snap`::
$ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
{
"/d0": {
"state": "syncing",
"current_syncing_snap": {
"id": 121,
"name": "snap2"
},
"last_synced_snap": {
"id": 120,
"name": "snap1",
"sync_duration": 3,
"sync_time_stamp": "274900.558797s",
"sync_bytes": 52428800
},
"snaps_synced": 2,
"snaps_deleted": 0,
"snaps_renamed": 0
}
}
The mirror daemon marks it back to `idle`, when the syncing completes.
When a directory experiences a configured number of consecutive synchronization failures, the
mirror daemon marks it as `failed`. Synchronization for these directories is retried.
By default, the number of consecutive failures before a directory is marked as failed
@ -374,12 +403,13 @@ E.g., adding a regular file for synchronization would result in failed status::
"/d0": {
"state": "idle",
"last_synced_snap": {
"id": 120,
"name": "snap1",
"sync_duration": 0.079997898999999997,
"sync_time_stamp": "274900.558797s"
"id": 121,
"name": "snap2",
"sync_duration": 5,
"sync_time_stamp": "500900.600797s",
"sync_bytes": 78643200
},
"snaps_synced": 2,
"snaps_synced": 3,
"snaps_deleted": 0,
"snaps_renamed": 0
},
@ -395,9 +425,110 @@ This allows a user to add a non-existent directory for synchronization. The mirr
will mark such a directory as failed and retry (less frequently). When the directory is
created, the mirror daemon will clear the failed state upon successful synchronization.
Adding a new snapshot or a new directory manually in the .snap directory of the
remote filesystem will result in the corresponding configured directory being marked as failed.
In the remote filesystem::
$ ceph fs subvolume snapshot create cephfs subvol1 snap2 group1
or
$ mkdir /d0/.snap/snap2
$ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
{
"/d0": {
"state": "failed",
"failure_reason": "snapshot 'snap2' has invalid metadata",
"last_synced_snap": {
"id": 120,
"name": "snap1",
"sync_duration": 3,
"sync_time_stamp": "274900.558797s"
},
"snaps_synced": 2,
"snaps_deleted": 0,
"snaps_renamed": 0
},
"/f0": {
"state": "failed",
"snaps_synced": 0,
"snaps_deleted": 0,
"snaps_renamed": 0
}
}
When the snapshot or the directory is removed from the remote filesystem, the mirror daemon will
clear the failed state upon successful synchronization of the pending snapshots, if any.
.. note:: Treat the remote filesystem as read-only. Nothing is inherently enforced by CephFS,
but with the right MDS caps, users will not be able to snapshot directories in the
remote file system.
When mirroring is disabled, the respective `fs mirror status` command for the file system
will not show up in command help.
Metrics
-------
CephFS exports mirroring metrics as :ref:`Labeled Perf Counters`, which are consumed by the OCP/ODF Dashboard to provide monitoring of geo-replication. These metrics can be used to measure the progress of cephfs_mirror syncing. CephFS exports the following mirroring metrics, which are displayed using the ``counter dump`` command.
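A minimal sketch of retrieving these counters from a running mirror daemon
through its admin socket (the socket path follows the same convention as the
``fs mirror peer status`` examples above):

.. prompt:: bash $

ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok counter dump

The output is JSON and contains the labeled counters listed in the tables below.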
.. list-table:: Mirror Status Metrics
:widths: 25 25 75
:header-rows: 1
* - Name
- Type
- Description
* - mirroring_peers
- Gauge
- The number of peers involved in mirroring
* - directory_count
- Gauge
- The total number of directories being synchronized
* - mirrored_filesystems
- Gauge
- The total number of filesystems which are mirrored
* - mirror_enable_failures
- Counter
- Enable mirroring failures
.. list-table:: Replication Metrics
:widths: 25 25 75
:header-rows: 1
* - Name
- Type
- Description
* - snaps_synced
- Counter
- The total number of snapshots successfully synchronized
* - sync_bytes
- Counter
- The total bytes being synchronized
* - sync_failures
- Counter
- The total number of failed snapshot synchronizations
* - snaps_deleted
- Counter
- The total number of snapshots deleted
* - snaps_renamed
- Counter
- The total number of snapshots renamed
* - avg_sync_time
- Gauge
- The average time taken by all snapshot synchronizations
* - last_synced_start
- Gauge
- The sync start time of the last synced snapshot
* - last_synced_end
- Gauge
- The sync end time of the last synced snapshot
* - last_synced_duration
- Gauge
- The time duration of the last synchronization
* - last_synced_bytes
- Counter
- The total bytes being synchronized for the last synced snapshot
Configuration Options
---------------------
@ -410,6 +541,7 @@ Configuration Options
.. confval:: cephfs_mirror_retry_failed_directories_interval
.. confval:: cephfs_mirror_restart_mirror_on_failure_interval
.. confval:: cephfs_mirror_mount_timeout
.. confval:: cephfs_mirror_perf_stats_prio
Re-adding Peers
---------------

View File

@ -106,6 +106,8 @@ If quotas are not enabled or if no quota is set on the mounted sub-directory,
then the overall usage of the file system will be reported irrespective of the
value of this setting.
.. _cephfs-layout-and-quota-restriction:
Layout and Quota restriction (the 'p' flag)
===========================================
@ -274,7 +276,7 @@ Client ``someuser`` is authorized for only one file system:
caps mon = "allow r"
caps osd = "allow rw tag cephfs data=cephfs"
Mounting ``cephfs1`` on the already-created mountpoint ``/mnt/cephfs1`` with
Mounting ``cephfs1`` on the already-created mount point ``/mnt/cephfs1`` with
``someuser`` works:
.. prompt:: bash #

View File

@ -6,6 +6,9 @@ File layouts
The layout of a file controls how its contents are mapped to Ceph RADOS objects. You can
read and write a file's layout using *virtual extended attributes* or xattrs.
Clients must use the ``p`` flag when writing a file's layout. See :ref:`Layout
and Quota restriction (the 'p' flag) <cephfs-layout-and-quota-restriction>`.
The name of the layout xattrs depends on whether a file is a regular file or a directory. Regular
files' layout xattrs are called ``ceph.file.layout``, whereas directories' layout xattrs are called
``ceph.dir.layout``. Where subsequent examples refer to ``ceph.file.layout``, substitute ``dir`` as appropriate
@ -20,26 +23,38 @@ Layout fields
-------------
pool
String, giving ID or name. String can only have characters in the set [a-zA-Z0-9\_-.]. Which RADOS pool a file's data objects will be stored in.
This is a string and contains either an ID or a name. Strings may contain
only characters in the set ``[a-zA-Z0-9\_-.]``. This determines the RADOS
pool that stores a file's data objects.
pool_id
String of digits. This is the system assigned pool id for the RADOS pool whenever it is created.
This is a string of digits. This is the pool ID that was assigned by Ceph
at the time of the creation of the RADOS pool.
pool_name
String, given name. This is the user defined name for the RADOS pool whenever user creates it.
This is a string. This is the name of the RADOS pool as defined by the user
when the pool was created.
pool_namespace
String with only characters in the set [a-zA-Z0-9\_-.]. Within the data pool, which RADOS namespace the objects will
be written to. Empty by default (i.e. default namespace).
This is a string containing only characters in the set ``[a-zA-Z0-9\_-.]``.
This determines which RADOS namespace within the data pool that the objects
will be written to.
Empty by default (i.e. default namespace).
stripe_unit
Integer in bytes. The size (in bytes) of a block of data used in the RAID 0 distribution of a file. All stripe units for a file have equal size. The last stripe unit is typically incomplete, i.e. it represents the data at the end of the file as well as unused “space” beyond it up to the end of the fixed stripe unit size.
This is an integer. The size (in bytes) of a block of data used in the
distribution of a file. All stripe units for a file have equal size. The
last stripe unit is typically only partly full of data: it holds file data
through EOF as well as padding that fills the balance of the fixed stripe
unit size.
stripe_count
Integer. The number of consecutive stripe units that constitute a RAID 0 “stripe” of file data.
Integer. The number of consecutive stripe units that constitute a RAID 0
“stripe” of file data.
object_size
Integer in bytes. File data is chunked into RADOS objects of this size.
Integer. The size of the object in bytes. File data is chunked into RADOS
objects of this size.
.. tip::

View File

@ -14,12 +14,12 @@ abstractions:
* FS volumes, an abstraction for CephFS file systems
* FS subvolumes, an abstraction for independent CephFS directory trees
* FS subvolume groups, an abstraction for a directory level higher than FS
subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`)
across a set of subvolumes
* FS subvolumes, an abstraction for independent CephFS directory trees
Possible use-cases for the export abstractions:
* FS subvolumes used as Manila shares or CSI volumes
@ -276,7 +276,7 @@ Use a command of the following form to create a subvolume:
.. prompt:: bash #
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated] [--earmark <earmark>]
The command succeeds even if the subvolume already exists.
@ -289,6 +289,33 @@ The subvolume can be created in a separate RADOS namespace by specifying the
default subvolume group with an octal file mode of ``755``, a uid of its
subvolume group, a gid of its subvolume group, a data pool layout of its parent
directory, and no size limit.
You can also assign an earmark to a subvolume using the ``--earmark`` option.
The earmark is a unique identifier that tags the subvolume for specific purposes,
such as NFS or SMB services. By default, no earmark is set, allowing for flexible
assignment based on administrative needs. An empty string ("") can be used to remove
any existing earmark from a subvolume.
The earmarking mechanism ensures that subvolumes are correctly tagged and managed,
helping to avoid conflicts and ensuring that each subvolume is associated
with the intended service or use case.
Valid Earmarks
~~~~~~~~~~~~~~~~~~~~
- **For NFS:**
- The valid earmark format is the top-level scope: ``'nfs'``.
- **For SMB:**
- The valid earmark formats are:
- The top-level scope: ``'smb'``.
- The top-level scope with an intra-module level scope: ``'smb.cluster.{cluster_id}'``, where ``cluster_id`` is a short string uniquely identifying the cluster.
- Example without intra-module scope: ``smb``
- Example with intra-module scope: ``smb.cluster.cluster_1``
.. note:: If you are changing an earmark from one scope to another (e.g., from nfs to smb or vice versa),
be aware that user permissions and ACLs associated with the previous scope might still apply. Ensure that
any necessary permissions are updated as needed to maintain proper access control.
Removing a subvolume
~~~~~~~~~~~~~~~~~~~~
@ -418,6 +445,7 @@ The output format is JSON and contains the following fields.
* ``pool_namespace``: RADOS namespace of the subvolume
* ``features``: features supported by the subvolume
* ``state``: current state of the subvolume
* ``earmark``: earmark of the subvolume
If a subvolume has been removed but its snapshots have been retained, the
output contains only the following fields.
@ -522,6 +550,33 @@ subvolume using the metadata key:
Using the ``--force`` flag allows the command to succeed when it would
otherwise fail (if the metadata key did not exist).
Getting earmark of a subvolume
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use a command of the following form to get the earmark of a subvolume:
.. prompt:: bash #
ceph fs subvolume earmark get <vol_name> <subvol_name> [--group_name <subvol_group_name>]
Setting earmark of a subvolume
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use a command of the following form to set the earmark of a subvolume:
.. prompt:: bash #
ceph fs subvolume earmark set <vol_name> <subvol_name> [--group_name <subvol_group_name>] <earmark>
Removing earmark of a subvolume
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use a command of the following form to remove the earmark of a subvolume:
.. prompt:: bash #
ceph fs subvolume earmark rm <vol_name> <subvol_name> [--group_name <subvol_group_name>]
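For illustration, here is an end-to-end example that follows the forms above,
assuming a volume ``cephfs``, a subvolume ``subvol1``, and a subvolume group
``group1`` (all hypothetical names):

.. prompt:: bash #

ceph fs subvolume earmark set cephfs subvol1 --group_name group1 smb.cluster.cluster_1
ceph fs subvolume earmark get cephfs subvol1 --group_name group1
ceph fs subvolume earmark rm cephfs subvol1 --group_name group1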
Creating a Snapshot of a Subvolume
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -758,16 +813,40 @@ Here is an example of an ``in-progress`` clone:
::
{
"status": {
"state": "in-progress",
"source": {
"volume": "cephfs",
"subvolume": "subvol1",
"snapshot": "snap1"
}
"status": {
"state": "in-progress",
"source": {
"volume": "cephfs",
"subvolume": "subvol1",
"snapshot": "snap1"
},
"progress_report": {
"percentage cloned": "12.24%",
"amount cloned": "376M/3.0G",
"files cloned": "4/6"
}
}
}
A progress report is also printed in the output when the clone is ``in-progress``.
Here the progress is reported only for the specific clone. For the collective
progress made by all ongoing clones, a progress bar is printed at the bottom of
the output of the ``ceph status`` command::
progress:
3 ongoing clones - average progress is 47.569% (10s)
[=============...............] (remaining: 11s)
If the number of clone jobs is greater than the number of cloner threads, two
progress bars are printed: one for ongoing clones (same as above) and the other
for all (ongoing+pending) clones::
progress:
4 ongoing clones - average progress is 27.669% (15s)
[=======.....................] (remaining: 41s)
Total 5 clones - average progress is 41.667% (3s)
[===========.................] (remaining: 4s)
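The per-clone report shown above can be obtained with the clone status command.
A minimal sketch, assuming a volume ``cephfs`` and a clone named ``clone1``
(hypothetical names):

.. prompt:: bash #

ceph fs clone status cephfs clone1

Run ``ceph status`` separately to see the collective progress bars described
above.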
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
Here is an example of a ``failed`` clone:
@ -1340,5 +1419,28 @@ set with this id was present in the database
$ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0
.. _disabling-volumes-plugin:
Disabling Volumes Plugin
------------------------
By default the volumes plugin is enabled and set to ``always on``. However, in
certain cases it might be appropriate to disable it. For example, when a CephFS
is in a degraded state, the volumes plugin commands may accumulate in the MGR
instead of being served, which eventually causes policy throttles to kick in
and makes the MGR unresponsive.
In this event, the volumes plugin can be disabled even though it is an
``always on`` module in the MGR. To do so, run ``ceph mgr module disable volumes
--yes-i-really-mean-it``. Note that this command disables the operations and
removes the commands of the volumes plugin, since it disables all CephFS
services on the Ceph cluster that are accessed through this plugin.
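For convenience, the same command in runnable form:

.. prompt:: bash #

ceph mgr module disable volumes --yes-i-really-mean-it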
Before resorting to a measure as drastic as this, it is a good idea to try less
drastic measures and then assess whether the file system experience has improved
as a result. One example of a less drastic measure is to disable the
asynchronous threads launched by the volumes plugin for cloning and purging trash.
.. _manila: https://github.com/openstack/manila
.. _CSI: https://github.com/ceph/ceph-csi

View File

@ -20,9 +20,11 @@ in `Mount CephFS: Prerequisites`_ page.
Synopsis
========
In general, the command to mount CephFS via FUSE looks like this::
In general, the command to mount CephFS via FUSE looks like this:
ceph-fuse {mountpoint} {options}
.. prompt:: bash #
ceph-fuse {mount point} {options}
Mounting CephFS
===============

View File

@ -109,29 +109,40 @@ Backward Compatibility
======================
The old syntax is supported for backward compatibility.
To mount CephFS with the kernel driver::
To mount CephFS with the kernel driver, run the following commands:
mkdir /mnt/mycephfs
mount -t ceph :/ /mnt/mycephfs -o name=admin
.. prompt:: bash #
The key-value argument right after option ``-o`` is CephX credential;
``name`` is the username of the CephX user we are using to mount CephFS.
mkdir /mnt/mycephfs
mount -t ceph :/ /mnt/mycephfs -o name=admin
To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs::
The key-value argument right after the option ``-o`` is the CephX credential.
``name`` is the username of the CephX user that is mounting CephFS.
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
To mount a non-default FS (in this example, ``cephfs2``), run commands of the following form. These commands are to be used in cases in which the cluster
has multiple file systems:
or
.. prompt:: bash #
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting.
or
.. prompt:: bash #
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when
using the old syntax for mounting.
Unmounting CephFS
=================
To unmount the Ceph file system, use the ``umount`` command as usual::
To unmount the Ceph file system, use the ``umount`` command, as in this
example:
umount /mnt/mycephfs
.. prompt:: bash #
umount /mnt/mycephfs
.. tip:: Ensure that you are not within the file system directories before
executing this command.
@ -148,11 +159,12 @@ For example::
cephuser@.cephfs=/ /mnt/ceph ceph mon_addr=192.168.0.1:6789,noatime,_netdev 0 0
If the ``secret`` or ``secretfile`` options are not specified then the mount helper
will attempt to find a secret for the given ``name`` in one of the configured keyrings.
If the ``secret`` or ``secretfile`` options are not specified, the mount
helper will attempt to find a secret for the given ``name`` in one of the
configured keyrings.
See `User Management`_ for details on CephX user management and mount.ceph_
manual for more options it can take. For troubleshooting, see
See `User Management`_ for details on CephX user management and the mount.ceph_
manual for a list of the options it recognizes. For troubleshooting, see
:ref:`kernel_mount_debugging`.
.. _fstab: ../fstab/#kernel-driver

View File

@ -143,6 +143,14 @@ The types of damage that can be reported and repaired by File System Scrub are:
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
These above named MDS damages can be repaired by using the following command::
ceph tell mds.<fsname>:0 scrub start /path recursive, repair, force
If scrub is able to repair the damage, the corresponding entry is automatically
removed from the damage table.
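To check whether any entries remain after a repair, the damage table can be
listed. A minimal sketch, using the same MDS addressing convention as the scrub
command above:

.. prompt:: bash #

ceph tell mds.<fsname>:0 damage ls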
Evaluate strays using recursive scrub
=====================================

View File

@ -407,6 +407,12 @@ its associated key. A less drastic but half-fix is to change the osd cap for
your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs
data=....``
Disabling the Volumes Plugin
============================
In certain scenarios, the Volumes plugin may need to be disabled to prevent
compromising the rest of the Ceph cluster. For details see:
:ref:`disabling-volumes-plugin`
Reporting Issues
================

View File

@ -17,12 +17,10 @@ Key Idea
--------
For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on
readdir diff to identify changes in a directory tree. The diffs are applied to
`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to
directory in the remote file system thereby only synchronizing files that have
changed between two snapshots.
This feature is tracked here: https://tracker.ceph.com/issues/47034.
Currently, snapshot data is synchronized by bulk copying to the remote
filesystem.
@ -407,3 +405,5 @@ Feature Status
--------------
`cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule).
.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature

View File

@ -287,16 +287,13 @@ See :ref:`kubernetes-dev`
Backporting
-----------
All bugfixes should be merged to the ``main`` branch before being
backported. To flag a bugfix for backporting, make sure it has a
`tracker issue`_ associated with it and set the ``Backport`` field to a
comma-separated list of previous releases (e.g. "hammer,jewel") that you think
need the backport.
The rest (including the actual backporting) will be taken care of by the
`Stable Releases and Backports`_ team.
All bugfixes should be merged to the ``main`` branch before being backported.
To flag a bugfix for backporting, make sure it has a `tracker issue`_
associated with it and set the ``Backport`` field to a comma-separated list of
previous releases (e.g. "hammer,jewel") that you think need the backport. You
are responsible for the backporting of pull requests that you raise.
.. _`tracker issue`: http://tracker.ceph.com/
.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
Dependabot
----------

View File

@ -19,6 +19,7 @@ Contributing to Ceph: A Guide for Developers
Tests: Unit Tests <tests-unit-tests>
Tests: Integration Tests (Teuthology) <testing_integration_tests/index>
Tests: Running Tests (Locally) <running-tests-locally>
Tests: Windows <tests-windows>
Ceph Dashboard Developer Documentation (formerly HACKING.rst) <dash-devel>
Tracing Developer Documentation <jaegertracing>
Cephadm Developer Documentation <../cephadm/index>

View File

@ -3,11 +3,68 @@
Integration Tests using Teuthology Workflow
===========================================
Scheduling Test Run
-------------------
Infrastructure
--------------
Getting binaries
****************
Components:
1. `ceph-ci`_: Clone of the main Ceph repository, used for triggering Jenkins
Ceph builds for development.
2. `Ceph Jenkins`_: Responsible for triggering builds, uploading packages
to Chacra, and pushing updates about the build to Shaman.
3. `Shaman`_: UI Interface used to check build status. In its backend,
it is a REST API to query and store build information.
4. `Chacra`_: Service where packages are uploaded. The binaries uploaded
here can be downloaded and used by anyone.
5. `Teuthology CLI`_: Developers can use various Teuthology commands to schedule
and manage test runs.
6. Teuthology: This component is responsible for pushing test jobs to
the Beanstalk queue and Paddles. It also picks jobs from
the queue and runs tests.
7. Beanstalk queue: A priority queue containing all the queued jobs.
Developers typically do not need to interact with it.
8. Paddles: A backend service that stores all test run information.
Developers typically do not need to interact with it.
9. `Pulpito`_: A UI interface (for information stored in Paddles) that allows
developers to see detailed information about their scheduled tests,
including status and results.
10. Testnodes: A cluster of various machines that are used for running tests.
Developers usually schedule tests to run on `smithi`_ machines, which are
dedicated test nodes for Teuthology integration testing.
Each Teuthology test *run* contains multiple test *jobs*. Each job runs in an
environment isolated from other jobs, on a different collection of test nodes.
To test a change in Ceph, follow these steps:
1. Getting binaries - Build Ceph.
2. Scheduling Test Run:
a. About Test Suites.
b. Triggering Teuthology Tests.
c. Testing QA changes (without re-building binaries).
d. Filtering Tests.
3. Viewing Test Results:
a. Pulpito Dashboard.
b. Teuthology Archives (Reviewing Logs).
4. Killing tests.
5. Re-running tests.
Getting binaries - Build Ceph
-----------------------------
Ceph binaries must be built for your branch before you can use teuthology to run integration tests on them. Follow these steps to build the Ceph binaries:
@ -41,8 +98,44 @@ Ceph binaries must be built for your branch before you can use teuthology to run
.. _the Chacra site: https://shaman.ceph.com/api/search/?status=ready&project=ceph
Triggering Tests
****************
Naming the ceph-ci branch
*************************
Prepend your branch with your name before you push it to ceph-ci. For example,
a branch named ``feature-x`` should be named ``wip-$yourname-feature-x``, where
``$yourname`` is replaced with your name. Identifying your branch with your
name makes your branch easily findable on Shaman and Pulpito.
If you are using one of the stable branches (`quincy`, `pacific`, etc.), include
the name of that stable branch in your ceph-ci branch name.
For example, the ``feature-x`` PR branch should be named
``wip-feature-x-nautilus``. *This is not just a convention. This ensures that your branch is built in the correct environment.*
You can choose to trigger only a CentOS 9.Stream build (excluding other distros such as Ubuntu)
by adding "centos9-only" at the end of the ceph-ci branch name. For example,
``wip-$yourname-feature-centos9-only``. This yields quicker builds and saves resources
when you don't require binaries for other distros.
Delete the branch from ceph-ci when you no longer need it. If you are
logged in to GitHub, all your branches on ceph-ci can be found here:
https://github.com/ceph/ceph-ci/branches.
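A minimal sketch of pushing a renamed branch, assuming ``ceph-ci`` has been
added as a git remote pointing at the repository linked above:

.. prompt:: bash $

git remote add ceph-ci https://github.com/ceph/ceph-ci.git
git push ceph-ci feature-x:wip-$yourname-feature-x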
Scheduling Test Run
-------------------
About Test Suites
*****************
Integration tests are organized into “suites”, which are defined in ``qa/suites``
sub-directory of the Ceph repository. These test suites can be run with the teuthology-suite
command.
See `Suites Inventory`_ for a list of available suites of integration tests.
A more detailed explanation of how these test suites are defined can be found on the `Integration Test Introduction Page`_.
Triggering Teuthology Tests
***************************
After you have built Ceph binaries for your branch, you can run tests using
teuthology. This procedure explains how to run tests using teuthology.
@ -54,7 +147,10 @@ teuthology. This procedure explains how to run tests using teuthology.
ssh <username>@teuthology.front.sepia.ceph.com
This requires Sepia lab access. To request access to the Sepia lab, see:
https://ceph.github.io/sepia/adding_users/
https://ceph.github.io/sepia/adding_users/.
#. For initial setup, follow the `teuthology installation guide`_ to set up teuthology for
your user on the teuthology machine. This will enable you to run teuthology commands.
#. Run the ``teuthology-suite`` command:
@ -66,7 +162,7 @@ teuthology. This procedure explains how to run tests using teuthology.
-s fs \
-p 110 \
--filter "cephfs-shell" \
-e foo@gmail.com \
-e foo@gmail.com
The options in the above command are defined here:
@ -101,10 +197,13 @@ teuthology. This procedure explains how to run tests using teuthology.
`Pulpito`_ where the test results can be viewed.
The ``--dry-run`` option allows you to do a demo run of the ``teuthology-suite``
command without actually scheduling teuthology tests. This is helpful for checking
how many jobs, and which jobs, a command would schedule.
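A minimal sketch of a dry run, reusing the options from the example above (the
branch name is a placeholder):

.. prompt:: bash $

teuthology-suite -v \
-m smithi \
-c wip-<yourname>-feature-x \
-s fs \
--filter "cephfs-shell" \
--dry-run

No jobs are scheduled; the command only prints what would be scheduled.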
Other frequently used/useful options are ``-d`` (or ``--distro``),
``--distroversion``, ``--filter-out``, ``--timeout``, ``flavor``, ``-rerun``,
``-l`` (for limiting number of jobs) , ``-N`` (for how many times the job will
``--distro-version``, ``--filter-out``, ``--timeout``, ``flavor``, ``--rerun``,
``--limit`` (for limiting number of jobs) , ``-N`` (for how many times the job will
run), and ``--subset`` (used to reduce the number of tests that are triggered). Run
``teuthology-suite --help`` to read descriptions of these and other options.
@ -159,15 +258,15 @@ job config printed at the beginning of the teuthology job.
for the builds to finish, then triggering tests and waiting for
the test results.
About Suites and Filters
************************
Filtering Tests
***************
See `Suites Inventory`_ for a list of available suites of integration tests.
Each directory under ``qa/suites`` in the Ceph repository is an integration
test suite, and arguments appropriate to follow ``-s`` can be found there.
Test suites include combinations of many YAML files, which can result in a massive
number of jobs being scheduled for a suite. Filters can help to reduce the number
of jobs, or to schedule particular jobs within a suite.
Keywords for filtering tests can be found in
``qa/suites/<suite-name>/<subsuite-name>/tasks`` and can be used as arguments
``qa/suites/<suite-name>/<subsuite-name>/tasks`` in the Ceph repository and can be used as arguments
for ``--filter``. Each YAML file in that directory can trigger tests; using the
name of the file without its filename extension as an argument to the
``--filter`` triggers those tests.
@ -182,6 +281,8 @@ contents of the file for the ``modules`` attribute. For ``cephfs-shell.yaml``
the ``modules`` attribute is ``tasks.cephfs.test_cephfs_shell``. This means
that it triggers all tests in ``qa/tasks/cephfs/test_cephfs_shell.py``.
Read more about how to `Filter Tests by their Description`_.
Viewing Test Results
---------------------
@ -195,22 +296,26 @@ Teuthology Archives
*******************
After the tests have finished running, the log for the job can be obtained by
clicking on the job ID at the Pulpito page associated with your tests. It's
clicking on the job ID at the Pulpito run page associated with your tests. It's
more convenient to download the log and then view it rather than viewing it in
an internet browser since these logs can easily be up to 1 GB in size. It is
easier to ssh into the teuthology machine (``teuthology.front.sepia.ceph.com``)
and access the following path::
an internet browser since these logs can easily be up to 1 GB in size.
It is also possible to ssh into a `developer playground machine`_ and access the following path::
/ceph/teuthology-archive/<test-id>/<job-id>/teuthology.log
/teuthology/<run-name>/<job-id>/teuthology.log
For example: for the above test ID, the path is::
/ceph/teuthology-archive/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
/teuthology/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
This method can be used to view the log more quickly than would be possible through a browser.
In addition to ``teuthology.log``, some other files are included for debugging
purposes:
To view Ceph logs (cephadm, ceph monitors, ceph-mgr, etc.) or system logs,
remove ``teuthology.log`` from the job's teuthology log URL in the browser and then navigate
to ``remote/<machine>/log/``. System logs can be found at ``remote/<machine>/syslog/``.
Similarly, these logs can be found on developer playground machines at
``/teuthology/<test-id>/<job-id>/remote/<machine>/``.
Some other files that are included for debugging purposes:
* ``unit_test_summary.yaml``: Provides a summary of all unit test failures.
Generated (optionally) when the ``unit_test_scan`` configuration option is
@ -219,7 +324,7 @@ purposes:
* ``valgrind.yaml``: Summarizes any Valgrind errors that may occur.
.. note:: To access archives more conveniently, ``/a/`` has been symbolically
linked to ``/ceph/teuthology-archive/``. For instance, to access the previous
linked to ``/teuthology/``. For instance, to access the previous
example, we can use something like::
/a/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
@ -234,9 +339,9 @@ Here is the command that terminates jobs:
.. prompt:: bash $
teuthology-kill -r teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi
teuthology-kill -p -r teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi -m smithi -o scheduled_teuthology@teuthology
Let's call the argument passed to ``-r`` as test ID. It can be found
The argument passed to ``-r`` is the run name. It can be found
easily in the link to the Pulpito page for the tests you triggered. For
example, for the above test ID, the link is - http://pulpito.front.sepia.ceph.com/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/
@ -275,23 +380,9 @@ Following's the definition of new options introduced in this section:
'waiting'. Default value: 'fail,dead'
======================= ===============================================
Naming the ceph-ci branch
-------------------------
Prepend your branch with your name before you push it to ceph-ci. For example,
a branch named ``feature-x`` should be named ``wip-$yourname-feature-x``, where
``$yourname`` is replaced with your name. Identifying your branch with your
name makes your branch easily findable on Shaman and Pulpito.
If you are using one of the stable branches (`quincy`, `pacific`, etc.), include
the name of that stable branch in your ceph-ci branch name.
For example, the ``feature-x`` PR branch should be named
``wip-feature-x-nautilus``. *This is not just a convention. This ensures that your branch is built in the correct environment.*
Delete the branch from ceph-ci when you no longer need it. If you are
logged in to GitHub, all your branches on ceph-ci can be found here:
https://github.com/ceph/ceph-ci/branches.
.. _ceph-ci: https://github.com/ceph/ceph-ci
.. _Ceph Jenkins: https://jenkins.ceph.com/
.. _Teuthology CLI: https://docs.ceph.com/projects/teuthology/en/latest/commands/list.html
.. _Chacra: https://github.com/ceph/chacra/blob/master/README.rst
.. _Pulpito: http://pulpito.front.sepia.ceph.com/
.. _Running Your First Test: ../../running-tests-locally/#running-your-first-test
@ -299,4 +390,9 @@ https://github.com/ceph/ceph-ci/branches.
.. _Suites Inventory: ../tests-integration-testing-teuthology-intro/#suites-inventory
.. _Testing Priority: ../tests-integration-testing-teuthology-intro/#testing-priority
.. _Triggering Tests: ../tests-integration-testing-teuthology-workflow/#triggering-tests
.. _Integration Test Introduction Page: ../tests-integration-testing-teuthology-intro/#how-integration-tests-are-defined
.. _tests-sentry-developers-guide: ../tests-sentry-developers-guide/
.. _smithi: https://wiki.sepia.ceph.com/doku.php?id=hardware:smithi
.. _teuthology installation guide: https://docs.ceph.com/projects/teuthology/en/latest/INSTALL.html#installation-and-setup
.. _Filter Tests by their Description: ../tests-integration-testing-teuthology-intro/#filtering-tests-by-their-description
.. _developer playground machine: https://wiki.sepia.ceph.com/doku.php?id=devplayground

View File

@ -0,0 +1,143 @@
.. _dev-testing-windows:
=================
Testing - Windows
=================
Since Pacific, the Ceph client tools and libraries can be natively used on
Windows. This allows Windows nodes to consume Ceph without additional layers
such as iSCSI gateways or SMB shares.
A significant amount of unit tests and integration tests were ported in order
to ensure that these components continue to function properly on Windows.
Windows CI Job
==============
The `Windows CI job`_ performs the following steps for each GitHub pull request:
* spin up a Linux VM in which to build the server-side (Linux) Ceph binaries
and cross-compile the Windows (client) binaries.
* recreate the Linux VM and start a Ceph vstart cluster
* boot a Windows VM and run the Ceph tests there
`A small PowerShell framework`_ parallelizes the tests, aggregates the results
and isolates or skips certain tests that are known to be flaky.
The console output can contain compilation errors as well as the names of the
tests that failed. To get the console output of the failing tests as well as
Ceph and operating system logs, please check the build artifacts from the
Jenkins "Status" page.
.. image:: ../../images/windows_ci_status_page.png
:align: center
The Windows CI artifacts can be downloaded as a zip archive or viewed inside
the browser. Click the "artifacts" button to see the contents of the artifacts
folder.
.. image:: ../../images/windows_ci_artifacts.png
:align: center
Artifact contents:
* ``client/`` - Ceph client-side logs (Windows)
* ``eventlog/`` - Windows system logs
* ``logs/`` - Ceph logs
* ``-windows.conf`` - Ceph configuration file
* ``cluster/`` - Ceph server-side logs (Linux)
* ``ceph_logs/``
* ``journal``
* ``test_results/``
* ``out/`` - raw and xml test output grouped by the test executable
* ``test_results.html`` - aggregated test report (html)
* ``test_results.txt`` - aggregated test report (plaintext)
We're using the `subunit`_ format and associated tools to aggregate the test
results, which is especially handy when running a large number of tests in
parallel.
The aggregated test report provides a great overview of the failing tests.
Go to the end of the file to see the actual errors::
{0} unittest_mempool.mempool.bufferlist_reassign [0.000000s] ... ok
{0} unittest_mempool.mempool.bufferlist_c_str [0.006000s] ... ok
{0} unittest_mempool.mempool.btree_map_test [0.000000s] ... ok
{0} ceph_test_dokan.DokanTests.test_mount [9.203000s] ... FAILED
Captured details:
~~~~~~~~~~~~~~~~~
b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:136'
b'Expected equality of these values:'
b' wait_for_mount(mountpoint)'
b' Which is: -138'
b' 0'
b''
b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:208'
b'Expected equality of these values:'
b' ret'
b' Which is: "ceph-dokan: exit status: -22"'
b' ""'
b'Failed unmapping: Y:\\'
{0} ceph_test_dokan.DokanTests.test_mount_read_only [9.140000s] ... FAILED
The html report conveniently groups the test results by test suite (test binary).
For security reasons it isn't rendered by default but it can be downloaded and
viewed locally:
.. image:: ../../images/windows_ci_html_report.png
:align: center
Timeouts and missing test results are often an indication that a process crashed.
Note that the ceph status is printed out on the console before and after
performing the tests, which can help identify crashed services.
You may also want to check the service logs (both client and server side). Also,
be aware that the Windows "application" event log will contain entries in case
of crashed Windows processes.
Frequently asked questions
==========================
1. Why is the Windows CI job the only one that fails on my PR?
Ceph integration tests are normally performed through Teuthology on the Ceph
Lab infrastructure. These tests are triggered on-demand by the Ceph QA
team and do not run automatically for every submitted pull request.
Since the Windows CI job focuses only on the client-side Ceph components,
it can run various integration tests in a timely manner for every pull request
on GitHub. **In other words, it runs various librados, librbd and libcephfs
tests that other checks such as "make check" do not.**
For this reason, the Windows CI often catches regressions that are missed by the
other checks and would otherwise only come up through Teuthology. More often
than not, these regressions are not platform-specific and affect Linux as well.
In case of Windows CI failures, we strongly suggest checking the test results
as described above.
Be aware that the `Windows build script`_ may use different compilation flags
and ``-D`` options passed to CMake. For example, it defaults to ``Release`` mode
instead of ``Debug`` mode. At the same time, it uses a different toolchain
(``mingw-llvm``) and a separate set of `dependencies`_; make sure to bump the
versions if needed.
2. Why is the Windows CI job mandatory?
The test job was initially optional; as a result, regressions were introduced
very often.
After a time, Windows support became mature enough to make this CI job mandatory.
This significantly reduces the amount of work required to address regressions
and assures Ceph users of continued Windows support.
As said before, another great advantage is that it runs integration tests that
quickly catch regressions which often affect Linux builds as well. This spares
developers from having to wait for the full Teuthology results.
.. _Windows CI job: https://github.com/ceph/ceph-build/blob/main/ceph-windows-pull-requests/config/definitions/ceph-windows-pull-requests.yml
.. _A small PowerShell framework: https://github.com/ceph/ceph-win32-tests/
.. _Windows build script: https://github.com/ceph/ceph/blob/main/win32_build.sh
.. _dependencies: https://github.com/ceph/ceph/blob/main/win32_deps_build.sh
.. _subunit: https://github.com/testing-cabal/subunit

View File

@ -243,6 +243,10 @@ differences:
* All commits are cherry-picked with ``git cherry-pick -x`` to
reference the original commit
.. note: If a backport is appropriate, the submitter is responsible for
determining appropriate target stable branches to which backports must be
made.
See `the backporter manual
<http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO>`_ for more
information.

View File

@ -25,7 +25,7 @@ Concepts
a temporary placement group acting set that is used while backfilling the
primary OSD. Assume that the acting set is ``[0,1,2]`` and we are
``active+clean``. Now assume that something happens and the acting set
becomes ``[2,1,2]``. Under these circumstances, OSD ``3`` is empty and can't
becomes ``[3,1,2]``. Under these circumstances, OSD ``3`` is empty and can't
serve reads even though it is the primary. ``osd.3`` will respond by
requesting a *PG temp* of ``[1,2,3]`` to the monitors using a ``MOSDPGTemp``
message, and ``osd.1`` will become the primary temporarily. ``osd.1`` will

View File

@ -152,8 +152,8 @@ First release candidate
=======================
- [x] src/ceph_release: change type to `rc`
- [ ] opt-in to all telemetry channels, generate telemetry reports, and verify no sensitive details (like pools names) are collected
- [ ] check if new pool flags exist in pg_pool_t (osd/osd_types.h), and add them to telemetry's basic_pool_flags collection, in case they are not sensitive
- [x] opt-in to all telemetry channels, generate telemetry reports, and verify no sensitive details (like pools names) are collected
- [x] check if new pool flags exist in pg_pool_t (osd/osd_types.h), and add them to telemetry's basic_pool_flags collection, in case they are not sensitive
First stable release
@ -162,3 +162,5 @@ First stable release
- [x] src/ceph_release: change type `stable`
- [ ] generate new object corpus for encoding/decoding tests - see :doc:`corpus`
- [ ] src/cephadm/cephadmlib/constants.py: update `LATEST_STABLE_RELEASE`
- [x] activate latest release in readthedocs, as described in `the readthedocs
documentation <https://docs.readthedocs.io/en/stable/versions.html>`_

View File

@ -133,7 +133,9 @@ See `the Ceph Tracker wiki page that explains how to write the release notes <ht
#. Obtain the sha1 of the version commit from the `build job <https://jenkins.ceph.com/view/all/job/ceph>`_ or the ``sha1`` file created by the `ceph-setup <https://jenkins.ceph.com/job/ceph-setup/>`_ job.
#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted.
#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted. Note: this step will also run a command to transfer the
source tarballs from chacra.ceph.com to download.ceph.com directly, by
ssh'ing to download.ceph.com and running /home/signer/bin/get-tarballs.sh.
.. prompt:: bash $
@ -207,19 +209,63 @@ See `the Ceph Tracker wiki page that explains how to write the release notes <ht
sync-push ceph octopus
This leaves the packages in a password-protected prerelease area
at https://download.ceph.com/prerelease/ceph. Verify them from there.
When done and ready for release, mv the directories to the release
directory (that is, "mv <whatever you're promoting> ../..".
This leaves the packages, and the tarball, in a password-protected
prerelease area at https://download.ceph.com/prerelease/ceph. Verify them
from there. When done and ready for release, log into download.ceph.com and
mv the directories and the tarballs from the prerelease home
(/data/download.ceph.com/www/prerelease/ceph) to the release directory
(/data/download.ceph.com/www).
5. Build Containers
===================
Start the following two jobs:
Architecture-specific containers are built during the ceph build and
pushed to quay.ceph.io/ceph/prerelease-{amd64,arm64}, containing the
packages built in that ceph build. The prerelease 'fat' container,
or manifest-list container, that refers to both arch-specific containers,
is built by hand using the command "make-manifest-list.py" in
ceph.git:src/container/make-manifest-list.py. Note that you must
be logged into the appropriate container repos for any of these
manipulations: quay.ceph.io for fetching prerelease arch-specific
containers and pushing the prerelease manifest-list container, and
quay.io for promoting the prerelease containers to released containers.
.. prompt:: bash
cd <ceph-checkout>/src/container
./make-manifest-list.py
Reasonable defaults are set for all inputs, but environment variables
can be used to override:
* ARCH_SPECIFIC_HOST (default 'quay.ceph.io'): host of prerelease repos
* AMD64_REPO (default 'ceph/prerelease-amd64') prerelease amd64 repo
* ARM64_REPO (default 'ceph/prerelease-arm64') prerelease arm64 repo
(prerelease arch-specific containers will be copied from here)
* MANIFEST_HOST (default 'quay.ceph.io') prerelease manifest-list host
* MANIFEST_REPO (default 'ceph/prerelease') prerelease manifest-list repo
(prerelease manifest-list containers will be placed here)
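For example, a run that makes the defaults explicit by setting them through the
environment (the values are the defaults listed above):

.. prompt:: bash

cd <ceph-checkout>/src/container
ARCH_SPECIFIC_HOST=quay.ceph.io \
AMD64_REPO=ceph/prerelease-amd64 \
ARM64_REPO=ceph/prerelease-arm64 \
./make-manifest-list.py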
Finally, when all appropriate testing and verification is done on the
container images, you can use make-manifest-list.py to promote them to
their final release location on quay.io/ceph/ceph:
.. prompt:: bash
cd <ceph-checkout>/src/container
./make-manifest-list.py --promote
Two more environment variables can override the default destination for
promotion (the source of the prerelease container to be promoted is
as above, in MANIFEST_HOST/REPO):
* RELEASE_MANIFEST_HOST (default 'quay.io') release host
* RELEASE_MANIFEST_REPO (default 'ceph/ceph') release repo
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs/
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs-arm64/
6. Announce the Release
=======================

View File

@ -42,6 +42,11 @@
Ceph is a distributed network storage and file system with
distributed metadata management and POSIX semantics.
`ceph-ansible <https://docs.ceph.com/projects/ceph-ansible/en/latest/index.html>`_
A GitHub repository, supported from the Jewel release to the
Quincy release, that facilitates the installation of a Ceph
cluster.
Ceph Block Device
Also called "RADOS Block Device" and :term:`RBD`. A software
instrument that orchestrates the storage of block-based data in
@ -256,6 +261,21 @@
Another name for :term:`Dashboard`.
Dashboard Plugin
The dashboard plugin was a Mimic-era web application that
visualized information and statistics about the Ceph cluster
using a web server hosted by the :ref:`Ceph
Manager<ceph-manager-daemon>`.
See `the Mimic-era Dashboard Plugin documentation
<https://docs.ceph.com/en/mimic/mgr/dashboard/>`_.
DC
**D**\ata **C**\enter.
Flapping OSD
An OSD that is repeatedly marked ``up`` and then ``down`` in
rapid succession. See :ref:`rados_tshooting_flapping_osd`.
FQDN
**F**\ully **Q**\ualified **D**\omain **N**\ame. A domain name
that is applied to a node in a network and that specifies the
@ -315,6 +335,12 @@
Node
See :term:`Ceph Node`.
Object Storage
Object storage is one of three kinds of storage relevant to
Ceph. The other two kinds of storage relevant to Ceph are file
storage and block storage. Object storage is the category of
storage most fundamental to Ceph.
Object Storage Device
See :term:`OSD`.
@ -350,6 +376,9 @@
mid-2010s to insist that "OSD" should refer to "Object Storage
Device", so it is important to know which meaning is intended.
OSD, flapping
See :term:`Flapping OSD`.
OSD FSID
The OSD fsid is a unique identifier that is used to identify an
OSD. It is found in the OSD path in a file called ``osd_fsid``.
@ -384,7 +413,15 @@
placement group, and each placement group belongs to exactly
one Ceph pool.
PLP
**P**\ower **L**\oss **P**\rotection. A technology that
protects the data of solid-state drives by using capacitors to
extend the amount of time available for transferring data from
the DRAM cache to the SSD's permanent memory. Consumer-grade
SSDs are rarely equipped with PLP.
:ref:`Pool<rados_pools>`
A pool is a logical partition used to store objects.
Pools

View File

@ -21,14 +21,56 @@ Bodies
Ceph Executive Council
======================
Responsibilities
----------------
.. _exec-council-responsibilities:
* Arbiter in cases where decisions cannot be reached by consensus
* Distribute key responsibilities amongst themselves or others
* Point of contact for the project
* Representatives for Ceph foundation board meetings
* Ensure things get done
Ceph Executive Council Responsibilities
---------------------------------------
- Spokesperson
- welcome/keynote for cephalocon
- maintaining slides and presenting about the project
- Community focal point (user interaction, conference talks, mailing list,
etc)
- Community
- managing community manager
- LF Program Manager person, Social Media person
- liaise with the ambassadors
- make sure ceph events happen, successfully: cephalocon, ceph days, cds, user/dev, cdm
- coordinating with LF
- creating program committee
- recordings on youtube
- getting sponsors for events
- communications, schedule, venue decisions
- coordinate blog posts
- Ceph Foundation
- ensure foundation is healthy: financials, operations
- represent the CLT on the Board
- present project status regularly (yearly)
- collect member ideas / feedback
- ensure members feel valued
- guide the members how to support the project (events, testing, marketing, hardware, ...)
Membership
----------
@ -47,7 +89,7 @@ Membership
Current Members
^^^^^^^^^^^^^^^
* Dan van der Ster <daniel.vanderster@cern.ch>
* Dan van der Ster <dan.vanderster@clyso.com>
* Josh Durgin <jdurgin@redhat.com>
* Neha Ojha <nojha@redhat.com>
@ -82,28 +124,28 @@ Current Members
* Casey Bodley <cbodley@redhat.com>
* Dan van der Ster <dan.vanderster@clyso.com>
* David Orman <ormandj@1111systems.com>
* Ernesto Puerta <epuerta@redhat.com>
* Ernesto Puerta <epuertat@redhat.com>
* Gregory Farnum <gfarnum@redhat.com>
* Haomai Wang <haomai@xsky.com>
* Ilya Dryomov <idryomov@redhat.com>
* Igor Fedotov <igor.fedotov@croit.io>
* Jeff Layton <jlayton@redhat.com>
* Josh Durgin <jdurgin@redhat.com>
* João Eduardo Luis <joao@suse.de>
* João Eduardo Luis <joao@clyso.com>
* Ken Dreyer <kdreyer@redhat.com>
* Mark Nelson <mnelson@redhat.com>
* Mark Nelson <mark.nelson@clyso.com>
* Matt Benjamin <mbenjami@redhat.com>
* Mike Perez <miperez@redhat.com>
* Myoungwon Oh <myoungwon.oh@samsung.com>
* Neha Ojha <nojha@redhat.com>
* Patrick Donnelly <pdonnell@redhat.com>
* Patrick Donnelly <pdonnell@ibm.com>
* Sam Just <sjust@redhat.com>
* Vikhyat Umrao <vikhyat@redhat.com>
* Xie Xingguo <xie.xingguo@zte.com.cn>
* Yehuda Sadeh <yehuda@redhat.com>
* Yingxin Cheng <yingxin.cheng@intel.com>
* Yuri Weinstein <yweinste@redhat.com>
* Zac Dover <zac.dover@gmail.com>
* Zac Dover <zac.dover@proton.me>
.. _ctl:

Binary file not shown. (image; After: 31 KiB)
Binary file not shown. (image; After: 56 KiB)
Binary file not shown. (image; After: 29 KiB)
View File

@ -94,7 +94,7 @@ about Ceph, see our `Architecture`_ section.
.. _Ceph Object Store: radosgw
.. _Ceph Block Device: rbd
.. _Ceph File System: cephfs
.. _Getting Started: install
.. _Getting Started: start
.. _Architecture: architecture
.. toctree::

View File

@ -475,7 +475,7 @@ thread on the ceph-users mailing list
.. prompt:: bash #
ceph auth get-or-create client.short-hostname-of-rgw mon 'allow rw' osd 'allow rwx'
ceph auth get-or-create client.$(hostname -s) mon 'allow rw' osd 'allow rwx'
#. On one of the RGW nodes, do the following:

View File

@ -85,3 +85,4 @@ Further reading
.. _Windows troubleshooting: ../windows-troubleshooting
.. _General CephFS Prerequisites: ../../cephfs/mount-prerequisites
.. _Client Authentication: ../../cephfs/client-auth
.. _Windows testing: ../dev/tests-windows

View File

@ -14,8 +14,8 @@ BASIC ARCHITECTURE AND TERMINOLOGY
Protocol. The agent is meant to be placed on the same host as the
instrumented application. (The Jaeger agent acts like a sidecar listener.)
* JAEGER COLLECTOR: A daemon that receives spans sent by the Jaeger agent. The
Jaeger collector then stitches the spans together to form a trace. (A databse
can be enabled to persist a database for these traces).
Jaeger collector then stitches the spans together to form a trace. (A database
can be enabled to persist these traces).
* JAEGER QUERY AND CONSOLE FRONTEND: The UI-based frontend that presents
reports of the jaeger traces. Accessible at http://<jaeger frontend host>:16686.

View File

@ -29,6 +29,7 @@ Synopsis
| **ceph-bluestore-tool** free-dump|free-score --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
| **ceph-bluestore-tool** reshard --path *osd path* --sharding *new sharding* [ --sharding-ctrl *control string* ]
| **ceph-bluestore-tool** show-sharding --path *osd path*
| **ceph-bluestore-tool** zap-device --dev *dev path*
Description
@ -93,19 +94,22 @@ Commands
:command:`bluefs-bdev-migrate` --dev-target *new-device* --devs-source *device1* [--devs-source *device2*]
Moves BlueFS data from source device(s) to the target one, source devices
(except the main one) are removed on success. Target device can be both
already attached or new device. In the latter case it's added to OSD
replacing one of the source devices. Following replacement rules apply
(in the order of precedence, stop on the first match):
Moves BlueFS data from source device(s) to the target device. Source devices
(except the main one) are removed on success. Expands the target storage
(updates the size label), making ``bluefs-bdev-expand`` unnecessary. The
target device can be either a new device or a device that is already
attached. If the device is a new device, it is added to the OSD replacing
one of the source devices. The following replacement rules apply (in the
order of precedence, stop on the first match):
- if source list has DB volume - target device replaces it.
- if source list has WAL volume - target device replace it.
- if source list has slow volume only - operation isn't permitted, requires explicit allocation via new-db/new-wal command.
- if the source list has DB volume - the target device replaces it.
- if the source list has WAL volume - the target device replaces it.
- if the source list has slow volume only - the operation isn't permitted and requires explicit allocation via a new-DB/new-WAL command.
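For example, a hypothetical invocation that moves an existing DB volume to a new
device might look as follows (the OSD path and target device are placeholders,
and the OSD should be stopped before running the command):

.. prompt:: bash #

   ceph-bluestore-tool bluefs-bdev-migrate --path /var/lib/ceph/osd/ceph-0 \
       --devs-source /var/lib/ceph/osd/ceph-0/block.db \
       --dev-target /dev/vg_ceph/new-db

Per the rules above, because the source list contains the DB volume, the target
device replaces it and becomes the OSD's new ``block.db``.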
:command:`show-label` --dev *device* [...]
Show device label(s).
The label may be printed while an OSD is running.
:command:`free-dump` --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
@ -131,6 +135,10 @@ Commands
Show sharding that is currently applied to BlueStore's RocksDB.
:command:`zap-device` --dev *dev path*
Zeros all device label locations. This effectively makes the device appear empty.
Options
=======
@ -192,8 +200,8 @@ Useful to provide necessary configuration options when access to monitor/ceph.co
Device labels
=============
Every BlueStore block device has a single block label at the beginning of the
device. You can dump the contents of the label with::
Every BlueStore block device has a block label at the beginning of the device.
You can dump the contents of the label with::
ceph-bluestore-tool show-label --dev *device*
@ -201,6 +209,10 @@ The main device will have a lot of metadata, including information
that used to be stored in small files in the OSD data directory. The
auxiliary devices (db and wal) will only have the minimum required
fields (OSD UUID, size, device type, birth time).
The main device contains additional label copies at offsets: 1G, 10G, 100G and 1000G.
Corrupted labels are fixed as part of repair::
ceph-bluestore-tool repair --dev *device*
OSD directory priming
=====================

View File

@ -56,7 +56,7 @@ A sub-directory of the file system can be mounted by specifying the (absolute)
path to the sub-directory right after "=" in the device part of the mount command.
Mount helper application conventions dictate that the first two options are
device to be mounted and the mountpoint for that device. Options must be
device to be mounted and the mount point for that device. Options must be
passed only after these fixed arguments.

View File

@ -11,7 +11,7 @@ Synopsis
| **mount.fuse.ceph** [-h] [-o OPTIONS [*OPTIONS* ...]]
device [*device* ...]
mountpoint [*mountpoint* ...]
mountpoint [*mount point* ...]
Description
===========

View File

@ -476,26 +476,19 @@ as follows:
Cancel resharding a bucket
:command:`topic list`
List bucket notifications/pubsub topics
List bucket notification topics
:command:`topic get`
Get a bucket notifications/pubsub topic
Get a bucket notification topic
:command:`topic rm`
Remove a bucket notifications/pubsub topic
Remove a bucket notification topic
:command:`subscription get`
Get a pubsub subscription definition
:command:`subscription rm`
Remove a pubsub subscription
:command:`subscription pull`
Show events in a pubsub subscription
:command:`subscription ack`
Acknowledge (remove) events in a pubsub subscription
:command:`topic stats`
Get the stats of a persistent bucket notification topic (i.e. reservations, entries & size)
:command:`topic dump`
Dump (in JSON format) all pending bucket notifications of a persistent topic
Options
=======

View File

@ -520,28 +520,28 @@ Commands
Show RBD mirroring status for an image.
:command:`mirror pool demote` [*pool-name*]
Demote all primary images within a pool to non-primary.
Every mirror-enabled image in the pool will be demoted.
Demote all primary images within a pool or namespace to non-primary.
Every mirror-enabled image in the pool or namespace will be demoted.
:command:`mirror pool disable` [*pool-name*]
Disable RBD mirroring by default within a pool. When mirroring
is disabled on a pool in this way, mirroring will also be
disabled on any images (within the pool) for which mirroring
was enabled explicitly.
Disable RBD mirroring within a pool or namespace. When mirroring
is disabled on a pool or namespace in this way, mirroring will also be
disabled on all images (within the pool or namespace) for which mirroring
was enabled, whether by default or explicitly.
:command:`mirror pool enable` [*pool-name*] *mode*
Enable RBD mirroring by default within a pool.
Enable RBD mirroring within a pool or namespace.
The mirroring mode can either be ``pool`` or ``image``.
If configured in ``pool`` mode, all images in the pool
If configured in ``pool`` mode, all images in the pool or namespace
with the journaling feature enabled are mirrored.
If configured in ``image`` mode, mirroring needs to be
explicitly enabled (by ``mirror image enable`` command)
on each image.
:command:`mirror pool info` [*pool-name*]
Show information about the pool mirroring configuration.
It includes mirroring mode, peer UUID, remote cluster name,
and remote client name.
Show information about the pool or namespace mirroring configuration.
For a pool, it includes mirroring mode, peer UUID, remote cluster name,
and remote client name. For a namespace, it includes only mirroring mode.
:command:`mirror pool peer add` [*pool-name*] *remote-cluster-spec*
Add a mirroring peer to a pool.
@ -561,13 +561,13 @@ Commands
is corresponding to remote client name or remote cluster name.
:command:`mirror pool promote` [--force] [*pool-name*]
Promote all non-primary images within a pool to primary.
Every mirror-enabled image in the pool will be promoted.
Promote all non-primary images within a pool or namespace to primary.
Every mirror-enabled image in the pool or namespace will be promoted.
:command:`mirror pool status` [--verbose] [*pool-name*]
Show status for all mirrored images in the pool.
Show status for all mirrored images in the pool or namespace.
With ``--verbose``, show additional output status
details for every mirror-enabled image in the pool.
details for every mirror-enabled image in the pool or namespace.
:command:`mirror snapshot schedule add` [-p | --pool *pool*] [--namespace *namespace*] [--image *image*] *interval* [*start-time*]
Add mirror snapshot schedule.

View File

@ -1441,9 +1441,9 @@ commands:
/var/log/ceph/$cluster-$name.log
#. Ensure the SSL/TSL support is configured properly:
#. Ensure the SSL/TLS support is configured properly:
* Check if the SSL/TSL support is enabled:
* Check if the SSL/TLS support is enabled:
.. prompt:: bash $

View File

@ -283,7 +283,7 @@ Create CephFS Export
.. code:: bash
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...] [--cmount_path <value>]
This creates export RADOS objects containing the export block, where
@ -318,9 +318,16 @@ values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
server will negotiate a supported security type with the client, preferring
the supplied methods left-to-right.
``<cmount_path>`` specifies the path within the CephFS file system to mount this export on. It
may be any prefix of the ``EXPORT {path}``, from ``/`` up to the full export path (that is, if the ``EXPORT { Path }`` parameter is ``/foo/bar``, then ``cmount_path`` can be ``/``, ``/foo``, or ``/foo/bar``).
.. note:: If this and the other ``EXPORT { FSAL {} }`` options are the same between multiple exports, those exports will share a single CephFS client.
If not specified, the default is ``/``.
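As a hypothetical illustration of the syntax shown above (the cluster ID,
pseudo path, file system name, and paths are placeholders):

.. prompt:: bash $

   ceph nfs export create cephfs --cluster-id mynfs --pseudo-path /cephfs/bar --fsname a --path=/foo/bar --cmount_path /foo

In this sketch the export path ``/foo/bar`` is served through a CephFS client
mounted at ``/foo``; any other export created with the same FSAL options and
``cmount_path`` would share that client, per the note above.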
.. note:: Specifying values for sectype that require Kerberos will only function on servers
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
is outside the scope of this document.
is described in `Kerberos setup for NFS Ganesha in Ceph <https://github.com/nfs-ganesha/nfs-ganesha/wiki/Kerberos-setup-for-NFS-Ganesha-in-Ceph>`_.
.. note:: Export creation is supported only for NFS Ganesha clusters deployed using the ``nfs`` interface.
@ -477,9 +484,9 @@ For example,::
],
"fsal": {
"name": "CEPH",
"user_id": "nfs.mynfs.1",
"fs_name": "a",
"sec_label_xattr": ""
"sec_label_xattr": "",
"cmount_path": "/"
},
"clients": []
}
@ -494,6 +501,9 @@ as when creating a new export), with the exception of the
authentication credentials, which will be carried over from the
previous state of the export where possible.
!! NOTE: The ``user_id`` in the ``fsal`` block should not be modified or mentioned in the JSON file as it is auto-generated for CephFS exports.
It's auto-generated in the format ``nfs.<cluster_id>.<fs_name>.<hash_id>``.
::
$ ceph nfs export apply mynfs -i update_cephfs_export.json
@ -514,9 +524,9 @@ previous state of the export where possible.
],
"fsal": {
"name": "CEPH",
"user_id": "nfs.mynfs.1",
"fs_name": "a",
"sec_label_xattr": ""
"sec_label_xattr": "",
"cmount_path": "/"
},
"clients": []
}

View File

@ -77,6 +77,19 @@ If the port is not configured, *restful* will bind to port ``8003``.
If the address is not configured, *restful* will bind to ``::``,
which corresponds to all available IPv4 and IPv6 addresses.
Configuring max_requests
------------------------
The maximum request size can be configured via a central configuration
option::
ceph config set mgr mgr/restful/$name/max_requests $NUM
where ``$name`` is the ID of the ceph-mgr daemon (usually the hostname).
.. mgr_module:: restful
.. confval:: max_requests
.. _creating-an-api-user:
Creating an API User

View File

@ -355,7 +355,7 @@ invoking methods of the `Ioctx` and other classes.
.. --------------
.. The Ceph Storage Cluster allows you to make a snapshot of a pool's state.
.. Whereas, basic pool operations only require a connection to the cluster,
.. Although basic pool operations require only a connection to the cluster,
.. snapshots require an I/O context.
.. Ioctx.create_snap(self, snap_name)

View File

@ -179,7 +179,7 @@ Naming Clusters (deprecated)
Each Ceph cluster has an internal name. This internal name is used as part of
configuration, and as part of "log file" names as well as part of directory
names and as part of mountpoint names. This name defaults to "ceph". Previous
names and as part of mount point names. This name defaults to "ceph". Previous
releases of Ceph allowed one to specify a custom name instead, for example
"ceph2". This option was intended to facilitate the running of multiple logical
clusters on the same physical hardware, but in practice it was rarely

View File

@ -164,6 +164,60 @@ parameters. This profile should be used with caution and is meant for advanced
users, who understand mclock and Ceph related configuration options.
.. index:: mclock; shard config for HDD clusters
.. _mclock-hdd-cfg:
OSD Shard Configuration For HDD Based Clusters With mClock
==========================================================
Each OSD is configured with one or more shards to perform tasks. Each shard
comprises a unique queue to handle various types of OSD specific operations
like client I/O, recovery, scrub and so on. The scheduling of these operations
in the queue is performed by a scheduler - in this case the mClock scheduler.
For HDD based OSDs, the number of shards is controlled by the
:confval:`osd_op_num_shards_hdd` option. Items are queued and dequeued by one
or more worker threads, the number of which is controlled by the
:confval:`osd_op_num_threads_per_shard_hdd` option.
As described in :ref:`dmclock-qos-caveats`, the number of OSD shards employed
determines the impact of the mClock queue. In general, a lower number of shards
increases the impact of the mClock queues with respect to scheduling accuracy,
provided that there are enough worker threads per shard to process the items
in the mClock queue.
Based on tests performed at scale with small objects in the range
[1 KiB - 256 KiB] on a HDD based cluster (192 OSDs, 8 nodes,
150 Million objects), it was found that scheduling with mClock was not optimal
with multiple OSD shards. For example, in this cluster with multiple OSD node
failures, the client throughput was found to be inconsistent across test runs
coupled with multiple reported slow requests. For more details
see https://tracker.ceph.com/issues/66289. With multiple shards, the situation
was exacerbated when MAX limit was allocated to both client and background
recovery class of operations. During the OSD failure phase, since both client
and recovery ops were in direct competition to utilize the full bandwidth of
OSDs, there was no predictability with respect to the throughput of either
class of services.
However, the same test with a single OSD shard and with multiple worker threads
yielded significantly better results in terms of consistency of client and
recovery throughput across multiple test runs. Please refer to the tracker
above for more details. For sanity, the same test executed using this shard
configuration with large objects in the range [1 MiB - 256 MiB] yielded similar
results.
Therefore, as an interim measure until the issue with multiple OSD shards
(or multiple mClock queues per OSD) is investigated and fixed, the following
change to the default HDD OSD shard configuration is made:
+---------------------------------------------+------------------+----------------+
| Config Option | Old Default | New Default |
+=============================================+==================+================+
| :confval:`osd_op_num_shards_hdd` | 5 | 1 |
+---------------------------------------------+------------------+----------------+
| :confval:`osd_op_num_threads_per_shard_hdd` | 1 | 5 |
+---------------------------------------------+------------------+----------------+
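To inspect the values currently in effect, or to deliberately return to the
previous behaviour, commands of the following form can be used (a sketch only;
changes to shard counts are generally picked up only after an OSD restart):

.. prompt:: bash $

   ceph config get osd osd_op_num_shards_hdd
   ceph config get osd osd_op_num_threads_per_shard_hdd
   # revert to the pre-change defaults, if desired
   ceph config set osd osd_op_num_shards_hdd 5
   ceph config set osd osd_op_num_threads_per_shard_hdd 1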
.. index:: mclock; built-in profiles
mClock Built-in Profiles - Locked Config Options
@ -694,6 +748,8 @@ mClock Config Options
.. confval:: osd_mclock_skip_benchmark
.. confval:: osd_mclock_override_recovery_settings
.. confval:: osd_mclock_iops_capacity_threshold_hdd
.. confval:: osd_mclock_iops_capacity_low_threshold_hdd
.. confval:: osd_mclock_iops_capacity_threshold_ssd
.. confval:: osd_mclock_iops_capacity_low_threshold_ssd
.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf

View File

@ -113,7 +113,7 @@ Consistency
When you add monitor settings to your Ceph configuration file, you need to be
aware of some of the architectural aspects of Ceph Monitors. **Ceph imposes
strict consistency requirements** for a Ceph monitor when discovering another
Ceph Monitor within the cluster. Whereas, Ceph Clients and other Ceph daemons
Ceph Monitor within the cluster. Although Ceph Clients and other Ceph daemons
use the Ceph configuration file to discover monitors, monitors discover each
other using the monitor map (monmap), not the Ceph configuration file.

View File

@ -63,3 +63,6 @@ to the values of the SRV weight fields.
For the above example, this will result in approximately 40% of the clients and daemons connecting to mon1
and 60% of them connecting to mon2. However, if neither of them is reachable, then mon3 will be considered as a fallback.
See also `Messenger v2 <msgr2>`_.

View File

@ -189,6 +189,9 @@ Operations
.. confval:: osd_op_num_shards
.. confval:: osd_op_num_shards_hdd
.. confval:: osd_op_num_shards_ssd
.. confval:: osd_op_num_threads_per_shard
.. confval:: osd_op_num_threads_per_shard_hdd
.. confval:: osd_op_num_threads_per_shard_ssd
.. confval:: osd_op_queue
.. confval:: osd_op_queue_cut_off
.. confval:: osd_client_op_priority
@ -292,6 +295,9 @@ of the current time. The ultimate lesson is that values for weight
should not be too large. They should be under the number of requests
one expects to be serviced each second.
.. _dmclock-qos-caveats:
Caveats
```````
@ -303,6 +309,11 @@ number of shards can be controlled with the configuration options
:confval:`osd_op_num_shards`, :confval:`osd_op_num_shards_hdd`, and
:confval:`osd_op_num_shards_ssd`. A lower number of shards will increase the
impact of the mClock queues, but may have other deleterious effects.
This is especially the case if there are insufficient shard worker
threads. The number of shard worker threads can be controlled with the
configuration options :confval:`osd_op_num_threads_per_shard`,
:confval:`osd_op_num_threads_per_shard_hdd` and
:confval:`osd_op_num_threads_per_shard_ssd`.
Second, requests are transferred from the operation queue to the
operation sequencer, in which they go through the phases of
@ -362,6 +373,8 @@ considerably. To maintain operational performance, Ceph performs this migration
with 'backfilling', which allows Ceph to set backfill operations to a lower
priority than requests to read or write data.
.. note:: Some of these settings are automatically reset if the `mClock`_
scheduler is active, see `mClock backfill`_.
.. confval:: osd_max_backfills
.. confval:: osd_backfill_scan_min
@ -404,6 +417,9 @@ To maintain operational performance, Ceph performs recovery with limitations on
the number recovery requests, threads and object chunk sizes which allows Ceph
perform well in a degraded state.
.. note:: Some of these settings are automatically reset if the `mClock`_
scheduler is active, see `mClock backfill`_.
.. confval:: osd_recovery_delay_start
.. confval:: osd_recovery_max_active
.. confval:: osd_recovery_max_active_hdd
@ -441,6 +457,8 @@ Miscellaneous
.. _pool: ../../operations/pools
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
.. _mClock: ../mclock-config-ref.rst
.. _mClock backfill: ../mclock-config-ref.rst#recovery-backfill-options
.. _Pool & PG Config Reference: ../pool-pg-config-ref
.. _Journal Config Reference: ../journal-ref
.. _cache target dirty high ratio: ../../operations/pools#cache-target-dirty-high-ratio

View File

@ -17,8 +17,8 @@ It's a good idea to check the capacity of your cluster so that you know when it
approaches its capacity limits. If your cluster has reached its ``near full``
ratio, then you should add OSDs to expand your cluster's capacity.
.. warning:: Do not add an OSD after your cluster has reached its ``full
ratio``. OSD failures that occur after the cluster reaches its ``near full
.. warning:: Do not let your cluster reach its ``full ratio`` before adding an
OSD. OSD failures that occur after the cluster reaches its ``near full
ratio`` might cause the cluster to exceed its ``full ratio``.

View File

@ -247,6 +247,18 @@ To see the status in greater detail, run the following command:
ceph balancer status detail
To enable ``ceph balancer status detail``, run the following command:
.. prompt:: bash $
ceph config set mgr mgr/balancer/update_pg_upmap_activity True
To disable ``ceph balancer status detail``, run the following command:
.. prompt:: bash $
ceph config set mgr mgr/balancer/update_pg_upmap_activity False
To evaluate the distribution that would result from executing a specific plan,
run the following command:

View File

@ -549,6 +549,63 @@ disable and remove it.
ceph osd tier remove cold-storage hot-storage
Troubleshooting Unfound Objects
===============================
Under certain circumstances, restarting OSDs may result in unfound objects.
Here is an example of unfound objects appearing during an upgrade from Ceph
14.2.6 to Ceph 14.2.7::
2/543658058 objects unfound (0.000%)
pg 19.12 has 1 unfound objects
pg 19.2d has 1 unfound objects
Possible data damage: 2 pgs recovery_unfound
pg 19.12 is active+recovery_unfound+undersized+degraded+remapped, acting [299,310], 1 unfound
pg 19.2d is active+recovery_unfound+undersized+degraded+remapped, acting [290,309], 1 unfound
# ceph pg 19.12 list_unfound
{
"num_missing": 1,
"num_unfound": 1,
"objects": [
{
"oid": {
"oid": "hit_set_19.12_archive_2020-02-25 13:43:50.256316Z_2020-02-25 13:43:50.325825Z",
"key": "",
"snapid": -2,
"hash": 18,
"max": 0,
"pool": 19,
"namespace": ".ceph-internal"
},
"need": "3312398'55868341",
"have": "0'0",
"flags": "none",
"locations": []
}
],
"more": false
Some tests in the field indicate that the unfound objects can be deleted with
no adverse effects (see `Tracker Issue #44286, Note 3
<https://tracker.ceph.com/issues/44286#note-3>`_). Pawel Stefanski suggests
that deleting missing or unfound objects is safe as long as the objects are a
part of ``.ceph-internal::hit_set_PGID_archive``.
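If, after careful verification, the unfound objects are indeed hit-set archive
objects as described above, one approach that has been used in the field (not
an official procedure; proceed with caution) is to instruct the PG to delete
its unfound objects, for example for the PG shown above:

.. prompt:: bash #

   ceph pg 19.12 mark_unfound_lost delete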
Various members of the upstream Ceph community have reported in `Tracker Issue
#44286 <https://tracker.ceph.com/issues/44286>`_ that the following versions of
Ceph have been affected by this issue:
* 14.2.8
* 14.2.16
* 15.2.15
* 16.2.5
* 17.2.7
See `Tracker Issue #44286 <https://tracker.ceph.com/issues/44286>`_ for the
history of this issue.
.. _Create a Pool: ../pools#create-a-pool
.. _Pools - Set Pool Values: ../pools#set-pool-values

View File

@ -60,6 +60,24 @@ Where:
*blaum_roth*, *liber8tion* are *RAID6* equivalents in
the sense that they can only be configured with *m=2*.
.. note:: When using ``blaum_roth`` coding, the default
word size of ``w=7`` is suboptimal because ``blaum_roth``
works best when ``w+1`` is prime. When creating a new
erasure-code profile with ``technique=blaum_roth``,
set ``w`` to a number that is one integer less than a prime
number (for example, ``6``). See `Loic Dachary's
commit f51d21b to ceph/ceph <https://github.com/ceph/ceph/commit/f51d21b53d26d4f27c950cb1ba3f989e713ab325>`_ for information about
why this default cannot be changed easily in the
source code, and see `the second bullet point on
page 29 of Plank and Greenan's "Jerasure: A Library
in C Facilitating Erasure Coding for Storage
Applications" <https://github.com/ceph/jerasure/blob/master/Manual.pdf>`_ for an unequivocal statement of the restriction that applies
to ``w`` when using Blaum-Roth coding.
(Information about the proper value of ``w`` when
using ``blaum_roth`` coding was provided to the
Ceph upstream in September of 2024 by Benjamin
Mare.)
:Type: String
:Required: No.
:Default: reed_sol_van
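For example, a profile that honours this restriction might be created as
follows (the profile name and ``k`` value are illustrative only):

.. prompt:: bash $

   ceph osd erasure-code-profile set blaumroth_profile plugin=jerasure technique=blaum_roth k=4 m=2 w=6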

View File

@ -7,19 +7,18 @@
Overview
========
There is a finite set of health messages that a Ceph cluster can raise. These
messages are known as *health checks*. Each health check has a unique
identifier.
There is a set of health states that a Ceph cluster can raise. These
are known as *health checks*. Each health check has a unique identifier.
The identifier is a terse human-readable string -- that is, the identifier is
readable in much the same way as a typical variable name. It is intended to
enable tools (for example, UIs) to make sense of health checks and present them
enable tools (for example, monitoring and UIs) to make sense of health checks and present them
in a way that reflects their meaning.
This page lists the health checks that are raised by the monitor and manager
daemons. In addition to these, you might see health checks that originate
from MDS daemons (see :ref:`cephfs-health-messages`), and health checks
that are defined by ``ceph-mgr`` python modules.
daemons. In addition to these, you may see health checks that originate
from CephFS MDS daemons (see :ref:`cephfs-health-messages`), and health checks
that are defined by ``ceph-mgr`` modules.
Definitions
===========
@ -30,49 +29,57 @@ Monitor
DAEMON_OLD_VERSION
__________________
Warn if one or more old versions of Ceph are running on any daemons. A health
check is raised if multiple versions are detected. This condition must exist
for a period of time greater than ``mon_warn_older_version_delay`` (set to one
week by default) in order for the health check to be raised. This allows most
upgrades to proceed without the occurrence of a false warning. If the upgrade
is paused for an extended time period, ``health mute`` can be used by running
``ceph health mute DAEMON_OLD_VERSION --sticky``. Be sure, however, to run
``ceph health unmute DAEMON_OLD_VERSION`` after the upgrade has finished.
One or more Ceph daemons are running an old Ceph release. A health check is
raised if multiple versions are detected. This condition must exist for a
period of time greater than ``mon_warn_older_version_delay`` (set to one week
by default) in order for the health check to be raised. This allows most
upgrades to proceed without raising a warning that is both expected and
ephemeral. If the upgrade is paused for an extended time, ``health mute`` can
be used by running ``ceph health mute DAEMON_OLD_VERSION --sticky``. Be sure,
however, to run ``ceph health unmute DAEMON_OLD_VERSION`` after the upgrade has
finished so that any future, unexpected instances are not masked.
MON_DOWN
________
One or more monitor daemons are currently down. The cluster requires a majority
(more than one-half) of the monitors to be available. When one or more monitors
are down, clients might have a harder time forming their initial connection to
the cluster, as they might need to try more addresses before they reach an
operating monitor.
One or more Ceph Monitor daemons are down. The cluster requires a majority
(more than one-half) of the provisioned monitors to be available. When one or
more monitors are down, clients may have a harder time forming their initial
connection to the cluster, as they may need to try additional IP addresses
before they reach an operating monitor.
The down monitor daemon should be restarted as soon as possible to reduce the
risk of a subsequent monitor failure leading to a service outage.
Down monitor daemons should be restored or restarted as soon as possible to
reduce the risk that an additional monitor failure may cause a service outage.
MON_CLOCK_SKEW
______________
The clocks on the hosts running the ceph-mon monitor daemons are not
well-synchronized. This health check is raised if the cluster detects a clock
skew greater than ``mon_clock_drift_allowed``.
The clocks on hosts running Ceph Monitor daemons are not well-synchronized.
This health check is raised if the cluster detects a clock skew greater than
``mon_clock_drift_allowed``.
This issue is best resolved by synchronizing the clocks by using a tool like
``ntpd`` or ``chrony``.
the legacy ``ntpd`` or the newer ``chrony``. It is ideal to configure NTP
daemons to sync against multiple internal and external sources for resilience;
the protocol will adaptively determine the best available source. It is also
beneficial to have the NTP daemons on Ceph Monitor hosts sync against each
other, as it is even more important that Monitors be synchronized with each
other than it is for them to be *correct* with respect to reference time.
If it is impractical to keep the clocks closely synchronized, the
``mon_clock_drift_allowed`` threshold can also be increased. However, this
value must stay significantly below the ``mon_lease`` interval in order for the
monitor cluster to function properly.
``mon_clock_drift_allowed`` threshold can be increased. However, this value
must stay significantly below the ``mon_lease`` interval in order for the
monitor cluster to function properly. It is not difficult with a quality NTP
or PTP configuration to have sub-millisecond synchronization, so there are
very, very few occasions when it is appropriate to change this value.
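To see the skew that the monitors themselves are reporting, and to confirm
that the local time daemon is healthy, something like the following can be
used (the second command assumes ``chrony`` is the NTP daemon in use):

.. prompt:: bash $

   ceph time-sync-status
   chronyc tracking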
MON_MSGR2_NOT_ENABLED
_____________________
The :confval:`ms_bind_msgr2` option is enabled but one or more monitors are
not configured to bind to a v2 port in the cluster's monmap. This
means that features specific to the msgr2 protocol (for example, encryption)
are unavailable on some or all connections.
The :confval:`ms_bind_msgr2` option is enabled but one or more monitors are not
configured in the cluster's monmap to bind to a v2 port. This means that
features specific to the msgr2 protocol (for example, encryption) are
unavailable on some or all connections.
In most cases this can be corrected by running the following command:
@ -85,35 +92,39 @@ port (6789) will continue to listen for v1 connections on 6789 and begin to
listen for v2 connections on the new default port 3300.
If a monitor is configured to listen for v1 connections on a non-standard port
(that is, a port other than 6789), then the monmap will need to be modified
(that is, a port other than 6789), the monmap will need to be modified
manually.
MON_DISK_LOW
____________
One or more monitors are low on disk space. This health check is raised if the
percentage of available space on the file system used by the monitor database
(normally ``/var/lib/ceph/mon``) drops below the percentage value
One or more monitors are low on storage space. This health check is raised if
the percentage of available space on the file system used by the monitor
database (normally ``/var/lib/ceph/mon``) drops below the percentage value
``mon_data_avail_warn`` (default: 30%).
This alert might indicate that some other process or user on the system is
filling up the file system used by the monitor. It might also
indicate that the monitor database is too large (see ``MON_DISK_BIG``
below).
filling up the file system used by the monitor. It might also indicate that the
monitor database is too large (see ``MON_DISK_BIG`` below). Another common
scenario is that Ceph logging subsystem levels have been raised for
troubleshooting purposes without subsequent return to default levels. Ongoing
verbose logging can easily fill up the file system containing ``/var/log``. If
you trim logs that are currently open, remember to restart or instruct your
syslog or other daemon to re-open the log file.
If space cannot be freed, the monitor's data directory might need to be
moved to another storage device or file system (this relocation process must be carried out while the monitor
daemon is not running).
If space cannot be freed, the monitor's data directory might need to be moved
to another storage device or file system (this relocation process must be
carried out while the monitor daemon is not running).
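A quick way to see where the space has gone is to check the monitor's file
system and the log directory directly (the paths below are the usual defaults
and may differ on your deployment):

.. prompt:: bash #

   df -h /var/lib/ceph/mon
   du -sh /var/lib/ceph/mon/* /var/log/ceph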
MON_DISK_CRIT
_____________
One or more monitors are critically low on disk space. This health check is raised if the
percentage of available space on the file system used by the monitor database
(normally ``/var/lib/ceph/mon``) drops below the percentage value
``mon_data_avail_crit`` (default: 5%). See ``MON_DISK_LOW``, above.
One or more monitors are critically low on storage space. This health check is
raised if the percentage of available space on the file system used by the
monitor database (normally ``/var/lib/ceph/mon``) drops below the percentage
value ``mon_data_avail_crit`` (default: 5%). See ``MON_DISK_LOW``, above.
MON_DISK_BIG
____________
@ -124,14 +135,15 @@ raised if the size of the monitor database is larger than
A large database is unusual, but does not necessarily indicate a problem.
Monitor databases might grow in size when there are placement groups that have
not reached an ``active+clean`` state in a long time.
not reached an ``active+clean`` state in a long time, or when extensive cluster
recovery, expansion, or topology changes have recently occurred.
This alert might also indicate that the monitor's database is not properly
This alert may also indicate that the monitor's database is not properly
compacting, an issue that has been observed with some older versions of
RocksDB. Forcing a compaction with ``ceph daemon mon.<id> compact`` might
shrink the database's on-disk size.
RocksDB. Forcing compaction with ``ceph daemon mon.<id> compact`` may suffice
to shrink the database's storage usage.
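For example, for a monitor whose ID is ``mon.a`` (an assumed ID; run these on
that monitor's host):

.. prompt:: bash #

   du -sh /var/lib/ceph/mon/*/store.db
   ceph daemon mon.a compact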
This alert might also indicate that the monitor has a bug that prevents it from
This alert may also indicate that the monitor has a bug that prevents it from
pruning the cluster metadata that it stores. If the problem persists, please
report a bug.
@ -222,8 +234,8 @@ this alert can be temporarily silenced by running the following command:
ceph health mute AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED 1w # 1 week
Although we do NOT recommend doing so, you can also disable this alert indefinitely
by running the following command:
Although we do NOT recommend doing so, you can also disable this alert
indefinitely by running the following command:
.. prompt:: bash $
@ -236,17 +248,17 @@ Manager
MGR_DOWN
________
All manager daemons are currently down. The cluster should normally have at
least one running manager (``ceph-mgr``) daemon. If no manager daemon is
running, the cluster's ability to monitor itself will be compromised, and parts
of the management API will become unavailable (for example, the dashboard will
not work, and most CLI commands that report metrics or runtime state will
block). However, the cluster will still be able to perform all I/O operations
and to recover from failures.
All Ceph Manager daemons are currently down. The cluster should normally have
at least one running manager (``ceph-mgr``) daemon. If no manager daemon is
running, the cluster's ability to monitor itself will be compromised, parts of
the management API will become unavailable (for example, the dashboard will not
work, and most CLI commands that report metrics or runtime state will block).
However, the cluster will still be able to perform client I/O operations and
recover from failures.
The "down" manager daemon should be restarted as soon as possible to ensure
that the cluster can be monitored (for example, so that the ``ceph -s``
information is up to date, or so that metrics can be scraped by Prometheus).
The down manager daemon(s) should be restarted as soon as possible to ensure
that the cluster can be monitored (for example, so that ``ceph -s`` information
is available and up to date, and so that metrics can be scraped by Prometheus).
MGR_MODULE_DEPENDENCY
@ -285,14 +297,14 @@ OSDs
OSD_DOWN
________
One or more OSDs are marked "down". The ceph-osd daemon might have been
stopped, or peer OSDs might be unable to reach the OSD over the network.
Common causes include a stopped or crashed daemon, a "down" host, or a network
outage.
One or more OSDs are marked ``down``. The ceph-osd daemon(s) or their host(s)
may have crashed or been stopped, or peer OSDs might be unable to reach the OSD
over the public or private network. Common causes include a stopped or crashed
daemon, a "down" host, or a network failure.
Verify that the host is healthy, the daemon is started, and the network is
functioning. If the daemon has crashed, the daemon log file
(``/var/log/ceph/ceph-osd.*``) might contain debugging information.
(``/var/log/ceph/ceph-osd.*``) may contain troubleshooting information.
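A typical triage sequence is sketched below with a hypothetical OSD ID on a
systemd-managed host (unit names differ under cephadm or other containerized
deployments):

.. prompt:: bash #

   ceph osd tree down                    # list down OSDs and the hosts they live on
   systemctl status ceph-osd@11          # on the affected host
   journalctl -u ceph-osd@11 --since -1h # recent daemon log messages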
OSD_<crush type>_DOWN
_____________________
@ -319,7 +331,7 @@ _____________________
The utilization thresholds for `nearfull`, `backfillfull`, `full`, and/or
`failsafe_full` are not ascending. In particular, the following pattern is
expected: `nearfull < backfillfull`, `backfillfull < full`, and `full <
failsafe_full`.
failsafe_full`. This can result in unexpected cluster behavior.
To adjust these utilization thresholds, run the following commands:
@ -355,8 +367,14 @@ threshold by a small amount. To do so, run the following command:
ceph osd set-full-ratio <ratio>
Additional OSDs should be deployed in order to add new storage to the cluster,
or existing data should be deleted in order to free up space in the cluster.
Additional OSDs should be deployed within appropriate CRUSH failure domains
in order to increase capacity, and / or existing data should be deleted
in order to free up space in the cluster. One subtle situation is that the
``rados bench`` tool may have been used to test one or more pools' performance,
and the resulting RADOS objects were not subsequently cleaned up. You may
check for this by invoking ``rados ls`` against each pool and looking for
objects with names beginning with ``bench`` or other job names. These may
then be manually but very, very carefully deleted in order to reclaim capacity.
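A sketch of that check, using a hypothetical pool name; ``rados cleanup``
removes only objects left behind by ``rados bench``, and even then should be
used with care:

.. prompt:: bash $

   rados -p testpool ls | grep '^benchmark_data' | head
   rados -p testpool cleanup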
OSD_BACKFILLFULL
________________
@ -493,9 +511,9 @@ or newer to start. To safely set the flag, run the following command:
OSD_FILESTORE
__________________
Warn if OSDs are running Filestore. The Filestore OSD back end has been
deprecated; the BlueStore back end has been the default object store since the
Ceph Luminous release.
Warn if OSDs are running the old Filestore back end. The Filestore OSD back end
is deprecated; the BlueStore back end has been the default object store since
the Ceph Luminous release.
The 'mclock_scheduler' is not supported for Filestore OSDs. For this reason,
the default 'osd_op_queue' is set to 'wpq' for Filestore OSDs and is enforced
@ -518,16 +536,16 @@ temporarily silence this alert by running the following command:
ceph health mute OSD_FILESTORE
Since this migration can take a considerable amount of time to complete, we
recommend that you begin the process well in advance of any update to Reef or
to later releases.
Since migration of Filestore OSDs to BlueStore can take a considerable amount
of time to complete, we recommend that you begin the process well in advance
of any update to Reef or to later releases.
OSD_UNREACHABLE
_______________
Registered v1/v2 public address of one or more OSD(s) is/are out of the
defined `public_network` subnet, which will prevent these unreachable OSDs
from communicating with ceph clients properly.
The registered v1/v2 public address or addresses of one or more OSDs lie
outside the defined `public_network` subnet, which prevents these unreachable
OSDs from communicating with Ceph clients properly.
Even though these unreachable OSDs are in the ``up`` state, RADOS clients will
hang until the TCP timeout expires before erroring out due to this inconsistency.
@ -535,7 +553,7 @@ will hang till TCP timeout before erroring out due to this inconsistency.
POOL_FULL
_________
One or more pools have reached their quota and are no longer allowing writes.
One or more pools have reached quota and no longer allow writes.
To see pool quotas and utilization, run the following command:
@ -621,9 +639,10 @@ command:
BLUESTORE_FRAGMENTATION
_______________________
As BlueStore operates, the free space on the underlying storage will become
fragmented. This is normal and unavoidable, but excessive fragmentation causes
slowdown. To inspect BlueStore fragmentation, run the following command:
``BLUESTORE_FRAGMENTATION`` indicates that the free space that underlies
BlueStore has become fragmented. This is normal and unavoidable, but excessive
fragmentation causes slowdown. To inspect BlueStore fragmentation, run the
following command:
.. prompt:: bash $
@ -662,11 +681,9 @@ One or more OSDs have BlueStore volumes that were created prior to the
Nautilus release. (In Nautilus, BlueStore tracks its internal usage
statistics on a granular, per-pool basis.)
If *all* OSDs
are older than Nautilus, this means that the per-pool metrics are
simply unavailable. But if there is a mixture of pre-Nautilus and
post-Nautilus OSDs, the cluster usage statistics reported by ``ceph
df`` will be inaccurate.
If *all* OSDs are older than Nautilus, this means that the per-pool metrics are
simply unavailable. But if there is a mixture of pre-Nautilus and post-Nautilus
OSDs, the cluster usage statistics reported by ``ceph df`` will be inaccurate.
The old OSDs can be updated to use the new usage-tracking scheme by stopping
each OSD, running a repair operation, and then restarting the OSD. For example,
@ -778,10 +795,10 @@ about the source of the problem.
BLUESTORE_SPURIOUS_READ_ERRORS
______________________________
One or more BlueStore OSDs detect spurious read errors on the main device.
One or more BlueStore OSDs have detected read errors on the main device.
BlueStore has recovered from these errors by retrying disk reads. This alert
might indicate issues with underlying hardware, issues with the I/O subsystem,
or something similar. In theory, such issues can cause permanent data
or something similar. Such issues can cause permanent data
corruption. Some observations on the root cause of spurious read errors can be
found here: https://tracker.ceph.com/issues/22464
@ -801,6 +818,105 @@ Or, to disable this alert on a specific OSD, run the following command:
ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
BLOCK_DEVICE_STALLED_READ_ALERT
_______________________________
There are BlueStore log messages that reveal storage drive issues
that can cause performance degradation and potentially data unavailability or
loss. These may indicate a storage drive that is failing and should be
evaluated and possibly removed and replaced.
``read stalled read 0x29f40370000~100000 (buffered) since 63410177.290546s, timeout is 5.000000s``
However, this is difficult to spot because there is no discernible warning (a
health warning or info in ``ceph health detail``, for example). More observations
can be found here: https://tracker.ceph.com/issues/62500
Also, because there can be false positive ``stalled read`` instances, a mechanism
has been added to increase accuracy. If in the last ``bdev_stalled_read_warn_lifetime``
seconds the number of ``stalled read`` events is found to be greater than or equal to
``bdev_stalled_read_warn_threshold`` for a given BlueStore block device, this
warning will be reported in ``ceph health detail``. The warning state will be
removed when the condition clears.
The defaults for :confval:`bdev_stalled_read_warn_lifetime`
and :confval:`bdev_stalled_read_warn_threshold` may be overridden globally or for
specific OSDs.
To change this, run the following command:
.. prompt:: bash $
ceph config set global bdev_stalled_read_warn_lifetime 10
ceph config set global bdev_stalled_read_warn_threshold 5
This may be done for specific OSDs or a given mask. For example,
to apply only to SSD OSDs:
.. prompt:: bash $
ceph config set osd.123 bdev_stalled_read_warn_lifetime 10
ceph config set osd.123 bdev_stalled_read_warn_threshold 5
ceph config set class:ssd bdev_stalled_read_warn_lifetime 10
ceph config set class:ssd bdev_stalled_read_warn_threshold 5
WAL_DEVICE_STALLED_READ_ALERT
_____________________________
The warning state ``WAL_DEVICE_STALLED_READ_ALERT`` is raised to indicate
``stalled read`` instances on a given BlueStore OSD's ``WAL_DEVICE``. This
warning can be configured via the :confval:`bdev_stalled_read_warn_lifetime`
and :confval:`bdev_stalled_read_warn_threshold` options with commands similar
to those described in the ``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
DB_DEVICE_STALLED_READ_ALERT
____________________________
The warning state ``DB_DEVICE_STALLED_READ_ALERT`` is raised to indicate
``stalled read`` instances on a given BlueStore OSD's ``DB_DEVICE``. This
warning can be configured via the :confval:`bdev_stalled_read_warn_lifetime`
and :confval:`bdev_stalled_read_warn_threshold` options with commands similar
to those described in the ``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
BLUESTORE_SLOW_OP_ALERT
_______________________
There are BlueStore log messages that reveal storage drive issues that can lead
to performance degradation and data unavailability or loss. These indicate
that the storage drive may be failing and should be investigated and
potentially replaced.
``log_latency_fn slow operation observed for _txc_committed_kv, latency = 12.028621219s, txc = 0x55a107c30f00``
``log_latency_fn slow operation observed for upper_bound, latency = 6.25955s``
``log_latency slow operation observed for submit_transaction..``
As there can be false positive ``slow ops`` instances, a mechanism has
been added for more reliability. If in the last ``bluestore_slow_ops_warn_lifetime``
seconds the number of ``slow ops`` indications is found to be greater than or equal to
:confval:`bluestore_slow_ops_warn_threshold` for a given BlueStore OSD, this
warning will be reported in ``ceph health detail``. The warning state is
cleared when the condition clears.
The defaults for :confval:`bluestore_slow_ops_warn_lifetime` and
:confval:`bluestore_slow_ops_warn_threshold` may be overridden globally or for
specific OSDs.
To change this, run the following command:
.. prompt:: bash $
ceph config set global bluestore_slow_ops_warn_lifetime 10
ceph config set global bluestore_slow_ops_warn_threshold 5
This may be done for specific OSDs or a given mask. For example:
.. prompt:: bash $
ceph config set osd.123 bluestore_slow_ops_warn_lifetime 10
ceph config set osd.123 bluestore_slow_ops_warn_threshold 5
ceph config set class:ssd bluestore_slow_ops_warn_lifetime 10
ceph config set class:ssd bluestore_slow_ops_warn_threshold 5
Device health
-------------
@ -815,7 +931,12 @@ appropriate response to this expected failure is (1) to mark the OSD ``out`` so
that data is migrated off of the OSD, and then (2) to remove the hardware from
the system. Note that this marking ``out`` is normally done automatically if
``mgr/devicehealth/self_heal`` is enabled (as determined by
``mgr/devicehealth/mark_out_threshold``).
``mgr/devicehealth/mark_out_threshold``). If an OSD device is compromised but
the OSD(s) on that device are still ``up``, recovery can be degraded. In such
cases it may be advantageous to forcibly stop the OSD daemon(s) in question so
that recovery can proceed from surviving healthy OSDs. This must be
done with extreme care and attention to failure domains so that data availability
is not compromised.
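A sketch of that sequence for a hypothetical ``osd.123`` on a systemd-managed
host (verify the ``ok-to-stop`` output and your failure domains before
proceeding):

.. prompt:: bash #

   ceph osd out osd.123
   ceph osd ok-to-stop osd.123
   systemctl stop ceph-osd@123   # on the OSD's host; unit name varies by deployment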
To check device health, run the following command:
@ -823,8 +944,8 @@ To check device health, run the following command:
ceph device info <device-id>
Device life expectancy is set either by a prediction model that the mgr runs or
by an external tool that is activated by running the following command:
Device life expectancy is set either by a prediction model that the Ceph Manager
runs or by an external tool that runs a command of the following form:
.. prompt:: bash $
@ -978,7 +1099,7 @@ ____________________
The count of read repairs has exceeded the config value threshold
``mon_osd_warn_num_repaired`` (default: ``10``). Because scrub handles errors
only for data at rest, and because any read error that occurs when another
replica is available will be repaired immediately so that the client can get
replica is available is repaired immediately so that the client can get
the object data, there might exist failing disks that are not registering any
scrub errors. This repair count is maintained as a way of identifying any such
failing disks.
@ -988,8 +1109,8 @@ LARGE_OMAP_OBJECTS
__________________
One or more pools contain large omap objects, as determined by
``osd_deep_scrub_large_omap_object_key_threshold`` (threshold for the number of
keys to determine what is considered a large omap object) or
``osd_deep_scrub_large_omap_object_key_threshold`` (the threshold for the
number of keys to determine what is considered a large omap object) or
``osd_deep_scrub_large_omap_object_value_sum_threshold`` (the threshold for the
summed size in bytes of all key values to determine what is considered a large
omap object) or both. To find more information on object name, key count, and
@ -1009,7 +1130,7 @@ CACHE_POOL_NEAR_FULL
____________________
A cache-tier pool is nearly full, as determined by the ``target_max_bytes`` and
``target_max_objects`` properties of the cache pool. Once the pool reaches the
``target_max_objects`` properties of the cache pool. When the pool reaches the
target threshold, write requests to the pool might block while data is flushed
and evicted from the cache. This state normally leads to very high latencies
and poor performance.
@ -1155,10 +1276,10 @@ For more information, see :ref:`choosing-number-of-placement-groups` and
POOL_TARGET_SIZE_BYTES_OVERCOMMITTED
____________________________________
One or more pools have a ``target_size_bytes`` property that is set in order to
estimate the expected size of the pool, but the value(s) of this property are
greater than the total available storage (either by themselves or in
combination with other pools).
One or more pools have a ``target_size_bytes`` property that is set in
order to estimate the expected size of the pool, but the value or values of
this property are greater than the total available storage (either by
themselves or in combination with other pools).
This alert is usually an indication that the ``target_size_bytes`` value for
the pool is too large and should be reduced or set to zero. To reduce the
@ -1230,7 +1351,7 @@ data have too many PGs. See *TOO_MANY_PGS* above.
To silence the health check, raise the threshold by adjusting the
``mon_pg_warn_max_object_skew`` config option on the managers.
The health check will be silenced for a specific pool only if
The health check is silenced for a specific pool only if
``pg_autoscale_mode`` is set to ``on``.
POOL_APP_NOT_ENABLED
@ -1297,8 +1418,8 @@ resolution, see :ref:`storage-capacity` and :ref:`no-free-drive-space`.
OBJECT_MISPLACED
________________
One or more objects in the cluster are not stored on the node that CRUSH would
prefer that they be stored on. This alert is an indication that data migration
One or more objects in the cluster are not stored on the node that CRUSH
prefers that they be stored on. This alert is an indication that data migration
due to a recent cluster change has not yet completed.
Misplaced data is not a dangerous condition in and of itself; data consistency
@ -1365,7 +1486,7 @@ percentage (determined by ``mon_warn_pg_not_scrubbed_ratio``) of the interval
has elapsed after the time the scrub was scheduled and no scrub has been
performed.
PGs will be scrubbed only if they are flagged as ``clean`` (which means that
PGs are scrubbed only if they are flagged as ``clean`` (which means that
they are to be cleaned, and not that they have been examined and found to be
clean). Misplaced or degraded PGs will not be flagged as ``clean`` (see
*PG_AVAILABILITY* and *PG_DEGRADED* above).
@ -1382,13 +1503,22 @@ ____________________
One or more Placement Groups (PGs) have not been deep scrubbed recently. PGs
are normally scrubbed every :confval:`osd_deep_scrub_interval` seconds at most.
This health check is raised if a certain percentage (determined by
``mon_warn_pg_not_deep_scrubbed_ratio``) of the interval has elapsed after the
time the scrub was scheduled and no scrub has been performed.
:confval:`mon_warn_pg_not_deep_scrubbed_ratio`) of the interval has elapsed
after the time the scrub was scheduled and no scrub has been performed.
PGs will receive a deep scrub only if they are flagged as *clean* (which means
that they are to be cleaned, and not that they have been examined and found to
be clean). Misplaced or degraded PGs might not be flagged as ``clean`` (see
*PG_AVAILABILITY* and *PG_DEGRADED* above).
PGs will receive a deep scrub only if they are flagged as ``clean`` (which
means that they are to be cleaned, and not that they have been examined and
found to be clean). Misplaced or degraded PGs might not be flagged as ``clean``
(see *PG_AVAILABILITY* and *PG_DEGRADED* above).
This document offers two methods of setting the value of
:confval:`osd_deep_scrub_interval`. The first method listed here changes the
value of :confval:`osd_deep_scrub_interval` globally. The second method listed
here changes the value of :confval:`osd_deep_scrub_interval` for OSDs and for
the Manager daemon.
First Method
~~~~~~~~~~~~
To manually initiate a deep scrub of a clean PG, run the following command:
@ -1396,6 +1526,72 @@ To manually initiate a deep scrub of a clean PG, run the following command:
ceph pg deep-scrub <pgid>
Under certain conditions, the warning ``PGs not deep-scrubbed in time``
appears. This might be because the cluster contains many large PGs, which take
longer to deep-scrub. To remedy this situation, you must change the value of
:confval:`osd_deep_scrub_interval` globally.
#. Confirm that ``ceph health detail`` returns a ``pgs not deep-scrubbed in
time`` warning::
# ceph health detail
HEALTH_WARN 1161 pgs not deep-scrubbed in time
[WRN] PG_NOT_DEEP_SCRUBBED: 1161 pgs not deep-scrubbed in time
pg 86.fff not deep-scrubbed since 2024-08-21T02:35:25.733187+0000
#. Change ``osd_deep_scrub_interval`` globally:
.. prompt:: bash #
ceph config set global osd_deep_scrub_interval 1209600
The above procedure was developed by Eugen Block in September of 2024.
See `Eugen Block's blog post <https://heiterbiswolkig.blogs.nde.ag/2024/09/06/pgs-not-deep-scrubbed-in-time/>`_ for much more detail.
See `Redmine tracker issue #44959 <https://tracker.ceph.com/issues/44959>`_.
Second Method
~~~~~~~~~~~~~
To manually initiate a deep scrub of a clean PG, run the following command:
.. prompt:: bash $
ceph pg deep-scrub <pgid>
Under certain conditions, the warning ``PGs not deep-scrubbed in time``
appears. This might be because the cluster contains many large PGs, which take
longer to deep-scrub. To remedy this situation, you must change the value of
:confval:`osd_deep_scrub_interval` for OSDs and for the Manager daemon.
#. Confirm that ``ceph health detail`` returns a ``pgs not deep-scrubbed in
time`` warning::
# ceph health detail
HEALTH_WARN 1161 pgs not deep-scrubbed in time
[WRN] PG_NOT_DEEP_SCRUBBED: 1161 pgs not deep-scrubbed in time
pg 86.fff not deep-scrubbed since 2024-08-21T02:35:25.733187+0000
#. Change the ``osd_deep_scrub_interval`` for OSDs:
.. prompt:: bash #
ceph config set osd osd_deep_scrub_interval 1209600
#. Change the ``osd_deep_scrub_interval`` for Managers:
.. prompt:: bash #
ceph config set mgr osd_deep_scrub_interval 1209600
The above procedure was developed by Eugen Block in September of 2024.
See `Eugen Block's blog post <https://heiterbiswolkig.blogs.nde.ag/2024/09/06/pgs-not-deep-scrubbed-in-time/>`_ for much more detail.
See `Redmine tracker issue #44959 <https://tracker.ceph.com/issues/44959>`_.
PG_SLOW_SNAP_TRIMMING
_____________________
@ -1422,9 +1618,10 @@ Stretch Mode
INCORRECT_NUM_BUCKETS_STRETCH_MODE
__________________________________
Stretch mode currently only support 2 dividing buckets with OSDs, this warning suggests
that the number of dividing buckets is not equal to 2 after stretch mode is enabled.
You can expect unpredictable failures and MON assertions until the condition is fixed.
Stretch mode currently supports only two dividing buckets that contain OSDs. This
warning indicates that the number of dividing buckets is not equal to 2 after
stretch mode has been enabled. You can expect unpredictable failures and MON
assertions until the condition is fixed.
We encourage you to fix this by removing the additional dividing buckets or by
bumping the number of dividing buckets to 2.
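As a hedged sketch (the bucket and host names are hypothetical, and the dividing bucket type is assumed to be ``datacenter``), you could inspect the CRUSH hierarchy, relocate hosts out of the surplus bucket, and then remove it once it is empty:
.. prompt:: bash $
ceph osd tree
ceph osd crush move host3 datacenter=dc2
ceph osd crush remove dc3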
@ -1441,6 +1638,27 @@ We encourage you to fix this by making the weights even on both dividing buckets
This can be done by making sure that the combined weight of the OSDs on each
dividing bucket is the same.
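A hedged sketch of how to inspect and even out the weights (the OSD ID and weight shown are purely illustrative):
.. prompt:: bash $
ceph osd tree
ceph osd crush reweight osd.7 1.8
Compare the combined weight under each dividing bucket in the ``ceph osd tree`` output and adjust CRUSH weights until the two subtrees match.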
NVMeoF Gateway
--------------
NVMEOF_SINGLE_GATEWAY
_____________________
One of the gateway groups has only one gateway. This is not ideal, because it
makes high availability (HA) impossible with a single gateway in a group and
can lead to problems with failover and failback operations for the NVMeoF
gateway.
It is recommended to have multiple NVMeoF gateways in a group.
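A hedged way to review how many gateway daemons each group currently has, assuming a cephadm-managed deployment:
.. prompt:: bash #
ceph orch ls nvmeof
ceph orch ps --daemon-type nvmeof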
NVMEOF_GATEWAY_DOWN
___________________
Some of the gateways are in the ``GW_UNAVAILABLE`` state. If an NVMeoF daemon
has crashed, the daemon log file (found at ``/var/log/ceph/``) may contain
troubleshooting information.
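A hedged starting point for investigation, assuming a cephadm-managed deployment (the daemon name shown is hypothetical):
.. prompt:: bash #
ceph orch ps --daemon-type nvmeof
cephadm logs --name nvmeof.mypool.mygroup.host1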
Miscellaneous
-------------

View File

@ -419,7 +419,10 @@ conditions change.
Ceph provides a number of settings to manage the load spike associated with the
reassignment of PGs to an OSD (especially a new OSD). The ``osd_max_backfills``
setting specifies the maximum number of concurrent backfills to and from an OSD
(default: 1). The ``backfill_full_ratio`` setting allows an OSD to refuse a
(default: 1; note you cannot change this if the `mClock`_ scheduler is active,
unless you set ``osd_mclock_override_recovery_settings = true``, see
`mClock backfill`_).
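For example (a hedged sketch; the value ``2`` is purely illustrative), with mClock active you would first enable the override and then raise the limit:
.. prompt:: bash $
ceph config set osd osd_mclock_override_recovery_settings true
ceph config set osd osd_max_backfills 2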
The ``backfill_full_ratio`` setting allows an OSD to refuse a
backfill request if the OSD is approaching its full ratio (default: 90%). This
setting can be changed with the ``ceph osd set-backfillfull-ratio`` command. If
an OSD refuses a backfill request, the ``osd_backfill_retry_interval`` setting
@ -545,6 +548,8 @@ performing the migration. For details, see the `Architecture`_ section.
.. _data placement: ../data-placement
.. _pool: ../pools
.. _placement group: ../placement-groups
.. _mClock: ../../configuration/mclock-config-ref.rst
.. _mClock backfill: ../../configuration/mclock-config-ref.rst#recovery-backfill-options
.. _Architecture: ../../../architecture
.. _OSD Not Running: ../../troubleshooting/troubleshooting-osd#osd-not-running
.. _Troubleshooting PG Errors: ../../troubleshooting/troubleshooting-pg#troubleshooting-pg-errors

View File

@ -737,6 +737,117 @@ Managing pools that are flagged with ``--bulk``
===============================================
See :ref:`managing_bulk_flagged_pools`.
Setting values for a stretch pool
=================================
To set values for a stretch pool, run a command of the following form:
.. prompt:: bash $
ceph osd pool stretch set {pool-name} {peering_crush_bucket_count} {peering_crush_bucket_target} {peering_crush_bucket_barrier} {crush_rule} {size} {min_size} [--yes-i-really-mean-it]
Here is a breakdown of the arguments:
.. describe:: {pool-name}
The name of the pool. It must be an existing pool; this command does not create a new pool.
:Type: String
:Required: Yes.
.. describe:: {peering_crush_bucket_count}
This value is used along with ``peering_crush_bucket_barrier`` to determine whether the set of
OSDs in the chosen acting set can peer with each other, based on the number of distinct
buckets in the acting set.
:Type: Integer
:Required: Yes.
.. describe:: {peering_crush_bucket_target}
This value is used along with ``peering_crush_bucket_barrier`` and ``size`` to calculate
the value ``bucket_max``, which limits the number of OSDs from the same bucket that can be chosen for the acting set of a PG.
:Type: Integer
:Required: Yes.
.. describe:: {peering_crush_bucket_barrier}
The type of bucket a pool is stretched across, e.g., rack, row, or datacenter.
:Type: String
:Required: Yes.
.. describe:: {crush_rule}
The CRUSH rule to use for the stretch pool. The type of the pool must match the type of the ``crush_rule``
(replicated or erasure).
:Type: String
:Required: Yes.
.. describe:: {size}
The number of replicas for objects in the stretch pool.
:Type: Integer
:Required: Yes.
.. describe:: {min_size}
The minimum number of replicas required for I/O in the stretch pool.
:Type: Integer
:Required: Yes.
.. describe:: {--yes-i-really-mean-it}
This flag is required to confirm that you really want to bypass
the safety checks and set the values for a stretch pool, e.g.,
when you are trying to set ``peering_crush_bucket_count`` or
``peering_crush_bucket_target`` to a value greater than the number of buckets in the CRUSH map.
:Type: Flag
:Required: No.
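For example, the following hedged invocation (the pool and rule names are hypothetical) stretches the replicated pool ``mypool`` across three ``datacenter`` buckets using the CRUSH rule ``stretch_rule``, with a ``size`` of 6 and a ``min_size`` of 3:
.. prompt:: bash $
ceph osd pool stretch set mypool 3 3 datacenter stretch_rule 6 3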
.. _setting_values_for_a_stretch_pool:
Unsetting values for a stretch pool
===================================
To move the pool back to non-stretch, run a command of the following form:
.. prompt:: bash $
ceph osd pool stretch unset {pool-name}
Here is a breakdown of the argument:
.. describe:: {pool-name}
The name of the pool. It must be an existing pool that is stretched,
i.e., it has already been set with the command `ceph osd pool stretch set`.
:Type: String
:Required: Yes.
Showing values of a stretch pool
================================
To show values for a stretch pool, run a command of the following form:
.. prompt:: bash $
ceph osd pool stretch show {pool-name}
Here is a breakdown of the argument:
.. describe:: {pool-name}
The name of the pool. It must be an existing pool that is stretched,
i.e., it has already been set with the command `ceph osd pool stretch set`.
:Type: String
:Required: Yes.
.. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref
.. _Bloom Filter: https://en.wikipedia.org/wiki/Bloom_filter
.. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups

View File

@ -81,6 +81,18 @@ Data Center B. In a situation of this kind, the loss of Data Center A means
that the data is lost and Ceph will not be able to operate on it. This
situation is surprisingly difficult to avoid using only standard CRUSH rules.
Individual Stretch Pools
========================
Setting an individual ``stretch pool`` is an option that allows specific pools
to be distributed across ``two or more data centers``.
This is achieved by running the ``ceph osd pool stretch set`` command on each desired pool,
as opposed to applying a cluster-wide configuration ``with stretch mode``.
See :ref:`setting_values_for_a_stretch_pool`.
Use ``stretch mode`` when you have exactly ``two data centers`` and require a uniform
configuration across the entire cluster. Conversely, opt for a ``stretch pool``
when you need a particular pool to be replicated across ``more than two data centers``,
providing a more granular level of control and a larger cluster size.
Stretch Mode
============
@ -260,8 +272,21 @@ SSDs (including NVMe OSDs). Hybrid HDD+SDD or HDD-only OSDs are not recommended
due to the long time it takes for them to recover after connectivity between
data centers has been restored. This reduces the potential for data loss.
In the future, stretch mode might support erasure-coded pools and might support
deployments that have more than two data centers.
.. warning:: Device class is currently not supported in stretch mode.
For example, the following rule containing ``device class`` will not work::
rule stretch_replicated_rule {
id 2
type replicated class hdd
step take default
step choose firstn 0 type datacenter
step chooseleaf firstn 2 type host
step emit
}
In the future, stretch mode could support erasure-coded pools,
enable deployments across multiple data centers,
and accommodate various device classes.
Other commands
==============

View File

@ -6,23 +6,24 @@ Ceph component debug log levels can be adjusted at runtime, while services are
running. In some circumstances you might want to adjust debug log levels in
``ceph.conf`` or in the central config store. Increased debug logging can be
useful if you are encountering issues when operating your cluster. By default,
Ceph log files are in ``/var/log/ceph``.
Ceph log files are in ``/var/log/ceph``; containerized deployments often log
elsewhere under ``/var/log``.
.. tip:: Remember that debug output can slow down your system, and that this
latency sometimes hides race conditions.
Debug logging is resource intensive. If you encounter a problem in a specific
component of your cluster, begin troubleshooting by enabling logging for only
that component of the cluster. For example, if your OSDs are running without
errors, but your metadata servers are not, enable logging for any specific
metadata server instances that are having problems. Continue by enabling
that component. For example, if your OSDs are running without
errors, but your CephFS metadata servers (MDS) are not, enable logging for specific
instances that are having problems. Continue by enabling
logging for each subsystem only as needed.
.. important:: Verbose logging sometimes generates over 1 GB of data per hour.
If the disk that your operating system runs on (your "OS disk") reaches its
capacity, the node associated with that disk will stop working.
Whenever you enable or increase the rate of debug logging, make sure that you
Whenever you enable or increase the level of debug logging, ensure that you
have ample capacity for log files, as this may dramatically increase their
size. For details on rotating log files, see `Accelerating Log Rotation`_.
When your system is running well again, remove unnecessary debugging settings
@ -34,7 +35,7 @@ For details on available settings, see `Subsystem, Log and Debug Settings`_.
Runtime
=======
To see the configuration settings at runtime, log in to a host that has a
To see configuration settings at runtime, log in to a host that has a
running daemon and run a command of the following form:
.. prompt:: bash $
@ -57,7 +58,7 @@ tell`` command of the following form:
Here ``{daemon-type}`` is ``osd``, ``mon``, or ``mds``. Apply the runtime
setting either to a specific daemon (by specifying its ID) or to all daemons of
a particular type (by using the ``*`` operator). For example, to increase
a particular type (by using the ``*`` wildcard as the ID). For example, to increase
debug logging for a specific ``ceph-osd`` daemon named ``osd.0``, run the
following command:
@ -81,7 +82,8 @@ Boot Time
=========
To activate Ceph's debugging output (that is, the ``dout()`` logging function)
at boot time, you must add settings to your Ceph configuration file.
at boot time, you must add settings to your Ceph configuration file (or
set corresponding values in the central config store).
Subsystems that are common to all daemons are set under ``[global]`` in the
configuration file. Subsystems for a specific daemon are set under the relevant
daemon section in the configuration file (for example, ``[mon]``, ``[osd]``,
@ -115,7 +117,7 @@ For details, see `Subsystem, Log and Debug Settings`_.
Accelerating Log Rotation
=========================
If your log filesystem is nearly full, you can accelerate log rotation by
If a host's log filesystem is nearly full, you can accelerate log rotation by
modifying the Ceph log rotation file at ``/etc/logrotate.d/ceph``. To increase
the frequency of log rotation (which will guard against a filesystem reaching
capacity), add a ``size`` directive after the ``weekly`` frequency directive.
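A minimal sketch of such an entry (the ``size`` threshold shown is illustrative, not a recommendation)::
    /var/log/ceph/*.log {
        weekly
        size 500M
        rotate 7
        compress
        missingok
    }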
@ -149,8 +151,8 @@ setting is shown immediately below.
30 * * * * /usr/sbin/logrotate /etc/logrotate.d/ceph >/dev/null 2>&1
In this example, the ``etc/logrotate.d/ceph`` file will be checked every 30
minutes.
In this example, the ``etc/logrotate.d/ceph`` file will be checked and possibly
rotated every 30 minutes.
Valgrind
========
@ -175,7 +177,7 @@ For each subsystem, there is a logging level for its output logs (a so-called
"log level") and a logging level for its in-memory logs (a so-called "memory
level"). Different values may be set for these two logging levels in each
subsystem. Ceph's logging levels operate on a scale of ``1`` to ``20``, where
``1`` is terse and ``20`` is verbose. In certain rare cases, there are logging
``1`` is terse and ``20`` is verbose. In a certain few cases, there are logging
levels that can take a value greater than 20. The resulting logs are extremely
verbose.
@ -184,7 +186,7 @@ following conditions are true:
- a fatal signal has been raised or
- an assertion within Ceph code has been triggered or
- the sending of in-memory logs to the output log has been manually triggered.
- sending in-memory logs to the output log has been manually triggered.
Consult `the portion of the "Ceph Administration Tool documentation
that provides an example of how to submit admin socket commands
<http://docs.ceph.com/en/latest/man/8/ceph/#daemon>`_ for more detail.
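For example, a hedged way to trigger that last case for a specific daemon is through its admin socket (the daemon name is illustrative):
.. prompt:: bash $
ceph daemon osd.0 log dump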
@ -206,8 +208,8 @@ following:
debug mds balancer = 1/20
The following table provides a list of Ceph subsystems and their default log and
memory levels. Once you complete your logging efforts, restore the subsystems
to their default level or to a level suitable for normal operations.
memory levels. Once you complete your logging efforts, restore each subsystem's
values to their defaults or to a level suitable for normal operations.
+--------------------------+-----------+--------------+
| Subsystem | Log Level | Memory Level |

View File

@ -618,6 +618,7 @@ Possible causes include:
- A bug in the kernel file system (check ``dmesg`` output)
- An overloaded cluster (check system load, iostat, etc.)
- A bug in the ``ceph-osd`` daemon.
- Suboptimal OSD shard configuration (on HDD based cluster with mClock scheduler)
Possible solutions:
@ -626,6 +627,8 @@ Possible solutions:
- Upgrade Ceph
- Restart OSDs
- Replace failed or failing components
- Override OSD shard configuration (on HDD based cluster with mClock scheduler)
- See :ref:`mclock-tblshoot-hdd-shard-config` for resolution
Debugging Slow Requests
-----------------------
@ -680,6 +683,44 @@ Although some of these events may appear redundant, they cross important
boundaries in the internal code (such as passing data across locks into new
threads).
.. _mclock-tblshoot-hdd-shard-config:
Slow Requests or Slow Recovery With mClock Scheduler
----------------------------------------------------
.. note:: This troubleshooting is applicable only for HDD based clusters running
mClock scheduler and with the following OSD shard configuration:
``osd_op_num_shards_hdd`` = 5 and ``osd_op_num_threads_per_shard_hdd`` = 1.
Also, see :ref:`mclock-hdd-cfg` for details around the reason for the change
made to the default OSD HDD shard configuration for mClock.
On scaled HDD based clusters with mClock scheduler enabled and under multiple
OSD node failure condition, the following could be reported or observed:
- slow requests: this also manifests as degraded client I/O performance.
- slow background recoveries: lower-than-expected recovery throughput.
**Troubleshooting Steps:**
#. Verify from OSD events that the slow requests are predominantly of type
``queued_for_pg``.
#. Verify if the reported recovery rate is significantly lower than the expected
rate considering the QoS allocations for background recovery service.
If either of the above conditions is true, then the following resolution may be
applied. Note that this is disruptive as it involves OSD restarts. Run the
following commands to change the default OSD shard configuration for HDDs:
.. prompt:: bash
ceph config set osd osd_op_num_shards_hdd 1
ceph config set osd osd_op_num_threads_per_shard_hdd 5
The above configuration will not take effect until the OSDs in the environment
are restarted. To keep this process minimally disruptive, restart the OSDs in a
carefully staggered manner.
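As a hedged illustration (the daemon ID and the use of cephadm are assumptions), you might verify the new values and then restart the OSDs one at a time:
.. prompt:: bash #
ceph config get osd osd_op_num_shards_hdd
ceph config get osd osd_op_num_threads_per_shard_hdd
ceph orch daemon restart osd.0
Wait for the cluster to settle (ideally back to ``HEALTH_OK``) before restarting the next OSD.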
.. _rados_tshooting_flapping_osd:
Flapping OSDs
=============

View File

@ -5,16 +5,16 @@
Placement Groups Never Get Clean
================================
If, after you have created your cluster, any Placement Groups (PGs) remain in
the ``active`` status, the ``active+remapped`` status or the
``active+degraded`` status and never achieves an ``active+clean`` status, you
likely have a problem with your configuration.
Placement Groups (PGs) that remain in the ``active`` status, the
``active+remapped`` status or the ``active+degraded`` status and never achieve
an ``active+clean`` status might indicate a problem with the configuration of
the Ceph cluster.
In such a situation, it may be necessary to review the settings in the `Pool,
PG and CRUSH Config Reference`_ and make appropriate adjustments.
In such a situation, review the settings in the `Pool, PG and CRUSH Config
Reference`_ and make appropriate adjustments.
As a general rule, run your cluster with more than one OSD and a pool size
greater than two object replicas.
of greater than two object replicas.
.. _one-node-cluster:

View File

@ -77,14 +77,14 @@ allow it. The account root user can add identity policies to its users in
several ways.
* Add policy directly to the user with the ``iam:PutUserPolicy`` and
``iam:AttachUserPoliicy`` actions.
``iam:AttachUserPolicy`` actions.
* Create an IAM group and add group policy with the ``iam:PutGroupPolicy`` and
``iam:AttachGroupPoliicy`` actions. Users added to that group with the
``iam:AttachGroupPolicy`` actions. Users added to that group with the
``iam:AddUserToGroup`` action will inherit all of the group's policy.
* Create an IAM role and add role policy with the ``iam:PutRolePolicy`` and
``iam:AttachRolePoliicy`` actions. Users that assume this role with the
``iam:AttachRolePolicy`` actions. Users that assume this role with the
``sts:AssumeRole`` and ``sts:AssumeRoleWithWebIdentity`` actions will inherit
all of the role's policy.
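For example, the first of these options might look like the following with `awscli`_ (the endpoint, user name, policy name, and policy file are hypothetical):
.. prompt:: bash $
aws --endpoint-url http://localhost:8000 iam put-user-policy --user-name testuser --policy-name AllowAllS3 --policy-document file://policy.json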
@ -177,8 +177,8 @@ An existing user can be adopted into an account with ``user modify``::
.. warning:: Ownership of the user's notification topics will not be
transferred to the account. Notifications will continue to work, but
the topics will no longer be visible to SNS Topic APIs. Topics and
their associated bucket notifications should be removed before migration
and recreated within the account.
their associated bucket notifications can be migrated as described below
in `Migrating Notification Topics`_.
Because account users have no permissions by default, some identity policy must
be added to restore the user's original permissions.
@ -187,6 +187,44 @@ Alternatively, you may want to create a new account for each existing user. In
that case, you may want to add the ``--account-root`` option to make each user
the root user of their account.
Migrating Notification Topics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Account topics are supported only when the ``notification_v2`` feature is enabled,
as described in `Bucket Notifications`_ and `Supported Zone Features`_.
1. ``Migration Impact``: When a non-account user is migrated to an account, the
existing notification topics remain accessible through the RadosGW admin API,
but the user loses access to them via the SNS Topic API. Despite this, the topics
remain functional, and bucket notifications will continue to be delivered as expected.
2. ``Re-creation of Topics``: The account user should re-create the topics using
the same names. The old topics (now inaccessible) and the new account-owned topics
will coexist without interference.
3. ``Updating Bucket Notification Configurations``: Buckets that are subscribed to
the old user-owned topics should be updated to use the new account-owned topics.
To prevent duplicate notifications, maintain the same notification IDs.
For example, if a bucket's existing notification configuration is:
.. code-block:: json
{"TopicConfigurations": [{ "Id": "ID1", "TopicArn": "arn:aws:sns:default::topic1", "Events": ["s3:ObjectCreated:*"]}]}
The updated configuration would be:
.. code-block:: json
{"TopicConfigurations": [{ "Id": "ID1", "TopicArn": "arn:aws:sns:default:RGW00000000000000001:topic1", "Events": ["s3:ObjectCreated:*"]}]}
In this example, `RGW00000000000000001` is the account ID, `topic1` is the
topic name and `ID1` is the notification ID.
4. ``Removing Old Topics``: Once no buckets are subscribed to the old user-owned topics,
they can be removed by an admin::
$ radosgw-admin topic rm --topic topic1
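As a hedged example of step 3 above, the updated configuration can be applied with `awscli`_ (the endpoint, bucket name, and file name are hypothetical):
.. prompt:: bash $
aws --endpoint-url http://localhost:8000 s3api put-bucket-notification-configuration --bucket mybucket --notification-configuration file://notification.json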
Account Root example
--------------------
@ -252,3 +290,5 @@ This example uses `awscli`_ to create an IAM user for S3 operations.
.. _Evaluating policies within a single account: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic.html#policy-eval-basics
.. _Cross-account policy evaluation logic: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic-cross-account.html
.. _awscli: https://docs.aws.amazon.com/cli/latest/
.. _Bucket Notifications: ../notifications/
.. _Supported Zone Features: ../zone-features/#supported-features

View File

@ -4,34 +4,30 @@ Archive Sync Module
.. versionadded:: Nautilus
This sync module leverages the versioning feature of the S3 objects in RGW to
have an archive zone that captures the different versions of the S3 objects
as they occur over time in the other zones.
The Archive Sync module uses the RGW versioning feature of S3 objects to
maintain an archive zone that captures successive versions of objects
as they are updated in other zones. Archive zone objects can
be removed only through gateways associated with the archive zone.
An archive zone allows to have a history of versions of S3 objects that can
only be eliminated through the gateways associated with the archive zone.
This functionality is useful to have a configuration where several
This enables a deployment where several
non-versioned zones replicate their data and metadata through their zone
gateways (mirror configuration) providing high availability to the end users,
while the archive zone captures all the data updates and metadata for
consolidate them as versions of S3 objects.
while the archive zone captures data and metadata updates.
Including an archive zone in a multizone configuration allows you to have the
flexibility of an S3 object history in one only zone while saving the space
that the replicas of the versioned S3 objects would consume in the rest of the
Deploying an archive zone in a multizone configuration enables the
flexibility of S3 object history in a single zone while saving the space
that replicas of versioned S3 objects would consume in the rest of the
zones.
Archive Sync Tier Type Configuration
------------------------------------
How to Configure
~~~~~~~~~~~~~~~~
See `Multisite Configuration`_ for how to multisite config instructions. The
archive sync module requires a creation of a new zone. The zone tier type needs
See `Multisite Configuration`_ for multisite configuration instructions. The
archive sync module requires the creation of a new zone. The zone tier type needs
to be defined as ``archive``:
::

View File

@ -265,18 +265,18 @@ QoS settings
.. versionadded:: Nautilus
The ``civetweb`` frontend has a threading model that uses a thread per
The older and now non-default ``civetweb`` frontend has a threading model that uses a thread per
connection and hence is automatically throttled by :confval:`rgw_thread_pool_size`
configurable when it comes to accepting connections. The newer ``beast`` frontend is
not restricted by the thread pool size when it comes to accepting new
connections, so a scheduler abstraction is introduced in the Nautilus release
to support future methods of scheduling requests.
when accepting connections. The newer and default ``beast`` frontend is
not limited by the thread pool size when it comes to accepting new
connections, so a scheduler abstraction was introduced in the Nautilus release
to support additional methods of scheduling requests.
Currently the scheduler defaults to a throttler which throttles the active
connections to a configured limit. QoS based on mClock is currently in an
*experimental* phase and not recommended for production yet. Current
implementation of *dmclock_client* op queue divides RGW ops on admin, auth
(swift auth, sts) metadata & data requests.
Currently the scheduler defaults to a throttler that limits active
connections to a configured limit. QoS rate limiting based on mClock is currently in an
*experimental* phase and is not recommended for production. The current
implementation of the *dmclock_client* op queue divides RGW ops into admin, auth
(swift auth, sts), metadata, and data requests.
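For example (a hedged sketch; the value is illustrative), the throttler's limit can be raised through the usual configuration mechanism:
.. prompt:: bash $
ceph config set client.rgw rgw_max_concurrent_requests 2048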
.. confval:: rgw_max_concurrent_requests
@ -306,9 +306,9 @@ D4N Settings
============
D4N is a caching architecture that utilizes Redis to speed up S3 object storage
operations by establishing shared databases between different RGW access points.
operations by establishing shared databases among Ceph Object Gateway (RGW) daemons.
Currently, the architecture can only function on one Redis instance at a time.
The D4N architecture can only function on one Redis instance at a time.
The address is configurable and can be changed by accessing the parameters
below.
@ -318,18 +318,18 @@ below.
Topic persistency settings
==========================
Topic persistency will persistently push the notification until it succeeds.
Topic persistency will repeatedly push notifications until they succeed.
For more information, see `Bucket Notifications`_.
The default behavior is to push indefinitely and as frequently as possible.
With these settings you can control how long and how often to retry an
unsuccessful notification. How long to persistently push can be controlled
by providing maximum time of retention or maximum amount of retries.
Frequency of persistent push retries can be controlled with the sleep duration
unsuccessful notification by configuring the maximum retention time and/or the
maximum number of retries.
The interval between push retries can be configured via the sleep duration
parameter.
All of these values have default value 0 (persistent retention is indefinite,
and retried as frequently as possible).
All of these options default to the value `0`, which means that persistent
retention is indefinite, and notifications are retried as frequently as possible.
.. confval:: rgw_topic_persistency_time_to_live
.. confval:: rgw_topic_persistency_max_retries
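For example (a hedged sketch; the values are illustrative), retries could be capped at 10 over a 24-hour retention window:
.. prompt:: bash #
ceph config set client.rgw rgw_topic_persistency_time_to_live 86400
ceph config set client.rgw rgw_topic_persistency_max_retries 10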

View File

@ -82,6 +82,14 @@ is mounted at `/mnt/nvme0` and has `10 GB` of free space available for the cache
The persistent path directory has to be created before starting the Gateway.
(``mkdir -p /mnt/nvme0/rgw_datacache/client.rgw.8000/``)
In containerized deployments the cache directory should be mounted as a volume::
extra_container_args:
- "-v"
- "/mnt/nvme0/rgw_datacache/client.rgw.8000/:/mnt/nvme0/rgw_datacache/client.rgw.8000/"
(Reference: `Service Management - Mounting Files with Extra Container Arguments`_)
If another Gateway is co-located on the same machine, configure its persistent path to a discrete directory,
for example, in the case of ``[client.rgw.8001]``, configure
``rgw_d3n_l1_datacache_persistent_path = "/mnt/nvme0/rgw_datacache/client.rgw.8001/"``
@ -114,3 +122,4 @@ The following D3N related settings can be added to the Ceph configuration file
.. _Rados Gateway Compression: ../compression/
.. _Rados Gateway Encryption: ../encryption/
.. _RGW Data cache and CDN: ../rgw-cache/
.. _Service Management - Mounting Files with Extra Container Arguments: ../cephadm/services/#mounting-files-with-extra-container-arguments

View File

@ -508,7 +508,7 @@ For example:
Updating the Period
-------------------
After updating the master zone configuration, update the period:
After updating the secondary zone configuration, update the period:
.. prompt:: bash #

View File

@ -7,6 +7,10 @@ Bucket Notifications
.. versionchanged:: Squid
A new "v2" format for Topic and Notification metadata can be enabled with
the :ref:`feature_notification_v2` zone feature.
Enabling this feature after an upgrade from an older version will trigger
migration of the existing Topic and Notification metadata.
In a greenfield deployment, the new format will be used.
The new format allows for the data to be synced between zones in the zonegroup.
.. contents::
@ -61,9 +65,15 @@ Asynchronous Notifications
~~~~~~~~~~~~~~~~~~~~~~~~~~
Notifications can be sent asynchronously. They are committed into persistent
storage and then asynchronously sent to the topic's configured endpoint. In
this case, the only latency added to the original operation is the latency
storage and then asynchronously sent to the topic's configured endpoint.
The notification will be committed to persistent storage only if the triggering
operation was successful.
In this case, the only latency added to the original operation is the latency
added when the notification is committed to persistent storage.
If the endpoint of the topic to which the notification is sent is not available for a long
period of time, the persistent storage allocated for this topic will eventually fill up.
When this happens the triggering operations will fail with ``503 Service Unavailable``,
which tells the client that it may retry later.
.. note:: If the notification fails with an error, cannot be delivered, or
times out, it is retried until it is successfully acknowledged.
@ -98,6 +108,18 @@ Remove a topic by running the following command:
radosgw-admin topic rm --topic={topic-name} [--tenant={tenant}]
Fetch persistent topic stats (i.e. reservations, entries and size) by running the following command:
.. prompt:: bash #
radosgw-admin topic stats --topic={topic-name} [--tenant={tenant}]
Dump (in JSON format) all pending bucket notifications of a persistent topic by running the following command:
.. prompt:: bash #
radosgw-admin topic dump --topic={topic-name} [--tenant={tenant}] [--max-entries={max-entries}]
Notification Performance Statistics
-----------------------------------

View File

@ -33,13 +33,20 @@ QAT Environment Setup
encryption and compression services. And QAT driver in kernel space have to
be loaded to drive the hardware.
The driver package can be downloaded from `Intel Quickassist Technology`_.
The out-of-tree QAT driver package can be downloaded from `Intel Quickassist
Technology`_.
2. The implementation for QAT based encryption is directly base on QAT API which
is included the driver package. But QAT support for compression depends on
QATzip project, which is a user space library which builds on top of the QAT
API. Currently, QATzip speeds up gzip compression and decompression at the
time of writing.
The QATlib can be downloaded from `qatlib`_, which is used for the in-tree QAT
driver.
.. note::
The out-of-tree QAT driver is gradually being migrated to an in-tree driver+QATlib.
2. The implementation of QAT-based encryption is directly based on the QAT API,
which is included in the driver package. However, QAT support for compression
depends on the QATzip project, which is a userspace library that builds on
top of the QAT API. At the time of writing (July 2024), QATzip speeds up
gzip compression and decompression.
See `QATzip`_.
@ -48,36 +55,39 @@ Implementation
1. QAT based Encryption for RGW
`OpenSSL support for RGW encryption`_ has been merged into Ceph, and Intel also
provides one `QAT Engine`_ for OpenSSL. So, theoretically speaking, QAT based
encryption in Ceph can be directly supported through OpenSSl+QAT Engine.
provides one `QAT Engine`_ for OpenSSL. Theoretically, QAT-based encryption in
Ceph can be directly supported through the OpenSSL+QAT Engine.
But the QAT Engine for OpenSSL currently supports chained operations only, and
so Ceph will not be able to utilize QAT hardware feature for crypto operations
based on OpenSSL crypto plugin. As a result, one QAT plugin based on native
QAT API is added into crypto framework.
However, the QAT Engine for OpenSSL currently supports only chained operations,
which means that Ceph will not be able to utilize QAT hardware features for
crypto operations based on the OpenSSL crypto plugin. As a result, one QAT plugin
based on native QAT API is added into the crypto framework.
2. QAT Support for Compression
As mentioned above, QAT support for compression is based on QATzip library in
user space, which is designed to take full advantage of the performance provided
by QuickAssist Technology. Unlike QAT based encryption, QAT based compression
is supported through a tool class for QAT acceleration rather than a compressor
plugin. The common tool class can transparently accelerate the existing compression
types, but only zlib compressor can be supported at the time of writing. So
user is allowed to use it to speed up zlib compressor as long as the QAT
hardware is available and QAT is capable to handle it.
As mentioned above, QAT support for compression is based on the QATzip library
in user space, which is designed to take full advantage of the performance that
QuickAssist Technology provides. Unlike QAT-based encryption, QAT-based
compression is supported through a tool class for QAT acceleration rather than
a compressor plugin. This common tool class can transparently accelerate the
existing compression types, but only the zlib compressor is supported at the
time of writing. This means that this tool class can be used to speed up
the zlib compressor if QAT hardware is available.
Configuration
=============
#. Prerequisites
Make sure the QAT driver with version v1.7.L.4.14.0 or higher has been installed.
Remember to set an environment variable "ICP_ROOT" for your QAT driver package
root directory.
**For out-of-tree QAT**
To enable the QAT based encryption and compression, user needs to modify the QAT
configuration files. For example, for Intel QuickAssist Adapter 8970 product, revise
c6xx_dev0/1/2.conf in the directory ``/etc/`` and keep them the same, e.g.:
Make sure the out-of-tree QAT driver with version v1.7.L.4.14.0 or higher
has been installed. Remember to set an environment variable ``ICP_ROOT``
for your QAT driver package root directory.
To enable the QAT based encryption and compression, the user must modify the
QAT configuration files. For example, for the Intel QuickAssist Adapter 8970
product, revise ``c6xx_dev0/1/2.conf`` in the directory ``/etc/`` and keep them
the same. For example:
.. code-block:: ini
@ -101,51 +111,121 @@ Configuration
# List of core affinities
Dc0CoreAffinity = 0
#. QAT based Encryption for RGW
**For in-tree QAT**
The CMake option ``WITH_QAT=ON`` must be configured. If you build Ceph from
There are some prerequisites for using QATlib. Make sure that your system
meets the `QATlib System Requirements`_ .
* To properly use the QATlib library, the Intel VT-d and SR-IOV parameters
must be enabled in the platform BIOS.
* Some QATlib features require a recent kernel driver or firmware version.
See `QATlib Kernel Driver Releases`_.
* The supported platform contains a 4xxx Intel Communications device or
newer.
* The ``intel_iommu`` parameter must be enabled. Verify that this setting is
enabled by running the following commands:
.. prompt:: bash $
cat /proc/cmdline | grep intel_iommu=on
sudo sh -c 'echo "@qat - memlock 204800" >> /etc/security/limits.conf'
sudo su -l $USER
For configuration and Tuning see `QATlib Configuration and Tuning`_.
#. QAT-based Encryption for RGW
The CMake option ``WITH_QATDRV=ON`` must be set. If you build Ceph from
source code (see: :ref:`build-ceph`), navigate to your cloned Ceph repository
and execute the following:
.. prompt:: bash $
cd ceph
./do_cmake.sh -DWITH_QAT=ON
./do_cmake.sh -DWITH_QATDRV=ON
cd build
ninja
.. note::
The section name of the QAT configuration files must be ``CEPH`` since
the section name is set as "CEPH" in Ceph crypto source code.
.. note:: The section name in QAT configuration files must be ``CEPH``,
because the section name is set to ``CEPH`` in the Ceph crypto source code.
Then, edit the Ceph configuration file to make use of QAT based crypto plugin::
Edit the Ceph configuration file (usually ``ceph.conf``) to make use of the
QAT-based crypto plugin::
plugin crypto accelerator = crypto_qat
#. QAT Support for Compression
Before starting, make sure both QAT driver and `QATzip`_ have been installed. Besides
"ICP_ROOT", remember to set the environment variable "QZ_ROOT" for the root directory
of your QATzip source tree.
**For out-of-tree QAT**
The following CMake options have to be configured to trigger QAT based compression
when building Ceph:
For the out-of-tree QAT driver package, before building ensure that both the QAT
driver and `QATzip`_ have been installed. In addition to ``ICP_ROOT``,
set the environment variable ``QZ_ROOT`` to the root directory of your QATzip
source tree.
The following CMake options must be configured to trigger QAT-based
compression when building Ceph:
.. prompt:: bash $
./do_cmake.sh -DWITH_QAT=ON -DWITH_QATZIP=ON
./do_cmake.sh -DWITH_QATDRV=ON -DWITH_QATZIP=ON -DWITH_SYSTEM_QATZIP=ON -DWITH_QATLIB=OFF
Then, set an environment variable to clarify the section name of User Process Instance
Section in QAT configuration files, e.g.:
Set an environment variable to clarify the section name of the User Process
Instance Section in the QAT configuration files. For example:
.. prompt:: bash $
export QAT_SECTION_NAME=CEPH
Next, edit the Ceph configuration file to enable QAT support for compression::
**For in-tree QAT**
For in-tree QAT, ensure that your system meets the `QATlib System
Requirements`_. QATlib can be installed from pre-built packages or from
source code. See `QATlib Installation`_. After QATlib is installed, you
can run ``cpa_sample_code`` to check whether the QAT environment is working.
If you are using QATlib source code, the Ceph ``cmake`` build enables the
qatlib and qatzip options by default, so a normal compilation
already includes the QAT-compressor-related code.
.. prompt:: bash $
./do_cmake.sh
If you are using pre-built packages installed on the system, the following
CMake options must be configured when building Ceph:
.. prompt:: bash $
./do_cmake.sh -DWITH_SYSTEM_QATLIB=ON -DWITH_SYSTEM_QATZIP=ON
**For both out-of-tree QAT and in-tree QAT**
Edit Ceph's central config DB or configuration file (usually ``ceph.conf``) to enable QAT
support for *zlib* compression::
qat compressor enabled=true
Set the RGW compression method:
.. prompt:: bash $
# for storage class(STANDARD)
radosgw-admin zone placement modify --rgw-zone=default --placement-id=default-placement --compression=zlib
# or create a new storage class(COLD) and define data pool(default.rgw.cold.data)
radosgw-admin zonegroup placement add --rgw-zonegroup default --placement-id default-placement --storage-class COLD
radosgw-admin zone placement add --rgw-zone default --placement-id default-placement --storage-class COLD --compression zlib --data-pool default.rgw.cold.data
CONFIG REFERENCE
================
The following QAT-related settings can be added to the Ceph configuration file
(usually `ceph.conf`) under the ``[client.rgw.{instance-name}]`` section.
.. confval:: qat_compressor_session_max_number
.. confval:: qat_compressor_busy_polling
.. _QAT Support for Compression: https://github.com/ceph/ceph/pull/19714
.. _QAT based Encryption for RGW: https://github.com/ceph/ceph/pull/19386
@ -153,3 +233,9 @@ Configuration
.. _QATzip: https://github.com/intel/QATzip
.. _OpenSSL support for RGW encryption: https://github.com/ceph/ceph/pull/15168
.. _QAT Engine: https://github.com/intel/QAT_Engine
.. _qatlib: https://github.com/intel/qatlib
.. _QATlib User's Guide: https://intel.github.io/quickassist/qatlib/index.html
.. _QATlib System Requirements: https://intel.github.io/quickassist/qatlib/requirements.html
.. _QATlib Installation: https://intel.github.io/quickassist/qatlib/install.html
.. _QATlib Configuration and Tuning: https://intel.github.io/quickassist/qatlib/configuration.html
.. _QATlib Kernel Driver Releases: https://intel.github.io/quickassist/RN/In-Tree/in_tree_firmware_RN.html#qat-kernel-driver-releases-features

View File

@ -7,22 +7,47 @@
Bucket and Host Name
--------------------
There are two different modes of accessing the buckets. The first (preferred) method
identifies the bucket as the top-level directory in the URI. ::
There are two different modes of accessing buckets. The first method identifies
the bucket as the top-level directory in the URI::
GET /mybucket HTTP/1.1
Host: cname.domain.com
The second method identifies the bucket via a virtual bucket host name. For example::
Most S3 clients nowadays rely on vhost-style access. The desired bucket is
indicated by a DNS FQDN. For example::
GET / HTTP/1.1
Host: mybucket.cname.domain.com
To configure virtual hosted buckets, you can either set ``rgw_dns_name = cname.domain.com`` in ceph.conf, or add ``cname.domain.com`` to the list of ``hostnames`` in your zonegroup configuration. See `Ceph Object Gateway - Multisite Configuration`_ for more on zonegroups.
Path-style access (the first method) is deprecated by AWS. See the `Amazon S3 Path Deprecation
Plan`_ for more information.
.. tip:: We prefer the first method, because the second method requires expensive domain certification and DNS wild cards.
To configure virtual hosted buckets, you can either set ``rgw_dns_name =
cname.domain.com`` in ``ceph.conf`` or add ``cname.domain.com`` to the list of
``hostnames`` in your zonegroup configuration. See `Ceph Object Gateway -
Multisite Configuration`_ for more on zonegroups.
Here is an example of a ``ceph config set`` command that sets ``rgw_dns_name``
to ``cname.domain.com``:
.. prompt:: bash $
ceph config set client.rgw.<cephx client name for rgw> rgw_dns_name cname.domain.com
.. tip:: You can define multiple hostnames directly with the
:confval:`rgw_dns_name` parameter.
.. tip:: When SSL is enabled, the certificates must use a wildcard in the
domain name in order to match the bucket subdomains.
.. note:: When Ceph Object Gateways are behind a proxy, use the proxy's DNS
name instead. Then you can use ``ceph config set client.rgw`` to set the DNS
name for all instances.
.. note:: The static website view for the `s3website` API must be served under
a different domain name. This is configured separately from
:confval:`rgw_dns_name`, in :confval:`rgw_dns_s3website_name`.
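As a hedged example (the hostname is illustrative), the static-website hostname can be set in the same way:
.. prompt:: bash $
ceph config set client.rgw rgw_dns_s3website_name website.domain.com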
.. tip:: You can define multiple hostname directly with the :confval:`rgw_dns_name` parameter.
Common Request Headers
----------------------
@ -111,3 +136,4 @@ Common Response Status
+---------------+-----------------------------------+
.. _`Ceph Object Gateway - Multisite Configuration`: ../../multisite
.. _`Amazon S3 Path Deprecation Plan`: https://aws.amazon.com/blogs/aws/amazon-s3-path-deprecation-plan-the-rest-of-the-story/

View File

@ -8,6 +8,9 @@ Ceph is a clustered and distributed storage manager. If that's too cryptic,
then just think of Ceph as a computer program that stores data and uses a
network to make sure that there is a backup copy of the data.
Components of Ceph
==================
Storage Interfaces
------------------
@ -94,6 +97,89 @@ MDS
A metadata server (MDS) is necessary for the proper functioning of CephFS.
See :ref:`orchestrator-cli-cephfs` and :ref:`arch-cephfs`.
Vstart Cluster Installation and Configuration Procedure
=======================================================
#. Clone the ``ceph/ceph`` repository:
.. prompt:: bash #
git clone git@github.com:ceph/ceph
#. Update the submodules in the ``ceph/ceph`` repository:
.. prompt:: bash #
git submodule update --init --recursive --progress
#. Run ``install-deps.sh`` from within the directory into which you cloned the
``ceph/ceph`` repository:
.. prompt:: bash #
./install-deps.sh
#. Install the ``python3-routes`` package:
.. prompt:: bash #
apt install python3-routes
#. Move into the ``ceph`` directory. You will know that you are in the correct
directory if it contains the file ``do_cmake.sh``:
.. prompt:: bash #
cd ceph
#. Run the ``do_cmake.sh`` script:
.. prompt:: bash #
./do_cmake.sh
#. The ``do_cmake.sh`` script creates a ``build/`` directory. Move into the
``build/`` directory:
.. prompt:: bash #
cd build
#. Use ``ninja`` to build the development environment:
.. prompt:: bash #
ninja -j3
.. note:: This step takes a long time to run. The ``ninja -j3`` command
kicks off a process consisting of 2289 steps. This step took over three
hours when I ran it on an Intel NUC with an i7 in September of 2024.
#. Install the Ceph development environment:
.. prompt:: bash #
ninja install
This step does not take as long as the previous step.
#. Build the vstart cluster:
.. prompt:: bash #
ninja vstart
#. Start the vstart cluster:
.. prompt:: bash #
../src/vstart.sh --debug --new -x --localhost --bluestore
.. note:: Run this command from within the ``ceph/build`` directory.
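To confirm that the vstart cluster is running, query it with the freshly built client binary (run this from within the ``ceph/build`` directory):
.. prompt:: bash #
./bin/ceph -s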
LINKS
-----

View File

@ -860,7 +860,7 @@ possible, we prefer to maintain this convention with text, lists, literal text
lines should begin at the same character position as the text of the
indented text (less numbers, bullets, etc.).
Indented text may include literal text examples. Whereas, text indentation
Indented text may include literal text examples. Although text indentation
should be done with spaces, literal text examples should be indented with
tabs. This convention enables you to add an additional indented paragraph
following a literal example by leaving a blank line and beginning the

View File

@ -22,13 +22,12 @@ another, but below are some general guidelines.
CPU
===
CephFS Metadata Servers (MDS) are CPU-intensive. They are
are single-threaded and perform best with CPUs with a high clock rate (GHz). MDS
servers do not need a large number of CPU cores unless they are also hosting other
services, such as SSD OSDs for the CephFS metadata pool.
OSD nodes need enough processing power to run the RADOS service, to calculate data
placement with CRUSH, to replicate data, and to maintain their own copies of the
cluster map.
CephFS Metadata Servers (MDS) are CPU-intensive. They are single-threaded
and perform best with CPUs with a high clock rate (GHz). MDS servers do not
need a large number of CPU cores unless they are also hosting other services,
such as SSD OSDs for the CephFS metadata pool. OSD nodes need enough
processing power to run the RADOS service, to calculate data placement with
CRUSH, to replicate data, and to maintain their own copies of the cluster map.
With earlier releases of Ceph, we would make hardware recommendations based on
the number of cores per OSD, but this cores-per-osd metric is no longer as

View File

@ -43,36 +43,68 @@ distribution that includes a supported kernel and supported system startup
framework, for example ``sysvinit`` or ``systemd``. Ceph is sometimes ported to
non-Linux systems but these are not supported by the core Ceph effort.
+---------------+---------------+------------------+------------------+------------------+
| | Reef (18.2.z) | Quincy (17.2.z) | Pacific (16.2.z) | Octopus (15.2.z) |
+===============+===============+==================+==================+==================+
| Centos 7 | | | | B |
+---------------+---------------+------------------+------------------+------------------+
| Centos 8 | | | | |
+---------------+---------------+------------------+------------------+------------------+
| Centos 9 | A H | A :sup:`1` H | | |
+---------------+---------------+------------------+------------------+------------------+
| Debian 10 | C | | C | C |
+---------------+---------------+------------------+------------------+------------------+
| Debian 11 | C | C | C | |
+---------------+---------------+------------------+------------------+------------------+
| OpenSUSE 15.2 | C | | C | C |
+---------------+---------------+------------------+------------------+------------------+
| OpenSUSE 15.3 | C | C | | |
+---------------+---------------+------------------+------------------+------------------+
| Ubuntu 18.04 | | | C | C |
+---------------+---------------+------------------+------------------+------------------+
| Ubuntu 20.04 | A | A | A | A |
+---------------+---------------+------------------+------------------+------------------+
| Ubuntu 22.04 | A H | | | |
+---------------+---------------+------------------+------------------+------------------+
+---------------+----------------+---------------+------------------+------------------+------------------+
| | Squid (19.2.z) | Reef (18.2.z) | Quincy (17.2.z) | Pacific (16.2.z) | Octopus (15.2.z) |
+===============+================+===============+==================+==================+==================+
| Centos 7 | | | | | B |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Centos 8 | | | | | |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Centos 9 | A | A | A :sup:`1` | | |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Debian 10 | | C | | C | C |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Debian 11 | | C | C | C | |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Debian 12 | C | C | | | |
+---------------+----------------+---------------+------------------+------------------+------------------+
| OpenSUSE 15.2 | | C | | C | C |
+---------------+----------------+---------------+------------------+------------------+------------------+
| OpenSUSE 15.3 | | C | C | | |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Ubuntu 18.04 | | | | C | C |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Ubuntu 20.04 | | A | A | A | A |
+---------------+----------------+---------------+------------------+------------------+------------------+
| Ubuntu 22.04 | A | A | | | |
+---------------+----------------+---------------+------------------+------------------+------------------+
- **A**: Ceph provides packages and has done comprehensive tests on the software in them.
- **B**: Ceph provides packages and has done basic tests on the software in them.
- **C**: Ceph provides packages only. No tests have been done on these releases.
- **H**: Ceph tests this distribution as a container host.
- **1**: Testing has been done on Centos 9 starting on version 17.2.8 for Quincy.
Container Hosts
---------------
This table shows the operating systems that support Ceph's official container images.
+---------------+----------------+------------------+------------------+
| | Squid (19.2.z) | Reef (18.2.z) | Quincy (17.2.z) |
+===============+================+==================+==================+
| Centos 7 | | | |
+---------------+----------------+------------------+------------------+
| Centos 8 | | | |
+---------------+----------------+------------------+------------------+
| Centos 9 | H | H | H |
+---------------+----------------+------------------+------------------+
| Debian 10 | | | |
+---------------+----------------+------------------+------------------+
| Debian 11 | | | |
+---------------+----------------+------------------+------------------+
| OpenSUSE 15.2 | | | |
+---------------+----------------+------------------+------------------+
| OpenSUSE 15.3 | | | |
+---------------+----------------+------------------+------------------+
| Ubuntu 18.04 | | | |
+---------------+----------------+------------------+------------------+
| Ubuntu 20.04 | | | |
+---------------+----------------+------------------+------------------+
| Ubuntu 22.04 | H | H | |
+---------------+----------------+------------------+------------------+
- **H**: Ceph tests this distribution as a container host.
.. note::
**For Centos 7 Users**

View File

@ -10,6 +10,7 @@ overrides:
- MDS_FAILED
- MDS_INSUFFICIENT_STANDBY
- MDS_UP_LESS_THAN_MAX
- online, but wants
- filesystem is online with fewer MDS than max_mds
- POOL_APP_NOT_ENABLED
- do not have an application enabled

View File

@ -9,8 +9,6 @@ overrides:
osd pool default crimson: true
osd:
crimson osd obc lru size: 10
mgr:
mgr stats period: 30
flavor: crimson
workunit:
env:

View File

@ -4,6 +4,7 @@ overrides:
selinux:
allowlist:
- scontext=system_u:system_r:logrotate_t:s0
- scontext=system_u:system_r:getty_t:s0
tasks:
- pexec:

View File

@ -1,11 +1,13 @@
Default object size:
$ rbd create --size 20M img
$ DEV=$(sudo rbd map img)
$ blockdev --getiomin $DEV
65536
$ blockdev --getioopt $DEV
65536
4194304
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
65536
$ sudo rbd unmap $DEV
@ -14,7 +16,7 @@
$ blockdev --getiomin $DEV
512
$ blockdev --getioopt $DEV
512
4194304
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
512
$ sudo rbd unmap $DEV
@ -38,3 +40,45 @@
$ sudo rbd unmap $DEV
$ rbd rm --no-progress img
Custom object size:
$ rbd create --size 20M --object-size 1M img
$ DEV=$(sudo rbd map img)
$ blockdev --getiomin $DEV
65536
$ blockdev --getioopt $DEV
1048576
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
65536
$ sudo rbd unmap $DEV
$ DEV=$(sudo rbd map -o alloc_size=512 img)
$ blockdev --getiomin $DEV
512
$ blockdev --getioopt $DEV
1048576
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
512
$ sudo rbd unmap $DEV
$ DEV=$(sudo rbd map -o alloc_size=1048576 img)
$ blockdev --getiomin $DEV
1048576
$ blockdev --getioopt $DEV
1048576
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
1048576
$ sudo rbd unmap $DEV
$ DEV=$(sudo rbd map -o alloc_size=2097152 img)
$ blockdev --getiomin $DEV
1048576
$ blockdev --getioopt $DEV
1048576
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
1048576
$ sudo rbd unmap $DEV
$ rbd rm --no-progress img

View File

@ -76,6 +76,10 @@ function wait_for_state() {
function wait_for_recovery_toofull() {
local timeout=$1
wait_for_state recovery_toofull $timeout
ret=$?
if [ $ret -ne 0 ]; then
echo "Error: Recovery toofull timeout"
return 1
fi
}
@ -131,7 +135,11 @@ function TEST_recovery_test_simple() {
done
# If this times out, we'll detect errors below
wait_for_recovery_toofull 30
wait_for_recovery_toofull 120
if [ $? -ne 0 ]; then
echo "Error: Recovery toofull timeout"
return 1
fi
ERRORS=0
if [ "$(ceph pg dump pgs | grep +recovery_toofull | wc -l)" != "1" ];

View File

@ -229,138 +229,6 @@ function wait_background_check() {
return $return_code
}
# osd_scrub_during_recovery=true make sure scrub happens
function TEST_recovery_scrub_2() {
local dir=$1
local poolname=test
TESTDATA="testdata.$$"
OSDS=8
PGS=32
OBJECTS=40
setup $dir || return 1
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
run_mgr $dir x || return 1
local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 "
ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
for osd in $(seq 0 $(expr $OSDS - 1))
do
run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 \
$ceph_osd_args || return 1
done
# Create a pool with $PGS pgs
create_pool $poolname $PGS $PGS
wait_for_clean || return 1
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
dd if=/dev/urandom of=$TESTDATA bs=1M count=50
for i in $(seq 1 $OBJECTS)
do
rados -p $poolname put obj${i} $TESTDATA
done
rm -f $TESTDATA
ceph osd pool set $poolname size 3
ceph pg dump pgs
# note that the following will be needed if the mclock scheduler is specified
#ceph tell osd.* config get osd_mclock_override_recovery_settings
# the '_max_active' is expected to be 0
ceph tell osd.1 config get osd_recovery_max_active
# both next parameters are expected to be >=3
ceph tell osd.1 config get osd_recovery_max_active_hdd
ceph tell osd.1 config get osd_recovery_max_active_ssd
# Wait for recovery to start
count=0
while(true)
do
#ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
if test $(ceph --format json pg dump pgs |
jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
then
break
fi
sleep 2
if test "$count" -eq "10"
then
echo "Not enough recovery started simultaneously"
return 1
fi
count=$(expr $count + 1)
done
ceph pg dump pgs
pids=""
recov_scrub_count=0
for pg in $(seq 0 $(expr $PGS - 1))
do
run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
done
wait_background_check pids
return_code=$?
if [ $return_code -ne 0 ]; then return $return_code; fi
ERRORS=0
if test $recov_scrub_count -eq 0
then
echo "No scrubs occurred while PG recovering"
ERRORS=$(expr $ERRORS + 1)
fi
pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
pid=$(cat $pidfile)
if ! kill -0 $pid
then
echo "OSD crash occurred"
#tail -100 $dir/osd.0.log
ERRORS=$(expr $ERRORS + 1)
fi
# Work around for http://tracker.ceph.com/issues/38195
kill_daemons $dir #|| return 1
declare -a err_strings
err_strings[0]="not scheduling scrubs due to active recovery"
for osd in $(seq 0 $(expr $OSDS - 1))
do
grep "not scheduling scrubs" $dir/osd.${osd}.log
done
for err_string in "${err_strings[@]}"
do
found=false
for osd in $(seq 0 $(expr $OSDS - 1))
do
if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
then
found=true
fi
done
if [ "$found" = "true" ]; then
echo "Found log message not expected '$err_string'"
ERRORS=$(expr $ERRORS + 1)
fi
done
teardown $dir || return 1
if [ $ERRORS != "0" ];
then
echo "TEST FAILED WITH $ERRORS ERRORS"
return 1
fi
echo "TEST PASSED"
return 0
}
main osd-recovery-scrub "$@"
# Local Variables:

Some files were not shown because too many files have changed in this diff.