mirror of https://git.proxmox.com/git/ceph.git (synced 2025-04-28 10:57:11 +00:00)

import source of Ceph Squid 19.2.1 release

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>

parent 3815e3391b
commit 1852f3843b

1	ceph/.github/labeler.yml (vendored)
@@ -154,6 +154,7 @@ crimson:
  - src/crimson/**
  - src/test/crimson/**
  - qa/suites/crimson-rados/**
  - src/seastar/**

dashboard:
  - src/pybind/mgr/dashboard/**
@@ -27,7 +27,7 @@ b-ranto Boris Ranto <branto@redhat.com>
badone Brad Hubbard <bhubbard@redhat.com>
baruza Barbora Ančincová <bara@redhat.com>
bassamtabbara Bassam Tabbara <bassam.tabbara@quantum.com>
batrick Patrick Donnelly <pdonnell@redhat.com>
batrick Patrick Donnelly <pdonnell@ibm.com>
bigjust Justin Caratzas <jcaratza@redhat.com>
bk201 Kiefer Chang <kiefer.chang@suse.com>
BlaineEXE Blaine Gardner <bgardner@suse.com>
3	ceph/.gitmodules (vendored)

@@ -50,9 +50,6 @@
[submodule "src/c-ares"]
	path = src/c-ares
	url = https://github.com/ceph/c-ares.git
[submodule "src/spawn"]
	path = src/spawn
	url = https://github.com/ceph/spawn.git
[submodule "src/pybind/mgr/rook/rook-client-python"]
	path = src/pybind/mgr/rook/rook-client-python
	url = https://github.com/ceph/rook-client-python.git
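Not part of the diff itself, but worth noting for anyone tracking this change in an existing checkout: after a submodule entry is added or removed in `.gitmodules`, the working tree has to be resynced. A minimal, generic git sequence (no Ceph-specific tooling assumed):

    # refresh submodule URLs/paths after .gitmodules changed upstream
    git pull
    git submodule sync --recursive
    git submodule update --init --recursive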
@@ -543,7 +543,8 @@ Pan Liu <pan.liu@istuary.com> <liupan1111@gmail.com>
Parth Arora <paarora@redhat.com> parth-gr <paarora@redhat.com>
Pascal de Bruijn <pascal@unilogicnetworks.net>
Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
Patrick Donnelly <pdonnell@redhat.com> <pdonell@redhat.com>
Patrick Donnelly <pdonnell@ibm.com> <pdonnell@redhat.com>
Patrick Donnelly <pdonnell@ibm.com> <batrick@batbytes.com>
Patrick McGarry <patrick@inktank.com>
Patrick McGarry <pmcgarry@redhat.com> <pmcgarry@gmail.com>
Patrick Seidensal <pseidensal@suse.com>
@@ -357,6 +357,10 @@ IBM <contact@IBM.com> Neeraj Pratap Singh <Neeraj.Pratap.Singh1@ibm.com>
IBM <contact@IBM.com> Or Ozeri <oro@il.ibm.com>
IBM <contact@IBM.com> Paul Cuzner <pcuzner@ibm.com>
IBM <contact@IBM.com> Samuel Matzek <smatzek@us.ibm.com>
IBM <contact@IBM.com> Shraddha Agrawal <shraddhaag@ibm.com>
IBM <contact@IBM.com> Kushal Deb <Kushal.Deb@ibm.com>
IBM <contact@IBM.com> Shweta Bhosale <Shweta.Bhosale1@ibm.com>
IBM <contact@IBM.com> Patrick Donnelly <pdonnell@ibm.com>
IBM <contact@IBM.com> Sunil Angadi <Sunil.Angadi@ibm.com>
IBM <contact@IBM.com> Teoman Onay <tonay@ibm.com>
IBM <contact@ibm.com> Ulrich Weigand <ulrich.weigand@de.ibm.com>
@@ -73,5 +73,5 @@ Yehuda Sadeh <ysadehwe@redhat.com> Yehuda Sadeh <yehuda@inktank.com>
Yuri Weinstein <yuriw@redhat.com> Yuri Weinstein <yuri.weinstein@inktank.com>
Zhi Zhang <zhangz.david@outlook.com> Zhi (David) Zhang <zhangz@yahoo-inc.com>
Zheng Yin <zhengyin@huayun.com> Zheng Yin <zhengyin@chinac.com>
Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Patrick Donnelly <pdonnell@ibm.com> Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com>
Myoungwon Oh <myoungwon.oh@samsung.com> Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <ohmyoungwon@gmail.com>
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.16)

project(ceph
  VERSION 19.2.0
  VERSION 19.2.1
  LANGUAGES CXX C ASM)

foreach(policy CMP0127 CMP0135)
@@ -173,6 +173,11 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
  default json format produces a rather massive output in large clusters and
  isn't scalable. So we have removed the 'network_ping_times' section from
  the output. Details in the tracker: https://tracker.ceph.com/issues/57460
* mgr/REST: The REST manager module will trim requests based on the 'max_requests' option.
  Without this feature, and in the absence of manual deletion of old requests,
  the accumulation of requests in the array can lead to Out Of Memory (OOM) issues,
  resulting in the Manager crashing.
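  A minimal sketch of tuning this limit from the CLI, assuming the option is
  exposed through the usual manager-module config path (the exact key
  `mgr/restful/max_requests` is an assumption, not quoted from the release note):

      # cap the number of retained request objects in the RESTful manager module
      ceph config set mgr mgr/restful/max_requests 1024
      ceph config get mgr mgr/restful/max_requests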
* CephFS: The `subvolume snapshot clone` command now depends on the config option
  `snapshot_clone_no_wait` which is used to reject the clone operation when
  all the cloner threads are busy. This config option is enabled by default which means
@@ -231,6 +236,23 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
  confirmation flag when some MDSs exhibit health warning MDS_TRIM or
  MDS_CACHE_OVERSIZED. This is to prevent accidental MDS failover causing
  further delays in recovery.
* Based on tests performed at scale on a HDD based Ceph cluster, it was found
  that scheduling with mClock was not optimal with multiple OSD shards. For
  example, in the test cluster with multiple OSD node failures, the client
  throughput was found to be inconsistent across test runs coupled with multiple
  reported slow requests. However, the same test with a single OSD shard and
  with multiple worker threads yielded significantly better results in terms of
  consistency of client and recovery throughput across multiple test runs.
  Therefore, as an interim measure until the issue with multiple OSD shards
  (or multiple mClock queues per OSD) is investigated and fixed, the following
  change to the default HDD OSD shard configuration is made:
  - osd_op_num_shards_hdd = 1 (was 5)
  - osd_op_num_threads_per_shard_hdd = 5 (was 1)
  For more details see https://tracker.ceph.com/issues/66289.
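  A short sketch of inspecting, and if needed overriding, the new HDD shard
  defaults with the standard `ceph config` commands (the values below are
  illustrative; changing these options requires an OSD restart to take effect):

      # confirm the defaults shipped with this release
      ceph config get osd osd_op_num_shards_hdd
      ceph config get osd osd_op_num_threads_per_shard_hdd

      # revert to the pre-Squid sharding if your own testing favours it
      ceph config set osd osd_op_num_shards_hdd 5
      ceph config set osd osd_op_num_threads_per_shard_hdd 1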
* NFS: The export create/apply of CephFS based exports will now have an additional parameter `cmount_path` under FSAL block,
* NFS: The export create/apply of CephFS based exports will now have an additional parameter `cmount_path` under the FSAL block,
  which specifies the path within the CephFS to mount this export on. If this and the other
  `EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`.

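  A hedged example of an export spec carrying the new field, applied through
  `ceph nfs export apply` (the cluster name `mynfs` and the paths are
  placeholders; only `cmount_path` is the new parameter described above):

      ceph nfs export apply mynfs -i - <<'EOF'
      {
        "export_id": 1,
        "path": "/volumes/group1/subvol1",
        "pseudo": "/cephfs-a",
        "protocols": [4],
        "fsal": {
          "name": "CEPH",
          "fs_name": "cephfs",
          "cmount_path": "/volumes"
        }
      }
      EOF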
>=18.0.0
@@ -45,19 +45,21 @@ out the git submodules associated with it:

## Build Prerequisites

*section last updated 27 Jul 2023*
*section last updated 06 Sep 2024*

Make sure that ``curl`` is installed. The Debian and Ubuntu ``apt`` command is
provided here, but if you use a system with a different package manager, then
you must use whatever command is the proper counterpart of this one:
We provide the Debian and Ubuntu ``apt`` commands in this procedure. If you use
a system with a different package manager, then you will have to use different
commands.

#. Install ``curl``:

       apt install curl

Install Debian or RPM package dependencies by running the following command:
#. Install package dependencies by running the ``install-deps.sh`` script:

       ./install-deps.sh

Install the ``python3-routes`` package:
#. Install the ``python3-routes`` package:

       apt install python3-routes

@@ -70,44 +72,56 @@ we recommend that you build `.deb` or `.rpm` packages, or refer to
``ceph.spec.in`` or ``debian/rules`` to see which configuration options are
specified for production builds.

To build Ceph, make sure that you are in the top-level `ceph` directory that
contains `do_cmake.sh` and `CONTRIBUTING.rst` and run the following commands:
To build Ceph, follow this procedure:

    ./do_cmake.sh
    cd build
    ninja

1. Make sure that you are in the top-level `ceph` directory that
   contains `do_cmake.sh` and `CONTRIBUTING.rst`.
2. Run the `do_cmake.sh` script:

``do_cmake.sh`` by default creates a "debug build" of Ceph, which can be up to
five times slower than a non-debug build. Pass
``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to ``do_cmake.sh`` to create a non-debug
build.

    ./do_cmake.sh

[Ninja](https://ninja-build.org/) is the buildsystem used by the Ceph project
to build test builds. The number of jobs used by `ninja` is derived from the
number of CPU cores of the building host if unspecified. Use the `-j` option to
limit the job number if the build jobs are running out of memory. If you
attempt to run `ninja` and receive a message that reads `g++: fatal error:
Killed signal terminated program cc1plus`, then you have run out of memory.
Using the `-j` option with an argument appropriate to the hardware on which the
`ninja` command is run is expected to result in a successful build. For example,
to limit the job number to 3, run the command `ninja -j 3`. On average, each
`ninja` job run in parallel needs approximately 2.5 GiB of RAM.
   ``do_cmake.sh`` by default creates a "debug build" of Ceph, which can be
   up to five times slower than a non-debug build. Pass
   ``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to ``do_cmake.sh`` to create a
   non-debug build.
3. Move into the `build` directory:

This documentation assumes that your build directory is a subdirectory of the
`ceph.git` checkout. If the build directory is located elsewhere, point
`CEPH_GIT_DIR` to the correct path of the checkout. Additional CMake args can
be specified by setting ARGS before invoking ``do_cmake.sh``. See [cmake
options](#cmake-options) for more details. For example:

    cd build

4. Use the `ninja` buildsystem to build the development environment:

    ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh
    ninja -j3

To build only certain targets, run a command of the following form:
   > [IMPORTANT]
   >
   > [Ninja](https://ninja-build.org/) is the build system used by the Ceph
   > project to build test builds. The number of jobs used by `ninja` is
   > derived from the number of CPU cores of the building host if unspecified.
   > Use the `-j` option to limit the job number if build jobs are running
   > out of memory. If you attempt to run `ninja` and receive a message that
   > reads `g++: fatal error: Killed signal terminated program cc1plus`, then
   > you have run out of memory.
   >
   > Using the `-j` option with an argument appropriate to the hardware on
   > which the `ninja` command is run is expected to result in a successful
   > build. For example, to limit the job number to 3, run the command `ninja
   > -j3`. On average, each `ninja` job run in parallel needs approximately
   > 2.5 GiB of RAM.

    ninja [target name]

   This documentation assumes that your build directory is a subdirectory of
   the `ceph.git` checkout. If the build directory is located elsewhere, point
   `CEPH_GIT_DIR` to the correct path of the checkout. Additional CMake args
   can be specified by setting ARGS before invoking ``do_cmake.sh``.
   See [cmake options](#cmake-options) for more details. For example:

To install:

    ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh

    ninja install

   To build only certain targets, run a command of the following form:

       ninja [target name]

5. Install the vstart cluster:

       ninja install

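Taken together, a minimal end-to-end sketch of the procedure above (the job
count and the non-debug build type are illustrative choices, not requirements):

    ./install-deps.sh
    ./do_cmake.sh -DCMAKE_BUILD_TYPE=RelWithDebInfo
    cd build
    ninja -j3        # each parallel job needs roughly 2.5 GiB of RAM
    ninja install    # install step for the vstart development cluster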
### CMake Options

@@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker
issue, just add a comment describing what changes you would like to make.
Someone with permissions will make the necessary modifications on your behalf.

For straightforward backports, that's all that you (as the developer of the fix)
need to do. Volunteers from the `Stable Releases and Backports team`_ will
proceed to create Backport issues to track the necessary backports and stage the
backports by opening GitHub PRs with the cherry-picks. If you don't want to
wait, and provided you have sufficient permissions at https://tracker.ceph.com,
you can `create Backport tracker issues` and `stage backports`_ yourself. In
that case, read on.

Authors of pull requests are responsible for creating associated backport pull
requests. As long as you have sufficient permissions at
https://tracker.ceph.com, you can `create Backport tracker issues` and `stage
backports`_ yourself. Read these linked sections to learn how to create
backport tracker issues and how to stage backports:

.. _`create backport tracker issues`:
.. _`backport tracker issue`:

@@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting

Under ordinary circumstances, the developer who merges the ``main`` PR will flag
the ``main`` branch tracker issue for backport by changing the Status to "Pending
Backport", and volunteers from the `Stable Releases and Backports team`_
periodically create backport tracker issues by running the
``backport-create-issue`` script. They also do the actual backporting. But that
does take time and you may not want to wait.
Backport".

You might be tempted to forge ahead and create the backport issues yourself.
Please don't do that - it is difficult (bordering on impossible) to get all the

@@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the
Milestone tag to the stable release the backport PR is targeting. For example,
if the PR is targeting "nautilus", set the Milestone tag to "nautilus".

If you don't have sufficient GitHub permissions to set the Milestone, don't
worry. Members of the `Stable Releases and Backports team`_ periodically run
a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable
branches and automatically adds the correct Milestone tag if it is missing.

Next, check which component label was applied to the ``main`` PR corresponding to
this backport, and double-check that that label is applied to the backport PR as
well. For example, if the ``main`` PR carries the component label "core", the
backport PR should also get that label.

In general, it is the responsibility of the `Stable Releases and Backports
team`_ to ensure that backport PRs are properly labelled. If in doubt, just
leave the labelling to them.

.. _`backport PR reviewing`:
.. _`backport PR testing`:
.. _`backport PR merging`:

@@ -381,9 +366,8 @@ leave the labelling to them.
Reviewing, testing, and merging of backport PRs
-----------------------------------------------

Once your backport PR is open and the Milestone is set properly, the
`Stable Releases and Backports team` will take care of getting the PR
reviewed and tested. Once the PR is reviewed and tested, it will be merged.
Once your backport PR is open, it will be reviewed and tested. When the PR has
been reviewed and tested, it will be merged.

If you would like to facilitate this process, you can solicit reviews and run
integration tests on the PR. In this case, add comments to the PR describing the

@@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the
PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches
unnecessarily complicates the release preparation process, which is done by
volunteers.)


Stable Releases and Backports team
----------------------------------

Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers,
which is charged with maintaining the stable releases and backporting bugfixes
from the ``main`` branch to them. (That team maintains a wiki, accessible by
clicking the `Stable Releases and Backports`_ link, which describes various
workflows in the backporting lifecycle.)

.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki

Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker
issue). The volunteers from the Stable Releases and Backports team will
backport the fix, run regression tests on it, and include it in one or more
future point releases.

@@ -181,7 +181,7 @@
# main package definition
#################################################################################
Name: ceph
Version: 19.2.0
Version: 19.2.1
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2

@@ -197,7 +197,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-19.2.0.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-19.2.1.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x riscv64

@@ -432,9 +432,9 @@ BuildRequires: python%{python3_pkgversion}-scipy
BuildRequires: python%{python3_pkgversion}-werkzeug
BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
BuildRequires: jsonnet
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1

@@ -927,7 +927,6 @@ Requires: parted
Requires: util-linux
Requires: xfsprogs
Requires: python%{python3_pkgversion}-setuptools
Requires: python%{python3_pkgversion}-packaging
Requires: python%{python3_pkgversion}-ceph-common = %{_epoch_prefix}%{version}-%{release}
%description volume
This package contains a tool to deploy OSD with different devices like

@@ -1335,7 +1334,7 @@ This package provides a Ceph hardware monitoring agent.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-19.2.0
%autosetup -p1 -n ceph-19.2.1

%build
# Disable lto on systems that do not support symver attribute
@@ -432,9 +432,9 @@ BuildRequires: python%{python3_pkgversion}-scipy
BuildRequires: python%{python3_pkgversion}-werkzeug
BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
BuildRequires: jsonnet
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1

@@ -927,7 +927,6 @@ Requires: parted
Requires: util-linux
Requires: xfsprogs
Requires: python%{python3_pkgversion}-setuptools
Requires: python%{python3_pkgversion}-packaging
Requires: python%{python3_pkgversion}-ceph-common = %{_epoch_prefix}%{version}-%{release}
%description volume
This package contains a tool to deploy OSD with different devices like
@@ -1,3 +1,9 @@
ceph (19.2.1-1) stable; urgency=medium

  * New upstream release

 -- Ceph Release Team <ceph-maintainers@ceph.io>  Fri, 31 Jan 2025 23:14:10 +0000

ceph (19.2.0-1) stable; urgency=medium

  * New upstream release
42	ceph/cmake/modules/BuildISAL.cmake (Normal file)

@@ -0,0 +1,42 @@
# use an ExternalProject to build isa-l using its makefile
function(build_isal)
  set(isal_BINARY_DIR ${CMAKE_BINARY_DIR}/src/isa-l)
  set(isal_INSTALL_DIR ${isal_BINARY_DIR}/install)
  set(isal_INCLUDE_DIR "${isal_INSTALL_DIR}/include")
  set(isal_LIBRARY "${isal_INSTALL_DIR}/lib/libisal.a")

  # this include directory won't exist until the install step, but the
  # imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES
  file(MAKE_DIRECTORY "${isal_INCLUDE_DIR}")

  set(configure_cmd env CC=${CMAKE_C_COMPILER} ./configure --prefix=${isal_INSTALL_DIR})
  # build a static library with -fPIC that we can link into crypto/compressor plugins
  list(APPEND configure_cmd --with-pic --enable-static --disable-shared)

  # clear the DESTDIR environment variable from debian/rules,
  # because it messes with the internal install paths of arrow's bundled deps
  set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR)

  include(ExternalProject)
  ExternalProject_Add(isal_ext
    SOURCE_DIR "${PROJECT_SOURCE_DIR}/src/isa-l"
    CONFIGURE_COMMAND ./autogen.sh COMMAND ${configure_cmd}
    BUILD_COMMAND ${NO_DESTDIR_COMMAND} make -j3
    BUILD_IN_SOURCE 1
    BUILD_BYPRODUCTS ${isal_LIBRARY}
    INSTALL_COMMAND ${NO_DESTDIR_COMMAND} make install
    UPDATE_COMMAND ""
    LOG_CONFIGURE ON
    LOG_BUILD ON
    LOG_INSTALL ON
    LOG_MERGED_STDOUTERR ON
    LOG_OUTPUT_ON_FAILURE ON)

  # add imported library target ISAL::ISAL
  add_library(ISAL::ISAL STATIC IMPORTED GLOBAL)
  add_dependencies(ISAL::ISAL isal_ext)
  set_target_properties(ISAL::ISAL PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES ${isal_INCLUDE_DIR}
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION ${isal_LIBRARY})
endfunction()
31	ceph/cmake/modules/BuildISALCrypto.cmake (Normal file)

@@ -0,0 +1,31 @@
# use an ExternalProject to build isa-l_crypto using its makefile
function(build_isal_crypto)
  set(ISAL_CRYPTO_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
  set(ISAL_CRYPTO_INCLUDE_DIR "${ISAL_CRYPTO_SOURCE_DIR}/include")
  set(ISAL_CRYPTO_LIBRARY "${ISAL_CRYPTO_SOURCE_DIR}/bin/isa-l_crypto.a")

  include(FindMake)
  find_make("MAKE_EXECUTABLE" "make_cmd")

  include(ExternalProject)
  ExternalProject_Add(isal_crypto_ext
    SOURCE_DIR ${ISAL_CRYPTO_SOURCE_DIR}
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ${make_cmd} -f <SOURCE_DIR>/Makefile.unx
    BUILD_IN_SOURCE 1
    BUILD_BYPRODUCTS ${ISAL_CRYPTO_LIBRARY}
    INSTALL_COMMAND ""
    UPDATE_COMMAND ""
    LOG_CONFIGURE ON
    LOG_BUILD ON
    LOG_MERGED_STDOUTERR ON
    LOG_OUTPUT_ON_FAILURE ON)

  # add imported library target ISAL::Crypto
  add_library(ISAL::Crypto STATIC IMPORTED GLOBAL)
  add_dependencies(ISAL::Crypto isal_crypto_ext)
  set_target_properties(ISAL::Crypto PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES ${ISAL_CRYPTO_INCLUDE_DIR}
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION ${ISAL_CRYPTO_LIBRARY})
endfunction()
218	ceph/container/Containerfile (Normal file)
@ -0,0 +1,218 @@
|
||||
ARG FROM_IMAGE="quay.io/centos/centos:stream9"
|
||||
FROM $FROM_IMAGE
|
||||
|
||||
# allow FROM_IMAGE to be visible inside this stage
|
||||
ARG FROM_IMAGE
|
||||
|
||||
# Ceph branch name
|
||||
ARG CEPH_REF="main"
|
||||
|
||||
# Ceph SHA1
|
||||
ARG CEPH_SHA1
|
||||
|
||||
# Ceph git repo (ceph-ci.git or ceph.git)
|
||||
ARG CEPH_GIT_REPO
|
||||
|
||||
# (optional) Define the baseurl= for the ganesha.repo
|
||||
ARG GANESHA_REPO_BASEURL="https://buildlogs.centos.org/centos/\$releasever-stream/storage/\$basearch/nfsganesha-5/"
|
||||
|
||||
# (optional) Set to "crimson" to install crimson packages.
|
||||
ARG OSD_FLAVOR="default"
|
||||
|
||||
# (optional) Should be 'true' for CI builds (pull from shaman, etc.)
|
||||
ARG CI_CONTAINER="true"
|
||||
|
||||
|
||||
RUN /bin/echo -e "\
|
||||
FROM_IMAGE: ${FROM_IMAGE}\n\
|
||||
CEPH_REF: ${CEPH_REF}\n\
|
||||
GANESHA_REPO_BASEURL: ${GANESHA_REPO_BASEURL} \n\
|
||||
OSD_FLAVOR: ${OSD_FLAVOR} \n\
|
||||
CI_CONTAINER: ${CI_CONTAINER}"
|
||||
|
||||
# Other labels are set automatically by container/build github action
|
||||
# See: https://github.com/opencontainers/image-spec/blob/main/annotations.md
|
||||
LABEL org.opencontainers.image.authors="Ceph Release Team <ceph-maintainers@ceph.io>" \
|
||||
org.opencontainers.image.documentation="https://docs.ceph.com/"
|
||||
|
||||
LABEL \
|
||||
FROM_IMAGE=${FROM_IMAGE} \
|
||||
CEPH_REF=${CEPH_REF} \
|
||||
CEPH_SHA1=${CEPH_SHA1} \
|
||||
CEPH_GIT_REPO=${CEPH_GIT_REPO} \
|
||||
GANESHA_REPO_BASEURL=${GANESHA_REPO_BASEURL} \
|
||||
OSD_FLAVOR=${OSD_FLAVOR}
|
||||
|
||||
|
||||
#===================================================================================================
|
||||
# Install ceph and dependencies, and clean up
|
||||
# IMPORTANT: in official builds, use '--squash' build option to keep image as small as possible
|
||||
# keeping run steps separate makes local rebuilds quick, but images are big without squash option
|
||||
#===================================================================================================
|
||||
|
||||
# Pre-reqs
|
||||
RUN dnf install -y --setopt=install_weak_deps=False epel-release jq
|
||||
|
||||
# Add NFS-Ganesha repo
|
||||
RUN \
|
||||
echo "[ganesha]" > /etc/yum.repos.d/ganesha.repo && \
|
||||
echo "name=ganesha" >> /etc/yum.repos.d/ganesha.repo && \
|
||||
echo "baseurl=${GANESHA_REPO_BASEURL}" >> /etc/yum.repos.d/ganesha.repo && \
|
||||
echo "gpgcheck=0" >> /etc/yum.repos.d/ganesha.repo && \
|
||||
echo "enabled=1" >> /etc/yum.repos.d/ganesha.repo
|
||||
|
||||
# ISCSI repo
|
||||
RUN set -ex && \
|
||||
curl -s -L https://shaman.ceph.com/api/repos/tcmu-runner/main/latest/centos/9/repo?arch=$(arch) -o /etc/yum.repos.d/tcmu-runner.repo && \
|
||||
case "${CEPH_REF}" in \
|
||||
quincy|reef) \
|
||||
curl -fs -L https://download.ceph.com/ceph-iscsi/3/rpm/el9/ceph-iscsi.repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
|
||||
;;\
|
||||
main|*) \
|
||||
curl -fs -L https://shaman.ceph.com/api/repos/ceph-iscsi/main/latest/centos/9/repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\
|
||||
;;\
|
||||
esac
|
||||
|
||||
# Ceph repo
|
||||
RUN --mount=type=secret,id=prerelease_creds set -ex && \
|
||||
rpm --import 'https://download.ceph.com/keys/release.asc' && \
|
||||
ARCH=$(arch); if [ "${ARCH}" == "aarch64" ]; then ARCH="arm64"; fi ;\
|
||||
IS_RELEASE=0 ;\
|
||||
if [[ "${CI_CONTAINER}" == "true" ]] ; then \
|
||||
# TODO: this can return different ceph builds (SHA1) for x86 vs. arm runs. is it important to fix?
|
||||
REPO_URL=$(curl -fs "https://shaman.ceph.com/api/search/?project=ceph&distros=centos/9/${ARCH}&flavor=${OSD_FLAVOR}&ref=${CEPH_REF}&sha1=latest" | jq -r .[0].url) ;\
|
||||
else \
|
||||
IS_RELEASE=1 ;\
|
||||
source /run/secrets/prerelease_creds; \
|
||||
REPO_URL="https://${PRERELEASE_USERNAME}:${PRERELEASE_PASSWORD}@download.ceph.com/prerelease/ceph/rpm-${CEPH_REF}/el9/" ;\
|
||||
fi && \
|
||||
rpm -Uvh "$REPO_URL/noarch/ceph-release-1-${IS_RELEASE}.el9.noarch.rpm" ; \
|
||||
if [[ "$IS_RELEASE" == 1 ]] ; then \
|
||||
sed -i "s;http://download.ceph.com/;https://${PRERELEASE_USERNAME}:${PRERELEASE_PASSWORD}@download.ceph.com/prerelease/ceph/;" /etc/yum.repos.d/ceph.repo ; \
|
||||
dnf clean expire-cache ; \
|
||||
fi
|
||||
|
||||
|
||||
# Copr repos
|
||||
# scikit for mgr-diskprediction-local
|
||||
# ref: https://github.com/ceph/ceph-container/pull/1821
|
||||
RUN \
|
||||
dnf install -y --setopt=install_weak_deps=False dnf-plugins-core && \
|
||||
dnf copr enable -y tchaikov/python-scikit-learn
|
||||
|
||||
# Update package mgr
|
||||
RUN dnf update -y --setopt=install_weak_deps=False
|
||||
|
||||
# Define and install packages
|
||||
# General
|
||||
RUN echo "ca-certificates" > packages.txt
|
||||
# Ceph
|
||||
# TODO: remove lua-devel and luarocks once they are present in ceph.spec.in
|
||||
# ref: https://github.com/ceph/ceph/pull/54575#discussion_r1401199635
|
||||
RUN echo \
|
||||
"ceph-common \
|
||||
ceph-exporter \
|
||||
ceph-grafana-dashboards \
|
||||
ceph-immutable-object-cache \
|
||||
ceph-mds \
|
||||
ceph-mgr-cephadm \
|
||||
ceph-mgr-dashboard \
|
||||
ceph-mgr-diskprediction-local \
|
||||
ceph-mgr-k8sevents \
|
||||
ceph-mgr-rook \
|
||||
ceph-mgr \
|
||||
ceph-mon \
|
||||
ceph-osd \
|
||||
ceph-radosgw lua-devel luarocks \
|
||||
ceph-volume \
|
||||
cephfs-mirror \
|
||||
cephfs-top \
|
||||
kmod \
|
||||
libradosstriper1 \
|
||||
rbd-mirror" \
|
||||
>> packages.txt
|
||||
|
||||
# Optional crimson package(s)
|
||||
RUN if [ "${OSD_FLAVOR}" == "crimson" ]; then \
|
||||
echo "ceph-crimson-osd" >> packages.txt ; \
|
||||
fi
|
||||
|
||||
# Ceph "Recommends"
|
||||
RUN echo "nvme-cli python3-saml smartmontools" >> packages.txt
|
||||
# NFS-Ganesha
|
||||
RUN echo "\
|
||||
dbus-daemon \
|
||||
nfs-ganesha-ceph \
|
||||
nfs-ganesha-rados-grace \
|
||||
nfs-ganesha-rados-urls \
|
||||
nfs-ganesha-rgw \
|
||||
nfs-ganesha \
|
||||
rpcbind \
|
||||
sssd-client" >> packages.txt
|
||||
|
||||
# ISCSI
|
||||
RUN echo "ceph-iscsi tcmu-runner python3-rtslib" >> packages.txt
|
||||
|
||||
# Ceph-CSI
|
||||
# TODO: coordinate with @Madhu-1 to have Ceph-CSI install these itself if unused by ceph
|
||||
# @adk3798 does cephadm use these?
|
||||
RUN echo "attr ceph-fuse rbd-nbd" >> packages.txt
|
||||
|
||||
# Rook (only if packages must be in ceph container image)
|
||||
RUN echo "systemd-udev" >> packages.txt
|
||||
|
||||
# Util packages (should be kept to only utils that are truly very useful)
|
||||
# 'sgdisk' (from gdisk) is used in docs and scripts for clearing disks (could be a risk? @travisn @guits @ktdreyer ?)
|
||||
# 'ps' (from procps-ng) and 'hostname' are very valuable for debugging and CI
|
||||
# TODO: remove sg3_utils once they are moved to ceph.spec.in with libstoragemgmt
|
||||
# ref: https://github.com/ceph/ceph-container/pull/2013#issuecomment-1248606472
|
||||
RUN echo "gdisk hostname procps-ng sg3_utils e2fsprogs lvm2 gcc" >> packages.txt
|
||||
|
||||
# scikit
|
||||
RUN echo "python3-scikit-learn" >> packages.txt
|
||||
|
||||
# ceph-node-proxy
|
||||
RUN echo "ceph-node-proxy" >> packages.txt
|
||||
|
||||
RUN echo "=== PACKAGES TO BE INSTALLED ==="; cat packages.txt
|
||||
RUN echo "=== INSTALLING ===" ; \
|
||||
dnf install -y --setopt=install_weak_deps=False --setopt=skip_missing_names_on_install=False --enablerepo=crb $(cat packages.txt)
|
||||
|
||||
# XXX why isn't this done in the ganesha package?
|
||||
RUN mkdir -p /var/run/ganesha
|
||||
|
||||
# Disable sync with udev since the container can not contact udev
|
||||
RUN \
|
||||
sed -i -e 's/udev_rules = 1/udev_rules = 0/' \
|
||||
-e 's/udev_sync = 1/udev_sync = 0/' \
|
||||
-e 's/obtain_device_list_from_udev = 1/obtain_device_list_from_udev = 0/' \
|
||||
/etc/lvm/lvm.conf && \
|
||||
# validate the sed command worked as expected
|
||||
grep -sqo "udev_sync = 0" /etc/lvm/lvm.conf && \
|
||||
grep -sqo "udev_rules = 0" /etc/lvm/lvm.conf && \
|
||||
grep -sqo "obtain_device_list_from_udev = 0" /etc/lvm/lvm.conf
|
||||
|
||||
# CLEAN UP!
|
||||
RUN set -ex && \
|
||||
dnf clean all && \
|
||||
rm -rf /var/cache/dnf/* && \
|
||||
rm -rf /var/lib/dnf/* && \
|
||||
rm -f /var/lib/rpm/__db* && \
|
||||
# remove unnecessary files with big impact
|
||||
rm -rf /etc/selinux /usr/share/{doc,man,selinux} && \
|
||||
# don't keep compiled python binaries
|
||||
find / -xdev \( -name "*.pyc" -o -name "*.pyo" \) -delete && \
|
||||
rm -f /etc/yum.repos.d/{ceph,ganesha,tcmu-runner,ceph-iscsi}.repo
|
||||
|
||||
# Verify that the packages installed haven't been accidentally cleaned, then
|
||||
# clean the package list and re-clean unnecessary RPM database files
|
||||
RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.txt
|
||||
|
||||
#
|
||||
# Set some envs in the container for quickly inspecting details about the build at runtime
|
||||
ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \
|
||||
CEPH_REF="${CEPH_REF}" \
|
||||
CEPH_VERSION="${CEPH_REF}" \
|
||||
CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \
|
||||
FROM_IMAGE="${FROM_IMAGE}"
|
||||
|
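A minimal local build sketch for the Containerfile above (the tag name and
branch are illustrative; the prerelease secret file must exist because of the
secret mount in the Ceph-repo step, even though a CI-style build never reads
its credentials, which is how container/build.sh drives it):

    # run from ceph/container/; an empty secret is fine for a CI-style build
    touch prerelease.secret.txt
    podman build -f Containerfile \
        --build-arg CEPH_REF=main \
        --build-arg OSD_FLAVOR=default \
        --build-arg CI_CONTAINER=true \
        --secret=id=prerelease_creds,src=./prerelease.secret.txt \
        -t localhost/ceph-test .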
198	ceph/container/build.sh (Executable file)
@ -0,0 +1,198 @@
|
||||
#!/bin/bash -ex
|
||||
# vim: ts=4 sw=4 expandtab
|
||||
|
||||
# repo auth with write perms must be present (this script does not log into
|
||||
# repos named by CONTAINER_REPO_*).
|
||||
# If NO_PUSH is set, no login is necessary
|
||||
|
||||
|
||||
CFILE=${1:-Containerfile}
|
||||
shift || true
|
||||
|
||||
usage() {
|
||||
cat << EOF
|
||||
$0 [containerfile] (defaults to 'Containerfile')
|
||||
For a CI build (from ceph-ci.git, built and pushed to shaman):
|
||||
CI_CONTAINER: must be 'true'
|
||||
FLAVOR (OSD flavor, default or crimson)
|
||||
BRANCH (of Ceph. <remote>/<ref>)
|
||||
CEPH_SHA1 (of Ceph)
|
||||
ARCH (of build host, and resulting container)
|
||||
CONTAINER_REPO_HOSTNAME (quay.ceph.io, for CI, for instance)
|
||||
CONTAINER_REPO_ORGANIZATION (ceph-ci, for CI, for instance)
|
||||
CONTAINER_REPO (ceph, for CI, or prerelease-<arch> for release, for instance)
|
||||
CONTAINER_REPO_USERNAME
|
||||
CONTAINER_REPO_PASSWORD
|
||||
PRERELEASE_USERNAME for download.ceph.com:/prerelease/ceph
|
||||
PRERELEASE_PASSWORD
|
||||
|
||||
For a release build: (from ceph.git, built and pushed to download.ceph.com)
|
||||
CI_CONTAINER: must be 'false'
|
||||
and you must also add
|
||||
VERSION (for instance, 19.1.0) for tagging the image
|
||||
|
||||
You can avoid the push step (for testing) by setting NO_PUSH to anything
|
||||
EOF
|
||||
}
|
||||
|
||||
CI_CONTAINER=${CI_CONTAINER:-false}
|
||||
FLAVOR=${FLAVOR:-default}
|
||||
# default: current checked-out branch
|
||||
BRANCH=${BRANCH:-$(git rev-parse --abbrev-ref HEAD)}
|
||||
# default: current checked-out branch
|
||||
CEPH_SHA1=${CEPH_SHA1:-$(git rev-parse HEAD)}
|
||||
# default: build host arch
|
||||
ARCH=${ARCH:-$(arch)}
|
||||
if [[ "${ARCH}" == "aarch64" ]] ; then ARCH=arm64; fi
|
||||
REPO_ARCH=amd64
|
||||
if [[ "${ARCH}" = arm64 ]] ; then
|
||||
REPO_ARCH=arm64
|
||||
fi
|
||||
|
||||
if [[ ${CI_CONTAINER} == "true" ]] ; then
|
||||
CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io}
|
||||
CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph-ci}
|
||||
CONTAINER_REPO=${CONTAINER_REPO:-ceph}
|
||||
else
|
||||
CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io}
|
||||
CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph}
|
||||
CONTAINER_REPO=${CONTAINER_REPO:-prerelease-${REPO_ARCH}}
|
||||
# default: most-recent annotated tag
|
||||
VERSION=${VERSION:-$(git describe --abbrev=0)}
|
||||
fi
|
||||
|
||||
# check for existence of all required variables
|
||||
: "${CI_CONTAINER:?}"
|
||||
: "${FLAVOR:?}"
|
||||
: "${BRANCH:?}"
|
||||
: "${CEPH_SHA1:?}"
|
||||
: "${ARCH:?}"
|
||||
if [[ ${NO_PUSH} != "true" ]] ; then
|
||||
: "${CONTAINER_REPO_HOSTNAME:?}"
|
||||
: "${CONTAINER_REPO_ORGANIZATION:?}"
|
||||
: "${CONTAINER_REPO_USERNAME:?}"
|
||||
: "${CONTAINER_REPO_PASSWORD:?}"
|
||||
fi
|
||||
if [[ ${CI_CONTAINER} != "true" ]] ; then : "${VERSION:?}"; fi
|
||||
|
||||
# check for valid repo auth (if pushing)
|
||||
repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}/${CONTAINER_REPO}
|
||||
MINIMAL_IMAGE=${repopath}:minimal-test
|
||||
if [[ ${NO_PUSH} != "true" ]] ; then
|
||||
podman rmi ${MINIMAL_IMAGE} || true
|
||||
echo "FROM scratch" | podman build -f - -t ${MINIMAL_IMAGE}
|
||||
if ! podman push ${MINIMAL_IMAGE} ; then
|
||||
echo "Not authenticated to ${repopath}; need docker/podman login?"
|
||||
exit 1
|
||||
fi
|
||||
podman rmi ${MINIMAL_IMAGE} | true
|
||||
fi
|
||||
|
||||
if [[ -z "${CEPH_GIT_REPO}" ]] ; then
|
||||
if [[ ${CI_CONTAINER} == "true" ]]; then
|
||||
CEPH_GIT_REPO=https://github.com/ceph/ceph-ci.git
|
||||
else
|
||||
CEPH_GIT_REPO=https://github.com/ceph/ceph.git
|
||||
fi
|
||||
fi
|
||||
|
||||
# BRANCH will be, say, origin/main. remove <remote>/
|
||||
BRANCH=${BRANCH##*/}
|
||||
|
||||
# podman build only supports secret files.
|
||||
# This must be removed after podman build
|
||||
touch prerelease.secret.txt
|
||||
chmod 600 prerelease.secret.txt
|
||||
echo -e "\
|
||||
PRERELEASE_USERNAME=${PRERELEASE_USERNAME}\n
|
||||
PRERELEASE_PASSWORD=${PRERELEASE_PASSWORD}\n " > prerelease.secret.txt
|
||||
|
||||
podman build --pull=newer --squash -f $CFILE -t build.sh.output \
|
||||
--build-arg FROM_IMAGE=${FROM_IMAGE:-quay.io/centos/centos:stream9} \
|
||||
--build-arg CEPH_SHA1=${CEPH_SHA1} \
|
||||
--build-arg CEPH_GIT_REPO=${CEPH_GIT_REPO} \
|
||||
--build-arg CEPH_REF=${BRANCH:-main} \
|
||||
--build-arg OSD_FLAVOR=${FLAVOR:-default} \
|
||||
--build-arg CI_CONTAINER=${CI_CONTAINER:-default} \
|
||||
--secret=id=prerelease_creds,src=./prerelease.secret.txt \
|
||||
2>&1
|
||||
|
||||
rm ./prerelease.secret.txt
|
||||
|
||||
image_id=$(podman image ls localhost/build.sh.output --format '{{.ID}}')
|
||||
|
||||
# grab useful image attributes for building the tag
|
||||
#
|
||||
# the variable settings are prefixed with "export CEPH_CONTAINER_" so that
|
||||
# an eval or . can be used to put them into the environment
|
||||
#
|
||||
# PATH is removed from the output as it would cause problems for this
|
||||
# parent script and its children
|
||||
#
|
||||
# notes:
|
||||
#
|
||||
# we want .Architecture and everything in .Config.Env
|
||||
#
|
||||
# printf will not accept "\n" (is this a podman bug?)
|
||||
# so construct vars with two calls to podman inspect, joined by a newline,
|
||||
# so that vars will get the output of the first command, newline, output
|
||||
# of the second command
|
||||
#
|
||||
vars="$(podman inspect -f '{{printf "export CEPH_CONTAINER_ARCH=%v" .Architecture}}' ${image_id})
|
||||
$(podman inspect -f '{{range $index, $value := .Config.Env}}export CEPH_CONTAINER_{{$value}}{{println}}{{end}}' ${image_id})"
|
||||
vars="$(echo "${vars}" | grep -v PATH)"
|
||||
eval ${vars}
|
||||
|
||||
# remove everything up to and including the last slash
|
||||
fromtag=${CEPH_CONTAINER_FROM_IMAGE##*/}
|
||||
# translate : to -
|
||||
fromtag=${fromtag/:/-}
|
||||
builddate=$(date +%Y%m%d)
|
||||
local_tag=${fromtag}-${CEPH_CONTAINER_CEPH_REF}-${CEPH_CONTAINER_ARCH}-${builddate}
|
||||
|
||||
repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION}/${CONTAINER_REPO}
|
||||
|
||||
if [[ ${CI_CONTAINER} == "true" ]] ; then
|
||||
# ceph-ci conventions for remote tags:
|
||||
# requires ARCH, BRANCH, CEPH_SHA1, FLAVOR
|
||||
full_repo_tag=${repopath}:${BRANCH}-${fromtag}-${ARCH}-devel
|
||||
branch_repo_tag=${repopath}:${BRANCH}
|
||||
sha1_repo_tag=${repopath}:${CEPH_SHA1}
|
||||
|
||||
if [[ "${ARCH}" == "arm64" ]] ; then
|
||||
branch_repo_tag=${branch_repo_tag}-arm64
|
||||
sha1_repo_tag=${sha1_repo_tag}-arm64
|
||||
fi
|
||||
|
||||
podman tag ${image_id} ${full_repo_tag}
|
||||
podman tag ${image_id} ${branch_repo_tag}
|
||||
podman tag ${image_id} ${sha1_repo_tag}
|
||||
|
||||
if [[ ${FLAVOR} == "crimson" && ${ARCH} == "x86_64" ]] ; then
|
||||
sha1_flavor_repo_tag=${sha1_repo_tag}-${FLAVOR}
|
||||
podman tag ${image_id} ${sha1_flavor_repo_tag}
|
||||
if [[ -z "${NO_PUSH}" ]] ; then
|
||||
podman push ${sha1_flavor_repo_tag}
|
||||
fi
|
||||
exit
|
||||
fi
|
||||
|
||||
if [[ -z "${NO_PUSH}" ]] ; then
|
||||
podman push ${full_repo_tag}
|
||||
podman push ${branch_repo_tag}
|
||||
podman push ${sha1_repo_tag}
|
||||
fi
|
||||
else
|
||||
#
|
||||
# non-CI build. Tags are like v19.1.0-20240701
|
||||
# push to quay.ceph.io/ceph/prerelease-$REPO_ARCH
|
||||
#
|
||||
version_tag=${repopath}:v${VERSION}-${builddate}
|
||||
|
||||
podman tag ${image_id} ${version_tag}
|
||||
if [[ -z "${NO_PUSH}" ]] ; then
|
||||
podman push ${version_tag}
|
||||
fi
|
||||
fi
|
||||
|
||||
|
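A hedged example of driving build.sh for a local test build: NO_PUSH skips the
repo-auth check and all podman pushes, and the remaining variables mirror the
usage() text above (the values shown are placeholders):

    # run from ceph/container/ where the Containerfile lives
    NO_PUSH=true CI_CONTAINER=true FLAVOR=default \
        BRANCH=origin/main CEPH_SHA1=$(git rev-parse HEAD) ARCH=$(arch) \
        ./build.sh Containerfile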
252	ceph/container/make-manifest-list.py (Executable file)
@ -0,0 +1,252 @@
|
||||
#!/usr/bin/python3
|
||||
#
|
||||
# in default mode:
|
||||
# make a combined "manifest-list" container out of two arch-specific containers
|
||||
# searches for latest tags on HOST/{AMD,ARM}64_REPO, makes sure they refer
|
||||
# to the same Ceph SHA1, and creates a manifest-list ("fat") image on
|
||||
# MANIFEST_HOST/MANIFEST_REPO with the 'standard' set of tags:
|
||||
# v<major>
|
||||
# v<major>.<minor>
|
||||
# v<major>.<minor>.<micro>
|
||||
# v<major>.<minor>.<micro>-<YYYYMMDD>
|
||||
#
|
||||
# uses scratch local manifest LOCALMANIFEST, defined here; will be destroyed if present
|
||||
#
|
||||
# in promote mode (by adding the --promote argument):
|
||||
# instead of building the manifest-list container, copy it
|
||||
# (and all of its tags) from the prerelease repo to the release repo
|
||||
#
|
||||
# Assumes valid logins to the necessary hosts/repos with permission to write images
|
||||
#
|
||||
# Environment variables to set:
|
||||
# ARCH_SPECIFIC_HOST (default 'quay.ceph.io'): host of prerelease repos
|
||||
# AMD64_REPO (default 'ceph/prerelease-amd64') prerelease amd64 repo
|
||||
# ARM64_REPO (default 'ceph/prerelease-arm64') prerelease arm64 repo
|
||||
# MANIFEST_HOST (default 'quay.ceph.io') prerelease manifest-list host
|
||||
# MANIFEST_REPO (default 'ceph/prerelease') prerelease manifest-list repo
|
||||
# RELEASE_MANIFEST_HOST (default 'quay.io') release host
|
||||
# RELEASE_MANIFEST_REPO (default 'ceph/ceph') release repo
|
||||
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
import functools
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Manifest image. Will be destroyed if already present.
|
||||
LOCALMANIFEST = 'localhost/m'
|
||||
|
||||
|
||||
def dump_vars(names, vardict):
|
||||
for name in names:
|
||||
print(f'{name}: {vardict[name]}', file=sys.stderr)
|
||||
|
||||
|
||||
def run_command(args):
|
||||
print(f'running {args}', file=sys.stderr)
|
||||
if not isinstance(args, list):
|
||||
args = args.split()
|
||||
try:
|
||||
result = subprocess.run(
|
||||
args,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True)
|
||||
return True, result.stdout, result.stderr
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
return False, e.output, e.stderr
|
||||
|
||||
|
||||
def get_command_output(args):
|
||||
success, stdout, stderr = run_command(args)
|
||||
return (stdout if success else None)
|
||||
|
||||
|
||||
def run_command_show_failure(args):
|
||||
success, stdout, stderr = run_command(args)
|
||||
if not success:
|
||||
print(f'{args} failed:', file=sys.stderr)
|
||||
print(f'stdout:\n{stdout}')
|
||||
print(f'stderr:\n{stderr}')
|
||||
return success
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def get_tags(path):
|
||||
cmdout = get_command_output(f'skopeo list-tags docker://{path}')
|
||||
return json.loads(cmdout)['Tags']
|
||||
|
||||
|
||||
def get_latest_tag(path):
|
||||
try:
|
||||
latest_tag = get_tags(path)[-1]
|
||||
except IndexError:
|
||||
return None
|
||||
return latest_tag
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def get_image_inspect(path):
|
||||
info = json.loads(
|
||||
get_command_output(f'skopeo inspect docker://{path}')
|
||||
)
|
||||
return info
|
||||
|
||||
|
||||
def get_sha1(info):
|
||||
labels = info.get('Labels', None)
|
||||
if not labels:
|
||||
return None
|
||||
return labels.get('CEPH_SHA1', None)
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def get_all_matching_digest_tags(path, tag):
|
||||
|
||||
matching_tags = list()
|
||||
digest = get_image_inspect(f'{path}:{tag}')['Digest']
|
||||
|
||||
for t in get_tags(path):
|
||||
this_digest = get_image_inspect(f'{path}:{t}')['Digest']
|
||||
if this_digest == digest:
|
||||
matching_tags.append(t)
|
||||
|
||||
return matching_tags
|
||||
|
||||
|
||||
def parse_args():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument('-n', '--dry-run', action='store_true', help='do all local manipulations but do not push final containers to MANIFEST_HOST, or in --promote, calculate but do not copy images to release host')
|
||||
ap.add_argument('-P', '--promote', action='store_true', help='promote newest prerelease manifest container to released (move from MANIFEST_HOST to RELEASE_MANIFEST_HOST')
|
||||
args = ap.parse_args()
|
||||
return args
|
||||
|
||||
def build_prerelease(sysargs):
|
||||
global args
|
||||
|
||||
arch_specific_host = os.environ.get('ARCH_SPECIFIC_HOST', 'quay.ceph.io')
|
||||
amd64_repo = os.environ.get('AMD64_REPO', 'ceph/prerelease-amd64')
|
||||
arm64_repo = os.environ.get('ARM64_REPO', 'ceph/prerelease-arm64')
|
||||
manifest_host = os.environ.get('MANIFEST_HOST', 'quay.ceph.io')
|
||||
manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/prerelease')
|
||||
|
||||
dump_vars(
|
||||
('arch_specific_host',
|
||||
'amd64_repo',
|
||||
'arm64_repo',
|
||||
'manifest_host',
|
||||
'manifest_repo',
|
||||
),
|
||||
locals())
|
||||
repopaths = (
|
||||
f'{arch_specific_host}/{amd64_repo}',
|
||||
f'{arch_specific_host}/{arm64_repo}',
|
||||
)
|
||||
tags = [get_latest_tag(p) for p in repopaths]
|
||||
print(f'latest tags: amd64:{tags[0]} arm64:{tags[1]}')
|
||||
|
||||
# check that version of latest tag matches
|
||||
version_re = \
|
||||
r'v(?P<major>\d+)\.(?P<minor>\d+)\.(?P<micro>\d+)-(?P<date>\d+)'
|
||||
versions = list()
|
||||
for tag in tags:
|
||||
mo = re.match(version_re, tag)
|
||||
ver = f'{mo.group("major")}.{mo.group("minor")}.{mo.group("micro")}'
|
||||
versions.append(ver)
|
||||
if versions[0] != versions[1]:
|
||||
print(
|
||||
f'version mismatch: amd64:{versions[0]} arm64:{versions[1]}',
|
||||
file=sys.stderr,
|
||||
)
|
||||
return(1)
|
||||
|
||||
major, minor, micro = mo.group(1), mo.group(2), mo.group(3)
|
||||
print(f'Ceph version: {major}.{minor}.{micro}', file=sys.stderr)
|
||||
|
||||
# check that ceph sha1 of two arch images matches
|
||||
paths_with_tags = [f'{p}:{t}' for (p, t) in zip(repopaths, tags)]
|
||||
info = [get_image_inspect(p) for p in paths_with_tags]
|
||||
sha1s = [get_sha1(i) for i in info]
|
||||
if sha1s[0] != sha1s[1]:
|
||||
print(
|
||||
f'sha1 mismatch: amd64: {sha1s[0]} arm64: {sha1s[1]}',
|
||||
file=sys.stderr,
|
||||
)
|
||||
builddate = [i['Created'] for i in info]
|
||||
print(
|
||||
f'Build dates: amd64: {builddate[0]} arm64: {builddate[1]}',
|
||||
file=sys.stderr,
|
||||
)
|
||||
return(1)
|
||||
|
||||
# create manifest list image with the standard list of tags
|
||||
# ignore failure on manifest rm
|
||||
run_command(f'podman manifest rm {LOCALMANIFEST}')
|
||||
run_command_show_failure(f'podman manifest create {LOCALMANIFEST}')
|
||||
for p in paths_with_tags:
|
||||
run_command_show_failure(f'podman manifest add m {p}')
|
||||
base = f'{manifest_host}/{manifest_repo}'
|
||||
for t in (
|
||||
f'v{major}',
|
||||
f'v{major}.{minor}',
|
||||
f'v{major}.{minor}.{micro}',
|
||||
f'v{major}.{minor}.{micro}-{datetime.today().strftime("%Y%m%d")}',
|
||||
):
|
||||
if sysargs.dry_run:
|
||||
print(f'skipping podman manifest push {LOCALMANIFEST} {base}:{t}')
|
||||
else:
|
||||
run_command_show_failure(
|
||||
f'podman manifest push {LOCALMANIFEST} {base}:{t}')
|
||||
|
||||
def promote(sysargs):
|
||||
manifest_host = os.environ.get('MANIFEST_HOST', 'quay.ceph.io')
|
||||
manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/prerelease')
|
||||
release_manifest_host = os.environ.get('RELEASE_MANIFEST_HOST', 'quay.io')
|
||||
release_manifest_repo = os.environ.get('RELEASE_MANIFEST_REPO', 'ceph/ceph')
|
||||
dump_vars(
|
||||
('manifest_host',
|
||||
'manifest_repo',
|
||||
'release_manifest_host',
|
||||
'release_manifest_repo',
|
||||
),
|
||||
locals())
|
||||
|
||||
manifest_path = f'{manifest_host}/{manifest_repo}'
|
||||
release_path = f'{release_manifest_host}/{release_manifest_repo}'
|
||||
latest_tag = get_latest_tag(manifest_path)
|
||||
all_tags = get_all_matching_digest_tags(manifest_path, latest_tag)
|
||||
|
||||
copypaths = list()
|
||||
for t in all_tags:
|
||||
from_path = f'{manifest_path}:{t}'
|
||||
to_path = f'{release_path}:{t}'
|
||||
copypaths.append((from_path, to_path))
|
||||
|
||||
if sysargs.dry_run:
|
||||
for f, t in copypaths:
|
||||
print(f'dry-run: Would copy: {f} -> {t}')
|
||||
return(0)
|
||||
|
||||
for f, t in copypaths:
|
||||
print(f'Will copy: {f} -> {t}')
|
||||
|
||||
for f, t in copypaths:
|
||||
run_command_show_failure(f'skopeo copy --multi-arch=all docker://{f} docker://{t}')
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
if args.promote:
|
||||
promote(args)
|
||||
else:
|
||||
build_prerelease(args)
|
||||
|
||||
|
||||
if (__name__ == '__main__'):
|
||||
sys.exit(main())
|
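Example invocations, based on the argparse options and environment variables
defined above (the repo names shown are the script's own defaults, and valid
registry logins are assumed as noted in the script header):

    # assemble the manifest list from the per-arch prerelease repos, but do not push
    ./make-manifest-list.py --dry-run

    # promote the newest prerelease manifest list to the release repo
    RELEASE_MANIFEST_HOST=quay.io RELEASE_MANIFEST_REPO=ceph/ceph \
        ./make-manifest-list.py --promote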
@@ -1215,7 +1215,7 @@ exemplary implementations.
Summary
-------

Ceph Storage Clusters are dynamic--like a living organism. Whereas, many storage
Ceph Storage Clusters are dynamic--like a living organism. Although many storage
appliances do not fully utilize the CPU and RAM of a typical commodity server,
Ceph does. From heartbeats, to peering, to rebalancing the cluster or
recovering from faults, Ceph offloads work from clients (and from a centralized
@@ -9,3 +9,48 @@ Logical volume name format is vg/lv. Fails if OSD has already got attached DB.
Attach vgname/lvname as a DB volume to OSD 1::

    ceph-volume lvm new-db --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D --target vgname/new_db

Reversing BlueFS Spillover to Slow Devices
------------------------------------------

Under certain circumstances, OSD RocksDB databases spill onto slow storage and
the Ceph cluster returns specifics regarding BlueFS spillover warnings. ``ceph
health detail`` returns these spillover warnings. Here is an example of a
spillover warning::

    osd.76 spilled over 128 KiB metadata from 'db' device (56 GiB used of 60 GiB) to slow device

To move this DB metadata from the slower device to the faster device, take the
following steps:

#. Expand the database's logical volume (LV):

   .. prompt:: bash #

      lvextend -l ${size} ${lv}/${db} ${ssd_dev}

#. Stop the OSD:

   .. prompt:: bash #

      cephadm unit --fsid $cid --name osd.${osd} stop

#. Run the ``bluefs-bdev-expand`` command:

   .. prompt:: bash #

      cephadm shell --fsid $cid --name osd.${osd} -- ceph-bluestore-tool bluefs-bdev-expand --path /var/lib/ceph/osd/ceph-${osd}

#. Run the ``bluefs-bdev-migrate`` command:

   .. prompt:: bash #

      cephadm shell --fsid $cid --name osd.${osd} -- ceph-bluestore-tool bluefs-bdev-migrate --path /var/lib/ceph/osd/ceph-${osd} --devs-source /var/lib/ceph/osd/ceph-${osd}/block --dev-target /var/lib/ceph/osd/ceph-${osd}/block.db

#. Restart the OSD:

   .. prompt:: bash #

      cephadm unit --fsid $cid --name osd.${osd} start

.. note:: *The above procedure was developed by Chris Dunlop on the [ceph-users] mailing list, and can be seen in its original context here:* `[ceph-users] Re: Fixing BlueFS spillover (pacific 16.2.14) <https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/message/POPUFSZGXR3P2RPYPJ4WJ4HGHZ3QESF6/>`_
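A hedged sketch of the shell variables that the steps above leave undefined
(every value here is illustrative; substitute your own cluster fsid, OSD id,
volume group, logical volume, and device):

.. prompt:: bash #

    cid=$(ceph fsid)              # cluster fsid used by the cephadm commands
    osd=76                        # the OSD reporting the spillover
    lv=ceph-db-vg                 # volume group holding the DB LV
    db=db-lv-osd76                # the DB logical volume of that OSD
    ssd_dev=/dev/nvme0n1          # fast device backing the VG
    size=+100%FREE                # extent argument passed to lvextend -l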
@@ -61,6 +61,12 @@ For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` f

    ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv

Starting with Ceph Squid, you can opt for TPM2 token enrollment for the created LUKS2 devices with the ``--with-tpm`` flag:

.. prompt:: bash #

    ceph-volume lvm prepare --bluestore --dmcrypt --with-tpm --data vg/lv

If a ``block.db`` device or a ``block.wal`` device is needed, it can be
specified with ``--block.db`` or ``--block.wal``. These can be physical
devices, partitions, or logical volumes. ``block.db`` and ``block.wal`` are
@@ -1,8 +1,8 @@
.. _cephadm_deploying_new_cluster:

============================
Deploying a new Ceph cluster
============================
==========================================
Using cephadm to Deploy a New Ceph Cluster
==========================================

Cephadm creates a new Ceph cluster by bootstrapping a single
host, expanding the cluster to encompass any additional hosts, and

@@ -24,6 +24,10 @@ Requirements
Any modern Linux distribution should be sufficient. Dependencies
are installed automatically by the bootstrap process below.

See `Docker Live Restore <https://docs.docker.com/engine/daemon/live-restore/>`_
for an optional feature that allows restarting Docker Engine without restarting
all running containers.

See the section :ref:`Compatibility With Podman
Versions<cephadm-compatibility-with-podman>` for a table of Ceph versions that
are compatible with Podman. Not every version of Podman is compatible with
@@ -375,7 +375,7 @@ One or more hosts have failed the basic cephadm host check, which verifies
that (1) the host is reachable and cephadm can be executed there, and (2)
that the host satisfies basic prerequisites, like a working container
runtime (podman or docker) and working time synchronization.
If this test fails, cephadm will no be able to manage services on that host.
If this test fails, cephadm will not be able to manage services on that host.

You can manually run this check by running the following command:

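The command itself falls outside the quoted diff context; for reference, the
check can be run manually through the cephadm orchestrator like this (the
hostname is a placeholder):

.. prompt:: bash #

    ceph cephadm check-host <hostname>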
@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster
|
||||
|
||||
# For each host:
|
||||
cephadm rm-cluster --force --zap-osds --fsid <fsid>
|
||||
|
||||
|
||||
Replacing a device
|
||||
==================
|
||||
|
||||
The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD.
|
||||
Previously, this process required manual intervention at various stages.
|
||||
With this new command, all necessary operations are performed automatically, streamlining the replacement process
|
||||
and improving the overall user experience.
|
||||
|
||||
.. note:: This only supports LVM-based deployed OSD(s)
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch device replace <host> <device-path>
|
||||
|
||||
In the case the device being replaced is shared by multiple OSDs (eg: DB/WAL device shared by multiple OSDs), the orchestrator will warn you.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
[ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd
|
||||
|
||||
Error EINVAL: /dev/vdd is a shared device.
|
||||
Replacing /dev/vdd implies destroying OSD(s): ['0', '1'].
|
||||
Please, *be very careful*, this can be a very dangerous operation.
|
||||
If you know what you are doing, pass --yes-i-really-mean-it
|
||||
|
||||
If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
[ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it
|
||||
Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced.
|
||||
|
||||
``cephadm`` will make ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSD as ``destroyed`` so the
|
||||
different OSD(s) ID(s) will be preserved:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
[ceph: root@ceph-1 /]# ceph osd tree
|
||||
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
|
||||
-1 0.97659 root default
|
||||
-3 0.97659 host devel-1
|
||||
0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000
|
||||
1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000
|
||||
2 hdd 0.19530 osd.2 up 1.00000 1.00000
|
||||
3 hdd 0.19530 osd.3 up 1.00000 1.00000
|
||||
|
||||
The device being replaced is finally seen as ``being replaced`` preventing ``cephadm`` from redeploying the OSDs too fast:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
[ceph: root@ceph-1 /]# ceph orch device ls
|
||||
HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS
|
||||
osd-1 /dev/vdb hdd 200G Yes 13s ago
|
||||
osd-1 /dev/vdc hdd 200G Yes 13s ago
|
||||
osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced
|
||||
osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
|
||||
osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected
|
||||
|
||||
If for any reason you need to clear the 'device replace header' on a device, use ``ceph orch device replace <host> <device> --clear``:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
[ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear
|
||||
Replacement header cleared on /dev/vdk
|
||||
[ceph: root@devel-1 /]#
|
||||
|
||||
After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``).
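
If you want to confirm that the redeployment has happened, one quick way (sketched here with the example host ``osd-1`` used above) is to list the OSD service and its daemons:

.. prompt:: bash #

   ceph orch ls osd
   ceph orch ps --daemon-type osd

Once the replacement device has been prepared, the previously destroyed OSD IDs should reappear as running daemons.
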
|
||||
|
@ -355,6 +355,8 @@ Or in YAML:
|
||||
|
||||
* See :ref:`orchestrator-host-labels`
|
||||
|
||||
.. _cephadm-services-placement-by-pattern-matching:
|
||||
|
||||
Placement by pattern matching
|
||||
-----------------------------
|
||||
|
||||
|
@ -345,7 +345,7 @@ definition and management of the embedded Prometheus service. The endpoint liste
|
||||
``https://<mgr-ip>:8765/sd/`` (the port is
|
||||
configurable through the variable ``service_discovery_port``) and returns scrape target
|
||||
information in `http_sd_config format
|
||||
<https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config/>`_
|
||||
<https://prometheus.io/docs/prometheus/latest/configuration/configuration/#http_sd_config>`_
|
||||
|
||||
Users with an external monitoring stack can use the `ceph-mgr` service discovery endpoint
to get the scrape configuration. The root certificate of the server can be obtained by the
|
||||
|
@ -84,6 +84,39 @@ information about interacting with these LEDs, refer to :ref:`devices`.
|
||||
The current release of `libstoragemgmt`_ (1.8.8) supports SCSI, SAS, and SATA based
|
||||
local disks only. There is no official support for NVMe (PCIe) devices.
|
||||
|
||||
Retrieve Exact Size of Block Devices
|
||||
====================================
|
||||
|
||||
Run a command of the following form to discover the exact size of a block
|
||||
device. The value returned here is used by the orchestrator when comparing high
|
||||
and low values:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
cephadm shell ceph-volume inventory </dev/sda> --format json | jq .sys_api.human_readable_size
|
||||
|
||||
The exact size in GB is the size reported in TB, multiplied by 1000.
|
||||
|
||||
Example
|
||||
-------
|
||||
The following provides a specific example of this command based upon the
|
||||
general form of the command above:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
cephadm shell ceph-volume inventory /dev/sdc --format json | jq .sys_api.human_readable_size
|
||||
|
||||
::
|
||||
|
||||
"3.64 TB"
|
||||
|
||||
This means that the exact device size is 3.64 * 1000, or 3640GB.
|
||||
|
||||
This procedure was developed by Frédéric Nass. See `this thread on the
|
||||
[ceph-users] mailing list
|
||||
<https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/message/5BAAYFCQAZZDRSNCUPCVBNEPGJDARRZA/>`_
|
||||
for discussion of this matter.
|
||||
|
||||
.. _cephadm-deploy-osds:
|
||||
|
||||
Deploy OSDs
|
||||
@ -445,22 +478,27 @@ for that OSD and also set a specific memory target. For example,
|
||||
Advanced OSD Service Specifications
|
||||
===================================
|
||||
|
||||
:ref:`orchestrator-cli-service-spec`\s of type ``osd`` are a way to describe a
|
||||
cluster layout, using the properties of disks. Service specifications give the
|
||||
user an abstract way to tell Ceph which disks should turn into OSDs with which
|
||||
configurations, without knowing the specifics of device names and paths.
|
||||
:ref:`orchestrator-cli-service-spec`\s of type ``osd`` provide a way to use the
|
||||
properties of disks to describe a Ceph cluster's layout. Service specifications
|
||||
are an abstraction used to tell Ceph which disks it should transform into OSDs
|
||||
and which configurations to apply to those OSDs.
|
||||
:ref:`orchestrator-cli-service-spec`\s make it possible to target these disks
|
||||
for transformation into OSDs even when the Ceph cluster operator does not know
|
||||
the specific device names and paths associated with those disks.
|
||||
|
||||
Service specifications make it possible to define a yaml or json file that can
|
||||
be used to reduce the amount of manual work involved in creating OSDs.
|
||||
:ref:`orchestrator-cli-service-spec`\s make it possible to define a ``.yaml``
|
||||
or ``.json`` file that can be used to reduce the amount of manual work involved
|
||||
in creating OSDs.
|
||||
|
||||
.. note::
|
||||
It is recommended that advanced OSD specs include the ``service_id`` field
|
||||
set. The plain ``osd`` service with no service id is where OSDs created
|
||||
using ``ceph orch daemon add`` or ``ceph orch apply osd --all-available-devices``
|
||||
are placed. Not including a ``service_id`` in your OSD spec would mix
|
||||
the OSDs from your spec with those OSDs and potentially overwrite services
|
||||
specs created by cephadm to track them. Newer versions of cephadm will even
|
||||
block creation of advanced OSD specs without the service_id present
|
||||
We recommend that advanced OSD specs include the ``service_id`` field set.
|
||||
OSDs created using ``ceph orch daemon add`` or ``ceph orch apply osd
|
||||
--all-available-devices`` are placed in the plain ``osd`` service. Failing
|
||||
to include a ``service_id`` in your OSD spec causes the Ceph cluster to mix
|
||||
the OSDs from your spec with those OSDs, which can potentially result in the
|
||||
overwriting of service specs created by ``cephadm`` to track them. Newer
|
||||
versions of ``cephadm`` will even block creation of advanced OSD specs that
|
||||
do not include the ``service_id``.
|
||||
|
||||
For example, instead of running the following command:
|
||||
|
||||
@ -468,8 +506,8 @@ For example, instead of running the following command:
|
||||
|
||||
ceph orch daemon add osd *<host>*:*<path-to-device>*
|
||||
|
||||
for each device and each host, we can define a yaml or json file that allows us
|
||||
to describe the layout. Here's the most basic example.
|
||||
for each device and each host, we can define a ``.yaml`` or ``.json`` file that
|
||||
allows us to describe the layout. Here is the most basic example:
|
||||
|
||||
Create a file called (for example) ``osd_spec.yml``:
|
||||
|
||||
@ -487,17 +525,18 @@ This means :
|
||||
|
||||
#. Turn any available device (ceph-volume decides what 'available' is) into an
|
||||
OSD on all hosts that match the glob pattern '*'. (The glob pattern matches
|
||||
against the registered hosts from `host ls`) A more detailed section on
|
||||
host_pattern is available below.
|
||||
against the registered hosts from `ceph orch host ls`) See
|
||||
:ref:`cephadm-services-placement-by-pattern-matching` for more on using
|
||||
``host_pattern``-matching to turn devices into OSDs.
|
||||
|
||||
#. Then pass it to `osd create` like this:
|
||||
#. Pass ``osd_spec.yml`` to ``osd create`` by using the following command:
|
||||
|
||||
.. prompt:: bash [monitor.1]#
|
||||
|
||||
ceph orch apply -i /path/to/osd_spec.yml
|
||||
|
||||
This instruction will be issued to all the matching hosts, and will deploy
|
||||
these OSDs.
|
||||
This instruction is issued to all the matching hosts, and will deploy these
|
||||
OSDs.
|
||||
|
||||
Setups more complex than the one specified by the ``all`` filter are
|
||||
possible. See :ref:`osd_filters` for details.
|
||||
@ -666,6 +705,21 @@ This example would deploy all OSDs with encryption enabled.
|
||||
all: true
|
||||
encrypted: true
|
||||
|
||||
Ceph Squid and later releases support TPM2 token enrollment for LUKS2 devices.
You can add the ``tpm2`` option to your OSD spec:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: osd
|
||||
service_id: example_osd_spec_with_tpm2
|
||||
placement:
|
||||
host_pattern: '*'
|
||||
spec:
|
||||
data_devices:
|
||||
all: true
|
||||
encrypted: true
|
||||
tpm2: true
|
||||
|
||||
See a full list in the DriveGroupSpecs
|
||||
|
||||
.. py:currentmodule:: ceph.deployment.drive_group
|
||||
|
@ -26,7 +26,7 @@ Samba Containers with the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ...
|
||||
ceph orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ...
|
||||
|
||||
There are a number of additional parameters that the command accepts. See
|
||||
the Service Specification for a description of these options.
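
As a purely illustrative sketch (the cluster id ``mycluster``, the ``rados://`` config URI, and the ``label:smb`` placement are placeholders, not values taken from this document), an invocation might look like this:

.. prompt:: bash #

   ceph orch apply smb mycluster rados://.smb/mycluster/config.json --placement=label:smb
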
|
||||
|
@ -131,7 +131,21 @@ doesn't use ``cephadm shell``) to a version compatible with the new version.
|
||||
Potential problems
|
||||
==================
|
||||
|
||||
There are a few health alerts that can arise during the upgrade process.
|
||||
|
||||
Error: ENOENT: Module not found
|
||||
-------------------------------
|
||||
|
||||
The message ``Error ENOENT: Module not found`` appears in response to the command ``ceph orch upgrade status`` if the orchestrator has crashed:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch upgrade status
|
||||
|
||||
::
|
||||
|
||||
Error ENOENT: Module not found
|
||||
|
||||
This is possibly caused by invalid JSON in a mgr config-key. See `Redmine tracker Issue #67329 <https://tracker.ceph.com/issues/67329>`_ and `the discussion on the [ceph-users] mailing list <https://www.spinics.net/lists/ceph-users/msg83667.html>`_.
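
One way to hunt for the offending entry is to list the ``mgr`` config-keys and check that suspect values parse as JSON. This is only a sketch, and the key name shown is a placeholder:

.. prompt:: bash #

   ceph config-key ls | grep mgr/cephadm
   ceph config-key get mgr/cephadm/<key> | python3 -m json.tool
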
|
||||
|
||||
UPGRADE_NO_STANDBY_MGR
|
||||
----------------------
|
||||
|
@ -53,8 +53,7 @@ the MDS server. Even if a single MDS daemon is unable to fully utilize the
|
||||
hardware, it may be desirable later on to start more active MDS daemons on the
|
||||
same node to fully utilize the available cores and memory. Additionally, it may
|
||||
become clear with workloads on the cluster that performance improves with
|
||||
multiple active MDS on the same node rather than over-provisioning a single
|
||||
MDS.
|
||||
multiple active MDS on the same node rather than a single overloaded MDS.
|
||||
|
||||
Finally, be aware that CephFS is a highly-available file system by supporting
|
||||
standby MDS (see also :ref:`mds-standby`) for rapid failover. To get a real
|
||||
|
@ -209,3 +209,70 @@ cache. The limit is configured via:
|
||||
|
||||
It is not recommended to set this value above 5M but it may be helpful with
|
||||
some workloads.
|
||||
|
||||
|
||||
Dealing with "clients failing to respond to cache pressure" messages
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Every second (or every interval set by the ``mds_cache_trim_interval``
|
||||
configuration parameter), the MDS runs the "cache trim" procedure. One of the
|
||||
steps of this procedure is "recall client state". During this step, the MDS
|
||||
checks every client (session) to determine whether it needs to recall caps.
|
||||
If any of the following are true, then the MDS needs to recall caps:
|
||||
|
||||
1. the cache is full (the ``mds_cache_memory_limit`` has been exceeded) and
|
||||
needs some inodes to be released
|
||||
2. the client exceeds ``mds_max_caps_per_client`` (1M by default)
|
||||
3. the client is inactive
|
||||
|
||||
To determine whether a client (a session) is inactive, the session's
|
||||
``cache_liveness`` parameter is checked and compared with the value::
|
||||
|
||||
(num_caps >> mds_session_cache_liveness_magnitude)
|
||||
|
||||
where ``mds_session_cache_liveness_magnitude`` is a config param (``10`` by
|
||||
default). If ``cache_liveness`` is smaller than this calculated value, the
|
||||
session is considered inactive and the MDS sends a "recall caps" request for
|
||||
all cached caps (the actual recall value is ``num_caps -
|
||||
mds_min_caps_per_client(100)``).
|
||||
|
||||
Under certain circumstances, many "recall caps" requests can be sent so quickly
|
||||
that the health warning is generated: "clients failing to respond to cache
|
||||
pressure". If the client does not release the caps fast enough, the MDS repeats
|
||||
the "recall caps" request one second later. This means that the MDS will send
|
||||
"recall caps" again and again. The "total" counter of "recall caps" for the
|
||||
session will grow and grow, and will eventually exceed the "mon warning limit".
|
||||
|
||||
A throttling mechanism, controlled by the ``mds_recall_max_decay_threshold``
|
||||
parameter (126K by default), is available for reducing the rate of "recall
|
||||
caps" counter growth, but sometimes it is not enough to slow the "recall caps"
|
||||
counter's growth rate. If altering the ``mds_recall_max_decay_threshold`` value
|
||||
does not sufficiently reduce the rate of the "recall caps" counter's growth,
|
||||
decrease ``mds_recall_max_caps`` incrementally until the "clients failing to
|
||||
respond to cache pressure" messages no longer appear in the logs.
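
For example, you can inspect the current value and then lower it in steps (the value shown below is only illustrative):

.. prompt:: bash #

   ceph config get mds mds_recall_max_caps
   ceph config set mds mds_recall_max_caps 20000
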
|
||||
|
||||
Example Scenario
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Here is an example. A client has 20k caps cached. At some point the MDS
decides that the client is inactive (because the session's ``cache_liveness``
value is low). It starts to ask the client to release caps down to the
``mds_min_caps_per_client`` value (100 by default). Every second, it sends a
recall_caps request asking the client to release ``caps_num -
mds_min_caps_per_client`` caps (but not more than ``mds_recall_max_caps``,
which is 30k by default). The client starts to release caps, but releases them
at a rate of (for example) only 100 caps per second.

In the first second the MDS sends recall_caps = 20k - 100, in the second
second it sends recall_caps = (20k - 100) - 100, in the third second it sends
recall_caps = (20k - 200) - 100, and so on. Every time it sends recall_caps,
it updates the session's recall_caps value, which tracks how many recall_caps
were sent in the last minute. The counter therefore grows quickly and
eventually exceeds ``mds_recall_warning_threshold`` (128K by default), at
which point Ceph starts to report the "failing to respond to cache pressure"
warning in the status. If ``mds_recall_max_caps`` is then set to 3K, the MDS
sends only 3K recall_caps per second, and the maximum value that the session's
recall_caps counter can reach (if the MDS sends 3K every second for at least
one minute) is 60 * 3K = 180K. This means that it is still possible to exceed
``mds_recall_warning_threshold``, but only if a client does not "respond" for
a long time, which is usually not the case.
|
||||
|
@ -24,7 +24,7 @@ This will mount the default ceph filesystem using the drive letter ``x``.
|
||||
If ``ceph.conf`` is placed at the default location, which is
|
||||
``%ProgramData%\ceph\ceph.conf``, then this argument becomes optional.
|
||||
|
||||
The ``-l`` argument also allows using an empty folder as a mountpoint
|
||||
The ``-l`` argument also allows using an empty folder as a mount point
|
||||
instead of a drive letter.
|
||||
|
||||
The uid and gid used for mounting the filesystem default to 0 and may be
|
||||
@ -75,7 +75,7 @@ like so::
|
||||
|
||||
ceph-dokan.exe unmap -l x
|
||||
|
||||
Note that when unmapping Ceph filesystems, the exact same mountpoint argument
|
||||
Note that when unmapping Ceph filesystems, the exact same mount point argument
|
||||
must be used as when the mapping was created.
|
||||
|
||||
Limitations
|
||||
|
@ -120,7 +120,9 @@ system, run a command of the following form:
|
||||
|
||||
.. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``.
|
||||
This distinguishes them from "monitor commands", which are prefixed with ``fs
|
||||
mirror``. Be sure (in this context) to use module commands.
|
||||
mirror``. Enabling mirroring by using monitor commands will result in the mirror daemon
|
||||
entering the "failed" state due to the absence of the `cephfs_mirror` index object.
|
||||
Be sure, in this context, to use module commands.
|
||||
|
||||
To disable mirroring for a given file system, run a command of the following form:
|
||||
|
||||
@ -340,8 +342,9 @@ command is of format `filesystem-name@filesystem-id peer-uuid`::
|
||||
"last_synced_snap": {
|
||||
"id": 120,
|
||||
"name": "snap1",
|
||||
"sync_duration": 0.079997898999999997,
|
||||
"sync_time_stamp": "274900.558797s"
|
||||
"sync_duration": 3,
|
||||
"sync_time_stamp": "274900.558797s",
|
||||
"sync_bytes": 52428800
|
||||
},
|
||||
"snaps_synced": 2,
|
||||
"snaps_deleted": 0,
|
||||
@ -359,6 +362,32 @@ A directory can be in one of the following states::
|
||||
- `syncing`: The directory is currently being synchronized
|
||||
- `failed`: The directory has hit upper limit of consecutive failures
|
||||
|
||||
When a directory is currently being synchronized, the mirror daemon marks it as `syncing` and
|
||||
`fs mirror peer status` shows the snapshot being synchronized under the `current_syncing_snap`::
|
||||
|
||||
$ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
|
||||
{
|
||||
"/d0": {
|
||||
"state": "syncing",
|
||||
"current_syncing_snap": {
|
||||
"id": 121,
|
||||
"name": "snap2"
|
||||
},
|
||||
"last_synced_snap": {
|
||||
"id": 120,
|
||||
"name": "snap1",
|
||||
"sync_duration": 3,
|
||||
"sync_time_stamp": "274900.558797s",
|
||||
"sync_bytes": 52428800
|
||||
},
|
||||
"snaps_synced": 2,
|
||||
"snaps_deleted": 0,
|
||||
"snaps_renamed": 0
|
||||
}
|
||||
}
|
||||
|
||||
The mirror daemon marks the directory back as `idle` when the synchronization completes.
|
||||
|
||||
When a directory experiences a configured number of consecutive synchronization failures, the
|
||||
mirror daemon marks it as `failed`. Synchronization for these directories is retried.
|
||||
By default, the number of consecutive failures before a directory is marked as failed
|
||||
@ -374,12 +403,13 @@ E.g., adding a regular file for synchronization would result in failed status::
|
||||
"/d0": {
|
||||
"state": "idle",
|
||||
"last_synced_snap": {
|
||||
"id": 120,
|
||||
"name": "snap1",
|
||||
"sync_duration": 0.079997898999999997,
|
||||
"sync_time_stamp": "274900.558797s"
|
||||
"id": 121,
|
||||
"name": "snap2",
|
||||
"sync_duration": 5,
|
||||
"sync_time_stamp": "500900.600797s",
|
||||
"sync_bytes": 78643200
|
||||
},
|
||||
"snaps_synced": 2,
|
||||
"snaps_synced": 3,
|
||||
"snaps_deleted": 0,
|
||||
"snaps_renamed": 0
|
||||
},
|
||||
@ -395,9 +425,110 @@ This allows a user to add a non-existent directory for synchronization. The mirr
|
||||
will mark such a directory as failed and retry (less frequently). When the directory is
|
||||
created, the mirror daemon will clear the failed state upon successful synchronization.
|
||||
|
||||
Adding a new snapshot or a new directory manually in the .snap directory of the
|
||||
remote filesystem will result in the corresponding configured directory being marked as failed.
|
||||
In the remote filesystem::
|
||||
|
||||
$ ceph fs subvolume snapshot create cephfs subvol1 snap2 group1
|
||||
or
|
||||
$ mkdir /d0/.snap/snap2
|
||||
|
||||
$ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
|
||||
{
|
||||
"/d0": {
|
||||
"state": "failed",
|
||||
"failure_reason": "snapshot 'snap2' has invalid metadata",
|
||||
"last_synced_snap": {
|
||||
"id": 120,
|
||||
"name": "snap1",
|
||||
"sync_duration": 3,
|
||||
"sync_time_stamp": "274900.558797s"
|
||||
},
|
||||
"snaps_synced": 2,
|
||||
"snaps_deleted": 0,
|
||||
"snaps_renamed": 0
|
||||
},
|
||||
"/f0": {
|
||||
"state": "failed",
|
||||
"snaps_synced": 0,
|
||||
"snaps_deleted": 0,
|
||||
"snaps_renamed": 0
|
||||
}
|
||||
}
|
||||
|
||||
When the snapshot or the directory is removed from the remote filesystem, the mirror daemon will
|
||||
clear the failed state upon successful synchronization of the pending snapshots, if any.
|
||||
|
||||
.. note:: Treat the remote filesystem as read-only. Nothing is inherently enforced by CephFS.
|
||||
But with the right MDS caps, users will not be able to snapshot directories in the
|
||||
remote file system.
|
||||
|
||||
When mirroring is disabled, the respective `fs mirror status` command for the file system
|
||||
will not show up in command help.
|
||||
|
||||
Metrics
|
||||
-------
|
||||
|
||||
CephFS exports mirroring metrics as :ref:`Labeled Perf Counters`, which can be consumed by the OCP/ODF Dashboard to monitor geo-replication. These metrics can be used to measure the progress of ``cephfs-mirror`` synchronization and thus provide monitoring capability. CephFS exports the following mirroring metrics, which are displayed using the ``counter dump`` command.
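
For example, the labeled counters can be dumped from the mirror daemon's admin socket; the socket path below follows the earlier examples in this document and may differ on your system:

.. prompt:: bash #

   ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok counter dump
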
|
||||
|
||||
.. list-table:: Mirror Status Metrics
|
||||
:widths: 25 25 75
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - mirroring_peers
|
||||
- Gauge
|
||||
- The number of peers involved in mirroring
|
||||
* - directory_count
|
||||
- Gauge
|
||||
- The total number of directories being synchronized
|
||||
* - mirrored_filesystems
|
||||
- Gauge
|
||||
- The total number of filesystems which are mirrored
|
||||
* - mirror_enable_failures
|
||||
- Counter
|
||||
- Enable mirroring failures
|
||||
|
||||
.. list-table:: Replication Metrics
|
||||
:widths: 25 25 75
|
||||
:header-rows: 1
|
||||
|
||||
* - Name
|
||||
- Type
|
||||
- Description
|
||||
* - snaps_synced
|
||||
- Counter
|
||||
- The total number of snapshots successfully synchronized
|
||||
* - sync_bytes
|
||||
- Counter
|
||||
- The total bytes being synchronized
|
||||
* - sync_failures
|
||||
- Counter
|
||||
- The total number of failed snapshot synchronizations
|
||||
* - snaps_deleted
|
||||
- Counter
|
||||
- The total number of snapshots deleted
|
||||
* - snaps_renamed
|
||||
- Counter
|
||||
- The total number of snapshots renamed
|
||||
* - avg_sync_time
|
||||
- Gauge
|
||||
- The average time taken by all snapshot synchronizations
|
||||
* - last_synced_start
|
||||
- Gauge
|
||||
- The sync start time of the last synced snapshot
|
||||
* - last_synced_end
|
||||
- Gauge
|
||||
- The sync end time of the last synced snapshot
|
||||
* - last_synced_duration
|
||||
- Gauge
|
||||
- The time duration of the last synchronization
|
||||
* - last_synced_bytes
|
||||
- counter
|
||||
- The total bytes being synchronized for the last synced snapshot
|
||||
|
||||
Configuration Options
|
||||
---------------------
|
||||
|
||||
@ -410,6 +541,7 @@ Configuration Options
|
||||
.. confval:: cephfs_mirror_retry_failed_directories_interval
|
||||
.. confval:: cephfs_mirror_restart_mirror_on_failure_interval
|
||||
.. confval:: cephfs_mirror_mount_timeout
|
||||
.. confval:: cephfs_mirror_perf_stats_prio
|
||||
|
||||
Re-adding Peers
|
||||
---------------
|
||||
|
@ -106,6 +106,8 @@ If quotas are not enabled or if no quota is set on the mounted sub-directory,
|
||||
then the overall usage of the file system will be reported irrespective of the
|
||||
value of this setting.
|
||||
|
||||
.. _cephfs-layout-and-quota-restriction:
|
||||
|
||||
Layout and Quota restriction (the 'p' flag)
|
||||
===========================================
|
||||
|
||||
@ -274,7 +276,7 @@ Client ``someuser`` is authorized for only one file system:
|
||||
caps mon = "allow r"
|
||||
caps osd = "allow rw tag cephfs data=cephfs"
|
||||
|
||||
Mounting ``cephfs1`` on the already-created mountpoint ``/mnt/cephfs1`` with
|
||||
Mounting ``cephfs1`` on the already-created mount point ``/mnt/cephfs1`` with
|
||||
``someuser`` works:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
@ -6,6 +6,9 @@ File layouts
|
||||
The layout of a file controls how its contents are mapped to Ceph RADOS objects. You can
|
||||
read and write a file's layout using *virtual extended attributes* or xattrs.
|
||||
|
||||
Clients must use the ``p`` flag when writing a file's layout. See :ref:`Layout
|
||||
and Quota restriction (the 'p' flag) <cephfs-layout-and-quota-restriction>`.
|
||||
|
||||
The name of the layout xattrs depends on whether a file is a regular file or a directory. Regular
|
||||
files' layout xattrs are called ``ceph.file.layout``, whereas directories' layout xattrs are called
|
||||
``ceph.dir.layout``. Where subsequent examples refer to ``ceph.file.layout``, substitute ``dir`` as appropriate
|
||||
@ -20,26 +23,38 @@ Layout fields
|
||||
-------------
|
||||
|
||||
pool
|
||||
String, giving ID or name. String can only have characters in the set [a-zA-Z0-9\_-.]. Which RADOS pool a file's data objects will be stored in.
|
||||
This is a string and contains either an ID or a name. Strings may contain
|
||||
only characters in the set ``[a-zA-Z0-9\_-.]``. This determines the RADOS
|
||||
pool that stores a file's data objects.
|
||||
|
||||
pool_id
|
||||
String of digits. This is the system assigned pool id for the RADOS pool whenever it is created.
|
||||
This is a string of digits. This is the pool ID that was assigned by Ceph
|
||||
at the time of the creation of the RADOS pool.
|
||||
|
||||
pool_name
|
||||
String, given name. This is the user defined name for the RADOS pool whenever user creates it.
|
||||
This is a string. This is the name of the RADOS pool as defined by the user
|
||||
when the pool was created.
|
||||
|
||||
pool_namespace
|
||||
String with only characters in the set [a-zA-Z0-9\_-.]. Within the data pool, which RADOS namespace the objects will
|
||||
be written to. Empty by default (i.e. default namespace).
|
||||
This is a string containing only characters in the set ``[a-zA-Z0-9\_-.]``.
|
||||
This determines which RADOS namespace within the data pool that the objects
|
||||
will be written to.
|
||||
Empty by default (i.e. default namespace).
|
||||
|
||||
stripe_unit
|
||||
Integer in bytes. The size (in bytes) of a block of data used in the RAID 0 distribution of a file. All stripe units for a file have equal size. The last stripe unit is typically incomplete–i.e. it represents the data at the end of the file as well as unused “space” beyond it up to the end of the fixed stripe unit size.
|
||||
This is an integer. The size (in bytes) of a block of data used in the
|
||||
distribution of a file. All stripe units for a file have equal size. The
|
||||
last stripe unit is typically only partly full of data: it holds file data
|
||||
through EOF as well as padding that fills the balance of the fixed stripe
|
||||
unit size.
|
||||
|
||||
stripe_count
|
||||
Integer. The number of consecutive stripe units that constitute a RAID 0 “stripe” of file data.
|
||||
Integer. The number of consecutive stripe units that constitute a RAID 0
|
||||
“stripe” of file data.
|
||||
|
||||
object_size
|
||||
Integer in bytes. File data is chunked into RADOS objects of this size.
|
||||
Integer. The size of the object in bytes. File data is chunked into RADOS
|
||||
objects of this size.
|
||||
|
||||
.. tip::
|
||||
|
||||
|
@ -14,12 +14,12 @@ abstractions:
|
||||
|
||||
* FS volumes, an abstraction for CephFS file systems
|
||||
|
||||
* FS subvolumes, an abstraction for independent CephFS directory trees
|
||||
|
||||
* FS subvolume groups, an abstraction for a directory level higher than FS
|
||||
subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`)
|
||||
across a set of subvolumes
|
||||
|
||||
* FS subvolumes, an abstraction for independent CephFS directory trees
|
||||
|
||||
Possible use-cases for the export abstractions:
|
||||
|
||||
* FS subvolumes used as Manila shares or CSI volumes
|
||||
@ -276,7 +276,7 @@ Use a command of the following form to create a subvolume:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated]
|
||||
ceph fs subvolume create <vol_name> <subvol_name> [--size <size_in_bytes>] [--group_name <subvol_group_name>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>] [--namespace-isolated] [--earmark <earmark>]
|
||||
|
||||
|
||||
The command succeeds even if the subvolume already exists.
|
||||
@ -289,6 +289,33 @@ The subvolume can be created in a separate RADOS namespace by specifying the
|
||||
default subvolume group with an octal file mode of ``755``, a uid of its
|
||||
subvolume group, a gid of its subvolume group, a data pool layout of its parent
|
||||
directory, and no size limit.
|
||||
You can also assign an earmark to a subvolume using the ``--earmark`` option.
|
||||
The earmark is a unique identifier that tags the subvolume for specific purposes,
|
||||
such as NFS or SMB services. By default, no earmark is set, allowing for flexible
|
||||
assignment based on administrative needs. An empty string ("") can be used to remove
|
||||
any existing earmark from a subvolume.
|
||||
|
||||
The earmarking mechanism ensures that subvolumes are correctly tagged and managed,
|
||||
helping to avoid conflicts and ensuring that each subvolume is associated
|
||||
with the intended service or use case.
|
||||
|
||||
Valid Earmarks
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- **For NFS:**
|
||||
- The valid earmark format is the top-level scope: ``'nfs'``.
|
||||
|
||||
- **For SMB:**
|
||||
- The valid earmark formats are:
|
||||
- The top-level scope: ``'smb'``.
|
||||
- The top-level scope with an intra-module level scope: ``'smb.cluster.{cluster_id}'``, where ``cluster_id`` is a short string uniquely identifying the cluster.
|
||||
- Example without intra-module scope: ``smb``
|
||||
- Example with intra-module scope: ``smb.cluster.cluster_1``
|
||||
|
||||
.. note:: If you are changing an earmark from one scope to another (e.g., from nfs to smb or vice versa),
|
||||
be aware that user permissions and ACLs associated with the previous scope might still apply. Ensure that
|
||||
any necessary permissions are updated as needed to maintain proper access control.
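
For example, a subvolume can be created with an SMB earmark as follows (the volume, subvolume, and group names are placeholders):

.. prompt:: bash #

   ceph fs subvolume create cephfs subvol1 --group_name group1 --earmark smb
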
|
||||
|
||||
|
||||
Removing a subvolume
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
@ -418,6 +445,7 @@ The output format is JSON and contains the following fields.
|
||||
* ``pool_namespace``: RADOS namespace of the subvolume
|
||||
* ``features``: features supported by the subvolume
|
||||
* ``state``: current state of the subvolume
|
||||
* ``earmark``: earmark of the subvolume
|
||||
|
||||
If a subvolume has been removed but its snapshots have been retained, the
|
||||
output contains only the following fields.
|
||||
@ -522,6 +550,33 @@ subvolume using the metadata key:
|
||||
Using the ``--force`` flag allows the command to succeed when it would
|
||||
otherwise fail (if the metadata key did not exist).
|
||||
|
||||
Getting earmark of a subvolume
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use a command of the following form to get the earmark of a subvolume:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph fs subvolume earmark get <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Setting earmark of a subvolume
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use a command of the following form to set the earmark of a subvolume:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph fs subvolume earmark set <vol_name> <subvol_name> [--group_name <subvol_group_name>] <earmark>
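
For example, using placeholder names, an NFS earmark can be set and then read back as follows:

.. prompt:: bash #

   ceph fs subvolume earmark set cephfs subvol1 --group_name group1 nfs
   ceph fs subvolume earmark get cephfs subvol1 --group_name group1
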
|
||||
|
||||
Removing earmark of a subvolume
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Use a command of the following form to remove the earmark of a subvolume:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph fs subvolume earmark rm <vol_name> <subvol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
Creating a Snapshot of a Subvolume
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -758,16 +813,40 @@ Here is an example of an ``in-progress`` clone:
|
||||
::
|
||||
|
||||
{
|
||||
"status": {
|
||||
"state": "in-progress",
|
||||
"source": {
|
||||
"volume": "cephfs",
|
||||
"subvolume": "subvol1",
|
||||
"snapshot": "snap1"
|
||||
}
|
||||
"status": {
|
||||
"state": "in-progress",
|
||||
"source": {
|
||||
"volume": "cephfs",
|
||||
"subvolume": "subvol1",
|
||||
"snapshot": "snap1"
|
||||
},
|
||||
"progress_report": {
|
||||
"percentage cloned": "12.24%",
|
||||
"amount cloned": "376M/3.0G",
|
||||
"files cloned": "4/6"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
A progress report is also printed in the output when the clone is ``in-progress``.
The progress reported here is only for the specific clone. For the collective
progress made by all ongoing clones, a progress bar is printed at the bottom
of the output of the ``ceph status`` command::
|
||||
|
||||
progress:
|
||||
3 ongoing clones - average progress is 47.569% (10s)
|
||||
[=============...............] (remaining: 11s)
|
||||
|
||||
If the number of clone jobs is greater than the number of cloner threads, two
progress bars are printed: one for ongoing clones (same as above) and another
for all (ongoing+pending) clones::
|
||||
|
||||
progress:
|
||||
4 ongoing clones - average progress is 27.669% (15s)
|
||||
[=======.....................] (remaining: 41s)
|
||||
Total 5 clones - average progress is 41.667% (3s)
|
||||
[===========.................] (remaining: 4s)
|
||||
|
||||
.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled``
|
||||
|
||||
Here is an example of a ``failed`` clone:
|
||||
@ -1340,5 +1419,28 @@ set with this id was present in the database
|
||||
|
||||
$ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0
|
||||
|
||||
|
||||
.. _disabling-volumes-plugin:
|
||||
|
||||
Disabling Volumes Plugin
|
||||
------------------------
|
||||
By default the volumes plugin is enabled and set to ``always on``. However, in
|
||||
certain cases it might be appropriate to disable it. For example, when a CephFS
|
||||
is in a degraded state, the volumes plugin commands may accumulate in the MGR
instead of being served, eventually causing policy throttles to kick in and
making the MGR unresponsive.
|
||||
|
||||
In this event, the volumes plugin can be disabled even though it is an
``always on`` MGR module. To do so, run ``ceph mgr module disable volumes
--yes-i-really-mean-it``. Note that this command disables the operations of
and removes the commands of the volumes plugin, because it disables all CephFS
services on the Ceph cluster that are accessed through this plugin.
|
||||
|
||||
Before resorting to a measure this drastic, it is a good idea to try less
drastic measures and then assess whether the file system experience has
improved as a result. One example of a less drastic measure is to disable the
asynchronous threads launched by the volumes plugin for cloning and purging
trash, as shown below.
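
The following sketch shows one way to do this. It assumes that your release provides the ``mgr/volumes/pause_cloning`` and ``mgr/volumes/pause_purging`` settings; verify the exact option names before relying on them:

.. prompt:: bash #

   ceph config ls | grep mgr/volumes/pause
   ceph config set mgr mgr/volumes/pause_cloning true
   ceph config set mgr mgr/volumes/pause_purging true
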
|
||||
|
||||
|
||||
.. _manila: https://github.com/openstack/manila
|
||||
.. _CSI: https://github.com/ceph/ceph-csi
|
||||
|
@ -20,9 +20,11 @@ in `Mount CephFS: Prerequisites`_ page.
|
||||
|
||||
Synopsis
|
||||
========
|
||||
In general, the command to mount CephFS via FUSE looks like this::
|
||||
In general, the command to mount CephFS via FUSE looks like this:
|
||||
|
||||
ceph-fuse {mountpoint} {options}
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-fuse {mount point} {options}
|
||||
|
||||
Mounting CephFS
|
||||
===============
|
||||
|
@ -109,29 +109,40 @@ Backward Compatibility
|
||||
======================
|
||||
The old syntax is supported for backward compatibility.
|
||||
|
||||
To mount CephFS with the kernel driver::
|
||||
To mount CephFS with the kernel driver, run the following commands:
|
||||
|
||||
mkdir /mnt/mycephfs
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin
|
||||
.. prompt:: bash #
|
||||
|
||||
The key-value argument right after option ``-o`` is CephX credential;
|
||||
``name`` is the username of the CephX user we are using to mount CephFS.
|
||||
mkdir /mnt/mycephfs
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin
|
||||
|
||||
To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs::
|
||||
The key-value argument right after the option ``-o`` is the CephX credential.
|
||||
``name`` is the username of the CephX user that is mounting CephFS.
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
|
||||
To mount a non-default FS (in this example, ``cephfs2``), run commands of the following form. These commands are to be used in cases in which the cluster
|
||||
has multiple file systems:
|
||||
|
||||
or
|
||||
.. prompt:: bash #
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
|
||||
|
||||
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting.
|
||||
or
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
|
||||
|
||||
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when
|
||||
using the old syntax for mounting.
|
||||
|
||||
Unmounting CephFS
|
||||
=================
|
||||
To unmount the Ceph file system, use the ``umount`` command as usual::
|
||||
To unmount the Ceph file system, use the ``umount`` command, as in this
|
||||
example:
|
||||
|
||||
umount /mnt/mycephfs
|
||||
.. prompt:: bash #
|
||||
|
||||
umount /mnt/mycephfs
|
||||
|
||||
.. tip:: Ensure that you are not within the file system directories before
|
||||
executing this command.
|
||||
@ -148,11 +159,12 @@ For example::
|
||||
|
||||
cephuser@.cephfs=/ /mnt/ceph ceph mon_addr=192.168.0.1:6789,noatime,_netdev 0 0
|
||||
|
||||
If the ``secret`` or ``secretfile`` options are not specified then the mount helper
|
||||
will attempt to find a secret for the given ``name`` in one of the configured keyrings.
|
||||
If the ``secret`` or ``secretfile`` options are not specified, the mount
|
||||
helper will attempt to find a secret for the given ``name`` in one of the
|
||||
configured keyrings.
|
||||
|
||||
See `User Management`_ for details on CephX user management and mount.ceph_
|
||||
manual for more options it can take. For troubleshooting, see
|
||||
See `User Management`_ for details on CephX user management and the mount.ceph_
|
||||
manual for a list of the options it recognizes. For troubleshooting, see
|
||||
:ref:`kernel_mount_debugging`.
|
||||
|
||||
.. _fstab: ../fstab/#kernel-driver
|
||||
|
@ -143,6 +143,14 @@ The types of damage that can be reported and repaired by File System Scrub are:
|
||||
|
||||
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
||||
|
||||
The types of MDS damage listed above can be repaired by using the following command::
|
||||
|
||||
ceph tell mds.<fsname>:0 scrub start /path recursive, repair, force
|
||||
|
||||
If scrub is able to repair the damage, the corresponding entry is automatically
|
||||
removed from the damage table.
|
||||
|
||||
|
||||
Evaluate strays using recursive scrub
|
||||
=====================================
|
||||
|
||||
|
@ -407,6 +407,12 @@ its associated key. A less drastic but half-fix is to change the osd cap for
|
||||
your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs
|
||||
data=....``
|
||||
|
||||
Disabling the Volumes Plugin
|
||||
============================
|
||||
In certain scenarios, the Volumes plugin may need to be disabled to prevent
|
||||
compromise of the rest of the Ceph cluster. For details see:
|
||||
:ref:`disabling-volumes-plugin`
|
||||
|
||||
Reporting Issues
|
||||
================
|
||||
|
||||
|
@ -17,12 +17,10 @@ Key Idea
|
||||
--------
|
||||
|
||||
For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on
|
||||
readdir diff to identify changes in a directory tree. The diffs are applied to
|
||||
`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to
|
||||
the directory in the remote file system, thereby synchronizing only the files
that have changed between the two snapshots.
|
||||
|
||||
This feature is tracked here: https://tracker.ceph.com/issues/47034.
|
||||
|
||||
Currently, snapshot data is synchronized by bulk copying to the remote
|
||||
filesystem.
|
||||
|
||||
@ -407,3 +405,5 @@ Feature Status
|
||||
--------------
|
||||
|
||||
The `cephfs-mirror` daemon is built by default (it follows the `WITH_CEPHFS` CMake rule).
|
||||
|
||||
.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature
|
||||
|
@ -287,16 +287,13 @@ See :ref:`kubernetes-dev`
|
||||
Backporting
|
||||
-----------
|
||||
|
||||
All bugfixes should be merged to the ``main`` branch before being
|
||||
backported. To flag a bugfix for backporting, make sure it has a
|
||||
`tracker issue`_ associated with it and set the ``Backport`` field to a
|
||||
comma-separated list of previous releases (e.g. "hammer,jewel") that you think
|
||||
need the backport.
|
||||
The rest (including the actual backporting) will be taken care of by the
|
||||
`Stable Releases and Backports`_ team.
|
||||
All bugfixes should be merged to the ``main`` branch before being backported.
|
||||
To flag a bugfix for backporting, make sure it has a `tracker issue`_
|
||||
associated with it and set the ``Backport`` field to a comma-separated list of
|
||||
previous releases (e.g. "hammer,jewel") that you think need the backport. You
|
||||
are responsible for the backporting of pull requests that you raise.
|
||||
|
||||
.. _`tracker issue`: http://tracker.ceph.com/
|
||||
.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
|
||||
|
||||
Dependabot
|
||||
----------
|
||||
|
@ -19,6 +19,7 @@ Contributing to Ceph: A Guide for Developers
|
||||
Tests: Unit Tests <tests-unit-tests>
|
||||
Tests: Integration Tests (Teuthology) <testing_integration_tests/index>
|
||||
Tests: Running Tests (Locally) <running-tests-locally>
|
||||
Tests: Windows <tests-windows>
|
||||
Ceph Dashboard Developer Documentation (formerly HACKING.rst) <dash-devel>
|
||||
Tracing Developer Documentation <jaegertracing>
|
||||
Cephadm Developer Documentation <../cephadm/index>
|
||||
|
@ -3,11 +3,68 @@
|
||||
Integration Tests using Teuthology Workflow
|
||||
===========================================
|
||||
|
||||
Scheduling Test Run
|
||||
-------------------
|
||||
Infrastructure
|
||||
--------------
|
||||
|
||||
Getting binaries
|
||||
****************
|
||||
Components:
|
||||
|
||||
1. `ceph-ci`_: Clone of the main Ceph repository, used for triggering Jenkins
|
||||
Ceph builds for development.
|
||||
|
||||
2. `Ceph Jenkins`_: Responsible for triggering builds, uploading packages
|
||||
to Chacra, and pushing updates about the build to Shaman.
|
||||
|
||||
3. `Shaman`_: UI Interface used to check build status. In its backend,
|
||||
it is a REST API to query and store build information.
|
||||
|
||||
4. `Chacra`_: Service where packages are uploaded. The binaries uploaded
|
||||
here can be downloaded and used by anyone.
|
||||
|
||||
5. `Teuthology CLI`_: Developers can use various Teuthology commands to schedule
|
||||
and manage test runs.
|
||||
|
||||
6. Teuthology: This component is responsible for pushing test jobs to
|
||||
the Beanstalk queue and Paddles. It also picks jobs from
|
||||
the queue and runs tests.
|
||||
|
||||
7. Beanstalk queue: A priority queue containing all the queued jobs.
|
||||
Developers typically do not need to interact with it.
|
||||
|
||||
8. Paddles: A backend service that stores all test run information.
|
||||
Developers typically do not need to interact with it.
|
||||
|
||||
9. `Pulpito`_: A UI interface (for information stored in Paddles) that allows
|
||||
developers to see detailed information about their scheduled tests,
|
||||
including status and results.
|
||||
|
||||
10. Testnodes: A cluster of various machines that are used for running tests.
|
||||
Developers usually schedule tests to run on `smithi`_ machines, which are
|
||||
dedicated test nodes for Teuthology integration testing.
|
||||
|
||||
Each Teuthology test *run* contains multiple test *jobs*. Each job runs in an
|
||||
environment isolated from other jobs, on a different collection of test nodes.
|
||||
|
||||
To test a change in Ceph, follow these steps:
|
||||
|
||||
1. Getting binaries - Build Ceph.
|
||||
2. Scheduling Test Run:
|
||||
|
||||
a. About Test Suites.
|
||||
b. Triggering Teuthology Tests.
|
||||
c. Testing QA changes (without re-building binaries).
|
||||
d. Filtering Tests.
|
||||
|
||||
3. Viewing Test Results:
|
||||
|
||||
a. Pulpito Dashboard.
|
||||
b. Teuthology Archives (Reviewing Logs).
|
||||
|
||||
4. Killing tests.
|
||||
5. Re-running tests.
|
||||
|
||||
|
||||
Getting binaries - Build Ceph
|
||||
-----------------------------
|
||||
|
||||
Ceph binaries must be built for your branch before you can use teuthology to run integration tests on them. Follow these steps to build the Ceph binaries:
|
||||
|
||||
@ -41,8 +98,44 @@ Ceph binaries must be built for your branch before you can use teuthology to run
|
||||
.. _the Chacra site: https://shaman.ceph.com/api/search/?status=ready&project=ceph
|
||||
|
||||
|
||||
Triggering Tests
|
||||
****************
|
||||
Naming the ceph-ci branch
|
||||
*************************
|
||||
Prepend your branch with your name before you push it to ceph-ci. For example,
|
||||
a branch named ``feature-x`` should be named ``wip-$yourname-feature-x``, where
|
||||
``$yourname`` is replaced with your name. Identifying your branch with your
|
||||
name makes your branch easily findable on Shaman and Pulpito.
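
For example, assuming a local branch named ``feature-x`` and using ``alice`` as a stand-in for your name, pushing the branch to ceph-ci could look like this:

.. prompt:: bash $

   git remote add ceph-ci git@github.com:ceph/ceph-ci.git
   git push ceph-ci feature-x:wip-alice-feature-x
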
|
||||
|
||||
If you are using one of the stable branches (`quincy`, `pacific`, etc.), include
|
||||
the name of that stable branch in your ceph-ci branch name.
|
||||
For example, a ``feature-x`` PR branch that targets ``quincy`` should be named
``wip-feature-x-quincy``. *This is not just a convention. This ensures that your branch is built in the correct environment.*
|
||||
|
||||
You can choose to trigger only a CentOS 9 Stream build (excluding other distros such as Ubuntu)
by adding "centos9-only" at the end of the ceph-ci branch name. For example,
``wip-$yourname-feature-centos9-only``. This results in quicker builds and saves resources
when you do not require binaries for other distros.
|
||||
|
||||
Delete the branch from ceph-ci when you no longer need it. If you are
|
||||
logged in to GitHub, all your branches on ceph-ci can be found here:
|
||||
https://github.com/ceph/ceph-ci/branches.
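
For example, continuing the hypothetical ``wip-alice-feature-x`` branch from above, the branch can be removed from ceph-ci with ``git push --delete``:

.. prompt:: bash $

   git push ceph-ci --delete wip-alice-feature-x
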
|
||||
|
||||
|
||||
Scheduling Test Run
|
||||
-------------------
|
||||
|
||||
About Test Suites
|
||||
*****************
|
||||
|
||||
Integration tests are organized into “suites”, which are defined in the ``qa/suites``
sub-directory of the Ceph repository. These test suites can be run with the ``teuthology-suite``
command.
|
||||
|
||||
See `Suites Inventory`_ for a list of available suites of integration tests.
|
||||
|
||||
A more detailed explanation of how these test suites are defined can be found on the `Integration Test Introduction Page`_.
|
||||
|
||||
Triggering Teuthology Tests
|
||||
***************************
|
||||
|
||||
After you have built Ceph binaries for your branch, you can run tests using
|
||||
teuthology. This procedure explains how to run tests using teuthology.
|
||||
@ -54,7 +147,10 @@ teuthology. This procedure explains how to run tests using teuthology.
|
||||
ssh <username>@teuthology.front.sepia.ceph.com
|
||||
|
||||
This requires Sepia lab access. To request access to the Sepia lab, see:
|
||||
https://ceph.github.io/sepia/adding_users/
|
||||
https://ceph.github.io/sepia/adding_users/.
|
||||
|
||||
#. For initial setup, follow the `teuthology installation guide`_ to set up teuthology for
your user on the teuthology machine. This will enable you to run teuthology commands.
|
||||
|
||||
#. Run the ``teuthology-suite`` command:
|
||||
|
||||
@ -66,7 +162,7 @@ teuthology. This procedure explains how to run tests using teuthology.
|
||||
-s fs \
|
||||
-p 110 \
|
||||
--filter "cephfs-shell" \
|
||||
-e foo@gmail.com \
|
||||
-e foo@gmail.com
|
||||
|
||||
The options in the above command are defined here:
|
||||
|
||||
@ -101,10 +197,13 @@ teuthology. This procedure explains how to run tests using teuthology.
|
||||
`Pulpito`_ where the test results can be viewed.
|
||||
|
||||
|
||||
The ``--dry-run`` option allows you to do a trial run of the ``teuthology-suite`` command
without actually scheduling teuthology tests. This is helpful for checking how many jobs,
and which jobs, a command will schedule.
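
For example, a dry run of the ``fs`` suite from the earlier example (using a hypothetical branch name) could look like this:

.. prompt:: bash $

   teuthology-suite --dry-run -m smithi -c wip-alice-feature-x -s fs -p 110 --filter "cephfs-shell"
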
|
||||
|
||||
Other frequently used/useful options are ``-d`` (or ``--distro``),
|
||||
``--distroversion``, ``--filter-out``, ``--timeout``, ``flavor``, ``-rerun``,
|
||||
``-l`` (for limiting number of jobs) , ``-N`` (for how many times the job will
|
||||
``--distro-version``, ``--filter-out``, ``--timeout``, ``flavor``, ``--rerun``,
|
||||
``--limit`` (for limiting number of jobs) , ``-N`` (for how many times the job will
|
||||
run), and ``--subset`` (used to reduce the number of tests that are triggered). Run
|
||||
``teuthology-suite --help`` to read descriptions of these and other options.
|
||||
|
||||
@ -159,15 +258,15 @@ job config printed at the beginning of the teuthology job.
|
||||
for the builds to finish, then triggering tests and waiting for
|
||||
the test results.
|
||||
|
||||
About Suites and Filters
|
||||
************************
|
||||
|
||||
See `Suites Inventory`_ for a list of available suites of integration tests.
|
||||
Each directory under ``qa/suites`` in the Ceph repository is an integration
|
||||
test suite, and arguments appropriate to follow ``-s`` can be found there.
|
||||
Filtering Tests
|
||||
***************
|
||||
|
||||
Test suites include combinations of many YAML files, which can result in a massive
number of jobs being scheduled for a suite. Filters can help to reduce the number
of jobs or to schedule particular jobs within a suite.
|
||||
|
||||
Keywords for filtering tests can be found in
|
||||
``qa/suites/<suite-name>/<subsuite-name>/tasks`` and can be used as arguments
|
||||
``qa/suites/<suite-name>/<subsuite-name>/tasks`` in the Ceph repository and can be used as arguments
|
||||
for ``--filter``. Each YAML file in that directory can trigger tests; using the
|
||||
name of the file without its filename extension as an argument to the
|
||||
``--filter`` triggers those tests.
|
||||
@ -182,6 +281,8 @@ contents of the file for the ``modules`` attribute. For ``cephfs-shell.yaml``
|
||||
the ``modules`` attribute is ``tasks.cephfs.test_cephfs_shell``. This means
|
||||
that it triggers all tests in ``qa/tasks/cephfs/test_cephfs_shell.py``.
|
||||
|
||||
Read more about how to `Filter Tests by their Description`_.
|
||||
|
||||
Viewing Test Results
|
||||
---------------------
|
||||
|
||||
@ -195,22 +296,26 @@ Teuthology Archives
|
||||
*******************
|
||||
|
||||
After the tests have finished running, the log for the job can be obtained by
|
||||
clicking on the job ID at the Pulpito page associated with your tests. It's
|
||||
clicking on the job ID at the Pulpito run page associated with your tests. It's
|
||||
more convenient to download the log and then view it rather than viewing it in
|
||||
an internet browser since these logs can easily be up to 1 GB in size. It is
|
||||
easier to ssh into the teuthology machine (``teuthology.front.sepia.ceph.com``)
|
||||
and access the following path::
|
||||
an internet browser since these logs can easily be up to 1 GB in size.
|
||||
It is also possible to ssh into a `developer playground machine`_ and access the following path::
|
||||
|
||||
/ceph/teuthology-archive/<test-id>/<job-id>/teuthology.log
|
||||
/teuthology/<run-name>/<job-id>/teuthology.log
|
||||
|
||||
For example: for the above test ID, the path is::
|
||||
|
||||
/ceph/teuthology-archive/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
|
||||
/teuthology/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
|
||||
|
||||
This method can be used to view the log more quickly than would be possible through a browser.
|
||||
|
||||
In addition to ``teuthology.log``, some other files are included for debugging
|
||||
purposes:
|
||||
To view Ceph logs (cephadm, Ceph monitors, ceph-mgr, etc.) or system logs,
remove ``teuthology.log`` from the job's teuthology log URL in the browser and then navigate
to ``remote/<machine>/log/``. System logs can be found at ``remote/<machine>/syslog/``.
|
||||
Similarly, these logs can be found on developer playground machines at
|
||||
``/teuthology/<test-id>/<job-id>/remote/<machine>/``.
|
||||
|
||||
Some other files that are included for debugging purposes:
|
||||
|
||||
* ``unit_test_summary.yaml``: Provides a summary of all unit test failures.
|
||||
Generated (optionally) when the ``unit_test_scan`` configuration option is
|
||||
@ -219,7 +324,7 @@ purposes:
|
||||
* ``valgrind.yaml``: Summarizes any Valgrind errors that may occur.
|
||||
|
||||
.. note:: To access archives more conveniently, ``/a/`` has been symbolically
|
||||
linked to ``/ceph/teuthology-archive/``. For instance, to access the previous
|
||||
linked to ``/teuthology/``. For instance, to access the previous
|
||||
example, we can use something like::
|
||||
|
||||
/a/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
|
||||
@ -234,9 +339,9 @@ Here is the command that terminates jobs:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
teuthology-kill -r teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi
|
||||
teuthology-kill -p -r teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi -m smithi -o scheduled_teuthology@teuthology
|
||||
|
||||
Let's call the argument passed to ``-r`` as test ID. It can be found
|
||||
The argument passed to ``-r`` is the run name. It can be found
|
||||
easily in the link to the Pulpito page for the tests you triggered. For
|
||||
example, for the above run name, the link is: http://pulpito.front.sepia.ceph.com/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/
|
||||
|
||||
@ -275,23 +380,9 @@ Following's the definition of new options introduced in this section:
|
||||
'waiting'. Default value: 'fail,dead'
|
||||
======================= ===============================================
|
||||
|
||||
Naming the ceph-ci branch
|
||||
-------------------------
|
||||
Prepend your branch with your name before you push it to ceph-ci. For example,
|
||||
a branch named ``feature-x`` should be named ``wip-$yourname-feature-x``, where
|
||||
``$yourname`` is replaced with your name. Identifying your branch with your
|
||||
name makes your branch easily findable on Shaman and Pulpito.
|
||||
|
||||
If you are using one of the stable branches (`quincy`, `pacific`, etc.), include
|
||||
the name of that stable branch in your ceph-ci branch name.
|
||||
For example, the ``feature-x`` PR branch should be named
|
||||
``wip-feature-x-nautilus``. *This is not just a convention. This ensures that your branch is built in the correct environment.*
|
||||
|
||||
Delete the branch from ceph-ci when you no longer need it. If you are
|
||||
logged in to GitHub, all your branches on ceph-ci can be found here:
|
||||
https://github.com/ceph/ceph-ci/branches.
|
||||
|
||||
.. _ceph-ci: https://github.com/ceph/ceph-ci
|
||||
.. _Ceph Jenkins: https://jenkins.ceph.com/
|
||||
.. _Teuthology CLI: https://docs.ceph.com/projects/teuthology/en/latest/commands/list.html
|
||||
.. _Chacra: https://github.com/ceph/chacra/blob/master/README.rst
|
||||
.. _Pulpito: http://pulpito.front.sepia.ceph.com/
|
||||
.. _Running Your First Test: ../../running-tests-locally/#running-your-first-test
|
||||
@ -299,4 +390,9 @@ https://github.com/ceph/ceph-ci/branches.
|
||||
.. _Suites Inventory: ../tests-integration-testing-teuthology-intro/#suites-inventory
|
||||
.. _Testing Priority: ../tests-integration-testing-teuthology-intro/#testing-priority
|
||||
.. _Triggering Tests: ../tests-integration-testing-teuthology-workflow/#triggering-tests
|
||||
.. _Integration Test Introduction Page: ../tests-integration-testing-teuthology-intro/#how-integration-tests-are-defined
|
||||
.. _tests-sentry-developers-guide: ../tests-sentry-developers-guide/
|
||||
.. _smithi: https://wiki.sepia.ceph.com/doku.php?id=hardware:smithi
|
||||
.. _teuthology installation guide: https://docs.ceph.com/projects/teuthology/en/latest/INSTALL.html#installation-and-setup
|
||||
.. _Filter Tests by their Description: ../tests-integration-testing-teuthology-intro/#filtering-tests-by-their-description
|
||||
.. _developer playground machine: https://wiki.sepia.ceph.com/doku.php?id=devplayground
|
||||
|
143
ceph/doc/dev/developer_guide/tests-windows.rst
Normal file
@ -0,0 +1,143 @@
|
||||
.. _dev-testing-windows:
|
||||
|
||||
=================
|
||||
Testing - Windows
|
||||
=================
|
||||
|
||||
Since Pacific, the Ceph client tools and libraries can be natively used on
|
||||
Windows. This allows Windows nodes to consume Ceph without additional layers
|
||||
such as iSCSI gateways or SMB shares.
|
||||
|
||||
A significant amount of unit tests and integration tests were ported in order
|
||||
to ensure that these components continue to function properly on Windows.
|
||||
|
||||
Windows CI Job
|
||||
==============
|
||||
|
||||
The `Windows CI job`_ performs the following steps for each GitHub pull request:
|
||||
|
||||
* spin up a Linux VM in which to build the server-side (Linux) Ceph binaries
|
||||
and cross-compile the Windows (client) binaries.
|
||||
* recreate the Linux VM and start a Ceph vstart cluster
|
||||
* boot a Windows VM and run the Ceph tests there
|
||||
|
||||
`A small PowerShell framework`_ parallelizes the tests, aggregates the results
|
||||
and isolates or skips certain tests that are known to be flaky.
|
||||
|
||||
The console output can contain compilation errors as well as the name of the
|
||||
tests that failed. To get the console output of the failing tests as well as
|
||||
Ceph and operating system logs, please check the build artifacts from the
|
||||
Jenkins "Status" page.
|
||||
|
||||
.. image:: ../../images/windows_ci_status_page.png
|
||||
:align: center
|
||||
|
||||
The Windows CI artifacts can be downloaded as a zip archive or viewed inside
|
||||
the browser. Click the "artifacts" button to see the contents of the artifacts
|
||||
folder.
|
||||
|
||||
.. image:: ../../images/windows_ci_artifacts.png
|
||||
:align: center
|
||||
|
||||
Artifact contents:
|
||||
|
||||
* ``client/`` - Ceph client-side logs (Windows)
|
||||
* ``eventlog/`` - Windows system logs
|
||||
* ``logs/`` - Ceph logs
|
||||
* ``-windows.conf`` - Ceph configuration file
|
||||
* ``cluster/`` - Ceph server-side logs (Linux)
|
||||
* ``ceph_logs/``
|
||||
* ``journal``
|
||||
* ``test_results/``
|
||||
* ``out/`` - raw and xml test output grouped by the test executable
|
||||
* ``test_results.html`` - aggregated test report (html)
|
||||
* ``test_results.txt`` - aggregated test report (plaintext)
|
||||
|
||||
We're using the `subunit`_ format and associated tools to aggregate the test
|
||||
results, which is especially handy when running a large number of tests in
|
||||
parallel.
|
||||
|
||||
The aggregated test report provides a great overview of the failing tests.
|
||||
Go to the end of the file to see the actual errors::
|
||||
|
||||
{0} unittest_mempool.mempool.bufferlist_reassign [0.000000s] ... ok
|
||||
{0} unittest_mempool.mempool.bufferlist_c_str [0.006000s] ... ok
|
||||
{0} unittest_mempool.mempool.btree_map_test [0.000000s] ... ok
|
||||
{0} ceph_test_dokan.DokanTests.test_mount [9.203000s] ... FAILED
|
||||
|
||||
Captured details:
|
||||
~~~~~~~~~~~~~~~~~
|
||||
b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:136'
|
||||
b'Expected equality of these values:'
|
||||
b' wait_for_mount(mountpoint)'
|
||||
b' Which is: -138'
|
||||
b' 0'
|
||||
b''
|
||||
b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:208'
|
||||
b'Expected equality of these values:'
|
||||
b' ret'
|
||||
b' Which is: "ceph-dokan: exit status: -22"'
|
||||
b' ""'
|
||||
b'Failed unmapping: Y:\\'
|
||||
{0} ceph_test_dokan.DokanTests.test_mount_read_only [9.140000s] ... FAILED
|
||||
|
||||
The html report conveniently groups the test results by test suite (test binary).
|
||||
For security reasons it isn't rendered by default but it can be downloaded and
|
||||
viewed locally:
|
||||
|
||||
.. image:: ../../images/windows_ci_html_report.png
|
||||
:align: center
|
||||
|
||||
Timeouts and missing test results are often an indication that a process crashed.
|
||||
Note that the ceph status is printed out on the console before and after
|
||||
performing the tests, which can help identify crashed services.
|
||||
|
||||
You may also want to check the service logs (both client and server side). Also,
|
||||
be aware that the Windows "application" event log will contain entries in case
|
||||
of crashed Windows processes.
|
||||
|
||||
Frequently asked questions
|
||||
==========================
|
||||
|
||||
1. Why is the Windows CI job the only one that fails on my PR?
|
||||
|
||||
Ceph integration tests are normally performed through Teuthology on the Ceph
|
||||
Lab infrastructure. These tests are triggered on-demand by the Ceph QA
|
||||
team and do not run automatically for every submitted pull request.
|
||||
|
||||
Since the Windows CI job focuses only on the client-side Ceph components,
|
||||
it can run various integration tests in a timely manner for every pull request
|
||||
on GitHub. **In other words, it runs various librados, librbd and libcephfs
|
||||
tests that other checks such as "make check" do not.**
|
||||
|
||||
For this reason, the Windows CI often catches regressions that are missed by the
|
||||
other checks and would otherwise only come up through Teuthology. More often
|
||||
than not, these regressions are not platform-specific and affect Linux as well.
|
||||
|
||||
In case of Windows CI failures, we strongly suggest checking the test results
|
||||
as described above.
|
||||
|
||||
Be aware that the `Windows build script`_ may use different compilation flags
|
||||
and ``-D`` options passed to CMake. For example, it defaults to ``Release`` mode
|
||||
instead of ``Debug`` mode. At the same time, it uses a different toolchain
|
||||
(``mingw-llvm``) and a separate set of `dependencies`_; make sure to bump the
versions if needed.
|
||||
|
||||
2. Why is the Windows CI job mandatory?
|
||||
|
||||
The test job was initially optional; as a result, regressions were introduced
very often.
|
||||
|
||||
After a time, Windows support became mature enough to make this CI job mandatory.
|
||||
This significantly reduces the amount of work required to address regressions
|
||||
and assures Ceph users of continued Windows support.
|
||||
|
||||
As said before, another great advantage is that it runs integration tests that
|
||||
quickly catch regressions which often affect Linux builds as well. This spares
|
||||
developers from having to wait for the full Teuthology results.
|
||||
|
||||
.. _Windows CI job: https://github.com/ceph/ceph-build/blob/main/ceph-windows-pull-requests/config/definitions/ceph-windows-pull-requests.yml
|
||||
.. _A small PowerShell framework: https://github.com/ceph/ceph-win32-tests/
|
||||
.. _Windows build script: https://github.com/ceph/ceph/blob/main/win32_build.sh
|
||||
.. _dependencies: https://github.com/ceph/ceph/blob/main/win32_deps_build.sh
|
||||
.. _subunit: https://github.com/testing-cabal/subunit
|
@ -243,6 +243,10 @@ differences:
|
||||
* All commits are cherry-picked with ``git cherry-pick -x`` to
|
||||
reference the original commit
|
||||
|
||||
.. note:: If a backport is appropriate, the submitter is responsible for
|
||||
determining appropriate target stable branches to which backports must be
|
||||
made.
|
||||
|
||||
See `the backporter manual
|
||||
<http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO>`_ for more
|
||||
information.
|
||||
|
@ -25,7 +25,7 @@ Concepts
|
||||
a temporary placement group acting set that is used while backfilling the
|
||||
primary OSD. Assume that the acting set is ``[0,1,2]`` and we are
|
||||
``active+clean``. Now assume that something happens and the acting set
|
||||
becomes ``[2,1,2]``. Under these circumstances, OSD ``3`` is empty and can't
|
||||
becomes ``[3,1,2]``. Under these circumstances, OSD ``3`` is empty and can't
|
||||
serve reads even though it is the primary. ``osd.3`` will respond by
|
||||
requesting a *PG temp* of ``[1,2,3]`` to the monitors using a ``MOSDPGTemp``
|
||||
message, and ``osd.1`` will become the primary temporarily. ``osd.1`` will
|
||||
|
@ -152,8 +152,8 @@ First release candidate
|
||||
=======================
|
||||
|
||||
- [x] src/ceph_release: change type to `rc`
|
||||
- [ ] opt-in to all telemetry channels, generate telemetry reports, and verify no sensitive details (like pools names) are collected
|
||||
- [ ] check if new pool flags exist in pg_pool_t (osd/osd_types.h), and add them to telemetry's basic_pool_flags collection, in case they are not sensitive
|
||||
- [x] opt-in to all telemetry channels, generate telemetry reports, and verify no sensitive details (like pools names) are collected
|
||||
- [x] check if new pool flags exist in pg_pool_t (osd/osd_types.h), and add them to telemetry's basic_pool_flags collection, in case they are not sensitive
|
||||
|
||||
|
||||
First stable release
|
||||
@ -162,3 +162,5 @@ First stable release
|
||||
- [x] src/ceph_release: change type `stable`
|
||||
- [ ] generate new object corpus for encoding/decoding tests - see :doc:`corpus`
|
||||
- [ ] src/cephadm/cephadmlib/constants.py: update `LATEST_STABLE_RELEASE`
|
||||
- [x] activate latest release in readthedocs, as described in `the readthedocs
|
||||
documentation <https://docs.readthedocs.io/en/stable/versions.html>`_
|
||||
|
@ -133,7 +133,9 @@ See `the Ceph Tracker wiki page that explains how to write the release notes <ht
|
||||
|
||||
#. Obtain the sha1 of the version commit from the `build job <https://jenkins.ceph.com/view/all/job/ceph>`_ or the ``sha1`` file created by the `ceph-setup <https://jenkins.ceph.com/job/ceph-setup/>`_ job.
|
||||
|
||||
#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted.
|
||||
#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted. Note: this step will also run a command to transfer the
|
||||
source tarballs from chacra.ceph.com to download.ceph.com directly, by
|
||||
ssh'ing to download.ceph.com and running /home/signer/bin/get-tarballs.sh.
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -207,19 +209,63 @@ See `the Ceph Tracker wiki page that explains how to write the release notes <ht
|
||||
|
||||
sync-push ceph octopus
|
||||
|
||||
This leaves the packages in a password-protected prerelease area
|
||||
at https://download.ceph.com/prerelease/ceph. Verify them from there.
|
||||
When done and ready for release, mv the directories to the release
|
||||
directory (that is, "mv <whatever you're promoting> ../..".
|
||||
This leaves the packages, and the tarball, in a password-protected
|
||||
prerelease area at https://download.ceph.com/prerelease/ceph. Verify them
|
||||
from there. When done and ready for release, log into download.ceph.com and
|
||||
mv the directories and the tarballs from the prerelease home
|
||||
(/data/download.ceph.com/www/prerelease/ceph) to the release directory
|
||||
(/data/download.ceph.com/www).
|
||||
|
||||
|
||||
5. Build Containers
|
||||
===================
|
||||
|
||||
Start the following two jobs:
|
||||
Architecture-specific containers are built during the ceph build and
|
||||
pushed to quay.ceph.io/ceph/prerelease-{amd64,arm64}, containing the
|
||||
packages built in that ceph build. The prerelease 'fat' container,
|
||||
or manifest-list container, that refers to both arch-specific containers,
|
||||
is built by hand using the command "make-manifest-list.py" in
|
||||
ceph.git:src/container/make-manifest-list.py. Note that you must
|
||||
be logged into the appropriate container repos for any of these
|
||||
manipulations: quay.ceph.io for fetching prerelease arch-specific
|
||||
containers and pushing the prerelease manifest-list container, and
|
||||
quay.io for promoting the prerelease containers to released containers.
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
cd <ceph-checkout>/src/container
|
||||
./make-manifest-list.py
|
||||
|
||||
Reasonable defaults are set for all inputs, but environment variables
|
||||
can be used to override:
|
||||
|
||||
* ARCH_SPECIFIC_HOST (default 'quay.ceph.io'): host of prerelease repos
|
||||
* AMD64_REPO (default 'ceph/prerelease-amd64') prerelease amd64 repo
|
||||
* ARM64_REPO (default 'ceph/prerelease-arm64') prerelease arm64 repo
|
||||
|
||||
(prerelease arch-specific containers will be copied from here)
|
||||
|
||||
* MANIFEST_HOST (default 'quay.ceph.io') prerelease manifest-list host
|
||||
* MANIFEST_REPO (default 'ceph/prerelease') prerelease manifest-list repo
|
||||
|
||||
(prerelease manifest-list containers will be placed here)
|
||||
|
||||
Finally, when all appropriate testing/ verification is done on the
|
||||
container images, you can use make-manifest-list.py to promote them to
|
||||
their final release location on quay.io/ceph/ceph:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
cd <ceph-checkout>/src/container
|
||||
./make-manifest-list.py --promote
|
||||
|
||||
Two more environment variables can override the default destination for
|
||||
promotion (the source of the prerelease container to be promoted is
|
||||
as above, in MANIFEST_HOST/REPO):
|
||||
|
||||
* RELEASE_MANIFEST_HOST (default 'quay.io') release host
|
||||
* RELEASE_MANIFEST_REPO (default 'ceph/ceph') release repo
|
||||
|
||||
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs/
|
||||
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs-arm64/
|
||||
|
||||
6. Announce the Release
|
||||
=======================
|
||||
|
@ -42,6 +42,11 @@
|
||||
Ceph is a distributed network storage and file system with
|
||||
distributed metadata management and POSIX semantics.
|
||||
|
||||
`ceph-ansible <https://docs.ceph.com/projects/ceph-ansible/en/latest/index.html>`_
|
||||
A GitHub repository, supported from the Jewel release to the
|
||||
Quincy release, that facilitates the installation of a Ceph
|
||||
cluster.
|
||||
|
||||
Ceph Block Device
|
||||
Also called "RADOS Block Device" and :term:`RBD`. A software
|
||||
instrument that orchestrates the storage of block-based data in
|
||||
@ -256,6 +261,21 @@
|
||||
Another name for :term:`Dashboard`.
|
||||
|
||||
Dashboard Plugin
|
||||
The dashboard plugin was a Mimic-era web application that
|
||||
visualized information and statistics about the Ceph cluster
|
||||
using a web server hosted by the :ref:`Ceph
|
||||
Manager<ceph-manager-daemon>`.
|
||||
|
||||
See `the Mimic-era Dashboard Plugin documentation
|
||||
<https://docs.ceph.com/en/mimic/mgr/dashboard/>`_.
|
||||
|
||||
DC
|
||||
**D**\ata **C**\enter.
|
||||
|
||||
Flapping OSD
|
||||
An OSD that is repeatedly marked ``up`` and then ``down`` in
|
||||
rapid succession. See :ref:`rados_tshooting_flapping_osd`.
|
||||
|
||||
FQDN
|
||||
**F**\ully **Q**\ualified **D**\omain **N**\ame. A domain name
|
||||
that is applied to a node in a network and that specifies the
|
||||
@ -315,6 +335,12 @@
|
||||
Node
|
||||
See :term:`Ceph Node`.
|
||||
|
||||
Object Storage
|
||||
Object storage is one of three kinds of storage relevant to
|
||||
Ceph. The other two kinds of storage relevant to Ceph are file
|
||||
storage and block storage. Object storage is the category of
|
||||
storage most fundamental to Ceph.
|
||||
|
||||
Object Storage Device
|
||||
See :term:`OSD`.
|
||||
|
||||
@ -350,6 +376,9 @@
|
||||
mid-2010s to insist that "OSD" should refer to "Object Storage
|
||||
Device", so it is important to know which meaning is intended.
|
||||
|
||||
OSD, flapping
|
||||
See :term:`Flapping OSD`.
|
||||
|
||||
OSD FSID
|
||||
The OSD fsid is a unique identifier that is used to identify an
|
||||
OSD. It is found in the OSD path in a file called ``osd_fsid``.
|
||||
@ -384,7 +413,15 @@
|
||||
placement group, and each placement group belongs to exactly
|
||||
one Ceph pool.
|
||||
|
||||
PLP
|
||||
**P**\ower **L**\oss **P**\rotection. A technology that
|
||||
protects the data of solid-state drives by using capacitors to
|
||||
extend the amount of time available for transferring data from
|
||||
the DRAM cache to the SSD's permanent memory. Consumer-grade
|
||||
SSDs are rarely equipped with PLP.
|
||||
|
||||
:ref:`Pool<rados_pools>`
|
||||
|
||||
A pool is a logical partition used to store objects.
|
||||
|
||||
Pools
|
||||
|
@ -21,14 +21,56 @@ Bodies
|
||||
Ceph Executive Council
|
||||
======================
|
||||
|
||||
Responsibilities
|
||||
----------------
|
||||
.. _exec-council-responsibilities:
|
||||
|
||||
* Arbiter in cases where decisions cannot be reached by consensus
|
||||
* Distribute key responsibilities amongst themselves or others
|
||||
* Point of contact for the project
|
||||
* Representatives for Ceph foundation board meetings
|
||||
* Ensure things get done
|
||||
Ceph Executive Council Responsibilities
|
||||
---------------------------------------
|
||||
|
||||
- Spokesperson
|
||||
|
||||
- welcome/keynote for cephalocon
|
||||
|
||||
- maintaining slides and presenting about the project
|
||||
|
||||
- Community focal point (user interaction, conference talks, mailing list,
|
||||
etc)
|
||||
|
||||
- Community
|
||||
|
||||
- managing community manager
|
||||
|
||||
- LF Program Manager person, Social Media person
|
||||
|
||||
- liaise with the ambassadors
|
||||
|
||||
- make sure ceph events happen, successfully: cephalocon, ceph days, cds, user/dev, cdm
|
||||
|
||||
- coordinating with LF
|
||||
|
||||
- creating program committee
|
||||
|
||||
- recordings on youtube
|
||||
|
||||
- getting sponsors for events
|
||||
|
||||
- communications, schedule, venue decisions
|
||||
|
||||
- coordinate blog posts
|
||||
|
||||
|
||||
- Ceph Foundation
|
||||
|
||||
- ensure foundation is healthy: financials, operations
|
||||
|
||||
- represent the CLT on the Board
|
||||
|
||||
- present project status regularly (yearly)
|
||||
|
||||
- collect member ideas / feedback
|
||||
|
||||
- ensure members feel valued
|
||||
|
||||
- guide the members how to support the project (events, testing, marketing, hardware, ...)
|
||||
|
||||
Membership
|
||||
----------
|
||||
@ -47,7 +89,7 @@ Membership
|
||||
Current Members
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
* Dan van der Ster <daniel.vanderster@cern.ch>
|
||||
* Dan van der Ster <dan.vanderster@clyso.com>
|
||||
* Josh Durgin <jdurgin@redhat.com>
|
||||
* Neha Ojha <nojha@redhat.com>
|
||||
|
||||
@ -82,28 +124,28 @@ Current Members
|
||||
* Casey Bodley <cbodley@redhat.com>
|
||||
* Dan van der Ster <dan.vanderster@clyso.com>
|
||||
* David Orman <ormandj@1111systems.com>
|
||||
* Ernesto Puerta <epuerta@redhat.com>
|
||||
* Ernesto Puerta <epuertat@redhat.com>
|
||||
* Gregory Farnum <gfarnum@redhat.com>
|
||||
* Haomai Wang <haomai@xsky.com>
|
||||
* Ilya Dryomov <idryomov@redhat.com>
|
||||
* Igor Fedotov <igor.fedotov@croit.io>
|
||||
* Jeff Layton <jlayton@redhat.com>
|
||||
* Josh Durgin <jdurgin@redhat.com>
|
||||
* João Eduardo Luis <joao@suse.de>
|
||||
* João Eduardo Luis <joao@clyso.com>
|
||||
* Ken Dreyer <kdreyer@redhat.com>
|
||||
* Mark Nelson <mnelson@redhat.com>
|
||||
* Mark Nelson <mark.nelson@clyso.com>
|
||||
* Matt Benjamin <mbenjami@redhat.com>
|
||||
* Mike Perez <miperez@redhat.com>
|
||||
* Myoungwon Oh <myoungwon.oh@samsung.com>
|
||||
* Neha Ojha <nojha@redhat.com>
|
||||
* Patrick Donnelly <pdonnell@redhat.com>
|
||||
* Patrick Donnelly <pdonnell@ibm.com>
|
||||
* Sam Just <sjust@redhat.com>
|
||||
* Vikhyat Umrao <vikhyat@redhat.com>
|
||||
* Xie Xingguo <xie.xingguo@zte.com.cn>
|
||||
* Yehuda Sadeh <yehuda@redhat.com>
|
||||
* Yingxin Cheng <yingxin.cheng@intel.com>
|
||||
* Yuri Weinstein <yweinste@redhat.com>
|
||||
* Zac Dover <zac.dover@gmail.com>
|
||||
* Zac Dover <zac.dover@proton.me>
|
||||
|
||||
.. _ctl:
|
||||
|
||||
|
BIN
ceph/doc/images/windows_ci_artifacts.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 31 KiB |
BIN
ceph/doc/images/windows_ci_html_report.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 56 KiB |
BIN
ceph/doc/images/windows_ci_status_page.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 29 KiB |
@ -94,7 +94,7 @@ about Ceph, see our `Architecture`_ section.
|
||||
.. _Ceph Object Store: radosgw
|
||||
.. _Ceph Block Device: rbd
|
||||
.. _Ceph File System: cephfs
|
||||
.. _Getting Started: install
|
||||
.. _Getting Started: start
|
||||
.. _Architecture: architecture
|
||||
|
||||
.. toctree::
|
||||
|
@ -475,7 +475,7 @@ thread on the ceph-users mailing list
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph auth get-or-create client.short-hostname-of-rgw mon 'allow rw' osd 'allow rwx'
|
||||
ceph auth get-or-create client.$(hostname -s) mon 'allow rw' osd 'allow rwx'
|
||||
|
||||
#. On one of the RGW nodes, do the following:
|
||||
|
||||
|
@ -85,3 +85,4 @@ Further reading
|
||||
.. _Windows troubleshooting: ../windows-troubleshooting
|
||||
.. _General CephFS Prerequisites: ../../cephfs/mount-prerequisites
|
||||
.. _Client Authentication: ../../cephfs/client-auth
|
||||
.. _Windows testing: ../dev/tests-windows
|
||||
|
@ -14,8 +14,8 @@ BASIC ARCHITECTURE AND TERMINOLOGY
|
||||
Protocol. The agent is meant to be placed on the same host as the
|
||||
instrumented application. (The Jaeger agent acts like a sidecar listener.)
|
||||
* JAEGER COLLECTOR: A daemon that receives spans sent by the Jaeger agent. The
|
||||
Jaeger collector then stitches the spans together to form a trace. (A databse
|
||||
can be enabled to persist a database for these traces).
|
||||
Jaeger collector then stitches the spans together to form a trace. (A database
|
||||
can be enabled to persist these traces).
|
||||
* JAEGER QUERY AND CONSOLE FRONTEND: The UI-based frontend that presents
|
||||
reports of the jaeger traces. Accessible at http://<jaeger frontend host>:16686.
|
||||
|
||||
|
@ -29,6 +29,7 @@ Synopsis
|
||||
| **ceph-bluestore-tool** free-dump|free-score --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
|
||||
| **ceph-bluestore-tool** reshard --path *osd path* --sharding *new sharding* [ --sharding-ctrl *control string* ]
|
||||
| **ceph-bluestore-tool** show-sharding --path *osd path*
|
||||
| **ceph-bluestore-tool** zap-device --dev *dev path*
|
||||
|
||||
|
||||
Description
|
||||
@ -93,19 +94,22 @@ Commands
|
||||
|
||||
:command:`bluefs-bdev-migrate` --dev-target *new-device* --devs-source *device1* [--devs-source *device2*]
|
||||
|
||||
Moves BlueFS data from source device(s) to the target one, source devices
|
||||
(except the main one) are removed on success. Target device can be both
|
||||
already attached or new device. In the latter case it's added to OSD
|
||||
replacing one of the source devices. Following replacement rules apply
|
||||
(in the order of precedence, stop on the first match):
|
||||
Moves BlueFS data from source device(s) to the target device. Source devices
|
||||
(except the main one) are removed on success. Expands the target storage
|
||||
(updates the size label), making "bluefs-bdev-expand" unnecessary. The
|
||||
target device can be either a new device or a device that is already
|
||||
attached. If the device is a new device, it is added to the OSD replacing
|
||||
one of the source devices. The following replacement rules apply (in the
|
||||
order of precedence, stop on the first match):
|
||||
|
||||
- if source list has DB volume - target device replaces it.
|
||||
- if source list has WAL volume - target device replace it.
|
||||
- if source list has slow volume only - operation isn't permitted, requires explicit allocation via new-db/new-wal command.
|
||||
- if the source list has DB volume - the target device replaces it.
|
||||
- if the source list has WAL volume - the target device replaces it.
|
||||
- if the source list has slow volume only - the operation isn't permitted and requires explicit allocation via a new-DB/new-WAL command.
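For example, moving the DB volume of OSD 0 to a new device might look like
this. It is only a sketch; the OSD path and the target device are
placeholders::

    ceph-bluestore-tool bluefs-bdev-migrate --path /var/lib/ceph/osd/ceph-0 \
        --devs-source /var/lib/ceph/osd/ceph-0/block.db --dev-target /dev/vg0/new-db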
|
||||
|
||||
:command:`show-label` --dev *device* [...]
|
||||
|
||||
Show device label(s).
|
||||
Show device label(s).
|
||||
The label may be printed while an OSD is running.
|
||||
|
||||
:command:`free-dump` --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
|
||||
|
||||
@ -131,6 +135,10 @@ Commands
|
||||
|
||||
Show sharding that is currently applied to BlueStore's RocksDB.
|
||||
|
||||
:command:`zap-device` --dev *dev path*
|
||||
|
||||
Zeros all device label locations. This effectively makes the device appear empty.
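A sketch of the invocation (the device path is a placeholder; this is
destructive and should only be run on a device you intend to wipe)::

    ceph-bluestore-tool zap-device --dev /dev/vg0/osd-block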
|
||||
|
||||
Options
|
||||
=======
|
||||
|
||||
@ -192,8 +200,8 @@ Useful to provide necessary configuration options when access to monitor/ceph.co
|
||||
Device labels
|
||||
=============
|
||||
|
||||
Every BlueStore block device has a single block label at the beginning of the
|
||||
device. You can dump the contents of the label with::
|
||||
Every BlueStore block device has a block label at the beginning of the device.
|
||||
You can dump the contents of the label with::
|
||||
|
||||
ceph-bluestore-tool show-label --dev *device*
|
||||
|
||||
@ -201,6 +209,10 @@ The main device will have a lot of metadata, including information
|
||||
that used to be stored in small files in the OSD data directory. The
|
||||
auxiliary devices (db and wal) will only have the minimum required
|
||||
fields (OSD UUID, size, device type, birth time).
|
||||
The main device contains additional label copies at offsets: 1G, 10G, 100G and 1000G.
|
||||
Corrupted labels are fixed as part of repair::
|
||||
|
||||
ceph-bluestore-tool repair --dev *device*
|
||||
|
||||
OSD directory priming
|
||||
=====================
|
||||
|
@ -56,7 +56,7 @@ A sub-directory of the file system can be mounted by specifying the (absolute)
|
||||
path to the sub-directory right after "=" in the device part of the mount command.
|
||||
|
||||
Mount helper application conventions dictate that the first two options are
|
||||
device to be mounted and the mountpoint for that device. Options must be
|
||||
device to be mounted and the mount point for that device. Options must be
|
||||
passed only after these fixed arguments.
|
||||
|
||||
|
||||
|
@ -11,7 +11,7 @@ Synopsis
|
||||
|
||||
| **mount.fuse.ceph** [-h] [-o OPTIONS [*OPTIONS* ...]]
|
||||
device [*device* ...]
|
||||
mountpoint [*mountpoint* ...]
|
||||
mountpoint [*mount point* ...]
|
||||
|
||||
Description
|
||||
===========
|
||||
|
@ -476,26 +476,19 @@ as follows:
|
||||
Cancel resharding a bucket
|
||||
|
||||
:command:`topic list`
|
||||
List bucket notifications/pubsub topics
|
||||
List bucket notifications topics
|
||||
|
||||
:command:`topic get`
|
||||
Get a bucket notifications/pubsub topic
|
||||
|
||||
Get a bucket notification topic
|
||||
|
||||
:command:`topic rm`
|
||||
Remove a bucket notifications/pubsub topic
|
||||
Remove a bucket notifications topic
|
||||
|
||||
:command:`subscription get`
|
||||
Get a pubsub subscription definition
|
||||
|
||||
:command:`subscription rm`
|
||||
Remove a pubsub subscription
|
||||
|
||||
:command:`subscription pull`
|
||||
Show events in a pubsub subscription
|
||||
|
||||
:command:`subscription ack`
|
||||
Acknowledge (remove) events in a pubsub subscription
|
||||
:command:`topic stats`
|
||||
Get a bucket notifications persistent topic stats (i.e. reservations, entries & size)
|
||||
|
||||
:command:`topic dump`
|
||||
Dump (in JSON format) all pending bucket notifications of a persistent topic
|
||||
|
||||
Options
|
||||
=======
|
||||
|
@ -520,28 +520,28 @@ Commands
|
||||
Show RBD mirroring status for an image.
|
||||
|
||||
:command:`mirror pool demote` [*pool-name*]
|
||||
Demote all primary images within a pool to non-primary.
|
||||
Every mirror-enabled image in the pool will be demoted.
|
||||
Demote all primary images within a pool or namespace to non-primary.
|
||||
Every mirror-enabled image in the pool or namespace will be demoted.
|
||||
|
||||
:command:`mirror pool disable` [*pool-name*]
|
||||
Disable RBD mirroring by default within a pool. When mirroring
|
||||
is disabled on a pool in this way, mirroring will also be
|
||||
disabled on any images (within the pool) for which mirroring
|
||||
was enabled explicitly.
|
||||
Disable RBD mirroring within a pool or namespace. When mirroring
|
||||
is disabled on a pool or namespace in this way, mirroring will also be
|
||||
disabled on all images (within the pool or namespace) for which mirroring
|
||||
was enabled, whether by default or explicitly.
|
||||
|
||||
:command:`mirror pool enable` [*pool-name*] *mode*
|
||||
Enable RBD mirroring by default within a pool.
|
||||
Enable RBD mirroring within a pool or namespace.
|
||||
The mirroring mode can either be ``pool`` or ``image``.
|
||||
If configured in ``pool`` mode, all images in the pool
|
||||
If configured in ``pool`` mode, all images in the pool or namespace
|
||||
with the journaling feature enabled are mirrored.
|
||||
If configured in ``image`` mode, mirroring needs to be
|
||||
explicitly enabled (by ``mirror image enable`` command)
|
||||
on each image.
|
||||
|
||||
:command:`mirror pool info` [*pool-name*]
|
||||
Show information about the pool mirroring configuration.
|
||||
It includes mirroring mode, peer UUID, remote cluster name,
|
||||
and remote client name.
|
||||
Show information about the pool or namespace mirroring configuration.
|
||||
For a pool, it includes mirroring mode, peer UUID, remote cluster name,
|
||||
and remote client name. For a namespace, it includes only mirroring mode.
|
||||
|
||||
:command:`mirror pool peer add` [*pool-name*] *remote-cluster-spec*
|
||||
Add a mirroring peer to a pool.
|
||||
@ -561,13 +561,13 @@ Commands
|
||||
is corresponding to remote client name or remote cluster name.
|
||||
|
||||
:command:`mirror pool promote` [--force] [*pool-name*]
|
||||
Promote all non-primary images within a pool to primary.
|
||||
Every mirror-enabled image in the pool will be promoted.
|
||||
Promote all non-primary images within a pool or namespace to primary.
|
||||
Every mirror-enabled image in the pool or namespace will be promoted.
|
||||
|
||||
:command:`mirror pool status` [--verbose] [*pool-name*]
|
||||
Show status for all mirrored images in the pool.
|
||||
Show status for all mirrored images in the pool or namespace.
|
||||
With ``--verbose``, show additional output status
|
||||
details for every mirror-enabled image in the pool.
|
||||
details for every mirror-enabled image in the pool or namespace.
|
||||
|
||||
:command:`mirror snapshot schedule add` [-p | --pool *pool*] [--namespace *namespace*] [--image *image*] *interval* [*start-time*]
|
||||
Add mirror snapshot schedule.
|
||||
|
@ -1441,9 +1441,9 @@ commands:
|
||||
|
||||
/var/log/ceph/$cluster-$name.log
|
||||
|
||||
#. Ensure the SSL/TSL support is configured properly:
|
||||
#. Ensure the SSL/TLS support is configured properly:
|
||||
|
||||
* Check if the SSL/TSL support is enabled:
|
||||
* Check if the SSL/TLS support is enabled:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
|
@ -283,7 +283,7 @@ Create CephFS Export
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...] [--cmount_path <value>]
|
||||
|
||||
This creates export RADOS objects containing the export block, where
|
||||
|
||||
@ -318,9 +318,16 @@ values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
|
||||
server will negotiate a supported security type with the client, preferring
|
||||
the supplied methods left-to-right.
|
||||
|
||||
``<cmount_path>`` specifies the path within the CephFS to mount this export on. It is
allowed to be any complete path hierarchy between ``/`` and the ``EXPORT {path}``
(for example, if the ``EXPORT { Path }`` parameter is ``/foo/bar``, then ``cmount_path`` can be ``/``, ``/foo``, or ``/foo/bar``).
|
||||
|
||||
.. note:: If this and the other ``EXPORT { FSAL {} }`` options are the same between multiple exports, those exports will share a single CephFS client.
|
||||
If not specified, the default is ``/``.
|
||||
|
||||
.. note:: Specifying values for sectype that require Kerberos will only function on servers
|
||||
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
|
||||
is outside the scope of this document.
|
||||
can be found here `Kerberos setup for NFS Ganesha in Ceph <https://github.com/nfs-ganesha/nfs-ganesha/wiki/Kerberos-setup-for-NFS-Ganesha-in-Ceph>`_.
|
||||
|
||||
|
||||
.. note:: Export creation is supported only for NFS Ganesha clusters deployed using nfs interface.
|
||||
|
||||
@ -477,9 +484,9 @@ For example,::
|
||||
],
|
||||
"fsal": {
|
||||
"name": "CEPH",
|
||||
"user_id": "nfs.mynfs.1",
|
||||
"fs_name": "a",
|
||||
"sec_label_xattr": ""
|
||||
"sec_label_xattr": "",
|
||||
"cmount_path": "/"
|
||||
},
|
||||
"clients": []
|
||||
}
|
||||
@ -494,6 +501,9 @@ as when creating a new export), with the exception of the
|
||||
authentication credentials, which will be carried over from the
|
||||
previous state of the export where possible.
|
||||
|
||||
!! NOTE: The ``user_id`` in the ``fsal`` block should not be modified or mentioned in the JSON file as it is auto-generated for CephFS exports.
|
||||
It's auto-generated in the format ``nfs.<cluster_id>.<fs_name>.<hash_id>``.
|
||||
|
||||
::
|
||||
|
||||
$ ceph nfs export apply mynfs -i update_cephfs_export.json
|
||||
@ -514,9 +524,9 @@ previous state of the export where possible.
|
||||
],
|
||||
"fsal": {
|
||||
"name": "CEPH",
|
||||
"user_id": "nfs.mynfs.1",
|
||||
"fs_name": "a",
|
||||
"sec_label_xattr": ""
|
||||
"sec_label_xattr": "",
|
||||
"cmount_path": "/"
|
||||
},
|
||||
"clients": []
|
||||
}
|
||||
|
@ -77,6 +77,19 @@ If the port is not configured, *restful* will bind to port ``8003``.
|
||||
If the address it not configured, the *restful* will bind to ``::``,
|
||||
which corresponds to all available IPv4 and IPv6 addresses.
|
||||
|
||||
Configuring max_requests
------------------------
|
||||
|
||||
The maximum request size can be configured via a central configuration
|
||||
option::
|
||||
|
||||
ceph config set mgr mgr/restful/$name/max_requests $NUM
|
||||
|
||||
where ``$name`` is the ID of the ceph-mgr daemon (usually the hostname).
|
||||
|
||||
.. mgr_module:: restful
|
||||
.. confval:: max_requests
|
||||
|
||||
.. _creating-an-api-user:
|
||||
|
||||
Creating an API User
|
||||
|
@ -355,7 +355,7 @@ invoking methods of the `Ioctx` and other classes.
|
||||
.. --------------
|
||||
|
||||
.. The Ceph Storage Cluster allows you to make a snapshot of a pool's state.
|
||||
.. Whereas, basic pool operations only require a connection to the cluster,
|
||||
.. Although basic pool operations require only a connection to the cluster,
|
||||
.. snapshots require an I/O context.
|
||||
|
||||
.. Ioctx.create_snap(self, snap_name)
|
||||
|
@ -179,7 +179,7 @@ Naming Clusters (deprecated)
|
||||
|
||||
Each Ceph cluster has an internal name. This internal name is used as part of
|
||||
configuration, and as part of "log file" names as well as part of directory
|
||||
names and as part of mountpoint names. This name defaults to "ceph". Previous
|
||||
names and as part of mount point names. This name defaults to "ceph". Previous
|
||||
releases of Ceph allowed one to specify a custom name instead, for example
|
||||
"ceph2". This option was intended to facilitate the running of multiple logical
|
||||
clusters on the same physical hardware, but in practice it was rarely
|
||||
|
@ -164,6 +164,60 @@ parameters. This profile should be used with caution and is meant for advanced
|
||||
users, who understand mclock and Ceph related configuration options.
|
||||
|
||||
|
||||
.. index:: mclock; shard config for HDD clusters
|
||||
|
||||
.. _mclock-hdd-cfg:
|
||||
|
||||
OSD Shard Configuration For HDD Based Clusters With mClock
|
||||
==========================================================
|
||||
Each OSD is configured with one or more shards to perform tasks. Each shard
|
||||
comprises a unique queue to handle various types of OSD specific operations
|
||||
like client I/O, recovery, scrub and so on. The scheduling of these operations
|
||||
in the queue is performed by a scheduler - in this case the mClock scheduler.
|
||||
|
||||
For HDD based OSDs, the number of shards is controlled by configuration
|
||||
:confval:`osd_op_num_shards_hdd`. Items are queued and dequeued by one or
|
||||
more worker threads and this is controlled by configuration
|
||||
:confval:`osd_op_num_threads_per_shard_hdd`.
|
||||
|
||||
As described in :ref:`dmclock-qos-caveats`, the number of OSD shards employed
|
||||
determines the impact of mClock queue. In general, a lower number of shards
|
||||
increases the impact of mClock queues with respect to scheduling accuracy.
|
||||
This is provided that there are enough worker threads per shard
to help process the items in the mClock queue.
|
||||
|
||||
Based on tests performed at scale with small objects in the range
|
||||
[1 KiB - 256 KiB] on a HDD based cluster (192 OSDs, 8 nodes,
|
||||
150 Million objects), it was found that scheduling with mClock was not optimal
|
||||
with multiple OSD shards. For example, in this cluster with multiple OSD node
|
||||
failures, the client throughput was found to be inconsistent across test runs
|
||||
coupled with multiple reported slow requests. For more details
|
||||
see https://tracker.ceph.com/issues/66289. With multiple shards, the situation
|
||||
was exacerbated when MAX limit was allocated to both client and background
|
||||
recovery class of operations. During the OSD failure phase, since both client
|
||||
and recovery ops were in direct competition to utilize the full bandwidth of
|
||||
OSDs, there was no predictability with respect to the throughput of either
|
||||
class of services.
|
||||
|
||||
However, the same test with a single OSD shard and with multiple worker threads
|
||||
yielded significantly better results in terms of consistency of client and
|
||||
recovery throughput across multiple test runs. Please refer to the tracker
|
||||
above for more details. For sanity, the same test executed using this shard
|
||||
configuration with large objects in the range [1 MiB - 256 MiB] yielded similar
|
||||
results.
|
||||
|
||||
Therefore, as an interim measure until the issue with multiple OSD shards
|
||||
(or multiple mClock queues per OSD) is investigated and fixed, the following
|
||||
change to the default HDD OSD shard configuration is made:
|
||||
|
||||
+---------------------------------------------+------------------+----------------+
|
||||
| Config Option | Old Default | New Default |
|
||||
+=============================================+==================+================+
|
||||
| :confval:`osd_op_num_shards_hdd` | 5 | 1 |
|
||||
+---------------------------------------------+------------------+----------------+
|
||||
| :confval:`osd_op_num_threads_per_shard_hdd` | 1 | 5 |
|
||||
+---------------------------------------------+------------------+----------------+
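The values in effect can be inspected through the central configuration store,
as in the sketch below. Note that these options are generally read at OSD
start-up, so changing them usually requires an OSD restart.

.. prompt:: bash $

   ceph config get osd osd_op_num_shards_hdd
   ceph config get osd osd_op_num_threads_per_shard_hdd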
|
||||
|
||||
.. index:: mclock; built-in profiles
|
||||
|
||||
mClock Built-in Profiles - Locked Config Options
|
||||
@ -694,6 +748,8 @@ mClock Config Options
|
||||
.. confval:: osd_mclock_skip_benchmark
|
||||
.. confval:: osd_mclock_override_recovery_settings
|
||||
.. confval:: osd_mclock_iops_capacity_threshold_hdd
|
||||
.. confval:: osd_mclock_iops_capacity_low_threshold_hdd
|
||||
.. confval:: osd_mclock_iops_capacity_threshold_ssd
|
||||
.. confval:: osd_mclock_iops_capacity_low_threshold_ssd
|
||||
|
||||
.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf
|
||||
|
@ -113,7 +113,7 @@ Consistency
|
||||
When you add monitor settings to your Ceph configuration file, you need to be
|
||||
aware of some of the architectural aspects of Ceph Monitors. **Ceph imposes
|
||||
strict consistency requirements** for a Ceph monitor when discovering another
|
||||
Ceph Monitor within the cluster. Whereas, Ceph Clients and other Ceph daemons
|
||||
Ceph Monitor within the cluster. Although Ceph Clients and other Ceph daemons
|
||||
use the Ceph configuration file to discover monitors, monitors discover each
|
||||
other using the monitor map (monmap), not the Ceph configuration file.
|
||||
|
||||
|
@ -63,3 +63,6 @@ to the values of the SRV weight fields.
|
||||
|
||||
For the above example, this will result in approximate 40% of the clients and daemons connecting to mon1,
|
||||
60% of them connecting to mon2. However, if neither of them is reachable, then mon3 will be reconsidered as a fallback.
|
||||
|
||||
See also `Messenger v2 <msgr2>`_.
|
||||
|
||||
|
@ -189,6 +189,9 @@ Operations
|
||||
.. confval:: osd_op_num_shards
|
||||
.. confval:: osd_op_num_shards_hdd
|
||||
.. confval:: osd_op_num_shards_ssd
|
||||
.. confval:: osd_op_num_threads_per_shard
|
||||
.. confval:: osd_op_num_threads_per_shard_hdd
|
||||
.. confval:: osd_op_num_threads_per_shard_ssd
|
||||
.. confval:: osd_op_queue
|
||||
.. confval:: osd_op_queue_cut_off
|
||||
.. confval:: osd_client_op_priority
|
||||
@ -292,6 +295,9 @@ of the current time. The ultimate lesson is that values for weight
|
||||
should not be too large. They should be under the number of requests
|
||||
one expects to be serviced each second.
|
||||
|
||||
|
||||
.. _dmclock-qos-caveats:
|
||||
|
||||
Caveats
|
||||
```````
|
||||
|
||||
@ -303,6 +309,11 @@ number of shards can be controlled with the configuration options
|
||||
:confval:`osd_op_num_shards`, :confval:`osd_op_num_shards_hdd`, and
|
||||
:confval:`osd_op_num_shards_ssd`. A lower number of shards will increase the
|
||||
impact of the mClock queues, but may have other deleterious effects.
|
||||
This is especially the case if there are insufficient shard worker
|
||||
threads. The number of shard worker threads can be controlled with the
|
||||
configuration options :confval:`osd_op_num_threads_per_shard`,
|
||||
:confval:`osd_op_num_threads_per_shard_hdd` and
|
||||
:confval:`osd_op_num_threads_per_shard_ssd`.
|
||||
|
||||
Second, requests are transferred from the operation queue to the
|
||||
operation sequencer, in which they go through the phases of
|
||||
@ -362,6 +373,8 @@ considerably. To maintain operational performance, Ceph performs this migration
|
||||
with 'backfilling', which allows Ceph to set backfill operations to a lower
|
||||
priority than requests to read or write data.
|
||||
|
||||
.. note:: Some of these settings are automatically reset if the `mClock`_
|
||||
scheduler is active, see `mClock backfill`_.
|
||||
|
||||
.. confval:: osd_max_backfills
|
||||
.. confval:: osd_backfill_scan_min
|
||||
@ -404,6 +417,9 @@ To maintain operational performance, Ceph performs recovery with limitations on
|
||||
the number of recovery requests, threads, and object chunk sizes, which allows Ceph to
perform well in a degraded state.
|
||||
|
||||
.. note:: Some of these settings are automatically reset if the `mClock`_
|
||||
scheduler is active, see `mClock backfill`_.
|
||||
|
||||
.. confval:: osd_recovery_delay_start
|
||||
.. confval:: osd_recovery_max_active
|
||||
.. confval:: osd_recovery_max_active_hdd
|
||||
@ -441,6 +457,8 @@ Miscellaneous
|
||||
.. _pool: ../../operations/pools
|
||||
.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
|
||||
.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
|
||||
.. _mClock: ../mclock-config-ref.rst
|
||||
.. _mClock backfill: ../mclock-config-ref.rst#recovery-backfill-options
|
||||
.. _Pool & PG Config Reference: ../pool-pg-config-ref
|
||||
.. _Journal Config Reference: ../journal-ref
|
||||
.. _cache target dirty high ratio: ../../operations/pools#cache-target-dirty-high-ratio
|
||||
|
@ -17,8 +17,8 @@ It's a good idea to check the capacity of your cluster so that you know when it
|
||||
approaches its capacity limits. If your cluster has reached its ``near full``
|
||||
ratio, then you should add OSDs to expand your cluster's capacity.
|
||||
|
||||
.. warning:: Do not add an OSD after your cluster has reached its ``full
|
||||
ratio``. OSD failures that occur after the cluster reaches its ``near full
|
||||
.. warning:: Do not let your cluster reach its ``full ratio`` before adding an
|
||||
OSD. OSD failures that occur after the cluster reaches its ``near full
|
||||
ratio`` might cause the cluster to exceed its ``full ratio``.
|
||||
|
||||
|
||||
|
@ -247,6 +247,18 @@ To see the status in greater detail, run the following command:
|
||||
|
||||
ceph balancer status detail
|
||||
|
||||
To enable `ceph balancer status detail`, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/update_pg_upmap_activity True
|
||||
|
||||
To disable `ceph balancer status detail`, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/update_pg_upmap_activity False
|
||||
|
||||
To evaluate the distribution that would result from executing a specific plan,
|
||||
run the following command:
|
||||
|
||||
|
@ -549,6 +549,63 @@ disable and remove it.
|
||||
|
||||
ceph osd tier remove cold-storage hot-storage
|
||||
|
||||
Troubleshooting Unfound Objects
|
||||
===============================
|
||||
Under certain circumstances, restarting OSDs may result in unfound objects.
|
||||
|
||||
Here is an example of unfound objects appearing during an upgrade from Ceph
|
||||
14.2.6 to Ceph 14.2.7::
|
||||
|
||||
2/543658058 objects unfound (0.000%)
|
||||
pg 19.12 has 1 unfound objects
|
||||
pg 19.2d has 1 unfound objects
|
||||
|
||||
Possible data damage: 2 pgs recovery_unfound
|
||||
pg 19.12 is active+recovery_unfound+undersized+degraded+remapped, acting [299,310], 1 unfound
|
||||
pg 19.2d is active+recovery_unfound+undersized+degraded+remapped, acting [290,309], 1 unfound
|
||||
|
||||
# ceph pg 19.12 list_unfound
|
||||
{
|
||||
"num_missing": 1,
|
||||
"num_unfound": 1,
|
||||
"objects": [
|
||||
{
|
||||
"oid": {
|
||||
"oid": "hit_set_19.12_archive_2020-02-25 13:43:50.256316Z_2020-02-25 13:43:50.325825Z",
|
||||
"key": "",
|
||||
"snapid": -2,
|
||||
"hash": 18,
|
||||
"max": 0,
|
||||
"pool": 19,
|
||||
"namespace": ".ceph-internal"
|
||||
},
|
||||
"need": "3312398'55868341",
|
||||
"have": "0'0",
|
||||
"flags": "none",
|
||||
"locations": []
|
||||
}
|
||||
],
|
||||
"more": false
|
||||
|
||||
Some tests in the field indicate that the unfound objects can be deleted with
|
||||
no adverse effects (see `Tracker Issue #44286, Note 3
|
||||
<https://tracker.ceph.com/issues/44286#note-3>`_). Pawel Stefanski suggests
|
||||
that deleting missing or unfound objects is safe as long as the objects are a
|
||||
part of ``.ceph-internal::hit_set_PGID_archive``.
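If, after reviewing the tracker discussion, you decide to discard such objects,
the usual way to do so is the ``mark_unfound_lost`` command. A sketch, using
the example PG above (this permanently discards the unfound objects, so use it
only when you are certain they are expendable):

.. prompt:: bash $

   ceph pg 19.12 mark_unfound_lost delete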
|
||||
|
||||
Various members of the upstream Ceph community have reported in `Tracker Issue
|
||||
#44286 <https://tracker.ceph.com/issues/44286>`_ that the following versions of
|
||||
Ceph have been affected by this issue:
|
||||
|
||||
* 14.2.8
|
||||
* 14.2.16
|
||||
* 15.2.15
|
||||
* 16.2.5
|
||||
* 17.2.7
|
||||
|
||||
See `Tracker Issue #44286 <https://tracker.ceph.com/issues/44286>`_ for the
|
||||
history of this issue.
|
||||
|
||||
|
||||
.. _Create a Pool: ../pools#create-a-pool
|
||||
.. _Pools - Set Pool Values: ../pools#set-pool-values
|
||||
|
@ -60,6 +60,24 @@ Where:
|
||||
*blaum_roth*, *liber8tion* are *RAID6* equivalents in
|
||||
the sense that they can only be configured with *m=2*.
|
||||
|
||||
.. note:: When using ``blaum_roth`` coding, the default
|
||||
word size of ``w=7`` is suboptimal because ``blaum_roth``
|
||||
works best when ``w+1`` is prime. When creating a new
|
||||
erasure-code profile with ``technique=blaum_roth``,
|
||||
set ``w`` to a number that is one integer less than a prime
|
||||
number (for example, ``6``). See `Loic Dachary's
|
||||
commit f51d21b to ceph/ceph <https://github.com/ceph/ceph/commit/f51d21b53d26d4f27c950cb1ba3f989e713ab325>`_ for information about
|
||||
why this default cannot be changed easily in the
|
||||
source code, and see `the second bullet point on
|
||||
page 29 of Plank and Greenan's "Jerasure: A Library
|
||||
in C Facilitating Erasure Coding for Storage
|
||||
Applications" <https://github.com/ceph/jerasure/blob/master/Manual.pdf>`_ for an unequivocal statement of the restriction that applies
|
||||
to ``w`` when using Blaum-Roth coding.
|
||||
(Information about the proper value of ``w`` when
|
||||
using ``blaum_roth`` coding was provided to the
|
||||
Ceph upstream in September of 2024 by Benjamin
|
||||
Mare.)
|
||||
|
||||
:Type: String
|
||||
:Required: No.
|
||||
:Default: reed_sol_van
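For example, a profile that follows the ``blaum_roth`` advice above might be
created as follows. This is only a sketch: the profile name and the ``k``
value are arbitrary, ``m=2`` is required by this technique, and ``w=6``
satisfies the "one less than a prime" rule.

.. prompt:: bash $

   ceph osd erasure-code-profile set blaumroth-profile \
       plugin=jerasure technique=blaum_roth k=4 m=2 w=6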
|
||||
|
@ -7,19 +7,18 @@
|
||||
Overview
|
||||
========
|
||||
|
||||
There is a finite set of health messages that a Ceph cluster can raise. These
|
||||
messages are known as *health checks*. Each health check has a unique
|
||||
identifier.
|
||||
There is a set of health states that a Ceph cluster can raise. These
|
||||
are known as *health checks*. Each health check has a unique identifier.
|
||||
|
||||
The identifier is a terse human-readable string -- that is, the identifier is
|
||||
readable in much the same way as a typical variable name. It is intended to
|
||||
enable tools (for example, UIs) to make sense of health checks and present them
|
||||
enable tools (for example, monitoring and UIs) to make sense of health checks and present them
|
||||
in a way that reflects their meaning.
|
||||
|
||||
This page lists the health checks that are raised by the monitor and manager
|
||||
daemons. In addition to these, you might see health checks that originate
|
||||
from MDS daemons (see :ref:`cephfs-health-messages`), and health checks
|
||||
that are defined by ``ceph-mgr`` python modules.
|
||||
daemons. In addition to these, you may see health checks that originate
|
||||
from CephFS MDS daemons (see :ref:`cephfs-health-messages`), and health checks
|
||||
that are defined by ``ceph-mgr`` modules.
|
||||
|
||||
Definitions
|
||||
===========
|
||||
@ -30,49 +29,57 @@ Monitor
|
||||
DAEMON_OLD_VERSION
|
||||
__________________
|
||||
|
||||
Warn if one or more old versions of Ceph are running on any daemons. A health
|
||||
check is raised if multiple versions are detected. This condition must exist
|
||||
for a period of time greater than ``mon_warn_older_version_delay`` (set to one
|
||||
week by default) in order for the health check to be raised. This allows most
|
||||
upgrades to proceed without the occurrence of a false warning. If the upgrade
|
||||
is paused for an extended time period, ``health mute`` can be used by running
|
||||
``ceph health mute DAEMON_OLD_VERSION --sticky``. Be sure, however, to run
|
||||
``ceph health unmute DAEMON_OLD_VERSION`` after the upgrade has finished.
|
||||
One or more Ceph daemons are running an old Ceph release. A health check is
|
||||
raised if multiple versions are detected. This condition must exist for a
|
||||
period of time greater than ``mon_warn_older_version_delay`` (set to one week
|
||||
by default) in order for the health check to be raised. This allows most
|
||||
upgrades to proceed without raising a warning that is both expected and
|
||||
ephemeral. If the upgrade is paused for an extended time, ``health mute`` can
|
||||
be used by running ``ceph health mute DAEMON_OLD_VERSION --sticky``. Be sure,
|
||||
however, to run ``ceph health unmute DAEMON_OLD_VERSION`` after the upgrade has
|
||||
finished so that any future, unexpected instances are not masked.
|
||||
|
||||
MON_DOWN
|
||||
________
|
||||
|
||||
One or more monitor daemons are currently down. The cluster requires a majority
|
||||
(more than one-half) of the monitors to be available. When one or more monitors
|
||||
are down, clients might have a harder time forming their initial connection to
|
||||
the cluster, as they might need to try more addresses before they reach an
|
||||
operating monitor.
|
||||
One or more Ceph Monitor daemons are down. The cluster requires a majority
|
||||
(more than one-half) of the provisioned monitors to be available. When one or
|
||||
more monitors are down, clients may have a harder time forming their initial
|
||||
connection to the cluster, as they may need to try additional IP addresses
|
||||
before they reach an operating monitor.
|
||||
|
||||
The down monitor daemon should be restarted as soon as possible to reduce the
|
||||
risk of a subsequent monitor failure leading to a service outage.
|
||||
Down monitor daemons should be restored or restarted as soon as possible to
|
||||
reduce the risk that an additional monitor failure may cause a service outage.
|
||||
|
||||
MON_CLOCK_SKEW
|
||||
______________
|
||||
|
||||
The clocks on the hosts running the ceph-mon monitor daemons are not
|
||||
well-synchronized. This health check is raised if the cluster detects a clock
|
||||
skew greater than ``mon_clock_drift_allowed``.
|
||||
The clocks on hosts running Ceph Monitor daemons are not well-synchronized.
|
||||
This health check is raised if the cluster detects a clock skew greater than
|
||||
``mon_clock_drift_allowed``.
|
||||
|
||||
This issue is best resolved by synchronizing the clocks by using a tool like
|
||||
``ntpd`` or ``chrony``.
|
||||
the legacy ``ntpd`` or the newer ``chrony``. It is ideal to configure NTP
|
||||
daemons to sync against multiple internal and external sources for resilience;
|
||||
the protocol will adaptively determine the best available source. It is also
|
||||
beneficial to have the NTP daemons on Ceph Monitor hosts sync against each
|
||||
other, as it is even more important that Monitors be synchronized with each
|
||||
other than it is for them to be *correct* with respect to reference time.
|
||||
|
||||
If it is impractical to keep the clocks closely synchronized, the
|
||||
``mon_clock_drift_allowed`` threshold can also be increased. However, this
|
||||
value must stay significantly below the ``mon_lease`` interval in order for the
|
||||
monitor cluster to function properly.
|
||||
``mon_clock_drift_allowed`` threshold can be increased. However, this value
|
||||
must stay significantly below the ``mon_lease`` interval in order for the
|
||||
monitor cluster to function properly. It is not difficult with a quality NTP
|
||||
or PTP configuration to have sub-millisecond synchronization, so there are
|
||||
very, very few occasions when it is appropriate to change this value.
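To see how far each monitor currently deviates from the quorum leader's clock,
the built-in time-sync report can be consulted (a sketch):

.. prompt:: bash $

   ceph time-sync-status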
|
||||
|
||||
MON_MSGR2_NOT_ENABLED
|
||||
_____________________
|
||||
|
||||
The :confval:`ms_bind_msgr2` option is enabled but one or more monitors are
|
||||
not configured to bind to a v2 port in the cluster's monmap. This
|
||||
means that features specific to the msgr2 protocol (for example, encryption)
|
||||
are unavailable on some or all connections.
|
||||
The :confval:`ms_bind_msgr2` option is enabled but one or more monitors are not
|
||||
configured in the cluster's monmap to bind to a v2 port. This means that
|
||||
features specific to the msgr2 protocol (for example, encryption) are
|
||||
unavailable on some or all connections.
|
||||
|
||||
In most cases this can be corrected by running the following command:
|
||||
|
||||
@ -85,35 +92,39 @@ port (6789) will continue to listen for v1 connections on 6789 and begin to
|
||||
listen for v2 connections on the new default port 3300.
|
||||
|
||||
If a monitor is configured to listen for v1 connections on a non-standard port
|
||||
(that is, a port other than 6789), then the monmap will need to be modified
|
||||
(that is, a port other than 6789), the monmap will need to be modified
|
||||
manually.
|
||||
|
||||
|
||||
MON_DISK_LOW
|
||||
____________
|
||||
|
||||
One or more monitors are low on disk space. This health check is raised if the
|
||||
percentage of available space on the file system used by the monitor database
|
||||
(normally ``/var/lib/ceph/mon``) drops below the percentage value
|
||||
One or more monitors are low on storage space. This health check is raised if
|
||||
the percentage of available space on the file system used by the monitor
|
||||
database (normally ``/var/lib/ceph/mon``) drops below the percentage value
|
||||
``mon_data_avail_warn`` (default: 30%).
|
||||
|
||||
This alert might indicate that some other process or user on the system is
|
||||
filling up the file system used by the monitor. It might also
|
||||
indicate that the monitor database is too large (see ``MON_DISK_BIG``
|
||||
below).
|
||||
filling up the file system used by the monitor. It might also indicate that the
|
||||
monitor database is too large (see ``MON_DISK_BIG`` below). Another common
|
||||
scenario is that Ceph logging subsystem levels have been raised for
|
||||
troubleshooting purposes without subsequent return to default levels. Ongoing
|
||||
verbose logging can easily fill up the file system containing ``/var/log``. If
|
||||
you trim logs that are currently open, remember to restart or instruct your
|
||||
syslog or other daemon to re-open the log file.
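
To see where space is being consumed, it can help to check both the monitor's
data directory and the log file system. The paths below are the defaults and
the monitor data directory name is illustrative; both may differ in your
deployment:

.. prompt:: bash $

   df -h /var/lib/ceph/mon /var/log
   du -sh /var/lib/ceph/mon/ceph-$(hostname -s)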
|
||||
|
||||
If space cannot be freed, the monitor's data directory might need to be
|
||||
moved to another storage device or file system (this relocation process must be carried out while the monitor
|
||||
daemon is not running).
|
||||
If space cannot be freed, the monitor's data directory might need to be moved
|
||||
to another storage device or file system (this relocation process must be
|
||||
carried out while the monitor daemon is not running).
|
||||
|
||||
|
||||
MON_DISK_CRIT
|
||||
_____________
|
||||
|
||||
One or more monitors are critically low on disk space. This health check is raised if the
|
||||
percentage of available space on the file system used by the monitor database
|
||||
(normally ``/var/lib/ceph/mon``) drops below the percentage value
|
||||
``mon_data_avail_crit`` (default: 5%). See ``MON_DISK_LOW``, above.
|
||||
One or more monitors are critically low on storage space. This health check is
|
||||
raised if the percentage of available space on the file system used by the
|
||||
monitor database (normally ``/var/lib/ceph/mon``) drops below the percentage
|
||||
value ``mon_data_avail_crit`` (default: 5%). See ``MON_DISK_LOW``, above.
|
||||
|
||||
MON_DISK_BIG
|
||||
____________
|
||||
@ -124,14 +135,15 @@ raised if the size of the monitor database is larger than
|
||||
|
||||
A large database is unusual, but does not necessarily indicate a problem.
|
||||
Monitor databases might grow in size when there are placement groups that have
|
||||
not reached an ``active+clean`` state in a long time.
|
||||
not reached an ``active+clean`` state in a long time, or when extensive cluster
|
||||
recovery, expansion, or topology changes have recently occurred.
|
||||
|
||||
This alert might also indicate that the monitor's database is not properly
|
||||
This alert may also indicate that the monitor's database is not properly
|
||||
compacting, an issue that has been observed with some older versions of
|
||||
RocksDB. Forcing a compaction with ``ceph daemon mon.<id> compact`` might
|
||||
shrink the database's on-disk size.
|
||||
RocksDB. Forcing compaction with ``ceph daemon mon.<id> compact`` may suffice
|
||||
to shrink the database's storage usage.
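
For example, to check the on-disk size of a hypothetical monitor ``mon.a`` and
then force a compaction, run commands of the following form on the host where
that monitor's data directory resides (the path shown is the default and may
differ in your deployment):

.. prompt:: bash $

   du -sh /var/lib/ceph/mon/ceph-a/store.db
   ceph daemon mon.a compact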
|
||||
|
||||
This alert might also indicate that the monitor has a bug that prevents it from
|
||||
This alert may also indicate that the monitor has a bug that prevents it from
|
||||
pruning the cluster metadata that it stores. If the problem persists, please
|
||||
report a bug.
|
||||
|
||||
@ -222,8 +234,8 @@ this alert can be temporarily silenced by running the following command:
|
||||
|
||||
ceph health mute AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED 1w # 1 week
|
||||
|
||||
Although we do NOT recommend doing so, you can also disable this alert indefinitely
|
||||
by running the following command:
|
||||
Although we do NOT recommend doing so, you can also disable this alert
|
||||
indefinitely by running the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -236,17 +248,17 @@ Manager
|
||||
MGR_DOWN
|
||||
________
|
||||
|
||||
All manager daemons are currently down. The cluster should normally have at
|
||||
least one running manager (``ceph-mgr``) daemon. If no manager daemon is
|
||||
running, the cluster's ability to monitor itself will be compromised, and parts
|
||||
of the management API will become unavailable (for example, the dashboard will
|
||||
not work, and most CLI commands that report metrics or runtime state will
|
||||
block). However, the cluster will still be able to perform all I/O operations
|
||||
and to recover from failures.
|
||||
All Ceph Manager daemons are currently down. The cluster should normally have
|
||||
at least one running manager (``ceph-mgr``) daemon. If no manager daemon is
|
||||
running, the cluster's ability to monitor itself will be compromised, parts of
|
||||
the management API will become unavailable (for example, the dashboard will not
|
||||
work, and most CLI commands that report metrics or runtime state will block).
|
||||
However, the cluster will still be able to perform client I/O operations and
|
||||
recover from failures.
|
||||
|
||||
The "down" manager daemon should be restarted as soon as possible to ensure
|
||||
that the cluster can be monitored (for example, so that the ``ceph -s``
|
||||
information is up to date, or so that metrics can be scraped by Prometheus).
|
||||
The down manager daemon(s) should be restarted as soon as possible to ensure
|
||||
that the cluster can be monitored (for example, so that ``ceph -s`` information
|
||||
is available and up to date, and so that metrics can be scraped by Prometheus).
|
||||
|
||||
|
||||
MGR_MODULE_DEPENDENCY
|
||||
@ -285,14 +297,14 @@ OSDs
|
||||
OSD_DOWN
|
||||
________
|
||||
|
||||
One or more OSDs are marked "down". The ceph-osd daemon might have been
|
||||
stopped, or peer OSDs might be unable to reach the OSD over the network.
|
||||
Common causes include a stopped or crashed daemon, a "down" host, or a network
|
||||
outage.
|
||||
One or more OSDs are marked ``down``. The ceph-osd daemon(s) or their host(s)
|
||||
may have crashed or been stopped, or peer OSDs might be unable to reach the OSD
|
||||
over the public or private network. Common causes include a stopped or crashed
|
||||
daemon, a "down" host, or a network failure.
|
||||
|
||||
Verify that the host is healthy, the daemon is started, and the network is
|
||||
functioning. If the daemon has crashed, the daemon log file
|
||||
(``/var/log/ceph/ceph-osd.*``) might contain debugging information.
|
||||
(``/var/log/ceph/ceph-osd.*``) may contain troubleshooting information.
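
For example, the following commands can help identify which OSDs are down and
whether the corresponding daemon is running. The ``systemctl`` unit name shown
assumes a non-containerized deployment and the hypothetical ``osd.1``; adjust
for your environment:

.. prompt:: bash $

   ceph osd tree down
   systemctl status ceph-osd@1   # on the host where osd.1 runs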
|
||||
|
||||
OSD_<crush type>_DOWN
|
||||
_____________________
|
||||
@ -319,7 +331,7 @@ _____________________
|
||||
The utilization thresholds for `nearfull`, `backfillfull`, `full`, and/or
|
||||
`failsafe_full` are not ascending. In particular, the following pattern is
|
||||
expected: `nearfull < backfillfull`, `backfillfull < full`, and `full <
|
||||
failsafe_full`.
|
||||
failsafe_full`. This can result in unexpected cluster behavior.
|
||||
|
||||
To adjust these utilization thresholds, run the following commands:
|
||||
|
||||
@ -355,8 +367,14 @@ threshold by a small amount. To do so, run the following command:
|
||||
|
||||
ceph osd set-full-ratio <ratio>
|
||||
|
||||
Additional OSDs should be deployed in order to add new storage to the cluster,
|
||||
or existing data should be deleted in order to free up space in the cluster.
|
||||
Additional OSDs should be deployed within appropriate CRUSH failure domains
|
||||
in order to increase capacity, and/or existing data should be deleted
|
||||
in order to free up space in the cluster. One subtle situation is that the
|
||||
``rados bench`` tool may have been used to test one or more pools' performance,
|
||||
and the resulting RADOS objects were not subsequently cleaned up. You may
|
||||
check for this by invoking ``rados ls`` against each pool and looking for
|
||||
objects with names beginning with ``bench`` or other job names. These may
|
||||
then be manually but very, very carefully deleted in order to reclaim capacity.
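
As a sketch, the following commands list leftover benchmark objects in a
hypothetical pool named ``testpool`` and then remove them with the purpose-built
cleanup command, which deletes only objects written by ``rados bench``:

.. prompt:: bash $

   rados -p testpool ls | grep '^benchmark_data' | head
   rados -p testpool cleanup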
|
||||
|
||||
OSD_BACKFILLFULL
|
||||
________________
|
||||
@ -493,9 +511,9 @@ or newer to start. To safely set the flag, run the following command:
|
||||
OSD_FILESTORE
|
||||
__________________
|
||||
|
||||
Warn if OSDs are running Filestore. The Filestore OSD back end has been
|
||||
deprecated; the BlueStore back end has been the default object store since the
|
||||
Ceph Luminous release.
|
||||
Warn if OSDs are running the old Filestore back end. The Filestore OSD back end
|
||||
is deprecated; the BlueStore back end has been the default object store since
|
||||
the Ceph Luminous release.
|
||||
|
||||
The 'mclock_scheduler' is not supported for Filestore OSDs. For this reason,
|
||||
the default 'osd_op_queue' is set to 'wpq' for Filestore OSDs and is enforced
|
||||
@ -518,16 +536,16 @@ temporarily silence this alert by running the following command:
|
||||
|
||||
ceph health mute OSD_FILESTORE
|
||||
|
||||
Since this migration can take a considerable amount of time to complete, we
|
||||
recommend that you begin the process well in advance of any update to Reef or
|
||||
to later releases.
|
||||
Since migration of Filestore OSDs to BlueStore can take a considerable amount
|
||||
of time to complete, we recommend that you begin the process well in advance
|
||||
of any update to Reef or to later releases.
|
||||
|
||||
OSD_UNREACHABLE
|
||||
_______________
|
||||
|
||||
Registered v1/v2 public address of one or more OSD(s) is/are out of the
|
||||
defined `public_network` subnet, which will prevent these unreachable OSDs
|
||||
from communicating with ceph clients properly.
|
||||
The registered v1/v2 public address or addresses of one or more OSD(s) is or
|
||||
are out of the defined `public_network` subnet, which prevents these
|
||||
unreachable OSDs from communicating with Ceph clients properly.
|
||||
|
||||
Even though these unreachable OSDs are in the ``up`` state, RADOS clients
will hang until the TCP timeout is reached before erroring out, due to this inconsistency.
|
||||
@ -535,7 +553,7 @@ will hang till TCP timeout before erroring out due to this inconsistency.
|
||||
POOL_FULL
|
||||
_________
|
||||
|
||||
One or more pools have reached their quota and are no longer allowing writes.
|
||||
One or more pools have reached quota and no longer allow writes.
|
||||
|
||||
To see pool quotas and utilization, run the following command:
|
||||
|
||||
@ -621,9 +639,10 @@ command:
|
||||
BLUESTORE_FRAGMENTATION
|
||||
_______________________
|
||||
|
||||
As BlueStore operates, the free space on the underlying storage will become
|
||||
fragmented. This is normal and unavoidable, but excessive fragmentation causes
|
||||
slowdown. To inspect BlueStore fragmentation, run the following command:
|
||||
``BLUESTORE_FRAGMENTATION`` indicates that the free space that underlies
|
||||
BlueStore has become fragmented. This is normal and unavoidable, but excessive
|
||||
fragmentation causes slowdown. To inspect BlueStore fragmentation, run the
|
||||
following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -662,11 +681,9 @@ One or more OSDs have BlueStore volumes that were created prior to the
|
||||
Nautilus release. (In Nautilus, BlueStore tracks its internal usage
|
||||
statistics on a granular, per-pool basis.)
|
||||
|
||||
If *all* OSDs
|
||||
are older than Nautilus, this means that the per-pool metrics are
|
||||
simply unavailable. But if there is a mixture of pre-Nautilus and
|
||||
post-Nautilus OSDs, the cluster usage statistics reported by ``ceph
|
||||
df`` will be inaccurate.
|
||||
If *all* OSDs are older than Nautilus, this means that the per-pool metrics are
|
||||
simply unavailable. But if there is a mixture of pre-Nautilus and post-Nautilus
|
||||
OSDs, the cluster usage statistics reported by ``ceph df`` will be inaccurate.
|
||||
|
||||
The old OSDs can be updated to use the new usage-tracking scheme by stopping
|
||||
each OSD, running a repair operation, and then restarting the OSD. For example,
|
||||
@ -778,10 +795,10 @@ about the source of the problem.
|
||||
BLUESTORE_SPURIOUS_READ_ERRORS
|
||||
______________________________
|
||||
|
||||
One or more BlueStore OSDs detect spurious read errors on the main device.
|
||||
One or more BlueStore OSDs have detected read errors on the main device.
|
||||
BlueStore has recovered from these errors by retrying disk reads. This alert
|
||||
might indicate issues with underlying hardware, issues with the I/O subsystem,
|
||||
or something similar. In theory, such issues can cause permanent data
|
||||
or something similar. Such issues can cause permanent data
|
||||
corruption. Some observations on the root cause of spurious read errors can be
|
||||
found here: https://tracker.ceph.com/issues/22464
|
||||
|
||||
@ -801,6 +818,105 @@ Or, to disable this alert on a specific OSD, run the following command:
|
||||
|
||||
ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
|
||||
|
||||
BLOCK_DEVICE_STALLED_READ_ALERT
|
||||
_______________________________
|
||||
|
||||
There are BlueStore log messages that reveal storage drive issues
|
||||
that can cause performance degradation and potentially data unavailability or
|
||||
loss. These may indicate a storage drive that is failing and should be
|
||||
evaluated and possibly removed and replaced.
|
||||
|
||||
``read stalled read 0x29f40370000~100000 (buffered) since 63410177.290546s, timeout is 5.000000s``
|
||||
|
||||
However, this is difficult to spot because there is no discernible warning (a
|
||||
health warning or info in ``ceph health detail`` for example). More observations
|
||||
can be found here: https://tracker.ceph.com/issues/62500
|
||||
|
||||
Also, because there can be false positive ``stalled read`` instances, a mechanism
|
||||
has been added to increase accuracy. If in the last ``bdev_stalled_read_warn_lifetime``
|
||||
seconds the number of ``stalled read`` events is found to be greater than or equal to
|
||||
``bdev_stalled_read_warn_threshold`` for a given BlueStore block device, this
|
||||
warning will be reported in ``ceph health detail``. The warning state will be
|
||||
removed when the condition clears.
|
||||
|
||||
The defaults for :confval:`bdev_stalled_read_warn_lifetime`
|
||||
and :confval:`bdev_stalled_read_warn_threshold` may be overridden globally or for
|
||||
specific OSDs.
|
||||
|
||||
To change this, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set global bdev_stalled_read_warn_lifetime 10
|
||||
ceph config set global bdev_stalled_read_warn_threshold 5
|
||||
|
||||
This may be done for specific OSDs or a given mask. For example,
to apply to a single OSD or to all SSD OSDs:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set osd.123 bdev_stalled_read_warn_lifetime 10
|
||||
ceph config set osd.123 bdev_stalled_read_warn_threshold 5
|
||||
ceph config set class:ssd bdev_stalled_read_warn_lifetime 10
|
||||
ceph config set class:ssd bdev_stalled_read_warn_threshold 5
|
||||
|
||||
WAL_DEVICE_STALLED_READ_ALERT
|
||||
_____________________________
|
||||
|
||||
The warning state ``WAL_DEVICE_STALLED_READ_ALERT`` is raised to indicate
|
||||
``stalled read`` instances on a given BlueStore OSD's ``WAL_DEVICE``. This
|
||||
warning can be configured via the :confval:`bdev_stalled_read_warn_lifetime`
|
||||
and :confval:`bdev_stalled_read_warn_threshold` options with commands similar
|
||||
to those described in the ``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
|
||||
|
||||
DB_DEVICE_STALLED_READ_ALERT
|
||||
____________________________
|
||||
|
||||
The warning state ``DB_DEVICE_STALLED_READ_ALERT`` is raised to indicate
|
||||
``stalled read`` instances on a given BlueStore OSD's ``DB_DEVICE``. This
|
||||
warning can be configured via the :confval:`bdev_stalled_read_warn_lifetime`
|
||||
and :confval:`bdev_stalled_read_warn_threshold` options with commands similar
|
||||
to those described in the ``BLOCK_DEVICE_STALLED_READ_ALERT`` warning section.
|
||||
|
||||
BLUESTORE_SLOW_OP_ALERT
|
||||
_______________________
|
||||
|
||||
There are BlueStore log messages that reveal storage drive issues that can lead
|
||||
to performance degradation and data unavailability or loss. These indicate
|
||||
that the storage drive may be failing and should be investigated and
|
||||
potentially replaced.
|
||||
|
||||
``log_latency_fn slow operation observed for _txc_committed_kv, latency = 12.028621219s, txc = 0x55a107c30f00``
|
||||
``log_latency_fn slow operation observed for upper_bound, latency = 6.25955s``
|
||||
``log_latency slow operation observed for submit_transaction..``
|
||||
|
||||
As there can be false positive ``slow ops`` instances, a mechanism has
|
||||
been added for more reliability. If in the last ``bluestore_slow_ops_warn_lifetime``
|
||||
seconds the number of ``slow ops`` indications is greater than or equal to
|
||||
:confval:`bluestore_slow_ops_warn_threshold` for a given BlueStore OSD, this
|
||||
warning will be reported in ``ceph health detail``. The warning state is
|
||||
cleared when the condition clears.
|
||||
|
||||
The defaults for :confval:`bluestore_slow_ops_warn_lifetime` and
|
||||
:confval:`bluestore_slow_ops_warn_threshold` may be overridden globally or for
|
||||
specific OSDs.
|
||||
|
||||
To change this, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set global bluestore_slow_ops_warn_lifetime 10
|
||||
ceph config set global bluestore_slow_ops_warn_threshold 5
|
||||
|
||||
This may be done for specific OSDs or a given mask. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set osd.123 bluestore_slow_ops_warn_lifetime 10
|
||||
ceph config set osd.123 bluestore_slow_ops_warn_threshold 5
|
||||
ceph config set class:ssd bluestore_slow_ops_warn_lifetime 10
|
||||
ceph config set class:ssd bluestore_slow_ops_warn_threshold 5
|
||||
|
||||
Device health
|
||||
-------------
|
||||
|
||||
@ -815,7 +931,12 @@ appropriate response to this expected failure is (1) to mark the OSD ``out`` so
|
||||
that data is migrated off of the OSD, and then (2) to remove the hardware from
|
||||
the system. Note that this marking ``out`` is normally done automatically if
|
||||
``mgr/devicehealth/self_heal`` is enabled (as determined by
|
||||
``mgr/devicehealth/mark_out_threshold``).
|
||||
``mgr/devicehealth/mark_out_threshold``). If an OSD device is compromised but
|
||||
the OSD(s) on that device are still ``up``, recovery can be degraded. In such
|
||||
cases it may be advantageous to forcibly stop the OSD daemon(s) in question so
|
||||
that recovery can proceed from surviving healthy OSDs. This must be
|
||||
done with extreme care and attention to failure domains so that data availability
|
||||
is not compromised.
|
||||
|
||||
To check device health, run the following command:
|
||||
|
||||
@ -823,8 +944,8 @@ To check device health, run the following command:
|
||||
|
||||
ceph device info <device-id>
|
||||
|
||||
Device life expectancy is set either by a prediction model that the mgr runs or
|
||||
by an external tool that is activated by running the following command:
|
||||
Device life expectancy is set either by a prediction model that the Ceph Manager
|
||||
runs or by an external tool that runs a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
@ -978,7 +1099,7 @@ ____________________
|
||||
The count of read repairs has exceeded the config value threshold
|
||||
``mon_osd_warn_num_repaired`` (default: ``10``). Because scrub handles errors
|
||||
only for data at rest, and because any read error that occurs when another
|
||||
replica is available will be repaired immediately so that the client can get
|
||||
replica is available is repaired immediately so that the client can get
|
||||
the object data, there might exist failing disks that are not registering any
|
||||
scrub errors. This repair count is maintained as a way of identifying any such
|
||||
failing disks.
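
If the drives have been investigated and you wish to raise the reporting
threshold, a configuration change of the following form is one approach (the
value ``20`` is illustrative):

.. prompt:: bash $

   ceph config set global mon_osd_warn_num_repaired 20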
|
||||
@ -988,8 +1109,8 @@ LARGE_OMAP_OBJECTS
|
||||
__________________
|
||||
|
||||
One or more pools contain large omap objects, as determined by
|
||||
``osd_deep_scrub_large_omap_object_key_threshold`` (threshold for the number of
|
||||
keys to determine what is considered a large omap object) or
|
||||
``osd_deep_scrub_large_omap_object_key_threshold`` (the threshold for the
|
||||
number of keys to determine what is considered a large omap object) or
|
||||
``osd_deep_scrub_large_omap_object_value_sum_threshold`` (the threshold for the
|
||||
summed size in bytes of all key values to determine what is considered a large
|
||||
omap object) or both. To find more information on object name, key count, and
|
||||
@ -1009,7 +1130,7 @@ CACHE_POOL_NEAR_FULL
|
||||
____________________
|
||||
|
||||
A cache-tier pool is nearly full, as determined by the ``target_max_bytes`` and
|
||||
``target_max_objects`` properties of the cache pool. Once the pool reaches the
|
||||
``target_max_objects`` properties of the cache pool. When the pool reaches the
|
||||
target threshold, write requests to the pool might block while data is flushed
|
||||
and evicted from the cache. This state normally leads to very high latencies
|
||||
and poor performance.
|
||||
@ -1155,10 +1276,10 @@ For more information, see :ref:`choosing-number-of-placement-groups` and
|
||||
POOL_TARGET_SIZE_BYTES_OVERCOMMITTED
|
||||
____________________________________
|
||||
|
||||
One or more pools have a ``target_size_bytes`` property that is set in order to
|
||||
estimate the expected size of the pool, but the value(s) of this property are
|
||||
greater than the total available storage (either by themselves or in
|
||||
combination with other pools).
|
||||
One or more pools does have a ``target_size_bytes`` property that is set in
|
||||
order to estimate the expected size of the pool, but the value or values of
|
||||
this property are greater than the total available storage (either by
|
||||
themselves or in combination with other pools).
|
||||
|
||||
This alert is usually an indication that the ``target_size_bytes`` value for
|
||||
the pool is too large and should be reduced or set to zero. To reduce the
|
||||
@ -1230,7 +1351,7 @@ data have too many PGs. See *TOO_MANY_PGS* above.
|
||||
To silence the health check, raise the threshold by adjusting the
|
||||
``mon_pg_warn_max_object_skew`` config option on the managers.
|
||||
|
||||
The health check will be silenced for a specific pool only if
|
||||
The health check is silenced for a specific pool only if
|
||||
``pg_autoscale_mode`` is set to ``on``.
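
For example, to raise the threshold as described above (the value ``20`` is
illustrative; the option is read by the Manager daemons):

.. prompt:: bash $

   ceph config set mgr mon_pg_warn_max_object_skew 20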
|
||||
|
||||
POOL_APP_NOT_ENABLED
|
||||
@ -1297,8 +1418,8 @@ resolution, see :ref:`storage-capacity` and :ref:`no-free-drive-space`.
|
||||
OBJECT_MISPLACED
|
||||
________________
|
||||
|
||||
One or more objects in the cluster are not stored on the node that CRUSH would
|
||||
prefer that they be stored on. This alert is an indication that data migration
|
||||
One or more objects in the cluster are not stored on the node that CRUSH
|
||||
prefers that they be stored on. This alert is an indication that data migration
|
||||
due to a recent cluster change has not yet completed.
|
||||
|
||||
Misplaced data is not a dangerous condition in and of itself; data consistency
|
||||
@ -1365,7 +1486,7 @@ percentage (determined by ``mon_warn_pg_not_scrubbed_ratio``) of the interval
|
||||
has elapsed after the time the scrub was scheduled and no scrub has been
|
||||
performed.
|
||||
|
||||
PGs will be scrubbed only if they are flagged as ``clean`` (which means that
|
||||
PGs are scrubbed only if they are flagged as ``clean`` (which means that
|
||||
they are to be cleaned, and not that they have been examined and found to be
|
||||
clean). Misplaced or degraded PGs will not be flagged as ``clean`` (see
|
||||
*PG_AVAILABILITY* and *PG_DEGRADED* above).
|
||||
@ -1382,13 +1503,22 @@ ____________________
|
||||
One or more Placement Groups (PGs) have not been deep scrubbed recently. PGs
|
||||
are normally scrubbed every :confval:`osd_deep_scrub_interval` seconds at most.
|
||||
This health check is raised if a certain percentage (determined by
|
||||
``mon_warn_pg_not_deep_scrubbed_ratio``) of the interval has elapsed after the
|
||||
time the scrub was scheduled and no scrub has been performed.
|
||||
:confval:`mon_warn_pg_not_deep_scrubbed_ratio`) of the interval has elapsed
|
||||
after the time the scrub was scheduled and no scrub has been performed.
|
||||
|
||||
PGs will receive a deep scrub only if they are flagged as *clean* (which means
|
||||
that they are to be cleaned, and not that they have been examined and found to
|
||||
be clean). Misplaced or degraded PGs might not be flagged as ``clean`` (see
|
||||
*PG_AVAILABILITY* and *PG_DEGRADED* above).
|
||||
PGs will receive a deep scrub only if they are flagged as ``clean`` (which
|
||||
means that they are to be cleaned, and not that they have been examined and
|
||||
found to be clean). Misplaced or degraded PGs might not be flagged as ``clean``
|
||||
(see *PG_AVAILABILITY* and *PG_DEGRADED* above).
|
||||
|
||||
This document offers two methods of setting the value of
|
||||
:confval:`osd_deep_scrub_interval`. The first method listed here changes the
|
||||
value of :confval:`osd_deep_scrub_interval` globally. The second method listed
|
||||
here changes the value of :confval:`osd_deep_scrub_interval` for OSDs and for
|
||||
the Manager daemon.
|
||||
|
||||
First Method
|
||||
~~~~~~~~~~~~
|
||||
|
||||
To manually initiate a deep scrub of a clean PG, run the following command:
|
||||
|
||||
@ -1396,6 +1526,72 @@ To manually initiate a deep scrub of a clean PG, run the following command:
|
||||
|
||||
ceph pg deep-scrub <pgid>
|
||||
|
||||
Under certain conditions, the warning ``PGs not deep-scrubbed in time``
|
||||
appears. This might be because the cluster contains many large PGs, which take
|
||||
longer to deep-scrub. To remedy this situation, you must change the value of
|
||||
:confval:`osd_deep_scrub_interval` globally.
|
||||
|
||||
#. Confirm that ``ceph health detail`` returns a ``pgs not deep-scrubbed in
|
||||
time`` warning::
|
||||
|
||||
# ceph health detail
|
||||
HEALTH_WARN 1161 pgs not deep-scrubbed in time
|
||||
[WRN] PG_NOT_DEEP_SCRUBBED: 1161 pgs not deep-scrubbed in time
|
||||
pg 86.fff not deep-scrubbed since 2024-08-21T02:35:25.733187+0000
|
||||
|
||||
#. Change ``osd_deep_scrub_interval`` globally:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set global osd_deep_scrub_interval 1209600
|
||||
|
||||
The above procedure was developed by Eugen Block in September of 2024.
|
||||
|
||||
See `Eugen Block's blog post <https://heiterbiswolkig.blogs.nde.ag/2024/09/06/pgs-not-deep-scrubbed-in-time/>`_ for much more detail.
|
||||
|
||||
See `Redmine tracker issue #44959 <https://tracker.ceph.com/issues/44959>`_.
|
||||
|
||||
Second Method
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
To manually initiate a deep scrub of a clean PG, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg deep-scrub <pgid>
|
||||
|
||||
Under certain conditions, the warning ``PGs not deep-scrubbed in time``
|
||||
appears. This might be because the cluster contains many large PGs, which take
|
||||
longer to deep-scrub. To remedy this situation, you must change the value of
|
||||
:confval:`osd_deep_scrub_interval` for OSDs and for the Manager daemon.
|
||||
|
||||
#. Confirm that ``ceph health detail`` returns a ``pgs not deep-scrubbed in
|
||||
time`` warning::
|
||||
|
||||
# ceph health detail
|
||||
HEALTH_WARN 1161 pgs not deep-scrubbed in time
|
||||
[WRN] PG_NOT_DEEP_SCRUBBED: 1161 pgs not deep-scrubbed in time
|
||||
pg 86.fff not deep-scrubbed since 2024-08-21T02:35:25.733187+0000
|
||||
|
||||
#. Change the ``osd_deep_scrub_interval`` for OSDs:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set osd osd_deep_scrub_interval 1209600
|
||||
|
||||
#. Change the ``osd_deep_scrub_interval`` for Managers:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set mgr osd_deep_scrub_interval 1209600
|
||||
|
||||
The above procedure was developed by Eugen Block in September of 2024.
|
||||
|
||||
See `Eugen Block's blog post <https://heiterbiswolkig.blogs.nde.ag/2024/09/06/pgs-not-deep-scrubbed-in-time/>`_ for much more detail.
|
||||
|
||||
See `Redmine tracker issue #44959 <https://tracker.ceph.com/issues/44959>`_.
|
||||
|
||||
|
||||
|
||||
PG_SLOW_SNAP_TRIMMING
|
||||
_____________________
|
||||
@ -1422,9 +1618,10 @@ Stretch Mode
|
||||
INCORRECT_NUM_BUCKETS_STRETCH_MODE
|
||||
__________________________________
|
||||
|
||||
Stretch mode currently only support 2 dividing buckets with OSDs, this warning suggests
|
||||
that the number of dividing buckets is not equal to 2 after stretch mode is enabled.
|
||||
You can expect unpredictable failures and MON assertions until the condition is fixed.
|
||||
Stretch mode currently supports only 2 dividing buckets that contain OSDs. This
warning indicates that the number of dividing buckets is not equal to 2 after
stretch mode has been enabled. You can expect unpredictable failures and MON
assertions until the condition is fixed.
|
||||
|
||||
We encourage you to fix this by removing additional dividing buckets or bumping
the number of dividing buckets to 2.
|
||||
@ -1441,6 +1638,27 @@ We encourage you to fix this by making the weights even on both dividing buckets
|
||||
This can be done by making sure the combined weight of the OSDs on each dividing
|
||||
bucket is the same.
|
||||
|
||||
NVMeoF Gateway
|
||||
--------------
|
||||
|
||||
NVMEOF_SINGLE_GATEWAY
|
||||
_____________________
|
||||
|
||||
One of the gateway groups has only one gateway. This is not ideal because it
makes high availability (HA) impossible with a single gateway in a group. This
|
||||
can lead to problems with failover and failback operations for the NVMeoF
|
||||
gateway.
|
||||
|
||||
It's recommended to have multiple NVMeoF gateways in a group.
|
||||
|
||||
NVMEOF_GATEWAY_DOWN
|
||||
___________________
|
||||
|
||||
Some of the gateways are in the ``GW_UNAVAILABLE`` state. If an NVMeoF daemon has
|
||||
crashed, the daemon log file (found at ``/var/log/ceph/``) may contain
|
||||
troubleshooting information.
|
||||
|
||||
|
||||
Miscellaneous
|
||||
-------------
|
||||
|
||||
|
@ -419,7 +419,10 @@ conditions change.
|
||||
Ceph provides a number of settings to manage the load spike associated with the
|
||||
reassignment of PGs to an OSD (especially a new OSD). The ``osd_max_backfills``
|
||||
setting specifies the maximum number of concurrent backfills to and from an OSD
|
||||
(default: 1). The ``backfill_full_ratio`` setting allows an OSD to refuse a
|
||||
(default: 1; note you cannot change this if the `mClock`_ scheduler is active,
|
||||
unless you set ``osd_mclock_override_recovery_settings = true``, see
|
||||
`mClock backfill`_).
|
||||
The ``backfill_full_ratio`` setting allows an OSD to refuse a
|
||||
backfill request if the OSD is approaching its full ratio (default: 90%). This
|
||||
setting can be changed with the ``ceph osd set-backfillfull-ratio`` command. If
|
||||
an OSD refuses a backfill request, the ``osd_backfill_retry_interval`` setting
|
||||
@ -545,6 +548,8 @@ performing the migration. For details, see the `Architecture`_ section.
|
||||
.. _data placement: ../data-placement
|
||||
.. _pool: ../pools
|
||||
.. _placement group: ../placement-groups
|
||||
.. _mClock: ../../configuration/mclock-config-ref.rst
|
||||
.. _mClock backfill: ../../configuration/mclock-config-ref.rst#recovery-backfill-options
|
||||
.. _Architecture: ../../../architecture
|
||||
.. _OSD Not Running: ../../troubleshooting/troubleshooting-osd#osd-not-running
|
||||
.. _Troubleshooting PG Errors: ../../troubleshooting/troubleshooting-pg#troubleshooting-pg-errors
|
||||
|
@ -737,6 +737,117 @@ Managing pools that are flagged with ``--bulk``
|
||||
===============================================
|
||||
See :ref:`managing_bulk_flagged_pools`.
|
||||
|
||||
Setting values for a stretch pool
|
||||
=================================
|
||||
To set values for a stretch pool, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool stretch set {pool-name} {peering_crush_bucket_count} {peering_crush_bucket_target} {peering_crush_bucket_barrier} {crush_rule} {size} {min_size} [--yes-i-really-mean-it]
|
||||
|
||||
Here is a breakdown of the arguments (an illustrative example follows these descriptions):
|
||||
|
||||
.. describe:: {pool-name}
|
||||
|
||||
The name of the pool. It must be an existing pool; this command does not create a new pool.
|
||||
|
||||
:Type: String
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {peering_crush_bucket_count}
|
||||
|
||||
The value is used along with peering_crush_bucket_barrier to determine whether the set of
|
||||
OSDs in the chosen acting set can peer with each other, based on the number of distinct
|
||||
buckets there are in the acting set.
|
||||
|
||||
:Type: Integer
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {peering_crush_bucket_target}
|
||||
|
||||
This value is used along with peering_crush_bucket_barrier and size to calculate
|
||||
the value bucket_max, which limits the number of OSDs in the same bucket that can be chosen for the acting set of a PG.
|
||||
|
||||
:Type: Integer
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {peering_crush_bucket_barrier}
|
||||
|
||||
The type of bucket a pool is stretched across, e.g., rack, row, or datacenter.
|
||||
|
||||
:Type: String
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {crush_rule}
|
||||
|
||||
The crush rule to use for the stretch pool. The type of pool must match the type of crush_rule
|
||||
(replicated or erasure).
|
||||
|
||||
:Type: String
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {size}
|
||||
|
||||
The number of replicas for objects in the stretch pool.
|
||||
|
||||
:Type: Integer
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {min_size}
|
||||
|
||||
The minimum number of replicas required for I/O in the stretch pool.
|
||||
|
||||
:Type: Integer
|
||||
:Required: Yes.
|
||||
|
||||
.. describe:: {--yes-i-really-mean-it}
|
||||
|
||||
This flag is required to confirm that you really want to bypass
the safety checks and set the values for a stretch pool, e.g.,
|
||||
when you are trying to set ``peering_crush_bucket_count`` or
|
||||
``peering_crush_bucket_target`` to be more than the number of buckets in the crush map.
|
||||
|
||||
:Type: Flag
|
||||
:Required: No.
|
||||
|
||||
.. _setting_values_for_a_stretch_pool:
|
||||
|
||||
Unsetting values for a stretch pool
|
||||
===================================
|
||||
To move the pool back to non-stretch, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool stretch unset {pool-name}
|
||||
|
||||
Here is a breakdown of the argument:
|
||||
|
||||
.. describe:: {pool-name}
|
||||
|
||||
The name of the pool. It must be an existing pool that is stretched,
|
||||
i.e., it has already been set with the command `ceph osd pool stretch set`.
|
||||
|
||||
:Type: String
|
||||
:Required: Yes.
|
||||
|
||||
Showing values of a stretch pool
|
||||
================================
|
||||
To show values for a stretch pool, run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool stretch show {pool-name}
|
||||
|
||||
Here is a breakdown of the argument:
|
||||
|
||||
.. describe:: {pool-name}
|
||||
|
||||
The name of the pool. It must be an existing pool that is stretched,
|
||||
i.e., it has already been set with the command `ceph osd pool stretch set`.
|
||||
|
||||
:Type: String
|
||||
:Required: Yes.
|
||||
|
||||
.. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref
|
||||
.. _Bloom Filter: https://en.wikipedia.org/wiki/Bloom_filter
|
||||
.. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups
|
||||
|
@ -81,6 +81,18 @@ Data Center B. In a situation of this kind, the loss of Data Center A means
|
||||
that the data is lost and Ceph will not be able to operate on it. This
|
||||
situation is surprisingly difficult to avoid using only standard CRUSH rules.
|
||||
|
||||
Individual Stretch Pools
|
||||
========================
|
||||
Setting an individual stretch pool is an option that allows specific pools to be
distributed across two or more data centers. This is achieved by running the
``ceph osd pool stretch set`` command on each desired pool, as opposed to
applying a cluster-wide configuration with stretch mode.
See :ref:`setting_values_for_a_stretch_pool`.
|
||||
|
||||
Use stretch mode when you have exactly two data centers and require a uniform
configuration across the entire cluster. Conversely, opt for a stretch pool
when you need a particular pool to be replicated across more than two data
centers, providing a more granular level of control and a larger cluster size.
|
||||
|
||||
Stretch Mode
|
||||
============
|
||||
@ -260,8 +272,21 @@ SSDs (including NVMe OSDs). Hybrid HDD+SDD or HDD-only OSDs are not recommended
|
||||
due to the long time it takes for them to recover after connectivity between
|
||||
data centers has been restored. This reduces the potential for data loss.
|
||||
|
||||
In the future, stretch mode might support erasure-coded pools and might support
|
||||
deployments that have more than two data centers.
|
||||
.. warning:: Device class is currently not supported in stretch mode.
|
||||
For example, the following rule containing ``device class`` will not work::
|
||||
|
||||
rule stretch_replicated_rule {
|
||||
id 2
|
||||
type replicated class hdd
|
||||
step take default
|
||||
step choose firstn 0 type datacenter
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
}
|
||||
|
||||
In the future, stretch mode could support erasure-coded pools,
|
||||
enable deployments across multiple data centers,
|
||||
and accommodate various device classes.
|
||||
|
||||
Other commands
|
||||
==============
|
||||
|
@ -6,23 +6,24 @@ Ceph component debug log levels can be adjusted at runtime, while services are
|
||||
running. In some circumstances you might want to adjust debug log levels in
|
||||
``ceph.conf`` or in the central config store. Increased debug logging can be
|
||||
useful if you are encountering issues when operating your cluster. By default,
|
||||
Ceph log files are in ``/var/log/ceph``.
|
||||
Ceph log files are in ``/var/log/ceph``; containerized deployments often log
|
||||
elsewhere under ``/var/log``.
|
||||
|
||||
.. tip:: Remember that debug output can slow down your system, and that this
|
||||
latency sometimes hides race conditions.
|
||||
|
||||
Debug logging is resource intensive. If you encounter a problem in a specific
|
||||
component of your cluster, begin troubleshooting by enabling logging for only
|
||||
that component of the cluster. For example, if your OSDs are running without
|
||||
errors, but your metadata servers are not, enable logging for any specific
|
||||
metadata server instances that are having problems. Continue by enabling
|
||||
that component. For example, if your OSDs are running without
|
||||
errors, but your CephFS metadata servers (MDS) are not, enable logging for specific
|
||||
instances that are having problems. Continue by enabling
|
||||
logging for each subsystem only as needed.
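
For example (the daemon name is illustrative), the debug level of a single MDS
daemon can be raised for troubleshooting and later returned to a typical
default:

.. prompt:: bash $

   ceph config set mds.a debug_mds 10/10
   ceph config set mds.a debug_mds 1/5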
|
||||
|
||||
.. important:: Verbose logging sometimes generates over 1 GB of data per hour.
|
||||
If the disk that your operating system runs on (your "OS disk") reaches its
|
||||
capacity, the node associated with that disk will stop working.
|
||||
|
||||
Whenever you enable or increase the rate of debug logging, make sure that you
|
||||
Whenever you enable or increase the level of debug logging, ensure that you
|
||||
have ample capacity for log files, as this may dramatically increase their
|
||||
size. For details on rotating log files, see `Accelerating Log Rotation`_.
|
||||
When your system is running well again, remove unnecessary debugging settings
|
||||
@ -34,7 +35,7 @@ For details on available settings, see `Subsystem, Log and Debug Settings`_.
|
||||
Runtime
|
||||
=======
|
||||
|
||||
To see the configuration settings at runtime, log in to a host that has a
|
||||
To see configuration settings at runtime, log in to a host that has a
|
||||
running daemon and run a command of the following form:
|
||||
|
||||
.. prompt:: bash $
|
||||
@ -57,7 +58,7 @@ tell`` command of the following form:
|
||||
|
||||
Here ``{daemon-type}`` is ``osd``, ``mon``, or ``mds``. Apply the runtime
|
||||
setting either to a specific daemon (by specifying its ID) or to all daemons of
|
||||
a particular type (by using the ``*`` operator). For example, to increase
|
||||
a particular type (by using the ``*`` wildcard as the ID). For example, to increase
|
||||
debug logging for a specific ``ceph-osd`` daemon named ``osd.0``, run the
|
||||
following command:
|
||||
|
||||
@ -81,7 +82,8 @@ Boot Time
|
||||
=========
|
||||
|
||||
To activate Ceph's debugging output (that is, the ``dout()`` logging function)
|
||||
at boot time, you must add settings to your Ceph configuration file.
|
||||
at boot time, you must add settings to your Ceph configuration file (or
|
||||
set corresponding values in the central config store).
|
||||
Subsystems that are common to all daemons are set under ``[global]`` in the
|
||||
configuration file. Subsystems for a specific daemon are set under the relevant
|
||||
daemon section in the configuration file (for example, ``[mon]``, ``[osd]``,
|
||||
@ -115,7 +117,7 @@ For details, see `Subsystem, Log and Debug Settings`_.
|
||||
Accelerating Log Rotation
|
||||
=========================
|
||||
|
||||
If your log filesystem is nearly full, you can accelerate log rotation by
|
||||
If a host's log filesystem is nearly full, you can accelerate log rotation by
|
||||
modifying the Ceph log rotation file at ``/etc/logrotate.d/ceph``. To increase
|
||||
the frequency of log rotation (which will guard against a filesystem reaching
|
||||
capacity), add a ``size`` directive after the ``weekly`` frequency directive.
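
As a sketch only (the exact stanza shipped by your distribution will differ),
the result might look like this, rotating as soon as a log exceeds 500 MB::

    /var/log/ceph/*.log {
        rotate 7
        weekly
        size 500M
        compress
        sharedscripts
    }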
|
||||
@ -149,8 +151,8 @@ setting is shown immediately below.
|
||||
|
||||
30 * * * * /usr/sbin/logrotate /etc/logrotate.d/ceph >/dev/null 2>&1
|
||||
|
||||
In this example, the ``etc/logrotate.d/ceph`` file will be checked every 30
|
||||
minutes.
|
||||
In this example, the ``etc/logrotate.d/ceph`` file will be checked and possibly
|
||||
rotated every 30 minutes.
|
||||
|
||||
Valgrind
|
||||
========
|
||||
@ -175,7 +177,7 @@ For each subsystem, there is a logging level for its output logs (a so-called
|
||||
"log level") and a logging level for its in-memory logs (a so-called "memory
|
||||
level"). Different values may be set for these two logging levels in each
|
||||
subsystem. Ceph's logging levels operate on a scale of ``1`` to ``20``, where
|
||||
``1`` is terse and ``20`` is verbose. In certain rare cases, there are logging
|
||||
``1`` is terse and ``20`` is verbose. In a certain few cases, there are logging
|
||||
levels that can take a value greater than 20. The resulting logs are extremely
|
||||
verbose.
|
||||
|
||||
@ -184,7 +186,7 @@ following conditions are true:
|
||||
|
||||
- a fatal signal has been raised or
|
||||
- an assertion within Ceph code has been triggered or
|
||||
- the sending of in-memory logs to the output log has been manually triggered.
|
||||
- sending in-memory logs to the output log has been manually triggered.
|
||||
Consult `the portion of the "Ceph Administration Tool documentation
|
||||
that provides an example of how to submit admin socket commands
|
||||
<http://docs.ceph.com/en/latest/man/8/ceph/#daemon>`_ for more detail.
|
||||
@ -206,8 +208,8 @@ following:
|
||||
debug mds balancer = 1/20
|
||||
|
||||
The following table provides a list of Ceph subsystems and their default log and
|
||||
memory levels. Once you complete your logging efforts, restore the subsystems
|
||||
to their default level or to a level suitable for normal operations.
|
||||
memory levels. Once you complete your logging efforts, restore each subsystem's
|
||||
values to their defaults or to a level suitable for normal operations.
|
||||
|
||||
+--------------------------+-----------+--------------+
|
||||
| Subsystem | Log Level | Memory Level |
|
||||
|
@ -618,6 +618,7 @@ Possible causes include:
|
||||
- A bug in the kernel file system (check ``dmesg`` output)
|
||||
- An overloaded cluster (check system load, iostat, etc.)
|
||||
- A bug in the ``ceph-osd`` daemon.
|
||||
- Suboptimal OSD shard configuration (on HDD based cluster with mClock scheduler)
|
||||
|
||||
Possible solutions:
|
||||
|
||||
@ -626,6 +627,8 @@ Possible solutions:
|
||||
- Upgrade Ceph
|
||||
- Restart OSDs
|
||||
- Replace failed or failing components
|
||||
- Override OSD shard configuration (on HDD based cluster with mClock scheduler)
|
||||
- See :ref:`mclock-tblshoot-hdd-shard-config` for resolution
|
||||
|
||||
Debugging Slow Requests
|
||||
-----------------------
|
||||
@ -680,6 +683,44 @@ Although some of these events may appear redundant, they cross important
|
||||
boundaries in the internal code (such as passing data across locks into new
|
||||
threads).
|
||||
|
||||
.. _mclock-tblshoot-hdd-shard-config:
|
||||
|
||||
Slow Requests or Slow Recovery With mClock Scheduler
|
||||
----------------------------------------------------
|
||||
|
||||
.. note:: This troubleshooting is applicable only for HDD based clusters running
|
||||
mClock scheduler and with the following OSD shard configuration:
|
||||
``osd_op_num_shards_hdd`` = 5 and ``osd_op_num_threads_per_shard_hdd`` = 1.
|
||||
Also, see :ref:`mclock-hdd-cfg` for details around the reason for the change
|
||||
made to the default OSD HDD shard configuration for mClock.
|
||||
|
||||
On large HDD-based clusters with the mClock scheduler enabled, under conditions
of multiple OSD node failures, the following may be reported or observed:
|
||||
|
||||
- slow requests: This also manifests into degraded client I/O performance.
|
||||
- slow background recoveries: Lower than expected recovery throughput.
|
||||
|
||||
**Troubleshooting Steps:**
|
||||
|
||||
#. Verify from OSD events that the slow requests are predominantly of type
   ``queued_for_pg`` (see the example after this list).
|
||||
#. Verify if the reported recovery rate is significantly lower than the expected
|
||||
rate considering the QoS allocations for background recovery service.
|
||||
|
||||
If either of the above steps is true, then the following resolution may be
|
||||
applied. Note that this is disruptive as it involves OSD restarts. Run the
|
||||
following commands to change the default OSD shard configuration for HDDs:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
ceph config set osd osd_op_num_shards_hdd 1
|
||||
ceph config set osd osd_op_num_threads_per_shard_hdd 5
|
||||
|
||||
The above configuration does not take effect immediately; it requires a
restart of the OSDs in the environment. For this process to be least disruptive,
|
||||
the OSDs may be restarted in a carefully staggered manner.
|
||||
|
||||
.. _rados_tshooting_flapping_osd:
|
||||
|
||||
Flapping OSDs
|
||||
=============
|
||||
|
@ -5,16 +5,16 @@
|
||||
Placement Groups Never Get Clean
|
||||
================================
|
||||
|
||||
If, after you have created your cluster, any Placement Groups (PGs) remain in
|
||||
the ``active`` status, the ``active+remapped`` status or the
|
||||
``active+degraded`` status and never achieves an ``active+clean`` status, you
|
||||
likely have a problem with your configuration.
|
||||
Placement Groups (PGs) that remain in the ``active`` status, the
|
||||
``active+remapped`` status or the ``active+degraded`` status and never achieve
|
||||
an ``active+clean`` status might indicate a problem with the configuration of
|
||||
the Ceph cluster.
|
||||
|
||||
In such a situation, it may be necessary to review the settings in the `Pool,
|
||||
PG and CRUSH Config Reference`_ and make appropriate adjustments.
|
||||
In such a situation, review the settings in the `Pool, PG and CRUSH Config
|
||||
Reference`_ and make appropriate adjustments.
|
||||
|
||||
As a general rule, run your cluster with more than one OSD and a pool size
|
||||
greater than two object replicas.
|
||||
of greater than two object replicas.
|
||||
|
||||
.. _one-node-cluster:
|
||||
|
||||
|
@ -77,14 +77,14 @@ allow it. The account root user can add identity policies to its users in
|
||||
several ways.
|
||||
|
||||
* Add policy directly to the user with the ``iam:PutUserPolicy`` and
|
||||
``iam:AttachUserPoliicy`` actions.
|
||||
``iam:AttachUserPolicy`` actions.
|
||||
|
||||
* Create an IAM group and add group policy with the ``iam:PutGroupPolicy`` and
|
||||
``iam:AttachGroupPoliicy`` actions. Users added to that group with the
|
||||
``iam:AttachGroupPolicy`` actions. Users added to that group with the
|
||||
``iam:AddUserToGroup`` action will inherit all of the group's policy.
|
||||
|
||||
* Create an IAM role and add role policy with the ``iam:PutRolePolicy`` and
|
||||
``iam:AttachRolePoliicy`` actions. Users that assume this role with the
|
||||
``iam:AttachRolePolicy`` actions. Users that assume this role with the
|
||||
``sts:AssumeRole`` and ``sts:AssumeRoleWithWebIdentity`` actions will inherit
|
||||
all of the role's policy.
|
||||
|
||||
@ -177,8 +177,8 @@ An existing user can be adopted into an account with ``user modify``::
|
||||
.. warning:: Ownership of the user's notification topics will not be
|
||||
transferred to the account. Notifications will continue to work, but
|
||||
the topics will no longer be visible to SNS Topic APIs. Topics and
|
||||
their associated bucket notifications should be removed before migration
|
||||
and recreated within the account.
|
||||
their associated bucket notifications can be migrated as described below
|
||||
in `Migrating Notification Topics`_.
|
||||
|
||||
Because account users have no permissions by default, some identity policy must
|
||||
be added to restore the user's original permissions.
|
||||
@ -187,6 +187,44 @@ Alternatively, you may want to create a new account for each existing user. In
|
||||
that case, you may want to add the ``--account-root`` option to make each user
|
||||
the root user of their account.
|
||||
|
||||
Migrating Notification Topics
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Account topics are supported only when the ``notification_v2`` feature is enabled,
|
||||
as described in `Bucket Notifications`_ and `Supported Zone Features`_.
|
||||
|
||||
1. ``Migration Impact``: When a non-account user is migrated to an account, the
|
||||
   existing notification topics remain accessible through the RadosGW admin API,
|
||||
but the user loses access to them via the SNS Topic API. Despite this, the topics
|
||||
remain functional, and bucket notifications will continue to be delivered as expected.
|
||||
|
||||
2. ``Re-creation of Topics``: The account user should re-create the topics using
|
||||
the same names. The old topics (now inaccessible) and the new account-owned topics
|
||||
will coexist without interference.
|
||||
|
||||
3. ``Updating Bucket Notification Configurations``: Buckets that are subscribed to
|
||||
the old user-owned topics should be updated to use the new account-owned topics.
|
||||
To prevent duplicate notifications, maintain the same notification IDs.
|
||||
For example, if a bucket's existing notification configuration is:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{"TopicConfigurations": [{ "Id": "ID1", "TopicArn": "arn:aws:sns:default::topic1", "Events": ["s3:ObjectCreated:*"]}]}
|
||||
|
||||
The updated configuration would be:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{"TopicConfigurations": [{ "Id": "ID1", "TopicArn": "arn:aws:sns:default:RGW00000000000000001:topic1", "Events": ["s3:ObjectCreated:*"]}]}
|
||||
|
||||
In this example, `RGW00000000000000001` is the account ID, `topic1` is the
|
||||
topic name and `ID1` is the notification ID.
|
||||
|
||||
4. ``Removing Old Topics``: Once no buckets are subscribed to the old user-owned topics,
|
||||
they can be removed by an admin::
|
||||
|
||||
$ radosgw-admin topic rm --topic topic1
|
||||
|
||||
Account Root example
|
||||
--------------------
|
||||
|
||||
@ -252,3 +290,5 @@ This example uses `awscli`_ to create an IAM user for S3 operations.
|
||||
.. _Evaluating policies within a single account: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic.html#policy-eval-basics
|
||||
.. _Cross-account policy evaluation logic: https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic-cross-account.html
|
||||
.. _awscli: https://docs.aws.amazon.com/cli/latest/
|
||||
.. _Bucket Notifications: ../notifications/
|
||||
.. _Supported Zone Features: ../zone-features/#supported-features
|
||||
|
@ -4,34 +4,30 @@ Archive Sync Module
|
||||
|
||||
.. versionadded:: Nautilus
|
||||
|
||||
This sync module leverages the versioning feature of the S3 objects in RGW to
|
||||
have an archive zone that captures the different versions of the S3 objects
|
||||
as they occur over time in the other zones.
|
||||
The Archive Sync module uses the RGW versioning feature of S3 objects to
|
||||
maintain an archive zone that captures successive versions of objects
|
||||
as they are updated in other zones. Archive zone objects can
|
||||
be removed only through gateways associated with the archive zone.
|
||||
|
||||
An archive zone allows to have a history of versions of S3 objects that can
|
||||
only be eliminated through the gateways associated with the archive zone.
|
||||
|
||||
This functionality is useful to have a configuration where several
|
||||
This enables a deployment where several
|
||||
non-versioned zones replicate their data and metadata through their zone
|
||||
gateways (mirror configuration) providing high availability to the end users,
|
||||
while the archive zone captures all the data updates and metadata for
|
||||
consolidate them as versions of S3 objects.
|
||||
while the archive zone captures data and metadata updates.
|
||||
|
||||
Including an archive zone in a multizone configuration allows you to have the
|
||||
flexibility of an S3 object history in one only zone while saving the space
|
||||
that the replicas of the versioned S3 objects would consume in the rest of the
|
||||
Deploying an archive zone in a multizone configuration enables the
|
||||
flexibility of S3 object history in a single zone while saving the space
|
||||
that replicas of versioned S3 objects would consume in the rest of the
|
||||
zones.
|
||||
|
||||
|
||||
|
||||
Archive Sync Tier Type Configuration
|
||||
------------------------------------
|
||||
|
||||
How to Configure
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
See `Multisite Configuration`_ for how to multisite config instructions. The
|
||||
archive sync module requires a creation of a new zone. The zone tier type needs
|
||||
See `Multisite Configuration`_ for multisite configuration instructions. The
|
||||
archive sync module requires the creation of a new zone. The zone tier type needs
|
||||
to be defined as ``archive``:
|
||||
|
||||
::
|
||||
|
@ -265,18 +265,18 @@ QoS settings
|
||||
|
||||
.. versionadded:: Nautilus
|
||||
|
||||
The ``civetweb`` frontend has a threading model that uses a thread per
|
||||
The older and now non-default ``civetweb`` frontend has a threading model that uses a thread per
|
||||
connection and hence is automatically throttled by :confval:`rgw_thread_pool_size`
|
||||
configurable when it comes to accepting connections. The newer ``beast`` frontend is
|
||||
not restricted by the thread pool size when it comes to accepting new
|
||||
connections, so a scheduler abstraction is introduced in the Nautilus release
|
||||
to support future methods of scheduling requests.
|
||||
when accepting connections. The newer and default ``beast`` frontend is
|
||||
not limited by the thread pool size when it comes to accepting new
|
||||
connections, so a scheduler abstraction was introduced in the Nautilus release
|
||||
to support additional methods of scheduling requests.
|
||||
|
||||
Currently the scheduler defaults to a throttler which throttles the active
|
||||
connections to a configured limit. QoS based on mClock is currently in an
|
||||
*experimental* phase and not recommended for production yet. Current
|
||||
implementation of *dmclock_client* op queue divides RGW ops on admin, auth
|
||||
(swift auth, sts) metadata & data requests.
|
||||
Currently the scheduler defaults to a throttler that limits active
|
||||
connections to a configured limit. QoS rate limiting based on mClock is currently
|
||||
in an *experimental* phase and not recommended for production. The current
|
||||
implementation of the *dmclock_client* op queue divides RGW ops into admin, auth
|
||||
(swift auth, sts) metadata, and data requests.
|
||||
|
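For example (a sketch; the value shown is illustrative rather than a tuned
recommendation), the throttler's limit can be adjusted through the central
config database:

.. prompt:: bash #

   ceph config set client.rgw rgw_max_concurrent_requests 2048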
||||
|
||||
.. confval:: rgw_max_concurrent_requests
|
||||
@ -306,9 +306,9 @@ D4N Settings
|
||||
============
|
||||
|
||||
D4N is a caching architecture that utilizes Redis to speed up S3 object storage
|
||||
operations by establishing shared databases between different RGW access points.
|
||||
operations by establishing shared databases among Ceph Object Gateway (RGW) daemons.
|
||||
|
||||
Currently, the architecture can only function on one Redis instance at a time.
|
||||
The D4N architecture can only function on one Redis instance at a time.
|
||||
The address is configurable and can be changed via the parameters listed
|
||||
below.
|
||||
|
||||
@ -318,18 +318,18 @@ below.
|
||||
Topic persistency settings
|
||||
==========================
|
||||
|
||||
Topic persistency will persistently push the notification until it succeeds.
|
||||
Topic persistency will repeatedly push notifications until they succeed.
|
||||
For more information, see `Bucket Notifications`_.
|
||||
|
||||
The default behavior is to push indefinitely and as frequently as possible.
|
||||
With these settings you can control how long and how often to retry an
|
||||
unsuccessful notification. How long to persistently push can be controlled
|
||||
by providing maximum time of retention or maximum amount of retries.
|
||||
Frequency of persistent push retries can be controlled with the sleep duration
|
||||
unsuccessful notification by configuring the maximum retention time and/or the
|
||||
maximum number of retries.
|
||||
The interval between push retries can be configured via the sleep duration
|
||||
parameter.
|
||||
|
||||
All of these values have default value 0 (persistent retention is indefinite,
|
||||
and retried as frequently as possible).
|
||||
All of these options default to the value `0`, which means that persistent
|
||||
retention is indefinite, and notifications are retried as frequently as possible.
|
||||
|
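For example (the values are illustrative), retries can be bounded with the
options listed below:

.. prompt:: bash #

   ceph config set client.rgw rgw_topic_persistency_time_to_live 86400
   ceph config set client.rgw rgw_topic_persistency_max_retries 20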
||||
.. confval:: rgw_topic_persistency_time_to_live
|
||||
.. confval:: rgw_topic_persistency_max_retries
|
||||
|
@ -82,6 +82,14 @@ is mounted at `/mnt/nvme0` and has `10 GB` of free space available for the cache
|
||||
The persistent path directory has to be created before starting the Gateway.
|
||||
(``mkdir -p /mnt/nvme0/rgw_datacache/client.rgw.8000/``)
|
||||
|
||||
In containerized deployments the cache directory should be mounted as a volume::
|
||||
|
||||
extra_container_args:
|
||||
- "-v"
|
||||
- "/mnt/nvme0/rgw_datacache/client.rgw.8000/:/mnt/nvme0/rgw_datacache/client.rgw.8000/"
|
||||
|
||||
(Reference: `Service Management - Mounting Files with Extra Container Arguments`_)
|
||||
|
||||
If another Gateway is co-located on the same machine, configure its persistent path to a discrete directory,
|
||||
for example in the case of `[client.rgw.8001]` configure
|
||||
``rgw_d3n_l1_datacache_persistent_path = "/mnt/nvme0/rgw_datacache/client.rgw.8001/"``
|
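As a sketch, the same setting can be applied through the central config
database instead of ``ceph.conf`` (the client name ``client.rgw.8001`` is
taken from the example above):

.. prompt:: bash #

   ceph config set client.rgw.8001 rgw_d3n_l1_datacache_persistent_path /mnt/nvme0/rgw_datacache/client.rgw.8001/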
||||
@ -114,3 +122,4 @@ The following D3N related settings can be added to the Ceph configuration file
|
||||
.. _Rados Gateway Compression: ../compression/
|
||||
.. _Rados Gateway Encryption: ../encryption/
|
||||
.. _RGW Data cache and CDN: ../rgw-cache/
|
||||
.. _Service Management - Mounting Files with Extra Container Arguments: ../cephadm/services/#mounting-files-with-extra-container-arguments
|
||||
|
@ -508,7 +508,7 @@ For example:
|
||||
Updating the Period
|
||||
-------------------
|
||||
|
||||
After updating the master zone configuration, update the period:
|
||||
After updating the secondary zone configuration, update the period:
|
||||
|
||||
.. prompt:: bash #
|
||||
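   # a sketch; the period is typically committed as follows
   radosgw-admin period update --commit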
|
||||
|
@ -7,6 +7,10 @@ Bucket Notifications
|
||||
.. versionchanged:: Squid
|
||||
A new "v2" format for Topic and Notification metadata can be enabled with
|
||||
the :ref:`feature_notification_v2` zone feature.
|
||||
Enabling this feature after an upgrade from an older version will trigger
|
||||
migration of the existing Topic and Notification metadata.
|
||||
In a greenfield deployment, the new format will be used.
|
||||
The new format allows for the data to be synced between zones in the zonegroup.
|
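As a sketch, the feature can be enabled on an existing zonegroup roughly as
follows (the zonegroup name ``default`` is a placeholder; see the zone
features documentation for the authoritative procedure):

.. prompt:: bash #

   radosgw-admin zonegroup modify --rgw-zonegroup=default --enable-feature=notification_v2
   radosgw-admin period update --commit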
||||
|
||||
.. contents::
|
||||
|
||||
@ -61,9 +65,15 @@ Asynchronous Notifications
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Notifications can be sent asynchronously. They are committed into persistent
|
||||
storage and then asynchronously sent to the topic's configured endpoint. In
|
||||
this case, the only latency added to the original operation is the latency
|
||||
storage and then asynchronously sent to the topic's configured endpoint.
|
||||
The notification will be committed to persistent storage only if the triggering
|
||||
operation was successful.
|
||||
In this case, the only latency added to the original operation is the latency
|
||||
added when the notification is committed to persistent storage.
|
||||
If the endpoint of the topic to which the notification is sent is not available for a long
|
||||
period of time, the persistent storage allocated for this topic will eventually fill up.
|
||||
When this happens the triggering operations will fail with ``503 Service Unavailable``,
|
||||
which tells the client that it may retry later.
|
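For context, persistency is requested when the topic is created. A hedged
sketch using ``awscli`` (the endpoint URL and broker address are
placeholders; the ``persistent`` attribute follows the topic-attribute
conventions described in this document):

.. prompt:: bash $

   aws --endpoint-url http://{rgw-endpoint} sns create-topic --name mytopic \
       --attributes='{"push-endpoint": "amqp://{broker-host}:5672", "persistent": "true"}'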
||||
|
||||
.. note:: If the notification fails with an error, cannot be delivered, or
|
||||
times out, it is retried until it is successfully acknowledged.
|
||||
@ -98,6 +108,18 @@ Remove a topic by running the following command:
|
||||
|
||||
radosgw-admin topic rm --topic={topic-name} [--tenant={tenant}]
|
||||
|
||||
Fetch persistent topic stats (reservations, entries, and size) by running the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
radosgw-admin topic stats --topic={topic-name} [--tenant={tenant}]
|
||||
|
||||
Dump (in JSON format) all pending bucket notifications of a persistent topic by running the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
radosgw-admin topic dump --topic={topic-name} [--tenant={tenant}] [--max-entries={max-entries}]
|
||||
|
||||
|
||||
Notification Performance Statistics
|
||||
-----------------------------------
|
||||
|
@ -33,13 +33,20 @@ QAT Environment Setup
|
||||
encryption and compression services. And QAT driver in kernel space have to
|
||||
be loaded to drive the hardware.
|
||||
|
||||
The driver package can be downloaded from `Intel Quickassist Technology`_.
|
||||
The out-of-tree QAT driver package can be downloaded from `Intel Quickassist
|
||||
Technology`_.
|
||||
|
||||
2. The implementation for QAT based encryption is directly base on QAT API which
|
||||
is included the driver package. But QAT support for compression depends on
|
||||
QATzip project, which is a user space library which builds on top of the QAT
|
||||
API. Currently, QATzip speeds up gzip compression and decompression at the
|
||||
time of writing.
|
||||
The QATlib can be downloaded from `qatlib`_, which is used for the in-tree QAT
|
||||
driver.
|
||||
|
||||
.. note::
|
||||
The out-of-tree QAT driver is gradually being migrated to an in-tree driver+QATlib.
|
||||
|
||||
2. The implementation of QAT-based encryption is directly based on the QAT API,
|
||||
which is included in the driver package. However, QAT support for compression
|
||||
depends on the QATzip project, which is a userspace library that builds on
|
||||
top of the QAT API. At the time of writing (July 2024), QATzip speeds up
|
||||
gzip compression and decompression.
|
||||
|
||||
See `QATzip`_.
|
||||
|
||||
@ -48,36 +55,39 @@ Implementation
|
||||
1. QAT based Encryption for RGW
|
||||
|
||||
`OpenSSL support for RGW encryption`_ has been merged into Ceph, and Intel also
|
||||
provides one `QAT Engine`_ for OpenSSL. So, theoretically speaking, QAT based
|
||||
encryption in Ceph can be directly supported through OpenSSl+QAT Engine.
|
||||
provides one `QAT Engine`_ for OpenSSL. Theoretically, QAT-based encryption in
|
||||
Ceph can be directly supported through the OpenSSl+QAT Engine.
|
||||
|
||||
But the QAT Engine for OpenSSL currently supports chained operations only, and
|
||||
so Ceph will not be able to utilize QAT hardware feature for crypto operations
|
||||
based on OpenSSL crypto plugin. As a result, one QAT plugin based on native
|
||||
QAT API is added into crypto framework.
|
||||
However, the QAT Engine for OpenSSL currently supports only chained operations,
|
||||
which means that Ceph will not be able to utilize QAT hardware features for
|
||||
crypto operations based on the OpenSSL crypto plugin. As a result, one QAT plugin
|
||||
based on native QAT API is added into the crypto framework.
|
||||
|
||||
2. QAT Support for Compression
|
||||
|
||||
As mentioned above, QAT support for compression is based on QATzip library in
|
||||
user space, which is designed to take full advantage of the performance provided
|
||||
by QuickAssist Technology. Unlike QAT based encryption, QAT based compression
|
||||
is supported through a tool class for QAT acceleration rather than a compressor
|
||||
plugin. The common tool class can transparently accelerate the existing compression
|
||||
types, but only zlib compressor can be supported at the time of writing. So
|
||||
user is allowed to use it to speed up zlib compressor as long as the QAT
|
||||
hardware is available and QAT is capable to handle it.
|
||||
As mentioned above, QAT support for compression is based on the QATzip library
|
||||
in user space, which is designed to take full advantage of the performance that
|
||||
QuickAssist Technology provides. Unlike QAT-based encryption, QAT-based
|
||||
compression is supported through a tool class for QAT acceleration rather than
|
||||
a compressor plugin. This common tool class can transparently accelerate the
|
||||
existing compression types, but only the zlib compressor is supported at the
|
||||
time of writing. This means that this tool class can be used to speed up
|
||||
the zlib compressor if QAT hardware is available.
|
||||
|
||||
Configuration
|
||||
=============
|
||||
#. Prerequisites
|
||||
|
||||
Make sure the QAT driver with version v1.7.L.4.14.0 or higher has been installed.
|
||||
Remember to set an environment variable "ICP_ROOT" for your QAT driver package
|
||||
root directory.
|
||||
**For out-of-tree QAT**
|
||||
|
||||
To enable the QAT based encryption and compression, user needs to modify the QAT
|
||||
configuration files. For example, for Intel QuickAssist Adapter 8970 product, revise
|
||||
c6xx_dev0/1/2.conf in the directory ``/etc/`` and keep them the same, e.g.:
|
||||
Make sure the out-of-tree QAT driver with version v1.7.L.4.14.0 or higher
|
||||
has been installed. Remember to set an environment variable ``ICP_ROOT``
|
||||
for your QAT driver package root directory.
|
||||
|
||||
To enable the QAT based encryption and compression, the user must modify the
|
||||
QAT configuration files. For example, for the Intel QuickAssist Adapter 8970
|
||||
product, revise ``c6xx_dev0/1/2.conf`` in the directory ``/etc/`` and keep them
|
||||
the same. For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
@ -101,51 +111,121 @@ Configuration
|
||||
# List of core affinities
|
||||
Dc0CoreAffinity = 0
|
||||
|
||||
#. QAT based Encryption for RGW
|
||||
**For in-tree QAT**
|
||||
|
||||
The CMake option ``WITH_QAT=ON`` must be configured. If you build Ceph from
|
||||
There are some prerequisites for using QATlib. Make sure that your system
|
||||
meets the `QATlib System Requirements`_.
|
||||
|
||||
* To properly use the QATlib library, the Intel VT-d and SR-IOV parameters
|
||||
must be enabled in the platform BIOS.
|
||||
* Some QATlib features require a recent kernel driver or firmware version.
|
||||
See `QATlib Kernel Driver Releases`_.
|
||||
* The supported platform contains a 4xxx Intel Communications device or
|
||||
newer.
|
||||
* The ``intel_iommu`` parameter must be enabled. Verify that this setting is
|
||||
enabled, and raise the ``memlock`` limit for the ``qat`` group, by running the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cat /proc/cmdline | grep intel_iommu=on
|
||||
sudo sh -c 'echo "@qat - memlock 204800" >> /etc/security/limits.conf'
|
||||
sudo su -l $USER
|
||||
|
||||
For configuration and tuning, see `QATlib Configuration and Tuning`_.
|
||||
|
||||
#. QAT-based Encryption for RGW
|
||||
|
||||
The CMake option ``WITH_QATDRV=ON`` must be set. If you build Ceph from
|
||||
source code (see: :ref:`build-ceph`), navigate to your cloned Ceph repository
|
||||
and execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cd ceph
|
||||
./do_cmake.sh -DWITH_QAT=ON
|
||||
./do_cmake.sh -DWITH_QATDRV=ON
|
||||
cd build
|
||||
ninja
|
||||
|
||||
.. note::
|
||||
The section name of the QAT configuration files must be ``CEPH`` since
|
||||
the section name is set as "CEPH" in Ceph crypto source code.
|
||||
.. note:: The section name in QAT configuration files must be ``CEPH``,
|
||||
because the section name is set to ``CEPH`` in the Ceph crypto source code.
|
||||
|
||||
Then, edit the Ceph configuration file to make use of QAT based crypto plugin::
|
||||
Edit the Ceph configuration file (usually ``ceph.conf``) to make use of the
|
||||
QAT-based crypto plugin::
|
||||
|
||||
plugin crypto accelerator = crypto_qat
|
||||
|
||||
#. QAT Support for Compression
|
||||
|
||||
Before starting, make sure both QAT driver and `QATzip`_ have been installed. Besides
|
||||
"ICP_ROOT", remember to set the environment variable "QZ_ROOT" for the root directory
|
||||
of your QATzip source tree.
|
||||
**For out-of-tree QAT**
|
||||
|
||||
The following CMake options have to be configured to trigger QAT based compression
|
||||
when building Ceph:
|
||||
For the out-of-tree QAT driver package, before building ensure that both the QAT
|
||||
driver and `QATzip`_ have been installed. In addition to ``ICP_ROOT``,
|
||||
set the environment variable ``QZ_ROOT`` to the root directory of your QATzip
|
||||
source tree.
|
||||
|
||||
The following CMake options must be configured to trigger QAT-based
|
||||
compression when building Ceph:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
./do_cmake.sh -DWITH_QAT=ON -DWITH_QATZIP=ON
|
||||
./do_cmake.sh -DWITH_QATDRV=ON -DWITH_QATZIP=ON -DWITH_SYSTEM_QATZIP=ON -DWITH_QATLIB=OFF
|
||||
|
||||
Then, set an environment variable to clarify the section name of User Process Instance
|
||||
Section in QAT configuration files, e.g.:
|
||||
Set an environment variable to clarify the section name of the User Process
|
||||
Instance Section in the QAT configuration files. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
export QAT_SECTION_NAME=CEPH
|
||||
|
||||
Next, edit the Ceph configuration file to enable QAT support for compression::
|
||||
**For in-tree QAT**
|
||||
|
||||
For in-tree QAT, ensure that your system meets the `QATlib System
|
||||
Requirements`_. QATlib can be installed from pre-built packages or from
|
||||
source code. See `QATlib Installation`_. After QATlib is installed, you
|
||||
can run ``cpa_sample_code`` to check if the QAT environment is OK.
|
||||
|
||||
If you are using QATlib source code, the Ceph CMake build enables the
|
||||
qatlib and qatzip options by default, so a normal build
|
||||
already includes the QAT compressor code.
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
./do_cmake.sh
|
||||
|
||||
If you are using pre-built packages installed on the system, the following
|
||||
CMake options must be configured when building Ceph:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
./do_cmake.sh -DWITH_SYSTEM_QATLIB=ON -DWITH_SYSTEM_QATZIP=ON
|
||||
|
||||
|
||||
**For both out-of-tree QAT and in-tree QAT**
|
||||
|
||||
Edit Ceph's central config DB or configuration file (usually ``ceph.conf``) to enable QAT
|
||||
support for *zlib* compression::
|
||||
|
||||
qat compressor enabled=true
|
||||
|
||||
Set the RGW compression method:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
# for the STANDARD storage class
|
||||
radosgw-admin zone placement modify --rgw-zone=default --placement-id=default-placement --compression=zlib
|
||||
# or create a new storage class (COLD) and define its data pool (default.rgw.cold.data)
|
||||
radosgw-admin zonegroup placement add --rgw-zonegroup default --placement-id default-placement --storage-class COLD
|
||||
radosgw-admin zone placement add --rgw-zone default --placement-id default-placement --storage-class COLD --compression zlib --data-pool default.rgw.cold.data
|
||||
|
||||
CONFIG REFERENCE
|
||||
================
|
||||
The following QAT-related settings can be added to the Ceph configuration file
|
||||
(usually ``ceph.conf``) under the ``[client.rgw.{instance-name}]`` section.
|
||||
|
||||
.. confval:: qat_compressor_session_max_number
|
||||
.. confval:: qat_compressor_busy_polling
|
||||
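For example (the values shown are illustrative only, not tuned recommendations):

.. code-block:: ini

   [client.rgw.8000]
       qat_compressor_session_max_number = 256
       qat_compressor_busy_polling = true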
|
||||
|
||||
|
||||
.. _QAT Support for Compression: https://github.com/ceph/ceph/pull/19714
|
||||
.. _QAT based Encryption for RGW: https://github.com/ceph/ceph/pull/19386
|
||||
@ -153,3 +233,9 @@ Configuration
|
||||
.. _QATzip: https://github.com/intel/QATzip
|
||||
.. _OpenSSL support for RGW encryption: https://github.com/ceph/ceph/pull/15168
|
||||
.. _QAT Engine: https://github.com/intel/QAT_Engine
|
||||
.. _qatlib: https://github.com/intel/qatlib
|
||||
.. _QATlib User's Guide: https://intel.github.io/quickassist/qatlib/index.html
|
||||
.. _QATlib System Requirements: https://intel.github.io/quickassist/qatlib/requirements.html
|
||||
.. _QATlib Installation: https://intel.github.io/quickassist/qatlib/install.html
|
||||
.. _QATlib Configuration and Tuning: https://intel.github.io/quickassist/qatlib/configuration.html
|
||||
.. _QATlib Kernel Driver Releases: https://intel.github.io/quickassist/RN/In-Tree/in_tree_firmware_RN.html#qat-kernel-driver-releases-features
|
||||
|
@ -7,22 +7,47 @@
|
||||
|
||||
Bucket and Host Name
|
||||
--------------------
|
||||
There are two different modes of accessing the buckets. The first (preferred) method
|
||||
identifies the bucket as the top-level directory in the URI. ::
|
||||
There are two different modes of accessing buckets. The first method identifies
|
||||
the bucket as the top-level directory in the URI::
|
||||
|
||||
GET /mybucket HTTP/1.1
|
||||
Host: cname.domain.com
|
||||
|
||||
The second method identifies the bucket via a virtual bucket host name. For example::
|
||||
Most modern S3 clients rely on vhost-style access. The desired bucket is
|
||||
indicated by a DNS FQDN. For example::
|
||||
|
||||
GET / HTTP/1.1
|
||||
Host: mybucket.cname.domain.com
|
||||
|
||||
To configure virtual hosted buckets, you can either set ``rgw_dns_name = cname.domain.com`` in ceph.conf, or add ``cname.domain.com`` to the list of ``hostnames`` in your zonegroup configuration. See `Ceph Object Gateway - Multisite Configuration`_ for more on zonegroups.
|
||||
The first (path-style) method has been deprecated by AWS. See the `Amazon S3 Path Deprecation
|
||||
Plan`_ for more information.
|
||||
|
||||
.. tip:: We prefer the first method, because the second method requires expensive domain certification and DNS wild cards.
|
||||
To configure virtual hosted buckets, you can either set ``rgw_dns_name =
|
||||
cname.domain.com`` in ``ceph.conf`` or add ``cname.domain.com`` to the list of
|
||||
``hostnames`` in your zonegroup configuration. See `Ceph Object Gateway -
|
||||
Multisite Configuration`_ for more on zonegroups.
|
||||
|
||||
Here is an example of a ``ceph config set`` command that sets ``rgw_dns_name``
|
||||
to ``cname.domain.com``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set client.rgw.<cephx client id for rgw> rgw_dns_name cname.domain.com
|
||||
|
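The zonegroup-based alternative can be sketched as follows (the zonegroup
name ``default`` and the hostname are placeholders; edit the ``hostnames``
array in the exported JSON before re-importing it):

.. prompt:: bash $

   radosgw-admin zonegroup get --rgw-zonegroup=default > zonegroup.json
   # add "cname.domain.com" to the "hostnames" list in zonegroup.json, then:
   radosgw-admin zonegroup set --rgw-zonegroup=default < zonegroup.json
   radosgw-admin period update --commit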
||||
.. tip:: You can define multiple hostnames directly with the
|
||||
:confval:`rgw_dns_name` parameter.
|
||||
|
||||
.. tip:: When SSL is enabled, the certificates must use a wildcard in the
|
||||
domain name in order to match the bucket subdomains.
|
||||
|
||||
.. note:: When Ceph Object Gateways are behind a proxy, use the proxy's DNS
|
||||
name instead. Then you can use ``ceph config set client.rgw`` to set the DNS
|
||||
name for all instances.
|
||||
|
||||
.. note:: The static website view for the `s3website` API must be served under
|
||||
a different domain name. This is configured separately from
|
||||
:confval:`rgw_dns_name`, in :confval:`rgw_dns_s3website_name`.
|
||||
|
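A sketch of setting it (the website hostname shown is a placeholder):

.. prompt:: bash $

   ceph config set client.rgw rgw_dns_s3website_name objects-website.domain.com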
||||
.. tip:: You can define multiple hostnames directly with the :confval:`rgw_dns_name` parameter.
|
||||
|
||||
Common Request Headers
|
||||
----------------------
|
||||
@ -111,3 +136,4 @@ Common Response Status
|
||||
+---------------+-----------------------------------+
|
||||
|
||||
.. _`Ceph Object Gateway - Multisite Configuration`: ../../multisite
|
||||
.. _`Amazon S3 Path Deprecation Plan`: https://aws.amazon.com/blogs/aws/amazon-s3-path-deprecation-plan-the-rest-of-the-story/
|
||||
|
@ -8,6 +8,9 @@ Ceph is a clustered and distributed storage manager. If that's too cryptic,
|
||||
then just think of Ceph as a computer program that stores data and uses a
|
||||
network to make sure that there is a backup copy of the data.
|
||||
|
||||
Components of Ceph
|
||||
==================
|
||||
|
||||
Storage Interfaces
|
||||
------------------
|
||||
|
||||
@ -94,6 +97,89 @@ MDS
|
||||
A metadata server (MDS) is necessary for the proper functioning of CephFS.
|
||||
See :ref:`orchestrator-cli-cephfs` and :ref:`arch-cephfs`.
|
||||
|
||||
Vstart Cluster Installation and Configuration Procedure
|
||||
=======================================================
|
||||
|
||||
#. Clone the ``ceph/ceph`` repository:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
git clone git@github.com:ceph/ceph
|
||||
|
||||
#. Update the submodules in the ``ceph/ceph`` repository:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
git submodule update --init --recursive --progress
|
||||
|
||||
#. Run ``install-deps.sh`` from within the directory into which you cloned the
|
||||
``ceph/ceph`` repository:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
./install-deps.sh
|
||||
|
||||
#. Install the ``python3-routes`` package:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
apt install python3-routes
|
||||
|
||||
#. Move into the ``ceph`` directory. You will know that you are in the correct
|
||||
directory if it contains the file ``do_cmake.sh``:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
cd ceph
|
||||
|
||||
#. Run the ``do_cmake.sh`` script:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
./do_cmake.sh
|
||||
|
||||
#. The ``do_cmake.sh`` script creates a ``build/`` directory. Move into the
|
||||
``build/`` directory:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
cd build
|
||||
|
||||
#. Use ``ninja`` to build the development environment:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ninja -j3
|
||||
|
||||
.. note:: This step takes a long time to run. The ``ninja -j3`` command
|
||||
kicks off a process consisting of 2289 steps. This step took over three
|
||||
hours when I ran it on an Intel NUC with an i7 in September of 2024.
|
||||
|
||||
#. Install the Ceph development environment:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ninja install
|
||||
|
||||
This step does not take as long as the previous step.
|
||||
|
||||
#. Build the vstart cluster:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ninja vstart
|
||||
|
||||
#. Start the vstart cluster:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
../src/vstart.sh --debug --new -x --localhost --bluestore
|
||||
|
||||
.. note:: Run this command from within the ``ceph/build`` directory.
|
||||
|
||||
|
||||
|
||||
|
||||
LINKS
|
||||
-----
|
||||
|
||||
|
@ -860,7 +860,7 @@ possible, we prefer to maintain this convention with text, lists, literal text
|
||||
lines should begin at the same character position as the text of the
|
||||
indented text (less numbers, bullets, etc.).
|
||||
|
||||
Indented text may include literal text examples. Whereas, text indentation
|
||||
Indented text may include literal text examples. Although text indentation
|
||||
should be done with spaces, literal text examples should be indented with
|
||||
tabs. This convention enables you to add an additional indented paragraph
|
||||
following a literal example by leaving a blank line and beginning the
|
||||
|
@ -22,13 +22,12 @@ another, but below are some general guidelines.
|
||||
CPU
|
||||
===
|
||||
|
||||
CephFS Metadata Servers (MDS) are CPU-intensive. They are
|
||||
are single-threaded and perform best with CPUs with a high clock rate (GHz). MDS
|
||||
servers do not need a large number of CPU cores unless they are also hosting other
|
||||
services, such as SSD OSDs for the CephFS metadata pool.
|
||||
OSD nodes need enough processing power to run the RADOS service, to calculate data
|
||||
placement with CRUSH, to replicate data, and to maintain their own copies of the
|
||||
cluster map.
|
||||
CephFS Metadata Servers (MDS) are CPU-intensive. They are single-threaded
|
||||
and perform best with CPUs with a high clock rate (GHz). MDS servers do not
|
||||
need a large number of CPU cores unless they are also hosting other services,
|
||||
such as SSD OSDs for the CephFS metadata pool. OSD nodes need enough
|
||||
processing power to run the RADOS service, to calculate data placement with
|
||||
CRUSH, to replicate data, and to maintain their own copies of the cluster map.
|
||||
|
||||
With earlier releases of Ceph, we would make hardware recommendations based on
|
||||
the number of cores per OSD, but this cores-per-osd metric is no longer as
|
||||
|
@ -43,36 +43,68 @@ distribution that includes a supported kernel and supported system startup
|
||||
framework, for example ``sysvinit`` or ``systemd``. Ceph is sometimes ported to
|
||||
non-Linux systems but these are not supported by the core Ceph effort.
|
||||
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| | Reef (18.2.z) | Quincy (17.2.z) | Pacific (16.2.z) | Octopus (15.2.z) |
|
||||
+===============+===============+==================+==================+==================+
|
||||
| Centos 7 | | | | B |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Centos 8 | | | | |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Centos 9 | A H | A :sup:`1` H | | |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Debian 10 | C | | C | C |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Debian 11 | C | C | C | |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| OpenSUSE 15.2 | C | | C | C |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| OpenSUSE 15.3 | C | C | | |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 18.04 | | | C | C |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 20.04 | A | A | A | A |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 22.04 | A H | | | |
|
||||
+---------------+---------------+------------------+------------------+------------------+
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| | Squid (19.2.z) | Reef (18.2.z) | Quincy (17.2.z) | Pacific (16.2.z) | Octopus (15.2.z) |
|
||||
+===============+================+===============+==================+==================+==================+
|
||||
| Centos 7 | | | | | B |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Centos 8 | | | | | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Centos 9 | A | A | A :sup:`1` | | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Debian 10 | | C | | C | C |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Debian 11 | | C | C | C | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Debian 12 | C | C | | | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| OpenSUSE 15.2 | | C | | C | C |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| OpenSUSE 15.3 | | C | C | | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 18.04 | | | | C | C |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 20.04 | | A | A | A | A |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
| Ubuntu 22.04 | A | A | | | |
|
||||
+---------------+----------------+---------------+------------------+------------------+------------------+
|
||||
|
||||
- **A**: Ceph provides packages and has done comprehensive tests on the software in them.
|
||||
- **B**: Ceph provides packages and has done basic tests on the software in them.
|
||||
- **C**: Ceph provides packages only. No tests have been done on these releases.
|
||||
- **H**: Ceph tests this distribution as a container host.
|
||||
- **1**: Testing has been done on Centos 9 starting with version 17.2.8 for Quincy.
|
||||
|
||||
Container Hosts
|
||||
---------------
|
||||
|
||||
This table shows the operating systems that support Ceph's official container images.
|
||||
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| | Squid (19.2.z) | Reef (18.2.z) | Quincy (17.2.z) |
|
||||
+===============+================+==================+==================+
|
||||
| Centos 7 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Centos 8 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Centos 9 | H | H | H |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Debian 10 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Debian 11 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| OpenSUSE 15.2 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| OpenSUSE 15.3 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Ubuntu 18.04 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Ubuntu 20.04 | | | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
| Ubuntu 22.04 | H | H | |
|
||||
+---------------+----------------+------------------+------------------+
|
||||
|
||||
- **H**: Ceph tests this distribution as a container host.
|
||||
|
||||
.. note::
|
||||
**For Centos 7 Users**
|
||||
|
||||
|
@ -10,6 +10,7 @@ overrides:
|
||||
- MDS_FAILED
|
||||
- MDS_INSUFFICIENT_STANDBY
|
||||
- MDS_UP_LESS_THAN_MAX
|
||||
- online, but wants
|
||||
- filesystem is online with fewer MDS than max_mds
|
||||
- POOL_APP_NOT_ENABLED
|
||||
- do not have an application enabled
|
||||
|
@ -9,8 +9,6 @@ overrides:
|
||||
osd pool default crimson: true
|
||||
osd:
|
||||
crimson osd obc lru size: 10
|
||||
mgr:
|
||||
mgr stats period: 30
|
||||
flavor: crimson
|
||||
workunit:
|
||||
env:
|
||||
|
@ -4,6 +4,7 @@ overrides:
|
||||
selinux:
|
||||
allowlist:
|
||||
- scontext=system_u:system_r:logrotate_t:s0
|
||||
- scontext=system_u:system_r:getty_t:s0
|
||||
|
||||
tasks:
|
||||
- pexec:
|
||||
|
@ -1,11 +1,13 @@
|
||||
|
||||
Default object size:
|
||||
|
||||
$ rbd create --size 20M img
|
||||
|
||||
$ DEV=$(sudo rbd map img)
|
||||
$ blockdev --getiomin $DEV
|
||||
65536
|
||||
$ blockdev --getioopt $DEV
|
||||
65536
|
||||
4194304
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
65536
|
||||
$ sudo rbd unmap $DEV
|
||||
@ -14,7 +16,7 @@
|
||||
$ blockdev --getiomin $DEV
|
||||
512
|
||||
$ blockdev --getioopt $DEV
|
||||
512
|
||||
4194304
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
512
|
||||
$ sudo rbd unmap $DEV
|
||||
@ -38,3 +40,45 @@
|
||||
$ sudo rbd unmap $DEV
|
||||
|
||||
$ rbd rm --no-progress img
|
||||
|
||||
Custom object size:
|
||||
|
||||
$ rbd create --size 20M --object-size 1M img
|
||||
|
||||
$ DEV=$(sudo rbd map img)
|
||||
$ blockdev --getiomin $DEV
|
||||
65536
|
||||
$ blockdev --getioopt $DEV
|
||||
1048576
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
65536
|
||||
$ sudo rbd unmap $DEV
|
||||
|
||||
$ DEV=$(sudo rbd map -o alloc_size=512 img)
|
||||
$ blockdev --getiomin $DEV
|
||||
512
|
||||
$ blockdev --getioopt $DEV
|
||||
1048576
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
512
|
||||
$ sudo rbd unmap $DEV
|
||||
|
||||
$ DEV=$(sudo rbd map -o alloc_size=1048576 img)
|
||||
$ blockdev --getiomin $DEV
|
||||
1048576
|
||||
$ blockdev --getioopt $DEV
|
||||
1048576
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
1048576
|
||||
$ sudo rbd unmap $DEV
|
||||
|
||||
$ DEV=$(sudo rbd map -o alloc_size=2097152 img)
|
||||
$ blockdev --getiomin $DEV
|
||||
1048576
|
||||
$ blockdev --getioopt $DEV
|
||||
1048576
|
||||
$ cat /sys/block/${DEV#/dev/}/queue/discard_granularity
|
||||
1048576
|
||||
$ sudo rbd unmap $DEV
|
||||
|
||||
$ rbd rm --no-progress img
|
||||
|
@ -76,6 +76,10 @@ function wait_for_state() {
|
||||
function wait_for_recovery_toofull() {
|
||||
local timeout=$1
|
||||
wait_for_state recovery_toofull $timeout
|
||||
if [ $ret -ne 0 ]; then
|
||||
echo "Error: Recovery toofull timeout"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@ -131,7 +135,11 @@ function TEST_recovery_test_simple() {
|
||||
done
|
||||
|
||||
# If this times out, we'll detect errors below
|
||||
wait_for_recovery_toofull 30
|
||||
wait_for_recovery_toofull 120
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error: Recovery toofull timeout"
|
||||
return 1
|
||||
fi
|
||||
|
||||
ERRORS=0
|
||||
if [ "$(ceph pg dump pgs | grep +recovery_toofull | wc -l)" != "1" ];
|
||||
|
@ -229,138 +229,6 @@ function wait_background_check() {
|
||||
return $return_code
|
||||
}
|
||||
|
||||
# osd_scrub_during_recovery=true make sure scrub happens
|
||||
function TEST_recovery_scrub_2() {
|
||||
local dir=$1
|
||||
local poolname=test
|
||||
|
||||
TESTDATA="testdata.$$"
|
||||
OSDS=8
|
||||
PGS=32
|
||||
OBJECTS=40
|
||||
|
||||
setup $dir || return 1
|
||||
run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
|
||||
run_mgr $dir x || return 1
|
||||
local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 "
|
||||
ceph_osd_args+="--osd_scrub_backoff_ratio=0 "
|
||||
ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 "
|
||||
ceph_osd_args+="--osd_stats_update_period_scrubbing=2"
|
||||
for osd in $(seq 0 $(expr $OSDS - 1))
|
||||
do
|
||||
run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 \
|
||||
$ceph_osd_args || return 1
|
||||
done
|
||||
|
||||
# Create a pool with $PGS pgs
|
||||
create_pool $poolname $PGS $PGS
|
||||
wait_for_clean || return 1
|
||||
poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
|
||||
|
||||
dd if=/dev/urandom of=$TESTDATA bs=1M count=50
|
||||
for i in $(seq 1 $OBJECTS)
|
||||
do
|
||||
rados -p $poolname put obj${i} $TESTDATA
|
||||
done
|
||||
rm -f $TESTDATA
|
||||
|
||||
ceph osd pool set $poolname size 3
|
||||
|
||||
ceph pg dump pgs
|
||||
|
||||
# note that the following will be needed if the mclock scheduler is specified
|
||||
#ceph tell osd.* config get osd_mclock_override_recovery_settings
|
||||
|
||||
# the '_max_active' is expected to be 0
|
||||
ceph tell osd.1 config get osd_recovery_max_active
|
||||
# both next parameters are expected to be >=3
|
||||
ceph tell osd.1 config get osd_recovery_max_active_hdd
|
||||
ceph tell osd.1 config get osd_recovery_max_active_ssd
|
||||
|
||||
# Wait for recovery to start
|
||||
count=0
|
||||
while(true)
|
||||
do
|
||||
#ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
|
||||
if test $(ceph --format json pg dump pgs |
|
||||
jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
|
||||
then
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
if test "$count" -eq "10"
|
||||
then
|
||||
echo "Not enough recovery started simultaneously"
|
||||
return 1
|
||||
fi
|
||||
count=$(expr $count + 1)
|
||||
done
|
||||
ceph pg dump pgs
|
||||
|
||||
pids=""
|
||||
recov_scrub_count=0
|
||||
for pg in $(seq 0 $(expr $PGS - 1))
|
||||
do
|
||||
run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
|
||||
done
|
||||
wait_background_check pids
|
||||
return_code=$?
|
||||
if [ $return_code -ne 0 ]; then return $return_code; fi
|
||||
|
||||
ERRORS=0
|
||||
if test $recov_scrub_count -eq 0
|
||||
then
|
||||
echo "No scrubs occurred while PG recovering"
|
||||
ERRORS=$(expr $ERRORS + 1)
|
||||
fi
|
||||
|
||||
pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
|
||||
pid=$(cat $pidfile)
|
||||
if ! kill -0 $pid
|
||||
then
|
||||
echo "OSD crash occurred"
|
||||
#tail -100 $dir/osd.0.log
|
||||
ERRORS=$(expr $ERRORS + 1)
|
||||
fi
|
||||
|
||||
# Work around for http://tracker.ceph.com/issues/38195
|
||||
kill_daemons $dir #|| return 1
|
||||
|
||||
declare -a err_strings
|
||||
err_strings[0]="not scheduling scrubs due to active recovery"
|
||||
|
||||
for osd in $(seq 0 $(expr $OSDS - 1))
|
||||
do
|
||||
grep "not scheduling scrubs" $dir/osd.${osd}.log
|
||||
done
|
||||
for err_string in "${err_strings[@]}"
|
||||
do
|
||||
found=false
|
||||
for osd in $(seq 0 $(expr $OSDS - 1))
|
||||
do
|
||||
if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
|
||||
then
|
||||
found=true
|
||||
fi
|
||||
done
|
||||
if [ "$found" = "true" ]; then
|
||||
echo "Found log message not expected '$err_string'"
|
||||
ERRORS=$(expr $ERRORS + 1)
|
||||
fi
|
||||
done
|
||||
|
||||
teardown $dir || return 1
|
||||
|
||||
if [ $ERRORS != "0" ];
|
||||
then
|
||||
echo "TEST FAILED WITH $ERRORS ERRORS"
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "TEST PASSED"
|
||||
return 0
|
||||
}
|
||||
|
||||
main osd-recovery-scrub "$@"
|
||||
|
||||
# Local Variables:
|
||||
|